In [2]:
import pandas as pd

## Series from a List / Dictionary

In [None]:
# Build a Series from a plain Python list; no index is given, so the labels are 0..4.
lst = list(range(1, 6))
series = pd.Series(lst)
# Show the Series followed by its type, one per line.
print(series, type(series), sep='\n')

In [None]:
# Series with explicit labels 10..14 supplied through `index`.
a = pd.Series(
    list('pqrst'),
    index=list(range(10, 15)),
)
a

In [None]:
# Series from a dictionary: the keys become the index labels, the values the data.
dict_series = pd.Series(dict(p=1, q=2, r=3, s=4, t=5))
dict_series

In [None]:
# Series from a dictionary whose values are lists: each element of the Series
# is one whole list, labelled by its key.
dict_series = pd.Series({
    'p': [1, 5, 6],
    'q': [2, 6, 7],
    'r': [3, 9, 0],
    's': [4, 4, 5],
    't': [5, 1, 2],
})
dict_series

In [None]:
# Third item of the list stored under label 'p' (the first element of the Series).
# The label is used instead of the integer 0: on a string-labelled Series an
# integer key relies on a positional fallback that pandas has deprecated.
dict_series['p'][2]

## DataFrame

In [None]:
# DataFrame from a nested list: each inner list is one row; column and row
# labels are given explicitly.
df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'], index=['a', 'b'])
# A bare last expression gives the notebook's rich table rendering (print() shows plain text).
df

In [None]:
# List of dictionaries -> DataFrame: every dictionary is one row and its keys
# become the column names.
a = [
    dict(a=5, b=7, c=9, d=2),
    dict(a=4, b=8, c=19, d=12),
]

df = pd.DataFrame(a)
df

In [None]:
# Dictionary keys become column names. When the rows do not share the same keys,
# every distinct key gets its own column and a row that lacks a key holds NaN
# there (here: 'c' is missing from the second row, 'e' from the first).
b = [
    dict(a=5, b=7, c=9, d=2),
    dict(a=4, b=8, e=19, d=12),
]

db = pd.DataFrame(b)
db

In [3]:
# Column-oriented construction where each column is given as a Series instead
# of a list. The plain-list form below builds the same table:
#   b = {'RollNo.': [1,2,3,4,5],
#        'Maths':   [67,89,23,90,56],
#        'Physics': [12,98,44,90,78]}
b = {
    'RollNo.': pd.Series([1, 2, 3, 4, 5]),
    'Maths': pd.Series([67, 89, 23, 90, 56]),
    'Physics': pd.Series([12, 98, 44, 90, 78]),
}
df = pd.DataFrame(b)
df

Unnamed: 0,RollNo.,Maths,Physics
0,1,67,12
1,2,89,98
2,3,23,44
3,4,90,90
4,5,56,78


In [5]:
# Read a CSV file (path relative to the notebook's working directory) into a DataFrame.
df = pd.read_csv('sample.csv')
df

Unnamed: 0,Roll No.,Physics,Chemistry,Maths,Computer
0,1,56.0,57.0,58.0,59.0
1,2,23.0,24.0,25.0,26.0
2,3,89.0,25.0,26.0,27.0
3,4,45.0,26.0,27.0,28.0
4,5,23.0,27.0,28.0,29.0
5,6,90.0,,29.0,30.0
6,7,12.0,13.0,14.0,15.0
7,8,78.0,14.0,15.0,16.0
8,9,,15.0,16.0,17.0
9,10,45.0,16.0,17.0,18.0


In [None]:
# Quick-look accessors. A notebook cell renders only its last expression, so
# this cell displays df.tail(10); the earlier results are computed and discarded.
df.columns    # column names
df.head()     # first 5 rows
df.head(10)   # first 10 rows
df.tail()     # last 5 rows
df.tail(10)   # last 10 rows

In [None]:
# Summary statistics, one row per statistic:
#   count - number of non-null entries
#   mean  - average of the data
#   std   - standard deviation
#   min   - minimum value
#   25%   - 25th percentile (first quartile)
#   50%   - 50th percentile (median / second quartile)
#   75%   - 75th percentile (third quartile)
#   max   - maximum value
df.describe()

## Handling Missing Data

In [35]:
# Load the CSV into a separate frame `ds` used by the missing-data examples,
# then preview its first five rows.
ds = pd.read_csv("sample.csv")
ds.head(5)

Unnamed: 0,Roll No.,Physics,Chemistry,Maths,Computer
0,1,56.0,57.0,58.0,59.0
1,2,23.0,24.0,25.0,26.0
2,3,89.0,25.0,26.0,27.0
3,4,45.0,26.0,27.0,28.0
4,5,23.0,27.0,28.0,29.0


In [None]:
# Boolean frame of the same shape as df: True where a value is null, False otherwise.
df.isnull()

In [None]:
# Count the null values in each column (True counts as 1 when summed).
df.isnull().sum()

### Drop missing values using dropna()

In [None]:
# Drop every row that contains a null value; a new DataFrame is returned and
# `ds` is left untouched. axis=0 (rows) is the default, spelled out here.
df2 = ds.dropna(axis=0)
df2.shape

In [None]:
# Drop the columns with null values
df2 = ds.dropna(axis=1) # axis=1 for columns
df2.shape

In [None]:
# how='any': remove a row as soon as at least one of its values is null.
df.dropna(how='any')

In [None]:
# how='all': remove a row only when every one of its values is null.
df.dropna(how='all')

In [None]:
# inplace=True removes the rows with null values from the object itself and
# returns None instead of a new DataFrame.
# It is demonstrated on a copy so that `df` keeps its missing values for the
# fillna() examples that follow.
df_dropped = df.copy()
df_dropped.dropna(inplace = True)
df_dropped.shape

### Filling missing values using fillna()

In [None]:
# fillna(value): every null value in the frame is filled with 2.
df.fillna(value=2)

In [None]:
# Per-column fill values: a dictionary maps column name -> value used for that
# column's nulls; columns not listed are left as they are.
df.fillna({
    'Physics': 'none',
    'Chemistry': 0,
    'Maths': 30,
})

In [None]:
# ffill()/bfill() are used instead of fillna(method=...), whose `method`
# keyword is deprecated in recent pandas releases.
df.ffill()
# Fill the null values with the previous value along the axis given
df.bfill()
# Fill the null values with the next value along the axis given

In [None]:
# Fill the nulls of one column with that column's mean. The filled Series is
# only returned; `ds` itself is not updated.
physics_mean = ds['Physics'].mean()
ds['Physics'].fillna(physics_mean)

In [None]:
# Fill the null values with the next value along the axis and update df itself.
# bfill() replaces fillna(method='bfill'), whose `method` keyword is deprecated.
df.bfill(inplace = True)

### Replace values using replace()

In [None]:
# replace() returns a new object; df changes only where a result is assigned back.
df.replace(to_replace=26, value=30)
# Replace the value 26 with 30
df.replace(34, 10000)
# Replace the value 34 with 10000 (to_replace and value passed positionally)
df.replace(to_replace=[50,51,52,53,54,55,56,57,58,59], value= 'A')
# Replace every value listed in "to_replace" with 'A'
df.replace(to_replace=[50,51,52,53], value= ['A', 'B', 'C', 'D'])
# Replace pairwise: 50 -> 'A', 51 -> 'B', 52 -> 'C', 53 -> 'D'
df['Physics'] = df['Physics'].replace(to_replace=[50,51,52,53], value= ['A', 'B', 'C', 'D'])
# Same pairwise replacement for a single column, updating the dataframe.
# The result is assigned back rather than calling replace(..., inplace=True) on
# df['Physics']: that chained in-place call acts on an intermediate object and
# is not guaranteed to reach df (pandas warns about it under Copy-on-Write).

In [None]:
df.replace('[A-Za-z]', 0)
# Without regex=True the pattern is treated as a literal value, so only cells
# equal to the text '[A-Za-z]' would be replaced - this does not match letters.
df.replace('[A-Za-z]', 0, regex = True)
# With regex=True, string cells that match the pattern (contain a letter) are
# replaced with 0 (or any other value given).

# replace(to_replace=15, method='ffill'/'bfill') depended on the `method`
# keyword, which is deprecated in recent pandas. Equivalent approach: blank out
# the 15s, fill the gaps from the neighbouring row, then take the filled values
# only at the positions that originally held 15.
is_15 = df.eq(15)
df.mask(is_15, df.mask(is_15).ffill())
# Replace the value 15 with the previous value in the same column
df.mask(is_15, df.mask(is_15).bfill())
# Replace the value 15 with the next value in the same column

In pandas, `loc` and `iloc` are used for data selection and manipulation, but they differ in how they access data.

### `loc`
- **Label-based**: Uses labels or boolean arrays to select data.
- **Inclusive**: Includes both start and end labels in slicing.
- **Syntax**: `df.loc[row_labels, column_labels]`

### `iloc`
- **Integer-based**: Uses integer positions to select data.
- **Exclusive**: End index is excluded in slicing.
- **Syntax**: `df.iloc[row_indices, column_indices]`

### Example DataFrame


In [None]:
import pandas as pd

# Small 3x3 example frame with string row labels, used by the loc/iloc cells.
data = dict(A=[1, 2, 3], B=[4, 5, 6], C=[7, 8, 9])
df = pd.DataFrame(data, index=list('abc'))



### Using `loc`


In [None]:
# Row 'a', columns 'A' and 'B' -> Series: A    1, B    4
df.loc['a', ['A', 'B']]

# Label slices include both ends: rows 'a' through 'b', columns 'A' through 'B'
# -> DataFrame with rows 'a' and 'b', columns 'A' and 'B'
df.loc['a':'b', 'A':'B']



### Using `iloc`


In [None]:
# First row (position 0), first two columns (positions 0 and 1) -> Series: A    1, B    4
df.iloc[0, [0, 1]]

# Position slices exclude the end: 0:2 selects positions 0 and 1
# -> DataFrame with the first two rows and first two columns
df.iloc[0:2, 0:2]

In [None]:
# The same cell addressed two ways.
# By label (loc): row 'a', column 'A' -> 1
df.loc['a', 'A']

# By position (iloc): row 0, column 0 -> 1
df.iloc[0, 0]



Choose `loc` for label-based indexing and `iloc` for position-based indexing.