In [1]:
import pandas as pd 

In [2]:
# sample data frame:
Employee_data = {
    "name": ["Bilal Kiani", "Afan", "Shabbar Alam", "Saood Sheikh", "Junaid Kiani", "Abdul Qadar", "Saqib Sheikh", "Ameen Khan"],
    "age": [22, None, 24, 21, 43, 44, 34, 50],
    "sallary": [20000, None, 40000, 50000, None, 60000, 43000, 50500],
    "performance_score": [50, None, 89, 78, 68, 90, 95, 86]
}
df = pd.DataFrame(Employee_data)
print("Sample DataFrame: ")
print(df)

Sample DataFrame: 
           name   age  sallary  performance_score
0   Bilal Kiani  22.0  20000.0               50.0
1          Afan   NaN      NaN                NaN
2  Shabbar Alam  24.0  40000.0               89.0
3  Saood Sheikh  21.0  50000.0               78.0
4  Junaid Kiani  43.0      NaN               68.0
5   Abdul Qadar  44.0  60000.0               90.0
6  Saqib Sheikh  34.0  43000.0               95.0
7    Ameen Khan  50.0  50500.0               86.0


In [3]:
# we can handle missing values in different ways

# Way 1-  if value in data set not effects our dataset, then we preffer to delete that rows or columns of that values, for this we use method    dropna()
# syntex:   df.dropna(axis= 0 or 1, inplace = True)   here axis 0 means deleting row and 1 means deleting column 
df.dropna(axis = 1, inplace = False) # inplace = False means will return new dataset not changes the original dataset, but we make sure to make changes in original dataset as most of the time required, below i false just for further operation on that  dataset

Unnamed: 0,name
0,Bilal Kiani
1,Afan
2,Shabbar Alam
3,Saood Sheikh
4,Junaid Kiani
5,Abdul Qadar
6,Saqib Sheikh
7,Ameen Khan


In [4]:
df.dropna(inplace=False) # if we not write axis  by default it is 0

Unnamed: 0,name,age,sallary,performance_score
0,Bilal Kiani,22.0,20000.0,50.0
2,Shabbar Alam,24.0,40000.0,89.0
3,Saood Sheikh,21.0,50000.0,78.0
5,Abdul Qadar,44.0,60000.0,90.0
6,Saqib Sheikh,34.0,43000.0,95.0
7,Ameen Khan,50.0,50500.0,86.0


In [5]:
# Way 2-  if missing values are frequent and deleting rows or columns can effect our data then insted fo deletingn we can fill these cells with default values,
# for this we use   .fillna()   method 
#syntex:   df.fillna(value, inplace = true)      here value means your missing values in data set will replace with that value 
df.fillna(0, inplace= True)
print(df)

           name   age  sallary  performance_score
0   Bilal Kiani  22.0  20000.0               50.0
1          Afan   0.0      0.0                0.0
2  Shabbar Alam  24.0  40000.0               89.0
3  Saood Sheikh  21.0  50000.0               78.0
4  Junaid Kiani  43.0      0.0               68.0
5   Abdul Qadar  44.0  60000.0               90.0
6  Saqib Sheikh  34.0  43000.0               95.0
7    Ameen Khan  50.0  50500.0               86.0


In [6]:
Employee_data2 = {
    "name": ["Bilal Kiani", "Afan", "Shabbar Alam", "Saood Sheikh", "Junaid Kiani", "Abdul Qadar", "Saqib Sheikh", "Ameen Khan"],
    "age": [22, None, 24, 21, 43, 44, 34, 50],
    "sallary": [20000, None, 40000, 50000, None, 60000, 43000, 50500],
    "performance_score": [50, None, 89, 78, 68, 90, 95, 86]
}
dff = pd.DataFrame(Employee_data2)
print("Sample DataFrame: ")
print(dff)

Sample DataFrame: 
           name   age  sallary  performance_score
0   Bilal Kiani  22.0  20000.0               50.0
1          Afan   NaN      NaN                NaN
2  Shabbar Alam  24.0  40000.0               89.0
3  Saood Sheikh  21.0  50000.0               78.0
4  Junaid Kiani  43.0      NaN               68.0
5   Abdul Qadar  44.0  60000.0               90.0
6  Saqib Sheikh  34.0  43000.0               95.0
7    Ameen Khan  50.0  50500.0               86.0


In [7]:
# and if we wana fill values in only some columns like filling only in sallary column and returns only that column:
# syntex:    df["sallary"].fillna(value, inplace= True)
# "Take the salary column, fill the blanks, and put it back into the salary column."
dff["sallary"] = dff["sallary"].fillna(0)

print(dff)

           name   age  sallary  performance_score
0   Bilal Kiani  22.0  20000.0               50.0
1          Afan   NaN      0.0                NaN
2  Shabbar Alam  24.0  40000.0               89.0
3  Saood Sheikh  21.0  50000.0               78.0
4  Junaid Kiani  43.0      0.0               68.0
5   Abdul Qadar  44.0  60000.0               90.0
6  Saqib Sheikh  34.0  43000.0               95.0
7    Ameen Khan  50.0  50500.0               86.0


In [8]:
# now if we wana fill some calculated values insted of default values:
# syntex:  dff["age"].fillna(df["age"].mean(), inplace= True)
dff["age"].fillna(df["age"].mean(), inplace= True)
print(dff)
print("\n \n")
dff["performance_score"].fillna(df["performance_score"].mean(), inplace= True)
print(dff)

           name    age  sallary  performance_score
0   Bilal Kiani  22.00  20000.0               50.0
1          Afan  29.75      0.0                NaN
2  Shabbar Alam  24.00  40000.0               89.0
3  Saood Sheikh  21.00  50000.0               78.0
4  Junaid Kiani  43.00      0.0               68.0
5   Abdul Qadar  44.00  60000.0               90.0
6  Saqib Sheikh  34.00  43000.0               95.0
7    Ameen Khan  50.00  50500.0               86.0

 

           name    age  sallary  performance_score
0   Bilal Kiani  22.00  20000.0               50.0
1          Afan  29.75      0.0               69.5
2  Shabbar Alam  24.00  40000.0               89.0
3  Saood Sheikh  21.00  50000.0               78.0
4  Junaid Kiani  43.00      0.0               68.0
5   Abdul Qadar  44.00  60000.0               90.0
6  Saqib Sheikh  34.00  43000.0               95.0
7    Ameen Khan  50.00  50500.0               86.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dff["age"].fillna(df["age"].mean(), inplace= True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dff["performance_score"].fillna(df["performance_score"].mean(), inplace= True)


In [9]:
# Way-3     interpolation,  we will discuss in new file   "interpolation.ipynb"