# Handling missing data

In [5]:
import pandas as pd 
# Sample employee records. The third record (index 2) is None in every column,
# so each column contains exactly one missing value. The age and salary
# entries are plain integers; pandas shows them as floats because a column
# holding a missing value is stored as float.
data = {
    "Name": [
        'yash', 'yash vardhan', None, 'tushi', 'mohit',
        'monu', 'shinchan', 'nobitha', 'doremone',
    ],
    "age": [21, 23, None, 18, 25, 26, 45, 22, 30],
    "salary": [101000, 100000, None, 20000, 150, 12000, 15000, 25000, 30000],
    "work": [
        'data science', 'data analyst', None, 'accounted', 'swiper',
        'hardware Engineering', 'maneger', 'bank maneger', 'scientest',
    ],
}
df = pd.DataFrame(data)
print(df)

# Boolean mask marking every missing cell (isna() and isnull() are aliases),
# followed by the number of missing cells in each column.
null_mask = df.isna()
print("showing a missing values in the form boolean :-")
print(null_mask)
print(null_mask.sum())

           Name   age    salary                  work
0          yash  21.0  101000.0          data science
1  yash vardhan  23.0  100000.0          data analyst
2          None   NaN       NaN                  None
3         tushi  18.0   20000.0             accounted
4         mohit  25.0     150.0                swiper
5          monu  26.0   12000.0  hardware Engineering
6      shinchan  45.0   15000.0               maneger
7       nobitha  22.0   25000.0          bank maneger
8      doremone  30.0   30000.0             scientest
showing a missing values in the form boolean :-
    Name    age  salary   work
0  False  False   False  False
1  False  False   False  False
2   True   True    True   True
3  False  False   False  False
4  False  False   False  False
5  False  False   False  False
6  False  False   False  False
7  False  False   False  False
8  False  False   False  False
Name      1
age       1
salary    1
work      1
dtype: int64


# Removing the missing values

In [2]:
import pandas as pd 
# Sample employee records. The third record (index 2) is None in every column.
# The age and salary entries are plain integers, not strings.
data = {
    "Name": [
        'yash', 'yash vardhan', None, 'tushi', 'mohit',
        'monu', 'shinchan', 'nobitha', 'doremone',
    ],
    "age": [21, 23, None, 18, 25, 26, 45, 22, 30],
    "salary": [101000, 100000, None, 20000, 150, 12000, 15000, 25000, 30000],
    "work": [
        'data science', 'data analyst', None, 'accounted', 'swiper',
        'hardware Engineering', 'maneger', 'bank maneger', 'scientest',
    ],
}
df = pd.DataFrame(data)
print(df)

print("data frame after removing the missing values :-")
# dropna() returns a new frame without any row that contains a missing value.
# The result is rebound to df instead of using inplace=True, so the frame
# printed above is not mutated behind the reader's back. The surviving rows
# keep their original index labels (label 2 is simply absent).
df = df.dropna()
print(df)

           Name   age    salary                  work
0          yash  21.0  101000.0          data science
1  yash vardhan  23.0  100000.0          data analyst
2          None   NaN       NaN                  None
3         tushi  18.0   20000.0             accounted
4         mohit  25.0     150.0                swiper
5          monu  26.0   12000.0  hardware Engineering
6      shinchan  45.0   15000.0               maneger
7       nobitha  22.0   25000.0          bank maneger
8      doremone  30.0   30000.0             scientest
data frame after removing the missing values :-
           Name   age    salary                  work
0          yash  21.0  101000.0          data science
1  yash vardhan  23.0  100000.0          data analyst
3         tushi  18.0   20000.0             accounted
4         mohit  25.0     150.0                swiper
5          monu  26.0   12000.0  hardware Engineering
6      shinchan  45.0   15000.0               maneger
7       nobitha  22.0   25000.0   

# Filling the missing values

In [18]:
import pandas as pd 
# Sample employee records. The third record (index 2) is None in every column.
# The age and salary entries are plain integers, not strings.
data = {
    "Name": [
        'yash', 'yash vardhan', None, 'tushi', 'mohit',
        'monu', 'shinchan', 'nobitha', 'doremone',
    ],
    "age": [21, 23, None, 18, 25, 26, 45, 22, 30],
    "salary": [101000, 100000, None, 20000, 150, 12000, 15000, 25000, 30000],
    "work": [
        'data science', 'data analyst', None, 'accounted', 'swiper',
        'hardware Engineering', 'maneger', 'bank maneger', 'scientest',
    ],
}
df = pd.DataFrame(data)
print(df)

# fillna(0) replaces every missing cell in every column with 0 and returns a
# new frame; the result is rebound to df instead of using inplace=True.
# Note: the text columns (Name, work) receive the integer 0 as well, so they
# end up holding a mix of strings and a number.
df = df.fillna(0)
print("missing values fill with zero:-")
print(df)

           Name   age    salary                  work
0          yash  21.0  101000.0          data science
1  yash vardhan  23.0  100000.0          data analyst
2          None   NaN       NaN                  None
3         tushi  18.0   20000.0             accounted
4         mohit  25.0     150.0                swiper
5          monu  26.0   12000.0  hardware Engineering
6      shinchan  45.0   15000.0               maneger
7       nobitha  22.0   25000.0          bank maneger
8      doremone  30.0   30000.0             scientest
missing values fill with zero:-
           Name   age    salary                  work
0          yash  21.0  101000.0          data science
1  yash vardhan  23.0  100000.0          data analyst
2             0   0.0       0.0                     0
3         tushi  18.0   20000.0             accounted
4         mohit  25.0     150.0                swiper
5          monu  26.0   12000.0  hardware Engineering
6      shinchan  45.0   15000.0               mane

# Filling missing values in a specific column

In [3]:
import pandas as pd 

# Sample employee records. The third record (index 2) is None in every column,
# so age and salary each contain one missing number.
data = {
    "Name": [
        'yash', 'yash vardhan', None, 'tushi', 'mohit',
        'monu', 'shinchan', 'nobitha', 'doremone',
    ],
    "age": [21, 23, None, 18, 25, 26, 45, 22, 30],
    "salary": [101000, 100000, None, 20000, 150, 12000, 15000, 25000, 30000],
    "work": [
        'data science', 'data analyst', None, 'accounted', 'swiper',
        'hardware Engineering', 'maneger', 'bank maneger', 'scientest',
    ],
}

df = pd.DataFrame(data)

print("Original DataFrame:\n", df)

# Fill only the salary column: its missing entry is replaced by the mean of
# the known salaries (mean() skips missing values by default). The other
# columns are left untouched, so their missing cells remain.
#
# The filled column is assigned back to df['salary'] on purpose. Calling
# df['salary'].fillna(..., inplace=True) acts on an intermediate object
# selected from df; pandas warns about this pattern, and under Copy-on-Write
# (the default from pandas 3.0) it no longer updates df.
# An equivalent spelling is: df = df.fillna({'salary': df['salary'].mean()})
salary_mean = df['salary'].mean()
df['salary'] = df['salary'].fillna(salary_mean)


print("\nAfter Filling Salary Missing Values:\n", df)

Original DataFrame:
            Name   age    salary                  work
0          yash  21.0  101000.0          data science
1  yash vardhan  23.0  100000.0          data analyst
2          None   NaN       NaN                  None
3         tushi  18.0   20000.0             accounted
4         mohit  25.0     150.0                swiper
5          monu  26.0   12000.0  hardware Engineering
6      shinchan  45.0   15000.0               maneger
7       nobitha  22.0   25000.0          bank maneger
8      doremone  30.0   30000.0             scientest

After Filling Salary Missing Values:
            Name   age     salary                  work
0          yash  21.0  101000.00          data science
1  yash vardhan  23.0  100000.00          data analyst
2          None   NaN   37893.75                  None
3         tushi  18.0   20000.00             accounted
4         mohit  25.0     150.00                swiper
5          monu  26.0   12000.00  hardware Engineering
6      shinchan