# Finding and working with missing values

In [1]:
import pandas as pd
import numpy as np
# Build a small score table with exactly one missing value (np.nan) per column.
# The mapping is named `score_data` rather than `dict` so that the built-in
# `dict` type is not rebound by this cell.
score_data = {
    'First Score': [100, 90, np.nan, 95],
    'Second Score': [30, 45, 56, np.nan],
    'Third Score': [np.nan, 40, 80, 98],
}
df = pd.DataFrame(score_data)
print(df)

   First Score  Second Score  Third Score
0        100.0          30.0          NaN
1         90.0          45.0         40.0
2          NaN          56.0         80.0
3         95.0           NaN         98.0


In [2]:
# Find the missing values in the DataFrame: isna() returns a same-shaped
# boolean frame that is True wherever a value is missing.
df.isna()

Unnamed: 0,First Score,Second Score,Third Score
0,False,False,True
1,False,False,False
2,True,False,False
3,False,True,False


In [3]:
# isnull() produces the same boolean mask as the isna() call above.
df.isnull()

Unnamed: 0,First Score,Second Score,Third Score
0,False,False,True
1,False,False,False
2,True,False,False
3,False,True,False


In [4]:
# Count the missing values per column: summing the boolean mask counts the Trues.
df.isna().sum()

First Score     1
Second Score    1
Third Score     1
dtype: int64

In [5]:
# Same per-column count of missing values, using the isnull() spelling.
df.isnull().sum()

First Score     1
Second Score    1
Third Score     1
dtype: int64

In [6]:
# Show df again: the isna()/isnull() calls above returned new objects and
# left df as it was.
print(df)

   First Score  Second Score  Third Score
0        100.0          30.0          NaN
1         90.0          45.0         40.0
2          NaN          56.0         80.0
3         95.0           NaN         98.0


In [7]:
# Count the missing values in a single column (a Series) instead of the whole frame.
df["First Score"].isna().sum()

np.int64(1)

In [8]:
# Boolean mask of the non-missing values: True where a value is present,
# False where it is missing (the inverse of the isna() mask).
df.notnull()

Unnamed: 0,First Score,Second Score,Third Score
0,True,True,False
1,True,True,True
2,False,True,True
3,True,False,True


In [9]:
# notna() returns the same mask as the notnull() call above.
df.notna()

Unnamed: 0,First Score,Second Score,Third Score
0,True,True,False
1,True,True,True
2,False,True,True
3,True,False,True


In [10]:
# Count the non-missing values in each column.
df.notnull().sum()

First Score     3
Second Score    3
Third Score     3
dtype: int64

In [11]:
# Same per-column count of non-missing values, using the notna() spelling.
df.notna().sum()

First Score     3
Second Score    3
Third Score     3
dtype: int64

In [12]:
# df is still the original table with its three NaNs.
print(df)

   First Score  Second Score  Third Score
0        100.0          30.0          NaN
1         90.0          45.0         40.0
2          NaN          56.0         80.0
3         95.0           NaN         98.0


In [13]:
# To fill missing values with a value of our choice, use fillna().
# The filled frame is returned as a new object; it is not assigned here,
# so df itself keeps its NaNs.
df.fillna(0)

Unnamed: 0,First Score,Second Score,Third Score
0,100.0,30.0,0.0
1,90.0,45.0,40.0
2,0.0,56.0,80.0
3,95.0,0.0,98.0


In [14]:
# Confirm that the fillna(0) call above did not modify df.
print(df)

   First Score  Second Score  Third Score
0        100.0          30.0          NaN
1         90.0          45.0         40.0
2          NaN          56.0         80.0
3         95.0           NaN         98.0


In [15]:
# fillna() also works on a single column (a Series): here the NaN becomes 1.
df["First Score"].fillna(1)

0    100.0
1     90.0
2      1.0
3     95.0
Name: First Score, dtype: float64

In [16]:
# replace() can do the same job: every NaN is swapped for 0 in the returned frame.
df.replace(to_replace=np.nan, value=0)

Unnamed: 0,First Score,Second Score,Third Score
0,100.0,30.0,0.0
1,90.0,45.0,40.0
2,0.0,56.0,80.0
3,95.0,0.0,98.0


In [37]:
# Second example
# Build a course table whose Fee, Duration and Discount columns each contain
# missing entries, written three different ways: np.nan, pd.NA and None.
import pandas as pd
import numpy as np
course_data = {
    'Courses': ["Spark", 'Java', "Scala", 'Python'],
    'Fee': [20000, np.nan, 26000, 24000],
    'Duration': ['30days', '40days', pd.NA, '40days'],
    'Discount': [1000, np.nan, 2500, None],
}
df = pd.DataFrame(course_data)
print(df)

  Courses      Fee Duration  Discount
0   Spark  20000.0   30days    1000.0
1    Java      NaN   40days       NaN
2   Scala  26000.0     <NA>    2500.0
3  Python  24000.0   40days       NaN


In [38]:
# fillna() with a single value replaces every missing entry in every column.
# The filler here is the string "None", so it also lands in the numeric
# Fee and Discount columns (see the printed result).
df2 = df.fillna("None")
print(df2)

  Courses      Fee Duration Discount
0   Spark  20000.0   30days   1000.0
1    Java     None   40days     None
2   Scala  26000.0     None   2500.0
3  Python  24000.0   40days     None


In [39]:
# df keeps its missing values; the filled copy lives in df2.
print(df)

  Courses      Fee Duration  Discount
0   Spark  20000.0   30days    1000.0
1    Java      NaN   40days       NaN
2   Scala  26000.0     <NA>    2500.0
3  Python  24000.0   40days       NaN


In [40]:
# fillna() on one column (a Series): fill Discount taken from the original df
# and store the result in df2's Discount column.
# NOTE(review): the filler is the string '0', not the number 0, so the filled
# entries are text sitting next to float values — confirm this is intended.
df2["Discount"] = df["Discount"].fillna('0')
print(df2)

  Courses      Fee Duration Discount
0   Spark  20000.0   30days   1000.0
1    Java     None   40days        0
2   Scala  26000.0     None   2500.0
3  Python  24000.0   40days        0


In [43]:
# fillna() on multiple columns: select Fee and Duration from df with a list,
# fill their missing values with 0, and write both columns into df2.
# NOTE(review): Duration holds text such as '30days', so its missing entry
# becomes a numeric 0 among strings — confirm this is intended.
df2[["Fee","Duration"]] = df[["Fee","Duration"]].fillna(0)
print(df2)

  Courses      Fee Duration Discount
0   Spark  20000.0   30days   1000.0
1    Java      0.0   40days        0
2   Scala  26000.0        0   2500.0
3  Python  24000.0   40days        0


In [44]:
# fillna() on multiple columns with different values: pass a dict that maps
# column name -> fill value. Columns not listed (Duration) keep their missing
# values. df2 is rebuilt from df here, replacing the earlier df2.
# NOTE(review): Discount is filled with the string '0' rather than the
# number 0 — confirm this is intended.
df2 = df.fillna(value = {"Discount": '0', "Fee":10000})
print(df2)

  Courses      Fee Duration Discount
0   Spark  20000.0   30days   1000.0
1    Java  10000.0   40days        0
2   Scala  26000.0     <NA>   2500.0
3  Python  24000.0   40days        0


In [45]:
# df still has its original missing values; every fill so far went into df2.
print(df)

  Courses      Fee Duration  Discount
0   Spark  20000.0   30days    1000.0
1    Java      NaN   40days       NaN
2   Scala  26000.0     <NA>    2500.0
3  Python  24000.0   40days       NaN


In [46]:
# Boolean mask marking the rows where Discount is missing.
df["Discount"].isna()

0    False
1     True
2    False
3     True
Name: Discount, dtype: bool

In [47]:
# Use that mask with .loc to select only the missing Discount cells.
df.loc[df["Discount"].isna(),["Discount"]]

Unnamed: 0,Discount
1,
3,


In [48]:
# Assign through the same .loc selection to overwrite those cells with 0.
# Unlike the fillna() calls above, this modifies df itself.
df.loc[df["Discount"].isna(),["Discount"]] = 0

In [49]:
# The missing Discount values in df are now 0.0.
print(df)

  Courses      Fee Duration  Discount
0   Spark  20000.0   30days    1000.0
1    Java      NaN   40days       0.0
2   Scala  26000.0     <NA>    2500.0
3  Python  24000.0   40days       0.0


In [53]:
# .loc slices are label-based and include both endpoints, so 1:1 is just
# row 1; set its Fee to 50000.
df.loc[1:1,["Fee"]] = 50000

In [58]:
# Row 1 now shows the new Fee of 50000.0.
print(df)

  Courses      Fee Duration  Discount
0   Spark  20000.0   30days    1000.0
1    Java  50000.0   40days       0.0
2   Scala  26000.0     <NA>    2500.0
3  Python  24000.0   40days       0.0


In [59]:
# .iloc is position-based and excludes the slice end, so 2:3 is just row 2;
# column position 2 is Duration.
df.iloc[2:3,2] = "30days"

In [60]:
# Row 2's Duration now reads 30days.
print(df)

  Courses      Fee Duration  Discount
0   Spark  20000.0   30days    1000.0
1    Java  50000.0   40days       0.0
2   Scala  26000.0   30days    2500.0
3  Python  24000.0   40days       0.0


In [62]:
# The same cell addressed by label with .loc; its value becomes "25days".
df.loc[2:2,["Duration"]] = "25days"

In [63]:
# Row 2's Duration now reads 25days.
print(df)

  Courses      Fee Duration  Discount
0   Spark  20000.0   30days    1000.0
1    Java  50000.0   40days       0.0
2   Scala  26000.0   25days    2500.0
3  Python  24000.0   40days       0.0


In [65]:
# Positional slice 1:2 is just row 1; column position 3 is Discount.
df.iloc[1:2,3] = 3000

In [66]:
# Row 1's Discount is now 3000.0.
print(df)

  Courses      Fee Duration  Discount
0   Spark  20000.0   30days    1000.0
1    Java  50000.0   40days    3000.0
2   Scala  26000.0   25days    2500.0
3  Python  24000.0   40days       0.0


In [68]:
# Open-ended positional slice: row 3 through the last row (here only row 3);
# column position 3 is Discount.
df.iloc[3:, 3] = 6000

In [69]:
# Row 3's Discount is now 6000.0.
print(df)

  Courses      Fee Duration  Discount
0   Spark  20000.0   30days    1000.0
1    Java  50000.0   40days    3000.0
2   Scala  26000.0   25days    2500.0
3  Python  24000.0   40days    6000.0


In [70]:
# Dropping missing values from a DataFrame
# Build a score table for the dropna() examples: Third Score is complete,
# while every other column has at least one missing entry (np.nan or pd.NA).
# The mapping is named `drop_scores` rather than `dict` so that the built-in
# `dict` type is not rebound by this cell.
drop_scores = {
    'First Score': [100, 90, np.nan, 95],
    'Second Score': [30, np.nan, 45, 56],
    'Third Score': [52, 40, 80, 98],
    'Fourth Score': [np.nan, pd.NA, np.nan, 65],
}
df = pd.DataFrame(drop_scores)
print(df)

   First Score  Second Score  Third Score Fourth Score
0        100.0          30.0           52          NaN
1         90.0           NaN           40         <NA>
2          NaN          45.0           80          NaN
3         95.0          56.0           98           65


In [71]:
# dropna() with its defaults (axis=0, how="any") drops every row that
# contains at least one missing value; only row 3 is complete.
df.dropna()

Unnamed: 0,First Score,Second Score,Third Score,Fourth Score
3,95.0,56.0,98,65


In [72]:
# dropna() returned a new frame; df still has all four rows.
print(df)

   First Score  Second Score  Third Score Fourth Score
0        100.0          30.0           52          NaN
1         90.0           NaN           40         <NA>
2          NaN          45.0           80          NaN
3         95.0          56.0           98           65


In [73]:
# axis=1 drops columns instead of rows: every column with a missing value
# is removed, leaving only Third Score.
df.dropna(axis=1,how="any")

Unnamed: 0,Third Score
0,52
1,40
2,80
3,98


In [74]:
# df still has all four columns.
print(df)

   First Score  Second Score  Third Score Fourth Score
0        100.0          30.0           52          NaN
1         90.0           NaN           40         <NA>
2          NaN          45.0           80          NaN
3         95.0          56.0           98           65


In [75]:
# Spelling out axis=0 and how="any" explicitly gives the same result as the
# plain dropna() call earlier: only the complete row 3 remains.
df.dropna(axis=0,how="any")

Unnamed: 0,First Score,Second Score,Third Score,Fourth Score
3,95.0,56.0,98,65


In [76]:
# df is unchanged by the dropna() calls above.
print(df)

   First Score  Second Score  Third Score Fourth Score
0        100.0          30.0           52          NaN
1         90.0           NaN           40         <NA>
2          NaN          45.0           80          NaN
3         95.0          56.0           98           65
