<a href="https://colab.research.google.com/github/ayushd05/python/blob/main/Data_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Handling Missing Data


In [1]:
import numpy as np
import pandas as pd


In [2]:
# Detecting null values with pd.isnull() / pd.isna()

In [3]:
# np.nan is recognised as a missing value.
pd.isnull(np.nan)

True

In [4]:
# None is treated as missing as well.
pd.isnull(None)

True

In [5]:
# pd.isna gives the same answer as pd.isnull for np.nan.
pd.isna(np.nan)

True

In [6]:
# pd.isna also reports None as missing.
pd.isna(None)

True

In [7]:
# The opposite checks also exist: pd.notnull()

In [8]:
# None is missing, so notnull is False.
pd.notnull(None)

False

In [9]:
# np.nan is missing, so notnull is False.
pd.notnull(np.nan)


False

In [10]:
# An ordinary number is not null.
pd.notnull(3)

True

In [11]:
# These functions also work element-wise on Series and DataFrames:
pd.isnull(pd.Series([1, np.nan, 7]))

0    False
1     True
2    False
dtype: bool

In [12]:
# Element-wise notnull on the same values: True wherever a value is present.
pd.notnull(pd.Series([1, np.nan, 7]))

0     True
1    False
2     True
dtype: bool

In [13]:
# On a DataFrame, isnull returns a boolean frame of the same shape
# (True where a cell is NaN).
pd.isnull(pd.DataFrame({
    'Column A' : [1 , np.nan, 7],
    'Column B' : [np.nan , 2, 3],
    'Column c' : [np.nan, 2, np.nan]
}))

Unnamed: 0,Column A,Column B,Column c
0,False,True,True
1,True,False,False
2,False,False,True


In [14]:
# Sample series used in the cells below: four numbers with two
# consecutive missing values at positions 3 and 4.
s = pd.Series(
    [1, 2, 3, np.nan, np.nan, 4]
)

In [15]:
# Boolean mask: True where s holds a value, False where it is NaN.
pd.notnull(s)

0     True
1     True
2     True
3    False
4    False
5     True
dtype: bool

In [16]:
# True counts as 1 when summed, so this is the number of non-null cells.
pd.notnull(s).sum()

4

In [17]:
# Summing the isnull mask counts the missing cells.
pd.isnull(s).sum()

2

In [18]:
# Boolean-mask indexing keeps only the non-null entries; the original
# index labels (0, 1, 2, 5) are preserved.
s[pd.notnull(s)]

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

In [19]:
# The same check is available as a Series method.
s.isnull()

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

In [20]:
# Method form of pd.notnull(s).
s.notnull()

0     True
1     True
2     True
3    False
4    False
5     True
dtype: bool

In [21]:
# Select the non-null entries using the method-form mask.
s[s.notnull()]

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

In [22]:
#dropping null values
s.dropna()

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

In [23]:
# Sample frame for the dropna/fillna examples:
# Column A has two NaNs, Column B has one, Column C has none.
df = pd.DataFrame(
    {
        'Column A': [1, np.nan, 30, np.nan],
        'Column B': [2, 8, 31, np.nan],
        'Column C': [5, 8, 34, 110],
    }
)

In [24]:
# Display the frame to see where the missing values sit.
df


Unnamed: 0,Column A,Column B,Column C
0,1.0,2.0,5
1,,8.0,8
2,30.0,31.0,34
3,,,110


In [25]:
# info() reports the non-null count and dtype of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Column A  2 non-null      float64
 1   Column B  3 non-null      float64
 2   Column C  4 non-null      int64  
dtypes: float64(2), int64(1)
memory usage: 224.0 bytes


In [26]:
# (number of rows, number of columns)
df.shape

(4, 3)

In [27]:
# Boolean frame: True where a cell is null.
df.isnull()

Unnamed: 0,Column A,Column B,Column C
0,False,False,False
1,True,False,False
2,False,False,False
3,True,True,False


In [28]:
# Boolean frame: True where a cell holds a value.
df.notnull()

Unnamed: 0,Column A,Column B,Column C
0,True,True,True
1,False,True,True
2,True,True,True
3,False,False,True


In [29]:
# With no arguments, dropna() drops every row that contains at least one null value.
df.dropna()

Unnamed: 0,Column A,Column B,Column C
0,1.0,2.0,5
2,30.0,31.0,34


In [30]:
# Three-column frame whose middle row (index 1) is null in every column,
# i.e. the kind of row that dropna(how='all') removes.
# Each column needs its own key: repeating 'Column A' in the dict literal
# silently keeps only the last list and produces a single-column frame.
df2 = pd.DataFrame({
    'Column A': [1, np.nan, 30],
    'Column B': [2, np.nan, 31],
    'Column C': [np.nan, np.nan, 100],
})

In [31]:
# how='all' drops only rows in which every value is null.
# df has no such row (Column C is never null), so nothing is dropped here.
df.dropna(how='all')

Unnamed: 0,Column A,Column B,Column C
0,1.0,2.0,5
1,,8.0,8
2,30.0,31.0,34
3,,,110


In [32]:
# how='any' (the default behaviour) drops rows that contain at least one null value.
df.dropna(how='any')

Unnamed: 0,Column A,Column B,Column C
0,1.0,2.0,5
2,30.0,31.0,34


In [33]:
# The thresh parameter sets a threshold: the minimum number of non-null values a row (or column) must have to be kept:
# df.dropna(thresh=<minimum number of non-null values required to keep a row>)

In [34]:
# Show the frame again for reference before applying thresh.
df

Unnamed: 0,Column A,Column B,Column C
0,1.0,2.0,5
1,,8.0,8
2,30.0,31.0,34
3,,,110


In [35]:
# Keep only rows with at least 2 non-null values; row 3 has just one, so it is dropped.
df.dropna(thresh=2)

Unnamed: 0,Column A,Column B,Column C
0,1.0,2.0,5
1,,8.0,8
2,30.0,31.0,34


In [36]:
# Keep only columns with at least 3 non-null values; Column A has 2, so it is dropped.
df.dropna(thresh =3, axis = 'columns')

Unnamed: 0,Column B,Column C
0,2.0,5
1,8.0,8
2,31.0,34
3,,110


### Filling null values

In [37]:
#fillna is used to fill the null values
s.fillna(0)

0    1.0
1    2.0
2    3.0
3    0.0
4    0.0
5    4.0
dtype: float64

In [38]:
# Fill with the mean of the non-null values: (1 + 2 + 3 + 4) / 4 = 2.5.
s.fillna(s.mean())

0    1.0
1    2.0
2    3.0
3    2.5
4    2.5
5    4.0
dtype: float64

In [39]:
# object.fillna(method='ffill') (equivalently object.ffill()) fills each null with the last valid value above it
# object.fillna(method='bfill') (equivalently object.bfill()) fills each null with the next valid value below it

In [40]:
# Forward fill: each null takes the last valid value before it.
# Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
s.ffill()

0    1.0
1    2.0
2    3.0
3    3.0
4    3.0
5    4.0
dtype: float64

In [41]:
# Backward fill: each null takes the next valid value after it.
# Series.bfill() replaces the deprecated fillna(method='bfill') spelling.
s.bfill()

0    1.0
1    2.0
2    3.0
3    4.0
4    4.0
5    4.0
dtype: float64

In [42]:
# Filling values on DataFrames
df


Unnamed: 0,Column A,Column B,Column C
0,1.0,2.0,5
1,,8.0,8
2,30.0,31.0,34
3,,,110


In [43]:
# A dict gives a separate fill value per column. Column C has no nulls here,
# so its mean is computed but never used as a fill.
df.fillna({ 'Column A': 0, 'Column B': 99, 'Column C' : df['Column C'].mean()})

Unnamed: 0,Column A,Column B,Column C
0,1.0,2.0,5
1,0.0,8.0,8
2,30.0,31.0,34
3,0.0,99.0,110


In [44]:
# Forward-fill along axis=0: each null takes the last valid value above it in the
# same column (a vertical fill). axis=1 would instead fill across each row (horizontal).
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
df.ffill(axis=0)

Unnamed: 0,Column A,Column B,Column C
0,1.0,2.0,5
1,1.0,8.0,8
2,30.0,31.0,34
3,30.0,31.0,110


In [45]:
# Backward-fill along axis=1: each null takes the next valid value to its right in
# the same row. DataFrame.bfill() replaces the deprecated fillna(method='bfill') spelling.
df.bfill(axis=1)

Unnamed: 0,Column A,Column B,Column C
0,1.0,2.0,5.0
1,8.0,8.0,8.0
2,30.0,31.0,34.0
3,110.0,110.0,110.0


### Cleaning Not Null values

In [46]:
# Sample frame whose values are present but not all plausible:
# 'Sex' contains 'D' and '?', and one 'Age' is 290.
df = pd.DataFrame(
    {
        'Sex': ['M', 'F', 'F', 'D', '?'],
        'Age': [29, 30, 24, 290, 25],
    }
)

In [47]:
# Display the frame with the suspicious values.
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


In [48]:
# List the distinct values to spot unexpected categories.
df['Sex'].unique()

array(['M', 'F', 'D', '?'], dtype=object)

In [49]:
# replace() returns a new Series with 'D' mapped to 'F'; df itself is not modified.
df['Sex'].replace('D', 'F')

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [50]:
# Count how often each value occurs in the (still unmodified) column.
df['Sex'].value_counts()

F    2
M    1
D    1
?    1
Name: Sex, dtype: int64

In [51]:
# A dict maps several values at once; keys absent from the data ('N' here) have no effect.
df['Sex'].replace({'D': 'F',  'N': 'M'})

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [52]:
# Select the rows whose Age exceeds 100.
df[df['Age']>100]

Unnamed: 0,Sex,Age
3,D,290


In [53]:
# Ages above 100 are presumably data-entry errors with an extra digit (e.g. 290 for 29)
# -- confirm against the data source. Scale them down by a factor of 10.
# The mask is computed once and reused. Floor division keeps the column's integer
# dtype; true division would assign floats into an integer column.
implausible_age = df['Age'] > 100
df.loc[implausible_age, 'Age'] = df.loc[implausible_age, 'Age'] // 10

In [54]:
# Display the frame after the Age correction.
df


Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,29
4,?,25
