In [1]:
import pandas as pd
import numpy as np

In [2]:
# Sample records that mix several flavours of "missing":
# real NaN, None, and the placeholder strings 'NA' / 'Missing' (last row).
people = {
    "first": [
        "Corey", "Jane", "John", "Chris",
        np.nan, None, "NA",
    ],
    "last": [
        "Schafer", "Doe", "Doe", "Schafer",
        np.nan, np.nan, "Missing",
    ],
    "email": [
        "CoreyMSchafer@gmail.com", "JaneDoe@email.com", "JohnDoe@email.com",
        None, np.nan, "Anonymous@email.com", "NA",
    ],
    # Ages are deliberately stored as strings so they can be cast later.
    "age": ["33", "55", "63", "36", None, None, "Missing"],
    # A column that is NaN in every one of the seven rows.
    "NAN": [np.nan] * 7,
}

In [3]:
# Build the DataFrame: each dict key becomes a column and each list position a row.
df = pd.DataFrame(people)

In [4]:
# Inspect the raw frame. Row 6 holds the placeholder strings 'NA' and 'Missing',
# which are ordinary text to pandas rather than real missing values.
df

Unnamed: 0,first,last,email,age,NAN
0,Corey,Schafer,CoreyMSchafer@gmail.com,33,
1,Jane,Doe,JaneDoe@email.com,55,
2,John,Doe,JohnDoe@email.com,63,
3,Chris,Schafer,,36,
4,,,,,
5,,,Anonymous@email.com,,
6,,Missing,,Missing,


In [5]:
# Row 6 uses the custom placeholder strings 'NA' and 'Missing' for missing data.
# Convert both to real NaN so that dropna / isna / fillna recognise them.
# A single replace() call with a list handles both placeholders in one pass, and
# reassigning the result (instead of inplace=True) keeps the step explicit and
# safe to re-run.
df = df.replace(['NA', 'Missing'], np.nan)

In [6]:
# Re-display the frame to confirm the placeholder strings in row 6 are now NaN.
df

Unnamed: 0,first,last,email,age,NAN
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0,
1,Jane,Doe,JaneDoe@email.com,55.0,
2,John,Doe,JohnDoe@email.com,63.0,
3,Chris,Schafer,,36.0,
4,,,,,
5,,,Anonymous@email.com,,
6,,,,,


In [7]:
# With the defaults (axis=0, how="any") a row is dropped if it contains any NaN.
# Every row has a NaN in the all-NaN 'NAN' column, so the result is empty.
df.dropna()

Unnamed: 0,first,last,email,age,NAN


In [8]:
# how="any" - drop a row if at least one of its values is NaN (the default)
# how="all" - drop a row only if every one of its values is NaN

# Rows 4 and 6 are NaN in every column, so only those two rows are dropped.
df.dropna(axis="index", how="all")

Unnamed: 0,first,last,email,age,NAN
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0,
1,Jane,Doe,JaneDoe@email.com,55.0,
2,John,Doe,JohnDoe@email.com,63.0,
3,Chris,Schafer,,36.0,
5,,,Anonymous@email.com,,


In [9]:
# axis="columns" applies the same rule to columns instead of rows:
# with how="all" a column is dropped only if every value in it is NaN (here, just 'NAN').
df.dropna(axis="columns", how="all")

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [10]:
# With how="any" a column is dropped if it contains at least one NaN.
# Rows 4 and 6 are NaN in every column, so every column has a NaN and is dropped;
# only the index (0-6) is left in the result.
df.dropna(axis="columns", how="any")

0
1
2
3
4
5
6


In [11]:
# Use subset to look for NaN only in particular columns.
df.dropna(axis="index", how="any", subset=['email'])
# Rows 3, 4 and 6 are dropped because their email value is missing (None/NaN).
# NOTE(review): the saved output below has no 'NAN' column although no cell above
# removes it from df - the notebook was probably run out of order; re-run from a
# fresh kernel to confirm.

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
5,,,Anonymous@email.com,


In [12]:
# With how="all" and a subset, a row is dropped only if ALL of the listed columns
# ('last' and 'email') are NaN. Row 3 is kept because only its email is missing;
# rows 4 and 6 are dropped because both values are missing.
df.dropna(axis="index", how="all", subset=['last','email'])


Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
5,,,Anonymous@email.com,


In [14]:
# isna() returns a boolean mask with the same shape as df: True wherever a value is missing.
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [16]:
# fillna() returns a copy in which every NaN is replaced by the given value
# (here the string 'Missing'); df itself is unchanged because the result is not assigned.
df.fillna('Missing')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,Missing,36
4,Missing,Missing,Missing,Missing
5,Missing,Missing,Anonymous@email.com,Missing
6,Missing,Missing,Missing,Missing


In [17]:
# Casting data types: first check the current dtype of each column.
# age holds strings, so it is reported as object rather than a numeric dtype.
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [18]:
# age is stored as strings (object dtype), so convert it to a numeric type.
# Use float rather than int: NaN is itself a float, so casting to int raises
# an error when the column contains missing values.

df["age"] = df["age"].astype(float)

In [19]:
# Verify the cast: age should now be float64.
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [20]:
# mean() skips NaN by default, so this averages only the four known ages:
# (33 + 55 + 63 + 36) / 4 = 46.75
df["age"].mean()

46.75