In [59]:
import pandas as pd
import numpy as np
data = pd.read_csv('new.csv')
data

Unnamed: 0,Brand,Year,Kms Driven,City,Mileage
0,Maruti,,50000,Gurgaon,28
1,Hyundai,2014.0,30000,Delhi,27
2,Tata,,60000,,25
3,Mahindra,2015.0,25000,Delhi,26
4,Maruti,,10000,,28
5,Hyundai,2016.0,46000,Delhi,29
6,Renault,2014.0,31000,,24
7,Tata,2018.0,15000,,21
8,Maruti,2019.0,12000,Ghaziabad,24


Entries missing values are given the value NaN, short for "Not a Number". For technical reasons these NaN values are always of the float64 dtype.

To select NaN entries you can use pd.isnull() (or its companion pd.notnull())

In [13]:
data1 = pd.isnull(data)
data1

Unnamed: 0,Brand,Year,Kms Driven,City,Mileage
0,False,True,False,False,False
1,False,False,False,False,False
2,False,True,False,True,False
3,False,False,False,False,False
4,False,True,False,True,False
5,False,False,False,False,False
6,False,False,False,True,False
7,False,False,False,True,False
8,False,False,False,False,False


In [15]:
data2 = pd.notnull(data)
data2

Unnamed: 0,Brand,Year,Kms Driven,City,Mileage
0,True,False,True,True,True
1,True,True,True,True,True
2,True,False,True,False,True
3,True,True,True,True,True
4,True,False,True,False,True
5,True,True,True,True,True
6,True,True,True,False,True
7,True,True,True,False,True
8,True,True,True,True,True


In [17]:
data[pd.isnull(data.Year)]

Unnamed: 0,Brand,Year,Kms Driven,City,Mileage
0,Maruti,,50000,Gurgaon,28
2,Tata,,60000,,25
4,Maruti,,10000,,28


In [20]:
data[pd.notnull(data.City)]

Unnamed: 0,Brand,Year,Kms Driven,City,Mileage
0,Maruti,,50000,Gurgaon,28
1,Hyundai,2014.0,30000,Delhi,27
3,Mahindra,2015.0,25000,Delhi,26
5,Hyundai,2016.0,46000,Delhi,29
8,Maruti,2019.0,12000,Ghaziabad,24


Replacing missing values is a common operation. Pandas provides a really handy method for this problem: fillna(). fillna() provides a few different strategies for mitigating such data. For example, we can simply replace each NaN with an "Unknown"

In [23]:
data3 = data.fillna('Unknown')
data3

Unnamed: 0,Brand,Year,Kms Driven,City,Mileage
0,Maruti,Unknown,50000,Gurgaon,28
1,Hyundai,2014.0,30000,Delhi,27
2,Tata,Unknown,60000,Unknown,25
3,Mahindra,2015.0,25000,Delhi,26
4,Maruti,Unknown,10000,Unknown,28
5,Hyundai,2016.0,46000,Delhi,29
6,Renault,2014.0,31000,Unknown,24
7,Tata,2018.0,15000,Unknown,21
8,Maruti,2019.0,12000,Ghaziabad,24


In [24]:
data4 = data.Year.fillna(0)
data4

0       0.0
1    2014.0
2       0.0
3    2015.0
4       0.0
5    2016.0
6    2014.0
7    2018.0
8    2019.0
Name: Year, dtype: float64

or more specific:

In [26]:
new_df = data.fillna({'Year': 2000,
                     'City': 'Pune'})
new_df

Unnamed: 0,Brand,Year,Kms Driven,City,Mileage
0,Maruti,2000.0,50000,Gurgaon,28
1,Hyundai,2014.0,30000,Delhi,27
2,Tata,2000.0,60000,Pune,25
3,Mahindra,2015.0,25000,Delhi,26
4,Maruti,2000.0,10000,Pune,28
5,Hyundai,2016.0,46000,Delhi,29
6,Renault,2014.0,31000,Pune,24
7,Tata,2018.0,15000,Pune,21
8,Maruti,2019.0,12000,Ghaziabad,24


# Carry forward and Carry Backward

In [29]:
newdf = data.fillna(method ='ffill')
newdf

Unnamed: 0,Brand,Year,Kms Driven,City,Mileage
0,Maruti,,50000,Gurgaon,28
1,Hyundai,2014.0,30000,Delhi,27
2,Tata,2014.0,60000,Delhi,25
3,Mahindra,2015.0,25000,Delhi,26
4,Maruti,2015.0,10000,Delhi,28
5,Hyundai,2016.0,46000,Delhi,29
6,Renault,2014.0,31000,Delhi,24
7,Tata,2018.0,15000,Delhi,21
8,Maruti,2019.0,12000,Ghaziabad,24


In [30]:
newdf1 = data.fillna(method = 'bfill')
newdf1

Unnamed: 0,Brand,Year,Kms Driven,City,Mileage
0,Maruti,2014.0,50000,Gurgaon,28
1,Hyundai,2014.0,30000,Delhi,27
2,Tata,2015.0,60000,Delhi,25
3,Mahindra,2015.0,25000,Delhi,26
4,Maruti,2016.0,10000,Delhi,28
5,Hyundai,2016.0,46000,Delhi,29
6,Renault,2014.0,31000,Ghaziabad,24
7,Tata,2018.0,15000,Ghaziabad,21
8,Maruti,2019.0,12000,Ghaziabad,24


In [32]:
newdf2 = data.fillna(method = 'bfill',axis ='columns')
newdf2
#here the cursor will go cell by cell row wise and copy the values likewise

Unnamed: 0,Brand,Year,Kms Driven,City,Mileage
0,Maruti,50000.0,50000,Gurgaon,28
1,Hyundai,2014.0,30000,Delhi,27
2,Tata,60000.0,60000,25,25
3,Mahindra,2015.0,25000,Delhi,26
4,Maruti,10000.0,10000,28,28
5,Hyundai,2016.0,46000,Delhi,29
6,Renault,2014.0,31000,24,24
7,Tata,2018.0,15000,21,21
8,Maruti,2019.0,12000,Ghaziabad,24


In [35]:
newdf3 =data.fillna(method ='ffill', limit = 1)
newdf3
#limit will only carry forward one cell

Unnamed: 0,Brand,Year,Kms Driven,City,Mileage
0,Maruti,,50000,Gurgaon,28
1,Hyundai,2014.0,30000,Delhi,27
2,Tata,2014.0,60000,Delhi,25
3,Mahindra,2015.0,25000,Delhi,26
4,Maruti,2015.0,10000,Delhi,28
5,Hyundai,2016.0,46000,Delhi,29
6,Renault,2014.0,31000,Delhi,24
7,Tata,2018.0,15000,,21
8,Maruti,2019.0,12000,Ghaziabad,24


In [36]:
newdf4 = data.interpolate()
newdf4

Unnamed: 0,Brand,Year,Kms Driven,City,Mileage
0,Maruti,,50000,Gurgaon,28
1,Hyundai,2014.0,30000,Delhi,27
2,Tata,2014.5,60000,,25
3,Mahindra,2015.0,25000,Delhi,26
4,Maruti,2015.5,10000,,28
5,Hyundai,2016.0,46000,Delhi,29
6,Renault,2014.0,31000,,24
7,Tata,2018.0,15000,,21
8,Maruti,2019.0,12000,Ghaziabad,24


In [40]:
#interpolate took the middle value- Linear interpolation to come up with the values.
#see the interpolate methods in python documentation . by default it takes linear interpolation- middle value

In [41]:
#method = time will need date as index and then it will explore in time dimension. values close to dates will have values which are closer.

In [44]:
newdf5 = data.dropna() #it will drop the rows with na values
newdf5

Unnamed: 0,Brand,Year,Kms Driven,City,Mileage
1,Hyundai,2014.0,30000,Delhi,27
3,Mahindra,2015.0,25000,Delhi,26
5,Hyundai,2016.0,46000,Delhi,29
8,Maruti,2019.0,12000,Ghaziabad,24


In [49]:
#But, I dont want to drop all the rows. I want to drop rows only which have all the values as na then how?
newdf6 = data.dropna(how = 'all')
newdf6
#I will drop rows which has all na values. However our data set does not have such row

Unnamed: 0,Brand,Year,Kms Driven,City,Mileage
0,Maruti,,50000,Gurgaon,28
1,Hyundai,2014.0,30000,Delhi,27
2,Tata,,60000,,25
3,Mahindra,2015.0,25000,Delhi,26
4,Maruti,,10000,,28
5,Hyundai,2016.0,46000,Delhi,29
6,Renault,2014.0,31000,,24
7,Tata,2018.0,15000,,21
8,Maruti,2019.0,12000,Ghaziabad,24


In [52]:
newdf7 = data.dropna(thresh = 1)
newdf7
#threshold = 1 means that I need atleas 1 valid value or else i will drop it


Unnamed: 0,Brand,Year,Kms Driven,City,Mileage
0,Maruti,,50000,Gurgaon,28
1,Hyundai,2014.0,30000,Delhi,27
2,Tata,,60000,,25
3,Mahindra,2015.0,25000,Delhi,26
4,Maruti,,10000,,28
5,Hyundai,2016.0,46000,Delhi,29
6,Renault,2014.0,31000,,24
7,Tata,2018.0,15000,,21
8,Maruti,2019.0,12000,Ghaziabad,24


In [54]:
newdf8 = data.dropna(thresh = 4)
newdf8
#as row 2 had only three valid values so it is dropped

Unnamed: 0,Brand,Year,Kms Driven,City,Mileage
0,Maruti,,50000,Gurgaon,28
1,Hyundai,2014.0,30000,Delhi,27
3,Mahindra,2015.0,25000,Delhi,26
5,Hyundai,2016.0,46000,Delhi,29
6,Renault,2014.0,31000,,24
7,Tata,2018.0,15000,,21
8,Maruti,2019.0,12000,Ghaziabad,24


In [57]:
df = pd.read_csv('weather1.csv')
df

Unnamed: 0,day,city,temperature,windspeed,event
0,1/1/2017,new york,32,6,Rain
1,1/2/2017,new york,36,7,Sunny
2,1/3/2017,new york,28,12,Snow
3,1/4/2017,new york,99999,7,Sunny
4,1/1/2017,mumbai,90,5,Sunny
5,1/2/2017,mumbai,85,12,Fog
6,1/3/2017,mumbai,87,15,Fog
7,1/4/2017,mumbai,92,9999,Rain
8,1/1/2017,999,45,20,Sunny
9,1/2/2017,paris,50,13,Cloudy


To handle the special values like 9999 use replace method:


In [62]:
df1 = df.replace([9999,99999],np.NaN)
df1

Unnamed: 0,day,city,temperature,windspeed,event
0,1/1/2017,new york,32.0,6.0,Rain
1,1/2/2017,new york,36.0,7.0,Sunny
2,1/3/2017,new york,28.0,12.0,Snow
3,1/4/2017,new york,,7.0,Sunny
4,1/1/2017,mumbai,90.0,5.0,Sunny
5,1/2/2017,mumbai,85.0,12.0,Fog
6,1/3/2017,mumbai,87.0,15.0,Fog
7,1/4/2017,mumbai,92.0,,Rain
8,1/1/2017,999,45.0,20.0,Sunny
9,1/2/2017,paris,50.0,13.0,Cloudy


#also can replace value based on specific colum for that we need to give dictionary as input

In [63]:
df

Unnamed: 0,day,city,temperature,windspeed,event
0,1/1/2017,new york,32,6,Rain
1,1/2/2017,new york,36,7,Sunny
2,1/3/2017,new york,28,12,Snow
3,1/4/2017,new york,99999,7,Sunny
4,1/1/2017,mumbai,90,5,Sunny
5,1/2/2017,mumbai,85,12,Fog
6,1/3/2017,mumbai,87,15,Fog
7,1/4/2017,mumbai,92,9999,Rain
8,1/1/2017,999,45,20,Sunny
9,1/2/2017,paris,50,13,Cloudy


In [70]:
new_df = df.replace({
    'temperature' : 99999,
    'windspeed': 9999,
    'city':999},
    np.NaN
)
new_df

Unnamed: 0,day,city,temperature,windspeed,event
0,1/1/2017,new york,32.0,6.0,Rain
1,1/2/2017,new york,36.0,7.0,Sunny
2,1/3/2017,new york,28.0,12.0,Snow
3,1/4/2017,new york,,7.0,Sunny
4,1/1/2017,mumbai,90.0,5.0,Sunny
5,1/2/2017,mumbai,85.0,12.0,Fog
6,1/3/2017,mumbai,87.0,15.0,Fog
7,1/4/2017,mumbai,92.0,,Rain
8,1/1/2017,999,45.0,20.0,Sunny
9,1/2/2017,paris,50.0,13.0,Cloudy


In [72]:
new_df2 = df.replace('[A-Za-z]','',regex= True)
new_df2

Unnamed: 0,day,city,temperature,windspeed,event
0,1/1/2017,,32,6,
1,1/2/2017,,36,7,
2,1/3/2017,,28,12,
3,1/4/2017,,99999,7,
4,1/1/2017,,90,5,
5,1/2/2017,,85,12,
6,1/3/2017,,87,15,
7,1/4/2017,,92,9999,
8,1/1/2017,999.0,45,20,
9,1/2/2017,,50,13,


In [74]:
dataframe = pd.DataFrame({
    'score': ['exceptional','good','average','poor','exceptional','average'],
    'student' : ['Amit','Ajit','Rony','Akansha','Mayuri','Poonam']
})
dataframe

Unnamed: 0,score,student
0,exceptional,Amit
1,good,Ajit
2,average,Rony
3,poor,Akansha
4,exceptional,Mayuri
5,average,Poonam


In [75]:
dataframe.replace(['exceptional','good','average','poor'],['A','B','C','D'])

Unnamed: 0,score,student
0,A,Amit
1,B,Ajit
2,C,Rony
3,D,Akansha
4,A,Mayuri
5,C,Poonam


Using list can be pretty powerful in this case