In [1]:
import numpy as np
import pandas as pd

## Missing data handling

### 1) df.isna() and df.isnull()

In [2]:

# Small demo frame: 'toy' has one missing string (NaN) and 'born' has two
# missing timestamps (NaT), so both kinds of "missing" are represented.
df = pd.DataFrame(
    dict(
        name=['Alfred', 'Batman', 'Catwoman'],
        toy=[np.nan, 'Batmobile', 'Bullwhip'],
        born=[pd.NaT, pd.Timestamp("1940-04-25"), pd.NaT],
    )
)
df

Unnamed: 0,name,toy,born
0,Alfred,,NaT
1,Batman,Batmobile,1940-04-25
2,Catwoman,Bullwhip,NaT


In [3]:
# Element-wise missing-value mask: True wherever the cell is NaN/NaT.
df.isna()

Unnamed: 0,name,toy,born
0,False,True,True
1,False,False,False
2,False,False,True


In [4]:
# Summing the boolean mask column-wise gives the number of missing values per column.
df.isna().sum()

name    0
toy     1
born    2
dtype: int64

In [5]:
# Summing a second time collapses the per-column counts into the total number of missing cells.
df.isna().sum().sum()

3

In [6]:
# isnull() produces the same mask as isna().
df.isnull()

Unnamed: 0,name,toy,born
0,False,True,True
1,False,False,False
2,False,False,True


In [7]:
# Total number of missing cells, this time via isnull().
df.isnull().sum().sum()

3

In [8]:
# The same check on a single column returns a boolean Series.
df.born.isnull()

0     True
1    False
2     True
Name: born, dtype: bool

In [9]:
# Number of missing entries in the 'born' column.
df.born.isnull().sum()

2

### 2) df.notna() and df.notnull()

In [10]:
# Inverse of isnull(): True wherever a value is present.
df.notnull()

Unnamed: 0,name,toy,born
0,True,False,False
1,True,True,True
2,True,True,False


In [11]:
# Number of non-missing values per column.
df.notnull().sum()

name    3
toy     2
born    1
dtype: int64

In [12]:
# NOTE(review): this repeats the previous cell verbatim; possibly `.sum().sum()`
# (total non-missing cells) was intended — confirm with the author.
df.notnull().sum()

name    3
toy     2
born    1
dtype: int64

In [13]:
# notna() produces the same mask as notnull().
df.notna()

Unnamed: 0,name,toy,born
0,True,False,False
1,True,True,True
2,True,True,False


In [14]:
# Presence mask for the single 'toy' column.
df.toy.notna()

0    False
1     True
2     True
Name: toy, dtype: bool

### 3) df.dropna()

In [15]:
# Signature for reference:
# DataFrame.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)

# Load the weather sample (path is relative to the notebook's working directory);
# note that `df` is rebound here and no longer refers to the Alfred/Batman frame.
df = pd.read_csv("Dataset/weather_data.csv")

# Preview the first five rows only.
df.head()

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,,9.0,Sunny
2,1/5/2017,28.0,,Snow
3,1/6/2017,,7.0,
4,1/7/2017,32.0,,Rain


In [16]:
# Missing values per column in the weather data.
df.isnull().sum()

day            0
temperature    4
windspeed      4
event          2
dtype: int64

In [17]:
# Default dropna(): drop every row that contains at least one missing value.
# The result is a new frame; `df` itself is unchanged.
df.dropna()

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
7,1/10/2017,34.0,8.0,Cloudy
8,1/11/2017,40.0,12.0,Sunny


In [18]:
# axis='columns' drops every column that contains a missing value; only 'day' is complete here.
df.dropna(axis='columns')

Unnamed: 0,day
0,1/1/2017
1,1/4/2017
2,1/5/2017
3,1/6/2017
4,1/7/2017
5,1/8/2017
6,1/9/2017
7,1/10/2017
8,1/11/2017


In [19]:
# how='all' drops a row only when every value in it is missing.
# 'day' is always present, so no row qualifies and nothing is removed.
df.dropna(how='all')

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,,9.0,Sunny
2,1/5/2017,28.0,,Snow
3,1/6/2017,,7.0,
4,1/7/2017,32.0,,Rain
5,1/8/2017,,,Sunny
6,1/9/2017,,,
7,1/10/2017,34.0,8.0,Cloudy
8,1/11/2017,40.0,12.0,Sunny


In [20]:
df.dropna(thresh=2) # thresh=2 keeps only rows with at least 2 non-null values; pass axis=1 to apply the same rule to columns

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,,9.0,Sunny
2,1/5/2017,28.0,,Snow
3,1/6/2017,,7.0,
4,1/7/2017,32.0,,Rain
5,1/8/2017,,,Sunny
7,1/10/2017,34.0,8.0,Cloudy
8,1/11/2017,40.0,12.0,Sunny


In [21]:
# Exam scores with gaps: 'Third Score' is complete, while 'Fourth Score'
# has a value only in the last row.
dict2 = {
    'First Score': [100, 90, np.nan, 95],
    'Second Score': [30, np.nan, 45, 56],
    'Third Score': [52, 40, 80, 98],
    'Fourth Score': [np.nan, np.nan, np.nan, 65],
}
my_dict2 = pd.DataFrame(dict2)

In [22]:
# Display the score frame.
my_dict2

Unnamed: 0,First Score,Second Score,Third Score,Fourth Score
0,100.0,30.0,52,
1,90.0,,40,
2,,45.0,80,
3,95.0,56.0,98,65.0


In [23]:
# Row 3 is the only row without a missing value, so it is the only one kept.
my_dict2.dropna()

Unnamed: 0,First Score,Second Score,Third Score,Fourth Score
3,95.0,56.0,98,65.0


In [24]:
# Blank out the last remaining 'Fourth Score' value so the whole column is NaN.
# This modifies my_dict2 in place.
my_dict2.loc[3, 'Fourth Score'] = np.nan
my_dict2

Unnamed: 0,First Score,Second Score,Third Score,Fourth Score
0,100.0,30.0,52,
1,90.0,,40,
2,,45.0,80,
3,95.0,56.0,98,


In [25]:
# With axis='columns' and how='all', only columns that are entirely missing are dropped,
# which removes 'Fourth Score' and keeps the partially filled columns.
my_dict2.dropna(axis='columns', how='all')

Unnamed: 0,First Score,Second Score,Third Score
0,100.0,30.0,52
1,90.0,,40
2,,45.0,80
3,95.0,56.0,98


In [26]:
# Show the full weather frame again; the earlier dropna() calls returned copies and left it intact.
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,,9.0,Sunny
2,1/5/2017,28.0,,Snow
3,1/6/2017,,7.0,
4,1/7/2017,32.0,,Rain
5,1/8/2017,,,Sunny
6,1/9/2017,,,
7,1/10/2017,34.0,8.0,Cloudy
8,1/11/2017,40.0,12.0,Sunny


In [27]:
df.dropna(subset=['windspeed', 'event']) # subset=[...] limits the null check to these columns: a row is dropped only if 'windspeed' or 'event' is missing

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,,9.0,Sunny
7,1/10/2017,34.0,8.0,Cloudy
8,1/11/2017,40.0,12.0,Sunny


In [28]:
# Persist the filtered result: keep only rows where both 'windspeed' and 'event'
# are present. Rebinding `df` (instead of inplace=True) makes the state change
# explicit and keeps the call chainable.
df = df.dropna(subset=['windspeed', 'event'])

In [29]:
# df now holds only the rows where both 'windspeed' and 'event' are present.
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,,9.0,Sunny
7,1/10/2017,34.0,8.0,Cloudy
8,1/11/2017,40.0,12.0,Sunny


### 4) df.fillna()

In [30]:
# Signature for reference:
# DataFrame.fillna(value=None, method=None, axis=None, inplace=False, limit=None, downcast=None)

# Numeric demo frame: column C is entirely missing and row 2 is an all-NaN row.
df = pd.DataFrame(
    [
        [np.nan, 2, np.nan, 0],
        [3, 4, np.nan, 1],
        [np.nan, np.nan, np.nan, np.nan],
        [np.nan, 3, np.nan, 4],
    ],
    columns=["A", "B", "C", "D"],
)
df

Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,,1.0
2,,,,
3,,3.0,,4.0


In [31]:
# Replace every NaN in the frame with a single scalar.
df.fillna(value=0)

Unnamed: 0,A,B,C,D
0,0.0,2.0,0.0,0.0
1,3.0,4.0,0.0,1.0
2,0.0,0.0,0.0,0.0
3,0.0,3.0,0.0,4.0


In [32]:
# Load the five-row company sample (path is relative to the notebook's working directory).
data = pd.read_csv("Dataset/Furtune_5.csv")

# Small enough to display in full.
data

Unnamed: 0,ID,Name,Companies,Profit,Growth
0,1,Lamtone,IT Servics,5274553,30%
1,2,Stripfind,Financial Services,23648990,20%
2,3,,Health,47689453,
3,4,,IT Servics,23562549,
4,5,Mark,,5674893,21%


In [33]:
# A scalar fill is applied to every column, so the text columns (Name, Companies, Growth) also receive 0.
data.fillna(value=0)

Unnamed: 0,ID,Name,Companies,Profit,Growth
0,1,Lamtone,IT Servics,5274553,30%
1,2,Stripfind,Financial Services,23648990,20%
2,3,0,Health,47689453,0
3,4,0,IT Servics,23562549,0
4,5,Mark,0,5674893,21%


In [34]:
# Per-column fill values: each key is a column name and its value replaces the NaNs in that column.
data.fillna(value={"Name":"none",'Companies':2,"Growth":20})

Unnamed: 0,ID,Name,Companies,Profit,Growth
0,1,Lamtone,IT Servics,5274553,30%
1,2,Stripfind,Financial Services,23648990,20%
2,3,none,Health,47689453,20
3,4,none,IT Servics,23562549,20
4,5,Mark,2,5674893,21%


In [35]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() is the supported spelling; fillna(method="ffill") is deprecated.
data.ffill()

Unnamed: 0,ID,Name,Companies,Profit,Growth
0,1,Lamtone,IT Servics,5274553,30%
1,2,Stripfind,Financial Services,23648990,20%
2,3,Stripfind,Health,47689453,20%
3,4,Stripfind,IT Servics,23562549,20%
4,5,Mark,IT Servics,5674893,21%


In [36]:
# Backward fill: each NaN takes the next valid value below it in the same column.
# A NaN in the last row has nothing below it and therefore stays missing.
# DataFrame.bfill() is the supported spelling; fillna(method="bfill") is deprecated.
data.bfill()

Unnamed: 0,ID,Name,Companies,Profit,Growth
0,1,Lamtone,IT Servics,5274553,30%
1,2,Stripfind,Financial Services,23648990,20%
2,3,Mark,Health,47689453,21%
3,4,Mark,IT Servics,23562549,21%
4,5,Mark,,5674893,21%


In [37]:
# Backward fill along each row: a NaN takes the value from the next column to its right.
# NaNs in the last column have no right-hand neighbour and stay missing.
# DataFrame.bfill() replaces the deprecated fillna(method="bfill") spelling.
data.bfill(axis='columns')

Unnamed: 0,ID,Name,Companies,Profit,Growth
0,1,Lamtone,IT Servics,5274553,30%
1,2,Stripfind,Financial Services,23648990,20%
2,3,Health,Health,47689453,
3,4,IT Servics,IT Servics,23562549,
4,5,Mark,5674893,5674893,21%


In [38]:
# Forward fill along each row: a NaN takes the value from the nearest valid column to its left.
# DataFrame.ffill() replaces the deprecated fillna(method="ffill") spelling.
data.ffill(axis='columns')

Unnamed: 0,ID,Name,Companies,Profit,Growth
0,1,Lamtone,IT Servics,5274553,30%
1,2,Stripfind,Financial Services,23648990,20%
2,3,3,Health,47689453,47689453
3,4,4,IT Servics,23562549,23562549
4,5,Mark,Mark,5674893,21%


In [39]:
# limit=1 fills at most one consecutive NaN per gap: in a run of two missing
# values only the one adjacent to the valid value below it gets filled.
# DataFrame.bfill() replaces the deprecated fillna(method="bfill") spelling.
data.bfill(limit=1)

Unnamed: 0,ID,Name,Companies,Profit,Growth
0,1,Lamtone,IT Servics,5274553,30%
1,2,Stripfind,Financial Services,23648990,20%
2,3,,Health,47689453,
3,4,Mark,IT Servics,23562549,21%
4,5,Mark,,5674893,21%


### 5) df.replace()

In [40]:
# Signature for reference:
# DataFrame.replace(to_replace=None, value=NoDefault.no_default, inplace=False, limit=None, regex=False, method=NoDefault.no_default)


# Load the ten-row company sample (path is relative to the notebook's working directory).
data2 = pd.read_csv("Dataset/Fortune_10.csv")
data2.head()

Unnamed: 0,ID,Name,Industry,Inception,Revenue,Expenses,Profit,Growth
0,1,Lamtone,IT Services,2009,"$11,757,018","6,482,465 Dollars",5274553,30%
1,2,Stripfind,Financial Services,2010,"$12,329,371","916,455 Dollars",11412916,20%
2,3,Canecorporation,Health,2012,"$10,597,009","7,591,189 Dollars",3005820,7%
3,4,Mattouch,IT Services,2013,"$14,026,934","7,429,377 Dollars",6597557,26%
4,5,Techdrill,Health,2009,"$10,573,990","7,435,363 Dollars",3138627,8%


In [41]:
# Missing values per column; only 'Expenses' has a gap.
data2.isnull().sum()

ID           0
Name         0
Industry     0
Inception    0
Revenue      0
Expenses     1
Profit       0
Growth       0
dtype: int64

In [42]:
# Replace every cell equal to "Health" with "Finance", in any column.
data2.replace(to_replace="Health", value="Finance")

Unnamed: 0,ID,Name,Industry,Inception,Revenue,Expenses,Profit,Growth
0,1,Lamtone,IT Services,2009,"$11,757,018","6,482,465 Dollars",5274553,30%
1,2,Stripfind,Financial Services,2010,"$12,329,371","916,455 Dollars",11412916,20%
2,3,Canecorporation,Finance,2012,"$10,597,009","7,591,189 Dollars",3005820,7%
3,4,Mattouch,IT Services,2013,"$14,026,934","7,429,377 Dollars",6597557,26%
4,5,Techdrill,Finance,2009,"$10,573,990","7,435,363 Dollars",3138627,8%
5,6,Techline,Finance,2006,"$13,898,119","5,470,303 Dollars",8427816,23%
6,7,Cityace,Finance,2010,"$9,254,614","6,249,498 Dollars",3005116,6%
7,8,Kayelectronics,Finance,2009,"$9,451,943","3,878,113 Dollars",5573830,4%
8,9,Ganzlax,IT Services,2011,"$14,001,180",,11901180,18%
9,10,Trantraxlax,Government Services,2011,"$11,088,336","5,635,276 Dollars",5453060,7%


In [43]:
# Replace each "Health" cell with the next non-"Health" value below it in the same column.
# The `method=` keyword of replace() is deprecated, so the same result is built explicitly:
#   - mask(...) blanks the "Health" cells and bfill() pulls the next value upward;
#   - where(...) takes that filled value only for the former "Health" cells, so
#     NaNs that were already in the data (e.g. in 'Expenses') are left untouched.
data2.where(data2.ne("Health"), data2.mask(data2.eq("Health")).bfill())

Unnamed: 0,ID,Name,Industry,Inception,Revenue,Expenses,Profit,Growth
0,1,Lamtone,IT Services,2009,"$11,757,018","6,482,465 Dollars",5274553,30%
1,2,Stripfind,Financial Services,2010,"$12,329,371","916,455 Dollars",11412916,20%
2,3,Canecorporation,IT Services,2012,"$10,597,009","7,591,189 Dollars",3005820,7%
3,4,Mattouch,IT Services,2013,"$14,026,934","7,429,377 Dollars",6597557,26%
4,5,Techdrill,IT Services,2009,"$10,573,990","7,435,363 Dollars",3138627,8%
5,6,Techline,IT Services,2006,"$13,898,119","5,470,303 Dollars",8427816,23%
6,7,Cityace,IT Services,2010,"$9,254,614","6,249,498 Dollars",3005116,6%
7,8,Kayelectronics,IT Services,2009,"$9,451,943","3,878,113 Dollars",5573830,4%
8,9,Ganzlax,IT Services,2011,"$14,001,180",,11901180,18%
9,10,Trantraxlax,Government Services,2011,"$11,088,336","5,635,276 Dollars",5453060,7%


In [44]:
# Per-column form: to_replace names the value to look for in each column,
# and value gives the replacement for that same column.
data2.replace(to_replace={"Industry":"Health","Name":"Techdrill"}, value={"Industry":0,"Name":1})

Unnamed: 0,ID,Name,Industry,Inception,Revenue,Expenses,Profit,Growth
0,1,Lamtone,IT Services,2009,"$11,757,018","6,482,465 Dollars",5274553,30%
1,2,Stripfind,Financial Services,2010,"$12,329,371","916,455 Dollars",11412916,20%
2,3,Canecorporation,0,2012,"$10,597,009","7,591,189 Dollars",3005820,7%
3,4,Mattouch,IT Services,2013,"$14,026,934","7,429,377 Dollars",6597557,26%
4,5,1,0,2009,"$10,573,990","7,435,363 Dollars",3138627,8%
5,6,Techline,0,2006,"$13,898,119","5,470,303 Dollars",8427816,23%
6,7,Cityace,0,2010,"$9,254,614","6,249,498 Dollars",3005116,6%
7,8,Kayelectronics,0,2009,"$9,451,943","3,878,113 Dollars",5573830,4%
8,9,Ganzlax,IT Services,2011,"$14,001,180",,11901180,18%
9,10,Trantraxlax,Government Services,2011,"$11,088,336","5,635,276 Dollars",5453060,7%


In [45]:
# Same per-column targets, but a single scalar replacement shared by all of them.
data2.replace(to_replace={"Industry":"Health","Name":"Techdrill"}, value=0)

Unnamed: 0,ID,Name,Industry,Inception,Revenue,Expenses,Profit,Growth
0,1,Lamtone,IT Services,2009,"$11,757,018","6,482,465 Dollars",5274553,30%
1,2,Stripfind,Financial Services,2010,"$12,329,371","916,455 Dollars",11412916,20%
2,3,Canecorporation,0,2012,"$10,597,009","7,591,189 Dollars",3005820,7%
3,4,Mattouch,IT Services,2013,"$14,026,934","7,429,377 Dollars",6597557,26%
4,5,0,0,2009,"$10,573,990","7,435,363 Dollars",3138627,8%
5,6,Techline,0,2006,"$13,898,119","5,470,303 Dollars",8427816,23%
6,7,Cityace,0,2010,"$9,254,614","6,249,498 Dollars",3005116,6%
7,8,Kayelectronics,0,2009,"$9,451,943","3,878,113 Dollars",5573830,4%
8,9,Ganzlax,IT Services,2011,"$14,001,180",,11901180,18%
9,10,Trantraxlax,Government Services,2011,"$11,088,336","5,635,276 Dollars",5453060,7%


In [46]:
# Two short string columns used to demonstrate regex-based replace().
data3 = pd.DataFrame(
    {
        'A': ['bat', 'foo', 'bait'],
        'B': ['abc', 'bar', 'xyz'],
    }
)
data3

Unnamed: 0,A,B
0,bat,abc
1,foo,bar
2,bait,xyz


In [47]:
# regex=True treats to_replace as a pattern. ^ba.$ matches exactly three characters
# starting with "ba", so 'bat' and 'bar' are replaced while 'bait' is not.
data3.replace(to_replace=r'^ba.$', value='new', regex=True)

Unnamed: 0,A,B
0,new,abc
1,foo,new
2,bait,xyz


In [48]:
# Restrict the regex replacement to column 'A'; 'bar' in column 'B' is left untouched.
data3.replace(to_replace={'A': r'^ba.$'}, value={'A': 'new'}, regex=True)

Unnamed: 0,A,B
0,new,abc
1,foo,bar
2,bait,xyz


In [49]:
# Alternative spelling: pass the pattern via regex= instead of to_replace= plus regex=True.
data3.replace(regex=r'^ba.$', value='new')

Unnamed: 0,A,B
0,new,abc
1,foo,new
2,bait,xyz


### 6) df.interpolate()

In [50]:
# Signature for reference:
# DataFrame.interpolate(method='linear', axis=0, limit=None, inplace=False, limit_direction=None, 
#                       limit_area=None, downcast=None, **kwargs)

# Fill NaN values using an interpolation method.

# Please note that only method='linear' is supported for DataFrame/Series with a MultiIndex.

# Filling in NaN in a Series via linear interpolation.

# Series with a single gap at position 2; the NaN makes the dtype float64.
s = pd.Series([0, 1, np.nan, 3])
s

0    0.0
1    1.0
2    NaN
3    3.0
dtype: float64

In [51]:
# With no arguments the interpolation is linear: the gap between 1 and 3 becomes 2.
s.interpolate()

0    0.0
1    1.0
2    2.0
3    3.0
dtype: float64

In [52]:
# Mixed Series (strings, one float, NaN) with gaps of different lengths:
# single NaNs at positions 0, 2 and 8, and a run of three at positions 4-6.
s = pd.Series(
    [
        np.nan,
        "single_one",
        np.nan,
        "fill_two_more",
        np.nan,
        np.nan,
        np.nan,
        4.71,
        np.nan,
    ]
)
s

0              NaN
1       single_one
2              NaN
3    fill_two_more
4              NaN
5              NaN
6              NaN
7             4.71
8              NaN
dtype: object

In [53]:
# Fill NaN by padding (forward fill), filling at most two consecutive NaN per gap.
# The leading NaN has no earlier value and stays missing; in the run of three,
# only the first two are filled.
# Series.ffill() is used because interpolate(method='pad') is deprecated.
s.ffill(limit=2)

0              NaN
1       single_one
2       single_one
3    fill_two_more
4    fill_two_more
5    fill_two_more
6              NaN
7             4.71
8             4.71
dtype: object

In [54]:
# Filling in NaN in a Series via polynomial interpolation or splines:
#     both 'polynomial' and 'spline' methods require that you also specify an order (int).

# `s` is rebound to a new numeric Series with one gap at position 2.
s = pd.Series([0, 2, np.nan, 8])

# Second-order polynomial through the known points; the gap is filled with a
# value on that curve rather than the linear midpoint.
s.interpolate(method='polynomial', order=2)

0    0.000000
1    2.000000
2    4.666667
3    8.000000
dtype: float64

In [55]:
# Demo frame for column-wise interpolation: 'a' ends with a NaN, 'b' starts
# with one, and 'c' has a two-row gap in the middle.
df = pd.DataFrame(
    [
        [0.0, np.nan, -1.0, 1.0],
        [np.nan, 2.0, np.nan, np.nan],
        [2.0, 3.0, np.nan, 9.0],
        [np.nan, 4.0, -4.0, 16.0],
    ],
    columns=['a', 'b', 'c', 'd'],
)
df

Unnamed: 0,a,b,c,d
0,0.0,,-1.0,1.0
1,,2.0,,
2,2.0,3.0,,9.0
3,,4.0,-4.0,16.0


In [56]:
# Linear interpolation down each column (axis=0), filling in the forward direction only:
# interior gaps are interpolated, the trailing NaN in 'a' repeats the last valid value,
# and the leading NaN in 'b' is left missing because nothing precedes it.
df.interpolate(method='linear', limit_direction='forward', axis=0)

Unnamed: 0,a,b,c,d
0,0.0,,-1.0,1.0
1,1.0,2.0,-2.0,5.0
2,2.0,3.0,-3.0,9.0
3,2.0,4.0,-4.0,16.0
