# Handling missing values

In [38]:
import pandas as pd

In [39]:
# Read the landslide records from the project's data folder and preview
# the first five rows to get a feel for the columns.
df = pd.read_csv(filepath_or_buffer="./data/landslides.csv")
df.head(n=5)

Unnamed: 0,id,date,time,country_name,state/province,population,landslide_type,trigger,fatalities
0,34,3/2/07,Night,United States,Virginia,16000,Landslide,Rain,
1,42,3/22/07,,United States,Ohio,17288,Landslide,Rain,
2,56,4/6/07,,United States,Pennsylvania,15930,Landslide,Rain,
3,59,4/14/07,,Canada,Quebec,42786,Riverbank collapse,Rain,
4,61,4/15/07,,United States,Kentucky,6903,Landslide,Downpour,0.0


## Check and handle missing values

In [40]:
# Count the missing entries in each column to decide how to treat them.
df.isna().sum(axis=0)

id                   0
date                 3
time              1064
country_name         0
state/province       1
population           0
landslide_type       1
trigger              2
fatalities         247
dtype: int64

In [41]:
# These four columns are each missing in only a handful of rows (see the
# counts above), so the affected rows are dropped rather than imputed.
# Rebind the result instead of using inplace=True so the step is an explicit
# "new frame from old frame" transformation.
df = df.dropna(
    subset=['trigger', 'landslide_type', 'state/province', 'date'],
)

In [42]:
# Re-count missing entries per column now that incomplete rows are gone.
df.isna().sum(axis=0)

id                   0
date                 0
time              1062
country_name         0
state/province       0
population           0
landslide_type       0
trigger              0
fatalities         246
dtype: int64

In [43]:
# Tally the distinct values in the 'time' column; the result mixes
# free-text labels (e.g. 'Night') with clock times in several formats.
df['time'].value_counts()

Night            96
Morning          87
Afternoon        58
Early morning    36
3:00:00          12
                 ..
1:13              1
9:40:00           1
11:50:00          1
                  1
21:06             1
Name: time, Length: 159, dtype: int64

In [44]:
# Replace missing fatality counts with the mean of the observed values.
# NOTE(review): the observed counts are heavily skewed (mostly 0 with a few
# very large values), so the mean may not be a representative fill value --
# consider whether the median would suit this column better.
mean = df['fatalities'].mean()
df['fatalities'] = df['fatalities'].fillna(value=mean)

In [45]:
# Inspect the distribution of fatality counts after imputation; the filled-in
# rows show up as a single non-integer value.
df['fatalities'].value_counts()

0.000000      1179
1.465278       246
1.000000        49
2.000000        47
3.000000        40
4.000000        19
5.000000        17
6.000000        12
8.000000        10
7.000000         9
9.000000         6
13.000000        6
10.000000        6
11.000000        6
23.000000        4
14.000000        4
12.000000        3
17.000000        2
25.000000        2
20.000000        2
92.000000        1
71.000000        1
48.000000        1
32.000000        1
16.000000        1
15.000000        1
91.000000        1
29.000000        1
28.000000        1
19.000000        1
26.000000        1
21.000000        1
27.000000        1
68.000000        1
18.000000        1
24.000000        1
280.000000       1
Name: fatalities, dtype: int64

In [46]:
# Label unknown event times explicitly.
# A plain fillna() only catches NaN, but the value counts for this column
# also show an entry whose label is blank, so empty / whitespace-only strings
# are masked to NaN first and then filled together with the real NaNs.
time_is_blank = df['time'].str.strip().eq('')  # NaN rows evaluate to False
df['time'] = df['time'].mask(time_is_blank).fillna('Not Known')
df

Unnamed: 0,id,date,time,country_name,state/province,population,landslide_type,trigger,fatalities
0,34,3/2/07,Night,United States,Virginia,16000,Landslide,Rain,1.465278
1,42,3/22/07,Not Known,United States,Ohio,17288,Landslide,Rain,1.465278
2,56,4/6/07,Not Known,United States,Pennsylvania,15930,Landslide,Rain,1.465278
3,59,4/14/07,Not Known,Canada,Quebec,42786,Riverbank collapse,Rain,1.465278
4,61,4/15/07,Not Known,United States,Kentucky,6903,Landslide,Downpour,0.000000
...,...,...,...,...,...,...,...,...,...
1687,7534,11/29/15,Not Known,United States,North Carolina,1646,Mudslide,Unknown,0.000000
1689,7537,2/22/16,0:00,United States,West Virginia,51400,Mudslide,Unknown,0.000000
1690,7539,2/23/16,Not Known,United States,West Virginia,2406,Landslide,Rain,0.000000
1691,7540,2/26/16,21:06,United States,West Virginia,1048,Rockfall,Unknown,0.000000
