# Additional cleaning and wrangling of Natural_Disasters_Final_Clean

Contents
- 01 Importing libraries and data
- 02 Cleaning
- 03 Wrangling
- 04 Subsetting




## 01 Importing libraries and data 

In [1]:
import pandas as pd
import numpy as np
import os


In [2]:
# Define path
#
# The data directory can be overridden per machine by setting the
# NATURAL_DISASTERS_DATA_DIR environment variable. When the variable is not
# set, the original hard-coded location is used, so existing runs behave
# exactly as before.

path = os.environ.get(
    'NATURAL_DISASTERS_DATA_DIR',
    r'//Users/amypalomino/Documents/June 22 Final Project/02 Data',
)

In [3]:
# Load the prepared natural-disasters dataset into a dataframe.
# index_col = False tells pandas not to use the first CSV column as the index.

source_csv = os.path.join(path, 'Prepared Data', 'Natural_Disasters_Final_Clean.csv')
df = pd.read_csv(source_csv, index_col = False)

## 02 Cleaning

In [4]:
# Preview the first five rows (five is the default for head)
df.head()

Unnamed: 0.1,Unnamed: 0,Dis No,Year,Seq,Disaster Group,Disaster Subgroup,Disaster Type,Disaster Subtype,Disaster Subsubtype,Event Name,...,Total Deaths,No Injured,No Affected,No Homeless,Total Affected,Reconstruction Costs US$,Insured Damages US$,Total Damages US$,CPI,Deaths
0,0,1970-0013-ARG,1970,13,Natural,Hydrological,Flood,,,,...,36.0,0.0,0.0,0.0,0.0,,,25000.0,15.001282,Low Death Toll
1,1,1970-0109-AUS,1970,109,Natural,Meteorological,Storm,Tropical cyclone,,Ada,...,13.0,0.0,0.0,0.0,0.0,,,72475.0,15.001282,Low Death Toll
2,2,1970-0044-BEN,1970,44,Natural,Hydrological,Flood,,,,...,0.0,0.0,0.0,0.0,0.0,,,200.0,15.001282,
3,3,1970-0063-BGD,1970,63,Natural,Meteorological,Storm,Tropical cyclone,,,...,300000.0,0.0,3648000.0,0.0,3648000.0,,,86400.0,15.001282,Very High Death Toll
4,4,1970-0026-BGD,1970,26,Natural,Meteorological,Storm,,,,...,17.0,0.0,110.0,0.0,110.0,,,,15.001282,Low Death Toll


In [5]:
# List the column labels of the dataframe as loaded
df.columns

Index(['Unnamed: 0', 'Dis No', 'Year', 'Seq', 'Disaster Group',
       'Disaster Subgroup', 'Disaster Type', 'Disaster Subtype',
       'Disaster Subsubtype', 'Event Name', 'Country', 'ISO', 'Region',
       'Continent', 'Location', 'Origin', 'Associated Dis', 'Associated Dis2',
       'OFDA Response', 'Appeal', 'Declaration', 'Aid Contribution',
       'Dis Mag Value', 'Dis Mag Scale', 'Latitude', 'Longitude', 'Start Year',
       'Start Month', 'Start Day', 'End Year', 'End Month', 'End Day',
       'Total Deaths', 'No Injured', 'No Affected', 'No Homeless',
       'Total Affected', 'Reconstruction Costs US$', 'Insured Damages US$',
       'Total Damages US$', 'CPI', 'Deaths'],
      dtype='object')

In [6]:
# Remove the 'Unnamed: 0' column (presumably an index saved by an earlier export — verify).
# errors = 'ignore' keeps the cell safe to re-run: without it, a second execution
# raises KeyError because the column has already been dropped.
df = df.drop(columns = ['Unnamed: 0'], errors = 'ignore')

In [7]:
# Show the data type pandas assigned to each column
df.dtypes

Dis No                       object
Year                          int64
Seq                           int64
Disaster Group               object
Disaster Subgroup            object
Disaster Type                object
Disaster Subtype             object
Disaster Subsubtype          object
Event Name                   object
Country                      object
ISO                          object
Region                       object
Continent                    object
Location                     object
Origin                       object
Associated Dis               object
Associated Dis2              object
OFDA Response               float64
Appeal                      float64
Declaration                 float64
Aid Contribution            float64
Dis Mag Value               float64
Dis Mag Scale                object
Latitude                     object
Longitude                    object
Start Year                    int64
Start Month                 float64
Start Day                   

In [8]:
# Count the missing values in every column
df.isna().sum()

Dis No                          0
Year                            0
Seq                             0
Disaster Group                  0
Disaster Subgroup               0
Disaster Type                   0
Disaster Subtype             2747
Disaster Subsubtype         13600
Event Name                  10999
Country                         0
ISO                             0
Region                          0
Continent                       0
Location                     1346
Origin                      10864
Associated Dis              11412
Associated Dis2             13946
OFDA Response                   0
Appeal                          0
Declaration                     0
Aid Contribution            13967
Dis Mag Value                   0
Dis Mag Scale                1073
Latitude                    12313
Longitude                   12309
Start Year                      0
Start Month                   268
Start Day                    3067
End Year                        0
End Month     

In [9]:
# Median of the recorded start months
df['Start Month'].agg('median')

7.0

In [10]:
# Mean of the recorded start months, for comparison with the median
df['Start Month'].agg('mean')

6.411240957150807

In [11]:
# Impute the missing values with the median Start Month.
# The median is computed from the column instead of being typed in as a literal,
# and the filled Series is assigned back to df. Calling fillna(..., inplace = True)
# on df['Start Month'] operates on an intermediate Series: pandas warns about that
# pattern, and it stops updating df once Copy-on-Write is enabled.
start_month_median = df['Start Month'].median()
df['Start Month'] = df['Start Month'].fillna(start_month_median)

In [12]:
# Median of the recorded start days
df['Start Day'].agg('median')

15.0

In [13]:
# Mean of the recorded start days, for comparison with the median
df['Start Day'].agg('mean')

15.207912239785783

In [14]:
# Impute the missing values with the median Start Day.
# The median is computed from the column instead of being typed in as a literal,
# and the filled Series is assigned back to df. Calling fillna(..., inplace = True)
# on df['Start Day'] operates on an intermediate Series: pandas warns about that
# pattern, and it stops updating df once Copy-on-Write is enabled.
start_day_median = df['Start Day'].median()
df['Start Day'] = df['Start Day'].fillna(start_day_median)

In [15]:
# Count how often each Start Day value occurs (most frequent first)
df['Start Day'].value_counts()

15.0    3519
1.0      727
25.0     421
28.0     419
20.0     418
22.0     406
24.0     401
4.0      398
10.0     385
18.0     376
3.0      373
14.0     371
7.0      370
17.0     370
6.0      370
8.0      369
12.0     369
9.0      368
2.0      363
23.0     361
26.0     352
27.0     352
21.0     351
13.0     349
5.0      345
11.0     342
16.0     319
19.0     310
29.0     294
30.0     289
31.0     187
Name: Start Day, dtype: int64

In [16]:
# Count how often each Start Month value occurs (most frequent first)
df['Start Month'].value_counts()

7.0     1799
1.0     1633
8.0     1522
9.0     1269
6.0     1226
10.0    1154
5.0     1134
4.0     1033
12.0    1021
11.0     958
2.0      952
3.0      943
Name: Start Month, dtype: int64

In [17]:
# Check that the largest start day recorded in each month is a valid calendar day
max_day_spec = {'Start Day': ['max']}
df.groupby('Start Month').agg(max_day_spec)

# September shows a maximum day of 31, which is not a real date - this needs fixing

Unnamed: 0_level_0,Start Day
Unnamed: 0_level_1,max
Start Month,Unnamed: 1_level_2
1.0,31.0
2.0,29.0
3.0,31.0
4.0,30.0
5.0,31.0
6.0,30.0
7.0,31.0
8.0,31.0
9.0,31.0
10.0,31.0


In [18]:
# Clamp every Start Day to the last valid day of its own month and year.
# Previously only 31 September was rewritten to 30. Clamping against the real month
# length makes that same correction and also covers any other impossible day
# (e.g. 31 April or 30 February), which would otherwise make the pd.to_datetime
# call that builds the 'Date' column fail.
month_start = pd.to_datetime(
    df[['Start Year', 'Start Month']]
    .rename(columns = {'Start Year': 'year', 'Start Month': 'month'})
    .assign(day = 1)
)
# days_in_month gives 28-31 per row; clip leaves days at or below that limit unchanged
df['Start Day'] = df['Start Day'].clip(upper = month_start.dt.days_in_month)

In [19]:
# Re-check the largest start day per month after the correction
max_day_spec = {'Start Day': ['max']}
df.groupby('Start Month').agg(max_day_spec)

Unnamed: 0_level_0,Start Day
Unnamed: 0_level_1,max
Start Month,Unnamed: 1_level_2
1.0,31.0
2.0,29.0
3.0,31.0
4.0,30.0
5.0,31.0
6.0,30.0
7.0,31.0
8.0,31.0
9.0,30.0
10.0,31.0


In [20]:
# Assemble a single datetime column from the start year, month and day components
date_parts = {
    'year': df['Start Year'],
    'month': df['Start Month'],
    'day': df['Start Day'],
}
df['Date'] = pd.to_datetime(date_parts)

In [21]:
# Preview the first five rows, now including the new 'Date' column
df.head(5)

Unnamed: 0,Dis No,Year,Seq,Disaster Group,Disaster Subgroup,Disaster Type,Disaster Subtype,Disaster Subsubtype,Event Name,Country,...,No Injured,No Affected,No Homeless,Total Affected,Reconstruction Costs US$,Insured Damages US$,Total Damages US$,CPI,Deaths,Date
0,1970-0013-ARG,1970,13,Natural,Hydrological,Flood,,,,Argentina,...,0.0,0.0,0.0,0.0,,,25000.0,15.001282,Low Death Toll,1970-01-04
1,1970-0109-AUS,1970,109,Natural,Meteorological,Storm,Tropical cyclone,,Ada,Australia,...,0.0,0.0,0.0,0.0,,,72475.0,15.001282,Low Death Toll,1970-01-15
2,1970-0044-BEN,1970,44,Natural,Hydrological,Flood,,,,Benin,...,0.0,0.0,0.0,0.0,,,200.0,15.001282,,1970-09-15
3,1970-0063-BGD,1970,63,Natural,Meteorological,Storm,Tropical cyclone,,,Bangladesh,...,0.0,3648000.0,0.0,3648000.0,,,86400.0,15.001282,Very High Death Toll,1970-11-12
4,1970-0026-BGD,1970,26,Natural,Meteorological,Storm,,,,Bangladesh,...,0.0,110.0,0.0,110.0,,,,15.001282,Low Death Toll,1970-04-13


In [22]:
# (rows, columns) of the dataframe at this point
df.shape

(14644, 42)

In [23]:
# List the column labels again to confirm 'Date' was added
df.columns

Index(['Dis No', 'Year', 'Seq', 'Disaster Group', 'Disaster Subgroup',
       'Disaster Type', 'Disaster Subtype', 'Disaster Subsubtype',
       'Event Name', 'Country', 'ISO', 'Region', 'Continent', 'Location',
       'Origin', 'Associated Dis', 'Associated Dis2', 'OFDA Response',
       'Appeal', 'Declaration', 'Aid Contribution', 'Dis Mag Value',
       'Dis Mag Scale', 'Latitude', 'Longitude', 'Start Year', 'Start Month',
       'Start Day', 'End Year', 'End Month', 'End Day', 'Total Deaths',
       'No Injured', 'No Affected', 'No Homeless', 'Total Affected',
       'Reconstruction Costs US$', 'Insured Damages US$', 'Total Damages US$',
       'CPI', 'Deaths', 'Date'],
      dtype='object')

In [24]:
# Impute missing values for the relevant columns with 0.
# All columns are filled in a single step and assigned back to df. The previous
# per-column fillna(..., inplace = True) calls each acted on an intermediate Series:
# pandas warns about that pattern, and it stops updating df under Copy-on-Write.
# NOTE(review): 'Deaths' holds text labels such as 'Low Death Toll', so filling it
# with 0 leaves a column that mixes strings and integers; a CPI of 0 is also not a
# meaningful index value. Both fills are kept as-is so the output stays unchanged -
# confirm whether a label / leaving NaN would be more appropriate.
zero_fill_columns = [
    'Total Deaths',
    'No Injured',
    'No Affected',
    'No Homeless',
    'Total Affected',
    'Aid Contribution',
    'Reconstruction Costs US$',
    'Insured Damages US$',
    'Total Damages US$',
    'CPI',
    'Deaths',
]
df[zero_fill_columns] = df[zero_fill_columns].fillna(0)


In [25]:
# Re-count the missing values per column after imputation
df.isna().sum()

Dis No                          0
Year                            0
Seq                             0
Disaster Group                  0
Disaster Subgroup               0
Disaster Type                   0
Disaster Subtype             2747
Disaster Subsubtype         13600
Event Name                  10999
Country                         0
ISO                             0
Region                          0
Continent                       0
Location                     1346
Origin                      10864
Associated Dis              11412
Associated Dis2             13946
OFDA Response                   0
Appeal                          0
Declaration                     0
Aid Contribution                0
Dis Mag Value                   0
Dis Mag Scale                1073
Latitude                    12313
Longitude                   12309
Start Year                      0
Start Month                     0
Start Day                       0
End Year                        0
End Month     

In [26]:
# Flag rows that repeat an earlier row in every column, then count them

dups = df.duplicated(keep = 'first')
dups.sum()

0

## 03 Wrangling

In [27]:
# Remove the row limit on displayed output so long tables are shown in full
pd.set_option('display.max_rows', None)

In [28]:
# Per-country summary of 'Year': earliest year, latest year, mean year and number of records
year_summary_spec = {'Year': ['min', 'max', 'mean', 'count']}
df.groupby('Country').agg(year_summary_spec)

Unnamed: 0_level_0,Year,Year,Year,Year
Unnamed: 0_level_1,min,max,mean,count
Country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Afghanistan,1971,2021,2004.8,200
Albania,1979,2021,2004.166667,36
Algeria,1973,2021,1999.961039,77
American Samoa,1989,2009,2002.0,5
Angola,1981,2021,2006.662162,74
Anguilla,1979,2017,1990.833333,6
Antigua and Barbuda,1983,2017,1998.8,10
Argentina,1970,2021,2001.193277,119
Armenia,1997,2020,2008.636364,11
Australia,1970,2021,1996.8,240


In [29]:
# Add a constant column of ones so that summing it counts events
df = df.assign(Count = 1)

In [30]:
# Sum 'Count' within each country and broadcast the total back onto every row,
# giving 'No_of_disasters' = number of events recorded for that row's country.
# The aggregation is named by string: passing the NumPy callable np.sum to
# groupby.transform is deprecated in recent pandas and emits a FutureWarning.
df['No_of_disasters'] = df.groupby('Country')['Count'].transform('sum')

In [31]:
# Preview the first five rows with the new 'Count' and 'No_of_disasters' columns
df.head(n = 5)

Unnamed: 0,Dis No,Year,Seq,Disaster Group,Disaster Subgroup,Disaster Type,Disaster Subtype,Disaster Subsubtype,Event Name,Country,...,No Homeless,Total Affected,Reconstruction Costs US$,Insured Damages US$,Total Damages US$,CPI,Deaths,Date,Count,No_of_disasters
0,1970-0013-ARG,1970,13,Natural,Hydrological,Flood,,,,Argentina,...,0.0,0.0,0.0,0.0,25000.0,15.001282,Low Death Toll,1970-01-04,1,119
1,1970-0109-AUS,1970,109,Natural,Meteorological,Storm,Tropical cyclone,,Ada,Australia,...,0.0,0.0,0.0,0.0,72475.0,15.001282,Low Death Toll,1970-01-15,1,240
2,1970-0044-BEN,1970,44,Natural,Hydrological,Flood,,,,Benin,...,0.0,0.0,0.0,0.0,200.0,15.001282,0,1970-09-15,1,51
3,1970-0063-BGD,1970,63,Natural,Meteorological,Storm,Tropical cyclone,,,Bangladesh,...,0.0,3648000.0,0.0,0.0,86400.0,15.001282,Very High Death Toll,1970-11-12,1,317
4,1970-0026-BGD,1970,26,Natural,Meteorological,Storm,,,,Bangladesh,...,0.0,110.0,0.0,0.0,0.0,15.001282,Low Death Toll,1970-04-13,1,317


In [32]:
# Summary statistics of the per-country disaster totals
df['No_of_disasters'].describe()

count    14644.000000
mean       278.366157
std        302.644550
min          1.000000
25%         68.750000
50%        123.000000
75%        317.000000
max        949.000000
Name: No_of_disasters, dtype: float64

In [33]:
# Derive a flag to indicate disaster frequency:
# countries with at most 99 recorded disasters are labelled 'Infrequent'
infrequent_mask = df['No_of_disasters'].le(99)
df.loc[infrequent_mask, 'disaster_frequency'] = 'Infrequent'

In [34]:
# Countries with 100 to 250 recorded disasters (both ends included) are labelled 'Frequent'
frequent_mask = df['No_of_disasters'].between(100, 250)
df.loc[frequent_mask, 'disaster_frequency'] = 'Frequent'

In [35]:
# Countries with 251 to 500 recorded disasters (both ends included) are labelled 'Very frequent'
very_frequent_mask = df['No_of_disasters'].between(251, 500)
df.loc[very_frequent_mask, 'disaster_frequency'] = 'Very frequent'

In [36]:
# Countries with more than 500 recorded disasters are labelled 'Extremely frequent'
extremely_frequent_mask = df['No_of_disasters'].gt(500)
df.loc[extremely_frequent_mask, 'disaster_frequency'] = 'Extremely frequent'

In [37]:
# Preview the first five rows with the new 'disaster_frequency' flag
df.head()

Unnamed: 0,Dis No,Year,Seq,Disaster Group,Disaster Subgroup,Disaster Type,Disaster Subtype,Disaster Subsubtype,Event Name,Country,...,Total Affected,Reconstruction Costs US$,Insured Damages US$,Total Damages US$,CPI,Deaths,Date,Count,No_of_disasters,disaster_frequency
0,1970-0013-ARG,1970,13,Natural,Hydrological,Flood,,,,Argentina,...,0.0,0.0,0.0,25000.0,15.001282,Low Death Toll,1970-01-04,1,119,Frequent
1,1970-0109-AUS,1970,109,Natural,Meteorological,Storm,Tropical cyclone,,Ada,Australia,...,0.0,0.0,0.0,72475.0,15.001282,Low Death Toll,1970-01-15,1,240,Frequent
2,1970-0044-BEN,1970,44,Natural,Hydrological,Flood,,,,Benin,...,0.0,0.0,0.0,200.0,15.001282,0,1970-09-15,1,51,Infrequent
3,1970-0063-BGD,1970,63,Natural,Meteorological,Storm,Tropical cyclone,,,Bangladesh,...,3648000.0,0.0,0.0,86400.0,15.001282,Very High Death Toll,1970-11-12,1,317,Very frequent
4,1970-0026-BGD,1970,26,Natural,Meteorological,Storm,,,,Bangladesh,...,110.0,0.0,0.0,0.0,15.001282,Low Death Toll,1970-04-13,1,317,Very frequent


In [38]:
# Count rows per frequency label; dropna = False would also list rows left without a label
df['disaster_frequency'].value_counts(dropna = False)

Infrequent            5797
Frequent              4343
Extremely frequent    3653
Very frequent          851
Name: disaster_frequency, dtype: int64

## 04 Subsetting

In [39]:
# Keep only the events from countries flagged as 'Extremely frequent'
is_extreme = df['disaster_frequency'].eq('Extremely frequent')
df_extreme = df.loc[is_extreme]

In [40]:
# (rows, columns) of the 'Extremely frequent' subset
df_extreme.shape

(3653, 45)

In [41]:
# Number of events per country within the 'Extremely frequent' subset
df_extreme['Country'].value_counts()

United States of America (the)    949
China                             906
India                             658
Philippines (the)                 612
Indonesia                         528
Name: Country, dtype: int64

In [42]:
# Number of events per continent within the 'Extremely frequent' subset
df_extreme['Continent'].value_counts()

Asia        2704
Americas     949
Name: Continent, dtype: int64

In [44]:
# Export the full wrangled dataframe (not the df_extreme subset) to the 'Prepared Data' folder.
# NOTE(review): to_csv writes the row index by default, so the file gains an unnamed
# leading column; pass index = False if downstream readers do not expect it.
output_csv = os.path.join(path, 'Prepared Data', 'Natural_Disasters_Final_Clean_FINAL.csv')
df.to_csv(output_csv)