## Severe Weather Data Cleanup
#### CSVs come from: https://www.ncdc.noaa.gov/data-access

* 210 files total | 3 types
* Storm Event Details
* Storm Event Locations
* Storm Event Fatalities
* Dates: 1950 through January 2019

In [1]:
# Import Dependencies
import glob
import os
import pandas as pd

# Merging & Cleaning | StormEventsFatalities CSVs

In [2]:
# Read every StormEventsFatalities CSV into its own DataFrame using glob

path = '../Resources/StormData/StormEventsFatalities'

# glob.glob returns matches in arbitrary, OS-dependent order; sorting keeps the
# concatenated row order (and therefore the exported index) the same on every run.
ffiles = sorted(glob.glob(os.path.join(path, '*.csv')))

# One DataFrame per source file, in the same order as ffiles.
fdata = [pd.read_csv(ffile) for ffile in ffiles]

In [3]:
# Stack the per-file DataFrames row-wise into one frame with a fresh 0..n-1
# index, then preview the first rows to inspect the combined columns
ffulldata = pd.concat(objs=fdata, axis=0, ignore_index=True)
ffulldata.head()

Unnamed: 0,FAT_YEARMONTH,FAT_DAY,FAT_TIME,FATALITY_ID,EVENT_ID,FATALITY_TYPE,FATALITY_DATE,FATALITY_AGE,FATALITY_SEX,FATALITY_LOCATION,EVENT_YEARMONTH
0,195306,8,1800,1005317,10083307,D,06/08/1953 18:00:00,,,,195306.0
1,195306,8,1810,1005318,10083308,D,06/08/1953 18:10:00,,,,195306.0
2,195306,8,1908,1005319,10083310,D,06/08/1953 19:08:00,,,,195306.0
3,195306,8,1930,1005320,10083311,D,06/08/1953 19:30:00,,,,195306.0
4,195306,8,2000,1005321,10083312,D,06/08/1953 20:00:00,,,,195306.0


In [4]:
# Derive a numeric YEAR column (needed for DECADE) from the first four
# characters of FAT_YEARMONTH, e.g. 195306 -> 1953
yearmonth_text = ffulldata['FAT_YEARMONTH'].astype(str)

# Keep the string form as a helper column, then parse its leading 4 characters;
# anything that is not a number becomes NaN instead of raising.
ffulldata['FAT_YEARMONTH1'] = yearmonth_text
ffulldata['YEAR'] = pd.to_numeric(yearmonth_text.str.slice(0, 4), errors='coerce')

ffulldata.head()

Unnamed: 0,FAT_YEARMONTH,FAT_DAY,FAT_TIME,FATALITY_ID,EVENT_ID,FATALITY_TYPE,FATALITY_DATE,FATALITY_AGE,FATALITY_SEX,FATALITY_LOCATION,EVENT_YEARMONTH,FAT_YEARMONTH1,YEAR
0,195306,8,1800,1005317,10083307,D,06/08/1953 18:00:00,,,,195306.0,195306,1953
1,195306,8,1810,1005318,10083308,D,06/08/1953 18:10:00,,,,195306.0,195306,1953
2,195306,8,1908,1005319,10083310,D,06/08/1953 19:08:00,,,,195306.0,195306,1953
3,195306,8,1930,1005320,10083311,D,06/08/1953 19:30:00,,,,195306.0,195306,1953
4,195306,8,2000,1005321,10083312,D,06/08/1953 20:00:00,,,,195306.0,195306,1953


In [5]:
# Create a new column for DECADE using binning on the YEAR column.
# Edges are the first year of each decade, plus 2020 to close the final bin.
bins = [1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020]
decades = ['1950-1959', '1960-1969', '1970-1979', '1980-1989', '1990-1999', '2000-2009', '2010-2019']

# right=False makes each bin left-closed/right-open: [1950, 1960), [1960, 1970), ...
# With pd.cut's default (right=True) the bins are (1950, 1960], (1960, 1970], ...,
# which leaves 1950 unlabelled (NaN) and puts every decade-start year
# (1960, 1970, ..., 2010) into the previous decade's label.
ffulldata['DECADE'] = pd.cut(ffulldata['YEAR'], bins, right=False, labels=decades)
ffulldata.head()

Unnamed: 0,FAT_YEARMONTH,FAT_DAY,FAT_TIME,FATALITY_ID,EVENT_ID,FATALITY_TYPE,FATALITY_DATE,FATALITY_AGE,FATALITY_SEX,FATALITY_LOCATION,EVENT_YEARMONTH,FAT_YEARMONTH1,YEAR,DECADE
0,195306,8,1800,1005317,10083307,D,06/08/1953 18:00:00,,,,195306.0,195306,1953,1950-1959
1,195306,8,1810,1005318,10083308,D,06/08/1953 18:10:00,,,,195306.0,195306,1953,1950-1959
2,195306,8,1908,1005319,10083310,D,06/08/1953 19:08:00,,,,195306.0,195306,1953,1950-1959
3,195306,8,1930,1005320,10083311,D,06/08/1953 19:30:00,,,,195306.0,195306,1953,1950-1959
4,195306,8,2000,1005321,10083312,D,06/08/1953 20:00:00,,,,195306.0,195306,1953,1950-1959


In [6]:
# Rename columns: FATALITY_DATE carries both a date and a time of day,
# so expose it as FATALITY_DATETIME
column_renames = {"FATALITY_DATE": "FATALITY_DATETIME"}
ffulldata = ffulldata.rename(columns=column_renames)

In [7]:
# Keep only the rows that have a FATALITY_LOCATION value; rows where it is
# null are removed (the original row labels of the kept rows are retained)
required_columns = ['FATALITY_LOCATION']
ffulldata = ffulldata.dropna(subset=required_columns)
ffulldata.head()

Unnamed: 0,FAT_YEARMONTH,FAT_DAY,FAT_TIME,FATALITY_ID,EVENT_ID,FATALITY_TYPE,FATALITY_DATETIME,FATALITY_AGE,FATALITY_SEX,FATALITY_LOCATION,EVENT_YEARMONTH,FAT_YEARMONTH1,YEAR,DECADE
560,199601,5,0,1001409,5556653,D,01/05/1996 12:00:00,19.0,M,Other,199601.0,199601,1996,1990-1999
561,199601,29,0,1001275,5548075,D,01/29/1996 12:00:00,75.0,F,Outside/Open Areas,199601.0,199601,1996,1990-1999
562,199601,19,0,1001579,5569986,D,01/19/1996 12:00:00,54.0,M,Mobile/Trailer Home,199601.0,199601,1996,1990-1999
563,199601,1,0,1001537,5574628,D,01/01/1996 12:00:00,37.0,M,Long Span Roof,199601.0,199601,1996,1990-1999
564,199601,18,0,1001538,5574649,D,01/18/1996 12:00:00,23.0,F,Vehicle/Towed Trailer,199601.0,199601,1996,1990-1999


In [8]:
# Re-order columns: keep only the columns listed below, in this order
# (any column not listed is dropped from the frame)
column_order = [
    'EVENT_ID',
    'FATALITY_ID',
    'FATALITY_TYPE',
    'FATALITY_AGE',
    'FATALITY_SEX',
    'FATALITY_LOCATION',
    'YEAR',
    'DECADE',
    'FATALITY_DATETIME',
]
ffulldata = ffulldata.loc[:, column_order]

ffulldata.head()

Unnamed: 0,EVENT_ID,FATALITY_ID,FATALITY_TYPE,FATALITY_AGE,FATALITY_SEX,FATALITY_LOCATION,YEAR,DECADE,FATALITY_DATETIME
560,5556653,1001409,D,19.0,M,Other,1996,1990-1999,01/05/1996 12:00:00
561,5548075,1001275,D,75.0,F,Outside/Open Areas,1996,1990-1999,01/29/1996 12:00:00
562,5569986,1001579,D,54.0,M,Mobile/Trailer Home,1996,1990-1999,01/19/1996 12:00:00
563,5574628,1001537,D,37.0,M,Long Span Roof,1996,1990-1999,01/01/1996 12:00:00
564,5574649,1001538,D,23.0,F,Vehicle/Towed Trailer,1996,1990-1999,01/18/1996 12:00:00


In [9]:
# Write the cleaned fatalities frame out as a single CSV file.
# The DataFrame index is written as the first (unnamed) column.
output_csv = '../Resources/StormEventsFatalitiesALL.csv'
ffulldata.to_csv(path_or_buf=output_csv)