In [1]:
# Import dependencies
import pandas as pd
import numpy as np
from path import Path

### Import Data

In [2]:
# Read the tornadoes CSV into a DataFrame, using the first CSV column as the index.
# The path is relative, so it is resolved against the notebook's working directory.
file_path = Path('Raw_Data/1950-2020_torn.csv')
tornadoes_df = pd.read_csv(file_path, index_col=0)
tornadoes_df

FileNotFoundError: [Errno 2] No such file or directory: Path('Raw_Data/1950-2020_torn.csv')

In [None]:
# Show the column labels of the raw frame.
tornadoes_df.columns

In [None]:
# Show the dtype pandas inferred for each column of the raw frame.
tornadoes_df.dtypes

In [None]:
# Preview the raw data sorted by year, then month, then state.
# The sorted frame is only displayed; tornadoes_df itself is left unchanged.
tornadoes_df.sort_values(by=['yr','mo','st'])

### Clean the whole dataset

In [None]:
# Work on an independent copy so the raw frame stays untouched by the cleaning steps.
# Note the call parentheses: `tornadoes_df.copy` without them binds the method object
# itself, and every later indexing operation on tornadoes_clean would fail.
# TODO: merge Date & Time, then convert to datetime (not implemented in this cell).
tornadoes_clean = tornadoes_df.copy()

In [None]:
# Filter out any bad data (keep only rows with 0 <= EF <= 5).
# The mask is built from the frame being filtered, so it always lines up row-for-row
# with tornadoes_clean, even if this cell is re-run after later cells have removed rows.
tornadoes_clean = tornadoes_clean[
    (tornadoes_clean['ef'] >= 0) & (tornadoes_clean['ef'] <= 5)
]
tornadoes_clean

In [None]:
# Filter for Lower 48: drop rows whose state code is PR, VI, HI or AK.
# One membership mask replaces the four successive inequality filters.
tornadoes_clean = tornadoes_clean[
    ~tornadoes_clean['st'].isin(['PR', 'VI', 'HI', 'AK'])
]
tornadoes_clean

In [None]:
# Replace 0 in loss and closs with NaN (0 means no data available, NOT $0 in loss).
# assign() builds a new frame instead of writing columns into the filtered result of
# the earlier cells, which avoids pandas' SettingWithCopyWarning.
tornadoes_clean = tornadoes_clean.assign(
    closs=tornadoes_clean['closs'].replace(0, np.nan),
    loss=tornadoes_clean['loss'].replace(0, np.nan),
)
tornadoes_clean

In [None]:
# Starting in 2016 the loss data is in whole dollar amounts, so rows from years prior to
# 2016 need to be converted to whole dollars as well.
# Original loss and closs were in millions of dollars; they are multiplied by $1,000,000
# in the next step to get whole number values.
# .copy() makes `loss` an independent frame: the column assignments that follow then
# modify it unambiguously instead of writing into a slice of tornadoes_clean.
loss = tornadoes_clean.loc[tornadoes_clean["yr"] < 2016].copy()
loss

In [None]:
# Scale both loss columns of the pre-2016 rows by one million.
# NOTE: the columns are overwritten with their scaled values, so running this cell a
# second time multiplies them again.
loss["closs"] = 1_000_000 * loss["closs"]
loss["loss"] = 1_000_000 * loss["loss"]
loss

In [None]:
# Keep the rows from 2016 onward; these are recombined with the converted
# pre-2016 rows afterwards.
tornadoes_clean_2016_2020 = tornadoes_clean.loc[tornadoes_clean["yr"] >= 2016]
tornadoes_clean_2016_2020

In [None]:
# Stack the converted pre-2016 rows on top of the 2016+ rows.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed
# in pandas 2.0; row order and index labels are kept exactly as append kept them.
tornadoes_clean = pd.concat([loss, tornadoes_clean_2016_2020])
tornadoes_clean

In [None]:
# Show the column labels of the cleaned frame.
tornadoes_clean.columns

### Filter for years of interest & Export Cleaned Data

In [None]:
# Keep only the columns needed for the exported data.
# Selecting several columns requires a list of labels (double brackets); a bare
# comma-separated key is treated as one tuple label and raises KeyError.
tornadoes_df_clean = tornadoes_clean[[
    'yr', 'mo', 'dy', 'date', 'time', 'tz', 'st', 'st_fips', 'stn',
    'ef', 'inj', 'fat', 'loss', 'closs', 'slat', 'slon', 'elat', 'elon',
    'len', 'wid', 'ns', 'sn', 'sg', 'f1', 'f2', 'f3', 'f4', 'fc',
]]

In [None]:
# Rename columns
# NOTE(review): `df` is not defined in any cell shown here, so this raises NameError on a
# fresh kernel — presumably tornadoes_df_clean was intended; confirm before changing.
# NOTE(review): the '' -> 'new_col2' entry looks like an unfinished placeholder. Later
# cells still refer to the year column as 'yr', so renaming it would require updating them.
df.rename(columns = {'yr':'Year', '':'new_col2'}, inplace = True)

In [None]:
# Rearrange columns
# NOTE(review): 'Year', 'Timestamp', 'State', 'State_Fips' and 'County_Fips' are not
# created by any cell visible here, so this selection is expected to raise KeyError —
# TODO confirm the intended names.
# NOTE(review): the selection is taken from the raw tornadoes_df and is only displayed,
# not assigned to anything.
cols = ['Year', 'Timestamp', 'State', 'State_Fips', 'County_Fips', 'f2', 'f3', 'f4', 
        'ef', 'inj', 'fat', 'loss', 'closs', 'slat', 'slon', 'elat', 'elon', 'len', 'wid' ]

tornadoes_df[cols]

In [None]:
# Filter for years 1950-2007 --> F Scale
# The mask is taken from tornadoes_df_clean itself. The raw tornadoes_df still contains
# the rows removed during cleaning, so a mask built from it does not line up with the
# cleaned frame and pandas has to re-align it (with a warning, or an error on a
# non-unique index).
tornadoes_F = tornadoes_df_clean[tornadoes_df_clean['yr'] < 2008]
tornadoes_F

In [None]:
# Export the cleaned 1950-2007 subset to CSV.
# The path is relative to the working directory; the DataFrame index is written as the
# first column (to_csv default).
tornadoes_F.to_csv("Cleaned_Data/1950-2007_tornadoes_cleaned.csv")

In [None]:
# Filter for years 2008-2020 --> EF Scale
# The mask is taken from tornadoes_df_clean itself. The raw tornadoes_df still contains
# the rows removed during cleaning, so a mask built from it does not line up with the
# cleaned frame and pandas has to re-align it (with a warning, or an error on a
# non-unique index).
tornadoes_EF = tornadoes_df_clean[tornadoes_df_clean['yr'] >= 2008]
tornadoes_EF

In [None]:
# Export the cleaned 2008-2020 subset to CSV.
# The path is relative to the working directory; the DataFrame index is written as the
# first column (to_csv default).
tornadoes_EF.to_csv("Cleaned_Data/2008-2020_tornadoes_cleaned.csv")

In [None]:
# Summary statistics for the rating, casualty, loss and track-size columns of the
# 2008-2020 subset.
tornadoes_EF.loc[:, ["ef", "inj", "fat", "loss", "closs", "len", "wid"]].describe()