In [2]:
# imports
import csv
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
import numpy as np
import re
import string
from IPython.core.display import HTML

# pandas display settings: allow up to 2000 rows and 500 columns to be shown,
# and let the text representation use up to 1000 characters of width
pd.options.display.max_rows = 2000
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# data viz imports
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# load the two source tables from the local data/ folder (both read as UTF-8)

# state-level COVID-19 case counts: data/covid19_states_cases.csv
df1_covidstatecases = pd.read_csv(
    filepath_or_buffer="data/covid19_states_cases.csv",
    encoding='utf-8',
)

# state-level humidity figures: data/covid19_states_humidity.csv
df2_statehumidity = pd.read_csv(
    filepath_or_buffer="data/covid19_states_humidity.csv",
    encoding='utf-8',
)

In [7]:
# sanity check: print the first rows of each dataframe, one after the other,
# to confirm both files were parsed as expected
print(df1_covidstatecases.head(), df2_statehumidity.head(), sep='\n')

     USAState TotalCases NewCases TotalDeaths  NewDeaths ActiveCases
0   USA Total  3,379,896  +24,250     137,572      169.0   1,740,458
1    New York    426,807      NaN      32,393        NaN     216,906
2  California    319,985      NaN       7,030        4.0     227,398
3     Florida    269,811  +15,300       4,242       45.0     232,919
4       Texas    259,465      NaN       3,228        NaN     128,357
   Rank Average Humidity        USAState  Population
0   1.0           82.01%           Iowa    3,078,116
1   2.0           81.86%  New Hampshire    1,321,069
2   3.0           81.46%         Alaska      728,300
3   4.0           80.76%          Maine    1,328,535
4   5.0           80.74%   North Dakota      704,925


In [8]:
# apply a first round of data wrangling: text cleaning techniques
def clean_text_round1(text):
    '''Normalise a text value with a fixed sequence of clean-up steps.

    Steps, applied in order:
      1. lowercase the text
      2. remove any span enclosed in square brackets (within a single line)
      3. remove ASCII punctuation (string.punctuation)
      4. remove every word that contains a digit
      5. remove curly quotes and the ellipsis character
      6. remove newline characters

    Whitespace is NOT trimmed or collapsed; callers strip it separately.

    Parameters
    ----------
    text : str
        The value to clean. Non-string values (e.g. NaN or None standing in
        for a missing cell) are returned unchanged instead of raising.

    Returns
    -------
    str
        The cleaned text (or the original object when it is not a string).
    '''
    # Missing cells reach this function as float NaN / None when it is used
    # with Series.apply; pass them through rather than failing on .lower().
    if not isinstance(text, str):
        return text

    text = text.lower()
    # Raw strings: '\[' , '\w' and '\d' are not valid string escapes and
    # trigger a warning on modern Python when written in a plain literal.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub('[‘’“”…]', '', text)
    text = text.replace('\n', '')
    return text

def round1(x):
    '''Apply clean_text_round1 to a single value (callable form used with Series.apply).'''
    return clean_text_round1(x)

In [11]:
# clean the USAState key in the humidity dataframe, then strip leading and
# trailing whitespace (apply already returns a Series, so no DataFrame wrapper is needed)
df2_statehumidity['USAState'] = df2_statehumidity['USAState'].apply(round1).str.strip()

# clean the USAState key in the cases dataframe the same way, so both
# dataframes carry identically formatted state names
df1_covidstatecases['USAState'] = df1_covidstatecases['USAState'].apply(round1).str.strip()

In [12]:
# stack the cleaned USAState values from both dataframes and sort them, so
# matching (and unmatched) key values sit next to each other for inspection
frames1 = [frame['USAState'] for frame in (df1_covidstatecases, df2_statehumidity)]
result1 = pd.concat(frames1).sort_values()
result1

19                         alabama
35                         alabama
2                           alaska
49                          alaska
13                         arizona
7                          arizona
30                        arkansas
30                        arkansas
8                       california
2                       california
24                        colorado
9                         colorado
15                     connecticut
21                     connecticut
49                        delaware
38                        delaware
62           diamond princess ship
40            district of columbia
23            district of columbia
58                 federal prisons
3                          florida
27                         florida
8                          georgia
38                         georgia
60             grand princess ship
52                            guam
51                          hawaii
46                          hawaii
11                  

In [14]:
# right-join the cases dataframe onto the humidity dataframe using the
# cleaned USAState key (every humidity row is kept)
merged = df1_covidstatecases.merge(df2_statehumidity, how='right', on='USAState')

# keep only the necessary columns from the merged result
df_cleanmerge = merged.loc[
    :,
    [
        'USAState',
        'TotalCases',
        'NewCases',
        'TotalDeaths',
        'NewDeaths',
        'ActiveCases',
        'Average Humidity',
        'Population',
    ],
]
df_cleanmerge.head()

Unnamed: 0,USAState,TotalCases,NewCases,TotalDeaths,NewDeaths,ActiveCases,Average Humidity,Population
0,new york,426807,,32393,,216906,75.60%,19594330
1,california,319985,,7030,4.0,227398,80.36%,38066920
2,florida,269811,15300.0,4242,45.0,232919,77.05%,19361792
3,texas,259465,,3228,,128357,76.95%,26092033
4,new jersey,180672,,15603,,88899,71.31%,8874374


In [15]:
# replace NaN with zeroes in every column
df_cleanmerge = df_cleanmerge.fillna(0)

# write the clean merged dataframe to .csv (no index column).
# The path uses a forward slash like the read paths: the previous
# "data\c..." literal relied on an invalid string escape and, outside
# Windows, named a file in the working directory instead of inside data/.
df_cleanmerge.to_csv("data/covid19_states_humidityandcases.csv", encoding='utf-8', index=False)