In [1]:
import pandas as pd
from sqlalchemy import create_engine

### Extract CSVs into DataFrames

In [2]:
public_covid_file = "Data/public_covid19_canada.csv"
public_covid_canada_df = pd.read_csv(public_covid_file)
public_covid_canada_df.head()

Unnamed: 0,case_id,provincial_case_id,age,sex,health_region,province,country,date_report,report_week,travel_yn,travel_history_country,locally_acquired
0,1.0,1.0,50-59,Male,Toronto,Ontario,Canada,25-01-2020,19-01-2020,1,China,
1,2.0,2.0,50-59,Female,Toronto,Ontario,Canada,27-01-2020,26-01-2020,1,China,
2,3.0,1.0,40-49,Male,Vancouver Coastal,BC,Canada,28-01-2020,26-01-2020,1,China,
3,4.0,3.0,20-29,Female,Middlesex-London,Ontario,Canada,31-01-2020,26-01-2020,1,China,
4,5.0,2.0,50-59,Female,Vancouver Coastal,BC,Canada,04-02-2020,02-02-2020,0,,Close Contact


In [3]:
mortality_file = "Data/Mortality_Canada.csv"
mortality_canada_df = pd.read_csv(mortality_file)
mortality_canada_df.head()

Unnamed: 0,death_id,province_death_id,case_id,age,sex,health_region,province,country,date_death_report
0,1,1,60.0,80-89,Male,Vancouver Coastal,BC,Canada,08-03-2020
1,2,1,477.0,70-79,Male,Simcoe Muskoka,Ontario,Canada,11-03-2020
2,3,2,,,,Vancouver Coastal,BC,Canada,16-03-2020
3,4,3,,,,Vancouver Coastal,BC,Canada,16-03-2020
4,5,4,,,,Vancouver Coastal,BC,Canada,16-03-2020


In [4]:
testing_file = "Data/Testing_Canada.csv"
testcases_canada_df = pd.read_csv(testing_file)
testcases_canada_df.head()

Unnamed: 0,date_testing,province,cumulative_testing,province_source
0,15-03-2020,Alberta,7108,Alberta
1,16-03-2020,Alberta,10598,BC
2,17-03-2020,Alberta,12355,Manitoba
3,18-03-2020,Alberta,14566,New Brunswick
4,19-03-2020,Alberta,17013,NL


In [5]:
recovered_file = "Data/Recovered_Canada.csv"
recovered_canada_df = pd.read_csv(recovered_file)
recovered_canada_df.head()

Unnamed: 0,date_recovered,province,cumulative_recovered,province_source
0,12-02-2020,Alberta,,Alberta
1,13-02-2020,Alberta,,BC
2,14-02-2020,Alberta,,Manitoba
3,15-02-2020,Alberta,,New Brunswick
4,16-02-2020,Alberta,,NL


### Transform premise DataFrame

In [6]:
# Create a filtered dataframe from specific columns & Rename the column headers

mortality_df = mortality_canada_df[['province','date_death_report','age', 'sex',]]
# mortality_df
mortality_df.columns = ['province', 'date', 'age', 'sex']
# mortality_df.head()
#
recovered_df = recovered_canada_df[['date_recovered', 'province', 'cumulative_recovered']]
recovered_df.columns = ['date', 'province', 'cumulative_recovered']

testcases_df = testcases_canada_df[['date_testing', 'province', 'cumulative_testing']]
testcases_df.columns = ['date', 'province', 'cumulative_testing']   
testcases_df

confirmed_cases_df = public_covid_canada_df[['provincial_case_id', 'age', 'sex', 'province', 'date_report', 'travel_yn',
       'travel_history_country', 'locally_acquired']]
confirmed_cases_df.columns = ['positive_cases', 'age', 'sex', 'province', 'date', 'travel_yn',
       'travel_history_country', 'locally_acquired']   
confirmed_cases_df

Unnamed: 0,positive_cases,age,sex,province,date,travel_yn,travel_history_country,locally_acquired
0,1.0,50-59,Male,Ontario,25-01-2020,1,China,
1,2.0,50-59,Female,Ontario,27-01-2020,1,China,
2,1.0,40-49,Male,BC,28-01-2020,1,China,
3,3.0,20-29,Female,Ontario,31-01-2020,1,China,
4,2.0,50-59,Female,BC,04-02-2020,0,,Close Contact
...,...,...,...,...,...,...,...,...
3904,,,,,,,,
3905,,,,,,,,
3906,,,,,,,,
3907,,,,,,,,


In [15]:
# Clean the data by dropping duplicates and setting the index
mortality_df.dropna(how='all', inplace=True)
recovered_df.dropna(how='all', inplace=True)
testcases_df.dropna(how='all', inplace=True)
confirmed_cases_df.dropna(how='all', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [34]:
testcases_df["cumulative_testing"].unique()

array(['7108', '10598', '12355', '14566', '17013', '20360', '23742',
       '26999', '30058', '32776', '35508', '6326', '17912', '21296',
       '26681', '31739', '400', '2280', '2912', '3270', '3534', '3801',
       '4245', '4520', nan, '203', '204', '381', '421', '520', '788',
       '858', '1096', '1255', '1550', '161', '494', '791', '901', '970',
       '938*', '1131', '1336', '418', '676', '941', '1153', '1387',
       '1561', '1847', '2116', '2349', '2525', '2840', '8465', '10178',
       '11171', '13897', '16650', '19511', '23384', '26420', '28506',
       '32457', '35635', '78', '147', '183', '213', '240', '325', '393',
       '416', '550', '6202', '7801', '8934', '10451', '10222*', '10935',
       '12068', '13727', '15763', '30971', '796', '1107', '1978', '2561',
       '3093', '3917', '4536', '5269', '5757', '6270', '153', '222',
       '275', '299', '370', '410', '492', '533', '108', '154', '166',
       '101', '398', '517'], dtype=object)

In [44]:
testcases_df["cumulative_testing"] = testcases_df["cumulative_testing"].replace({'938*'},{'938'})
testcases_df["cumulative_testing"].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


array(['7108', '10598', '12355', '14566', '17013', '20360', '23742',
       '26999', '30058', '32776', '35508', '6326', '17912', '21296',
       '26681', '31739', '400', '2280', '2912', '3270', '3534', '3801',
       '4245', '4520', nan, '203', '204', '381', '421', '520', '788',
       '858', '1096', '1255', '1550', '161', '494', '791', '901', '970',
       '938', '1131', '1336', '418', '676', '941', '1153', '1387', '1561',
       '1847', '2116', '2349', '2525', '2840', '8465', '10178', '11171',
       '13897', '16650', '19511', '23384', '26420', '28506', '32457',
       '35635', '78', '147', '183', '213', '240', '325', '393', '416',
       '550', '6202', '7801', '8934', '10451', '10222', '10935', '12068',
       '13727', '15763', '30971', '796', '1107', '1978', '2561', '3093',
       '3917', '4536', '5269', '5757', '6270', '153', '222', '275', '299',
       '370', '410', '492', '533', '108', '154', '166', '101', '398',
       '517'], dtype=object)

In [16]:
# Set index
mortality_df.reset_index(drop=True)
recovered_df.reset_index(drop=True)
testcases_df.reset_index(drop=True)
confirmed_cases_df.reset_index(drop=True)


Unnamed: 0,positive_cases,age,sex,province,date,travel_yn,travel_history_country,locally_acquired
0,1.0,50-59,Male,Ontario,25-01-2020,1,China,
1,2.0,50-59,Female,Ontario,27-01-2020,1,China,
2,1.0,40-49,Male,BC,28-01-2020,1,China,
3,3.0,20-29,Female,Ontario,31-01-2020,1,China,
4,2.0,50-59,Female,BC,04-02-2020,0,,Close Contact
...,...,...,...,...,...,...,...,...
3404,655.0,Not Reported,Not Reported,BC,25-03-2020,Not Reported,,
3405,656.0,Not Reported,Not Reported,BC,25-03-2020,Not Reported,,
3406,657.0,Not Reported,Not Reported,BC,25-03-2020,Not Reported,,
3407,658.0,Not Reported,Not Reported,BC,25-03-2020,Not Reported,,


In [None]:
# adding index column name
mortality_df.index.name = 'id'
recovered_df.index.name = 'id'
testcases_df.index.name = 'id'
confirmed_cases_df.index.name = 'id'

### Create database connection

In [21]:
connection_string = "postgres:postgresus8@localhost:5432/covid19_db"
engine = create_engine(f'postgresql://{connection_string}')

In [22]:
# Confirm tables
engine.table_names()

['confirmed_cases', 'mortality', 'recovered', 'test']

### Load DataFrames into database

In [27]:
mortality_df.to_sql(name='mortality', con=engine, if_exists='append', index=True)

In [None]:
recovered_df.to_sql(name='recovered', con=engine, if_exists='append', index=True)

In [45]:
testcases_df.to_sql(name='test', con=engine, if_exists='append', index=True)

In [46]:
confirmed_cases_df.to_sql(name='confirmed_cases', con=engine, if_exists='append', index=True)

DataError: (psycopg2.errors.StringDataRightTruncation) value too long for type character varying(10)

[SQL: INSERT INTO confirmed_cases (id, positive_cases, age, sex, province, date, travel_yn, travel_history_country, locally_acquired) VALUES (%(id)s, %(positive_cases)s, %(age)s, %(sex)s, %(province)s, %(date)s, %(travel_yn)s, %(travel_history_country)s, %(locally_acquired)s)]
[parameters: ({'id': 0, 'positive_cases': 1.0, 'age': '50-59', 'sex': 'Male', 'province': 'Ontario', 'date': '25-01-2020', 'travel_yn': '1', 'travel_history_country': 'China', 'locally_acquired': None}, {'id': 1, 'positive_cases': 2.0, 'age': '50-59', 'sex': 'Female', 'province': 'Ontario', 'date': '27-01-2020', 'travel_yn': '1', 'travel_history_country': 'China', 'locally_acquired': None}, {'id': 2, 'positive_cases': 1.0, 'age': '40-49', 'sex': 'Male', 'province': 'BC', 'date': '28-01-2020', 'travel_yn': '1', 'travel_history_country': 'China', 'locally_acquired': None}, {'id': 3, 'positive_cases': 3.0, 'age': '20-29', 'sex': 'Female', 'province': 'Ontario', 'date': '31-01-2020', 'travel_yn': '1', 'travel_history_country': 'China', 'locally_acquired': None}, {'id': 4, 'positive_cases': 2.0, 'age': '50-59', 'sex': 'Female', 'province': 'BC', 'date': '04-02-2020', 'travel_yn': '0', 'travel_history_country': None, 'locally_acquired': 'Close Contact'}, {'id': 5, 'positive_cases': 3.0, 'age': '30-39', 'sex': 'Male', 'province': 'BC', 'date': '06-02-2020', 'travel_yn': '1', 'travel_history_country': 'China', 'locally_acquired': None}, {'id': 6, 'positive_cases': 4.0, 'age': '30-39', 'sex': 'Female', 'province': 'BC', 'date': '06-02-2020', 'travel_yn': '1', 'travel_history_country': 'China', 'locally_acquired': None}, {'id': 7, 'positive_cases': 5.0, 'age': '30-39', 'sex': 'Female', 'province': 'BC', 'date': '14-02-2020', 'travel_yn': '1', 'travel_history_country': 'China', 'locally_acquired': None}  ... displaying 10 of 3409 total bound parameter sets ...  {'id': 3407, 'positive_cases': 658.0, 'age': 'Not Reported', 'sex': 'Not Reported', 'province': 'BC', 'date': '25-03-2020', 'travel_yn': 'Not Reported', 'travel_history_country': None, 'locally_acquired': None}, {'id': 3408, 'positive_cases': 659.0, 'age': 'Not Reported', 'sex': 'Not Reported', 'province': 'BC', 'date': '25-03-2020', 'travel_yn': 'Not Reported', 'travel_history_country': None, 'locally_acquired': None})]
(Background on this error at: http://sqlalche.me/e/9h9h)

In [None]:
county_transformed.to_sql(name='county', con=engine, if_exists='append', index=True)