In [1]:
import pandas as pd
from sqlalchemy import create_engine
import datetime as date

### Extract CSVs into DataFrames
We found two datasets on Kaggle: 

1) SARS 2003 Outbreak Complete Dataset (csv)

This dataset is from the World Health Organization and has basic information on the total number of cases, number of patient deaths and number of recovered patients.
   

    https://www.kaggle.com/imdevskp/sars-outbreak-2003-complete-dataset

2) Novel Corona Virus 2019 Dataset (csv)

This dataset is from the World Health Organization and has basic information on the total number of cases, number of patient deaths and number of recovered patients.
    
    https://www.kaggle.com/sudalairajkumar/novel-corona-virus-2019-dataset

### Creating Our Database
To create our Database we used PgAdmin to run PostgresSQL 11 where we created a table for each dataset and preformed a join. Each table included 6 columns: ID, Date, Country, Confirmed, Dealths and Recovered. 

In [5]:
#Load SARS CSV and Convert to DataFrame
sars_file = "./data sources/sars_2003_complete_dataset_clean.csv"

# Use Pandas to read corona data, remove timestamp
sars_df = pd.read_csv(sars_file, parse_dates=[0])
sars_df['Date1'] = sars_df['Date'].dt.date
sars_df.head()


Unnamed: 0,Date,Country,Cumulative number of case(s),Number of deaths,Number recovered,Date1
0,2003-03-17,Germany,1,0,0,2003-03-17
1,2003-03-17,Canada,8,2,0,2003-03-17
2,2003-03-17,Singapore,20,0,0,2003-03-17
3,2003-03-17,"Hong Kong SAR, China",95,1,0,2003-03-17
4,2003-03-17,Switzerland,2,0,0,2003-03-17


In [6]:
# Get Unique Country Names for Comparison
sars_df.Country.unique()

array(['Germany', 'Canada', 'Singapore', 'Hong Kong SAR, China',
       'Switzerland', 'Thailand', 'Viet Nam', 'China', 'Taiwan, China',
       'Slovenia', 'United Kingdom', 'Spain', 'United States', 'Italy',
       'Republic of Ireland', 'France', 'Romania', 'Australia', 'Belgium',
       'Brazil', 'Malaysia', 'Kuwait', 'Japan', 'South Africa',
       'Indonesia', 'Philippines', 'Sweden', 'India', 'Mongolia',
       'Bulgaria', 'Republic of Korea', 'Macao SAR, China', 'Poland',
       'New Zealand', 'Colombia', 'Finland', 'Russian Federation'],
      dtype=object)

In [7]:
#Load Corona CSV and Convert to DataFrame
corona_file = "./data sources/2019_nCoV_data.csv"
corona_df = pd.read_csv(corona_file)
corona_df.head()

Unnamed: 0,Sno,Date,Province/State,Country,Last Update,Confirmed,Deaths,Recovered
0,1,01/22/2020 12:00:00,Anhui,China,01/22/2020 12:00:00,1.0,0.0,0.0
1,2,01/22/2020 12:00:00,Beijing,China,01/22/2020 12:00:00,14.0,0.0,0.0
2,3,01/22/2020 12:00:00,Chongqing,China,01/22/2020 12:00:00,6.0,0.0,0.0
3,4,01/22/2020 12:00:00,Fujian,China,01/22/2020 12:00:00,1.0,0.0,0.0
4,5,01/22/2020 12:00:00,Gansu,China,01/22/2020 12:00:00,0.0,0.0,0.0


In [8]:
corona_df.Country.unique()

array(['China', 'US', 'Japan', 'Thailand', 'South Korea',
       'Mainland China', 'Hong Kong', 'Macau', 'Taiwan', 'Singapore',
       'Philippines', 'Malaysia', 'Vietnam', 'Australia', 'Mexico',
       'Brazil', 'France', 'Nepal', 'Canada', 'Cambodia', 'Sri Lanka',
       'Ivory Coast', 'Germany', 'Finland', 'United Arab Emirates',
       'India', 'Italy', 'Sweden', 'Russia', 'Spain', 'UK', 'Belgium',
       'Others', 'Egypt'], dtype=object)

### Transform SARS DataFrame

In [10]:
# Separate Country Column into Province/State and Country
rename_sars_df = sars_df.replace(
    {"Hong Kong SAR, China": "Hong Kong", 
     "Taiwan, China": "Taiwan", 
     "Macao SAR, China": "Macau", 
     "Viet Nam": "Vietnam",
     "Republic of Ireland": "Ireland", 
     "Republic of Korea" : "Korea",
     "Russian Federation" : "Russia"})

# Drop Original Date Column
new_sars_df = rename_sars_df[['Country', 'Cumulative number of case(s)', 'Number of deaths', 'Number recovered','Date1']].copy()

# Rename the Column Headers
clean_sars_df = new_sars_df.rename(columns={"Date1" : "Date", "Cumulative number of case(s)": "Confirmed",
                                    "Number of deaths": "Deaths",
                                    "Number recovered": "Recovered"})

clean_sars_df.head()

Unnamed: 0,Country,Confirmed,Deaths,Recovered,Date
0,Germany,1,0,0,2003-03-17
1,Canada,8,2,0,2003-03-17
2,Singapore,20,0,0,2003-03-17
3,Hong Kong,95,1,0,2003-03-17
4,Switzerland,2,0,0,2003-03-17


### Transform Corona DataFrame

In [None]:
# Create a filtered dataframe from specific columns (DROP LAST UPDATE COLUMN)

#Drop datetime formatting

# Rename the column headers

# Clean the data by dropping duplicates and setting the index


### Create Database Connetion

In [12]:
connection_string = "postgres:<your password>@localhost:5432/respiratory_outbreak_db"
engine = create_engine(f'postgresql://{connection_string}')

In [13]:
# Confirm tables
engine.table_names()

['sars', 'corona']

### Load DataFrames into Database

In [16]:
clean_sars_df.to_sql(name='sars', con=engine, if_exists='append', index=False)

ProgrammingError: (psycopg2.errors.UndefinedColumn) column "Country" of relation "sars" does not exist
LINE 1: INSERT INTO sars ("Country", "Confirmed", "Deaths", "Recover...
                          ^

[SQL: INSERT INTO sars ("Country", "Confirmed", "Deaths", "Recovered", "Date") VALUES (%(Country)s, %(Confirmed)s, %(Deaths)s, %(Recovered)s, %(Date)s)]
[parameters: ({'Country': 'Germany', 'Confirmed': 1, 'Deaths': 0, 'Recovered': 0, 'Date': datetime.date(2003, 3, 17)}, {'Country': 'Canada', 'Confirmed': 8, 'Deaths': 2, 'Recovered': 0, 'Date': datetime.date(2003, 3, 17)}, {'Country': 'Singapore', 'Confirmed': 20, 'Deaths': 0, 'Recovered': 0, 'Date': datetime.date(2003, 3, 17)}, {'Country': 'Hong Kong', 'Confirmed': 95, 'Deaths': 1, 'Recovered': 0, 'Date': datetime.date(2003, 3, 17)}, {'Country': 'Switzerland', 'Confirmed': 2, 'Deaths': 0, 'Recovered': 0, 'Date': datetime.date(2003, 3, 17)}, {'Country': 'Thailand', 'Confirmed': 1, 'Deaths': 0, 'Recovered': 0, 'Date': datetime.date(2003, 3, 17)}, {'Country': 'Vietnam', 'Confirmed': 40, 'Deaths': 1, 'Recovered': 0, 'Date': datetime.date(2003, 3, 17)}, {'Country': 'Germany', 'Confirmed': 2, 'Deaths': 0, 'Recovered': 0, 'Date': datetime.date(2003, 3, 18)}  ... displaying 10 of 2538 total bound parameter sets ...  {'Country': 'United States', 'Confirmed': 75, 'Deaths': 0, 'Recovered': 67, 'Date': datetime.date(2003, 7, 11)}, {'Country': 'Vietnam', 'Confirmed': 63, 'Deaths': 5, 'Recovered': 58, 'Date': datetime.date(2003, 7, 11)})]
(Background on this error at: http://sqlalche.me/e/f405)

In [15]:
clean_corona_df.to_sql(name='corona', con=engine, if_exists='append', index=False)

NameError: name 'clean_corona_df' is not defined

### Confirm data has been added by querying the respiratory_outbreak table

In [11]:
pd.read_sql_query('select * from sars', con=engine).head()

NameError: name 'engine' is not defined