In [7]:
# Dependencies
from sqlalchemy import create_engine, inspect
import pandas as pd

### Store CSVs into dataframes

In [129]:
# User will need to change filepaths to local locations
file_path_1='country_lookup.csv'
file_path_2='../data/outdoor-air-pollution-deaths.csv'
file_path_3='../data/coal_plants.csv'
file_path_4='../data/air_pollution.csv'


In [130]:
country_df = pd.read_csv(file_path_1)
deaths_df = pd.read_csv(file_path_2, index_col=0)
coal_df = pd.read_csv(file_path_3)
pollution_df = pd.read_csv(file_path_4, index_col=0)

### Clean dataframes

In [69]:
country_df.head()

Unnamed: 0,name,alpha_2,alpha_3,country_code,region,sub_region,region_code,sub_region_code
0,Afghanistan,AF,AFG,4,Asia,Southern Asia,142.0,34.0
1,Aland Islands,AX,ALA,248,Europe,Northern Europe,150.0,154.0
2,Albania,AL,ALB,8,Europe,Southern Europe,150.0,39.0
3,Algeria,DZ,DZA,12,Africa,Northern Africa,2.0,15.0
4,American Samoa,AS,ASM,16,Oceania,Polynesia,9.0,61.0


In [131]:
deaths_df.head()

Unnamed: 0,Country,alpha_3,Year,Outdoor particulate matter (deaths per 100k),Outdoor ozone pollution (deaths per 100k)
0,Afghanistan,AFG,1990,46.446589,5.616442
1,Afghanistan,AFG,1991,46.033841,5.60396
2,Afghanistan,AFG,1992,44.243766,5.611822
3,Afghanistan,AFG,1993,44.440148,5.655266
4,Afghanistan,AFG,1994,45.594328,5.718922


In [133]:
deaths_df.columns

Index(['Country', 'alpha_3', 'Year',
       'Outdoor particulate matter (deaths per 100k)',
       'Outdoor ozone pollution (deaths per 100k)'],
      dtype='object')

In [71]:
coal_df.head()

Unnamed: 0,alpha_3,country,year,coal_mw_new,coal_mw_retired,coal_mw_change
0,ALB,Albania,2000,0,0,0
1,ARG,Argentina,2000,0,0,0
2,AUS,Australia,2000,0,0,0
3,AUT,Austria,2000,0,0,0
4,BGD,Bangladesh,2000,0,0,0


In [102]:
pollution_df.head()

Unnamed: 0,alpha_3,year,Value_PM,Value_MR,Value_CO2,Value_NOx,Value_SOx
0,AUS,2000,7.36613,107.634,17.6,98.775,123.387
1,AUS,2005,6.90976,83.66,18.1,108.746,124.901
2,AUS,2010,6.78718,72.271,17.4,105.636,107.345
3,AUS,2011,6.71166,71.617,17.1,100.627,111.239
4,AUS,2012,7.00126,69.955,16.8,106.465,109.946


In [47]:
country_df.isna().sum()

name               0
alpha_2            1
alpha_3            0
country_code       0
region             1
sub_region         1
region_code        1
sub_region_code    1
dtype: int64

### Make a connection into postgres database

In [139]:
# add in your postgres username and password with the following format:
# "<insert user name>:<insert password>@localhost:5432/pollution_db"
rds_connection_string = "postgresql://postgres:postgres@localhost:5432/pollution_db"
engine = create_engine(rds_connection_string)

In [140]:
engine.table_names()

['coal_plants', 'country', 'air_quality', 'mortality_rates']

### Import dataframes into SQL

In [141]:
country_df.to_sql(name='country', con=engine, if_exists='append', index=False)

In [142]:
coal_df.to_sql(name='coal_plants', con=engine, if_exists='append',index=False)

In [143]:
# Running into errors with column names, so had to find and rename for SQL to read correctly
deaths_df.columns

Index(['Country', 'alpha_3', 'Year',
       'Outdoor particulate matter_deaths per 100k',
       'Outdoor ozone pollution_deaths per 100k'],
      dtype='object')

In [148]:
# Running into errors with column names, so had to find and rename for SQL to read correctly
rename_columns= {
    'Country': 'country', 
    'Outdoor particulate matter (deaths per 100k)':'Outdoor particulate matter_deaths per 100k',
    'Outdoor ozone pollution (deaths per 100k)': 'Outdoor ozone pollution_deaths per 100k', 
    'Year': 'year'
}

deaths_df=deaths_df.rename(columns=rename_columns)

In [150]:
deaths_df[deaths_df['alpha_3']!='OWID_WRL'].to_sql(name='mortality_rates', con=engine, if_exists='append', index=False)

In [153]:
# dropped OWID_WRL for alpha_3 so Kevin can leave (OWID_WRL does not exist in the country table, 
# so this value was causing an error due to foreign key constraints)

In [152]:
pollution_df.to_sql(name='air_quality', con=engine, if_exists='append', index=False)