<a href="https://colab.research.google.com/github/cbeckler/eco_cancer_incidence_rates/blob/main/ETL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
# Find the latest version of spark 3.0 from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.1.3'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

Get:1 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease [3,622 B]
Ign:2 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu focal InRelease
Get:4 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  InRelease
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64  Release
Get:7 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]
Get:8 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu focal InRelease [18.1 kB]
Hit:9 http://ppa.launchpad.net/cran/libgit2/ubuntu focal InRelease
Get:10 http://archive.ubuntu.com/ubuntu focal-backports InRelease [108 kB]
Hit:12 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal InRelease
Hit:13 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu focal InRelease
Get:14 http://security.ubuntu.com/ubuntu fo

In [2]:
# Download the Postgres driver that will allow Spark to interact with Postgres.
!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

--2023-01-20 20:15:18--  https://jdbc.postgresql.org/download/postgresql-42.2.16.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1002883 (979K) [application/java-archive]
Saving to: ‘postgresql-42.2.16.jar’


2023-01-20 20:15:20 (1.17 MB/s) - ‘postgresql-42.2.16.jar’ saved [1002883/1002883]



In [3]:
from pyspark.sql import SparkSession

# Build (or reuse) a Spark session whose driver classpath includes the
# downloaded Postgres JDBC jar.
spark = (
    SparkSession.builder
    .appName("final-project")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.16.jar")
    .getOrCreate()
)

In [4]:
# Mount Google Drive at /content/drive; the CSV inputs are read from that mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd

# Cancer incidence extract on the mounted Drive.
cancer_path = "/content/drive/MyDrive/Data/cancer_incidence.csv"

# Read the ' FIPS' column (the header has a leading space) as text,
# left-padded with zeros to five characters.
cancer_df = pd.read_csv(
    cancer_path,
    converters={' FIPS': '{:0>5}'.format},
)

cancer_df.head()

Unnamed: 0,index,County,FIPS,"Age-Adjusted Incidence Rate(Ê) - cases per 100,000",Lower 95% Confidence Interval,Upper 95% Confidence Interval,Average Annual Count,Recent Trend,Recent 5-Year Trend (ˆ) in Incidence Rates,Lower 95% Confidence Interval.1,Upper 95% Confidence Interval.1
0,0,"US (SEER+NPCR)(1,10)",0,62.4,62.3,62.6,214614,falling,-2.5,-3.0,-2.0
1,1,"Autauga County, Alabama(6,10)",1001,74.9,65.1,85.7,43,stable,0.5,-14.9,18.6
2,2,"Baldwin County, Alabama(6,10)",1003,66.9,62.4,71.7,170,stable,3.0,-10.2,18.3
3,3,"Barbour County, Alabama(6,10)",1005,74.6,61.8,89.4,25,stable,-6.4,-18.3,7.3
4,4,"Bibb County, Alabama(6,10)",1007,86.4,71.0,104.2,23,stable,-4.5,-31.4,32.9


In [6]:
# Keep an untouched copy first, so rows without a usable trend label remain available.
null_cancer = cancer_df.copy()

# Keep only rows carrying a real trend label; missing trends are stored as *, _, __.
has_trend = cancer_df['Recent Trend'].isin(['rising', 'falling', 'stable'])
cancer_df = cancer_df.loc[has_trend]

# Drop unused columns and rename the remaining two.
cancer_df = (
    cancer_df[[' FIPS', 'Recent Trend']]
    .copy()
    .rename(columns={' FIPS': 'FIPS', 'Recent Trend': 'recent_trend'})
)

cancer_df['recent_trend'].value_counts()

stable     2429
falling     200
rising       43
Name: recent_trend, dtype: int64

In [7]:
# Same column selection and renaming as cancer_df, but the placeholder
# markers (*, _, __) are turned into nulls instead of dropping the rows.
null_cancer = (
    null_cancer[[' FIPS', 'Recent Trend']]
    .copy()
    .rename(columns={' FIPS': 'FIPS', 'Recent Trend': 'recent_trend'})
    .replace({'*': None, '_': None, '__': None})
)

null_cancer['recent_trend'].value_counts()

stable     2429
falling     200
rising       43
Name: recent_trend, dtype: int64

In [8]:
# Show per-column non-null counts to see how many trend labels are missing.
null_cancer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3141 entries, 0 to 3140
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   FIPS          3141 non-null   object
 1   recent_trend  2672 non-null   object
dtypes: object(2)
memory usage: 49.2+ KB


In [9]:
# Daily county-level PM2.5 predictions.
pm_path = "/content/drive/MyDrive/Data/Daily_PM2.5_Concentrations_All_County__2001-2016.csv"

# Read the FIPS parts as zero-padded text: 2 characters for state, 3 for county.
pm_df = pd.read_csv(
    pm_path,
    converters={
        'statefips': '{:0>2}'.format,
        'countyfips': '{:0>3}'.format,
    },
)

pm_df.head()

Unnamed: 0,year,date,statefips,countyfips,PM25_max_pred,PM25_med_pred,PM25_mean_pred,PM25_pop_pred
0,2001,01JAN2001,1,1,10.664367,10.264546,10.137631,10.188703
1,2001,01JAN2001,1,3,9.803209,8.739505,8.743748,8.811486
2,2001,01JAN2001,1,5,12.087599,11.809159,11.812775,11.802062
3,2001,01JAN2001,1,7,8.579425,8.435394,8.458118,8.448871
4,2001,01JAN2001,1,9,14.399446,13.577741,13.300528,13.231461


In [10]:
# Daily county-level ozone predictions.
oz_path = "/content/drive/MyDrive/Data/Daily_County-Level_Ozone_Concentrations__2001-2016.csv"

# Read the FIPS parts as zero-padded text: 2 characters for state, 3 for county.
oz_df = pd.read_csv(
    oz_path,
    converters={
        'statefips': '{:0>2}'.format,
        'countyfips': '{:0>3}'.format,
    },
)

oz_df.head()

Unnamed: 0,Year,Month,Day,statefips,countyfips,O3_max_pred,O3_med_pred,O3_mean_pred,O3_pop_pred
0,2001,JAN,1,1,1,31.939058,31.691988,31.680859,31.671226
1,2001,JAN,1,1,3,33.646855,33.170271,32.994775,32.93507
2,2001,JAN,1,1,5,34.288917,34.068507,34.077954,34.086631
3,2001,JAN,1,1,7,30.349767,30.036093,29.931756,29.991733
4,2001,JAN,1,1,9,26.472655,25.776595,25.857571,25.872472


In [11]:
# Median predictions above 100 were found to be outliers; both slices exclude them.
pm_not_outlier = pm_df['PM25_med_pred'] <= 100

# Prediction dataset: 2003 onward. Must be taken before pm_df is narrowed below.
pm_pred = pm_df.loc[pm_not_outlier & (pm_df['year'] >= 2003)]

# Training-side data: only the years matching the cancer data (through 2014).
pm_df = pm_df.loc[pm_not_outlier & (pm_df['year'] <= 2014)]

In [12]:
# List the distinct years left in pm_df; they should match the cancer data range.
pm_df['year'].unique()

array([2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
       2012, 2013, 2014])

In [13]:
# List the distinct years in pm_pred; expect a 14 year slice.

pm_pred['year'].unique()

array([2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013,
       2014, 2015, 2016])

In [14]:
# Concatenate the state and county codes into one key
# that lines up with the FIPS column of the cancer data.
state_code = pm_df['statefips']
county_code = pm_df['countyfips']
pm_df['FIPS_pm'] = state_code + county_code

pm_df.head()

Unnamed: 0,year,date,statefips,countyfips,PM25_max_pred,PM25_med_pred,PM25_mean_pred,PM25_pop_pred,FIPS_pm
0,2001,01JAN2001,1,1,10.664367,10.264546,10.137631,10.188703,1001
1,2001,01JAN2001,1,3,9.803209,8.739505,8.743748,8.811486,1003
2,2001,01JAN2001,1,5,12.087599,11.809159,11.812775,11.802062,1005
3,2001,01JAN2001,1,7,8.579425,8.435394,8.458118,8.448871,1007
4,2001,01JAN2001,1,9,14.399446,13.577741,13.300528,13.231461,1009


In [15]:
# Same combined state + county key for the prediction slice.
state_code = pm_pred['statefips']
county_code = pm_pred['countyfips']
pm_pred['FIPS_pm'] = state_code + county_code

pm_pred.head()

Unnamed: 0,year,date,statefips,countyfips,PM25_max_pred,PM25_med_pred,PM25_mean_pred,PM25_pop_pred,FIPS_pm
2269118,2003,01JAN2003,13,125,4.204755,4.204755,4.204755,4.204755,13125
2269571,2003,01JAN2003,1,1,4.888281,4.563992,4.447518,4.453329,1001
2269572,2003,01JAN2003,1,3,8.084559,6.518457,6.563117,6.662984,1003
2269573,2003,01JAN2003,1,5,5.012596,4.782934,4.802877,4.818045,1005
2269574,2003,01JAN2003,1,7,3.835988,3.612213,3.626566,3.627089,1007


In [16]:
# Per-county mean of each PM2.5 metric across all retained days.
pm_agg = (
    pm_df
    .groupby('FIPS_pm')[['PM25_max_pred', 'PM25_med_pred', 'PM25_mean_pred']]
    .mean()
    .reset_index()
)

pm_agg.head()

Unnamed: 0,FIPS_pm,PM25_max_pred,PM25_med_pred,PM25_mean_pred
0,1001,12.239594,11.870589,11.7879
1,1003,11.024418,9.82287,9.794272
2,1005,11.68817,11.099414,11.084238
3,1007,12.280141,11.700512,11.721958
4,1009,13.498799,12.690525,12.741736


In [17]:
# Per-county mean of each PM2.5 metric over the prediction slice.
pm_pred_agg = (
    pm_pred
    .groupby('FIPS_pm')[['PM25_max_pred', 'PM25_med_pred', 'PM25_mean_pred']]
    .mean()
    .reset_index()
)

pm_pred_agg.head()

Unnamed: 0,FIPS_pm,PM25_max_pred,PM25_med_pred,PM25_mean_pred
0,1001,11.923392,11.545831,11.46093
1,1003,10.724676,9.57737,9.549603
2,1005,11.368095,10.766793,10.752574
3,1007,11.830653,11.281247,11.299146
4,1009,12.847922,12.097639,12.144016


In [18]:
# Mean of each PM2.5 metric for every (county, year) pair.
pm_year_agg = (
    pm_df
    .groupby(['FIPS_pm', 'year'])[['PM25_max_pred', 'PM25_med_pred', 'PM25_mean_pred']]
    .mean()
    .reset_index()
)

pm_year_agg.head()

Unnamed: 0,FIPS_pm,year,PM25_max_pred,PM25_med_pred,PM25_mean_pred
0,1001,2001,13.186807,12.834644,12.784293
1,1001,2002,12.490668,12.16293,12.068693
2,1001,2003,12.869481,12.485005,12.400802
3,1001,2004,13.111945,12.716436,12.619426
4,1001,2005,13.543276,13.164472,13.071456


In [19]:
# Mean of each PM2.5 metric for every (county, year) pair in the prediction slice.
pm_pred_year_agg = (
    pm_pred
    .groupby(['FIPS_pm', 'year'])[['PM25_max_pred', 'PM25_med_pred', 'PM25_mean_pred']]
    .mean()
    .reset_index()
)

pm_pred_year_agg.head()

Unnamed: 0,FIPS_pm,year,PM25_max_pred,PM25_med_pred,PM25_mean_pred
0,1001,2003,12.869481,12.485005,12.400802
1,1001,2004,13.111945,12.716436,12.619426
2,1001,2005,13.543276,13.164472,13.071456
3,1001,2006,13.638252,13.22427,13.159749
4,1001,2007,14.542888,14.036194,13.923639


In [20]:
# Reshape to one row per county: melt the three metrics into long form,
# label every value '<metric>_<year>', then pivot those labels into columns.
# NOTE: overwrites pm_year_agg, so this cell cannot be re-run without re-running
# the aggregation cell that creates it.
pm_year_agg = (
    pm_year_agg
    .melt(id_vars=['FIPS_pm', 'year'],
          value_vars=['PM25_max_pred', 'PM25_med_pred', 'PM25_mean_pred'])
    .assign(col_name=lambda long_df: long_df['variable'] + '_' + long_df['year'].astype(str))
    .pivot(index='FIPS_pm', columns='col_name', values='value')
    .reset_index()
)

pm_year_agg.head()

col_name,FIPS_pm,PM25_max_pred_2001,PM25_max_pred_2002,PM25_max_pred_2003,PM25_max_pred_2004,PM25_max_pred_2005,PM25_max_pred_2006,PM25_max_pred_2007,PM25_max_pred_2008,PM25_max_pred_2009,...,PM25_med_pred_2005,PM25_med_pred_2006,PM25_med_pred_2007,PM25_med_pred_2008,PM25_med_pred_2009,PM25_med_pred_2010,PM25_med_pred_2011,PM25_med_pred_2012,PM25_med_pred_2013,PM25_med_pred_2014
0,1001,13.186807,12.490668,12.869481,13.111945,13.543276,13.638252,14.542888,12.507304,10.279905,...,13.164472,13.22427,14.036194,12.022979,10.003785,10.788455,10.80847,10.43864,10.220977,11.282171
1,1003,12.52019,11.26968,12.162325,12.081706,12.994364,11.942907,12.256714,10.514962,9.342412,...,11.503305,10.51393,10.622514,9.2479,8.429134,8.885592,8.865912,8.737817,8.437009,9.635918
2,1005,12.364383,11.443971,11.531937,12.543327,12.787813,12.915748,14.095731,11.702072,10.480079,...,12.087744,12.229105,13.11262,11.07486,9.945611,10.689897,10.289959,9.64168,9.644445,10.9254
3,1007,13.944184,12.781192,13.16291,13.082518,14.310805,14.060887,14.206462,12.319946,9.992209,...,13.484249,13.370442,13.508796,11.717692,9.575728,10.485379,10.607298,9.992089,9.996376,10.747303
4,1009,16.032861,14.493547,14.722058,14.622386,15.809701,15.465637,16.052929,13.357497,10.91108,...,14.797208,14.53009,15.0282,12.598568,10.283973,11.37275,11.053671,10.687267,10.306375,10.994724


In [21]:
# Reshape to one row per county: melt the three metrics into long form,
# label every value '<metric>_<year>', then pivot those labels into columns.
# NOTE: overwrites pm_pred_year_agg, so this cell cannot be re-run without
# re-running the aggregation cell that creates it.
pm_pred_year_agg = (
    pm_pred_year_agg
    .melt(id_vars=['FIPS_pm', 'year'],
          value_vars=['PM25_max_pred', 'PM25_med_pred', 'PM25_mean_pred'])
    .assign(col_name=lambda long_df: long_df['variable'] + '_' + long_df['year'].astype(str))
    .pivot(index='FIPS_pm', columns='col_name', values='value')
    .reset_index()
)

pm_pred_year_agg.head()

col_name,FIPS_pm,PM25_max_pred_2003,PM25_max_pred_2004,PM25_max_pred_2005,PM25_max_pred_2006,PM25_max_pred_2007,PM25_max_pred_2008,PM25_max_pred_2009,PM25_max_pred_2010,PM25_max_pred_2011,...,PM25_med_pred_2007,PM25_med_pred_2008,PM25_med_pred_2009,PM25_med_pred_2010,PM25_med_pred_2011,PM25_med_pred_2012,PM25_med_pred_2013,PM25_med_pred_2014,PM25_med_pred_2015,PM25_med_pred_2016
0,1001,12.869481,13.111945,13.543276,13.638252,14.542888,12.507304,10.279905,11.093826,11.106039,...,14.036194,12.022979,10.003785,10.788455,10.80847,10.43864,10.220977,11.282171,10.439159,10.013333
1,1003,12.162325,12.081706,12.994364,11.942907,12.256714,10.514962,9.342412,9.720208,9.763411,...,10.622514,9.2479,8.429134,8.885592,8.865912,8.737817,8.437009,9.635918,9.146815,8.334951
2,1005,11.531937,12.543327,12.787813,12.915748,14.095731,11.702072,10.480079,11.327428,10.793651,...,13.11262,11.07486,9.945611,10.689897,10.289959,9.64168,9.644445,10.9254,9.413704,8.823483
3,1007,13.16291,13.082518,14.310805,14.060887,14.206462,12.319946,9.992209,10.969478,11.03507,...,13.508796,11.717692,9.575728,10.485379,10.607298,9.992089,9.996376,10.747303,9.988738,9.468531
4,1009,14.722058,14.622386,15.809701,15.465637,16.052929,13.357497,10.91108,12.075522,11.630947,...,15.0282,12.598568,10.283973,11.37275,11.053671,10.687267,10.306375,10.994724,10.230211,10.008507


In [22]:
# Prediction slice: 2003 onward. Must be taken before oz_df is narrowed below.
from_2003 = oz_df['Year'] >= 2003
oz_pred = oz_df[from_2003]

# Drop years after 2014 so the range matches the cancer data.
# No ozone outliers were found in the data, so there is no value filter here.
through_2014 = oz_df['Year'] <= 2014
oz_df = oz_df[through_2014]

In [23]:
# List the distinct years left in oz_df; they should match the cancer years.

oz_df['Year'].unique()

array([2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
       2012, 2013, 2014])

In [24]:
# List the distinct years in oz_pred; expect a 14 year slice.

oz_pred['Year'].unique()

array([2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013,
       2014, 2015, 2016])

In [25]:
# Concatenate the state and county codes into one key
# that lines up with the FIPS column of the cancer data.
state_code = oz_df['statefips']
county_code = oz_df['countyfips']
oz_df['FIPS_oz'] = state_code + county_code

oz_df.head()

Unnamed: 0,Year,Month,Day,statefips,countyfips,O3_max_pred,O3_med_pred,O3_mean_pred,O3_pop_pred,FIPS_oz
0,2001,JAN,1,1,1,31.939058,31.691988,31.680859,31.671226,1001
1,2001,JAN,1,1,3,33.646855,33.170271,32.994775,32.93507,1003
2,2001,JAN,1,1,5,34.288917,34.068507,34.077954,34.086631,1005
3,2001,JAN,1,1,7,30.349767,30.036093,29.931756,29.991733,1007
4,2001,JAN,1,1,9,26.472655,25.776595,25.857571,25.872472,1009


In [26]:
# Same combined state + county key for the ozone prediction slice.
state_code = oz_pred['statefips']
county_code = oz_pred['countyfips']
oz_pred['FIPS_oz'] = state_code + county_code

oz_pred.head()

Unnamed: 0,Year,Month,Day,statefips,countyfips,O3_max_pred,O3_med_pred,O3_mean_pred,O3_pop_pred,FIPS_oz
2269570,2003,JAN,1,1,1,24.5943,24.371716,24.357799,24.343577,1001
2269571,2003,JAN,1,1,3,24.933854,23.521125,23.267157,23.268305,1003
2269572,2003,JAN,1,1,5,31.503468,30.481313,30.381041,30.375045,1005
2269573,2003,JAN,1,1,7,22.380612,21.500477,21.434621,21.559526,1007
2269574,2003,JAN,1,1,9,21.246812,19.951979,19.83788,19.714255,1009


In [27]:
# Per-county mean of each ozone metric across all retained days.
oz_agg = (
    oz_df
    .groupby(['FIPS_oz'])[['O3_max_pred', 'O3_med_pred', 'O3_mean_pred']]
    .mean()
    .reset_index()
)

oz_agg.head()

Unnamed: 0,FIPS_oz,O3_max_pred,O3_med_pred,O3_mean_pred
0,1001,41.001571,39.948742,39.921898
1,1003,45.070022,41.241604,41.341833
2,1005,41.139416,40.185889,40.162437
3,1007,40.505549,39.399455,39.389288
4,1009,42.714107,41.096201,41.085945


In [28]:
# Per-county mean of each ozone metric over the prediction slice.
oz_pred_agg = (
    oz_pred
    .groupby(['FIPS_oz'])[['O3_max_pred', 'O3_med_pred', 'O3_mean_pred']]
    .mean()
    .reset_index()
)

oz_pred_agg.head()

Unnamed: 0,FIPS_oz,O3_max_pred,O3_med_pred,O3_mean_pred
0,1001,40.514807,39.489763,39.457052
1,1003,44.482241,40.738241,40.819741
2,1005,40.595697,39.667357,39.633515
3,1007,40.101601,39.013332,39.004118
4,1009,42.326738,40.748248,40.74638


In [29]:
# Mean of each ozone metric for every (county, year) pair.
oz_year_agg = (
    oz_df
    .groupby(['FIPS_oz', 'Year'])[['O3_max_pred', 'O3_med_pred', 'O3_mean_pred']]
    .mean()
    .reset_index()
)

oz_year_agg.head()

Unnamed: 0,FIPS_oz,Year,O3_max_pred,O3_med_pred,O3_mean_pred
0,1001,2001,42.305986,41.32281,41.30632
1,1001,2002,41.864027,40.669792,40.647115
2,1001,2003,40.068504,38.835014,38.825136
3,1001,2004,39.985884,38.801989,38.81942
4,1001,2005,41.686453,40.529218,40.518377


In [30]:
# Mean of each ozone metric for every (county, year) pair in the prediction slice.
oz_pred_year_agg = (
    oz_pred
    .groupby(['FIPS_oz', 'Year'])[['O3_max_pred', 'O3_med_pred', 'O3_mean_pred']]
    .mean()
    .reset_index()
)

oz_pred_year_agg.head()

Unnamed: 0,FIPS_oz,Year,O3_max_pred,O3_med_pred,O3_mean_pred
0,1001,2003,40.068504,38.835014,38.825136
1,1001,2004,39.985884,38.801989,38.81942
2,1001,2005,41.686453,40.529218,40.518377
3,1001,2006,44.96225,43.681686,43.69134
4,1001,2007,45.038095,43.838816,43.84032


In [31]:
# Reshape to one row per county: melt the three metrics into long form,
# label every value '<metric>_<Year>', then pivot those labels into columns.
# NOTE: overwrites oz_year_agg, so this cell cannot be re-run without re-running
# the aggregation cell that creates it.
oz_year_agg = (
    oz_year_agg
    .melt(id_vars=['FIPS_oz', 'Year'],
          value_vars=['O3_max_pred', 'O3_med_pred', 'O3_mean_pred'])
    .assign(col_name=lambda long_df: long_df['variable'] + '_' + long_df['Year'].astype(str))
    .pivot(index='FIPS_oz', columns='col_name', values='value')
    .reset_index()
)

oz_year_agg.head()

col_name,FIPS_oz,O3_max_pred_2001,O3_max_pred_2002,O3_max_pred_2003,O3_max_pred_2004,O3_max_pred_2005,O3_max_pred_2006,O3_max_pred_2007,O3_max_pred_2008,O3_max_pred_2009,...,O3_med_pred_2005,O3_med_pred_2006,O3_med_pred_2007,O3_med_pred_2008,O3_med_pred_2009,O3_med_pred_2010,O3_med_pred_2011,O3_med_pred_2012,O3_med_pred_2013,O3_med_pred_2014
0,1001,42.305986,41.864027,40.068504,39.985884,41.686453,44.96225,45.038095,41.422075,37.122685,...,40.529218,43.681686,43.838816,40.486148,36.221797,41.289135,40.862455,39.302231,36.364677,37.080065
1,1003,46.301171,44.59073,45.091291,45.814931,47.630605,49.781161,47.312026,44.124541,42.082399,...,43.565735,45.922312,43.525804,40.236525,38.505225,42.373774,42.022552,39.985824,37.438556,38.104659
2,1005,42.835619,40.764086,39.282389,40.656445,41.673038,44.833797,44.750438,42.070679,38.118018,...,40.585246,43.720973,43.837824,41.187681,37.31768,41.459194,41.776717,39.300123,36.652613,37.449995
3,1007,41.606691,41.316442,39.441656,39.402323,42.373669,44.924412,44.941542,40.942346,36.03071,...,41.133554,43.764713,43.850361,39.875838,35.059151,39.827738,39.680734,38.873873,35.961584,36.127933
4,1009,43.658069,43.200936,43.007695,41.240142,42.970437,45.503027,46.806477,43.306486,38.76275,...,41.301827,43.677824,45.038788,41.869574,37.462589,42.542686,41.728563,41.646953,37.739042,38.077132


In [32]:
# Reshape to one row per county: melt the three metrics into long form,
# label every value '<metric>_<Year>', then pivot those labels into columns.
# NOTE: overwrites oz_pred_year_agg, so this cell cannot be re-run without
# re-running the aggregation cell that creates it.
oz_pred_year_agg = (
    oz_pred_year_agg
    .melt(id_vars=['FIPS_oz', 'Year'],
          value_vars=['O3_max_pred', 'O3_med_pred', 'O3_mean_pred'])
    .assign(col_name=lambda long_df: long_df['variable'] + '_' + long_df['Year'].astype(str))
    .pivot(index='FIPS_oz', columns='col_name', values='value')
    .reset_index()
)

oz_pred_year_agg.head()

col_name,FIPS_oz,O3_max_pred_2003,O3_max_pred_2004,O3_max_pred_2005,O3_max_pred_2006,O3_max_pred_2007,O3_max_pred_2008,O3_max_pred_2009,O3_max_pred_2010,O3_max_pred_2011,...,O3_med_pred_2007,O3_med_pred_2008,O3_med_pred_2009,O3_med_pred_2010,O3_med_pred_2011,O3_med_pred_2012,O3_med_pred_2013,O3_med_pred_2014,O3_med_pred_2015,O3_med_pred_2016
0,1001,40.068504,39.985884,41.686453,44.96225,45.038095,41.422075,37.122685,42.330859,41.921746,...,43.838816,40.486148,36.221797,41.289135,40.862455,39.302231,36.364677,37.080065,35.994939,39.567964
1,1003,45.091291,45.814931,47.630605,49.781161,47.312026,44.124541,42.082399,46.222049,45.816964,...,43.525804,40.236525,38.505225,42.373774,42.022552,39.985824,37.438556,38.104659,36.873269,38.88646
2,1005,39.282389,40.656445,41.673038,44.833797,44.750438,42.070679,38.118018,42.489218,42.705044,...,43.837824,41.187681,37.31768,41.459194,41.776717,39.300123,36.652613,37.449995,35.4112,38.890084
3,1007,39.441656,39.402323,42.373669,44.924412,44.941542,40.942346,36.03071,41.107728,40.971872,...,43.850361,39.875838,35.059151,39.827738,39.680734,38.873873,35.961584,36.127933,35.970548,39.368079
4,1009,43.007695,41.240142,42.970437,45.503027,46.806477,43.306486,38.76275,44.257177,43.400903,...,45.038788,41.869574,37.462589,42.542686,41.728563,41.646953,37.739042,38.077132,37.006927,41.614854


In [33]:
# Row count of the labelled cancer frame before any merge.

cancer_df.shape[0]

2672

In [34]:
# Row counts of the three frames that feed the prediction dataset,
# printed one per line in this order: cancer labels, PM2.5, ozone.
for pred_input in (null_cancer, pm_pred_agg, oz_pred_agg):
    print(len(pred_input))

3141
3109
3109


In [35]:
# Inner-join the labelled cancer rows with the per-county PM2.5 means;
# only counties present in both frames are kept.
final_df = cancer_df.merge(pm_agg, how='inner', left_on='FIPS', right_on='FIPS_pm')

final_df.head()

Unnamed: 0,FIPS,recent_trend,FIPS_pm,PM25_max_pred,PM25_med_pred,PM25_mean_pred
0,1001,stable,1001,12.239594,11.870589,11.7879
1,1003,stable,1003,11.024418,9.82287,9.794272
2,1005,stable,1005,11.68817,11.099414,11.084238
3,1007,stable,1007,12.280141,11.700512,11.721958
4,1009,stable,1009,13.498799,12.690525,12.741736


In [36]:
# Right join: keep every county that has pollution data, even when there is
# no cancer row for it (cancer nulls are okay in the prediction frame).
pred_df = null_cancer.merge(pm_pred_agg, how='right', left_on='FIPS', right_on='FIPS_pm')

pred_df.head()

Unnamed: 0,FIPS,recent_trend,FIPS_pm,PM25_max_pred,PM25_med_pred,PM25_mean_pred
0,1001,stable,1001,11.923392,11.545831,11.46093
1,1003,stable,1003,10.724676,9.57737,9.549603
2,1005,stable,1005,11.368095,10.766793,10.752574
3,1007,stable,1007,11.830653,11.281247,11.299146
4,1009,stable,1009,12.847922,12.097639,12.144016


In [37]:
# Number of cancer rows that found a PM2.5 match.

final_df.shape[0]

2651

In [38]:
# Row count of the prediction frame after the right join onto PM2.5.
pred_df.shape[0]

3109

In [39]:
# Inner-join the per-county ozone means onto the cancer + PM2.5 frame.
final_df = final_df.merge(oz_agg, how='inner', left_on='FIPS', right_on='FIPS_oz')

final_df.head()

Unnamed: 0,FIPS,recent_trend,FIPS_pm,PM25_max_pred,PM25_med_pred,PM25_mean_pred,FIPS_oz,O3_max_pred,O3_med_pred,O3_mean_pred
0,1001,stable,1001,12.239594,11.870589,11.7879,1001,41.001571,39.948742,39.921898
1,1003,stable,1003,11.024418,9.82287,9.794272,1003,45.070022,41.241604,41.341833
2,1005,stable,1005,11.68817,11.099414,11.084238,1005,41.139416,40.185889,40.162437
3,1007,stable,1007,12.280141,11.700512,11.721958,1007,40.505549,39.399455,39.389288
4,1009,stable,1009,13.498799,12.690525,12.741736,1009,42.714107,41.096201,41.085945


In [40]:
# Inner join on the pollution keys: keep only counties with both PM2.5 and ozone data.
pred_df = pred_df.merge(oz_pred_agg, how='inner', left_on='FIPS_pm', right_on='FIPS_oz')

pred_df.head()

Unnamed: 0,FIPS,recent_trend,FIPS_pm,PM25_max_pred,PM25_med_pred,PM25_mean_pred,FIPS_oz,O3_max_pred,O3_med_pred,O3_mean_pred
0,1001,stable,1001,11.923392,11.545831,11.46093,1001,40.514807,39.489763,39.457052
1,1003,stable,1003,10.724676,9.57737,9.549603,1003,44.482241,40.738241,40.819741
2,1005,stable,1005,11.368095,10.766793,10.752574,1005,40.595697,39.667357,39.633515
3,1007,stable,1007,11.830653,11.281247,11.299146,1007,40.101601,39.013332,39.004118
4,1009,stable,1009,12.847922,12.097639,12.144016,1009,42.326738,40.748248,40.74638


In [41]:
# Number of rows still matched after adding ozone.

final_df.shape[0]

2651

In [42]:
# Row count of the prediction frame after the inner join with ozone.
pred_df.shape[0]

3109

In [43]:
# Distribution of the trend labels among the matched rows.
trend_labels = final_df['recent_trend']

trend_labels.value_counts()

stable     2410
falling     198
rising       43
Name: recent_trend, dtype: int64

In [44]:
# Attach the wide per-year PM2.5 and ozone columns. The key columns already
# present get the default _x/_y suffixes, which a later cell removes.
final_df = (
    final_df
    .merge(pm_year_agg, how='left', left_on='FIPS', right_on='FIPS_pm')
    .merge(oz_year_agg, how='left', left_on='FIPS', right_on='FIPS_oz')
)

final_df.head()

Unnamed: 0,FIPS,recent_trend,FIPS_pm_x,PM25_max_pred,PM25_med_pred,PM25_mean_pred,FIPS_oz_x,O3_max_pred,O3_med_pred,O3_mean_pred,...,O3_med_pred_2005,O3_med_pred_2006,O3_med_pred_2007,O3_med_pred_2008,O3_med_pred_2009,O3_med_pred_2010,O3_med_pred_2011,O3_med_pred_2012,O3_med_pred_2013,O3_med_pred_2014
0,1001,stable,1001,12.239594,11.870589,11.7879,1001,41.001571,39.948742,39.921898,...,40.529218,43.681686,43.838816,40.486148,36.221797,41.289135,40.862455,39.302231,36.364677,37.080065
1,1003,stable,1003,11.024418,9.82287,9.794272,1003,45.070022,41.241604,41.341833,...,43.565735,45.922312,43.525804,40.236525,38.505225,42.373774,42.022552,39.985824,37.438556,38.104659
2,1005,stable,1005,11.68817,11.099414,11.084238,1005,41.139416,40.185889,40.162437,...,40.585246,43.720973,43.837824,41.187681,37.31768,41.459194,41.776717,39.300123,36.652613,37.449995
3,1007,stable,1007,12.280141,11.700512,11.721958,1007,40.505549,39.399455,39.389288,...,41.133554,43.764713,43.850361,39.875838,35.059151,39.827738,39.680734,38.873873,35.961584,36.127933
4,1009,stable,1009,13.498799,12.690525,12.741736,1009,42.714107,41.096201,41.085945,...,41.301827,43.677824,45.038788,41.869574,37.462589,42.542686,41.728563,41.646953,37.739042,38.077132


In [45]:
# Attach the wide per-year PM2.5 and ozone columns, joining on the pollution
# keys (FIPS may be null here for counties without a cancer row).
pred_df = (
    pred_df
    .merge(pm_pred_year_agg, how='left', on='FIPS_pm')
    .merge(oz_pred_year_agg, how='left', on='FIPS_oz')
)

pred_df.head()

Unnamed: 0,FIPS,recent_trend,FIPS_pm,PM25_max_pred,PM25_med_pred,PM25_mean_pred,FIPS_oz,O3_max_pred,O3_med_pred,O3_mean_pred,...,O3_med_pred_2007,O3_med_pred_2008,O3_med_pred_2009,O3_med_pred_2010,O3_med_pred_2011,O3_med_pred_2012,O3_med_pred_2013,O3_med_pred_2014,O3_med_pred_2015,O3_med_pred_2016
0,1001,stable,1001,11.923392,11.545831,11.46093,1001,40.514807,39.489763,39.457052,...,43.838816,40.486148,36.221797,41.289135,40.862455,39.302231,36.364677,37.080065,35.994939,39.567964
1,1003,stable,1003,10.724676,9.57737,9.549603,1003,44.482241,40.738241,40.819741,...,43.525804,40.236525,38.505225,42.373774,42.022552,39.985824,37.438556,38.104659,36.873269,38.88646
2,1005,stable,1005,11.368095,10.766793,10.752574,1005,40.595697,39.667357,39.633515,...,43.837824,41.187681,37.31768,41.459194,41.776717,39.300123,36.652613,37.449995,35.4112,38.890084
3,1007,stable,1007,11.830653,11.281247,11.299146,1007,40.101601,39.013332,39.004118,...,43.850361,39.875838,35.059151,39.827738,39.680734,38.873873,35.961584,36.127933,35.970548,39.368079
4,1009,stable,1009,12.847922,12.097639,12.144016,1009,42.326738,40.748248,40.74638,...,45.038788,41.869574,37.462589,42.542686,41.728563,41.646953,37.739042,38.077132,37.006927,41.614854


In [46]:
# Remove the duplicated join-key columns left behind by the merges,
# then list every remaining column.
duplicate_keys = ['FIPS_oz_x', 'FIPS_oz_y', 'FIPS_pm_x', 'FIPS_pm_y']
final_df = final_df.drop(columns=duplicate_keys)

final_df.columns

Index(['FIPS', 'recent_trend', 'PM25_max_pred', 'PM25_med_pred',
       'PM25_mean_pred', 'O3_max_pred', 'O3_med_pred', 'O3_mean_pred',
       'PM25_max_pred_2001', 'PM25_max_pred_2002', 'PM25_max_pred_2003',
       'PM25_max_pred_2004', 'PM25_max_pred_2005', 'PM25_max_pred_2006',
       'PM25_max_pred_2007', 'PM25_max_pred_2008', 'PM25_max_pred_2009',
       'PM25_max_pred_2010', 'PM25_max_pred_2011', 'PM25_max_pred_2012',
       'PM25_max_pred_2013', 'PM25_max_pred_2014', 'PM25_mean_pred_2001',
       'PM25_mean_pred_2002', 'PM25_mean_pred_2003', 'PM25_mean_pred_2004',
       'PM25_mean_pred_2005', 'PM25_mean_pred_2006', 'PM25_mean_pred_2007',
       'PM25_mean_pred_2008', 'PM25_mean_pred_2009', 'PM25_mean_pred_2010',
       'PM25_mean_pred_2011', 'PM25_mean_pred_2012', 'PM25_mean_pred_2013',
       'PM25_mean_pred_2014', 'PM25_med_pred_2001', 'PM25_med_pred_2002',
       'PM25_med_pred_2003', 'PM25_med_pred_2004', 'PM25_med_pred_2005',
       'PM25_med_pred_2006', 'PM25_med_pred_200

In [47]:
# The right join leaves FIPS null for counties absent from the cancer table,
# so fill it from the PM2.5 key before dropping the helper key columns.
pred_df = (
    pred_df
    .assign(FIPS=pred_df['FIPS_pm'])
    .drop(columns=['FIPS_oz', 'FIPS_pm'])
)

pred_df.columns

Index(['FIPS', 'recent_trend', 'PM25_max_pred', 'PM25_med_pred',
       'PM25_mean_pred', 'O3_max_pred', 'O3_med_pred', 'O3_mean_pred',
       'PM25_max_pred_2003', 'PM25_max_pred_2004', 'PM25_max_pred_2005',
       'PM25_max_pred_2006', 'PM25_max_pred_2007', 'PM25_max_pred_2008',
       'PM25_max_pred_2009', 'PM25_max_pred_2010', 'PM25_max_pred_2011',
       'PM25_max_pred_2012', 'PM25_max_pred_2013', 'PM25_max_pred_2014',
       'PM25_max_pred_2015', 'PM25_max_pred_2016', 'PM25_mean_pred_2003',
       'PM25_mean_pred_2004', 'PM25_mean_pred_2005', 'PM25_mean_pred_2006',
       'PM25_mean_pred_2007', 'PM25_mean_pred_2008', 'PM25_mean_pred_2009',
       'PM25_mean_pred_2010', 'PM25_mean_pred_2011', 'PM25_mean_pred_2012',
       'PM25_mean_pred_2013', 'PM25_mean_pred_2014', 'PM25_mean_pred_2015',
       'PM25_mean_pred_2016', 'PM25_med_pred_2003', 'PM25_med_pred_2004',
       'PM25_med_pred_2005', 'PM25_med_pred_2006', 'PM25_med_pred_2007',
       'PM25_med_pred_2008', 'PM25_med_pred_200

In [48]:
# Final row count; it should be unchanged if the yearly merges introduced no duplicates.

final_df.shape[0]

2651

In [49]:
# Final row count of the prediction frame (same duplicate check as for final_df).
pred_df.shape[0]

3109

In [50]:
# Preview the first five rows of the assembled training frame.
final_df.head(5)

Unnamed: 0,FIPS,recent_trend,PM25_max_pred,PM25_med_pred,PM25_mean_pred,O3_max_pred,O3_med_pred,O3_mean_pred,PM25_max_pred_2001,PM25_max_pred_2002,...,O3_med_pred_2005,O3_med_pred_2006,O3_med_pred_2007,O3_med_pred_2008,O3_med_pred_2009,O3_med_pred_2010,O3_med_pred_2011,O3_med_pred_2012,O3_med_pred_2013,O3_med_pred_2014
0,1001,stable,12.239594,11.870589,11.7879,41.001571,39.948742,39.921898,13.186807,12.490668,...,40.529218,43.681686,43.838816,40.486148,36.221797,41.289135,40.862455,39.302231,36.364677,37.080065
1,1003,stable,11.024418,9.82287,9.794272,45.070022,41.241604,41.341833,12.52019,11.26968,...,43.565735,45.922312,43.525804,40.236525,38.505225,42.373774,42.022552,39.985824,37.438556,38.104659
2,1005,stable,11.68817,11.099414,11.084238,41.139416,40.185889,40.162437,12.364383,11.443971,...,40.585246,43.720973,43.837824,41.187681,37.31768,41.459194,41.776717,39.300123,36.652613,37.449995
3,1007,stable,12.280141,11.700512,11.721958,40.505549,39.399455,39.389288,13.944184,12.781192,...,41.133554,43.764713,43.850361,39.875838,35.059151,39.827738,39.680734,38.873873,35.961584,36.127933
4,1009,stable,13.498799,12.690525,12.741736,42.714107,41.096201,41.085945,16.032861,14.493547,...,41.301827,43.677824,45.038788,41.869574,37.462589,42.542686,41.728563,41.646953,37.739042,38.077132


In [51]:
# Preview the first five rows of the assembled prediction frame.
pred_df.head(5)

Unnamed: 0,FIPS,recent_trend,PM25_max_pred,PM25_med_pred,PM25_mean_pred,O3_max_pred,O3_med_pred,O3_mean_pred,PM25_max_pred_2003,PM25_max_pred_2004,...,O3_med_pred_2007,O3_med_pred_2008,O3_med_pred_2009,O3_med_pred_2010,O3_med_pred_2011,O3_med_pred_2012,O3_med_pred_2013,O3_med_pred_2014,O3_med_pred_2015,O3_med_pred_2016
0,1001,stable,11.923392,11.545831,11.46093,40.514807,39.489763,39.457052,12.869481,13.111945,...,43.838816,40.486148,36.221797,41.289135,40.862455,39.302231,36.364677,37.080065,35.994939,39.567964
1,1003,stable,10.724676,9.57737,9.549603,44.482241,40.738241,40.819741,12.162325,12.081706,...,43.525804,40.236525,38.505225,42.373774,42.022552,39.985824,37.438556,38.104659,36.873269,38.88646
2,1005,stable,11.368095,10.766793,10.752574,40.595697,39.667357,39.633515,11.531937,12.543327,...,43.837824,41.187681,37.31768,41.459194,41.776717,39.300123,36.652613,37.449995,35.4112,38.890084
3,1007,stable,11.830653,11.281247,11.299146,40.101601,39.013332,39.004118,13.16291,13.082518,...,43.850361,39.875838,35.059151,39.827738,39.680734,38.873873,35.961584,36.127933,35.970548,39.368079
4,1009,stable,12.847922,12.097639,12.144016,42.326738,40.748248,40.74638,14.722058,14.622386,...,45.038788,41.869574,37.462589,42.542686,41.728563,41.646953,37.739042,38.077132,37.006927,41.614854


In [52]:
# Left join: keep every labelled cancer row, leaving the PM2.5 columns
# null where no pollution data matches.
null_df = cancer_df.merge(pm_agg, how='left', left_on='FIPS', right_on='FIPS_pm')

null_df.head()

Unnamed: 0,FIPS,recent_trend,FIPS_pm,PM25_max_pred,PM25_med_pred,PM25_mean_pred
0,0,falling,,,,
1,1001,stable,1001.0,12.239594,11.870589,11.7879
2,1003,stable,1003.0,11.024418,9.82287,9.794272
3,1005,stable,1005.0,11.68817,11.099414,11.084238
4,1007,stable,1007.0,12.280141,11.700512,11.721958


In [53]:
# Left-join the ozone aggregates onto null_df, matching FIPS to FIPS_oz;
# rows without a match keep NaN in the O3 columns.
null_df = null_df.merge(
    oz_agg,
    how='left',
    left_on='FIPS',
    right_on='FIPS_oz',
)

null_df.head(n=5)

Unnamed: 0,FIPS,recent_trend,FIPS_pm,PM25_max_pred,PM25_med_pred,PM25_mean_pred,FIPS_oz,O3_max_pred,O3_med_pred,O3_mean_pred
0,0,falling,,,,,,,,
1,1001,stable,1001.0,12.239594,11.870589,11.7879,1001.0,41.001571,39.948742,39.921898
2,1003,stable,1003.0,11.024418,9.82287,9.794272,1003.0,45.070022,41.241604,41.341833
3,1005,stable,1005.0,11.68817,11.099414,11.084238,1005.0,41.139416,40.185889,40.162437
4,1007,stable,1007.0,12.280141,11.700512,11.721958,1007.0,40.505549,39.399455,39.389288


In [54]:
# Row count of null_df; it should equal the number of rows in cancer_df,
# since the joins are left joins on cancer_df.
null_df.shape[0]

2672

In [55]:
# Attach the per-year aggregates with two further left joins.
# null_df already carries FIPS_pm / FIPS_oz, so pandas suffixes the
# clashing key columns with _x (left side) and _y (right side).
null_df = (
    null_df
    .merge(pm_year_agg, how='left', left_on='FIPS', right_on='FIPS_pm')
    .merge(oz_year_agg, how='left', left_on='FIPS', right_on='FIPS_oz')
)

null_df.head(n=5)

Unnamed: 0,FIPS,recent_trend,FIPS_pm_x,PM25_max_pred,PM25_med_pred,PM25_mean_pred,FIPS_oz_x,O3_max_pred,O3_med_pred,O3_mean_pred,...,O3_med_pred_2005,O3_med_pred_2006,O3_med_pred_2007,O3_med_pred_2008,O3_med_pred_2009,O3_med_pred_2010,O3_med_pred_2011,O3_med_pred_2012,O3_med_pred_2013,O3_med_pred_2014
0,0,falling,,,,,,,,,...,,,,,,,,,,
1,1001,stable,1001.0,12.239594,11.870589,11.7879,1001.0,41.001571,39.948742,39.921898,...,40.529218,43.681686,43.838816,40.486148,36.221797,41.289135,40.862455,39.302231,36.364677,37.080065
2,1003,stable,1003.0,11.024418,9.82287,9.794272,1003.0,45.070022,41.241604,41.341833,...,43.565735,45.922312,43.525804,40.236525,38.505225,42.373774,42.022552,39.985824,37.438556,38.104659
3,1005,stable,1005.0,11.68817,11.099414,11.084238,1005.0,41.139416,40.185889,40.162437,...,40.585246,43.720973,43.837824,41.187681,37.31768,41.459194,41.776717,39.300123,36.652613,37.449995
4,1007,stable,1007.0,12.280141,11.700512,11.721958,1007.0,40.505549,39.399455,39.389288,...,41.133554,43.764713,43.850361,39.875838,35.059151,39.827738,39.680734,38.873873,35.961584,36.127933


In [56]:
# Drop the duplicated FIPS key columns left behind by the merges, then list
# the remaining columns.
#
# The result is rebound instead of using inplace=True, and errors='ignore'
# skips labels that are no longer present, so re-running this cell after the
# columns have already been removed does not raise a KeyError.
null_df = null_df.drop(
    columns=['FIPS_oz_x', 'FIPS_oz_y', 'FIPS_pm_x', 'FIPS_pm_y'],
    errors='ignore',
)

null_df.columns

Index(['FIPS', 'recent_trend', 'PM25_max_pred', 'PM25_med_pred',
       'PM25_mean_pred', 'O3_max_pred', 'O3_med_pred', 'O3_mean_pred',
       'PM25_max_pred_2001', 'PM25_max_pred_2002', 'PM25_max_pred_2003',
       'PM25_max_pred_2004', 'PM25_max_pred_2005', 'PM25_max_pred_2006',
       'PM25_max_pred_2007', 'PM25_max_pred_2008', 'PM25_max_pred_2009',
       'PM25_max_pred_2010', 'PM25_max_pred_2011', 'PM25_max_pred_2012',
       'PM25_max_pred_2013', 'PM25_max_pred_2014', 'PM25_mean_pred_2001',
       'PM25_mean_pred_2002', 'PM25_mean_pred_2003', 'PM25_mean_pred_2004',
       'PM25_mean_pred_2005', 'PM25_mean_pred_2006', 'PM25_mean_pred_2007',
       'PM25_mean_pred_2008', 'PM25_mean_pred_2009', 'PM25_mean_pred_2010',
       'PM25_mean_pred_2011', 'PM25_mean_pred_2012', 'PM25_mean_pred_2013',
       'PM25_mean_pred_2014', 'PM25_med_pred_2001', 'PM25_med_pred_2002',
       'PM25_med_pred_2003', 'PM25_med_pred_2004', 'PM25_med_pred_2005',
       'PM25_med_pred_2006', 'PM25_med_pred_200

In [57]:
# Row count of null_df once more; it should still equal the number of rows
# in the cancer data.
null_df.shape[0]

2672

In [58]:
# Preview the first five rows of null_df.
null_df.head(n=5)

Unnamed: 0,FIPS,recent_trend,PM25_max_pred,PM25_med_pred,PM25_mean_pred,O3_max_pred,O3_med_pred,O3_mean_pred,PM25_max_pred_2001,PM25_max_pred_2002,...,O3_med_pred_2005,O3_med_pred_2006,O3_med_pred_2007,O3_med_pred_2008,O3_med_pred_2009,O3_med_pred_2010,O3_med_pred_2011,O3_med_pred_2012,O3_med_pred_2013,O3_med_pred_2014
0,0,falling,,,,,,,,,...,,,,,,,,,,
1,1001,stable,12.239594,11.870589,11.7879,41.001571,39.948742,39.921898,13.186807,12.490668,...,40.529218,43.681686,43.838816,40.486148,36.221797,41.289135,40.862455,39.302231,36.364677,37.080065
2,1003,stable,11.024418,9.82287,9.794272,45.070022,41.241604,41.341833,12.52019,11.26968,...,43.565735,45.922312,43.525804,40.236525,38.505225,42.373774,42.022552,39.985824,37.438556,38.104659
3,1005,stable,11.68817,11.099414,11.084238,41.139416,40.185889,40.162437,12.364383,11.443971,...,40.585246,43.720973,43.837824,41.187681,37.31768,41.459194,41.776717,39.300123,36.652613,37.449995
4,1007,stable,12.280141,11.700512,11.721958,40.505549,39.399455,39.389288,13.944184,12.781192,...,41.133554,43.764713,43.850361,39.875838,35.059151,39.827738,39.680734,38.873873,35.961584,36.127933


In [59]:
# Keep the database connection details out of GitHub by loading them from a
# CSV file rather than writing them in the notebook.

conn_path = "/content/drive/MyDrive/Data/db_conn_info.csv"

conn_info = pd.read_csv(conn_path)

# For each connection field, take the value in the row labelled 0.
db_name, db_pw, db_host, db_port, db_user = [
    conn_info[field][0]
    for field in ('db_name', 'password', 'host', 'port', 'user')
]

In [60]:
from urllib.parse import quote

from sqlalchemy import create_engine

# SQLAlchemy database URLs have the form
#     dialect://username:password@host:port/database
# so the user goes before the password and the database name goes last.
# (With db_name and db_user swapped, the URL only works when the two values
# happen to be identical.)
#
# The password is percent-encoded so that characters such as '@', ':' or '/'
# are not mistaken for URL delimiters; str() guards against a purely numeric
# password having been parsed as a number by read_csv.
encoded_pw = quote(str(db_pw), safe='')

engine = create_engine(
    f'postgresql://{db_user}:{encoded_pw}@{db_host}:{db_port}/{db_name}'
)

In [61]:
# Upload final_df to the database behind `engine` as table 'model_dataset',
# replacing the table if it already exists.
final_df.to_sql(name='model_dataset', con=engine, if_exists='replace')

In [62]:
# Upload null_df to the database behind `engine` as table
# 'null_model_dataset', replacing the table if it already exists.
null_df.to_sql(name='null_model_dataset', con=engine, if_exists='replace')

In [63]:
# Upload pred_df to the database behind `engine` as table
# 'prediction_model_dataset', replacing the table if it already exists.
pred_df.to_sql(name='prediction_model_dataset', con=engine, if_exists='replace')