In [1]:
# import dependencies
%matplotlib notebook
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

In [2]:
# import source data files

deaths_2013_2014 = pd.read_csv("raw_data/OD_2013_2014.csv", thousands=',')
deaths_2015 = pd.read_csv("raw_data/OD_2015.csv", thousands=',')
deaths_2016 = pd.read_csv("raw_data/OD_2016.csv", thousands=',')
deaths_2017 = pd.read_csv("raw_data/OD_2017.csv", thousands=',')

regions = pd.read_csv("raw_data/regions.csv", thousands=',')

# reduce dataframes to include only state and death statistic
#############THESE ARE THE OVERDOSE BY STATE DATA FRAMES *********************
deaths_2013_df = deaths_2013_2014[['State', '2013Number']]
deaths_2013_df = deaths_2013_df.rename(index=str, columns={"2013Number": "2013Deaths"})
# deaths_2013_df["2013Deaths"] = pd.to_numeric(deaths_2013_df["2013Deaths"])
deaths_2014_df = deaths_2013_2014[['State', '2014Number']]
deaths_2014_df = deaths_2014_df.rename(index=str, columns={"2014Number": "2014Deaths"})
deaths_2015_df = deaths_2015[['State', 'Number']]
deaths_2015_df = deaths_2015_df.rename(index=str, columns={"Number": "2015Deaths"})
deaths_2016_df = deaths_2016[['State', 'number']]
deaths_2016_df = deaths_2016_df.rename(index=str, columns={"number": "2016Deaths"})
deaths_2017_df = deaths_2017[['State', 'number']]
deaths_2017_df = deaths_2017_df.rename(index=str, columns={"number": "2017Deaths"})

deaths_2013_df.head()

Unnamed: 0,State,2013Deaths
0,ND,20
1,NE,117
2,SD,55
3,IA,275
4,TX,2446


In [3]:
# import perscription data
rx_2013 = pd.read_csv("raw_data/RX_2013.csv")
rx_2014 = pd.read_csv("raw_data/RX_2014.csv")
rx_2015 = pd.read_csv("raw_data/RX_2015.csv")
rx_2016 = pd.read_csv("raw_data/RX_2016.csv")
rx_2017 = pd.read_csv("raw_data/RX_2017.csv")

# reduce dataframes to include only state and death statistic 
rx_2013_df = rx_2013[['State ABBR', '2013 Prescribing Rate']]
rx_2013_df = rx_2013_df.rename(index=str, columns={"State ABBR": "State", "2013 Prescribing Rate": "2013_RX_rate"})
rx_2014_df = rx_2014[['State ABBR', '2014 Prescribing Rate']]
rx_2014_df = rx_2014_df.rename(index=str, columns={"State ABBR": "State", "2014 Prescribing Rate": "2014_RX_rate"})
rx_2015_df = rx_2015[['State ABBR', '2015 Prescribing Rate']]
rx_2015_df = rx_2015_df.rename(index=str, columns={"State ABBR": "State", "2015 Prescribing Rate": "2015_RX_rate"})
rx_2016_df = rx_2016[['State ABBR', '2016 Prescribing Rate']]
rx_2016_df = rx_2016_df.rename(index=str, columns={"State ABBR": "State", "2016 Prescribing Rate": "2016_RX_rate"})
rx_2017_df = rx_2017[['Abbreviation', 'Year 2017']].drop([51], axis=0).reset_index(drop=True)
rx_2017_df = rx_2017_df.rename(index=str, columns={"Abbreviation": "State", "Year 2017": "2017_RX_rate"})

# drop last row in rx_2017 becuase it is a duplicate entry (WY)
# rx_2017_df = rx_2017_df.drop([51], axis=0).reset_index()

#######THESE ARE THE PERSCRIPTION BY STATE DATA FRAMES*******************
# rx_2013_df
# rx_2014_df
# rx_2015_df
# rx_2016_df
# rx_2017_df


rx_2017_df.head()

Unnamed: 0,State,2017_RX_rate
0,AL,107.2
1,AK,52.0
2,AZ,61.2
3,AR,105.4
4,CA,39.5


In [9]:
# import population data
population2010_2017 = pd.read_csv("raw_data/Pop_2013-2017.csv", thousands=',')
# remove first two rows because they are insignificant to the dataframe
population2010_2017_a = population2010_2017.drop([0,1,2,3,4,56,57,58,59,60,61,62], axis=0).reset_index(drop=True)
# rename column headers
population2010_2017_a = population2010_2017_a.rename(index=str, columns={"Unnamed: 0": "State", "2013": "2013_Pop", "2014": "2014_Pop", "2015": "2015_Pop", "2016": "2016_Pop", "2017": "2017_Pop"})
# create dataframe with new column names
pop_2010_2017_df = pd.DataFrame(population2010_2017_a)
pop_2010_2017_df = pop_2010_2017_df.replace({'.Alabama': 'AL',
                                             '.Alaska': 'AK', 
                                             '.Arizona': 'AZ',
                                             '.Arkansas': 'AR',
                                             '.California': 'CA',
                                             '.Colorado': 'CO',
                                             '.Connecticut': 'CT',
                                             '.Delaware': 'DE',
                                             '.District of Columbia': 'DC',
                                             '.Florida': 'FL',
                                             '.Georgia': 'GA',
                                             '.Hawaii': 'HI',
                                             '.Idaho': 'ID',
                                             '.Illinois': 'IL',
                                             '.Indiana': 'IN',
                                             '.Iowa': 'IA',
                                             '.Kansas': 'KS',
                                             '.Kentucky': 'KY',
                                             '.Louisiana':'LA',
                                             '.Maine': 'ME',
                                             '.Maryland':'MD',
                                             '.Massachusetts':'MA',
                                             '.Michigan':'MI',
                                             '.Minnesota':'MN',
                                             '.Mississippi':'MS',
                                             '.Missouri':'MO',
                                             '.Montana':'MT',
                                             '.Nebraska':'NE',
                                             '.Nevada':'NV',
                                             '.New Hampshire':'NH',
                                             '.New Jersey':'NJ',
                                             '.New Mexico':'NM',
                                             '.New York':'NY',
                                             '.North Carolina':'NC',
                                             '.North Dakota':'ND',
                                             '.Ohio':'OH',
                                             '.Oklahoma':'OK',
                                             '.Oregon':'OR',
                                             '.Pennsylvania':'PA',
                                             '.Rhode Island':'RI',
                                             '.South Carolina':'SC',
                                             '.South Dakota':'SD',
                                             '.Tennessee':'TN',
                                             '.Texas':'TX',
                                             '.Utah':'UT',
                                             '.Vermont':'VT',
                                             '.Virginia':'VA',
                                             '.Washington':'WA',
                                             '.West Virginia':'WV',
                                             '.Wisconsin':'WI',
                                             '.Wyoming':'WY'})
######THIS IS THE POPULATION DATA FRAME******************************
# pop_2010_2017_df

# merge regions
pop_2010_2017_df = pop_2010_2017_df.merge(regions)

pop_2010_2017_df.head()

MergeError: No common columns to perform merge on. Merge options: left_on=None, right_on=None, left_index=False, right_index=False

In [10]:
# merge population_df, rx_dfs, and deaths_dfs (prd = population rx deaths)
# prd_df = pop_2010_2017_df.merge(rx_2013_df).merge(deaths_2013_df).merge(rx_2014_df).merge(deaths_2014_df).merge(rx_2015_df).merge(deaths_2015_df).merge(rx_2016_df).merge(deaths_2016_df).merge(rx_2017_df).merge(deaths_2017_df)

# merge 2013 tables
prd_13_df = pop_2010_2017_df.merge(rx_2013_df).merge(deaths_2013_df)
# merge 2013-2014 tables
prd_14_df = prd_13_df.merge(rx_2014_df).merge(deaths_2014_df)
# merge 2013-2015 tables
prd_15_df = prd_14_df.merge(rx_2015_df).merge(deaths_2015_df)
# merge 2013-2016 tables
prd_16_df = prd_15_df.merge(rx_2016_df).merge(deaths_2016_df)
# merge 2013-2017 tables
prd_17_df = prd_16_df.merge(rx_2017_df).merge(deaths_2017_df)


prd_17_df.head()

MergeError: No common columns to perform merge on. Merge options: left_on=None, right_on=None, left_index=False, right_index=False

In [None]:
# change column locations and create the final consolidated dataframe called "master_df"
master_df = pd.DataFrame(prd_17_df[['State','Region',
                                    '2013_Pop','2013_RX_rate','2013Deaths',
                                    '2014_Pop','2014_RX_rate','2014Deaths',
                                    '2015_Pop','2015_RX_rate','2015Deaths',
                                    '2016_Pop','2016_RX_rate','2016Deaths',
                                    '2017_Pop','2017_RX_rate','2017Deaths']])

# convert population column values to integers for downstream statistical analysis
master_df['2013_Pop'] = master_df['2013_Pop'].apply(int)
master_df['2014_Pop'] = master_df['2014_Pop'].apply(int)
master_df['2015_Pop'] = master_df['2015_Pop'].apply(int)
master_df['2016_Pop'] = master_df['2016_Pop'].apply(int)
master_df['2017_Pop'] = master_df['2017_Pop'].apply(int)
master_df.head()

In [None]:
# normalize deaths by dividing by 100,000
master_df['2013Deaths'] = master_df['2013Deaths'] / (master_df['2013_Pop'] / 100000)
master_df['2014Deaths'] = master_df['2014Deaths'] / (master_df['2014_Pop'] / 100000)
master_df['2015Deaths'] = master_df['2015Deaths'] / (master_df['2015_Pop'] / 100000)
master_df['2016Deaths'] = master_df['2016Deaths'] / (master_df['2016_Pop'] / 100000)
master_df['2017Deaths'] = master_df['2017Deaths'] / (master_df['2017_Pop'] / 100000)

In [None]:
# subset master dataframe into regional data frames by economic region
southeast = master_df[master_df.Region == 'Southeast'].reset_index(drop=True)
southwest = master_df[master_df.Region == 'Southwest'].reset_index(drop=True)
farwest = master_df[master_df.Region == 'Far West'].reset_index(drop=True)
rockmount = master_df[master_df.Region == 'Rocky Mountain'].reset_index(drop=True)
plains = master_df[master_df.Region == 'Plains'].reset_index(drop=True)
greatlakes = master_df[master_df.Region == 'Great Lakes'].reset_index(drop=True)
mideast = master_df[master_df.Region == 'Mideast'].reset_index(drop=True)
newengland = master_df[master_df.Region == 'New England'].reset_index(drop=True)



newengland

In [None]:
# create descriptive stats summary table

master_range_max = master_df.max()
master_range_min = master_df.min()
master_mean = master_df.mean().apply(int)
master_median = master_df.median().apply(int)
master_stdev = master_df.std()
master_var = master_df.var().apply(int)
master_count = master_df.count()
master_sum = master_df.sum()
master_df.describe()
# master_sum_stats_df = pd.DataFrame(master_df.describe())
# master_sum_stats_df
master_stdev

In [None]:
# create table of correlations
master_df.corr()

In [None]:
# create odds ratios by region (odds of death from overdose from target region compared to all other regions)
# calculate deaths by region, from 2013 through 2017
deaths_southeast = southeast[['2013Deaths', '2014Deaths', '2015Deaths', '2016Deaths', '2017Deaths']].apply(sum)
total_deaths_southeast = deaths_southeast.sum()

deaths_southwest = southwest[['2013Deaths', '2014Deaths', '2015Deaths', '2016Deaths', '2017Deaths']].apply(sum)
total_deaths_southwest = deaths_southwest.sum()

deaths_farwest = farwest[['2013Deaths', '2014Deaths', '2015Deaths', '2016Deaths', '2017Deaths']].apply(sum)
total_deaths_farwest = deaths_farwest.sum()

deaths_rockmount = rockmount[['2013Deaths', '2014Deaths', '2015Deaths', '2016Deaths', '2017Deaths']].apply(sum)
total_deaths_rockmount = deaths_rockmount.sum()

deaths_plains = plains[['2013Deaths', '2014Deaths', '2015Deaths', '2016Deaths', '2017Deaths']].apply(sum)
total_deaths_plains = deaths_plains.sum()

deaths_greatlakes = greatlakes[['2013Deaths', '2014Deaths', '2015Deaths', '2016Deaths', '2017Deaths']].apply(sum)
total_deaths_greatlakes = deaths_greatlakes.sum()

deaths_mideast = mideast[['2013Deaths', '2014Deaths', '2015Deaths', '2016Deaths', '2017Deaths']].apply(sum)
total_deaths_mideast = deaths_mideast.sum()

deaths_newengland = newengland[['2013Deaths', '2014Deaths', '2015Deaths', '2016Deaths', '2017Deaths']].apply(sum)
total_deaths_newengland = deaths_newengland.sum()

# calculate total population by region, from 2013 through 2017 
pop_southeast = southeast[['2013_Pop', '2014_Pop','2015_Pop','2016_Pop','2017_Pop',]].apply(sum)
total_pop_southeast = pop_southeast.sum()

pop_southwest = southwest[['2013_Pop', '2014_Pop','2015_Pop','2016_Pop','2017_Pop',]].apply(sum)
total_pop_southwest = pop_southwest.sum()

pop_farwest = farwest[['2013_Pop', '2014_Pop','2015_Pop','2016_Pop','2017_Pop',]].apply(sum)
total_pop_farwest = pop_farwest.sum()

pop_rockmount = rockmount[['2013_Pop', '2014_Pop','2015_Pop','2016_Pop','2017_Pop',]].apply(sum)
total_pop_rockmount = pop_rockmount.sum()

pop_plains = plains[['2013_Pop', '2014_Pop','2015_Pop','2016_Pop','2017_Pop',]].apply(sum)
total_pop_plains = pop_plains.sum()

pop_greatlakes = greatlakes[['2013_Pop', '2014_Pop','2015_Pop','2016_Pop','2017_Pop',]].apply(sum)
total_pop_greatlakes = pop_greatlakes.sum()

pop_mideast = mideast[['2013_Pop', '2014_Pop','2015_Pop','2016_Pop','2017_Pop',]].apply(sum)
total_pop_mideast = pop_mideast.sum()

pop_newengland = newengland[['2013_Pop', '2014_Pop','2015_Pop','2016_Pop','2017_Pop',]].apply(sum)
total_pop_newengland = pop_newengland.sum()

# calculate combined population for all regions, for entire period of analysis
combined_pop = total_pop_southwest + total_pop_southeast + total_pop_farwest + total_pop_rockmount + total_pop_plains + total_pop_greatlakes + total_pop_mideast + total_pop_newengland
combined_pop

# calculate nots for each region in the odds table
death_not_southeast = total_pop_southeast - total_deaths_southeast
death_not_southwest = total_pop_southwest - total_deaths_southwest
death_not_farwest = total_pop_farwest - total_deaths_farwest
death_not_rockmount = total_pop_rockmount - total_deaths_rockmount
death_not_plains = total_pop_plains - total_deaths_plains
death_not_greatlakes = total_pop_greatlakes - total_deaths_greatlakes
death_not_mideast = total_pop_mideast - total_deaths_mideast
death_not_newengland = total_pop_newengland - total_deaths_newengland

# calculate not nots for all other regions in odds table

combined_not_deaths = death_not_southeast + death_not_southwest + death_not_farwest + death_not_rockmount + death_not_plains + death_not_greatlakes + death_not_mideast + death_not_newengland

# calculate combined deaths

combined_deaths = total_deaths_southeast + total_deaths_southwest + total_deaths_farwest + total_deaths_rockmount + total_deaths_plains + total_deaths_greatlakes + total_deaths_mideast + total_deaths_newengland
combined_deaths

In [None]:
# create odds ratio table by target region, compared to all other regions for deaths during 2013 - 2017 time period; algorithm is: 
# odds_x = (total_deaths_x * (combined_not_deaths - death_not_x) / (death_not_x * combined_deaths - total_deaths_x) )
odds_southeast = (total_deaths_southeast * (combined_not_deaths - death_not_southeast) / (death_not_southeast * combined_deaths - total_deaths_southeast) )
odds_southwest = (total_deaths_southwest * (combined_not_deaths - death_not_southwest) / (death_not_southwest * combined_deaths - total_deaths_southwest) )
odds_farwest = (total_deaths_farwest * (combined_not_deaths - death_not_farwest) / (death_not_farwest * combined_deaths - total_deaths_farwest) )
odds_rockmount = (total_deaths_rockmount * (combined_not_deaths - death_not_rockmount) / (death_not_rockmount * combined_deaths - total_deaths_rockmount) )
odds_plains = (total_deaths_plains * (combined_not_deaths - death_not_plains) / (death_not_plains * combined_deaths - total_deaths_plains) )
odds_greatlakes = (total_deaths_greatlakes * (combined_not_deaths - death_not_greatlakes) / (death_not_greatlakes * combined_deaths - total_deaths_greatlakes) )
odds_mideast = (total_deaths_mideast * (combined_not_deaths - death_not_mideast) / (death_not_mideast * combined_deaths - total_deaths_mideast) )
odds_newengland = (total_deaths_newengland * (combined_not_deaths - death_not_newengland) / (death_not_newengland * combined_deaths - total_deaths_newengland) )

odds_ratios = [odds_southeast, odds_southwest, odds_farwest, odds_rockmount, odds_plains, odds_greatlakes, odds_mideast, odds_newengland]
odds_ratios

In [None]:
odds_ratio_df = pd.DataFrame({
    'Region': ['Southeast', 'Southwest', 'Far West', 'Rockies', 'Plains', 'Great Lakes', 'Mideast', 'New England'],
    'Odds for Death': [odds_southeast, odds_southwest, odds_farwest, odds_rockmount, odds_plains, odds_greatlakes, odds_mideast, odds_newengland]
})
odds_ratio_df = odds_ratio_df.sort_values(['Odds for Death'], ascending=False).reset_index(drop=True)

odds_ratio_df

In [None]:
#Create a (multiple) bar graph of deaths and rx totals