# Data Merging Notebook

#### This notebook merges data from the 20_intermediate_files directory and saves a merged dataset for final analysis. We use a notebook for this instead of a script for interpretability across teammates and easier error checking.

#### Imports

In [178]:
import pandas as pd
import numpy as np


#### Import datasets

In [179]:
# Directory that holds the intermediate files to be merged
working_dir = "../20_intermediate_files/"

# Location of each input CSV (directory prefix followed by the file name)
census = f"{working_dir}census_df.csv"
fips = f"{working_dir}fips_df.csv"
vital = f"{working_dir}vital_clean.csv"
wapo = f"{working_dir}wapo_clean.csv"


In [180]:
# Input paths in load order: census, FIPS, vital stats, WAPO
data_sets = [census, fips, vital, wapo]

# Read every CSV into its own DataFrame (same order as data_sets)
census_df, fips_df, vital_df, wapo_df = (
    pd.read_csv(csv_path) for csv_path in data_sets
)


#### Merge Prep

In [181]:
# Preview the inputs one at a time, starting with the census data.
# Uncomment the line below to display five random rows.
# census_df.sample(5)


In [182]:
# FIPS codes: uncomment the line below to display five random rows.
# fips_df.sample(5)


In [183]:
# Parse YEAR from "YYYY-MM-DD" text into datetime values
vital_df["YEAR"] = pd.to_datetime(vital_df["YEAR"], format="%Y-%m-%d")

# Merge key = county name directly followed by the year's string form,
# e.g. "LAKE COUNTY, FL2011-01-01"
vital_df["merge_key"] = vital_df["COUNTY_NAME"].add(vital_df["YEAR"].astype(str))

# Show a random handful of vital-stats rows
vital_df.sample(n=5)


Unnamed: 0,COUNTY_NAME,FIPS,YEAR,DRUG/ALCOHOL INDUCED CAUSE,DRUG/ALCOHOL INDUCED CAUSE CODE,DEATHS,STATE,merge_key
4974,"LAKE COUNTY, FL",12069,2011-01-01,All other non-drug and non-alcohol causes,O9,3312.0,FL,"LAKE COUNTY, FL2011-01-01"
4590,"JASPER COUNTY, TX",48241,2005-01-01,All other non-drug and non-alcohol causes,O9,369.0,TX,"JASPER COUNTY, TX2005-01-01"
6529,"BRADFORD COUNTY, FL",12007,2004-01-01,All other non-drug and non-alcohol causes,O9,252.0,FL,"BRADFORD COUNTY, FL2004-01-01"
308,"NOWATA COUNTY, OK",40105,2009-01-01,All other non-drug and non-alcohol causes,O9,144.0,OK,"NOWATA COUNTY, OK2009-01-01"
10385,"KING COUNTY, WA",53033,2013-01-01,All other non-drug and non-alcohol causes,O9,11941.0,WA,"KING COUNTY, WA2013-01-01"


In [184]:
# Build the same county-name + year key on the WAPO frame.
# NOTE(review): YEAR is used as loaded here (no datetime conversion), so the
# keys only match the vital-stats keys if it is already formatted like
# "2009-01-01" -- confirm against wapo_clean.csv.
wapo_df["merge_key"] = wapo_df["COUNTY_NAME"].add(wapo_df["YEAR"].astype(str))

# WAPO Data
# wapo_df.sample(5)

#### Merging

***The Vital Stats dataframe provides the foundation we'll need for merging the other data, so we make a copy and call that our base dataframe.***

In [166]:
# Work on an independent (deep) copy so vital_df itself stays untouched
base_df = vital_df.copy(deep=True)


***Merge the WAPO DF first***

In [167]:
# Left-join the WAPO rows onto the vital-stats rows via merge_key.
# The merge starts from vital_df (the frame base_df was copied from) instead of
# from base_df itself, so re-running this cell rebuilds the same result rather
# than stacking a second WAPO merge on an already-merged frame.
# how="left" keeps every vital-stats row; rows without a WAPO match get NaN in
# the WAPO columns.
base_df = vital_df.merge(wapo_df, on="merge_key", how="left")


In [168]:
# Remove the WAPO-side duplicates of county and state that the merge suffixed
# with "_y"
base_df.drop(["COUNTY_NAME_y", "STATE_y"], axis="columns", inplace=True)

# Give the remaining suffixed columns their final names and label the WAPO
# quantity as a shipment quantity
base_df.rename(
    {
        "COUNTY_NAME_x": "COUNTY_NAME",
        "STATE_x": "STATE",
        "YEAR_x": "YEAR",
        "YEAR_y": "YEAR_wapo",
        "QUANTITY": "SHIP_QUANTITY",
    },
    axis="columns",
    inplace=True,
)

base_df.head()


Unnamed: 0,COUNTY_NAME,FIPS,YEAR,DRUG/ALCOHOL INDUCED CAUSE,DRUG/ALCOHOL INDUCED CAUSE CODE,DEATHS,STATE,merge_key,YEAR_wapo,MONTH,SHIP_QUANTITY
0,"AUTAUGA COUNTY, AL",1001,2009-01-01,All other non-drug and non-alcohol causes,O9,408.0,AL,"AUTAUGA COUNTY, AL2009-01-01",2009-01-01,1.0,90.0
1,"AUTAUGA COUNTY, AL",1001,2009-01-01,All other non-drug and non-alcohol causes,O9,408.0,AL,"AUTAUGA COUNTY, AL2009-01-01",2009-01-01,2.0,2429.0
2,"AUTAUGA COUNTY, AL",1001,2009-01-01,All other non-drug and non-alcohol causes,O9,408.0,AL,"AUTAUGA COUNTY, AL2009-01-01",2009-01-01,3.0,2755.0
3,"AUTAUGA COUNTY, AL",1001,2009-01-01,All other non-drug and non-alcohol causes,O9,408.0,AL,"AUTAUGA COUNTY, AL2009-01-01",2009-01-01,4.0,1142.0
4,"AUTAUGA COUNTY, AL",1001,2009-01-01,All other non-drug and non-alcohol causes,O9,408.0,AL,"AUTAUGA COUNTY, AL2009-01-01",2009-01-01,5.0,2972.0


In [185]:
# Column order for the working frame (merge_key is not carried forward)
cols = [
    "COUNTY_NAME", "STATE", "FIPS",
    "MONTH", "YEAR", "YEAR_wapo",
    "DRUG/ALCOHOL INDUCED CAUSE", "DRUG/ALCOHOL INDUCED CAUSE CODE",
    "DEATHS", "SHIP_QUANTITY",
]

# Select those columns, in that order, into a new frame
new_df = base_df.loc[:, cols]


# Houston, we have a problem: some rows have no matching WAPO year


In [188]:
# new_df.sample(10)

# Compare the year distribution on both sides of the merge.
# A cell renders only its final expression, so the first table is passed to
# display() (provided by the notebook kernel); as a bare expression its result
# was discarded.
display(new_df["YEAR"].value_counts(dropna=False, sort=True, ascending=True))

# NaN in this table counts the rows that found no WAPO match in the left merge.
new_df["YEAR_wapo"].value_counts(dropna=False, sort=True, ascending=True)

NaN           5042
2007-01-01    8927
2006-01-01    9039
2008-01-01    9140
2009-01-01    9187
2010-01-01    9252
2011-01-01    9347
2012-01-01    9409
Name: YEAR_wapo, dtype: int64

***Merge the Census Data Next***

In [None]:
# Left-join the census columns onto the reordered frame by county name.
# The input is rebuilt from base_df[cols] (the expression new_df was created
# from) so that re-running this cell cannot merge the census data a second
# time onto an already-merged new_df.
# NOTE(review): rows multiply if census_df has more than one row per
# COUNTY_NAME, which would inflate the later sums -- confirm the key is unique.
new_df = base_df[cols].merge(census_df, on="COUNTY_NAME", how="left")


In [None]:
# Both frames carried a STATE column, so the merge suffixed them:
# discard the census-side copy ...
new_df.drop("STATE_y", axis="columns", inplace=True)

# ... and restore the original name on the remaining one
new_df.rename({"STATE_x": "STATE"}, axis="columns", inplace=True)

new_df.sample(n=10)


#### Collapse to yearly data

In [None]:
# group the data by county and year and sum the DEATHS and SHIP QUANTITY columns
grouped_df = new_df.groupby(
    [
        "COUNTY_NAME",
        "STATE",
        "YEAR",
        "YEAR_wapo",
        "DRUG/ALCOHOL INDUCED CAUSE",
        "DRUG/ALCOHOL INDUCED CAUSE CODE",
    ]
).agg(
    {
        "DEATHS": "sum",
        "SHIP_QUANTITY": "sum",
        "POPULATION": "mean",
    }
)


In [None]:
# Spot-check ten random rows of the grouped frame
grouped_df.sample(n=10)


#### Add columns for per capita rates

In [None]:
# Deaths per 100,000 people: deaths divided by population, scaled by 100,000
grouped_df["DEATH_RATE"] = (
    grouped_df["DEATHS"].div(grouped_df["POPULATION"]).mul(100_000)
)

In [None]:
# Shipments per 100,000 people: shipped quantity divided by population,
# scaled by 100,000
grouped_df["SHIP_RATE"] = (
    grouped_df["SHIP_QUANTITY"].div(grouped_df["POPULATION"]).mul(100_000)
)

In [None]:
# Spot-check ten random rows now that the rate columns are present
grouped_df.sample(n=10)

#### Export the final grouped df

In [None]:
# Export the grouped frame to the 20_intermediate_files directory (currently disabled).
# NOTE(review): grouped_df comes from groupby(...).agg(...), so the county /
# year / cause keys are held in the index; writing with index=False would
# leave them out of the CSV. Confirm before enabling (e.g. call reset_index()
# first).
# grouped_df.to_csv("../20_intermediate_files/analysis_df.csv", index=False)