# Data Merging Notebook

#### This notebook merges data from the 20_intermediate_files directory and saves a merged dataset for final analysis. We use a notebook for this instead of a script for interpretability across teammates and easier error checking.

#### Imports

In [46]:
import pandas as pd
import numpy as np


#### Import datasets

In [47]:
# set directory
working_dir = "../20_intermediate_files/"

# set file paths
census = working_dir + "census_df.csv"
fips = working_dir + "fips_df.csv"
vital = working_dir + "vital_clean.csv"
wapo = working_dir + "wapo_clean.csv"


In [48]:
# read in data
census_df = pd.read_csv(census)
fips_df = pd.read_csv(fips, dtype={"FIPS": str, "STATE_FIPS": str})
vital_df = pd.read_csv(vital, dtype={"FIPS": str})
wapo_df = pd.read_csv(wapo, dtype={"YEAR":str, "MONTH":str})


#### Merge Prep

In [52]:
# take a look at the data starting with census
# census_df.sample(5)


In [51]:
# fips_df.sample(5)

In [55]:
# FIPS codes
# fips_df.dtypes
# fips_df.sample(5)


In [56]:
# vital df year as datetime
# vital_df["YEAR"] = pd.to_datetime(vital_df["YEAR"], format="%Y-%m-%d")

# Add a column for the merge key: County_name+year
vital_df["merge_key"] = vital_df["COUNTY_NAME"] + vital_df["YEAR"].astype(str)

# Vital Stats
vital_df.sample(5)


Unnamed: 0,COUNTY_NAME,FIPS,YEAR,STATE,DRUG,DEATHS,merge_key
75,"BLOUNT COUNTY, AL",1009,2008,AL,1,15,"BLOUNT COUNTY, AL2008"
592,"JACKSON COUNTY, OR",41029,2008,OR,1,14,"JACKSON COUNTY, OR2008"
868,"MIAMI-DADE COUNTY, FL",12086,2012,FL,1,145,"MIAMI-DADE COUNTY, FL2012"
284,"COMAL COUNTY, TX",48091,2003,TX,1,10,"COMAL COUNTY, TX2003"
980,"OKALOOSA COUNTY, FL",12091,2003,FL,1,20,"OKALOOSA COUNTY, FL2003"


In [61]:
vital_df.dtypes

COUNTY_NAME    object
FIPS           object
YEAR            int64
STATE          object
DRUG            int64
DEATHS          int64
merge_key      object
dtype: object

In [62]:
# Add the same merge key
wapo_df["merge_key"] = wapo_df["COUNTY_NAME"] + wapo_df["YEAR"].astype(str)

# WAPO Data
wapo_df.sample(5)

Unnamed: 0,COUNTY_NAME,STATE,YEAR,MONTH,QUANTITY,merge_key
3733,"BRYAN COUNTY, OK",OK,2009,6,974.0,"BRYAN COUNTY, OK2009"
5441,"CHARLOTTE COUNTY, FL",FL,2012,2,4976.0,"CHARLOTTE COUNTY, FL2012"
42018,"YAKIMA COUNTY, WA",WA,2009,2,2955.0,"YAKIMA COUNTY, WA2009"
8059,"COMAL COUNTY, TX",TX,2012,9,1602.0,"COMAL COUNTY, TX2012"
19451,"JACKSON COUNTY, FL",FL,2010,7,924.0,"JACKSON COUNTY, FL2010"


#### Merging

***We identify that the Vital Stats Dataframe has the base foundation well need to merge data, so we make a copy and call that our base dataframe.***

In [68]:
base_df = vital_df.copy()


***Merge the WAPO DF first***

In [70]:
# merge the vital stats df
merge_df = base_df.merge(wapo_df, on="merge_key", how="outer")


In [71]:
# Drop the COUNTY_NAME_wapo and STATE_wapo columns
merge_df.drop(columns=["COUNTY_NAME_y", "STATE_y"], inplace=True)

# Rename the columns
merge_df.rename(
    columns={
        "COUNTY_NAME_x": "COUNTY_NAME",
        "STATE_x": "STATE",
        "YEAR_x": "YEAR",
        "YEAR_y": "YEAR_wapo",
        "QUANTITY": "SHIP_QUANTITY",
    },
    inplace=True,
)

# Change Year to string
merge_df["YEAR"] = base_df["YEAR"].astype(str)

merge_df.head()


Unnamed: 0,COUNTY_NAME,FIPS,YEAR,STATE,DRUG,DEATHS,merge_key,YEAR_wapo,MONTH,SHIP_QUANTITY
0,"ALACHUA COUNTY, FL",12001,2003,FL,1.0,11.0,"ALACHUA COUNTY, FL2003",,,
1,"ALACHUA COUNTY, FL",12001,2007,FL,1.0,17.0,"ALACHUA COUNTY, FL2007",2007.0,1.0,876.0
2,"ALACHUA COUNTY, FL",12001,2009,FL,1.0,17.0,"ALACHUA COUNTY, FL2007",2007.0,2.0,4093.0
3,"ALACHUA COUNTY, FL",12001,2010,FL,1.0,17.0,"ALACHUA COUNTY, FL2007",2007.0,3.0,4414.0
4,"ALACHUA COUNTY, FL",12001,2011,FL,1.0,17.0,"ALACHUA COUNTY, FL2007",2007.0,4.0,4159.0


In [77]:
merge_df.sample(5)

Unnamed: 0,COUNTY_NAME,FIPS,YEAR,STATE,DRUG,DEATHS,merge_key,YEAR_wapo,MONTH,SHIP_QUANTITY
41852,,,,,,,"WHITMAN COUNTY, WA2007",2007,3,416.0
27010,,,,,,,"JOSEPHINE COUNTY, OR2006",2006,7,2032.0
34767,,,,,,,"PECOS COUNTY, TX2007",2007,5,177.0
29713,,,,,,,"LIMESTONE COUNTY, TX2011",2011,9,452.0
29956,,,,,,,"LINCOLN COUNTY, WA2010",2010,12,234.0


In [72]:
# Reorder the columns
cols = [
    "COUNTY_NAME",
    "STATE",
    "FIPS",
    "MONTH",
    "YEAR",
    "YEAR_wapo",
    "DEATHS",
    "SHIP_QUANTITY",
]

# make a new df with the better order
new_df = merge_df[cols]


# Houston, we have a problem


In [74]:
# new_df.sample(10)

new_df.YEAR.value_counts(dropna=False, sort=True, ascending=True)

# new_df.YEAR_wapo.value_counts(dropna=False, sort=True, ascending=True)

2003       81
2004       94
2005       96
2006      103
2007      108
2008      117
2009      125
2012      125
2010      126
2013      126
2011      128
2014      131
2015      137
NaN     41672
Name: YEAR, dtype: int64

***Merge the Census Data Next***

In [None]:
# merge census data
new_df = new_df.merge(census_df, on="COUNTY_NAME", how="left")


In [None]:
# drop STATE_y
new_df.drop(columns=["STATE_y"], inplace=True)

# rename STATE_x to STATE
new_df.rename(columns={"STATE_x": "STATE"}, inplace=True)

new_df.sample(10)


#### Collapse to yearly data

In [None]:
# group the data by county and year and sum the DEATHS and SHIP QUANTITY columns
grouped_df = new_df.groupby(
    [
        "COUNTY_NAME",
        "STATE",
        "YEAR",
        "YEAR_wapo",
        "DRUG/ALCOHOL INDUCED CAUSE",
        "DRUG/ALCOHOL INDUCED CAUSE CODE",
    ]
).agg(
    {
        "DEATHS": "sum",
        "SHIP_QUANTITY": "sum",
        "POPULATION": "mean",
    }
)


In [None]:
grouped_df.sample(10)


#### Add columns for per capita rates

In [None]:
# add a column for death rate per 100,000 people
grouped_df["DEATH_RATE"] = grouped_df["DEATHS"] / grouped_df["POPULATION"] * 100000

In [None]:
# add a column for ship rate per 100,000 people
grouped_df["SHIP_RATE"] = grouped_df["SHIP_QUANTITY"] / grouped_df["POPULATION"] * 100000

In [None]:
grouped_df.sample(10)

#### Export the final grouped df

In [None]:
# export to the 20_intermediate_files directory
# grouped_df.to_csv("../20_intermediate_files/analysis_df.csv", index=False)