# Data Merging Notebook

#### This notebook merges data from the 20_intermediate_files directory and saves a merged dataset for final analysis. We use a notebook for this instead of a script for interpretability across teammates and easier error checking.

#### Imports

In [93]:
import pandas as pd
import numpy as np


#### Import datasets

In [94]:
# set directory
working_dir = "../20_intermediate_files/"

# set file paths
census = working_dir + "census_df.csv"
fips = working_dir + "fips_df.csv"
vital = working_dir + "vital_clean.csv"
wapo = working_dir + "wapo_clean.csv"


In [95]:
# read in data
census_df = pd.read_csv(census)
fips_df = pd.read_csv(fips, dtype={"FIPS": str, "STATE_FIPS": str})
vital_df = pd.read_csv(vital, dtype={"FIPS": str})
wapo_df = pd.read_csv(wapo, dtype={"YEAR": str, "MONTH": str})


#### Merge Prep

***Census Data is low drama***

In [96]:
# take a look at the data starting with census

# rename county_name to match other data
census_df = census_df.rename(columns={"COUNTY_NAME": "county_name"})

# Rename Miami-Dade County to Dade County
census_df.loc[
    census_df["county_name"] == "MIAMI-DADE COUNTY, FL", "county_name"
] = "DADE COUNTY, FL"


In [97]:
# show counties that contain the word DADE
# census_df[census_df["county_name"].str.contains("DADE")]


In [98]:
print(f"There are {census_df.shape[0]} counties in the census data.")


There are 540 counties in the census data.


In [99]:
# census_df.sample(5)


***Fix the FIPS data next to remove the state-only lines***

In [100]:
# add a new column of the int of the FIPS code
fips_df["FIPS_INT"] = fips_df["FIPS"].astype(int)

# drop the row if the FIPS code is divisible by 100
fips_df = fips_df[fips_df["FIPS_INT"] % 100 != 0]


In [101]:
# show fips for counties that contain the word DADE
# fips_df[fips_df["COUNTY_NAME"].str.contains("DADE")]


In [102]:
# FIPS codes
# fips_df.dtypes
# fips_df.sample(5)


In [103]:
print(f"There are {fips_df.shape[0]} counties in the FIPS data set.")


There are 540 counties in the FIPS data set.


***Clean the Vital DF to add the new merge key with FIPS code***

In [104]:
# show miami dade county
# vital_df[vital_df["FIPS"] == "12086"]
# vital_df[vital_df["FIPS"] == "12025"]

# because it changed

# change the FIPS code for Miami-Dade County (12086) to 12025 to match Dade County
vital_df.loc[vital_df["FIPS"] == "12086", "FIPS"] = "12025"

vital_df_dade = vital_df.copy()


In [105]:
vital_df_dade[vital_df_dade["FIPS"] == "12086"]


Unnamed: 0,COUNTY_NAME,FIPS,YEAR,STATE,DRUG,DEATHS


In [106]:
# if the FIPS code is six characters, remove the leading zero
vital_df_dade["FIPS"] = vital_df_dade["FIPS"].apply(
    lambda x: x[1:] if len(x) == 6 else x
)

# Add a column for the merge key: County_name+year
vital_df_dade["merge_key"] = vital_df_dade["FIPS"] + vital_df_dade["YEAR"].astype(str)

# Convert merge key to an int
# vital_df_dade["merge_key"] = vital_df_dade["merge_key"].astype(int)

# Vital Stats
# vital_df_dade.sample(5)


In [107]:
# find vital_df_dade merge key is 120862003
vital_df_dade[vital_df_dade["merge_key"] == "120862003"]


Unnamed: 0,COUNTY_NAME,FIPS,YEAR,STATE,DRUG,DEATHS,merge_key
859,"MIAMI-DADE COUNTY, FL",12086,2003,FL,1,111,120862003


In [108]:
# vital_df_dade.sample(10)


In [109]:
# check for duplicate merge keys
vital_df_dade["merge_key"].duplicated().sum()


0

In [110]:
print(
    f"There are {vital_df_dade.shape[0]} rows in the vital_df after dade corrections."
)


There are 1497 rows in the vital_df after dade corrections.


***Prep the WAPO dataset by adding FIPS to county names and generating merge key***

In [111]:
# groupby the COUNTY_NAME, STATE, and YEAR and sum the DRUG_QUANTITY
wapo_df = (
    wapo_df.groupby(["COUNTY_NAME", "STATE", "YEAR"])
    .agg({"QUANTITY": "sum"})
    .reset_index()
)


In [112]:
# wapo_df.sample(10)


In [113]:
# Add FIPS codes to the wapo_df data by merging on county_name
wapo_df_fips = pd.merge(wapo_df, fips_df, on="COUNTY_NAME", how="left").copy()

# Drop STATE_FIPS and STATE_y
wapo_df_fips = wapo_df_fips.drop(["STATE_FIPS", "STATE_y", "FIPS_INT"], axis=1)

# strip the last character from the FIPS column
wapo_df_fips["FIPS"] = wapo_df_fips["FIPS"].str[:-1]

# Add the same merge key
wapo_df_fips["merge_key"] = wapo_df_fips["FIPS"] + wapo_df_fips["YEAR"].astype(str)

# rename the QUANTITY column to DRUG_QUANTITY
wapo_df_fips = wapo_df_fips.rename(
    columns={"QUANTITY": "DRUG_QUANTITY", "STATE_x": "STATE"}
)

# add a flag column to indicate the source of the data
wapo_df_fips["source"] = 1


In [114]:
# WAPO Data
# wapo_df_fips.sample(5)


In [115]:
# make a list of the unique counties in the wapo_df_fips where FIPS is NaN
missing_counties = wapo_df_fips[wapo_df_fips["FIPS"].isna()]["COUNTY_NAME"].unique()


***A Lot of manual digging took place from this moment forward. Its removed. Couple things identified:***

* De Kalb County, AL is DeKalb County, AL
* De Soto County, FL is DeSoto County, FL
* De Witt County, TX is Dewitt County, TX
* Dade County, FL is Miami-Dade County, FL
* Saint Clair County, AL is St. Clair County, AL
* Saint Johns County, FL is St. Johns County, FL
* Saint Lucie County, FL is St. Lucie County, FL

In [116]:
missing_counties


array(['DE KALB COUNTY, AL', 'DE SOTO COUNTY, FL', 'DE WITT COUNTY, TX',
       'MIAMI-DADE COUNTY, FL', 'SAINT CLAIR COUNTY, AL',
       'SAINT JOHNS COUNTY, FL', 'SAINT LUCIE COUNTY, FL'], dtype=object)

In [117]:
# show counties that are in the missing counties list
# wapo_df[wapo_df["COUNTY_NAME"].isin(missing_counties)]


In [118]:
# fix the names
wapo_df["COUNTY_NAME"] = wapo_df["COUNTY_NAME"].replace(
    {
        "DE KALB COUNTY, AL": "DEKALB COUNTY, AL",
        "DE SOTO COUNTY, FL": "DESOTO COUNTY, FL",
        "DE WITT COUNTY, TX": "DEWITT COUNTY, TX",
        "MIAMI-DADE COUNTY, FL": "DADE COUNTY, FL",
        "SAINT CLAIR COUNTY, AL": "ST. CLAIR COUNTY, AL",
        "SAINT JOHNS COUNTY, FL": "ST. JOHNS COUNTY, FL",
        "SAINT LUCIE COUNTY, FL": "ST. LUCIE COUNTY, FL",
    }
)


In [119]:
# show counties that are in the missing counties list
wapo_df[wapo_df["COUNTY_NAME"].isin(missing_counties)]


Unnamed: 0,COUNTY_NAME,STATE,YEAR,QUANTITY


***Success, let's do the merge over***

In [120]:
# Add FIPS codes to the wapo_df data by merging on county_name
wapo_df_fips = pd.merge(wapo_df, fips_df, on="COUNTY_NAME", how="left").copy()


In [121]:
# Drop STATE_FIPS and STATE_y
wapo_df_fips = wapo_df_fips.drop(["STATE_FIPS", "STATE_y"], axis=1)

# strip the last character from the FIPS column
wapo_df_fips["FIPS"] = wapo_df_fips["FIPS"].str[:-1]

# Add the same merge key
wapo_df_fips["merge_key"] = wapo_df_fips["FIPS"] + wapo_df_fips["YEAR"].astype(str)

# rename the QUANTITY column to DRUG_QUANTITY
wapo_df_fips = wapo_df_fips.rename(
    columns={"QUANTITY": "DRUG_QUANTITY", "STATE_x": "STATE"}
)

# add a flag column to indicate the source of the data
wapo_df_fips["source"] = 1


In [122]:
# show rows with NaN
wapo_df_fips[wapo_df_fips["FIPS"].isna()]


Unnamed: 0,COUNTY_NAME,STATE,YEAR,DRUG_QUANTITY,FIPS,FIPS_INT,merge_key,source


In [123]:
# wapo_df_fips.sample(10)


In [124]:
print(f"There are {wapo_df_fips.shape[0]} rows in the wapo_df_fips")


There are 3573 rows in the wapo_df_fips


#### Merging

***Make the base DF with records for all counties for all years***

In [125]:
# Make the year list from 2003 to 2015
year_list = [i for i in range(2003, 2016)]

# make a base empty dataframe
base_df = pd.DataFrame()

# Make a loop that populates the dataframe with a row for each county and year
for i in fips_df["FIPS"]:
    for j in year_list:
        # concat the new record to the base_df
        base_df = pd.concat(
            [
                base_df,
                pd.DataFrame(
                    {
                        "county_name": fips_df[fips_df["FIPS"] == i][
                            "COUNTY_NAME"
                        ].values[0],
                        "state": fips_df[fips_df["FIPS"] == i]["STATE"].values[0],
                        "year": j,
                        "fips": i,
                        "merge_key": str(i[:-1]) + str(j),
                    },
                    index=[0],
                ),
            ]
        )

# convert merge key to int
# base_df["merge_key"] = base_df["merge_key"].astype(int)

# reset the index
base_df = base_df.reset_index(drop=True)


In [126]:
# base_df.sample(10)

# show DADE COUNTY, FL
# base_df[base_df["county_name"] == "DADE COUNTY, FL"]


In [127]:
assert len(year_list) * fips_df.shape[0] == base_df.shape[0]


***Left merge to add vital stats***

In [128]:
# Left merge the base_df with the vital_df
base_df_vital = pd.merge(base_df, vital_df_dade, on="merge_key", how="outer")


In [129]:
# base_df_vital.head(20)


In [130]:
# count rows where drug is not na
base_df_vital["DRUG"].notna().sum()


1497

In [131]:
base_df_vital.tail(15)


Unnamed: 0,county_name,state,year,fips,merge_key,COUNTY_NAME,FIPS,YEAR,STATE,DRUG,DEATHS
7018,"YAKIMA COUNTY, WA",WA,2014.0,53077.0,530772014,"YAKIMA COUNTY, WA",53077,2014.0,WA,1.0,23.0
7019,"YAKIMA COUNTY, WA",WA,2015.0,53077.0,530772015,"YAKIMA COUNTY, WA",53077,2015.0,WA,1.0,26.0
7020,,,,,120862003,"MIAMI-DADE COUNTY, FL",12086,2003.0,FL,1.0,111.0
7021,,,,,120862004,"MIAMI-DADE COUNTY, FL",12086,2004.0,FL,1.0,145.0
7022,,,,,120862005,"MIAMI-DADE COUNTY, FL",12086,2005.0,FL,1.0,154.0
7023,,,,,120862006,"MIAMI-DADE COUNTY, FL",12086,2006.0,FL,1.0,152.0
7024,,,,,120862007,"MIAMI-DADE COUNTY, FL",12086,2007.0,FL,1.0,149.0
7025,,,,,120862008,"MIAMI-DADE COUNTY, FL",12086,2008.0,FL,1.0,149.0
7026,,,,,120862009,"MIAMI-DADE COUNTY, FL",12086,2009.0,FL,1.0,155.0
7027,,,,,120862010,"MIAMI-DADE COUNTY, FL",12086,2010.0,FL,1.0,112.0


***We are losing 13 counties from the vital stats merge. We will need to investigate this.***

In [132]:
# show the na county_name
# base_df_vital.tail(15)

# It was miami dade county, fl


In [133]:
assert base_df_vital["DRUG"].sum() == vital_df["DRUG"].sum()


In [134]:
# base_df_vital.tail(15)

# success, drop the extra rows we acquired


In [135]:
# Drop COUNTY_NAME, STATE, FIPS, YEAR, DRUG
base_df_vital = base_df_vital.drop(
    ["COUNTY_NAME", "STATE", "FIPS", "YEAR", "DRUG"], axis=1
)

# rename DEATHS to drug_deaths
base_df_vital = base_df_vital.rename(columns={"DEATHS": "drug_deaths"})


In [136]:
# base_df_vital.sample(10)


***Now merge the wapo df in***

In [137]:
# Left merge the base_df_vital with the wapo_df_fips
base_df_vital_wapo = pd.merge(base_df_vital, wapo_df_fips, on="merge_key", how="outer")


In [138]:
# Looks like we caught all the issues...

# base_df_vital_wapo.tail(10)


In [139]:
# Drop the COUNTY_NAME, STATE, YEAR, FIPS, and FIPS_INT columns
base_df_vital_wapo = base_df_vital_wapo.drop(
    ["COUNTY_NAME", "STATE", "YEAR", "FIPS", "FIPS_INT"], axis=1
)


In [140]:
# all the rows found a home
assert base_df_vital_wapo["source"].sum() == wapo_df_fips["source"].sum()


In [141]:
# drop the source column
base_df_vital_wapo = base_df_vital_wapo.drop("source", axis=1)

# rename the DRUG_QUANTITY column to drug_quantity
base_df_vital_wapo = base_df_vital_wapo.rename(
    columns={"DRUG_QUANTITY": "drug_quantity"}
)


In [142]:
# base_df_vital_wapo.sample(10)


#### Add population

In [164]:
# merge in the population data
base_df_vital_wapo_pop = pd.merge(
    base_df_vital_wapo, census_df, on="county_name", how="left"
)


In [165]:
# show the DADE COUNTY, FL values to verify pop is correct
base_df_vital_wapo_pop[base_df_vital_wapo_pop["county_name"] == "DADE COUNTY, FL"]


Unnamed: 0,county_name,state,year,fips,merge_key,drug_deaths,drug_quantity,POPULATION,STATE
1027,"DADE COUNTY, FL",FL,2003.0,12025,120252003,,,2496435.0,FL
1028,"DADE COUNTY, FL",FL,2004.0,12025,120252004,,,2496435.0,FL
1029,"DADE COUNTY, FL",FL,2005.0,12025,120252005,,,2496435.0,FL
1030,"DADE COUNTY, FL",FL,2006.0,12025,120252006,,159171.0,2496435.0,FL
1031,"DADE COUNTY, FL",FL,2007.0,12025,120252007,,173867.0,2496435.0,FL
1032,"DADE COUNTY, FL",FL,2008.0,12025,120252008,,205617.0,2496435.0,FL
1033,"DADE COUNTY, FL",FL,2009.0,12025,120252009,,291639.0,2496435.0,FL
1034,"DADE COUNTY, FL",FL,2010.0,12025,120252010,,399644.0,2496435.0,FL
1035,"DADE COUNTY, FL",FL,2011.0,12025,120252011,,397969.0,2496435.0,FL
1036,"DADE COUNTY, FL",FL,2012.0,12025,120252012,,276012.0,2496435.0,FL


In [166]:
base_df_vital_wapo_pop.tail(15)


Unnamed: 0,county_name,state,year,fips,merge_key,drug_deaths,drug_quantity,POPULATION,STATE
7018,"YAKIMA COUNTY, WA",WA,2014.0,53077.0,530772014,23.0,,243231.0,WA
7019,"YAKIMA COUNTY, WA",WA,2015.0,53077.0,530772015,26.0,,243231.0,WA
7020,,,,,120862003,111.0,,,
7021,,,,,120862004,145.0,,,
7022,,,,,120862005,154.0,,,
7023,,,,,120862006,152.0,,,
7024,,,,,120862007,149.0,,,
7025,,,,,120862008,149.0,,,
7026,,,,,120862009,155.0,,,
7027,,,,,120862010,112.0,,,


In [167]:
# count the rows where pop is na
# base_df_vital_wapo_pop["POPULATION"].isna().sum()

# drop em
base_df_vital_wapo_pop = base_df_vital_wapo_pop.dropna(subset=["POPULATION"])


***Something occurs that is puzzling. Even after overriding the value of Miami-Dade county information further up in the script, we still see it matriculate to this level. It's a hold over from SOMETHING, but can't be sure what.***

In [168]:
assert base_df_vital_wapo_pop.POPULATION.isna().sum() == 0


In [169]:
# show na population rows
base_df_vital_wapo_pop[base_df_vital_wapo_pop["POPULATION"].isna()]


Unnamed: 0,county_name,state,year,fips,merge_key,drug_deaths,drug_quantity,POPULATION,STATE


In [170]:
# drop the STATE column
base_df_vital_wapo_pop = base_df_vital_wapo_pop.drop("STATE", axis=1)

# rename the POPULATION column to population
base_df_vital_wapo_pop = base_df_vital_wapo_pop.rename(
    columns={"POPULATION": "population"}
)


In [171]:
# base_df_vital_wapo_pop.sample(10)


#### Add columns for per capita rates

In [172]:
grouped_df = base_df_vital_wapo_pop.copy()


In [173]:
# add a column for death rate per 100,000 people
grouped_df["death_rate"] = grouped_df["drug_deaths"] / grouped_df["population"] * 100000

# add a column for ship rate per 100,000 people
grouped_df["ship_rate"] = (
    grouped_df["drug_quantity"] / grouped_df["population"] * 100000
)


In [174]:
# grouped_df.sample(10)


#### Export the final grouped df

In [175]:
# export to the 20_intermediate_files directory
grouped_df.to_csv("../20_intermediate_files/analysis_df.csv", index=False)
