# Data Preparation Notebook

#### This notebook will be the test bed for data read functions to ingest data from a data folder on the local machine

#### The end outputs of this notebook are that all data structures will have a 'COUNTY NAME, ST' column, and where applicable a FIPS code as well. Merges will be performed elsewhere.

### Import and parameters

In [5]:
# import packages

import pandas as pd
import numpy as np
import os

# set the states concerned for the analysis
# (two-letter abbreviations; each dataset below is filtered to this list with .isin(states))
states = ["FL", "TX", "WA", "OR", "AL", "OK"]


In [6]:
# load the state-name / abbreviation lookup into its own DF
state_df = pd.read_table("../00_source_data/03_state_names.rtf", sep=",")
state_df.columns = ["STATE", "ABBREV"]

# normalise both columns in one step:
#   STATE  -> upper case
#   ABBREV -> first two characters only (drops the trailing slash)
state_df = state_df.assign(
    STATE=lambda df: df["STATE"].str.upper(),
    ABBREV=lambda df: df["ABBREV"].str[0:2],
)


Unnamed: 0,STATE,ABBREV
0,ALABAMA,AL
1,ALASKA,AK
2,ARIZONA,AZ
3,ARKANSAS,AR
4,CALIFORNIA,CA


### WAPO Dataset

This section reads the WAPO dataset from the path set below and produces a dataframe of total QUANTITY per county, year, and month for the states of interest.

***Ingest actions***

In [10]:
# set path to WAPO file
# The file lives outside the repository, so the location can be overridden with
# the WAPO_PATH environment variable; the original local path stays the default.
wapo = os.environ.get(
    "WAPO_PATH", "/Users/andrewkroening/Desktop/720_Data/arcos_all_washpost.tsv.gz"
)


In [11]:
# Ingest the WAPO file as chunks, reducing each chunk to QUANTITY totals per
# county / state / year / month so the full file never has to sit in memory

chunks = 500000  # Leave this, there will be 358 chunks
expected_chunks = 358  # only used for the progress message below

chunk_counter = 0

# per-chunk aggregates are collected here and concatenated once after the loop;
# calling pd.concat inside the loop would re-copy the growing frame every pass
chunk_totals = []

for chunk in pd.read_csv(
    wapo,
    sep="\t",
    compression="gzip",
    chunksize=chunks,
    # only the columns the aggregation needs (DRUG_NAME and UNIT were parsed
    # before but never used by the groupby below)
    usecols=[
        "BUYER_COUNTY",
        "BUYER_STATE",
        "TRANSACTION_DATE",
        "QUANTITY",
    ],
    # explicit text dtypes avoid per-chunk type inference (mixed-dtype warnings)
    # and keep TRANSACTION_DATE exactly as written in the file
    dtype={"BUYER_COUNTY": str, "BUYER_STATE": str, "TRANSACTION_DATE": str},
):
    chunk_counter += 1
    percent_chunk = round(chunk_counter / expected_chunks * 100, 2)
    print(
        "Reading chunk: ", chunk_counter, "of", expected_chunks, "(", percent_chunk, "%)"
    )

    # filter the chunk to only include the states in the list
    chunk = chunk[chunk["BUYER_STATE"].isin(states)].copy()

    # The date is parsed as MMDDYYYY. A value that lost its leading zero
    # (e.g. 1152006) is ambiguous for "%m%d%Y" and can be read as month 11,
    # so pad to 8 characters before parsing.
    chunk["TRANSACTION_DATE"] = pd.to_datetime(
        chunk["TRANSACTION_DATE"].str.zfill(8), format="%m%d%Y"
    )

    chunk["YEAR"] = chunk["TRANSACTION_DATE"].dt.year
    chunk["MONTH"] = chunk["TRANSACTION_DATE"].dt.month

    chunk = chunk.groupby(["BUYER_COUNTY", "BUYER_STATE", "YEAR", "MONTH"]).agg(
        {"QUANTITY": "sum"}
    )

    chunk_totals.append(chunk)

# one concat for all chunks; fall back to an empty frame if nothing was read
wapo_df = pd.concat(chunk_totals) if chunk_totals else pd.DataFrame()

print("Ingest complete")


Reading chunk:  1 of 358 ( 0.28 %)
Reading chunk:  2 of 358 ( 0.56 %)
Reading chunk:  3 of 358 ( 0.84 %)
Reading chunk:  4 of 358 ( 1.12 %)
Reading chunk:  5 of 358 ( 1.4 %)
Reading chunk:  6 of 358 ( 1.68 %)
Reading chunk:  7 of 358 ( 1.96 %)
Reading chunk:  8 of 358 ( 2.23 %)


  for chunk in pd.read_csv(


Reading chunk:  9 of 358 ( 2.51 %)
Reading chunk:  10 of 358 ( 2.79 %)
Reading chunk:  11 of 358 ( 3.07 %)
Reading chunk:  12 of 358 ( 3.35 %)
Reading chunk:  13 of 358 ( 3.63 %)
Reading chunk:  14 of 358 ( 3.91 %)
Reading chunk:  15 of 358 ( 4.19 %)
Reading chunk:  16 of 358 ( 4.47 %)
Reading chunk:  17 of 358 ( 4.75 %)
Reading chunk:  18 of 358 ( 5.03 %)
Reading chunk:  19 of 358 ( 5.31 %)
Reading chunk:  20 of 358 ( 5.59 %)
Reading chunk:  21 of 358 ( 5.87 %)
Reading chunk:  22 of 358 ( 6.15 %)
Reading chunk:  23 of 358 ( 6.42 %)
Reading chunk:  24 of 358 ( 6.7 %)
Reading chunk:  25 of 358 ( 6.98 %)


  for chunk in pd.read_csv(


Reading chunk:  26 of 358 ( 7.26 %)
Reading chunk:  27 of 358 ( 7.54 %)
Reading chunk:  28 of 358 ( 7.82 %)
Reading chunk:  29 of 358 ( 8.1 %)
Reading chunk:  30 of 358 ( 8.38 %)


  for chunk in pd.read_csv(


Reading chunk:  31 of 358 ( 8.66 %)
Reading chunk:  32 of 358 ( 8.94 %)
Reading chunk:  33 of 358 ( 9.22 %)


  for chunk in pd.read_csv(


Reading chunk:  34 of 358 ( 9.5 %)
Reading chunk:  35 of 358 ( 9.78 %)
Reading chunk:  36 of 358 ( 10.06 %)
Reading chunk:  37 of 358 ( 10.34 %)
Reading chunk:  38 of 358 ( 10.61 %)
Reading chunk:  39 of 358 ( 10.89 %)
Reading chunk:  40 of 358 ( 11.17 %)
Reading chunk:  41 of 358 ( 11.45 %)
Reading chunk:  42 of 358 ( 11.73 %)
Reading chunk:  43 of 358 ( 12.01 %)
Reading chunk:  44 of 358 ( 12.29 %)
Reading chunk:  45 of 358 ( 12.57 %)
Reading chunk:  46 of 358 ( 12.85 %)
Reading chunk:  47 of 358 ( 13.13 %)


  for chunk in pd.read_csv(


Reading chunk:  48 of 358 ( 13.41 %)
Reading chunk:  49 of 358 ( 13.69 %)
Reading chunk:  50 of 358 ( 13.97 %)
Reading chunk:  51 of 358 ( 14.25 %)
Reading chunk:  52 of 358 ( 14.53 %)
Reading chunk:  53 of 358 ( 14.8 %)
Reading chunk:  54 of 358 ( 15.08 %)
Reading chunk:  55 of 358 ( 15.36 %)
Reading chunk:  56 of 358 ( 15.64 %)
Reading chunk:  57 of 358 ( 15.92 %)
Reading chunk:  58 of 358 ( 16.2 %)
Reading chunk:  59 of 358 ( 16.48 %)
Reading chunk:  60 of 358 ( 16.76 %)
Reading chunk:  61 of 358 ( 17.04 %)
Reading chunk:  62 of 358 ( 17.32 %)
Reading chunk:  63 of 358 ( 17.6 %)
Reading chunk:  64 of 358 ( 17.88 %)
Reading chunk:  65 of 358 ( 18.16 %)
Reading chunk:  66 of 358 ( 18.44 %)
Reading chunk:  67 of 358 ( 18.72 %)
Reading chunk:  68 of 358 ( 18.99 %)
Reading chunk:  69 of 358 ( 19.27 %)
Reading chunk:  70 of 358 ( 19.55 %)
Reading chunk:  71 of 358 ( 19.83 %)
Reading chunk:  72 of 358 ( 20.11 %)
Reading chunk:  73 of 358 ( 20.39 %)
Reading chunk:  74 of 358 ( 20.67 %)
Read

  for chunk in pd.read_csv(


Reading chunk:  79 of 358 ( 22.07 %)
Reading chunk:  80 of 358 ( 22.35 %)
Reading chunk:  81 of 358 ( 22.63 %)
Reading chunk:  82 of 358 ( 22.91 %)


  for chunk in pd.read_csv(


Reading chunk:  83 of 358 ( 23.18 %)
Reading chunk:  84 of 358 ( 23.46 %)
Reading chunk:  85 of 358 ( 23.74 %)
Reading chunk:  86 of 358 ( 24.02 %)
Reading chunk:  87 of 358 ( 24.3 %)


  for chunk in pd.read_csv(


Reading chunk:  88 of 358 ( 24.58 %)
Reading chunk:  89 of 358 ( 24.86 %)
Reading chunk:  90 of 358 ( 25.14 %)
Reading chunk:  91 of 358 ( 25.42 %)
Reading chunk:  92 of 358 ( 25.7 %)
Reading chunk:  93 of 358 ( 25.98 %)
Reading chunk:  94 of 358 ( 26.26 %)
Reading chunk:  95 of 358 ( 26.54 %)
Reading chunk:  96 of 358 ( 26.82 %)
Reading chunk:  97 of 358 ( 27.09 %)
Reading chunk:  98 of 358 ( 27.37 %)


  for chunk in pd.read_csv(


Reading chunk:  99 of 358 ( 27.65 %)
Reading chunk:  100 of 358 ( 27.93 %)
Reading chunk:  101 of 358 ( 28.21 %)
Reading chunk:  102 of 358 ( 28.49 %)
Reading chunk:  103 of 358 ( 28.77 %)
Reading chunk:  104 of 358 ( 29.05 %)
Reading chunk:  105 of 358 ( 29.33 %)
Reading chunk:  106 of 358 ( 29.61 %)
Reading chunk:  107 of 358 ( 29.89 %)
Reading chunk:  108 of 358 ( 30.17 %)
Reading chunk:  109 of 358 ( 30.45 %)
Reading chunk:  110 of 358 ( 30.73 %)
Reading chunk:  111 of 358 ( 31.01 %)
Reading chunk:  112 of 358 ( 31.28 %)
Reading chunk:  113 of 358 ( 31.56 %)
Reading chunk:  114 of 358 ( 31.84 %)


  for chunk in pd.read_csv(


Reading chunk:  115 of 358 ( 32.12 %)
Reading chunk:  116 of 358 ( 32.4 %)
Reading chunk:  117 of 358 ( 32.68 %)
Reading chunk:  118 of 358 ( 32.96 %)
Reading chunk:  119 of 358 ( 33.24 %)


  for chunk in pd.read_csv(


Reading chunk:  120 of 358 ( 33.52 %)
Reading chunk:  121 of 358 ( 33.8 %)
Reading chunk:  122 of 358 ( 34.08 %)
Reading chunk:  123 of 358 ( 34.36 %)
Reading chunk:  124 of 358 ( 34.64 %)
Reading chunk:  125 of 358 ( 34.92 %)
Reading chunk:  126 of 358 ( 35.2 %)
Reading chunk:  127 of 358 ( 35.47 %)
Reading chunk:  128 of 358 ( 35.75 %)
Reading chunk:  129 of 358 ( 36.03 %)
Reading chunk:  130 of 358 ( 36.31 %)
Reading chunk:  131 of 358 ( 36.59 %)
Reading chunk:  132 of 358 ( 36.87 %)
Reading chunk:  133 of 358 ( 37.15 %)


  for chunk in pd.read_csv(


Reading chunk:  134 of 358 ( 37.43 %)
Reading chunk:  135 of 358 ( 37.71 %)
Reading chunk:  136 of 358 ( 37.99 %)
Reading chunk:  137 of 358 ( 38.27 %)
Reading chunk:  138 of 358 ( 38.55 %)
Reading chunk:  139 of 358 ( 38.83 %)
Reading chunk:  140 of 358 ( 39.11 %)
Reading chunk:  141 of 358 ( 39.39 %)
Reading chunk:  142 of 358 ( 39.66 %)
Reading chunk:  143 of 358 ( 39.94 %)
Reading chunk:  144 of 358 ( 40.22 %)
Reading chunk:  145 of 358 ( 40.5 %)


  for chunk in pd.read_csv(


Reading chunk:  146 of 358 ( 40.78 %)
Reading chunk:  147 of 358 ( 41.06 %)
Reading chunk:  148 of 358 ( 41.34 %)
Reading chunk:  149 of 358 ( 41.62 %)
Reading chunk:  150 of 358 ( 41.9 %)
Reading chunk:  151 of 358 ( 42.18 %)
Reading chunk:  152 of 358 ( 42.46 %)
Reading chunk:  153 of 358 ( 42.74 %)
Reading chunk:  154 of 358 ( 43.02 %)
Reading chunk:  155 of 358 ( 43.3 %)


  for chunk in pd.read_csv(


Reading chunk:  156 of 358 ( 43.58 %)
Reading chunk:  157 of 358 ( 43.85 %)
Reading chunk:  158 of 358 ( 44.13 %)
Reading chunk:  159 of 358 ( 44.41 %)
Reading chunk:  160 of 358 ( 44.69 %)


  for chunk in pd.read_csv(


Reading chunk:  161 of 358 ( 44.97 %)
Reading chunk:  162 of 358 ( 45.25 %)
Reading chunk:  163 of 358 ( 45.53 %)
Reading chunk:  164 of 358 ( 45.81 %)
Reading chunk:  165 of 358 ( 46.09 %)
Reading chunk:  166 of 358 ( 46.37 %)
Reading chunk:  167 of 358 ( 46.65 %)
Reading chunk:  168 of 358 ( 46.93 %)
Reading chunk:  169 of 358 ( 47.21 %)
Reading chunk:  170 of 358 ( 47.49 %)
Reading chunk:  171 of 358 ( 47.77 %)


  for chunk in pd.read_csv(


Reading chunk:  172 of 358 ( 48.04 %)
Reading chunk:  173 of 358 ( 48.32 %)
Reading chunk:  174 of 358 ( 48.6 %)
Reading chunk:  175 of 358 ( 48.88 %)
Reading chunk:  176 of 358 ( 49.16 %)
Reading chunk:  177 of 358 ( 49.44 %)
Reading chunk:  178 of 358 ( 49.72 %)
Reading chunk:  179 of 358 ( 50.0 %)
Reading chunk:  180 of 358 ( 50.28 %)


  for chunk in pd.read_csv(


Reading chunk:  181 of 358 ( 50.56 %)
Reading chunk:  182 of 358 ( 50.84 %)
Reading chunk:  183 of 358 ( 51.12 %)
Reading chunk:  184 of 358 ( 51.4 %)
Reading chunk:  185 of 358 ( 51.68 %)


  for chunk in pd.read_csv(


Reading chunk:  186 of 358 ( 51.96 %)
Reading chunk:  187 of 358 ( 52.23 %)
Reading chunk:  188 of 358 ( 52.51 %)
Reading chunk:  189 of 358 ( 52.79 %)
Reading chunk:  190 of 358 ( 53.07 %)
Reading chunk:  191 of 358 ( 53.35 %)
Reading chunk:  192 of 358 ( 53.63 %)
Reading chunk:  193 of 358 ( 53.91 %)
Reading chunk:  194 of 358 ( 54.19 %)
Reading chunk:  195 of 358 ( 54.47 %)
Reading chunk:  196 of 358 ( 54.75 %)
Reading chunk:  197 of 358 ( 55.03 %)
Reading chunk:  198 of 358 ( 55.31 %)
Reading chunk:  199 of 358 ( 55.59 %)
Reading chunk:  200 of 358 ( 55.87 %)


  for chunk in pd.read_csv(


Reading chunk:  201 of 358 ( 56.15 %)
Reading chunk:  202 of 358 ( 56.42 %)
Reading chunk:  203 of 358 ( 56.7 %)
Reading chunk:  204 of 358 ( 56.98 %)
Reading chunk:  205 of 358 ( 57.26 %)


  for chunk in pd.read_csv(


Reading chunk:  206 of 358 ( 57.54 %)
Reading chunk:  207 of 358 ( 57.82 %)
Reading chunk:  208 of 358 ( 58.1 %)
Reading chunk:  209 of 358 ( 58.38 %)


  for chunk in pd.read_csv(


Reading chunk:  210 of 358 ( 58.66 %)
Reading chunk:  211 of 358 ( 58.94 %)
Reading chunk:  212 of 358 ( 59.22 %)
Reading chunk:  213 of 358 ( 59.5 %)
Reading chunk:  214 of 358 ( 59.78 %)
Reading chunk:  215 of 358 ( 60.06 %)
Reading chunk:  216 of 358 ( 60.34 %)
Reading chunk:  217 of 358 ( 60.61 %)
Reading chunk:  218 of 358 ( 60.89 %)
Reading chunk:  219 of 358 ( 61.17 %)
Reading chunk:  220 of 358 ( 61.45 %)
Reading chunk:  221 of 358 ( 61.73 %)
Reading chunk:  222 of 358 ( 62.01 %)


  for chunk in pd.read_csv(


Reading chunk:  223 of 358 ( 62.29 %)
Reading chunk:  224 of 358 ( 62.57 %)
Reading chunk:  225 of 358 ( 62.85 %)
Reading chunk:  226 of 358 ( 63.13 %)


  for chunk in pd.read_csv(


Reading chunk:  227 of 358 ( 63.41 %)
Reading chunk:  228 of 358 ( 63.69 %)
Reading chunk:  229 of 358 ( 63.97 %)
Reading chunk:  230 of 358 ( 64.25 %)


  for chunk in pd.read_csv(


Reading chunk:  231 of 358 ( 64.53 %)
Reading chunk:  232 of 358 ( 64.8 %)
Reading chunk:  233 of 358 ( 65.08 %)
Reading chunk:  234 of 358 ( 65.36 %)
Reading chunk:  235 of 358 ( 65.64 %)
Reading chunk:  236 of 358 ( 65.92 %)
Reading chunk:  237 of 358 ( 66.2 %)


  for chunk in pd.read_csv(


Reading chunk:  238 of 358 ( 66.48 %)
Reading chunk:  239 of 358 ( 66.76 %)
Reading chunk:  240 of 358 ( 67.04 %)
Reading chunk:  241 of 358 ( 67.32 %)
Reading chunk:  242 of 358 ( 67.6 %)
Reading chunk:  243 of 358 ( 67.88 %)
Reading chunk:  244 of 358 ( 68.16 %)
Reading chunk:  245 of 358 ( 68.44 %)
Reading chunk:  246 of 358 ( 68.72 %)
Reading chunk:  247 of 358 ( 68.99 %)


  for chunk in pd.read_csv(


Reading chunk:  248 of 358 ( 69.27 %)
Reading chunk:  249 of 358 ( 69.55 %)
Reading chunk:  250 of 358 ( 69.83 %)
Reading chunk:  251 of 358 ( 70.11 %)
Reading chunk:  252 of 358 ( 70.39 %)


  for chunk in pd.read_csv(


Reading chunk:  253 of 358 ( 70.67 %)
Reading chunk:  254 of 358 ( 70.95 %)
Reading chunk:  255 of 358 ( 71.23 %)
Reading chunk:  256 of 358 ( 71.51 %)
Reading chunk:  257 of 358 ( 71.79 %)
Reading chunk:  258 of 358 ( 72.07 %)
Reading chunk:  259 of 358 ( 72.35 %)
Reading chunk:  260 of 358 ( 72.63 %)
Reading chunk:  261 of 358 ( 72.91 %)
Reading chunk:  262 of 358 ( 73.18 %)
Reading chunk:  263 of 358 ( 73.46 %)
Reading chunk:  264 of 358 ( 73.74 %)
Reading chunk:  265 of 358 ( 74.02 %)
Reading chunk:  266 of 358 ( 74.3 %)


  for chunk in pd.read_csv(


Reading chunk:  267 of 358 ( 74.58 %)


  for chunk in pd.read_csv(


Reading chunk:  268 of 358 ( 74.86 %)
Reading chunk:  269 of 358 ( 75.14 %)
Reading chunk:  270 of 358 ( 75.42 %)
Reading chunk:  271 of 358 ( 75.7 %)


  for chunk in pd.read_csv(


Reading chunk:  272 of 358 ( 75.98 %)
Reading chunk:  273 of 358 ( 76.26 %)
Reading chunk:  274 of 358 ( 76.54 %)
Reading chunk:  275 of 358 ( 76.82 %)
Reading chunk:  276 of 358 ( 77.09 %)
Reading chunk:  277 of 358 ( 77.37 %)
Reading chunk:  278 of 358 ( 77.65 %)
Reading chunk:  279 of 358 ( 77.93 %)
Reading chunk:  280 of 358 ( 78.21 %)
Reading chunk:  281 of 358 ( 78.49 %)
Reading chunk:  282 of 358 ( 78.77 %)
Reading chunk:  283 of 358 ( 79.05 %)
Reading chunk:  284 of 358 ( 79.33 %)
Reading chunk:  285 of 358 ( 79.61 %)


  for chunk in pd.read_csv(


Reading chunk:  286 of 358 ( 79.89 %)
Reading chunk:  287 of 358 ( 80.17 %)
Reading chunk:  288 of 358 ( 80.45 %)
Reading chunk:  289 of 358 ( 80.73 %)
Reading chunk:  290 of 358 ( 81.01 %)
Reading chunk:  291 of 358 ( 81.28 %)
Reading chunk:  292 of 358 ( 81.56 %)
Reading chunk:  293 of 358 ( 81.84 %)
Reading chunk:  294 of 358 ( 82.12 %)
Reading chunk:  295 of 358 ( 82.4 %)
Reading chunk:  296 of 358 ( 82.68 %)
Reading chunk:  297 of 358 ( 82.96 %)
Reading chunk:  298 of 358 ( 83.24 %)
Reading chunk:  299 of 358 ( 83.52 %)
Reading chunk:  300 of 358 ( 83.8 %)


  for chunk in pd.read_csv(


Reading chunk:  301 of 358 ( 84.08 %)
Reading chunk:  302 of 358 ( 84.36 %)
Reading chunk:  303 of 358 ( 84.64 %)
Reading chunk:  304 of 358 ( 84.92 %)
Reading chunk:  305 of 358 ( 85.2 %)


  for chunk in pd.read_csv(


Reading chunk:  306 of 358 ( 85.47 %)
Reading chunk:  307 of 358 ( 85.75 %)
Reading chunk:  308 of 358 ( 86.03 %)
Reading chunk:  309 of 358 ( 86.31 %)
Reading chunk:  310 of 358 ( 86.59 %)
Reading chunk:  311 of 358 ( 86.87 %)
Reading chunk:  312 of 358 ( 87.15 %)
Reading chunk:  313 of 358 ( 87.43 %)
Reading chunk:  314 of 358 ( 87.71 %)
Reading chunk:  315 of 358 ( 87.99 %)
Reading chunk:  316 of 358 ( 88.27 %)
Reading chunk:  317 of 358 ( 88.55 %)
Reading chunk:  318 of 358 ( 88.83 %)
Reading chunk:  319 of 358 ( 89.11 %)
Reading chunk:  320 of 358 ( 89.39 %)
Reading chunk:  321 of 358 ( 89.66 %)
Reading chunk:  322 of 358 ( 89.94 %)


  for chunk in pd.read_csv(


Reading chunk:  323 of 358 ( 90.22 %)
Reading chunk:  324 of 358 ( 90.5 %)
Reading chunk:  325 of 358 ( 90.78 %)
Reading chunk:  326 of 358 ( 91.06 %)
Reading chunk:  327 of 358 ( 91.34 %)
Reading chunk:  328 of 358 ( 91.62 %)
Reading chunk:  329 of 358 ( 91.9 %)
Reading chunk:  330 of 358 ( 92.18 %)
Reading chunk:  331 of 358 ( 92.46 %)
Reading chunk:  332 of 358 ( 92.74 %)
Reading chunk:  333 of 358 ( 93.02 %)
Reading chunk:  334 of 358 ( 93.3 %)
Reading chunk:  335 of 358 ( 93.58 %)
Reading chunk:  336 of 358 ( 93.85 %)
Reading chunk:  337 of 358 ( 94.13 %)
Reading chunk:  338 of 358 ( 94.41 %)
Reading chunk:  339 of 358 ( 94.69 %)
Reading chunk:  340 of 358 ( 94.97 %)
Reading chunk:  341 of 358 ( 95.25 %)
Reading chunk:  342 of 358 ( 95.53 %)


  for chunk in pd.read_csv(


Reading chunk:  343 of 358 ( 95.81 %)
Reading chunk:  344 of 358 ( 96.09 %)
Reading chunk:  345 of 358 ( 96.37 %)
Reading chunk:  346 of 358 ( 96.65 %)
Reading chunk:  347 of 358 ( 96.93 %)


  for chunk in pd.read_csv(


Reading chunk:  348 of 358 ( 97.21 %)
Reading chunk:  349 of 358 ( 97.49 %)
Reading chunk:  350 of 358 ( 97.77 %)
Reading chunk:  351 of 358 ( 98.04 %)
Reading chunk:  352 of 358 ( 98.32 %)
Reading chunk:  353 of 358 ( 98.6 %)
Reading chunk:  354 of 358 ( 98.88 %)
Reading chunk:  355 of 358 ( 99.16 %)
Reading chunk:  356 of 358 ( 99.44 %)


  for chunk in pd.read_csv(


Reading chunk:  357 of 358 ( 99.72 %)
Reading chunk:  358 of 358 ( 100.0 %)
Ingest complete


In [12]:
# collapse the per-chunk totals into one row per county / state / year / month
print("Adding county name...")

# move the group keys back into columns and build the "<COUNTY> COUNTY, <ST>" label
wapo_df = wapo_df.reset_index().assign(
    COUNTY_NAME=lambda df: df["BUYER_COUNTY"] + " COUNTY, " + df["BUYER_STATE"]
)

print("Grouping WAPO data...")

# the same key can appear in several chunks, so sum once more across all of
# them, then expose the state column under the name STATE
wapo_df = (
    wapo_df.groupby(["COUNTY_NAME", "BUYER_STATE", "YEAR", "MONTH"])["QUANTITY"]
    .sum()
    .reset_index()
    .rename(columns={"BUYER_STATE": "STATE"})
)

print("Operation complete")


Adding county name...
Grouping WAPO data...
Operation complete


***Print a sample to make sure you did it right***

In [13]:
# print(wapo_df.sample(10))


***Assert tests to verify that we have the right states, and some checks on the data***

In [14]:
# every row must carry a county label
assert not wapo_df["COUNTY_NAME"].isna().any()
# the states present must be exactly the states requested
assert set(states) == set(wapo_df["STATE"].unique())


***Save the output file to a csv in the intermediate files directory***

In [15]:
# save this file as a csv called wapo_clean.csv in the intermediate files directory
# (index=False: the row index is not written as a column)
wapo_df.to_csv("../20_intermediate_files/wapo_clean.csv", index=False)


### Vitality Data

This section reads the vital statistics .txt files from the folder set below and produces a dataframe of deaths by county, year, and drug/alcohol induced cause for the states of interest.

***Ingest actions***

In [16]:
# Set a directory path to find the txt files
# (every file in this folder whose name ends in .txt is read by the ingest loop)
nick_path = "../00_source_data/US_VitalStatistics/"


In [17]:
# generate a df from the txt files in a folder path

print("Ingesting text files...")

# read every .txt file into a list and concatenate once after the loop;
# sorted() makes the row order reproducible, since os.listdir returns the
# directory entries in arbitrary order
txt_tables = []
for file in sorted(os.listdir(nick_path)):
    if file.endswith(".txt"):
        txt_table = pd.read_table(
            os.path.join(nick_path, file), sep="\t", skipfooter=15, engine="python"
        )
        # remove the bottom rows that are not needed
        # NOTE(review): skipfooter=15 already drops the last 15 lines of the file
        # and this slice drops a further 16 rows - confirm both trims are intended
        txt_tables.append(txt_table.iloc[0:-16, :])

# fall back to an empty frame when the folder holds no .txt files
nick_df = pd.concat(txt_tables, axis=0) if txt_tables else pd.DataFrame()

# subset to the columns we want
vital_df = nick_df[
    [
        "County",
        "County Code",
        "Year",
        "Drug/Alcohol Induced Cause",
        "Drug/Alcohol Induced Cause Code",
        "Deaths",
    ]
].copy()

print("Transforming vitality data...")

# change the year column to a date time object with a year only
vital_df["Year"] = pd.to_datetime(vital_df["Year"], format="%Y")

# change the County Code column to a string left-padded with zeros to 6 characters
# NOTE(review): if County Code is read as a float, astype(str) keeps the ".0"
# suffix (e.g. "1001.0") - confirm this matches the FIPS format used elsewhere
vital_df["County Code"] = vital_df["County Code"].astype(str).str.zfill(6)

# add a state column with the last two characters from county
vital_df["State"] = vital_df["County"].str[-2:]

# convert NaN deaths to 0
vital_df["Deaths"] = vital_df["Deaths"].fillna(0)

# change the county name to all caps
vital_df["County"] = vital_df["County"].str.upper()

# change all the column names to all caps
vital_df.columns = vital_df.columns.str.upper()

# rename county code to FIPS code
vital_df = vital_df.rename(columns={"COUNTY CODE": "FIPS"})

# filter the vital df to the states of interest
vital_df = vital_df[vital_df["STATE"].isin(states)].copy()

print("Operation complete")


Ingesting text files...
Transforming vitality data...
Operation complete


***Check a sample to inspect data***

In [18]:
# vital_df.sample(10)


***Assert tests to verify that we have the right states, and some checks on the data***

In [19]:
# Assert that no cell in the frame is missing. The check is element-wise:
# calling DataFrame.all() first would collapse each column to a single boolean,
# which is never null, so the assertion could not fail.
assert not vital_df.isnull().any().any(), "vital_df contains missing values"
# Assert that states are in the list
assert set(vital_df["STATE"].unique()) == set(
    states
), "vital_df states do not match the states of interest"
# Still need to find a way to check if Vital is correctly chunked


***Save the output dataframe to the intermediate files directory***

In [20]:
# save the cleaned vital statistics frame as vital_clean.csv in the
# intermediate files directory (index=False: the row index is not written)
vital_df.to_csv("../20_intermediate_files/vital_clean.csv", index=False)


### Add FIPS Codes to the available data

This chunk takes the path to FIPS codes and will ingest them into a dataframe

***Ingest data***

In [21]:
# path to the FIPS codes text file; it is read twice below with different
# skiprows values, once for the county rows and once for the state rows
fips_path = "../00_source_data/02_fcc_fips_codes.txt"


***start with counties***

In [22]:
# read the county part of the FIPS file; each line arrives as one text column
county_lines = pd.read_table(fips_path, sep="\t", skiprows=71, header="infer")
county_lines.columns = ["FIPS"]
raw_line = county_lines["FIPS"]

# slice each line by position:
#   characters 4-9      -> the code
#   characters 10 onward -> the name, trimmed and upper-cased
fips_county_df = pd.DataFrame(
    {
        "FIPS": raw_line.str[4:10],
        "COUNTY_NAME": raw_line.str[10:].str.strip().str.upper(),
    }
)

print("Operation complete")


Operation complete


In [23]:
# read the state part of the FIPS file and keep its first 50 rows
state_lines = pd.read_table(fips_path, sep="\t", skiprows=15, header="infer").iloc[
    0:50, :
]
state_lines.columns = ["FIPS"]
raw_state_line = state_lines["FIPS"]

# slice each line by position (code = characters 4-9, name = characters 10
# onward, trimmed), then attach the abbreviations from state_df
fips_state_df = pd.DataFrame(
    {
        "FIPS": raw_state_line.str[4:10],
        "STATE": raw_state_line.str[10:].str.strip(),
    }
).merge(state_df, on="STATE", how="left")

print("Operation complete")


Operation complete


***merge the two dataframes***

In [24]:
# Attach a state abbreviation and state FIPS code to every row of the county
# table. Rows whose name equals a state name match fips_state_df directly; the
# rows after them are filled forward from that match.
fips_df = fips_county_df.copy()

# add a state column (a copy of the name, used only as the merge key)
fips_df["STATE"] = fips_df["COUNTY_NAME"]

# merge to get the state abbreviations; only rows named like a state match
fips_df = fips_df.merge(fips_state_df, on="STATE", how="left")

# drop the state column (the merge key is no longer needed)
fips_df = fips_df.drop(columns=["STATE"])

# rename the columns: both frames had a FIPS column, so the merge suffixed them
fips_df = fips_df.rename(
    columns={"FIPS_x": "FIPS", "ABBREV": "STATE", "FIPS_y": "STATE_FIPS"}
)

# forward fill the state fips and state columns
# (.ffill() replaces the deprecated fillna(method="ffill") form)
fips_df["STATE_FIPS"] = fips_df["STATE_FIPS"].ffill()
fips_df["STATE"] = fips_df["STATE"].ffill()

# add state to the county name
fips_df["COUNTY_NAME"] = fips_df["COUNTY_NAME"] + ", " + fips_df["STATE"]

# filter the df to the states of interest; .copy() so the result is an
# independent frame, as with the other filtered frames in this notebook
fips_df = fips_df[fips_df["STATE"].isin(states)].copy()


***Sample the FIPS codes to see if we did it right***

In [25]:
# fips_df.sample(10)


***Add some assert tests***

In [26]:
# Assert that no cell in the FIPS frame is missing. This checks fips_df (the
# frame built in this section) element-wise; DataFrame.all() would collapse each
# column to a single boolean, which is never null.
assert not fips_df.isnull().any().any(), "fips_df contains missing values"
# check the first two digits of the FIPS code to make sure they match with STATE_FIPS
assert list(fips_df["FIPS"].str[:2]) == list(
    fips_df["STATE_FIPS"].astype(str).str.strip()
), "FIPS prefix does not match STATE_FIPS"


***Save the FIPS file cleaned to the 20_intermediate_files directory***

In [27]:
# save the fips_df to a csv in the intermediate files directory
# (index=False: the row index is not written as a column)
fips_df.to_csv("../20_intermediate_files/fips_df.csv", index=False)


### Census Data

This chunk will ingest filtered county population data and return a dataframe

***Ingest data***

In [28]:
# set the path to the raw census data (an .xlsx workbook read with pd.read_excel)
census_path = "../00_source_data/01_census_data.xlsx"


In [29]:
# read columns A:B of the census workbook, skipping the first four rows
census_df = pd.read_excel(census_path, header=0, skiprows=4, usecols="A:B")
census_df.columns = ["COUNTY_NAME", "POPULATION"]

# drop the leading period, then split "<county>, <state>" once and reuse the parts
name_parts = census_df["COUNTY_NAME"].str[1:].str.split(",")
census_df["COUNTY_NAME"] = name_parts.str[0]
census_df["STATE"] = name_parts.str[1].str.strip().str.upper()

# population as a nullable integer
census_df["POPULATION"] = census_df["POPULATION"].astype("Int64")

# bring in the state abbreviation from state_df
census_df_merge = census_df.merge(state_df, on="STATE", how="outer")

# county label becomes "<COUNTY>, <ST>" in upper case
census_df_merge["COUNTY_NAME"] = (
    census_df_merge["COUNTY_NAME"] + ", " + census_df_merge["ABBREV"]
).str.upper()

# replace the full state name with the abbreviation, under the name STATE
census_df_merge = census_df_merge.drop(columns="STATE").rename(
    columns={"ABBREV": "STATE"}
)

# keep only the states of interest
census_df_merge = census_df_merge.loc[census_df_merge["STATE"].isin(states)].copy()


***Sample to see if we did it right***

In [30]:
# census_df_merge.sample(10)


***Add assert tests***

In [31]:
# Assert that no cell in the census frame is missing. The check is element-wise:
# DataFrame.all() would collapse each column to a single boolean, which is never
# null, so that form could not fail.
assert not census_df_merge.isnull().any().any(), "census_df_merge contains missing values"
# check last two characters of county name to make sure they match with state
assert list(census_df_merge["COUNTY_NAME"].str[-2:]) == list(
    census_df_merge["STATE"]
), "COUNTY_NAME suffix does not match STATE"
# we need to check county numbers to make sure they match (when we know which control states we want)


***Save the output file to the intermediate files directory***

In [32]:
# save the file to the intermediate folder as census_df.csv
# (index=False: the row index is not written as a column)
census_df_merge.to_csv("../20_intermediate_files/census_df.csv", index=False)
