# Data Preparation Notebook

#### This notebook will be the test bed for data read functions to ingest data from a data folder on the local machine

#### The end outputs of this notebook are that all data structures will have a 'COUNTY NAME, ST' column, and where applicable a FIPS code as well. Merges will be performed elsewhere.

### Import and parameters

In [1]:
# import packages

import pandas as pd
import numpy as np
import os

# set the states concerned for the analysis
states = [
    "FL",
    "AL",
    "GA",
    "MS",
    "SC",
    "TX",
    "OK",
    "AZ",
    "NM",
    "WA",
    "OR",
    "ID",
    "CA",
    "NY",
]

# Drop LA and NV because of issues. Carson city is an independent city in Nevada...??


In [2]:
# ingest the state abbreviations as its own DF
state_df = pd.read_table("../00_source_data/03_state_names.rtf", sep=",")
state_df.columns = ["STATE", "ABBREV"]

# make state upper
state_df["STATE"] = state_df["STATE"].str.upper()

# drop the trailing slash from the abbrev
state_df["ABBREV"] = state_df["ABBREV"].str[0:2]
# state_df.head()


### WAPO Dataset

This section takes an argument for the path to the WAPO dataset and will ultimately return an annualized dataframe of the states with respective values for each year

***Ingest actions***

In [3]:
# set path to WAPO file
wapo = "../00_source_data/prescription_data.zip"


In [4]:
# Ingest the WAPO file as chunks - takes roughly 15 minutes

chunks = 500000  # Leave this, there will be 465 chunks

chunk_counter = 0

wapo_df = pd.DataFrame()

for chunk in pd.read_csv(
    wapo,
    compression="zip",
    chunksize=chunks,
    usecols=[
        "BUYER_COUNTY",
        "BUYER_STATE",
        "DRUG_NAME",
        "TRANSACTION_DATE",
        "QUANTITY",
        "UNIT",
    ],
):
    chunk_counter += 1
    percent_chunk = round(chunk_counter / 465 * 100, 2)
    print("Reading chunk: ", chunk_counter, "of 465 (", percent_chunk, "%)")

    # filter the chunk to only include the stations in the list
    chunk = chunk[chunk["BUYER_STATE"].isin(states)].copy()

    chunk["TRANSACTION_DATE"] = pd.to_datetime(
        chunk["TRANSACTION_DATE"], format="%m%d%Y"
    )

    chunk["YEAR"] = chunk["TRANSACTION_DATE"].dt.year
    chunk["MONTH"] = chunk["TRANSACTION_DATE"].dt.month

    # make the quantity numeric
    chunk["QUANTITY"] = pd.to_numeric(chunk["QUANTITY"], errors="coerce")

    chunk = chunk.groupby(["BUYER_COUNTY", "BUYER_STATE", "YEAR", "MONTH"]).agg(
        {"QUANTITY": "sum"}
    )

    # concat with the base df
    wapo_df = pd.concat([wapo_df, chunk])

print("Ingest complete")


  for chunk in pd.read_csv(


Reading chunk:  1 of 465 ( 0.22 %)
Reading chunk:  2 of 465 ( 0.43 %)
Reading chunk:  3 of 465 ( 0.65 %)
Reading chunk:  4 of 465 ( 0.86 %)


  for chunk in pd.read_csv(


Reading chunk:  5 of 465 ( 1.08 %)
Reading chunk:  6 of 465 ( 1.29 %)
Reading chunk:  7 of 465 ( 1.51 %)


  for chunk in pd.read_csv(


Reading chunk:  8 of 465 ( 1.72 %)


  for chunk in pd.read_csv(


Reading chunk:  9 of 465 ( 1.94 %)
Reading chunk:  10 of 465 ( 2.15 %)


  for chunk in pd.read_csv(


Reading chunk:  11 of 465 ( 2.37 %)


  for chunk in pd.read_csv(


Reading chunk:  12 of 465 ( 2.58 %)
Reading chunk:  13 of 465 ( 2.8 %)
Reading chunk:  14 of 465 ( 3.01 %)
Reading chunk:  15 of 465 ( 3.23 %)
Reading chunk:  16 of 465 ( 3.44 %)
Reading chunk:  17 of 465 ( 3.66 %)
Reading chunk:  18 of 465 ( 3.87 %)


  for chunk in pd.read_csv(


Reading chunk:  19 of 465 ( 4.09 %)
Reading chunk:  20 of 465 ( 4.3 %)
Reading chunk:  21 of 465 ( 4.52 %)
Reading chunk:  22 of 465 ( 4.73 %)
Reading chunk:  23 of 465 ( 4.95 %)
Reading chunk:  24 of 465 ( 5.16 %)
Reading chunk:  25 of 465 ( 5.38 %)
Reading chunk:  26 of 465 ( 5.59 %)
Reading chunk:  27 of 465 ( 5.81 %)
Reading chunk:  28 of 465 ( 6.02 %)


  for chunk in pd.read_csv(


Reading chunk:  29 of 465 ( 6.24 %)
Reading chunk:  30 of 465 ( 6.45 %)
Reading chunk:  31 of 465 ( 6.67 %)
Reading chunk:  32 of 465 ( 6.88 %)
Reading chunk:  33 of 465 ( 7.1 %)
Reading chunk:  34 of 465 ( 7.31 %)
Reading chunk:  35 of 465 ( 7.53 %)
Reading chunk:  36 of 465 ( 7.74 %)
Reading chunk:  37 of 465 ( 7.96 %)
Reading chunk:  38 of 465 ( 8.17 %)
Reading chunk:  39 of 465 ( 8.39 %)
Reading chunk:  40 of 465 ( 8.6 %)
Reading chunk:  41 of 465 ( 8.82 %)
Reading chunk:  42 of 465 ( 9.03 %)
Reading chunk:  43 of 465 ( 9.25 %)
Reading chunk:  44 of 465 ( 9.46 %)
Reading chunk:  45 of 465 ( 9.68 %)
Reading chunk:  46 of 465 ( 9.89 %)
Reading chunk:  47 of 465 ( 10.11 %)
Reading chunk:  48 of 465 ( 10.32 %)
Reading chunk:  49 of 465 ( 10.54 %)
Reading chunk:  50 of 465 ( 10.75 %)
Reading chunk:  51 of 465 ( 10.97 %)
Reading chunk:  52 of 465 ( 11.18 %)
Reading chunk:  53 of 465 ( 11.4 %)
Reading chunk:  54 of 465 ( 11.61 %)
Reading chunk:  55 of 465 ( 11.83 %)
Reading chunk:  56 of 

  for chunk in pd.read_csv(


Reading chunk:  65 of 465 ( 13.98 %)
Reading chunk:  66 of 465 ( 14.19 %)
Reading chunk:  67 of 465 ( 14.41 %)
Reading chunk:  68 of 465 ( 14.62 %)
Reading chunk:  69 of 465 ( 14.84 %)
Reading chunk:  70 of 465 ( 15.05 %)
Reading chunk:  71 of 465 ( 15.27 %)
Reading chunk:  72 of 465 ( 15.48 %)


  for chunk in pd.read_csv(


Reading chunk:  73 of 465 ( 15.7 %)
Reading chunk:  74 of 465 ( 15.91 %)
Reading chunk:  75 of 465 ( 16.13 %)
Reading chunk:  76 of 465 ( 16.34 %)


  for chunk in pd.read_csv(


Reading chunk:  77 of 465 ( 16.56 %)


  for chunk in pd.read_csv(


Reading chunk:  78 of 465 ( 16.77 %)


  for chunk in pd.read_csv(


Reading chunk:  79 of 465 ( 16.99 %)
Reading chunk:  80 of 465 ( 17.2 %)
Reading chunk:  81 of 465 ( 17.42 %)
Reading chunk:  82 of 465 ( 17.63 %)
Reading chunk:  83 of 465 ( 17.85 %)
Reading chunk:  84 of 465 ( 18.06 %)
Reading chunk:  85 of 465 ( 18.28 %)
Reading chunk:  86 of 465 ( 18.49 %)
Reading chunk:  87 of 465 ( 18.71 %)
Reading chunk:  88 of 465 ( 18.92 %)
Reading chunk:  89 of 465 ( 19.14 %)
Reading chunk:  90 of 465 ( 19.35 %)
Reading chunk:  91 of 465 ( 19.57 %)
Reading chunk:  92 of 465 ( 19.78 %)
Reading chunk:  93 of 465 ( 20.0 %)
Reading chunk:  94 of 465 ( 20.22 %)
Reading chunk:  95 of 465 ( 20.43 %)
Reading chunk:  96 of 465 ( 20.65 %)
Reading chunk:  97 of 465 ( 20.86 %)
Reading chunk:  98 of 465 ( 21.08 %)
Reading chunk:  99 of 465 ( 21.29 %)
Reading chunk:  100 of 465 ( 21.51 %)
Reading chunk:  101 of 465 ( 21.72 %)
Reading chunk:  102 of 465 ( 21.94 %)
Reading chunk:  103 of 465 ( 22.15 %)
Reading chunk:  104 of 465 ( 22.37 %)
Reading chunk:  105 of 465 ( 22.58 

  for chunk in pd.read_csv(


Reading chunk:  109 of 465 ( 23.44 %)
Reading chunk:  110 of 465 ( 23.66 %)
Reading chunk:  111 of 465 ( 23.87 %)
Reading chunk:  112 of 465 ( 24.09 %)
Reading chunk:  113 of 465 ( 24.3 %)
Reading chunk:  114 of 465 ( 24.52 %)
Reading chunk:  115 of 465 ( 24.73 %)
Reading chunk:  116 of 465 ( 24.95 %)
Reading chunk:  117 of 465 ( 25.16 %)
Reading chunk:  118 of 465 ( 25.38 %)
Reading chunk:  119 of 465 ( 25.59 %)
Reading chunk:  120 of 465 ( 25.81 %)
Reading chunk:  121 of 465 ( 26.02 %)
Reading chunk:  122 of 465 ( 26.24 %)
Reading chunk:  123 of 465 ( 26.45 %)
Reading chunk:  124 of 465 ( 26.67 %)
Reading chunk:  125 of 465 ( 26.88 %)


  for chunk in pd.read_csv(


Reading chunk:  126 of 465 ( 27.1 %)


  for chunk in pd.read_csv(


Reading chunk:  127 of 465 ( 27.31 %)
Reading chunk:  128 of 465 ( 27.53 %)
Reading chunk:  129 of 465 ( 27.74 %)
Reading chunk:  130 of 465 ( 27.96 %)


  for chunk in pd.read_csv(


Reading chunk:  131 of 465 ( 28.17 %)
Reading chunk:  132 of 465 ( 28.39 %)
Reading chunk:  133 of 465 ( 28.6 %)


  for chunk in pd.read_csv(


Reading chunk:  134 of 465 ( 28.82 %)
Reading chunk:  135 of 465 ( 29.03 %)
Reading chunk:  136 of 465 ( 29.25 %)
Reading chunk:  137 of 465 ( 29.46 %)
Reading chunk:  138 of 465 ( 29.68 %)
Reading chunk:  139 of 465 ( 29.89 %)
Reading chunk:  140 of 465 ( 30.11 %)
Reading chunk:  141 of 465 ( 30.32 %)
Reading chunk:  142 of 465 ( 30.54 %)
Reading chunk:  143 of 465 ( 30.75 %)
Reading chunk:  144 of 465 ( 30.97 %)
Reading chunk:  145 of 465 ( 31.18 %)
Reading chunk:  146 of 465 ( 31.4 %)
Reading chunk:  147 of 465 ( 31.61 %)


  for chunk in pd.read_csv(


Reading chunk:  148 of 465 ( 31.83 %)
Reading chunk:  149 of 465 ( 32.04 %)


  for chunk in pd.read_csv(


Reading chunk:  150 of 465 ( 32.26 %)


  for chunk in pd.read_csv(


Reading chunk:  151 of 465 ( 32.47 %)
Reading chunk:  152 of 465 ( 32.69 %)


  for chunk in pd.read_csv(


Reading chunk:  153 of 465 ( 32.9 %)
Reading chunk:  154 of 465 ( 33.12 %)
Reading chunk:  155 of 465 ( 33.33 %)


  for chunk in pd.read_csv(


Reading chunk:  156 of 465 ( 33.55 %)
Reading chunk:  157 of 465 ( 33.76 %)


  for chunk in pd.read_csv(


Reading chunk:  158 of 465 ( 33.98 %)
Reading chunk:  159 of 465 ( 34.19 %)


  for chunk in pd.read_csv(


Reading chunk:  160 of 465 ( 34.41 %)
Reading chunk:  161 of 465 ( 34.62 %)
Reading chunk:  162 of 465 ( 34.84 %)
Reading chunk:  163 of 465 ( 35.05 %)
Reading chunk:  164 of 465 ( 35.27 %)


  for chunk in pd.read_csv(


Reading chunk:  165 of 465 ( 35.48 %)
Reading chunk:  166 of 465 ( 35.7 %)


  for chunk in pd.read_csv(


Reading chunk:  167 of 465 ( 35.91 %)
Reading chunk:  168 of 465 ( 36.13 %)
Reading chunk:  169 of 465 ( 36.34 %)
Reading chunk:  170 of 465 ( 36.56 %)
Reading chunk:  171 of 465 ( 36.77 %)


  for chunk in pd.read_csv(


Reading chunk:  172 of 465 ( 36.99 %)


  for chunk in pd.read_csv(


Reading chunk:  173 of 465 ( 37.2 %)


  for chunk in pd.read_csv(


Reading chunk:  174 of 465 ( 37.42 %)
Reading chunk:  175 of 465 ( 37.63 %)


  for chunk in pd.read_csv(


Reading chunk:  176 of 465 ( 37.85 %)
Reading chunk:  177 of 465 ( 38.06 %)
Reading chunk:  178 of 465 ( 38.28 %)
Reading chunk:  179 of 465 ( 38.49 %)
Reading chunk:  180 of 465 ( 38.71 %)
Reading chunk:  181 of 465 ( 38.92 %)
Reading chunk:  182 of 465 ( 39.14 %)
Reading chunk:  183 of 465 ( 39.35 %)


  for chunk in pd.read_csv(


Reading chunk:  184 of 465 ( 39.57 %)
Reading chunk:  185 of 465 ( 39.78 %)
Reading chunk:  186 of 465 ( 40.0 %)
Reading chunk:  187 of 465 ( 40.22 %)
Reading chunk:  188 of 465 ( 40.43 %)
Reading chunk:  189 of 465 ( 40.65 %)
Reading chunk:  190 of 465 ( 40.86 %)


  for chunk in pd.read_csv(


Reading chunk:  191 of 465 ( 41.08 %)


  for chunk in pd.read_csv(


Reading chunk:  192 of 465 ( 41.29 %)


  for chunk in pd.read_csv(


Reading chunk:  193 of 465 ( 41.51 %)
Reading chunk:  194 of 465 ( 41.72 %)
Reading chunk:  195 of 465 ( 41.94 %)


  for chunk in pd.read_csv(


Reading chunk:  196 of 465 ( 42.15 %)
Reading chunk:  197 of 465 ( 42.37 %)
Reading chunk:  198 of 465 ( 42.58 %)


  for chunk in pd.read_csv(


Reading chunk:  199 of 465 ( 42.8 %)
Reading chunk:  200 of 465 ( 43.01 %)


  for chunk in pd.read_csv(


Reading chunk:  201 of 465 ( 43.23 %)
Reading chunk:  202 of 465 ( 43.44 %)
Reading chunk:  203 of 465 ( 43.66 %)
Reading chunk:  204 of 465 ( 43.87 %)
Reading chunk:  205 of 465 ( 44.09 %)
Reading chunk:  206 of 465 ( 44.3 %)
Reading chunk:  207 of 465 ( 44.52 %)
Reading chunk:  208 of 465 ( 44.73 %)
Reading chunk:  209 of 465 ( 44.95 %)
Reading chunk:  210 of 465 ( 45.16 %)
Reading chunk:  211 of 465 ( 45.38 %)
Reading chunk:  212 of 465 ( 45.59 %)
Reading chunk:  213 of 465 ( 45.81 %)
Reading chunk:  214 of 465 ( 46.02 %)
Reading chunk:  215 of 465 ( 46.24 %)
Reading chunk:  216 of 465 ( 46.45 %)
Reading chunk:  217 of 465 ( 46.67 %)
Reading chunk:  218 of 465 ( 46.88 %)


  for chunk in pd.read_csv(


Reading chunk:  219 of 465 ( 47.1 %)


  for chunk in pd.read_csv(


Reading chunk:  220 of 465 ( 47.31 %)


  for chunk in pd.read_csv(


Reading chunk:  221 of 465 ( 47.53 %)


  for chunk in pd.read_csv(


Reading chunk:  222 of 465 ( 47.74 %)


  for chunk in pd.read_csv(


Reading chunk:  223 of 465 ( 47.96 %)


  for chunk in pd.read_csv(


Reading chunk:  224 of 465 ( 48.17 %)


  for chunk in pd.read_csv(


Reading chunk:  225 of 465 ( 48.39 %)
Reading chunk:  226 of 465 ( 48.6 %)
Reading chunk:  227 of 465 ( 48.82 %)
Reading chunk:  228 of 465 ( 49.03 %)


  for chunk in pd.read_csv(


Reading chunk:  229 of 465 ( 49.25 %)
Reading chunk:  230 of 465 ( 49.46 %)


  for chunk in pd.read_csv(


Reading chunk:  231 of 465 ( 49.68 %)


  for chunk in pd.read_csv(


Reading chunk:  232 of 465 ( 49.89 %)
Reading chunk:  233 of 465 ( 50.11 %)
Reading chunk:  234 of 465 ( 50.32 %)


  for chunk in pd.read_csv(


Reading chunk:  235 of 465 ( 50.54 %)


  for chunk in pd.read_csv(


Reading chunk:  236 of 465 ( 50.75 %)
Reading chunk:  237 of 465 ( 50.97 %)
Reading chunk:  238 of 465 ( 51.18 %)
Reading chunk:  239 of 465 ( 51.4 %)
Reading chunk:  240 of 465 ( 51.61 %)
Reading chunk:  241 of 465 ( 51.83 %)


  for chunk in pd.read_csv(


Reading chunk:  242 of 465 ( 52.04 %)
Reading chunk:  243 of 465 ( 52.26 %)


  for chunk in pd.read_csv(


Reading chunk:  244 of 465 ( 52.47 %)
Reading chunk:  245 of 465 ( 52.69 %)
Reading chunk:  246 of 465 ( 52.9 %)
Reading chunk:  247 of 465 ( 53.12 %)
Reading chunk:  248 of 465 ( 53.33 %)
Reading chunk:  249 of 465 ( 53.55 %)
Reading chunk:  250 of 465 ( 53.76 %)
Reading chunk:  251 of 465 ( 53.98 %)
Reading chunk:  252 of 465 ( 54.19 %)
Reading chunk:  253 of 465 ( 54.41 %)
Reading chunk:  254 of 465 ( 54.62 %)
Reading chunk:  255 of 465 ( 54.84 %)
Reading chunk:  256 of 465 ( 55.05 %)
Reading chunk:  257 of 465 ( 55.27 %)
Reading chunk:  258 of 465 ( 55.48 %)
Reading chunk:  259 of 465 ( 55.7 %)
Reading chunk:  260 of 465 ( 55.91 %)
Reading chunk:  261 of 465 ( 56.13 %)


  for chunk in pd.read_csv(


Reading chunk:  262 of 465 ( 56.34 %)
Reading chunk:  263 of 465 ( 56.56 %)
Reading chunk:  264 of 465 ( 56.77 %)


  for chunk in pd.read_csv(


Reading chunk:  265 of 465 ( 56.99 %)
Reading chunk:  266 of 465 ( 57.2 %)


  for chunk in pd.read_csv(


Reading chunk:  267 of 465 ( 57.42 %)
Reading chunk:  268 of 465 ( 57.63 %)
Reading chunk:  269 of 465 ( 57.85 %)
Reading chunk:  270 of 465 ( 58.06 %)
Reading chunk:  271 of 465 ( 58.28 %)
Reading chunk:  272 of 465 ( 58.49 %)
Reading chunk:  273 of 465 ( 58.71 %)
Reading chunk:  274 of 465 ( 58.92 %)
Reading chunk:  275 of 465 ( 59.14 %)
Reading chunk:  276 of 465 ( 59.35 %)
Reading chunk:  277 of 465 ( 59.57 %)


  for chunk in pd.read_csv(


Reading chunk:  278 of 465 ( 59.78 %)
Reading chunk:  279 of 465 ( 60.0 %)
Reading chunk:  280 of 465 ( 60.22 %)
Reading chunk:  281 of 465 ( 60.43 %)


  for chunk in pd.read_csv(


Reading chunk:  282 of 465 ( 60.65 %)
Reading chunk:  283 of 465 ( 60.86 %)
Reading chunk:  284 of 465 ( 61.08 %)
Reading chunk:  285 of 465 ( 61.29 %)


  for chunk in pd.read_csv(


Reading chunk:  286 of 465 ( 61.51 %)
Reading chunk:  287 of 465 ( 61.72 %)
Reading chunk:  288 of 465 ( 61.94 %)
Reading chunk:  289 of 465 ( 62.15 %)
Reading chunk:  290 of 465 ( 62.37 %)
Reading chunk:  291 of 465 ( 62.58 %)
Reading chunk:  292 of 465 ( 62.8 %)
Reading chunk:  293 of 465 ( 63.01 %)
Reading chunk:  294 of 465 ( 63.23 %)
Reading chunk:  295 of 465 ( 63.44 %)
Reading chunk:  296 of 465 ( 63.66 %)
Reading chunk:  297 of 465 ( 63.87 %)
Reading chunk:  298 of 465 ( 64.09 %)
Reading chunk:  299 of 465 ( 64.3 %)
Reading chunk:  300 of 465 ( 64.52 %)
Reading chunk:  301 of 465 ( 64.73 %)
Reading chunk:  302 of 465 ( 64.95 %)
Reading chunk:  303 of 465 ( 65.16 %)
Reading chunk:  304 of 465 ( 65.38 %)
Reading chunk:  305 of 465 ( 65.59 %)
Reading chunk:  306 of 465 ( 65.81 %)
Reading chunk:  307 of 465 ( 66.02 %)
Reading chunk:  308 of 465 ( 66.24 %)


  for chunk in pd.read_csv(


Reading chunk:  309 of 465 ( 66.45 %)
Reading chunk:  310 of 465 ( 66.67 %)


  for chunk in pd.read_csv(


Reading chunk:  311 of 465 ( 66.88 %)


  for chunk in pd.read_csv(


Reading chunk:  312 of 465 ( 67.1 %)


  for chunk in pd.read_csv(


Reading chunk:  313 of 465 ( 67.31 %)


  for chunk in pd.read_csv(


Reading chunk:  314 of 465 ( 67.53 %)


  for chunk in pd.read_csv(


Reading chunk:  315 of 465 ( 67.74 %)
Reading chunk:  316 of 465 ( 67.96 %)


  for chunk in pd.read_csv(


Reading chunk:  317 of 465 ( 68.17 %)
Reading chunk:  318 of 465 ( 68.39 %)


  for chunk in pd.read_csv(


Reading chunk:  319 of 465 ( 68.6 %)
Reading chunk:  320 of 465 ( 68.82 %)


  for chunk in pd.read_csv(


Reading chunk:  321 of 465 ( 69.03 %)
Reading chunk:  322 of 465 ( 69.25 %)


  for chunk in pd.read_csv(


Reading chunk:  323 of 465 ( 69.46 %)


  for chunk in pd.read_csv(


Reading chunk:  324 of 465 ( 69.68 %)
Reading chunk:  325 of 465 ( 69.89 %)
Reading chunk:  326 of 465 ( 70.11 %)


  for chunk in pd.read_csv(


Reading chunk:  327 of 465 ( 70.32 %)


  for chunk in pd.read_csv(


Reading chunk:  328 of 465 ( 70.54 %)
Reading chunk:  329 of 465 ( 70.75 %)
Reading chunk:  330 of 465 ( 70.97 %)
Reading chunk:  331 of 465 ( 71.18 %)
Reading chunk:  332 of 465 ( 71.4 %)
Reading chunk:  333 of 465 ( 71.61 %)
Reading chunk:  334 of 465 ( 71.83 %)
Reading chunk:  335 of 465 ( 72.04 %)
Reading chunk:  336 of 465 ( 72.26 %)


  for chunk in pd.read_csv(


Reading chunk:  337 of 465 ( 72.47 %)
Reading chunk:  338 of 465 ( 72.69 %)
Reading chunk:  339 of 465 ( 72.9 %)
Reading chunk:  340 of 465 ( 73.12 %)
Reading chunk:  341 of 465 ( 73.33 %)
Reading chunk:  342 of 465 ( 73.55 %)
Reading chunk:  343 of 465 ( 73.76 %)


  for chunk in pd.read_csv(


Reading chunk:  344 of 465 ( 73.98 %)
Reading chunk:  345 of 465 ( 74.19 %)
Reading chunk:  346 of 465 ( 74.41 %)
Reading chunk:  347 of 465 ( 74.62 %)
Reading chunk:  348 of 465 ( 74.84 %)
Reading chunk:  349 of 465 ( 75.05 %)
Reading chunk:  350 of 465 ( 75.27 %)
Reading chunk:  351 of 465 ( 75.48 %)
Reading chunk:  352 of 465 ( 75.7 %)
Reading chunk:  353 of 465 ( 75.91 %)
Reading chunk:  354 of 465 ( 76.13 %)
Reading chunk:  355 of 465 ( 76.34 %)
Reading chunk:  356 of 465 ( 76.56 %)
Reading chunk:  357 of 465 ( 76.77 %)
Reading chunk:  358 of 465 ( 76.99 %)
Reading chunk:  359 of 465 ( 77.2 %)
Reading chunk:  360 of 465 ( 77.42 %)
Reading chunk:  361 of 465 ( 77.63 %)
Reading chunk:  362 of 465 ( 77.85 %)
Reading chunk:  363 of 465 ( 78.06 %)
Reading chunk:  364 of 465 ( 78.28 %)


  for chunk in pd.read_csv(


Reading chunk:  365 of 465 ( 78.49 %)


  for chunk in pd.read_csv(


Reading chunk:  366 of 465 ( 78.71 %)


  for chunk in pd.read_csv(


Reading chunk:  367 of 465 ( 78.92 %)
Reading chunk:  368 of 465 ( 79.14 %)
Reading chunk:  369 of 465 ( 79.35 %)
Reading chunk:  370 of 465 ( 79.57 %)
Reading chunk:  371 of 465 ( 79.78 %)
Reading chunk:  372 of 465 ( 80.0 %)
Reading chunk:  373 of 465 ( 80.22 %)
Reading chunk:  374 of 465 ( 80.43 %)


  for chunk in pd.read_csv(


Reading chunk:  375 of 465 ( 80.65 %)


  for chunk in pd.read_csv(


Reading chunk:  376 of 465 ( 80.86 %)
Reading chunk:  377 of 465 ( 81.08 %)
Reading chunk:  378 of 465 ( 81.29 %)
Reading chunk:  379 of 465 ( 81.51 %)
Reading chunk:  380 of 465 ( 81.72 %)
Reading chunk:  381 of 465 ( 81.94 %)
Reading chunk:  382 of 465 ( 82.15 %)
Reading chunk:  383 of 465 ( 82.37 %)
Reading chunk:  384 of 465 ( 82.58 %)
Reading chunk:  385 of 465 ( 82.8 %)
Reading chunk:  386 of 465 ( 83.01 %)
Reading chunk:  387 of 465 ( 83.23 %)
Reading chunk:  388 of 465 ( 83.44 %)
Reading chunk:  389 of 465 ( 83.66 %)
Reading chunk:  390 of 465 ( 83.87 %)


  for chunk in pd.read_csv(


Reading chunk:  391 of 465 ( 84.09 %)
Reading chunk:  392 of 465 ( 84.3 %)
Reading chunk:  393 of 465 ( 84.52 %)
Reading chunk:  394 of 465 ( 84.73 %)
Reading chunk:  395 of 465 ( 84.95 %)
Reading chunk:  396 of 465 ( 85.16 %)
Reading chunk:  397 of 465 ( 85.38 %)
Reading chunk:  398 of 465 ( 85.59 %)
Reading chunk:  399 of 465 ( 85.81 %)
Reading chunk:  400 of 465 ( 86.02 %)
Reading chunk:  401 of 465 ( 86.24 %)
Reading chunk:  402 of 465 ( 86.45 %)
Reading chunk:  403 of 465 ( 86.67 %)
Reading chunk:  404 of 465 ( 86.88 %)
Reading chunk:  405 of 465 ( 87.1 %)
Reading chunk:  406 of 465 ( 87.31 %)
Reading chunk:  407 of 465 ( 87.53 %)
Reading chunk:  408 of 465 ( 87.74 %)
Reading chunk:  409 of 465 ( 87.96 %)
Reading chunk:  410 of 465 ( 88.17 %)
Reading chunk:  411 of 465 ( 88.39 %)
Reading chunk:  412 of 465 ( 88.6 %)
Reading chunk:  413 of 465 ( 88.82 %)
Reading chunk:  414 of 465 ( 89.03 %)
Reading chunk:  415 of 465 ( 89.25 %)
Reading chunk:  416 of 465 ( 89.46 %)
Reading chunk: 

  for chunk in pd.read_csv(


Reading chunk:  422 of 465 ( 90.75 %)
Reading chunk:  423 of 465 ( 90.97 %)
Reading chunk:  424 of 465 ( 91.18 %)
Reading chunk:  425 of 465 ( 91.4 %)
Reading chunk:  426 of 465 ( 91.61 %)


  for chunk in pd.read_csv(


Reading chunk:  427 of 465 ( 91.83 %)
Reading chunk:  428 of 465 ( 92.04 %)
Reading chunk:  429 of 465 ( 92.26 %)
Reading chunk:  430 of 465 ( 92.47 %)


  for chunk in pd.read_csv(


Reading chunk:  431 of 465 ( 92.69 %)
Reading chunk:  432 of 465 ( 92.9 %)
Reading chunk:  433 of 465 ( 93.12 %)
Reading chunk:  434 of 465 ( 93.33 %)
Reading chunk:  435 of 465 ( 93.55 %)
Reading chunk:  436 of 465 ( 93.76 %)
Reading chunk:  437 of 465 ( 93.98 %)


  for chunk in pd.read_csv(


Reading chunk:  438 of 465 ( 94.19 %)


  for chunk in pd.read_csv(


Reading chunk:  439 of 465 ( 94.41 %)
Reading chunk:  440 of 465 ( 94.62 %)
Reading chunk:  441 of 465 ( 94.84 %)
Reading chunk:  442 of 465 ( 95.05 %)
Reading chunk:  443 of 465 ( 95.27 %)
Reading chunk:  444 of 465 ( 95.48 %)
Reading chunk:  445 of 465 ( 95.7 %)
Reading chunk:  446 of 465 ( 95.91 %)
Reading chunk:  447 of 465 ( 96.13 %)
Reading chunk:  448 of 465 ( 96.34 %)
Reading chunk:  449 of 465 ( 96.56 %)


  for chunk in pd.read_csv(


Reading chunk:  450 of 465 ( 96.77 %)
Reading chunk:  451 of 465 ( 96.99 %)
Reading chunk:  452 of 465 ( 97.2 %)
Reading chunk:  453 of 465 ( 97.42 %)
Reading chunk:  454 of 465 ( 97.63 %)


  for chunk in pd.read_csv(


Reading chunk:  455 of 465 ( 97.85 %)
Reading chunk:  456 of 465 ( 98.06 %)
Reading chunk:  457 of 465 ( 98.28 %)
Reading chunk:  458 of 465 ( 98.49 %)


  for chunk in pd.read_csv(


Reading chunk:  459 of 465 ( 98.71 %)
Reading chunk:  460 of 465 ( 98.92 %)


  for chunk in pd.read_csv(


Reading chunk:  461 of 465 ( 99.14 %)


  for chunk in pd.read_csv(


Reading chunk:  462 of 465 ( 99.35 %)


  for chunk in pd.read_csv(


Reading chunk:  463 of 465 ( 99.57 %)


  for chunk in pd.read_csv(


Reading chunk:  464 of 465 ( 99.78 %)
Reading chunk:  465 of 465 ( 100.0 %)
Ingest complete


In [5]:
# transform the wapo df
print("Adding county name...")

# Add an index to the wapo_df
wapo_df = wapo_df.reset_index()

# Do some transformations on the WAPO dataset
wapo_df["COUNTY_NAME"] = wapo_df["BUYER_COUNTY"] + " COUNTY, " + wapo_df["BUYER_STATE"]

print("Grouping WAPO data...")

wapo_df = (
    wapo_df.groupby(["COUNTY_NAME", "BUYER_STATE", "YEAR", "MONTH"])
    .agg({"QUANTITY": "sum"})
    .reset_index()
)

# # rename buyer state to state
wapo_df = wapo_df.rename(columns={"BUYER_STATE": "STATE"})

# # Change the year column to a string
wapo_df["YEAR"] = wapo_df["YEAR"].astype(str)

# # Change the month column to a string
wapo_df["MONTH"] = wapo_df["MONTH"].astype(str)

print("Operation complete")


Adding county name...
Grouping WAPO data...
Operation complete


***Print a sample to make sure you did it right***

In [6]:
# wapo_df.sample(10)


In [7]:
# wapo_df["YEAR"].unique()
# wapo_df["STATE"].unique()

assert len(wapo_df["STATE"].unique()) == len(states)


***Assert tests to verify that we have the right states, and some checks on the data***

In [8]:
# Assert that no counties are missing
assert wapo_df["COUNTY_NAME"].isnull().sum() == 0
# Assert that states are in the list
assert set(wapo_df["STATE"].unique()) == set(states)


***Save the output file to a csv in the intermediate files directory***

In [9]:
# save this file as a csv called wapo_clean.csv in the current directory
wapo_df.to_csv("../20_intermediate_files/wapo_clean.csv", index=False)


### Vitality Data

This section takes the txt files passed and will return a dataframe with the respective values for each year

***Ingest actions***

In [10]:
# Set a directory path to find the txt files
nick_path = "../00_source_data/US_VitalStatistics/"


In [11]:
# generate a df from the txt files in a folder path

# initialize the empty df
nick_df = pd.DataFrame()

# set a loop to iterate through the files in the folder
print("Ingesting text files...")

# set a loop to iterate through the files in the folder
for file in os.listdir(nick_path):
    if file.endswith(".txt"):
        txt_table = pd.read_table(
            os.path.join(nick_path, file), sep="\t", skipfooter=15, engine="python"
        )
        # remove the bottom rows that are not needed
        # txt_table = txt_table.iloc[0:-16, :]
        nick_df = pd.concat([nick_df, txt_table], axis=0)

# subset to the columns we want
vital_df = nick_df[
    [
        "County",
        "County Code",
        "Year",
        "Drug/Alcohol Induced Cause",
        "Drug/Alcohol Induced Cause Code",
        "Deaths",
    ]
].copy()

print("Transforming vitality data...")

# change the year column to a string with a year only
vital_df["Year"] = vital_df["Year"].astype(str).str[0:4]

# change the County Code column to 6-digit string FIPS code
vital_df["County Code"] = vital_df["County Code"].astype(str).str.zfill(6)

# add a state column with the last two characters from county
vital_df["State"] = vital_df["County"].str[-2:]

# convert NaN deaths to 0
vital_df["Deaths"] = vital_df["Deaths"].fillna(0)

# change the county name to all caps
vital_df["County"] = vital_df["County"].str.upper()

# change all the column names to all caps
vital_df.columns = vital_df.columns.str.upper()

# rename county code to FIPS code
vital_df = vital_df.rename(columns={"COUNTY CODE": "FIPS"})
vital_df = vital_df.rename(columns={"COUNTY": "COUNTY_NAME"})

# filter the vital df to the states of interest
vital_df = vital_df[vital_df["STATE"].isin(states)].copy()

print("Operation complete")


Ingesting text files...
Transforming vitality data...
Operation complete


***Subset to rows with drug deaths, group and add***

In [12]:
vital_df["DRUG/ALCOHOL INDUCED CAUSE"].value_counts()

# convert the DRUG Cause column to all lower case
vital_df["DRUG/ALCOHOL INDUCED CAUSE"] = vital_df[
    "DRUG/ALCOHOL INDUCED CAUSE"
].str.lower()

# Make a new column called drug with indicators for cause
vital_df["DRUG"] = np.where(
    vital_df["DRUG/ALCOHOL INDUCED CAUSE"].str.contains("drug"), 1, 0
)

# subset to remove all other non-drug deaths
vital_df = vital_df[
    vital_df["DRUG/ALCOHOL INDUCED CAUSE"]
    != "all other non-drug and non-alcohol causes"
].copy()

# subset to remove all other non-drug deaths
vital_df = vital_df[vital_df["DRUG"] == 1].copy()


In [13]:
# vital_df["DRUG/ALCOHOL INDUCED CAUSE"].unique()
vital_df["DRUG/ALCOHOL INDUCED CAUSE"].value_counts()


drug poisonings (overdose) unintentional (x40-x44)    3017
drug poisonings (overdose) suicide (x60-x64)           783
all other drug-induced causes                          326
drug poisonings (overdose) undetermined (y10-y14)      195
Name: DRUG/ALCOHOL INDUCED CAUSE, dtype: int64

In [14]:
# convert the death column to an integer
vital_df["DEATHS"] = vital_df["DEATHS"].astype(int)


In [15]:
# group the vital df and sum the deaths
new_vital_df = (
    vital_df.groupby(["COUNTY_NAME", "FIPS", "YEAR", "STATE", "DRUG"])
    .sum()
    .reset_index()
)


  .sum()


***Check a sample to inspect data***

In [16]:
# new_vital_df.sample(10)


***Assert tests to verify that we have the right states, and some checks on the data***

In [17]:
assert vital_df.all().isnull().sum() == 0
# Assert that states are in the list
assert set(vital_df["STATE"].unique()) == set(states)
# Still need to find a way to check if Vital is correctly chunked

assert vital_df["DEATHS"].sum() == new_vital_df["DEATHS"].sum()


***Save the output dataframe to the intermediate files directory***

In [18]:
new_vital_df.to_csv("../20_intermediate_files/vital_clean.csv", index=False)


### Add FIPS Codes to the available data

This chunk takes the path to FIPS codes and will ingest them into a dataframe

***Ingest data***

In [19]:
fips_path = "../00_source_data/02_fcc_fips_codes.txt"


***start with counties***

In [20]:
# Ingest the fips codes
fips_county_df = pd.read_table(fips_path, sep="\t", skiprows=71, header="infer")

# name the column header
fips_county_df.columns = ["FIPS"]

# split the FIPS column into two columns after the first 5 characters
fips_county_df["COUNTY_NAME"] = fips_county_df["FIPS"].str[10:]

# Turn the FIPS column into just the numbers
fips_county_df["FIPS"] = fips_county_df["FIPS"].str[4:10]

# remove the spaces from the county name
fips_county_df["COUNTY_NAME"] = fips_county_df["COUNTY_NAME"].str.strip()

# make county name upper
fips_county_df["COUNTY_NAME"] = fips_county_df["COUNTY_NAME"].str.upper()

print("Operation complete")


Operation complete


In [21]:
# Ingest the fips codes
fips_state_df = pd.read_table(fips_path, sep="\t", skiprows=15, header="infer")

# keep the first 50 rows
fips_state_df = fips_state_df.iloc[0:50, :]

# name the column header
fips_state_df.columns = ["FIPS"]

# # split the FIPS column into two columns after the first 5 characters
fips_state_df["STATE"] = fips_state_df["FIPS"].str[10:]
fips_state_df["STATE"] = fips_state_df["STATE"].str.strip()

# #Turn the FIPS column into just the numbers
fips_state_df["FIPS"] = fips_state_df["FIPS"].str[4:10]

# merge to add the state abbreviations from the state_df
fips_state_df = fips_state_df.merge(state_df, on="STATE", how="left")

print("Operation complete")


Operation complete


***merge the two dataframes***

In [22]:
# this is going to be a little different
fips_df = fips_county_df.copy()

# add a state column
fips_df["STATE"] = fips_df["COUNTY_NAME"]

# merge to get the state abbreviations
fips_df = fips_df.merge(fips_state_df, on="STATE", how="left")

# drop the state column
fips_df = fips_df.drop(columns=["STATE"])

# rename the columns
fips_df = fips_df.rename(
    columns={"FIPS_x": "FIPS", "ABBREV": "STATE", "FIPS_y": "STATE_FIPS"}
)

# forward fill the state fips and state columns
fips_df["STATE_FIPS"] = fips_df["STATE_FIPS"].fillna(method="ffill")
fips_df["STATE"] = fips_df["STATE"].fillna(method="ffill")

# add state to the county name
fips_df["COUNTY_NAME"] = fips_df["COUNTY_NAME"] + ", " + fips_df["STATE"]

# filter the df to the states of interest
fips_df = fips_df[fips_df["STATE"].isin(states)]


***Sample the FIPS codes to see if we did it right***

In [23]:
# fips_df.sample(10)


***Add some assert tests***

In [24]:
assert vital_df.all().isnull().sum() == 0
# check the first two digits of the FIPS code to make sure they match with STATE_FIPS
assert list(fips_df["FIPS"].str[:2]) == list(
    fips_df["STATE_FIPS"].astype(str).str.strip()
)
# fips_df["FIPS"].dtype


***Save the FIPS file cleaned to the 20_intermediate_files directory***

In [25]:
# save the fips_df to a csv
fips_df.to_csv("../20_intermediate_files/fips_df.csv", index=False)


### Census Data

This chunk will ingest filtered county population data and return a dataframe

***Ingest data***

In [26]:
# set the directory path to the raw data
census_path = "../00_source_data/01_census_data.xlsx"


In [27]:
# ingest the raw data and filter for states of interest
census_df = pd.read_excel(census_path, header=0, skiprows=4, usecols="A:B")

# Change the column names to county name and population
census_df.columns = ["COUNTY_NAME", "POPULATION"]

# remove the leading period from the county name
census_df["COUNTY_NAME"] = census_df["COUNTY_NAME"].str[1:]

# move the state name to its own column
census_df["STATE"] = census_df["COUNTY_NAME"].str.split(",").str[1]
census_df["STATE"] = census_df["STATE"].str.strip()
census_df["STATE"] = census_df["STATE"].str.upper()

# remove the state name from the county name
census_df["COUNTY_NAME"] = census_df["COUNTY_NAME"].str.split(",").str[0]

# change population to an integer
census_df["POPULATION"] = census_df["POPULATION"].astype("Int64")

# add an abbreviation column for the state from the state_df
census_df_merge = census_df.merge(state_df, on="STATE", how="outer")

# reformat county name to include abbrev
census_df_merge["COUNTY_NAME"] = (
    census_df_merge["COUNTY_NAME"] + ", " + census_df_merge["ABBREV"]
)
census_df_merge["COUNTY_NAME"] = census_df_merge["COUNTY_NAME"].str.upper()

# remove the state column
census_df_merge = census_df_merge.drop(columns="STATE")

# rename the abbrev column to state
census_df_merge = census_df_merge.rename(columns={"ABBREV": "STATE"})

# filter for the states of interest
census_df_merge = census_df_merge[census_df_merge["STATE"].isin(states)].copy()


***Sample to see if we did it right***

In [28]:
# census_df_merge["STATE"].unique()

In [29]:
# census_df_merge.sample(10)


***Add assert tests***

In [30]:
# number of counties in each state we have picked, according to google and wikipedia
counties = {"FL":67,"AL":67, "GA":159, "MS":82, "SC":46, "TX":254 , "OK":77, "AZ":15, "NM":33, "WA":39, "OR":36, "ID":44, "CA":58, "NY":62}
# check we have all the counties in our dictionary
assert set(census_df_merge["STATE"].unique()) == set(counties.keys())

In [31]:
assert census_df_merge.all().isnull().sum() == 0
# check last two characters of county name to make sure they match with state for consistency
assert list(census_df_merge["COUNTY_NAME"].str[-2:]) == list(census_df_merge["STATE"])
# we need to check county numbers to make sure they match (when we know which control states we want)
assert census_df_merge["COUNTY_NAME"].nunique() == sum(counties.values())

In [32]:
# change county name of "DOÑA ANA COUNTY, NM" to "DONA ANA COUNTY, NM" to match other datasets for consistency
census_df_merge["COUNTY_NAME"] = census_df_merge["COUNTY_NAME"].str.replace("DOÑA", "DONA")
# check county name contains "DONA"
census_df_merge[census_df_merge["COUNTY_NAME"].str.contains("DONA")]


Unnamed: 0,COUNTY_NAME,POPULATION,STATE
1802,"DONA ANA COUNTY, NM",209233,NM


In [33]:
# replace country name of "DE BACA COUNTY, NM" to "DEBACA COUNTY, NM" to match other datasets
census_df_merge["COUNTY_NAME"] = census_df_merge["COUNTY_NAME"].str.replace("DE BACA", "DEBACA")
census_df_merge[census_df_merge["COUNTY_NAME"].str.contains("BACA")]


Unnamed: 0,COUNTY_NAME,POPULATION,STATE
1801,"DEBACA COUNTY, NM",2022,NM


***Save the output file to the intermediate files directory***

In [34]:
# save the file to the intermediate folder
census_df_merge.to_csv("../20_intermediate_files/census_df.csv", index=False)
