In [7]:
import pandas as pd
import re
from datetime import datetime

In [8]:
# Load the raw prescription export (first worksheet) into a DataFrame.
dat = pd.read_excel(
    "../data/external/0053 Rxs no PHI March - May 2021.xls",
    sheet_name='Sheet1',
)

In [9]:
def extract_amount_from_cdl_string(cdl_string):
    """Return the quantity that follows the ``#`` marker in a CDL string.

    The quantity is the first number found directly after a ``#`` (optional
    whitespace in between), e.g. ``"# 90 EA"`` -> ``90.0``.

    Parameters
    ----------
    cdl_string : str
        Raw CDL text.

    Returns
    -------
    float
        The parsed quantity.

    Raises
    ------
    ValueError
        If ``cdl_string`` is not a string or contains no ``#<number>`` token.
    """
    if not isinstance(cdl_string, str):
        raise ValueError(f"CDL value is not a string: {cdl_string!r}")

    # Raw string: "\s" and "\d" are invalid escapes in a plain literal.
    # At least one digit is required so float() never receives "" or ".".
    match = re.search(r"#\s*(\d+\.?\d*|\.\d+)", cdl_string)
    if match is None:
        raise ValueError(f"No '#<amount>' found in CDL string: {cdl_string!r}")
    return float(match.group(1))

def extract_unit_from_cdl_string(cdl_string):
    """Return the unit word that follows the ``#<amount>`` token in a CDL string.

    Expects ``#``, optional whitespace, a number, whitespace, then an
    alphabetic unit, e.g. ``"# 90 EA"`` -> ``"EA"``.

    Parameters
    ----------
    cdl_string : str
        Raw CDL text.

    Returns
    -------
    str
        The alphabetic unit token.

    Raises
    ------
    ValueError
        If ``cdl_string`` is not a string or contains no
        ``#<number> <unit>`` token.
    """
    if not isinstance(cdl_string, str):
        raise ValueError(f"CDL value is not a string: {cdl_string!r}")

    # Raw string avoids invalid "\s" / "\d" escapes. Whitespace after "#" is
    # optional so this accepts the same "#<amount>" prefix as the amount parser.
    match = re.search(r"#\s*(?:\d+\.?\d*|\.\d+)\s+([a-zA-Z]+)", cdl_string)
    if match is None:
        raise ValueError(f"No '#<amount> <unit>' found in CDL string: {cdl_string!r}")
    return match.group(1)

def get_date_from_timestamp_string(dt_string):
    """Parse a ``Mon/DD/YY HH:MM:SS`` timestamp string and return its date part.

    Parameters
    ----------
    dt_string : str
        Timestamp text such as ``"Mar/01/21 10:30:00"``.

    Returns
    -------
    datetime.date
        The calendar date of the timestamp; the time of day is discarded.
    """
    timestamp_format = '%b/%d/%y %H:%M:%S'
    parsed = datetime.strptime(dt_string, timestamp_format)
    return parsed.date()

def enforce_types_on_keys(dat):
    """Return a copy of ``dat`` with the join-key columns cast to fixed dtypes.

    ``FILL_DATE`` becomes ``datetime64[ns]``; ``NDC_DESC`` and ``UNIT`` become
    ``object``. Frames that are later merged on these keys therefore agree on
    key dtypes.

    Parameters
    ----------
    dat : pandas.DataFrame
        Must contain the columns ``FILL_DATE``, ``NDC_DESC`` and ``UNIT``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    # astype with a mapping returns a new frame, so the caller's object is
    # never mutated as a side effect.
    return dat.astype(
        {
            "FILL_DATE": "datetime64[ns]",
            "NDC_DESC": "object",
            "UNIT": "object",
        }
    )

def extract_amount_unit_and_fill_date(dat):
    """Derive ``AMOUNT``, ``UNIT`` and ``FILL_DATE`` columns from the raw frame.

    ``AMOUNT`` and ``UNIT`` are parsed from the ``CDL`` column, and
    ``FILL_DATE`` from the ``DISPENSE_DT_TM`` column. The key columns are then
    type-normalised with ``enforce_types_on_keys``.

    Parameters
    ----------
    dat : pandas.DataFrame
        Raw frame containing ``CDL``, ``DISPENSE_DT_TM`` and ``NDC_DESC``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the three derived columns added; the input frame is
        not modified.
    """
    # assign() builds a new frame, so re-running a cell never sees a raw frame
    # that an earlier call has silently altered.
    parsed = dat.assign(
        AMOUNT=dat["CDL"].apply(extract_amount_from_cdl_string),
        UNIT=dat["CDL"].apply(extract_unit_from_cdl_string),
        FILL_DATE=dat["DISPENSE_DT_TM"].apply(get_date_from_timestamp_string),
    )

    return enforce_types_on_keys(parsed)

In [10]:
def build_drug_date_spine(dat):
    """Return every (date, drug, unit) combination covered by ``dat``.

    The result has one row per calendar day between the earliest and latest
    ``FILL_DATE`` for each distinct ``(NDC_DESC, UNIT)`` pair, so every drug is
    represented on every date even if nothing was dispensed that day.
    """
    first_day = dat['FILL_DATE'].min()
    last_day = dat['FILL_DATE'].max()

    # One row per day; the constant "key" column drives the cross join below.
    calendar = pd.DataFrame({"FILL_DATE": pd.date_range(first_day, last_day, freq="D")})
    calendar = calendar.assign(key=1)

    drug_units = dat[["NDC_DESC", "UNIT"]].drop_duplicates().assign(key=1)

    # Joining on the constant key pairs every day with every drug/unit.
    crossed = pd.merge(calendar, drug_units, on='key')
    spine = crossed[["FILL_DATE", "NDC_DESC", "UNIT"]]

    return enforce_types_on_keys(spine)

In [42]:
def get_daily_totals(raw_dat):
    """Return the total ``AMOUNT`` per day, drug and unit on a complete date spine.

    Parameters
    ----------
    raw_dat : pandas.DataFrame
        Raw frame as accepted by ``extract_amount_unit_and_fill_date``.

    Returns
    -------
    pandas.DataFrame
        Columns ``FILL_DATE``, ``NDC_DESC``, ``UNIT``, ``AMOUNT`` with one row
        per spine entry. ``AMOUNT`` is NaN for a (date, drug, unit) combination
        that has no rows in the data.
    """
    key_columns = ["FILL_DATE", "NDC_DESC", "UNIT"]

    dat = extract_amount_unit_and_fill_date(raw_dat)

    # Sum only AMOUNT. Aggregating every column would also try to "sum" the
    # raw text columns, which is slow and can fail on non-summable columns.
    daily_totals = dat.groupby(by=key_columns)["AMOUNT"].sum().reset_index()

    spine = build_drug_date_spine(dat)

    # A left join keeps every spine row; combinations without data get NaN.
    output = pd.merge(spine, daily_totals, how='left', on=key_columns)

    return output

In [43]:
# Aggregate the raw frame into per-day totals for every drug/unit.
daily_totals = get_daily_totals(raw_dat=dat)

In [44]:
# Inspect the column dtypes of the aggregated frame.
daily_totals.dtypes

FILL_DATE    datetime64[ns]
NDC_DESC             object
UNIT                 object
AMOUNT              float64
dtype: object

In [70]:
# Daily series for one drug, indexed by the key columns; missing AMOUNT
# values are replaced with 0.
example_drug = (
    daily_totals.loc[daily_totals["NDC_DESC"] == "atorvastatin 80 mg tablet"]
    .set_index(["FILL_DATE", "NDC_DESC", "UNIT"])
    .fillna(0)
)
example_drug

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,AMOUNT
FILL_DATE,NDC_DESC,UNIT,Unnamed: 3_level_1
2021-03-01,atorvastatin 80 mg tablet,EA,180.0
2021-03-02,atorvastatin 80 mg tablet,EA,0.0
2021-03-03,atorvastatin 80 mg tablet,EA,90.0
2021-03-04,atorvastatin 80 mg tablet,EA,0.0
2021-03-05,atorvastatin 80 mg tablet,EA,0.0
...,...,...,...
2021-05-27,atorvastatin 80 mg tablet,EA,90.0
2021-05-28,atorvastatin 80 mg tablet,EA,90.0
2021-05-29,atorvastatin 80 mg tablet,EA,0.0
2021-05-30,atorvastatin 80 mg tablet,EA,0.0


In [71]:
# Preview the first 20 rows of the example drug's daily series.
example_drug.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,AMOUNT
FILL_DATE,NDC_DESC,UNIT,Unnamed: 3_level_1
2021-03-01,atorvastatin 80 mg tablet,EA,180.0
2021-03-02,atorvastatin 80 mg tablet,EA,0.0
2021-03-03,atorvastatin 80 mg tablet,EA,90.0
2021-03-04,atorvastatin 80 mg tablet,EA,0.0
2021-03-05,atorvastatin 80 mg tablet,EA,0.0
2021-03-06,atorvastatin 80 mg tablet,EA,90.0
2021-03-07,atorvastatin 80 mg tablet,EA,180.0
2021-03-08,atorvastatin 80 mg tablet,EA,90.0
2021-03-09,atorvastatin 80 mg tablet,EA,270.0
2021-03-10,atorvastatin 80 mg tablet,EA,0.0


In [73]:
# Trailing rolling total of AMOUNT over a fixed number of rows. With one row
# per calendar day (assumes this drug has a single UNIT - TODO confirm) the
# window is that many days. The first ROLLING_WINDOW_DAYS - 1 rows are NaN
# because their window is incomplete.
ROLLING_WINDOW_DAYS = 5

# Roll over the AMOUNT column only, so the cell can be re-run after more
# columns have been added. Rolling.sum() has no `skipna` argument; AMOUNT has
# no missing values because they were filled with 0 when example_drug was built.
example_drug["N_DAY_ROLLING_SUM"] = example_drug["AMOUNT"].rolling(ROLLING_WINDOW_DAYS).sum()

example_drug

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,AMOUNT,N_DAY_ROLLING_SUM
FILL_DATE,NDC_DESC,UNIT,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-03-01,atorvastatin 80 mg tablet,EA,180.0,
2021-03-02,atorvastatin 80 mg tablet,EA,0.0,
2021-03-03,atorvastatin 80 mg tablet,EA,90.0,
2021-03-04,atorvastatin 80 mg tablet,EA,0.0,
2021-03-05,atorvastatin 80 mg tablet,EA,0.0,270.0
...,...,...,...,...
2021-05-27,atorvastatin 80 mg tablet,EA,90.0,315.0
2021-05-28,atorvastatin 80 mg tablet,EA,90.0,405.0
2021-05-29,atorvastatin 80 mg tablet,EA,0.0,315.0
2021-05-30,atorvastatin 80 mg tablet,EA,0.0,270.0


In [88]:
# Rolling sums used for the summary statistics: windows whose sum is exactly 0
# (no observations) are left out of both the mean and the standard deviation.
nonzero_windows = example_drug.loc[example_drug["N_DAY_ROLLING_SUM"] != 0, "N_DAY_ROLLING_SUM"]

example_drug["AVERAGE_ROLLING_SUM"] = nonzero_windows.mean()
example_drug["STD_ROLLING_SUM"] = nonzero_windows.std()
example_drug["RISK_FACTOR"] = 1

# MIN_VALUE is the average rolling sum; MAX_VALUE adds RISK_FACTOR standard
# deviations on top of it.
example_drug["MIN_VALUE"] = example_drug["AVERAGE_ROLLING_SUM"]
example_drug["MAX_VALUE"] = (
    example_drug["MIN_VALUE"]
    + example_drug["STD_ROLLING_SUM"] * example_drug["RISK_FACTOR"]
)

In [89]:
# Display example_drug, including the rolling-sum summary columns.
example_drug

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,AMOUNT,N_DAY_ROLLING_SUM,AVERAGE_ROLLING_SUM,STD_ROLLING_SUM,RISK_FACTOR,MIN_VALUE,MAX_VALUE
FILL_DATE,NDC_DESC,UNIT,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-03-01,atorvastatin 80 mg tablet,EA,180.0,,432.108434,199.760711,1,432.108434,631.869145
2021-03-02,atorvastatin 80 mg tablet,EA,0.0,,432.108434,199.760711,1,432.108434,631.869145
2021-03-03,atorvastatin 80 mg tablet,EA,90.0,,432.108434,199.760711,1,432.108434,631.869145
2021-03-04,atorvastatin 80 mg tablet,EA,0.0,,432.108434,199.760711,1,432.108434,631.869145
2021-03-05,atorvastatin 80 mg tablet,EA,0.0,270.0,432.108434,199.760711,1,432.108434,631.869145
...,...,...,...,...,...,...,...,...,...
2021-05-27,atorvastatin 80 mg tablet,EA,90.0,315.0,432.108434,199.760711,1,432.108434,631.869145
2021-05-28,atorvastatin 80 mg tablet,EA,90.0,405.0,432.108434,199.760711,1,432.108434,631.869145
2021-05-29,atorvastatin 80 mg tablet,EA,0.0,315.0,432.108434,199.760711,1,432.108434,631.869145
2021-05-30,atorvastatin 80 mg tablet,EA,0.0,270.0,432.108434,199.760711,1,432.108434,631.869145


In [90]:
# Persist the example drug's frame, with its index columns, as CSV.
example_drug.to_csv(path_or_buf="../data/processed/example.csv")