# Data Preparation

In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np
import os

## Project working directory. The process changes into it, so the relative
## paths used for reading and writing files resolve against this folder.
## Alternative location, kept for reference:
#wd = (
#    "/Users/adam/Documents/BINF667013_BigDataAnalyticsHealthcare/" +
#    "Final_Project/TEDS_Study"
#)
wd = "/home/abf/BINF667013_Final_Project/"
os.chdir(wd)

In [None]:
## Read the TEDS-D 2017 file into a pandas data frame
teds = pd.read_csv("datasets/tedsd_puf_2017.csv")
teds

## Recode the value -9 as NaN throughout the frame
teds = teds.replace(to_replace=-9, value=np.nan)

## Primary variables used to restrict the data in the following step
primary = [
    'GENDER', 'STFIPS', 'PSOURCE', 'LOS',
    'SERVICES', 'SERVICES_D', 'REASON',
    'SUB1', 'SUB2', 'SUB3',
    'SUB1_D', 'SUB2_D', 'SUB3_D',
]

### Restrict data to records that are observed in all of our primary variables

In [None]:
## Index labels of the rows with no missing value in any primary variable
primary_mask = teds[primary].dropna(how="any").index

## Keep only those rows and report the resulting dimensions
teds = teds.loc[primary_mask]
print(teds.shape)

### Count the number of complete cases in this dataset

In [None]:
## Dimensions of the frame after dropping every row that has any missing value
teds.dropna(axis=0, how="any").shape

### Count the number of cases with valid observations for at least 50% of available measures

In [None]:
## Count missing values per row, then keep rows with fewer than 39 of them
missing_per_row = teds.isna().sum(axis=1)
mask = missing_per_row < 39
teds[mask].shape

### Count the number of cases with valid observations for at least 75% of available measures

In [None]:
## Count missing values per row, then keep rows with fewer than 19 of them
## NOTE(review): if the frame has 76 columns, "at least 75%" would also admit
## rows with exactly 19 missing values - confirm the intended cutoff.
missing_per_row = teds.isna().sum(axis=1)
mask = missing_per_row < 19
teds[mask].shape

### Count the number of cases with valid observations for at least 90% of available measures

In [None]:
## Keep rows that are observed on at least 90% of the columns, i.e. rows whose
## number of missing values is at most 10% of the column count (rounded down).
## The cutoff is derived from the frame's width rather than hard-coded so that
## it always matches the stated percentage.
mask = teds.isna().sum(axis=1) <= teds.shape[1] // 10
teds[mask].shape

### Explore Correlation between missing values

In [None]:
## missingno is used here to visualise patterns in the missing values
import missingno as msno
## Heatmap for the rows selected by primary_mask; fontsize is passed through
## to the plotting call.
msno.heatmap(teds.loc[primary_mask,:], fontsize=12)

In [None]:
## Dendrogram of the teds frame, drawn with missingno's default settings
msno.dendrogram(teds)

### Convert Observations to Categorical Variables

In [None]:
## Store every column except CASEID and DISYR as a pandas categorical
for col in teds.columns:
    if col in ("CASEID", "DISYR"):
        continue
    teds[col] = teds[col].astype('category')

### Use MissForest implemented in missingpy to impute missing values

In [None]:
## Compatibility shim: register scikit-learn's private `sklearn.neighbors._base`
## module under the name `sklearn.neighbors.base` in sys.modules.
## Presumably missingpy imports the latter name, which the installed
## scikit-learn no longer provides - verify against the installed versions.
## The alias must be registered BEFORE missingpy is imported.
import sys
import sklearn.neighbors
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest
from os.path import exists

def random_chunks(df, n=100000, state=123):
    """Split ``df`` into random chunks of at most ``n`` rows each.

    Rows are repeatedly sampled without replacement (always with the same
    ``random_state``) from what is left of ``df`` and removed by index label,
    until no more than ``n`` rows remain; the remainder forms the last chunk.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Sampled rows are removed by index label, so the
        chunks only partition ``df`` if its index labels are unique.
    n : int
        Maximum number of rows per chunk; must be at least 1.
    state : int
        Seed passed to ``DataFrame.sample`` for every draw.

    Returns
    -------
    list of pandas.DataFrame
        The sampled chunks followed by the remaining rows (possibly empty
        if ``df`` itself is empty).

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1. With ``n == 0`` each draw would remove
        nothing and the loop would never terminate.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    chunks = []
    while df.shape[0] > n:
        chunk = df.sample(n=n, replace=False, random_state=state)
        chunks.append(chunk)
        # Remove the sampled rows so the next draw cannot pick them again
        df = df.drop(chunk.index)

    chunks.append(df)
    return chunks
        
        
def impute_missforest(df, seed=123):
    """Impute missing values in ``df`` with MissForest.

    The CASEID and DISYR columns are excluded from imputation and every
    remaining column is declared categorical to the imputer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing CASEID, DISYR and the columns to impute.
    seed : int
        Random state handed to MissForest.

    Returns
    -------
    pandas.DataFrame
        The imputed values, labelled with the names of the imputed columns
        and carrying the index of ``df`` so that rows from different chunks
        stay distinguishable after concatenation.
    """
    print(df.index)
    features = df.drop(columns=['CASEID', 'DISYR'])
    imputer = MissForest(
        random_state=seed,
        # Passed explicitly because the package's default criterion is
        # mis-specified (per the original author's note)
        criterion=('squared_error', 'gini'),
        n_jobs=-1
    )
    X = imputer.fit_transform(
        features,
        # Treat every feature column as categorical; positions are derived
        # from the data instead of assuming a fixed column count
        cat_vars=np.arange(features.shape[1])
    )

    # Take column names and index from the input rather than from fixed
    # positional slices, so the labels always match what was imputed
    return pd.DataFrame(X, columns=features.columns, index=df.index)

### Verify that all chunks have at least 1000 observations for every variable

In [None]:
## For each chunk, print its index and the per-column counts of observed
## (non-missing) values that fall below 1000
chunks = random_chunks(df=teds, n=100000)
for chunk in chunks:
    observed = chunk.notna().sum(axis=0)
    print(chunk.index)
    print(observed[observed < 1000])


In [None]:
## Dimensions of the chunks stacked back together row-wise
print(pd.concat(chunks, axis=0).shape)

In [None]:
## Imputation result is cached on disk: compute and save it only when the
## file is absent, otherwise load the stored copy.
if not exists("analysis_objects/teds_imputed.csv"):
    chunks = random_chunks(df=teds.loc[primary_mask, :], n=100000)
    dflist = []
    for chunk in chunks:
        dflist.append(impute_missforest(chunk))
    teds_imputed = pd.concat(dflist)
    teds_imputed.to_csv(path_or_buf="analysis_objects/teds_imputed.csv")
else:
    teds_imputed = pd.read_csv(
        "analysis_objects/teds_imputed.csv", index_col=0
    )

### Prepare State Level Data

In [None]:
## Load the Involuntary Commitment Laws dataset, recoding "." entries as NaN
ic_laws = pd.read_csv("datasets/IC_Laws.csv").replace({".": np.nan})

## Attach the STATE identifier from the FIPS lookup table (the join column
## for the TEDS data) by matching state names to the Jurisdiction column.
fips_table = pd.read_csv('datasets/state.txt', sep="|")
ic_laws = (
    fips_table[["STATE", "STATE_NAME"]]
    .merge(
        ic_laws,
        left_on="STATE_NAME",
        right_on="Jurisdiction",
        how='inner',
    )
    .drop(columns="STATE_NAME")
)

## Flag rows whose 'Effective Date' compares less than "2016-01-01".
## NOTE(review): this compares against a string literal, so it is only
## chronological if the column holds ISO-formatted date strings - confirm.
ic_laws['pre_2016'] = ic_laws['Effective Date'] < "2016-01-01"
ic_laws.STATE.value_counts()


### Add Drug Specific Case and Relapse Features to TEDS and TEDS Imputed

In [None]:
def isrelapse(x, **kwargs):
    """Return 0 when SUB1_D, SUB2_D and SUB3_D of ``x`` all equal 1, else 1.

    ``x`` is any mapping-like record (e.g. a DataFrame row) indexable by
    those three keys. Extra keyword arguments are accepted and ignored.
    """
    discharge_keys = ('SUB1_D', 'SUB2_D', 'SUB3_D')
    # Keys are checked in order and checking stops at the first mismatch
    all_code_one = all(x[key] == 1 for key in discharge_keys)
    return 0 if all_code_one else 1


def _add_case_features(df):
    """Return a copy of ``df`` with drug-specific case and relapse columns.

    Columns added, in this order:

    * ``alc_cases``, ``hrn_cases``, ``met_cases``: boolean, True where SUB1
      equals 2, 5 or 10 respectively.
    * ``relapse``: integer, 0 where SUB1_D, SUB2_D and SUB3_D all equal 1
      and 1 otherwise (the same rule as ``isrelapse``).
    * ``alc_relapse``, ``hrn_relapse``, ``met_relapse``: integer, 1 where
      ``relapse`` is 1 and SUB1 equals the matching code, else 0.

    Everything is computed with whole-column comparisons instead of
    row-by-row ``apply`` calls, which matters on frames of this size.
    """
    sub1_codes = {'alc': 2, 'hrn': 5, 'met': 10}

    all_code_one = (
        (df['SUB1_D'] == 1) & (df['SUB2_D'] == 1) & (df['SUB3_D'] == 1)
    )
    relapse = (~all_code_one).astype(int)
    is_case = {drug: df['SUB1'] == code for drug, code in sub1_codes.items()}

    with_cases = df.assign(
        **{f"{drug}_cases": flag for drug, flag in is_case.items()},
        relapse=relapse,
    )
    return with_cases.assign(
        **{
            f"{drug}_relapse": (flag & (relapse == 1)).astype(int)
            for drug, flag in is_case.items()
        }
    )


## Add Relapse Rates to Raw Data Set
teds = _add_case_features(teds)

## Add Relapse Rates to Imputed Data Set
teds_imputed = _add_case_features(teds_imputed)

### Join State Laws to TEDS Data

In [None]:
## Raw TEDS data joined to the state laws (STFIPS matched to STATE).
## The joined table is cached on disk and only rebuilt when the file is absent.
if not exists("analysis_objects/teds_laws.csv"):
    teds_laws = teds.merge(ic_laws, left_on="STFIPS", right_on="STATE")
    teds_laws.to_csv("analysis_objects/teds_laws.csv", index=False)
else:
    teds_laws = pd.read_csv("analysis_objects/teds_laws.csv")

## Same join and caching for the imputed TEDS data
if not exists("analysis_objects/teds_imp_laws.csv"):
    teds_imp_laws = teds_imputed.merge(
        ic_laws, left_on="STFIPS", right_on="STATE"
    )
    teds_imp_laws.to_csv("analysis_objects/teds_imp_laws.csv", index=False)
else:
    teds_imp_laws = pd.read_csv("analysis_objects/teds_imp_laws.csv")