In [67]:
## Load TEDS-D 2017 Data into a Pandas Data Frame
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np
import os

## Specify WD
## The project directory may be supplied through the TEDS_STUDY_DIR
## environment variable so the notebook is not tied to one machine's absolute
## path; when the variable is unset the original location is used.
#wd = "/home/abf/BINF667013_Final_Project/"
wd = os.environ.get(
    "TEDS_STUDY_DIR",
    "/Users/adam/Documents/BINF667013_BigDataAnalyticsHealthcare/" +
    "Final_Project/TEDS_Study"
)
os.chdir(wd)

## All relative paths used below are resolved against wd
teds=pd.read_csv("datasets/tedsd_puf_2017.csv")

## Convert '-9' to NA
teds=teds.replace({-9:np.nan})

### Count the number of complete cases in this dataset

In [None]:
## A complete case is a row in which every column holds an observed value
teds.loc[teds.notna().all(axis=1)].shape

### Count the number of cases with valid observations for at least 50% of available measures

In [None]:
## Keep rows with fewer than 39 missing values (missing cells counted per row)
mask = teds.isna().sum(axis=1) < 39
teds[mask].shape

### Count the number of cases with valid observations for at least 75% of available measures

In [None]:
## Fraction of observed (non-missing) columns per row, compared with the 75%
## cut-off directly. The previous hard-coded test (missing < 19) left out rows
## with exactly 19 missing values, which are exactly 75% complete when the
## frame has 76 columns.
mask = teds.notna().mean(axis=1) >= 0.75
teds[mask].shape

### Count the number of cases with valid observations for at least 90% of available measures

In [None]:
## Fraction of observed (non-missing) columns per row, compared with the 90%
## cut-off directly. The previous hard-coded test (missing < 14) let through
## rows with up to 13 missing values, which is well below 90% complete for a
## 76-column frame.
mask = teds.notna().mean(axis=1) >= 0.90
teds[mask].shape

### Explore Correlation between missing values

In [None]:
## Heatmap from missingno over the full TEDS frame, used here to see how
## missingness in one column relates to missingness in another.
import missingno as msno
msno.heatmap(teds, fontsize=12)

In [None]:
## Dendrogram from missingno over the same frame; presumably groups columns
## whose missingness patterns are similar -- confirm against the missingno docs.
msno.dendrogram(teds)

### Restrict data to records that are at least 75% complete

In [None]:
## Keep rows in which at least 75% of the columns are observed. The cut-off is
## expressed as a fraction of the columns; the previous hard-coded test
## (missing < 19) dropped rows with exactly 19 missing values, which are
## exactly 75% complete when the frame has 76 columns.
mask = teds.notna().mean(axis=1) >= 0.75
## .copy() gives an independent frame, so later column assignments do not
## operate on a slice of the unfiltered data (SettingWithCopyWarning).
teds = teds[mask].copy()
print(teds.shape)

## make a subset for development
#teds=teds.iloc[0:16300, ]

### Convert Observations to Categorical Variables

In [None]:
## Every column except the identifiers CASEID and DISYR becomes categorical.
## A single astype() call builds a new frame instead of assigning columns one
## at a time onto a filtered frame, which raised SettingWithCopyWarning.
teds = teds.astype(
    {col: 'category' for col in teds.columns if col not in ("CASEID", "DISYR")}
)

### Use MissForest implemented in missingpy to impute missing values

In [68]:
## Compatibility shim for missingpy: register the private module
## sklearn.neighbors._base under the name 'sklearn.neighbors.base' in
## sys.modules *before* importing missingpy. Presumably missingpy still
## imports the old module name, which the installed scikit-learn no longer
## provides -- the alias must therefore stay above the missingpy import.
import sys
import sklearn.neighbors
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest
from os.path import exists

def random_chunks(df, n=100000, state=123):
    """Split a data frame into random, non-overlapping chunks of n rows.

    Rows are drawn without replacement, n at a time, from whatever has not
    been drawn yet; each draw uses the same ``random_state``. Once n or fewer
    rows are left, they form the final chunk.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. It is not modified.
    n : int, default 100000
        Number of rows in every chunk except possibly the last.
    state : int, default 123
        Seed passed to ``DataFrame.sample`` for every draw.

    Returns
    -------
    list of pandas.DataFrame
        The chunks, in the order they were drawn; the last element holds the
        remaining rows (at most n of them).
    """
    remaining = df
    pieces = []
    while len(remaining) > n:
        piece = remaining.sample(n=n, replace=False, random_state=state)
        pieces.append(piece)
        # Remove the rows just drawn so the next draw cannot pick them again
        remaining = remaining.drop(piece.index)

    pieces.append(remaining)
    return pieces
        
        
def impute_missforest(df, seed=123):
    """Impute missing values in one chunk of records with MissForest.

    The identifier columns 'CASEID' and 'DISYR' are dropped before fitting and
    every remaining column is declared categorical to the imputer.

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk to impute; must contain 'CASEID' and 'DISYR'.
    seed : int, default 123
        Random state handed to MissForest.

    Returns
    -------
    pandas.DataFrame
        The imputed values, labelled with the feature column names and the
        row index of ``df`` so each imputed row can be traced back to its
        source record. (Previously the index was reset to 0..len-1, which
        lost that link once the data had been split into random chunks.)
    """
    print(df.index)
    features = df.drop(columns=['CASEID', 'DISYR'])
    imputer = MissForest(
        random_state=seed,
        criterion=('squared_error', 'gini'), # Package mispecifies criterion
        n_jobs=-1
    )
    # Column positions and labels are taken from the frame that is actually
    # fitted, so they stay correct whatever the position of the identifier
    # columns or the number of features.
    X = imputer.fit_transform(
        features,
        cat_vars=np.arange(features.shape[1])
    )

    return pd.DataFrame(X, index=features.index, columns=features.columns)

### Verify that all chunks have at least 1000 observations for every variable

In [None]:
chunks = random_chunks(df=teds, n=100000)
for chunk in chunks:
    ## Number of non-missing observations per column within this chunk
    n_observed = chunk.notna().sum(axis=0)
    print(chunk.index)
    ## Columns with fewer than 1000 observed values (empty output = all fine)
    print(n_observed.loc[n_observed < 1000])


In [None]:
## Sanity check: shape of all chunks stacked back together (compare with
## teds.shape to confirm no rows were lost or duplicated by the split)
print(pd.concat(chunks).shape)

In [69]:
## The imputed data set is cached on disk and reused when the file is present;
## otherwise it is computed chunk by chunk and written out.
imputed_path = "analysis_objects/teds_imputed.csv"
if exists(imputed_path):
    ## The cache is written together with its index, so read the first column
    ## back as the index; otherwise the cached frame carries an extra
    ## 'Unnamed: 0' data column that the freshly computed frame does not have.
    teds_imputed = pd.read_csv(imputed_path, index_col=0)
else:
    ## Create the output directory before starting the imputation so the
    ## final write cannot fail on a missing folder.
    os.makedirs("analysis_objects", exist_ok=True)
    chunks = random_chunks(df=teds, n=100000)
    dflist = [impute_missforest(c) for c in chunks]
    teds_imputed = pd.concat(dflist)
    teds_imputed.to_csv(path_or_buf=imputed_path)

### Prepare State Level Data

In [70]:
## Load the Involuntary Commitment Laws dataset; a lone "." marks a missing
## value and is turned into NA
ic_laws = pd.read_csv("datasets/IC_Laws.csv").replace({".": np.nan})

## Attach the numeric state code (STATE) from the FIPS lookup table by matching
## state names; STATE is the join column used against the TEDS data.
fips_table = pd.read_csv('datasets/state.txt', sep="|")
ic_laws = (
    fips_table[["STATE", "STATE_NAME"]]
    .merge(
        ic_laws,
        how='inner',
        left_on="STATE_NAME",
        right_on="Jurisdiction",
    )
    .drop(columns="STATE_NAME")
)

## Flag laws effective before 2016. The dates are compared as text, which
## orders correctly as long as they are written YYYY-MM-DD.
ic_laws = ic_laws.assign(pre_2016=ic_laws['Effective Date'] < "2016-01-01")
ic_laws


Unnamed: 0,STATE,Jurisdiction,Effective Date,Valid Through Date,IC_law,IC_Law_Specific,IC_law_broad,IC_Law_Def,IC_Circumstances_Danger to self,IC_Circumstances_Danger to others,...,IC_rights_Other rights excluded from this dataset,IC_treatment without consent_Receive medication,IC_treatment without consent_Surgery,IC_treatment without consent_Electric shock,IC_treatment without consent_Restrained,IC_treatment without consent_Secluded,IC_treatment without consent_Not Specified in the law,IC_recommitment,IC_recommitment review,pre_2016
0,1,Alabama,2018-03-01,2018-03-01,0,,,,,,...,,,,,,,,,,False
1,2,Alaska,2016-07-07,2018-03-01,1,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,False
2,4,Arizona,2018-03-01,2018-03-01,0,,,,,,...,,,,,,,,,,False
3,5,Arkansas,2017-08-01,2018-03-01,1,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,False
4,6,California,2018-01-01,2018-03-01,1,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,False
5,8,Colorado,2017-05-25,2018-03-01,1,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,False
6,9,Connecticut,2017-10-01,2018-03-01,1,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,False
7,10,Delaware,2015-04-23,2018-03-01,1,0.0,1.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,True
8,11,District of Columbia,1970-07-29,2018-03-01,1,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,True
9,12,Florida,2017-07-01,2018-03-01,1,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,False


### Join State Laws to TEDS Data

In [None]:
## Attach the state-level law indicators to the (non-imputed) TEDS records,
## matching the TEDS state code STFIPS to the STATE code in ic_laws.
teds_laws = pd.merge(
    left=teds,
    right=ic_laws,
    left_on="STFIPS",
    right_on="STATE",
)
teds_laws.to_csv("analysis_objects/teds_laws.csv")

## Same join for the imputed records.
teds_imp_laws = pd.merge(
    left=teds_imputed,
    right=ic_laws,
    left_on="STFIPS",
    right_on="STATE",
)
## Write the imputed join itself; previously teds_laws was written to this
## file, so the imputed result was never saved.
teds_imp_laws.to_csv("analysis_objects/teds_imp_laws.csv")
