In [1]:
## Load TEDS-D 2017 Data into a Pandas Data Frame
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np
import os

## Specify WD. Later cells use paths relative to this directory
## (datasets/, analysis_objects/). The location can be overridden with the
## TEDS_PROJECT_DIR environment variable; the default is the project path
## this notebook was written against.
wd = os.environ.get("TEDS_PROJECT_DIR", "/home/abf/BINF667013_Final_Project/")
os.chdir(wd)

## Load the TEDS-D 2017 file and convert '-9' to NA
teds = pd.read_csv("datasets/tedsd_puf_2017.csv").replace({-9: np.nan})

## Preview the first rows. Only the last expression of a cell is displayed,
## so the preview has to come at the end of the cell.
teds.head()

### Count the number of complete cases in this dataset

In [None]:
# Complete cases: rows with no missing value in any column.
complete_cases = teds.dropna(axis=0, how="any")
complete_cases.shape

### Count the number of cases with valid observations for at least 50% of available measures

In [None]:
# Keep rows with fewer than 39 missing values (the cut-off this notebook uses
# for "at least 50% of measures valid").
# isna().sum(axis=1) counts the missing values of every row in one vectorized
# pass instead of calling Python's sum() once per row, and it also works for
# non-numeric columns, where np.isnan raises.
mask = teds.isna().sum(axis=1) < 39
teds[mask].shape

### Count the number of cases with valid observations for at least 75% of available measures

In [None]:
# Keep rows with fewer than 19 missing values (the cut-off this notebook uses
# for "at least 75% of measures valid").
# isna().sum(axis=1) counts the missing values of every row in one vectorized
# pass instead of calling Python's sum() once per row, and it also works for
# non-numeric columns, where np.isnan raises.
mask = teds.isna().sum(axis=1) < 19
teds[mask].shape

### Count the number of cases with valid observations for at least 90% of available measures

In [None]:
# Keep rows with valid observations in at least 90% of the columns.
# The bound is derived from the fraction rather than hard-coded: a fixed
# "fewer than 14 missing" cut-off admits rows that are well below 90% complete
# (e.g. with 76 columns, 13 missing values is only ~83% complete).
# NOTE(review): the percentage is taken over all columns of `teds`, matching
# how the other completeness cells count missing values.
min_valid = int(np.ceil(0.90 * teds.shape[1]))
mask = teds.notna().sum(axis=1) >= min_valid
teds[mask].shape

### Explore Correlation between missing values

In [None]:
import missingno as msno
# Heatmap used to explore how missingness in one column correlates with
# missingness in the others; fontsize=12 sets the label size.
msno.heatmap(teds, fontsize=12)

In [None]:
# Dendrogram view of the same missing-value structure.
# NOTE(review): presumably clusters columns by how similar their missingness
# patterns are - confirm against the missingno documentation.
msno.dendrogram(teds)

### Restrict data to records that are at least 75% complete

In [None]:
# Keep only records with fewer than 19 missing values (the notebook's
# "at least 75% complete" cut-off). The per-row missing count is computed in a
# single vectorized pass.
mask = teds.isna().sum(axis=1) < 19
# .copy() makes the subset an independent frame, so later column assignments
# on `teds` do not trigger pandas' SettingWithCopyWarning.
teds = teds[mask].copy()
print(teds.shape)

## make a subset for development
#teds=teds.iloc[0:16300, ]

### Convert Observations to Categorical Variables

In [None]:
# Cast every column except CASEID and DISYR to pandas' category dtype.
# A single astype() call with a column -> dtype mapping returns a new frame,
# which avoids assigning column by column into a frame that was produced by
# filtering (the source of SettingWithCopyWarning).
categorical_cols = [col for col in teds.columns if col not in ("CASEID", "DISYR")]
teds = teds.astype({col: "category" for col in categorical_cols})

### Use MissForest implemented in missingpy to impute missing values

In [None]:
## Compatibility shim: register the private module sklearn.neighbors._base
## under the name 'sklearn.neighbors.base' in sys.modules before importing
## missingpy.
## NOTE(review): presumably missingpy still imports the old module path, which
## the installed scikit-learn no longer provides - confirm against the
## installed package versions.
import sys
import sklearn.neighbors
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest
from os.path import exists

def random_chunks(df, n=100000, state=123):
    """Split ``df`` into disjoint random chunks of at most ``n`` rows.

    Rows are drawn without replacement, ``n`` at a time, and removed from the
    pool until no more than ``n`` rows remain; whatever is left becomes the
    final chunk. The final chunk can therefore be smaller than ``n`` (and is
    empty when ``df`` itself is empty).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Sampled rows are removed from the pool by index label,
        so the index labels should be unique.
    n : int, default 100000
        Maximum number of rows per chunk. Must be at least 1.
    state : int, default 123
        ``random_state`` passed to every ``DataFrame.sample`` call.

    Returns
    -------
    list of pandas.DataFrame
        The chunks, in the order they were drawn, with the remainder last.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1 (``n == 0`` would otherwise never shrink
        the pool and loop forever).
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    chunks = []
    remaining = df
    while remaining.shape[0] > n:
        # The same seed is reused for every draw; the draws still differ
        # because the pool shrinks after each one.
        chunk = remaining.sample(n=n, replace=False, random_state=state)
        chunks.append(chunk)
        remaining = remaining.drop(chunk.index)

    chunks.append(remaining)
    return chunks
        
        
def impute_missforest(df, seed=123):
    """Impute the missing values of one chunk with MissForest.

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk to impute. Must contain the columns ``CASEID`` and ``DISYR``;
        both are dropped before imputation and are not part of the result.
        Every remaining column is declared categorical to the imputer.
    seed : int, default 123
        Passed to ``MissForest`` as ``random_state``.

    Returns
    -------
    pandas.DataFrame
        The imputed values, with the same column names and order as ``df``
        minus ``CASEID``/``DISYR``, and the same row index as ``df`` so each
        imputed row can still be matched to the record it came from.
    """
    # Printed once per chunk; shows which rows are being imputed.
    print(df.index)

    features = df.drop(columns=['CASEID', 'DISYR'])

    imputer = MissForest(
        random_state=seed,
        # Criterion names are passed explicitly because the package's own
        # setting is misspecified (per the original author's note).
        criterion=('squared_error', 'gini'),
        n_jobs=-1
    )
    X = imputer.fit_transform(
        features,
        # Every remaining column is categorical; derive the positions from the
        # data instead of hard-coding the column count.
        cat_vars=np.arange(features.shape[1])
    )

    # Take column labels and row index from `features` rather than from fixed
    # positions, so the result stays correct regardless of where CASEID and
    # DISYR sit in `df` and keeps the link back to the source rows.
    return pd.DataFrame(X, columns=features.columns, index=features.index)

### Verify that all chunks have at least 1000 observations for every variable

In [None]:
chunks = random_chunks(df=teds, n=100000)
for c in chunks:
    # Number of non-missing observations per column in this chunk; count()
    # does this in one vectorized pass instead of a Python-level sum over
    # every column.
    observed = c.count()
    print(c.index)
    # Columns with fewer than 1000 observed values (an empty Series if none).
    print(observed[observed < 1000])


In [None]:
# Sanity check: stacking the chunks back together should give the full shape.
recombined = pd.concat(chunks)
print(recombined.shape)

In [None]:
# Imputation is expensive, so reuse the cached result when it exists.
cache_path = "analysis_objects/teds_imputed.csv"
if exists(cache_path):
    # The cache is written together with its index, which lands in the first
    # CSV column. Reading it back as the index keeps the cached frame's
    # columns identical to a freshly imputed one (otherwise an extra
    # "Unnamed: 0" column appears only on the cached path).
    teds_imputed = pd.read_csv(cache_path, index_col=0)
else:
    chunks = random_chunks(df=teds, n=100000)
    dflist = [impute_missforest(c) for c in chunks]
    teds_imputed = pd.concat(dflist)
    teds_imputed.to_csv(
        path_or_buf=cache_path
    )

### Join State Level Data