In [1]:
## Load TEDS-D 2017 Data into a Pandas Data Frame
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np
import os

## Specify WD
## The project directory can be overridden through the TEDS_PROJECT_DIR
## environment variable; the original absolute path remains the default.
wd = os.environ.get("TEDS_PROJECT_DIR", "/home/abf/BINF667013_Final_Project/")
os.chdir(wd)

## Read the TEDS-D 2017 file, addressed relative to the working directory set above.
teds = pd.read_csv("datasets/tedsd_puf_2017.csv")

## Convert '-9' to NA
teds = teds.replace(-9, np.nan)

### Count the number of complete cases in this dataset

In [None]:
## (rows, columns) of the records that have no missing value in any column
teds.dropna(axis=0, how="any").shape

### Count the number of cases with valid observations for at least 50% of available measures

In [None]:
## Keep rows with fewer than 39 missing values.
## The missing values are counted per row with vectorised pandas operations
## instead of a Python-level sum over every row.
mask = teds.isna().sum(axis=1) < 39
teds[mask].shape

### Count the number of cases with valid observations for at least 75% of available measures

In [None]:
## Keep rows with fewer than 19 missing values.
## The missing values are counted per row with vectorised pandas operations
## instead of a Python-level sum over every row.
## NOTE(review): with 76 columns, a row with exactly 19 missing values is
## exactly 75% complete yet is excluded by `< 19` -- confirm the intended boundary.
mask = teds.isna().sum(axis=1) < 19
teds[mask].shape

### Count the number of cases with valid observations for at least 90% of available measures

In [None]:
## Keep rows with fewer than 8 missing values.
## The missing values are counted per row with vectorised pandas operations
## instead of a Python-level sum over every row.
mask = teds.isna().sum(axis=1) < 8
teds[mask].shape

### Explore Correlation between missing values

In [None]:
import missingno as msno
## Heatmap used to explore how missing values in different columns relate to each other
msno.heatmap(df=teds, fontsize=12)

In [None]:
## Dendrogram drawn by missingno from the missing-value pattern of teds
msno.dendrogram(df=teds)

### Restrict data to records that are at least 75% complete

In [2]:
## Keep rows with fewer than 19 missing values; the per-row count is vectorised.
## NOTE(review): with 76 columns, a row with exactly 19 missing values is
## exactly 75% complete yet is excluded by `< 19` -- confirm the intended boundary.
mask = teds.isna().sum(axis=1) < 19
teds = teds[mask]
print(teds.shape)

## make a subset for development
## An explicit copy makes the subset an independent frame, so later column
## assignments do not write into a slice of the filtered data.
teds = teds.iloc[0:10000, ].copy()

(1308896, 76)


### Convert Observations to Categorical Variables

In [3]:
## Cast every column except CASEID and DISYR to the pandas 'category' dtype.
## One astype call with a column -> dtype mapping returns a new frame, which
## avoids assigning columns one by one into a frame obtained by slicing.
categorical_cols = [col for col in teds.columns if col not in ("CASEID", "DISYR")]
teds = teds.astype({col: "category" for col in categorical_cols})

### Use MissForest implemented in missingpy to impute missing values

In [11]:
## Compatibility shim: register sklearn.neighbors._base in sys.modules under
## the name 'sklearn.neighbors.base' before missingpy is imported.
## NOTE(review): presumably missingpy imports the old module path, which the
## installed scikit-learn no longer provides -- confirm against pinned versions.
import sys
import sklearn.neighbors
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest
from os.path import exists

def data_frame_chunks(df, n):
    """Yield consecutive row-wise chunks of ``df`` with at most ``n`` rows each.

    Chunks are cut by row position, not by index label, so every row is
    returned exactly once even when the index has gaps (for example after
    boolean filtering) or is not numeric at all. Each chunk keeps the index
    labels of its rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.
    n : int
        Maximum number of rows per chunk; must be at least 1.

    Yields
    ------
    pandas.DataFrame
        The rows at positions ``[i, i + n)`` for ``i = 0, n, 2n, ...``.
        The last chunk may be shorter. An empty frame yields nothing.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1 (raised when iteration starts).
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    for start in range(0, df.shape[0], n):
        # Positional slicing: the end bound is exclusive and is clipped to
        # the number of rows automatically.
        yield df.iloc[start:start + n]

def impute_missforest(df, seed=123):
    """Impute missing values in ``df`` with MissForest and return a DataFrame.

    The 'CASEID' and 'DISYR' columns are dropped before imputation; every
    remaining column is passed to MissForest as a categorical variable.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; must contain the columns 'CASEID' and 'DISYR'.
    seed : int, default 123
        Value passed to MissForest as ``random_state``.

    Returns
    -------
    pandas.DataFrame
        The imputed values, with the column names of the imputed columns and
        the row index of ``df``, so rows stay aligned with the input and
        chunk-wise results can be concatenated without duplicate labels.
    """
    # The number and names of the imputed columns are derived from the frame
    # itself instead of assuming 74 features located at positions 2..75.
    features = df.drop(columns=['CASEID', 'DISYR'])

    imputer = MissForest(
        random_state=seed,
        criterion=('squared_error', 'gini'),  # Package misspecifies criterion
        n_jobs=-1
    )
    X = imputer.fit_transform(
        features,
        cat_vars=np.arange(features.shape[1])  # all columns are categorical
    )

    return pd.DataFrame(X, columns=features.columns, index=features.index)

## Impute the data chunk by chunk (up to 100000 rows per chunk),
## stack the imputed chunks and save the result as CSV.
chunks = data_frame_chunks(df=teds, n=100000)
dflist = list(map(impute_missforest, chunks))
teds_imputed = pd.concat(dflist)
teds_imputed.to_csv("analysis_objects/teds_imputed.csv")




In [None]:
#chunks=[chunk for chunk in data_frame_chunks(teds, 50)]
#chunks
#imputed = pool.map(impute_missforest, chunks)

In [8]:
import timeit as tm
## Benchmark: time three complete imputation runs with n_jobs=-1, then with n_jobs=1.
## The timed statement looks up `imputer`, `teds` and `np` through the setup
## import from __main__, so `imputer` is rebound at module level before each measurement.
bench_stmt = "imputer.fit_transform(teds.drop(columns=['CASEID', 'DISYR']),cat_vars=np.array([x for x in range(0,74)]))"
bench_setup = "from __main__ import imputer, teds, np"

for jobs in (-1, 1):
    imputer = MissForest(
        random_state=123,
        criterion=('squared_error', 'gini'), # Package mispecifies criterion
        n_jobs=jobs
    )
    # timeit returns the total time in seconds for all three runs
    result = tm.timeit(bench_stmt, bench_setup, number=3)
    print(result)

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
814.7710349503905
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
272.98170974943787


In [10]:
## Repeat the n_jobs=1 benchmark: three complete imputation runs timed together.
single_core_settings = dict(
    random_state=123,
    criterion=('squared_error', 'gini'),  # Package mispecifies criterion
    n_jobs=1,
)
imputer = MissForest(**single_core_settings)

# `imputer`, `teds` and `np` are resolved through the setup import from __main__.
result = tm.timeit(
    stmt="imputer.fit_transform(teds.drop(columns=['CASEID', 'DISYR']),cat_vars=np.array([x for x in range(0,74)]))",
    setup="from __main__ import imputer, teds, np",
    number=3,
)
print(result)

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
1031.4057905506343


In [None]:

## Reuse the cached imputation when the CSV exists; otherwise impute and write the cache.
if exists("analysis_objects/teds_imputed.csv"):
    ## The cache is written together with its row index, so the first CSV
    ## column is read back as the index instead of becoming an extra
    ## "Unnamed: 0" data column that the freshly computed frame does not have.
    teds_imputed = pd.read_csv("analysis_objects/teds_imputed.csv", index_col=0)
else:
    ## Run imputation
    ## CASEID and DISYR are not imputed; the number and names of the imputed
    ## columns are taken from the frame rather than hard-coded as 74 / [2:76].
    features = teds.drop(columns=['CASEID', 'DISYR'])
    imputer = MissForest(
        random_state=123,
        criterion=('squared_error', 'gini') # Package misspecifies criterion
    )
    X = imputer.fit_transform(
        features,
        cat_vars=np.arange(features.shape[1])  # all columns are categorical
    )

    ## Keep the row labels of teds so the imputed rows stay aligned with it.
    teds_imputed = pd.DataFrame(X, columns=features.columns, index=features.index)
    teds_imputed.to_csv(
        path_or_buf="analysis_objects/teds_imputed.csv"
    )


In [None]:
## Scratch loop: steps i through 0, 7, 14, ... below 100 and derives two bounds
## per step -- low is i, high is i + 7 capped at 100.
## NOTE(review): low and high are not used in the lines visible here; this looks
## like an unfinished experiment with chunk boundaries -- confirm or remove.
for i in range(0, 100,7):
    low = i
    high = min(i + 7, 100)
    