In [1]:
import numpy as np
from operator import itemgetter
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.cross_validation import cross_val_score, train_test_split
from sqlalchemy import create_engine
import tqdm

### 1. Preliminary load-n-scrub

In [2]:
def get_cnx():
    '''Create the SQLAlchemy engine used to read the hospital tables.

    The connection URL is taken from the JAYSIPS_DB_URL environment variable
    when it is set and non-empty, so credentials do not have to be edited
    into the notebook. Otherwise the hard-coded URL below is used; in that
    case replace its USERNAME:PASSWORD part with your own.

    Returns:
        Whatever `create_engine` returns for the chosen URL.
    '''
    import os  # local import: the notebook's import cell does not bring in os

    #*********************************************REPLACE WITH YOUR USERNAME:PASSWORD
    default_url = 'postgresql://shermanash:***@54.236.113.118:5432/jaysips'
    cnx = create_engine(os.environ.get('JAYSIPS_DB_URL') or default_url)
    return cnx

In [3]:
def get_datasets(cnx):
    '''Read the four hospital tables through `cnx`.

    Every table is fetched in full with a SELECT * query. The result is a
    dict that maps each institution's display name to its DataFrame.
    '''
    def fetch(table):
        # One full-table read per hospital.
        return pd.read_sql_query('SELECT * FROM ' + table, cnx)

    cleveland, hungary, longbeach, swiss = [
        fetch(table) for table in ('cleveland', 'hungary', 'longbeach', 'swiss')
    ]
    return {
        'Cleveland Clinic': cleveland,
        'Hungarian Institute of Cardiology': hungary,
        'Swiss University Hospitals': swiss,
        'Long Beach V.A. Medical Center': longbeach,
    }

In [46]:
def clean_datasets(datasets, binary=True, drop_above=0.9, avg_below=0.1):
    '''Scrub every hospital DataFrame and return the cleaned frames in a new dict.

    For each frame, in order:
      1. keep only the first 58 columns;
      2. if `binary` is truthy, map the values 2, 3 and 4 of `num` to 1;
      3. drop the columns at positions 44-49 and 51-53, plus `pncaden` and `dm`;
      4. replace every -9 with NaN;
      5. drop columns whose NaN count exceeds `drop_above` * number of rows,
         and mean-fill columns whose NaN count is positive but below
         `avg_below` * number of rows. Other columns keep their NaNs.

    The frames in `datasets` are not modified.

    Args:
        datasets: dict mapping a dataset name to its DataFrame. Each frame
            must contain the columns `num` (within the first 58), `pncaden`
            and `dm`.
        binary: collapse the heart-disease column `num` to 0/1.
        drop_above: NaN fraction above which a column is dropped.
        avg_below: NaN fraction below which NaNs are replaced by the column mean.

    Returns:
        dict with the same keys as `datasets` and the cleaned DataFrames.
    '''
    cleaned = {}
    # .items() exists on both Python 2 and 3 dicts; .iteritems() is Python 2 only.
    for name, hosp in datasets.items():

        # Getting rid of irrelevant columns. The copy keeps the caller's frame
        # untouched and avoids assigning `num` on a slice of it.
        hosp = hosp.iloc[:, :58].copy()
        if binary:
            # Making heart disease feature binary 0/1
            hosp['num'] = hosp['num'].replace([2, 3, 4], 1)
        # Dropping columns that are unfilled (-9s) in every dataset; both
        # position ranges refer to the 58-column frame.
        unfilled = list(hosp.columns[44:50]) + list(hosp.columns[51:54])
        hosp = hosp.drop(unfilled, axis=1)
        hosp = hosp.drop(['pncaden', 'dm'], axis=1)
        # Replacing -9s with NaNs
        hosp = hosp.replace(-9, np.nan)

        n_rows = len(hosp)
        # Iterate over a snapshot of the labels because columns are dropped
        # from `hosp` inside the loop.
        for col in list(hosp.columns):
            nans = hosp[col].isnull().sum()
            # Dropping cols in a given dataset that are (default) > 90% NaNs
            if nans > n_rows * drop_above:
                hosp = hosp.drop(col, axis=1)
            # Replacing NaNs with column means for cols that are (default) < 10% NaNs
            elif 0 < nans < n_rows * avg_below:
                hosp[col] = hosp[col].fillna(hosp[col].mean())

        cleaned[name] = hosp
    return cleaned

In [45]:
# Load the four hospital tables and clean them with the default settings
# (binary `num`, drop_above=0.9, avg_below=0.1).
cleaned = clean_datasets(get_datasets(get_cnx()))

### 2. Assessing the NaN situation

In [6]:
def view_nans(cleaned):
    '''Get dict of cols missing vals for each dataset.

    Args:
        cleaned: dict mapping a dataset name to its DataFrame.

    Returns:
        dict mapping each dataset name to a Series of NaN counts, indexed by
        column name and restricted to columns with at least one NaN (an
        empty Series when the dataset has no missing values).
    '''
    NaNs = {}
    # .items() exists on both Python 2 and 3 dicts; .iteritems() is Python 2 only.
    for name, hosp in cleaned.items():
        null_counts = hosp.isnull().sum()
        # A boolean mask keeps the incomplete columns in one step instead of
        # dropping complete columns one label at a time.
        NaNs[name] = null_counts[null_counts > 0]
    return NaNs

In [22]:
# for name, hosp in cleaned.iteritems():
#     print name, len(hosp), len(hosp.columns)
# NaNs = view_nans(cleaned)
# NaNs

In [8]:
# #What % of cols are missing values in each dataset? 
# for name, hosp in cleaned.iteritems():
#     print name, float(len(NaNs[name]))/float(len(hosp.columns))

In [9]:
# Per-dataset NaN counts for the columns that still have missing values.
NaNs = view_nans(cleaned)

### 3. Using KNN to estimate missing values
(going in order from the columns with the fewest NaNs to those with the most, by dataset)

Helper functions for `estimate_missing_values`:
- `df_nans_to_mean`
- `knn_best_k`
- `train_test_split_nonnan_nan`
- `knn_fillna`
- `to_dict`

In [10]:
def df_nans_to_mean(df):
    '''Return a new DataFrame in which each column's NaNs are replaced by
    that column's mean. `df` itself is left as it was.'''
    filled = pd.DataFrame()
    for label in df.columns:
        series = df[label]
        # The mean skips NaNs, so it is the average of the observed values.
        filled[label] = series.fillna(series.mean())
    return filled

In [32]:
def knn_best_k(X, y):
    '''Pick the neighbor count in 1..20 with the best 5-fold CV score.

    A target with at most 12 distinct values is treated as categorical: it is
    cast to strings and scored with a distance-weighted KNeighborsClassifier
    on 'recall_micro'. Any other target is scored with a distance-weighted
    KNeighborsRegressor on 'r2'.

    Args:
        X: feature matrix passed straight to `cross_val_score`.
        y: pandas Series holding the target (needs `.unique()` and `.apply()`).

    Returns:
        The k with the highest mean cross-validation score; on a tie the
        smallest such k is returned.
    '''
    # The classifier/regressor decision does not depend on k, so make it once
    # instead of re-testing and re-stringifying y on every iteration.
    categorical = len(y.unique()) <= 12
    if categorical:
        y = y.apply(str)
        scoring = 'recall_micro'
    else:
        scoring = 'r2'

    mean_scores = []
    for k in range(1, 21):
        if categorical:
            knn_model = KNeighborsClassifier(n_neighbors=k, weights='distance')
        else:
            knn_model = KNeighborsRegressor(n_neighbors=k, weights='distance')
        cv_score = cross_val_score(knn_model, X, y, cv=5, scoring=scoring)
        mean_scores.append((k, np.mean(cv_score)))
    # A list in ascending k makes the tie-break explicit, and avoids the
    # Python-2-only dict.iteritems() the original used here.
    best_k = max(mean_scores, key=itemgetter(1))[0]
    return best_k

In [12]:
def train_test_split_nonnan_nan(df, y):
    '''Partition `df` by whether the column named `y` is missing.

    All columns are mean-filled first. Rows where `y` was observed in `df`
    form the training set, and rows where it was NaN form the test set.

    Returns (X_train, X_test, y_train, y_test). Note that y_test comes from
    the mean-filled frame, so it holds the column mean rather than NaN.
    '''
    filled = df_nans_to_mean(df)
    y_missing = df[y].isnull()
    observed_rows = filled[~y_missing]
    missing_rows = filled[y_missing]
    X_train = observed_rows.drop(y, axis=1)
    y_train = observed_rows[y]
    X_test = missing_rows.drop(y, axis=1)
    y_test = missing_rows[y]
    return X_train, X_test, y_train, y_test

In [13]:
def knn_fillna(k, X_train, X_test, y_train):
    '''Fit a distance-weighted k-NN model on the training data and return its
    predictions for X_test.

    A classifier is used when y_train has at most 12 distinct values,
    otherwise a regressor.
    '''
    few_levels = len(y_train.unique()) <= 12
    model_cls = KNeighborsClassifier if few_levels else KNeighborsRegressor
    estimator = model_cls(n_neighbors=k, weights='distance')
    estimator.fit(X_train, y_train)
    return estimator.predict(X_test)

In [14]:
def to_dict(y_est, y_nans):
    '''Turns y_estimates into a dict with y_nans indices as keys.

    The i-th estimate is paired with the i-th index label of `y_nans`.

    Args:
        y_est: sequence of estimates, one per entry of `y_nans`, in the same
            order.
        y_nans: pandas Series whose index labels become the dict keys (its
            values are not used). Repeated labels collapse to one key.

    Returns:
        dict mapping each index label of `y_nans` to its estimate.

    Raises:
        ValueError: if `y_est` and `y_nans` differ in length.
    '''
    if len(y_est) != len(y_nans):
        raise ValueError('got %d estimates for %d missing values'
                         % (len(y_est), len(y_nans)))
    # Pair positionally. The previous nested loop rewrote every key with each
    # estimate in turn, leaving all keys mapped to the last estimate.
    return dict(zip(y_nans.index, y_est))

In [21]:
# full thang
def estimate_missing_values(cleaned, NaNs):
    '''
    Input: cleaned, dict of cleaned df; NaNs, dict of missing values by dataset
    Output: polished dict of datasets with no missing values!

    For each dataset, the columns listed in NaNs[name] are processed from
    the fewest missing values to the most. For each column, the best k is
    chosen on mean-filled data, a k-NN model is fit on the rows where the
    column is observed, and its predictions fill the rows where it is missing.

    The frames in `cleaned` are left untouched; the result holds filled
    copies, so the call can be repeated on the same inputs.
    '''
    polished = {}
    # .items() exists on both Python 2 and 3 dicts; .iteritems() is Python 2 only.
    for name, hosp in cleaned.items():
        # Work on a copy: filling in place would change `cleaned`, and `NaNs`
        # would then list columns that no longer have missing values on a re-run.
        hosp = hosp.copy()
        # A concrete list gives tqdm a length to report progress against.
        cols_by_nans = list(NaNs[name].sort_values().index)
        for col in tqdm.tqdm(cols_by_nans):
            # Find best k:
            X = df_nans_to_mean(hosp.drop(col, axis=1))
            y = hosp[col].fillna(value=np.mean(hosp[col]))
            k_neighbors = knn_best_k(X, y)
            # Get "training" (non-nan) and "testing" (nan) sets
            X_train, X_test, y_train, y_test = train_test_split_nonnan_nan(hosp, col)
            # Estimate missing values w/ knn:
            y_estimates = knn_fillna(k_neighbors, X_train, X_test, y_train)
            # Fill missing values with knn estimates; y_test is used only for
            # the index labels of the rows being filled.
            y_estimates = to_dict(y_estimates, y_test)
            hosp[col] = hosp[col].fillna(value=y_estimates)
        polished[name] = hosp
    return polished

In [34]:
# Fill the columns listed in NaNs with k-NN estimates, dataset by dataset.
preprocessed_datasets = estimate_missing_values(cleaned, NaNs)



`met` and `proto` may raise warnings; these can be ignored.

Also, unfortunately, it is not yet possible to round the categorical estimates.

Also, the Hungarian `proto` numbers come out badly wrong.

In [36]:
#preprocessed_datasets['Hungarian Institute of Cardiology']['met']

In [43]:
#preprocessed_datasets['Hungarian Institute of Cardiology']['proto']