In [1]:
import numpy as np
import pandas as pd

In [2]:
df_train = pd.read_csv("10k_diabetes/diab_train.csv",
                       na_values = ["?", "Not Available", "Not Mapped"])
df_test = pd.read_csv("10k_diabetes/diab_test.csv")
df_validate = pd.read_csv("10k_diabetes/diab_validation.csv")

In [3]:
print(df_train.shape)
print(df_train.dtypes)

(6000, 52)
Unnamed: 0                   int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id           object
discharge_disposition_id    object
admission_source_id         object
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride                 object
acetohexa

In [4]:
type_txt = ["diag_1_desc",
           "diag_2_desc",
           "diag_3_desc"]

type_drop = ["discharge_disposition_id",
           "medical_specialty",]

type_cat = ["race",
          "gender",
          "age",
          "weight",
          "admission_type_id",
        "admission_source_id",
        "payer_code",
          "max_glu_serum",
           "A1Cresult",
           "metformin",
           "repaglinide",
           "nateglinide",
           "chlorpropamide",
           "glimepiride",
           "acetohexamide",
           "glipizide",
           "glyburide",
           "tolbutamide",
           "pioglitazone",
           "rosiglitazone",
           "acarbose",
           "miglitol",
           "troglitazone",
           "tolazamide",
           "examide",
           "citoglipton",
           "insulin",
           "glyburide.metformin",
           "glipizide.metformin",
           "glimepiride.pioglitazone",
           "metformin.rosiglitazone",
           "metformin.pioglitazone",
           "change",
           "diabetesMed"]

type_le = ["age", "weight", "A1Cresult"]

type_int = ["time_in_hospital",
           "num_lab_procedures",
           "num_procedures",
           "num_medications",
           "number_outpatient",
           "number_emergency",
           "number_inpatient",
           "number_diagnoses"]

type_float = ["diag_1",
             "diag_2",
             "diag_3"]

In [5]:
def prep_df(df):
    y = df["readmitted"]
    df = df.drop(columns=['readmitted', 'Unnamed: 0'])
    df = df.drop(columns=type_drop)
    
    #Convert data types
    for i in type_int:
        #df_train[i] = df_train[i].astype('int32')
        df[i] = pd.to_numeric(df[i], errors='coerce', downcast='integer')

    for i in type_float:
        df[i] = pd.to_numeric(df[i], errors='coerce', downcast='float')
        
    for i in type_txt:
        df[i] = df[i].astype('str')
        df[i] = df[i].str.lower()
        
    for i in type_cat:
        df[i] = df[i].astype('str')
        df[i] = df[i].str.lower()
        
    #Combine descriptions
    tmp = df[type_txt[0]] + " " + df[type_txt[1]] + " " + df[type_txt[2]]
    tmp = pd.DataFrame({'description':tmp})
    df = pd.concat([tmp, df], axis = 1)
    df = df.drop(columns = type_txt)
    
    return df, y

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

def get_transforms(df, impute=True, imp="mean"):
    ll = type_txt.copy()
    ll.append('description')
    
    #Get features that are categorical and create oh encoding
    ohe_mask = df.dtypes==object
    txt_mask = [i not in ll for i in df.columns]
    mask = [i == True and j == True for i,j in zip(txt_mask, ohe_mask.tolist())]
    col_mask = df.columns[mask]
    
    #Generate OneHotEncoder
    ohe = [OneHotEncoder().fit(df[i].values.reshape(-1,1)) for i in col_mask]
    enc = [ohe[i].transform(df[name].values.reshape(-1,1)).toarray() for i,name in enumerate(col_mask)]
    
    #Concat transformed features
    tmp = np.concatenate(enc, axis = 1)
    tmp = pd.DataFrame(tmp, columns = ['ohe' + str(i) for i in range(tmp.shape[1])])
    
    #Append to dataframe and drop categorical features that have been transformed
    df = pd.concat([df.reset_index(drop=True), tmp.reset_index(drop=True)], axis = 1)
    df = df.drop(columns=col_mask)
    
    #Impute missing values
    if impute:
        idx = pd.isnull(df).any().tolist()
        print("Impute values for the following attributes")
        print(df.columns[idx])

        df_imp = SimpleImputer(strategy=imp).fit_transform(df.loc[:,idx])
        df.loc[:,idx] = df_imp
        
    return df

In [7]:
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer

def tokenize(x,stem = False):
    rm_word = stopwords.words('english')
    rm_word.extend(',')
    
    if stem:
        stemmer = PorterStemmer()
        return [stemmer.stem(i) for i in list(filter(lambda x: x not in rm_word, word_tokenize(x)))]
    else:
        return list(filter(lambda x: x not in rm_word, word_tokenize(x)))

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

def get_nlp(df, n_gram_range=(1,1), stem=False, tfidf=False):
    #NLP 
    tfidfTransformer = TfidfTransformer()
    count_vect = CountVectorizer(ngram_range=n_gram_range)
    
    if not n_gram_range == None:
        #Filter description and possibly stem
        df['description'] = df['description'].apply(lambda txt: ' '.join(tokenize(txt, stem=stem)))
        
        X_transformed = count_vect.fit_transform(df['description'].values)
        
        if tfidf:
            X_transformed = tfidfTransformer.fit_transform(X_transformed)
            
        tmp = pd.DataFrame(X_transformed.A, columns = ['nlp' + str(i) for i in range(X_transformed.shape[1])])    
        df = pd.concat([df.reset_index(drop=True), tmp.reset_index(drop=True)], axis = 1)
        df = df.drop(columns=['description'])
    
    return df

In [31]:
def split_df(DF,df,y):
    idx = np.cumsum([i.shape[0] for i in DF])
    X_train = df.iloc[0:idx[0],]
    y_train = y.iloc[0:idx[0],]
    
    X_val = df.iloc[idx[0]:idx[1],]
    y_val = y.iloc[idx[0]:idx[1],]
    
    X_test = df.iloc[idx[1]:idx[2],]
    y_test = y.iloc[idx[1]:idx[2],]
    
    return X_train, y_train, X_val, y_val, X_test, y_test

In [10]:
import nltk
from nltk import ngrams

def prep_data(df, col_mask, ohe, count_vect=None, tfidf_trans=None, impute=True, imp = "mean", nlp=False, stem=False, tfidf = False, n_gram_range = None):
        
    enc = [ohe[i].transform(df[name].values.reshape(-1,1)).toarray() for i,name in enumerate(col_mask)]
    
    #Concat transformed features
    tmp = np.concatenate(enc, axis = 1)
    tmp = pd.DataFrame(tmp, columns = ['ohe' + str(i) for i in range(tmp.shape[1])])
    
    #Append to dataframe and drop categorical features that have been transformed
    df = pd.concat([df, tmp], axis = 1)
    df = df.drop(columns=col_mask)
    
    #NLP processing
    tfidfTransformer = TfidfTransformer()
    count_vect = CountVectorizer(ngram_range=n_gram_range)
    if not nlp:
        print("Mode: No NLP")
        df = df.drop(columns = type_txt)
    else:
        print("Mode: NLP, Stem: {}".format(stem))
                
        #Filter description and possibly stem
        df['description'] = df['description'].apply(lambda txt: ' '.join(tokenize(txt, stem=stem)))
        
        if train:
            X_transformed = count_vect.fit_transform(df['description'].values)
        else:
            X_transformed = count_trans.transform(df['description'].values)
            #count_trans._validate_vocabulary()      

        #Wheter to use tfidf instead of counts
        if tfidf:
            if train:
                X_transformed = tfidfTransformer.fit_transform(X_transformed)
            else:
                X_transformed = tfidf_trans.transform(X_transformed)
  
        tmp = pd.DataFrame(X_transformed.A, columns = ['nlp' + str(i) for i in range(X_transformed.shape[1])])    
        #tmp = pd.DataFrame(X_transformed.A, columns = ['nlp' + i for i in count_vect.get_feature_names()])    
        df = pd.concat([df, tmp], axis = 1)
        df = df.drop(columns=['description'])
    
    if train:
        return df, y, ohe, count_vect, tfidfTransformer
    else:
        return df, y

In [38]:
DF = [df_train, df_validate, df_test]
df = pd.concat(DF, axis = 0)
nlp = True

df,y = prep_df(df)
df = get_transforms(df, impute=True, imp="mean")
if nlp:
    df = get_nlp(df, n_gram_range=(1,1), stem=False, tfidf=False)
else:
    df = df.drop(columns=['description'])
X_train, y_train, X_val, y_val, X_test, y_test = split_df(DF,df,y)

Impute values for the following attributes
Index(['diag_1', 'diag_2', 'diag_3'], dtype='object')


In [46]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

clf = RandomForestClassifier(n_estimators=200, max_depth=20, min_samples_split=5)
clf.fit(X_train.values, y_train)

pred_val = clf.predict(X_val.values)
pred_test = clf.predict(X_test.values)

acc_val = accuracy_score(y_val, pred_val)
acc_test = accuracy_score(y_test, pred_test)
print(acc_val, acc_test)

0.639 0.6385


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import ParameterGrid

param_dict = {'max_depth': [None, 10, 50],
             'min_samples_split': [2, 10],
             'nlp': [True, False],
             'stem': [True, False],
             'n_gram_range': [None, (1,1), (2,2)],}

param_grid = list(ParameterGrid(param_dict))

In [None]:
for param in param_grid:

    if not param['nlp'] and param['stem']:
        #Skip this parameter setting since unreasonable
        continue
        
    if param['nlp'] and param['n_gram_range'] == None:
        #Skip this parameter setting since unreasonable
        continue
    
    if not param['nlp'] and not param['n_gram_range'] == None:
        #Skip this parameter setting since unreasonable
        continue
    
    print(param)

In [None]:
from sklearn.model_selection import KFold

#Run grid search
res = list()
n_fold = 2
for param in param_grid:

    if not param['nlp'] and param['stem']:
        #Skip this parameter setting since unreasonable
        continue
        
    if param['nlp'] and param['n_gram_range'] == None:
        #Skip this parameter setting since unreasonable
        continue
    
    if not param['nlp'] and not param['n_gram_range'] == None:
        #Skip this parameter setting since unreasonable
        continue
        
    X_train, y_train, ohe, count_vect, tfidfTransformer = prep_data(df_train, train=True,
                                                                    nlp=param['nlp'], 
                                                                    stem=param['stem'],
                                                                    n_gram_range=param['n_gram_range']) 
                                                                    
    X_val, y_val = prep_data(df_validate, train=False, nlp=True, n_gram_range=(1,1),
                             ohe_trans=ohe, count_trans=count_vect, tfidf_trans=tfidfTransformer,
                             nlp=param['nlp'], stem=param['stem'], n_gram_range=param['n_gram_range'])                                                       n_gram_range=param['n_gram_range'])
    
    #CV over grid search
    acc_cv = []
    for train, val in zip(KFold(n_splits=n_fold).split(X_train), KFold(n_splits=n_fold).split(X_val)):
        idx_train,_ = train
        idx_val,_ = val
        
        clf = RandomForestClassifier(max_depth=param['max_depth'], min_samples_split=param['min_samples_split'])
        clf.fit(X_train.values[idx_train],y_train[idx_train])
        pred_test = clf.predict(X_val.values[idx_val])
        acc_cv.append(accuracy_score(y_val[idx_val], pred_test))
    
    acc = np.mean(acc_cv)
    acc_sd = np.std(acc_cv)
    print('Mean: {}, Std: {}'.format(acc, acc_sd))
    res.append((acc, acc_sd, param))
    

In [None]:
X_train, y_train, count_trans, tfidf_trans = prep_data(df_train, train=True)

X_val, y_val = prep_data(df_validate, train = False,
                             count_trans=count_trans,
                             tfidf_trans=tfidf_trans)

X_test, y_test = prep_data(df_validate, train = False,
                             count_trans=count_trans,
                             tfidf_trans=tfidf_trans)

In [None]:
X_test.shape

In [None]:
clf = RandomForestClassifier()
clf.fit(X_train.values, y_train)

pred_val = clf.predict(X_val.values)
pred_test = clf.predict(X_test.values)

acc_val = accuracy_score(y_val, pred_val)
acc_test = accuracy_score(y_test, pred_test)
print(acc_val, acc_test)

In [None]:
for i in res:
    print('Mean: {}, Std: {}'.format(i[0], i[1]))

In [None]:
X_test, y_test = prep_data(df_train, nlp=params['nlp'], stem=param['stem'], n_gram_range=param['n_gram_range'])
conf_mat = confusion_matrix(y_test, pred_test)