In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix, average_precision_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.utils import shuffle
from sklearn import preprocessing
import seaborn as sns
from sklearn.metrics import roc_auc_score, matthews_corrcoef, accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import cross_val_score

import time
import math
import random
from tqdm import tqdm
import regex as re
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

## Load in Always Pattern, Manual Review, and Sample DF

In [None]:
always_patterns = pd.read_csv("Storage/Data/always_patterns.csv") 
always_patterns = always_patterns[['Unnamed: 0', 'patient_id', 'sequence','original', 'label']]
always_patterns.columns = ['Unnamed: 0', 'patient_id', 'sequence','original', 'annotator_label']
always_patterns.head()

In [None]:
manual_review = pd.read_csv("Storage/Data/manual_review.csv")
manual_review = manual_review[['Unnamed: 0', 'patient_id', 'sequence','original', 'label']]
manual_review.columns = ['Unnamed: 0', 'patient_id', 'sequence','original', 'annotator_label']
manual_review.head()

In [None]:
sample = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\BigDataSets\Sampling\sample_8_12.csv") 
        # pd.read_csv("Storage/Data/20K_sample.csv")
sample = sample[["PatientID", "regex_sent"]]
sample.columns = ['patient_id', 'sequence']
sample.head()

In [None]:
# real_world_sample = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Analysis\real_world_sample.csv")
# real_world_sample.head()

## Preprocessing for Sample DF

In [None]:
def clean_sequence(seq):
    """Normalise a raw text sequence.

    Every special character (currently only '/') is replaced by a space, each
    run of whitespace is collapsed to a single space (leading/trailing
    whitespace is removed), and the result is lower-cased.
    """
    special_chars = '/'  # extend this string to blank out more characters
    to_space = {ord(ch): ' ' for ch in special_chars}

    # split() with no argument drops empty tokens, so re-joining with one
    # space leaves exactly one space between words.
    words = seq.translate(to_space).split()
    return ' '.join(words).lower()

In [None]:
# Drop the first and last three characters of every raw sentence, then normalise it.
# One column assignment replaces the row-by-row .loc writes: it is much faster and
# no longer assumes the frame carries a default 0..n-1 index.
sample["sequence"] = [
    clean_sequence(raw[3:len(raw) - 3]) for raw in tqdm(sample["sequence"])
]

## Splitting into Train, Validation, and Test Sets

This code only needs to be run once: the splits below are seeded with `random_state=0`, and the resulting sets are saved to CSV. After the first run, load the saved CSVs instead of re-splitting.

In [None]:
X = always_patterns[["patient_id", "sequence"]]
y = always_patterns["annotator_label"]

y_label = y.to_numpy()
X_train, X_test_valid, y_train, y_test_valid = train_test_split(X,y,random_state=0,test_size=0.10, stratify=y_label)

y_test_valid_label = y_test_valid.to_numpy()
X_valid, X_test, y_valid, y_test = train_test_split(X_test_valid, y_test_valid, random_state=0, test_size=(0.25), stratify=y_test_valid_label)

In [None]:
X_2 = manual_review[["patient_id", "sequence"]]
y_2 = manual_review["annotator_label"]

y_label_2 = y_2.to_numpy()
X_train_2, X_test_valid_2, y_train_2, y_test_valid_2 = train_test_split(X_2,y_2,random_state=0,test_size=0.3, stratify=y_label_2)

y_test_valid_label_2 = y_test_valid_2.to_numpy()
X_valid_2, X_test_2, y_valid_2, y_test_2 = train_test_split(X_test_valid_2, y_test_valid_2, random_state=0, test_size=(0.15/0.3), stratify=y_test_valid_label_2)

In [None]:
# DataFrame.append / Series.append were deprecated in pandas 1.4 and removed in 2.0.
# pd.concat performs the same row-wise stacking and keeps the original index labels.
X_train = pd.concat([X_train, X_train_2])
y_train = pd.concat([y_train, y_train_2])

X_test = pd.concat([X_test, X_test_2])
y_test = pd.concat([y_test, y_test_2])

X_valid = pd.concat([X_valid, X_valid_2])
y_valid = pd.concat([y_valid, y_valid_2])

In [None]:
len(X_train), len(X_valid), len(X_test) 

### Using Saved CSVs to load in data

In [None]:
X_train = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\train_8_11.csv")
y_train = X_train["annotator_label"]
X_train = X_train["sequence"]

In [None]:
X_valid = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\valid_8_11.csv")
y_valid = X_valid["annotator_label"]
X_valid = X_valid["sequence"]

In [None]:
X_test = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\test_8_11.csv")
y_test = X_test["annotator_label"]
X_test = X_test["sequence"]

In [None]:
X_sample = sample['sequence']

In [None]:
# real_world_sample is only loaded in a commented-out cell earlier in the notebook, so this
# lookup raises NameError on a fresh kernel. Kept disabled until that loader is re-enabled.
# X_real_world = real_world_sample["sequence"]

In [None]:
len(X_train), len(X_valid), len(X_test)# , len(X_sample), len(X_real_world)

## TF-IDF

In [None]:
# Initialize a TfidfVectorizer object: tfidf_vectorizer
# Fit the TF-IDF vocabulary on the training split only, then project every other split onto it.
tfidf_vectorizer = TfidfVectorizer(stop_words="english",analyzer='word', token_pattern=r'\b[A-Za-z0-9]+\b')
tfidf_train= tfidf_vectorizer.fit_transform(X_train)
tfidf_valid = tfidf_vectorizer.transform(X_valid)
tfidf_test = tfidf_vectorizer.transform(X_test)
tfidf_sample = tfidf_vectorizer.transform(X_sample)
#tfidf_real_world = tfidf_vectorizer.transform(X_real_world)

# get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out();
# support both and look the vocabulary up once instead of once per split.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# Densify each sparse matrix into a DataFrame with one column per vocabulary term.
# .toarray() is used because the sparse `.A` shortcut is no longer available in recent SciPy.
tfidf_train = pd.DataFrame(tfidf_train.toarray(), columns = tfidf_feature_names)
tfidf_valid = pd.DataFrame(tfidf_valid.toarray(), columns = tfidf_feature_names)
tfidf_test = pd.DataFrame(tfidf_test.toarray(), columns = tfidf_feature_names)
tfidf_sample = pd.DataFrame(tfidf_sample.toarray(), columns = tfidf_feature_names)
#tfidf_real_world = pd.DataFrame(tfidf_real_world.toarray(), columns = tfidf_feature_names)

tfidf_sample.head()

In [None]:
tfidf_train.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\tfidf_train_8_12.csv", index = False)
tfidf_valid.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\tfidf_valid_8_12.csv", index = False)
tfidf_test.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\tfidf_test_8_12.csv", index = False)

In [None]:
# tfidf_real_world is only built in commented-out lines of the TF-IDF cell, so this export
# raises NameError on a fresh kernel. Kept disabled until those lines are re-enabled.
# tfidf_real_world.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\tfidf_real_sample.csv", index = False)

In [None]:
# tfidf_sample.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\BigDataSets\tfidf_sample_8_12.csv", index = False)

In [None]:
tfidf_train.shape, tfidf_valid.shape, tfidf_test.shape, tfidf_sample.shape

In [None]:
y_train.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\y_train_8_12.csv", index = False)
y_valid.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\y_valid_8_12.csv", index = False)
y_test.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\y_test_8_12.csv", index = False)

## Feature Selection

### Identifying features with high correlation using the Pearson Correlation Coefficient

In [None]:
def convert_to_binary(y):
    """Collapse the three-class annotator labels into binary labels.

    Mapping: 2 (yes) -> 1; 1 and 0 (no / neither) -> 0. Any other value is
    left unchanged.

    The conversion is positional, so a Series with a non-default index is
    handled correctly, and the input is not modified: a new object of the same
    kind (Series with the same index and name, ndarray, or list) is returned.

    Note: because 1 maps to 0, this must be applied only once to three-class
    labels; applying it again to its own output turns every label into 0.
    """
    values = np.asarray(y)
    # Both conditions are evaluated against the ORIGINAL values, so a 2 that
    # becomes 1 is never re-mapped to 0 within a single call.
    binary = np.where(values == 2, 1, np.where(values == 1, 0, values))

    if isinstance(y, pd.Series):
        return pd.Series(binary, index=y.index, name=y.name)
    if isinstance(y, np.ndarray):
        return binary
    return binary.tolist()
# Re-label every split as binary (2 -> 1, 0/1 -> 0). Run this cell only once per kernel:
# a second pass would map the new 1s to 0.
y_train = convert_to_binary(y_train)
y_valid = convert_to_binary(y_valid)
y_test = convert_to_binary(y_test)

In [None]:
tfidf_train_features_df = pd.concat([tfidf_train, y_train.reset_index(drop=True)], axis = 1)
tfidf_test_features_df = pd.concat([tfidf_test, y_test.reset_index(drop = True)], axis = 1)
tfidf_valid_features_df = pd.concat([tfidf_valid, y_valid.reset_index(drop = True)], axis = 1)

In [None]:
def filter_features_by_cor(df):
    """Rank feature columns by absolute Pearson correlation with the label.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame whose LAST column is the label; every other column is
        treated as a feature (including the final feature column, which the
        previous implementation skipped because of an off-by-one).

    Returns
    -------
    pandas.DataFrame
        Columns ``Features`` (column name) and ``CorrCoef`` (|r|), sorted by
        ``CorrCoef`` ascending with a fresh 0..k-1 index. A constant column
        has an undefined correlation and gets NaN (sorted last).
    """
    feature_names = list(df.columns[:-1])
    features = df.iloc[:, :-1].to_numpy(dtype=float)
    label = df.iloc[:, -1].to_numpy(dtype=float)

    # Pearson r for all columns at once: cov(x, y) / (std(x) * std(y)).
    # Centering creates new arrays, so the caller's frame is never modified.
    features_centered = features - features.mean(axis=0)
    label_centered = label - label.mean()
    covariance = features_centered.T @ label_centered
    scale = np.sqrt((features_centered ** 2).sum(axis=0) * (label_centered ** 2).sum())

    # 0/0 for constant columns is expected and yields NaN, as np.corrcoef does.
    with np.errstate(divide='ignore', invalid='ignore'):
        corrcoef_array = np.minimum(np.abs(covariance / scale), 1.0)

    output_df = pd.DataFrame({'Features': feature_names, 'CorrCoef': corrcoef_array})
    output_df = output_df.sort_values('CorrCoef').reset_index(drop=True)

    return output_df

In [None]:
tfidf_output_df = filter_features_by_cor(tfidf_train_features_df)
tfidf_output_df = tfidf_output_df.sort_values(by=['CorrCoef'],ascending = False)

In [None]:
tfidf_output_df.head(10)

In [None]:
tfidf_output_df.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Performance\feature_correlation_binary_labels_8_12.csv", index = False)

In [None]:
# NOTE: DataFrame.drop returns a new frame and none of these results is assigned, so the three
# *_features_df frames still contain 'annotator_label' afterwards (only the last call is displayed).
# Later cells read that column from these frames (e.g. cross_validation["annotator_label"]),
# so do not turn these into in-place drops.
tfidf_train_features_df.drop(columns = 'annotator_label')
tfidf_test_features_df.drop(columns = 'annotator_label')
tfidf_valid_features_df.drop(columns = 'annotator_label')

## Regularized Logistic Regression

In [None]:
def filter_by_corr(corr, tfidf_output_df):
    """Restrict the TF-IDF frames to features whose CorrCoef exceeds `corr`.

    Reads the module-level frames tfidf_train_features_df,
    tfidf_test_features_df and tfidf_valid_features_df.

    Returns a 4-tuple: (selected rows of `tfidf_output_df`, filtered train,
    filtered test, filtered valid) -- note that test comes before valid.
    """
    selected = tfidf_output_df.loc[tfidf_output_df['CorrCoef'] > corr]
    keep = selected['Features']

    # Same column selection applied to each split, in return order.
    train_sel, test_sel, valid_sel = (
        frame.filter(items=keep)
        for frame in (tfidf_train_features_df, tfidf_test_features_df, tfidf_valid_features_df)
    )

    return selected, train_sel, test_sel, valid_sel

In [None]:
def logisitic_regression(X_train, y_train, X_test, y_test, c, want_report, want_conf_mat, save_model, name):
    """Fit an L1-regularised, class-balanced logistic regression and score it.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : evaluation features and labels.
    c : inverse regularisation strength passed to LogisticRegression as ``C``.
    want_report : bool
        If true, print micro/macro/weighted F1 and include the
        classification report (as a DataFrame) in the return value.
    want_conf_mat : bool
        If true, include the confusion matrix in the return value.
    save_model : bool
        If true, pickle the fitted model to ``"Storage/Model/" + name``.
    name : file name used when `save_model` is true.

    Returns
    -------
    tuple
        ``(lr, acc, auc, c)``, followed by the report DataFrame when
        `want_report` is true, followed by the confusion matrix when
        `want_conf_mat` is true.
    """
    # fitting model
    lr = LogisticRegression(penalty = 'l1', solver = 'liblinear', C = c, random_state = 0, class_weight = 'balanced')
    lr.fit(X_train, y_train)

    # predictions
    y_pred = lr.predict(X_test)
    y_prob = lr.predict_proba(X_test)

    # collecting results
    acc = metrics.accuracy_score(y_test, y_pred)
    # For a binary target roc_auc_score expects 1-D scores of the positive class;
    # the full probability matrix is only valid in the multiclass (one-vs-rest) case.
    auc_scores = y_prob[:, 1] if y_prob.shape[1] == 2 else y_prob
    auc = roc_auc_score(y_test, auc_scores, multi_class = 'ovr', average = 'weighted')

    if save_model:
        # Context manager guarantees the file is flushed and closed.
        with open("Storage/Model/" + name, 'wb') as model_file:
            pickle.dump(lr, model_file)

    outputs = [lr, acc, auc, c]

    if want_report:
        # The three display names only apply when three classes are present;
        # otherwise let classification_report label rows by class value.
        report_labels = np.union1d(y_test, y_pred)
        target_names = ['Negative', 'Neither', 'Positive'] if len(report_labels) == 3 else None
        results_lgr = classification_report(y_test, y_pred, target_names = target_names, output_dict=True)
        results_lgr = pd.DataFrame(results_lgr).transpose()
        print("Micro F1: ", metrics.f1_score(y_test, y_pred, average = "micro"))
        print("Macro F1: ", metrics.f1_score(y_test, y_pred, average = "macro"))
        print("Weighted F1: ", metrics.f1_score(y_test, y_pred, average = "weighted"))
        outputs.append(results_lgr)

    if want_conf_mat:
        outputs.append(confusion_matrix(y_test, y_pred))

    return tuple(outputs)

In [None]:
def cross_validation_split(dataset, n_folds):
    """Split `dataset` into `n_folds` folds stratified on the binary label.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain an ``annotator_label`` column with values 0 and 1.
    n_folds : int
        Number of folds to create.

    Returns
    -------
    list of pandas.DataFrame
        One frame per fold; each holds ``len(class) // n_folds`` rows of each
        class, so rows left over by the integer division are not assigned to
        any fold.

    The shuffle is unseeded, so fold membership differs between runs.
    """
    # ensuring stratification across label (uses the `dataset` argument rather
    # than the module-level `cross_validation` frame)
    yes = dataset[dataset["annotator_label"] == 1]
    no = dataset[dataset["annotator_label"] == 0]

    yes_count = len(yes) // n_folds
    no_count = len(no) // n_folds

    # shuffling once up front so folds can be taken as contiguous slices
    yes = yes.sample(frac=1).reset_index(drop=True)
    no = no.sample(frac=1).reset_index(drop=True)

    # creating folds
    split = list()
    for i in tqdm(range(n_folds)):
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
        fold = pd.concat([
            yes[yes_count * i : yes_count * (i + 1)],
            no[no_count * i : no_count * (i + 1)],
        ])
        split.append(fold)

    return split

In [None]:
def evaluate_algorithm(dataset, n_folds):
    """Grid-search the correlation threshold and C with stratified k-fold CV.

    For every fold, every correlation threshold in 0.01..0.29 and every C in
    [0.01, 0.1, 1, 10, 100], a model is fitted on the remaining folds and
    scored on the held-out fold.

    Relies on the module-level ``tfidf_output_df`` (columns ``Features`` and
    ``CorrCoef``) for feature selection.
    NOTE(review): that ranking is computed outside this function, so it is not
    re-estimated per fold -- confirm this is acceptable for the reported scores.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature columns plus an ``annotator_label`` column.
    n_folds : int
        Number of cross-validation folds.

    Returns
    -------
    list of pandas.DataFrame
        One frame per (fold, threshold) pair with columns ``c_value``,
        ``acc``, ``auc``, ``corr_thres`` and ``fold_number`` (1-based).
    """
    splits = cross_validation_split(dataset, n_folds)

    corr_list = list(np.arange(1,30) * 0.01)
    c_values = [0.01, 0.1, 1, 10, 100]
    df_list  = []

    for counter, fold in enumerate(splits):
        # Train on every fold except the held-out one.
        train = pd.concat(splits[:counter] + splits[counter + 1:])

        y_train = train["annotator_label"].reset_index(drop = True).astype(int)
        y_test = fold["annotator_label"].reset_index(drop = True).astype(int)

        train = train.drop(columns = ["annotator_label"])
        fold = fold.drop(columns = ["annotator_label"])

        for corr in corr_list:
            # filtering by correlation coefficient
            top_tfidf_features_df = tfidf_output_df[tfidf_output_df['CorrCoef'] > corr]
            filtered_tfidf_train = train.filter(items=top_tfidf_features_df['Features'])
            filtered_tfidf_fold = fold.filter(items=top_tfidf_features_df['Features'])

            # tuning for optimal lambda value
            records = []
            for c in c_values:
                _, acc, auc, _ = logisitic_regression(filtered_tfidf_train, y_train, filtered_tfidf_fold, y_test, c, False, False, False, "")
                records.append({"c_value": c, "acc": acc, "auc": auc})

            # gathering model stats
            iter_df = pd.DataFrame(records, columns=["c_value", "acc", "auc"])
            iter_df['corr_thres'] = corr
            iter_df['fold_number'] = counter + 1
            df_list.append(iter_df)

        print("Completed Fold #: ", counter + 1)
        print("Stats DF has", len(df_list), "records")

    return df_list

In [None]:
cross_validation = pd.concat([tfidf_train_features_df, tfidf_valid_features_df])
tfidf_all_df = evaluate_algorithm(cross_validation, 10)

In [None]:
tfidf_all_df = pd.concat(tfidf_all_df)

In [None]:
# Average accuracy / AUC over folds for every (correlation threshold, C) pair.
# The thresholds are regenerated with the same expression used during cross-validation,
# so the exact float comparison below matches the stored values.
corr_list = list(np.arange(1,30) * 0.01)
average_results_df = []

for corr in corr_list:
    for c in [0.01, 0.1, 1, 10, 100]:
        filtered = tfidf_all_df[(tfidf_all_df["corr_thres"] == corr) & (tfidf_all_df["c_value"] == c)]
        avg_auc = filtered["auc"].mean()
        avg_acc = filtered["acc"].mean()

        # Build the one-row frame directly instead of overwriting an integer placeholder
        # frame with floats (an incompatible-dtype assignment in recent pandas).
        average_results_df.append(
            pd.DataFrame([[c, avg_acc, avg_auc, corr]], columns = ["c_value", "acc", "auc", "corr_thres"])
        )

In [None]:
average_results_df = pd.concat(average_results_df)

In [None]:
average_results_df[average_results_df['auc'] == max(average_results_df['auc'])]

In [None]:
# Specific parameter setting performance
corr = 0.01
c = 10

y_train_cross_valid = cross_validation["annotator_label"]
y_test_cross_valid = tfidf_test_features_df["annotator_label"]

cross_validation.drop(columns = ["annotator_label"])
tfidf_test_features_df.drop(columns = ["annotator_label"])

# Setting Correlation threshold
top_tfidf_features_df = tfidf_output_df[tfidf_output_df['CorrCoef'] > corr]
filtered_tfidf_train = cross_validation.filter(items=top_tfidf_features_df['Features'])
filtered_tfidf_test = tfidf_test_features_df.filter(items=top_tfidf_features_df['Features'])

# Running model
lr, acc_optimized, auc_optimized, c_list, report, conf_mat = logisitic_regression(filtered_tfidf_train, y_train_cross_valid, filtered_tfidf_test, y_test_cross_valid, c, True, True, False, "")

print("\nC: ", c, "\n", report)
print("\nAUC: ", auc_optimized)
print("ACC: ", acc_optimized)
print("\nConfusion Matrix: \n", conf_mat)

In [None]:
filtered_tfidf_test["label"] = y_test_cross_valid

In [None]:
y_test_neither = filtered_tfidf_test[filtered_tfidf_test["label"] == 1]
y_test_no = filtered_tfidf_test[filtered_tfidf_test["label"] == 0]
y_test_yes = filtered_tfidf_test[filtered_tfidf_test["label"] == 2]

In [None]:
filtered_tfidf_test = filtered_tfidf_test.drop(columns = ["label"])

In [None]:
neither_yes = pd.concat([y_test_neither, y_test_yes], axis = 0)
neither_yes = neither_yes.drop(columns = ["label"])

In [None]:
neither_yes

In [None]:
preds = lr.predict(neither_yes)
#precision_score()

In [None]:
"""   
      NO   NTR YES
NO  [[439  9  21]
NTR [  5 457   7]
YES [ 26  19 424]]

NO - 439/469 TP, 9/469 NTR when should be NO, and 21/469 YES when should be NO
    Precision: 439/(439+5+26)
    Recall: 439/(439+21+9)
    fl: (439*2)/(470+469)

NTR - 457/469 TP, 5/469 NO when should be NTR, and 7/469 YES when should be NTR
YES - 424/469 TP, 19/469 NTR when should be YES, and 26/469 are NO when should be YES
"""

In [None]:
"""   
      NO   NTR YES
NO  [[83  7  14]
NTR  [10 124  7]
YES  [6  3   39]]

no
tpr: 0.8

"""

In [None]:
FP = conf_mat.sum(axis = 0) - np.diag(conf_mat) 
FN = conf_mat.sum(axis = 1) - np.diag(conf_mat)
TP = np.diag(conf_mat)
TN = conf_mat.sum() - (FP + FN + TP)
FP = FP.astype(float)
FN = FN.astype(float)
TP = TP.astype(float)
TN = TN.astype(float)

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)

# Specificity or true negative rate
TNR = TN/(TN+FP) 

# Precision or positive predictive value
PPV = TP/(TP+FP)

# Negative predictive value
NPV = TN/(TN+FN)

# Fall out or false positive rate
FPR = FP/(FP+TN)

# False negative rate
FNR = FN/(TP+FN)

# False discovery rate
FDR = FP/(TP+FP)

print("Sensitivity: ", TPR)
print("Specificity: ", TNR)
print("NPV: ", NPV)
print("PPV: ", PPV)
print("FPR: ", FPR)

## Saving 

In [None]:
X_train = X_train.reset_index(drop = True)
y_train = y_train.reset_index(drop = True)
train = pd.concat([X_train, y_train], axis = 1)

In [None]:
X_valid = X_valid.reset_index(drop = True)
y_valid = y_valid.reset_index(drop = True)
valid = pd.concat([X_valid, y_valid], axis = 1)

In [None]:
X_test = X_test.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)
test = pd.concat([X_test, y_test], axis = 1)

In [None]:
train.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\train_8_11.csv", index = False)
valid.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\valid_8_11.csv", index = False)
test.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\test_8_11.csv", index = False)

In [None]:
# Use a context manager so the file handle is flushed and closed once the model is written.
with open("Storage/" + "model_8_30.sav", 'wb') as model_file:
    pickle.dump(lr, model_file)

## Model Predictions

In [None]:
tfidf_output_df = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Performance\feature_correlation_multiclass_labels_8_12.csv")

In [None]:
top_tfidf_features_df_sample = tfidf_output_df[tfidf_output_df['CorrCoef'] > 0.01]

In [None]:
# NOTE: pickle.load can execute arbitrary code -- only load model files from a trusted source.
# The context manager closes the file handle after loading.
with open(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\model_8_30.sav", 'rb') as model_file:
    model = pickle.load(model_file)

### Getting Train and Test Set Predictions and Probabilities

In [None]:
#train = pd.read_csv(r"C:\Users\tanis\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\data\train_8_11.csv")
#valid = pd.read_csv(r"C:\Users\tanis\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\data\valid_8_11.csv")
#cross_validation_df = pd.concat([train, valid])

test = pd.read_csv(r"C:\Users\tanis\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\data\test_8_11.csv")

In [None]:
test_predictions = model.predict(tfidf_test.filter(items = top_tfidf_features_df_sample['Features']))
#cross_valid_predictions = model.predict(cross_validation.filter(items = top_tfidf_features_df_sample['Features']))

In [None]:
#cross_validation_df["predictions"] = cross_valid_predictions
test["predictions"] = test_predictions

In [None]:
test_proba = model.predict_proba(tfidf_test.filter(items = top_tfidf_features_df_sample['Features']))
#cross_valid_proba = model.predict_proba(cross_validation.filter(items = top_tfidf_features_df_sample['Features']))

In [None]:
test_proba = test_proba[:, 1]

In [None]:
test["probability"] = test_proba

In [None]:
# test_proba = np.array([[x] for x in np.max(test_proba, axis=1)])
# test_proba = [item for sublist in test_proba for item in sublist]
# test["probability"] = test_proba

# cross_valid_proba = np.array([[x] for x in np.max(cross_valid_proba, axis=1)])
# cross_valid_proba = [item for sublist in cross_valid_proba for item in sublist]
# cross_validation_df["probability"] = cross_valid_proba

In [None]:
def convert_to_binary(y):
    """Collapse the three-class annotator labels into binary labels.

    Mapping: 2 (yes) -> 1; 1 and 0 (no / neither) -> 0. Any other value is
    left unchanged.

    The conversion is positional, so a Series with a non-default index is
    handled correctly, and the input is not modified: a new object of the same
    kind (Series with the same index and name, ndarray, or list) is returned.

    Note: because 1 maps to 0, this must be applied only once to three-class
    labels; applying it again to its own output turns every label into 0.
    """
    values = np.asarray(y)
    # Both conditions are evaluated against the ORIGINAL values, so a 2 that
    # becomes 1 is never re-mapped to 0 within a single call.
    binary = np.where(values == 2, 1, np.where(values == 1, 0, values))

    if isinstance(y, pd.Series):
        return pd.Series(binary, index=y.index, name=y.name)
    if isinstance(y, np.ndarray):
        return binary
    return binary.tolist()

test["annotator_label"] = convert_to_binary(test["annotator_label"])

In [None]:
#cross_validation_df.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Baseline_Model\data\train_full.csv", index = False)
test.to_csv(r"C:\Users\tanis\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Baseline_Model\data\test_binary_classification.csv", index = False)

In [None]:
# cross_validation_df is only created in a commented-out cell of this section, so referencing
# it raises NameError on a fresh kernel. Report the test-set size only until it is restored.
# len(cross_validation_df["patient_id"].unique()), len(cross_validation_df), len(test)
len(test)

### Running on Samples

In [None]:
sample_predictions = model.predict(tfidf_sample.filter(items = top_tfidf_features_df_sample['Features']))

In [None]:
sample["predictions"] = sample_predictions

In [None]:
sample_proba = model.predict_proba(tfidf_sample.filter(items = top_tfidf_features_df_sample['Features']))

In [None]:
# Keep, for every row, the largest class probability (the predicted class's score) as a flat list.
sample_proba = list(np.max(sample_proba, axis=1))

In [None]:
sample["probability"] = sample_proba

In [None]:
sample.head()

In [None]:
sample.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Predictions\sample_predictions_9_1.csv", index = False)

### Finding Optimal Patient Aggregation Scheme

In [None]:
sample["predictions"].value_counts()

In [None]:
sample

In [None]:
apoe = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Baseline_Model\data\tanish_predictions_with_structured_features.csv")

In [None]:
apoe_e2 = apoe[(apoe["APOE"] == "e2/e2") | (apoe["APOE"] == "e2/e3")]
len((apoe_e2[apoe_e2["patient_CI"] == 1]))/len(apoe_e2)

In [None]:
binary_model = pickle.load(open(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\model_8_12_binary_classification.sav", 'rb'))

In [None]:
top_tfidf_features_df_sample_binary = tfidf_output_df[tfidf_output_df['CorrCoef'] > 0.07]

In [None]:
binary_sample_predictions = binary_model.predict(tfidf_sample.filter(items = top_tfidf_features_df_sample_binary['Features']))

In [None]:
binary_sample_preds = sample.copy()
binary_sample_preds["predictions"] = binary_sample_predictions

In [None]:
binary_sample_preds["predictions"].value_counts()

In [None]:
binary_sample_preds = pd.merge(binary_sample_preds, apoe, how = "right", on = "patient_id")

In [None]:
sample_preds_cp = sample.copy()
sample_preds_cp = pd.merge(sample_preds_cp, apoe, how = "right", on = "patient_id")

In [None]:
binary_sample_preds

In [None]:
apoe["APOE"].value_counts()

In [None]:
e_2 = sample_preds_cp[(sample_preds_cp["APOE"] == "e2/e2") | (sample_preds_cp["APOE"] == "e2/e3")]
e_3 = sample_preds_cp[(sample_preds_cp["APOE"] == "e3/e3")]
e_4 = sample_preds_cp[(sample_preds_cp["APOE"] == "e4/e4") | (sample_preds_cp["APOE"] == "e2/e4 or e1/e3") | (sample_preds_cp["APOE"] == "e3/e4")]

In [None]:
len(e_2), len(e_3), len(e_4)

In [None]:
def percent_ci(df, sequence_threshold):
    """Aggregate sequence-level predictions into patient-level predictions.

    A patient is predicted positive (1) when at least `sequence_threshold` of
    their rows have ``predictions == 2``; otherwise 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``patient_id`` and ``predictions`` columns.
    sequence_threshold : int
        Minimum number of positive sequences for a positive patient.

    Returns
    -------
    tuple
        ``(patient_level_preds, fraction_positive, p_id_ls)`` where the two
        lists are aligned and follow the order in which patients first appear
        in `df`. For an empty `df` the fraction is NaN (undefined) instead of
        raising ZeroDivisionError.
    """
    p_id_ls = list(df["patient_id"].unique())
    if not p_id_ls:
        return [], float("nan"), []

    # Count positive sequences per patient in one pass instead of filtering
    # the whole frame once per patient.
    positive_counts = df.loc[df["predictions"] == 2, "patient_id"].value_counts().to_dict()

    patient_level_preds = [
        1 if positive_counts.get(p_id, 0) >= sequence_threshold else 0
        for p_id in p_id_ls
    ]
    ci_count = sum(patient_level_preds)

    return patient_level_preds, ci_count / len(p_id_ls), p_id_ls

In [None]:
x_axis = []
y_axis_e2 = []
y_axis_e3 = []
y_axis_e4 = []

for i in tqdm(range(10)):
    e_2_preds, e_2_percent, e_2_p_id = percent_ci(e_2, i+1)
    e_3_preds, e_3_percent, e_3_p_id = percent_ci(e_3, i+1)
    e_4_preds, e_4_percent, e_4_p_id = percent_ci(e_4, i+1)
    
    x_axis.append(i+1)
    y_axis_e2.append(e_2_percent)
    y_axis_e3.append(e_3_percent)
    y_axis_e4.append(e_4_percent)

In [None]:
plt.plot(x_axis, y_axis_e2, label='APOE ε2')
plt.plot(x_axis, y_axis_e3, label='APOE ε3')
plt.plot(x_axis, y_axis_e4, label='APOE ε4')

plt.xlabel("Sequence Threshold")
plt.ylabel("Percent of Patients Predicted as having CI")
plt.legend(loc='best')
plt.show()

In [None]:
revised_preds = pd.DataFrame()
revised_preds["patient_id"] = e_2_p_id + (e_3_p_id) + (e_4_p_id)

In [None]:
revised_preds["predictions"] = e_2_preds + (e_3_preds) + (e_4_preds)

In [None]:
revised_preds["predictions"].value_counts()

In [None]:
revised_patients = pd.merge(apoe, revised_preds, how = "right", on = "patient_id")

In [None]:
revised_patients.drop(columns = ["patient_CI"])

In [None]:
TP = len(revised_patients[(revised_patients["predictions"] == 1) & (revised_patients["AD_Med_or_ICD_Code"] == 1)])

In [None]:
print(note.loc[note['id'] == 8481]['PatientID'])

In [None]:
FP = len(revised_patients[(revised_patients["predictions"] == 1) & (revised_patients["AD_Med_or_ICD_Code"] == 0)])

In [None]:
TN = len(revised_patients[(revised_patients["predictions"] == 0) & (revised_patients["AD_Med_or_ICD_Code"] == 0)])

In [None]:
FN = len(revised_patients[(revised_patients["predictions"] == 0) & (revised_patients["AD_Med_or_ICD_Code"] == 1)])

## ROC-Curve

In [None]:
pred_prob[:10]

In [None]:
pred_prob[:,0]

In [None]:
# One-vs-rest ROC curve per class of the fitted model.
pred_prob = lr.predict_proba(filtered_tfidf_test)

# Take the class count from the model output rather than hard-coding 3, so the cell
# also works when `lr` was trained on binary labels.
n_class = pred_prob.shape[1]

fpr = {}
tpr = {}
thresh = {}
roc_auc = {}

# Column i of predict_proba corresponds to lr.classes_[i]; use that as the positive label.
for i, class_label in enumerate(lr.classes_):
    fpr[i], tpr[i], thresh[i] = metrics.roc_curve(y_test, pred_prob[:,i], pos_label = class_label)
    roc_auc[i] = metrics.auc(fpr[i], tpr[i])

lw = 2
curve_colors = ['orange', 'green', 'blue']

# plotting: one figure per class, each with the chance diagonal for reference
for i in range(n_class):
    plt.plot(fpr[i], tpr[i], linestyle='--', color=curve_colors[i % len(curve_colors)], label='TF-IDF (area = %0.2f)' % roc_auc[i])
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive rate')
    plt.legend(loc='best')
    plt.show()

#plt.title('AUROC Curve for TF-IDF')

#plt.savefig('AUROC Curve for TF-IDF', dpi = 300)