In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix, average_precision_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.utils import shuffle
from sklearn import preprocessing
import seaborn as sns
from sklearn.metrics import roc_auc_score, matthews_corrcoef, accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import cross_val_score

import time
import math
import random
from tqdm import tqdm
import regex as re
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

## Load in Always Pattern, Manual Review, and Sample DF

In [2]:
# Read the always-pattern annotations, keep the five columns used below, and
# expose the raw 'label' column under the name 'annotator_label'.
always_patterns = (
    pd.read_csv("Storage/Data/always_patterns.csv")
    [['Unnamed: 0', 'patient_id', 'sequence', 'original', 'label']]
    .rename(columns={'label': 'annotator_label'})
)
always_patterns.head()

Unnamed: 0.1,Unnamed: 0,patient_id,sequence,original,annotator_label
0,0,Z15564314,s other free text-see phs viewer social histor...,s other free text-see phs viewer Social Histor...,1
1,1,Z10171706,------- fusion: no sleep disturbance: no socia...,------- fusion: No Sleep disturbance: No Socia...,1
2,2,Z8935348,ain spasm). 30 tablet 0 unknown (outside pharm...,ain/spasm). 30 tablet 0 Unknown (outside pharm...,1
3,3,Z12212893,------- on 112mcg dose. maria will check dose ...,------- on 112mcg dose. Maria will check dos...,2
4,4,Z9598376,------- (vibramycin) 100 mg capsule take 1 cap...,------- (VIBRAMYCIN) 100 MG capsule Take 1 ca...,1


In [3]:
# Read the manually reviewed annotations, keep the five columns used below, and
# expose the raw 'label' column under the name 'annotator_label'.
manual_review = (
    pd.read_csv("Storage/Data/manual_review.csv")
    [['Unnamed: 0', 'patient_id', 'sequence', 'original', 'label']]
    .rename(columns={'label': 'annotator_label'})
)
manual_review.head()

Unnamed: 0.1,Unnamed: 0,patient_id,sequence,original,annotator_label
0,75,Z10504958,ndings: general: no apparent distress. surgica...,ndings: General: No apparent distress. Surgica...,0
1,77,Z6411816,"edness, no syncope, no dyspnea endocrine no po...","edness, No Syncope, No Dyspnea Endocrine No Po...",2
2,87,Z6813204,5 reps calibrated hand gripper with silver spr...,5 reps Calibrated hand gripper with silver sp...,0
3,95,Z10492528,nancial management services: none healthcare p...,nancial management Services: none Healthcare p...,0
4,106,Z7656517,2 | 1 time: 1 | 0 time: 0) 3. “are there any r...,2 | 1 time: 1 | 0 time: 0) 3. “Are there any r...,0


In [21]:
# Unlabelled sample to score later; only the patient id and the matched text
# are kept, renamed to the column names used by the annotated frames.
sample = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\BigDataSets\Sampling\sample_8_12.csv")
        # pd.read_csv("Storage/Data/20K_sample.csv")
sample = sample[["PatientID", "regex_sent"]].rename(
    columns={"PatientID": "patient_id", "regex_sent": "sequence"}
)
sample.head()

Unnamed: 0,patient_id,sequence
0,Z6352398,"""\""ession alone in the meta-analysis. We discu..."
1,Z6353136,"""\"" ------- l obstruction ASSOCIATED DIAGNOSES..."
2,Z6353461,"""\"" ------- (156 lb) 09/19/17 71.4 kg (157 lb..."
3,Z6353764,"""\""ut answer. Optho notes from recent outpatie..."
4,Z6354111,"""\""hat would be more consistent. Has been aler..."


In [38]:
# Load the 100-row "real world" sample; only its 'sequence' column is used later.
real_world_sample = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Analysis\real_world_sample.csv")
real_world_sample.head()

Unnamed: 0,sequence_number,sequence,note_number,always_pattern_match
0,2,"------ sis over her r bra strap, which was int...",2,['memory loss 03/28/2018 likely start of demen...
1,52,"------ 331-7977), is her health care agent on ...",52,"[""family home with her husband and son, vincen..."
2,86,------ any change in cognitive deficits 2 = re...,86,['cognitive deficits']
3,94,dol (ultram) 50 mg tablet take 1 tablet (50 mg...,94,['father: coronary artery disease with mi in l...
4,114,------ lth prevention and screening \u2022 hyp...,114,['memory not intact.? cn ii-xii: perrl (4mm ->...


## Preprocessing for Sample DF

In [5]:
def clean_sequence(seq):
    """Normalise a raw text sequence.

    Every character listed in ``specials`` (currently only '/') is replaced by
    a space, runs of whitespace are collapsed to a single space (which also
    removes leading/trailing whitespace), and the result is lower-cased.
    """
    specials = '/' #etc
    for special_char in specials:
        seq = seq.replace(special_char, ' ')

    # split() with no argument breaks on any whitespace run
    words = seq.split()
    return ' '.join(words).lower()

In [18]:
# Drop the first and last three characters of every sequence (presumably the
# quote/escape wrapper visible in sample.head() above -- confirm) and normalise
# the rest with clean_sequence.
# The column is rebuilt in one positional pass: the previous per-row
# `sample.loc[i]` lookup built a full row Series on every iteration and was
# only correct when the index is exactly 0..n-1.
# NOTE: not idempotent -- re-running this cell trims three more characters
# from each end.
sample["sequence"] = [
    clean_sequence(raw[3:len(raw) - 3])
    for raw in tqdm(sample["sequence"])
]

100%|████████████████████████████████████████████████████████████████████████| 186730/186730 [01:07<00:00, 2771.64it/s]


## Splitting into Train, Validation, and Test Sets

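This code only needs to be run once: the resulting splits are saved to CSV and reloaded below. (`train_test_split` is called with `random_state=0`, so the split is reproducible for identical input data, but it will change if the input files change.) After the first run, use the saved CSVs.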

In [None]:
# Stratified split of the always-pattern data: 90% train, 10% held out.
X = always_patterns[["patient_id", "sequence"]]
y = always_patterns["annotator_label"]

y_label = y.to_numpy()
X_train, X_test_valid, y_train, y_test_valid = train_test_split(X,y,random_state=0,test_size=0.10, stratify=y_label)

# The held-out 10% is split again: 75% validation, 25% test (stratified).
y_test_valid_label = y_test_valid.to_numpy()
X_valid, X_test, y_valid, y_test = train_test_split(X_test_valid, y_test_valid, random_state=0, test_size=(0.25), stratify=y_test_valid_label)

In [None]:
# Stratified split of the manual-review data: 70% train, 30% held out.
X_2 = manual_review[["patient_id", "sequence"]]
y_2 = manual_review["annotator_label"]

y_label_2 = y_2.to_numpy()
X_train_2, X_test_valid_2, y_train_2, y_test_valid_2 = train_test_split(X_2,y_2,random_state=0,test_size=0.3, stratify=y_label_2)

# 0.15/0.3 = 0.5: the held-out 30% is halved into validation and test.
y_test_valid_label_2 = y_test_valid_2.to_numpy()
X_valid_2, X_test_2, y_valid_2, y_test_2 = train_test_split(X_test_valid_2, y_test_valid_2, random_state=0, test_size=(0.15/0.3), stratify=y_test_valid_label_2)

In [None]:
# Stack the manual-review splits under the always-pattern splits.
# pd.concat replaces DataFrame.append / Series.append, which were removed in
# pandas 2.0; like append, it keeps each row's original index label.
# NOTE: not idempotent -- re-running this cell stacks the *_2 rows again.
X_train = pd.concat([X_train, X_train_2])
y_train = pd.concat([y_train, y_train_2])

X_test = pd.concat([X_test, X_test_2])
y_test = pd.concat([y_test, y_test_2])

X_valid = pd.concat([X_valid, X_valid_2])
y_valid = pd.concat([y_valid, y_valid_2])

In [None]:
# Sanity check: row counts of the combined train / validation / test splits.
len(X_train), len(X_valid), len(X_test) 

### Using Saved CSVs to load in data

In [24]:
# Reload the saved training split. X_train briefly holds the whole frame, then
# is narrowed to the 'sequence' text column; y_train is the label column.
X_train = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\train_8_11.csv")
y_train = X_train["annotator_label"]
X_train = X_train["sequence"]

In [25]:
# Reload the saved validation split. X_valid briefly holds the whole frame, then
# is narrowed to the 'sequence' text column; y_valid is the label column.
X_valid = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\valid_8_11.csv")
y_valid = X_valid["annotator_label"]
X_valid = X_valid["sequence"]

In [26]:
# Reload the saved test split. X_test briefly holds the whole frame, then
# is narrowed to the 'sequence' text column; y_test is the label column.
X_test = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\test_8_11.csv")
y_test = X_test["annotator_label"]
X_test = X_test["sequence"]

In [27]:
# Text column of the unlabelled sample (cleaned in the preprocessing section).
X_sample = sample['sequence']

In [39]:
# Text column of the real-world sample.
X_real_world = real_world_sample["sequence"]

In [40]:
# Sanity check: number of sequences in every dataset about to be vectorised.
len(X_train), len(X_valid), len(X_test), len(X_sample), len(X_real_world)

(7669, 694, 293, 186730, 100)

## TF-IDF

In [41]:
# Initialize a TfidfVectorizer object: tfidf_vectorizer
# The vocabulary and IDF weights are fitted on the training sequences only;
# every other dataset is projected onto that fitted vocabulary.
tfidf_vectorizer = TfidfVectorizer(stop_words="english",analyzer='word', token_pattern=r'\b[A-Za-z0-9]+\b')
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_valid = tfidf_vectorizer.transform(X_valid)
tfidf_test = tfidf_vectorizer.transform(X_test)
tfidf_sample = tfidf_vectorizer.transform(X_sample)
tfidf_real_world = tfidf_vectorizer.transform(X_real_world)

# Column names, fetched once. get_feature_names() was removed in
# scikit-learn 1.2; fall back to it only on versions that predate
# get_feature_names_out().
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# Create the TfidfVectorizer DataFrames (dense, one column per vocabulary term).
# .toarray() is the explicit spelling of the sparse-matrix `.A` shortcut.
# (The previous version referenced the misspelled name `tfidf_vbectorizer`
# for the training frame, which raised NameError.)
# NOTE: densifying tfidf_sample is very large -- with the row count and
# vocabulary size reported in this notebook (186730 x 24098 float64) it is
# roughly 36 GB.
tfidf_train = pd.DataFrame(tfidf_train.toarray(), columns = tfidf_feature_names)
tfidf_valid = pd.DataFrame(tfidf_valid.toarray(), columns = tfidf_feature_names)
tfidf_test = pd.DataFrame(tfidf_test.toarray(), columns = tfidf_feature_names)
tfidf_sample = pd.DataFrame(tfidf_sample.toarray(), columns = tfidf_feature_names)
tfidf_real_world = pd.DataFrame(tfidf_real_world.toarray(), columns = tfidf_feature_names)

tfidf_real_world.head()

Unnamed: 0,0,00,000,0000,0001,0002,0005,0009,000u,0015,...,zurek,zydis,zyflo,zygomatic,zyl,zyloprim,zyprexa,zyrtec,zytiga,zziness
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.176425,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Persist the dense TF-IDF frames for the three labelled splits (no index column).
tfidf_train.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\tfidf_train_8_12.csv", index = False)
tfidf_valid.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\tfidf_valid_8_12.csv", index = False)
tfidf_test.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\tfidf_test_8_12.csv", index = False)

In [42]:
# Persist the TF-IDF frame of the real-world sample (no index column).
tfidf_real_world.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\tfidf_real_sample.csv", index = False)

In [31]:
# Persist the TF-IDF frame of the large unlabelled sample.
# NOTE: the recorded output of this cell is a KeyboardInterrupt, i.e. the last
# recorded run of this write was interrupted before it finished.
tfidf_sample.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\BigDataSets\tfidf_sample_8_12.csv", index = False)

KeyboardInterrupt: 

In [16]:
# Sanity check: all splits must share the same number of TF-IDF columns.
tfidf_train.shape, tfidf_valid.shape, tfidf_test.shape

((7669, 24098), (694, 24098), (293, 24098))

In [18]:
# Persist the label Series of each split next to the TF-IDF frames (no index column).
y_train.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\y_train_8_12.csv", index = False)
y_valid.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\y_valid_8_12.csv", index = False)
y_test.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\y_test_8_12.csv", index = False)

## Feature Selection

### Converting to binary problem (where yes = 1 and no/ntr = 0)

In [51]:
def convert_to_binary(y):
    """Collapse three-way labels into binary labels.

    Mapping: 2 (yes) -> 1; 1 and 0 (no / ntr) -> 0; any other value is left
    unchanged.

    Parameters
    ----------
    y : pandas.Series, list or numpy array of labels.

    Returns
    -------
    For a pandas Series, a new Series with the same index (the input is not
    modified). Each element is mapped by position, so the index does not need
    to be 0..n-1; the previous label-based ``y[i]`` lookup raised KeyError
    for such indexes.
    For any other indexable sequence, the sequence itself, updated in place.

    Notes
    -----
    The mapping is not idempotent: applying it to labels that are already
    binary turns every 1 into 0.
    """
    def _to_binary(label):
        if label == 2:
            # convert yes from 2 to 1
            return 1
        if label == 1 or label == 0:
            # convert no/ntr from 0/1 to 0
            return 0
        return label

    if isinstance(y, pd.Series):
        return y.map(_to_binary)

    for i in range(len(y)):
        y[i] = _to_binary(y[i])
    return y

In [52]:
# Run once per kernel: re-running on already-binary labels maps every 1 to 0.
y_train = convert_to_binary(y_train)

In [54]:
# Run once per kernel: re-running on already-binary labels maps every 1 to 0.
y_valid = convert_to_binary(y_valid)

In [55]:
# Run once per kernel: re-running on already-binary labels maps every 1 to 0.
y_test = convert_to_binary(y_test)

### Identifying features with high correlation using the Pearson Correlation Coefficient

In [113]:
# Append the label Series as the last column of each TF-IDF frame.
# reset_index(drop=True) relabels the labels 0..n-1 so the column-wise concat
# aligns with the default index of the TF-IDF frames.
tfidf_train_features_df = pd.concat([tfidf_train, y_train.reset_index(drop=True)], axis = 1)
tfidf_test_features_df = pd.concat([tfidf_test, y_test.reset_index(drop = True)], axis = 1)
tfidf_valid_features_df = pd.concat([tfidf_valid, y_valid.reset_index(drop = True)], axis = 1)

In [114]:
def filter_features_by_cor(df):
    """Rank every feature column by |Pearson r| against the label column.

    Parameters
    ----------
    df : pandas.DataFrame whose LAST column is the target; every other column
        is treated as a numeric feature.

    Returns
    -------
    pandas.DataFrame with columns 'Features' (column name) and 'CorrCoef'
    (absolute correlation with the target), sorted by 'CorrCoef' ascending
    with a fresh 0..n-1 index. Constant columns yield NaN (np.corrcoef
    divides by a zero standard deviation) and sort to the end.

    Notes
    -----
    The previous loop ran over ``range(0, m-2)`` and therefore silently
    dropped the last feature column; all ``m-1`` feature columns are scored
    now.
    """
    target = df.iloc[:, -1].to_numpy(dtype=float)
    feature_names = list(df.columns[:-1])

    # Columns are addressed by position so duplicate column names are safe.
    # Results are collected in a list; np.append in a loop re-copies the
    # whole array on every iteration.
    corrcoefs = [
        abs(np.corrcoef(df.iloc[:, i].to_numpy(dtype=float), target)[0, 1])
        for i in range(len(feature_names))
    ]

    output_df = pd.DataFrame({'Features': feature_names, 'CorrCoef': corrcoefs})
    return output_df.sort_values('CorrCoef').reset_index(drop=True)

In [115]:
# Score every training feature against the label, then order the table from
# the most to the least correlated feature.
tfidf_output_df = (
    filter_features_by_cor(tfidf_train_features_df)
    .sort_values(by=['CorrCoef'], ascending=False)
)

In [116]:
# Ten features with the highest absolute correlation to the label.
tfidf_output_df.head(10)

Unnamed: 0,Features,CorrCoef
24096,intact,0.557335
24095,oriented,0.423366
24094,concentration,0.415752
24093,orientation,0.40293
24092,perceptions,0.395939
24091,sensorium,0.395415
24090,judgment,0.385164
24089,fund,0.373389
24088,experiences,0.36934
24087,ideation,0.361289


In [117]:
# Persist the feature/correlation table (no index column).
# NOTE(review): the file name says "multiclass_labels", but the labels were
# passed through convert_to_binary earlier in this notebook, and the prediction
# section reads a "binary_labels" file instead -- confirm which file is intended.
tfidf_output_df.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Performance\feature_correlation_multiclass_labels_8_12.csv", index = False)

In [118]:
# NOTE: DataFrame.drop returns a new frame and none of these results is
# assigned, so the three *_features_df frames KEEP their 'annotator_label'
# column; the cell only displays the last expression.
# Later cells still read 'annotator_label' from these frames (the
# cross-validation frame and the final test evaluation), so do not turn these
# into assignments without updating those cells.
tfidf_train_features_df.drop(columns = 'annotator_label')
tfidf_test_features_df.drop(columns = 'annotator_label')
tfidf_valid_features_df.drop(columns = 'annotator_label')

Unnamed: 0,0,00,000,0000,0001,0002,0005,0009,000u,0015,...,zurek,zydis,zyflo,zygomatic,zyl,zyloprim,zyprexa,zyrtec,zytiga,zziness
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
689,0.119656,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
690,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
691,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
692,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Regularized Logistic Regression

In [83]:
def filter_by_corr(corr, tfidf_output_df):
    """Keep only the TF-IDF columns whose correlation exceeds ``corr``.

    ``tfidf_output_df`` is the feature/correlation table ('Features',
    'CorrCoef'). The train, test and validation frames are read from the
    notebook globals ``tfidf_train_features_df``, ``tfidf_test_features_df``
    and ``tfidf_valid_features_df``.

    Returns the filtered correlation table followed by the filtered train,
    test and validation frames, in that order.
    """
    # Rows of the correlation table strictly above the threshold
    above_threshold = tfidf_output_df['CorrCoef'] > corr
    top_tfidf_features_df = tfidf_output_df[above_threshold]
    selected_features = top_tfidf_features_df['Features']

    filtered_tfidf_train, filtered_tfidf_test, filtered_tfidf_valid = (
        frame.filter(items=selected_features)
        for frame in (tfidf_train_features_df, tfidf_test_features_df, tfidf_valid_features_df)
    )

    return top_tfidf_features_df, filtered_tfidf_train, filtered_tfidf_test, filtered_tfidf_valid

In [97]:
def logisitic_regression(X_train, y_train, X_test, y_test, c, want_report, want_conf_mat, save_model, name):
    """Fit an L1-regularised, class-balanced logistic regression and score it.

    Parameters
    ----------
    X_train, y_train : features and labels the model is fitted on.
    X_test, y_test : features and labels the model is evaluated on.
    c : value passed to LogisticRegression as ``C``.
    want_report : if true, add a classification-report DataFrame (rows
        labelled 'NO' / 'YES') to the result.
    want_conf_mat : if true, add the confusion matrix to the result.
    save_model : if true, pickle the fitted model to "Storage/Model/" + name.
    name : file name used when ``save_model`` is true.

    Returns
    -------
    tuple
        ``(lr, acc, auc, c)``, followed by the report when ``want_report`` is
        true, followed by the confusion matrix when ``want_conf_mat`` is true.
        ``auc`` is computed from the predicted probability of column 1.

    Side effects
    ------------
    Prints accuracy and AUC; optionally writes the pickle file.
    """
    # fitting model
    lr = LogisticRegression(penalty = 'l1', solver = 'liblinear', C = c, random_state = 0, class_weight = 'balanced')
    lr.fit(X_train, y_train)

    # predictions
    y_pred = lr.predict(X_test)
    y_prob = lr.predict_proba(X_test)[:, 1]

    # collecting results
    acc = metrics.accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob, average = 'weighted')
    print("ACC: ", acc, "AUC: ", auc)

    if save_model:
        # context manager so the file handle is flushed and closed
        with open("Storage/Model/" + name, 'wb') as model_file:
            pickle.dump(lr, model_file)

    results = [lr, acc, auc, c]

    if want_report:
        target_names = ['NO', 'YES']
        results_lgr = classification_report(y_test, y_pred, target_names = target_names, output_dict=True)
        results.append(pd.DataFrame(results_lgr).transpose())

    if want_conf_mat:
        results.append(confusion_matrix(y_test, y_pred))

    return tuple(results)

In [87]:
def cross_validation_split(dataset, n_folds):
    """Split ``dataset`` into ``n_folds`` folds stratified on the binary label.

    Parameters
    ----------
    dataset : pandas.DataFrame with an 'annotator_label' column. Only rows
        labelled 1 (yes) or 0 (no) are used.
    n_folds : number of folds to create.

    Returns
    -------
    list of pandas.DataFrame, one per fold. Every fold holds
    ``len(yes) // n_folds`` positive rows followed by ``len(no) // n_folds``
    negative rows, so up to ``n_folds - 1`` rows per class are left out.

    Notes
    -----
    The rows are shuffled with an unseeded ``DataFrame.sample``, so the folds
    differ between runs. The function now uses its ``dataset`` argument; it
    previously ignored it and read the notebook global ``cross_validation``.
    """
    # ensuring stratification across label
    yes = dataset[dataset["annotator_label"] == 1]
    no = dataset[dataset["annotator_label"] == 0]

    yes_count = len(yes) // n_folds
    no_count = len(no) // n_folds

    # shuffling once up front so each fold is a contiguous slice
    yes = yes.sample(frac=1).reset_index(drop=True)
    no = no.sample(frac=1).reset_index(drop=True)

    # creating folds
    split = list()
    for i in tqdm(range(n_folds)):
        # pd.concat replaces DataFrame.append (removed in pandas 2.0)
        fold = pd.concat([
            yes[yes_count * i : yes_count * (i + 1)],
            no[no_count * i : no_count * (i + 1)],
        ])
        split.append(fold)

    return split

In [90]:
def evaluate_algorithm(dataset, n_folds):
    """Grid-search the correlation threshold and C with stratified k-fold CV.

    For every fold, every correlation threshold in 0.01..0.29 and every C in
    [0.01, 0.1, 1, 10, 100], a model is fitted on the remaining folds with
    ``logisitic_regression`` and scored on the held-out fold.

    Parameters
    ----------
    dataset : pandas.DataFrame of features plus an 'annotator_label' column,
        passed on to ``cross_validation_split``.
    n_folds : number of folds.

    Returns
    -------
    list of pandas.DataFrame, one per (fold, threshold) pair, each with one
    row per C value and the columns 'c_value', 'acc', 'auc', 'corr_thres',
    'fold_number' (1-based).

    Notes
    -----
    Reads the notebook global ``tfidf_output_df`` (feature/correlation table).
    NOTE(review): that table is computed outside the folds; if it was built
    from rows that also appear in a held-out fold, feature selection has seen
    validation data -- confirm this is acceptable.
    """
    splits = cross_validation_split(dataset, n_folds)

    corr_list = list(np.arange(1,30) * 0.01)
    c_values = [0.01, 0.1, 1, 10, 100]
    df_list = []

    for fold_idx, fold in enumerate(splits):
        # every fold except the held-out one forms the training data
        train = pd.concat([other for j, other in enumerate(splits) if j != fold_idx])

        y_train = train["annotator_label"].reset_index(drop = True).astype(int)
        y_test = fold["annotator_label"].reset_index(drop = True).astype(int)

        train = train.drop(columns = ["annotator_label"])
        fold = fold.drop(columns = ["annotator_label"])

        for corr in corr_list:
            # filtering by correlation coefficient
            top_tfidf_features_df = tfidf_output_df[tfidf_output_df['CorrCoef'] > corr]
            filtered_tfidf_train = train.filter(items=top_tfidf_features_df['Features'])
            filtered_tfidf_fold = fold.filter(items=top_tfidf_features_df['Features'])

            # tuning for optimal lambda value
            records = []
            for c in c_values:
                _, acc, auc, used_c = logisitic_regression(filtered_tfidf_train, y_train, filtered_tfidf_fold, y_test, c, False, False, False, "")
                records.append({"c_value": used_c, "acc": acc, "auc": auc})

            # gathering model stats
            iter_df = pd.DataFrame(records, columns=["c_value", "acc", "auc"])
            iter_df['corr_thres'] = corr
            iter_df['fold_number'] = fold_idx + 1
            df_list.append(iter_df)

        print("Completed Fold #: ", fold_idx + 1)
        print("Stats DF has", len(df_list), "records")

    return df_list

In [91]:
# Pool the train and validation rows (features + label) and run the 10-fold
# grid search; the result is a list of per-(fold, threshold) stats frames.
cross_validation = pd.concat([tfidf_train_features_df, tfidf_valid_features_df])
tfidf_all_df = evaluate_algorithm(cross_validation, 10)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:16<00:00,  1.70s/it]


ACC:  0.8217703349282297 AUC:  0.5
ACC:  0.9102870813397129 AUC:  0.9697742348309447
ACC:  0.937799043062201 AUC:  0.9842228148842843
ACC:  0.9629186602870813 AUC:  0.9859031095219952
ACC:  0.9581339712918661 AUC:  0.9843693522073405
ACC:  0.8217703349282297 AUC:  0.5
ACC:  0.9102870813397129 AUC:  0.9697742348309447
ACC:  0.9366028708133971 AUC:  0.9842130457294138
ACC:  0.9557416267942583 AUC:  0.9882965524652462
ACC:  0.9545454545454546 AUC:  0.9847405800924163
ACC:  0.8217703349282297 AUC:  0.5
ACC:  0.9102870813397129 AUC:  0.9697742348309447
ACC:  0.9401913875598086 AUC:  0.9840665084063577
ACC:  0.9521531100478469 AUC:  0.9854537283979563
ACC:  0.9473684210526315 AUC:  0.9828649023572971
ACC:  0.8217703349282297 AUC:  0.5
ACC:  0.9102870813397129 AUC:  0.9697742348309447
ACC:  0.937799043062201 AUC:  0.9836268964371893
ACC:  0.9509569377990431 AUC:  0.9849457323446948
ACC:  0.9413875598086124 AUC:  0.9727538270664204
ACC:  0.8217703349282297 AUC:  0.5
ACC:  0.9102870813397129 AU

In [92]:
# Flatten the list of per-(fold, threshold) frames into one long stats frame.
# NOTE: rebinds the name from a list to a DataFrame; run once per grid search.
tfidf_all_df = pd.concat(tfidf_all_df)

In [93]:
# Average accuracy and AUC over the folds for every (threshold, C) pair.
# corr_list is rebuilt with the same expression used inside the grid search so
# the float thresholds compare equal with `==`.
corr_list = list(np.arange(1,30) * 0.01)
average_results_df = []

for corr in corr_list:
    for c in [0.01, 0.1, 1, 10, 100]:
        filtered = tfidf_all_df[(tfidf_all_df["corr_thres"] == corr) & (tfidf_all_df["c_value"] == c)]
        avg_auc = filtered["auc"].mean()
        avg_acc = filtered["acc"].mean()

        # One-row frame built directly from the values. The previous version
        # created an integer placeholder frame and overwrote it with floats,
        # an incompatible-dtype assignment that newer pandas deprecates.
        df = pd.DataFrame(
            [[float(c), avg_acc, avg_auc, corr]],
            columns = ["c_value", "acc", "auc", "corr_thres"],
        )

        average_results_df.append(df)

In [94]:
# Stack the one-row frames into a single table (every row keeps index label 0).
# NOTE: rebinds the name from a list to a DataFrame; run once per averaging pass.
average_results_df = pd.concat(average_results_df)

In [95]:
# Parameter setting(s) with the highest mean AUC. Series.max() skips NaN,
# whereas the builtin max() can return NaN (and thus match no row) if a NaN
# is present.
average_results_df[average_results_df['auc'] == average_results_df['auc'].max()]

Unnamed: 0,c_value,acc,auc,corr_thres
0,10.0,0.94055,0.979079,0.07


In [98]:
# Specific parameter setting performance
corr = 0.07
c = 10

y_train_cross_valid = cross_validation["annotator_label"]
y_test_cross_valid = tfidf_test_features_df["annotator_label"]

cross_validation.drop(columns = ["annotator_label"])
tfidf_test_features_df.drop(columns = ["annotator_label"])

# Setting Correlation threshold
top_tfidf_features_df = tfidf_output_df[tfidf_output_df['CorrCoef'] > corr]
filtered_tfidf_train = cross_validation.filter(items=top_tfidf_features_df['Features'])
filtered_tfidf_test = tfidf_test_features_df.filter(items=top_tfidf_features_df['Features'])

# Running model
lr, acc_optimized, auc_optimized, c_list, report, conf_mat = logisitic_regression(filtered_tfidf_train, y_train_cross_valid, filtered_tfidf_test, y_test_cross_valid, c, True, True, True, "model_test_optimal.sav")

print("\nC: ", c, "\n", report)
print("\nAUC: ", auc_optimized)
print("ACC: ", acc_optimized)
print("\nConfusion Matrix: \n", conf_mat)

ACC:  0.8327645051194539 AUC:  0.9343537414965986

C:  10 
               precision    recall  f1-score     support
NO             0.971154  0.824490  0.891832  245.000000
YES            0.494118  0.875000  0.631579   48.000000
accuracy       0.832765  0.832765  0.832765    0.832765
macro avg      0.732636  0.849745  0.761706  293.000000
weighted avg   0.893005  0.832765  0.849197  293.000000

AUC:  0.9343537414965986
ACC:  0.8327645051194539

Confusion Matrix: 
 [[202  43]
 [  6  42]]


In [None]:
"""   
      NO   NTR YES
NO  [[439  9  21]
NTR [  5 457   7]
YES [ 26  19 424]]

NO - 439/469 TP, 9/469 NTR when should be NO, and 21/469 YES when should be NO
    Precision: 439/(439+5+26)
    Recall: 439/(439+21+9)
    f1: (439*2)/(470+469)

NTR - 457/469 TP, 5/469 NO when should be NTR, and 7/469 YES when should be NTR
YES - 424/469 TP, 19/469 NTR when should be YES, and 26/469 are NO when should be YES
"""

In [None]:
"""   
      NO   NTR YES
NO  [[83  7  14]
NTR  [10 124  7]
YES  [6  3   39]]

no
tpr: 0.8

"""

In [99]:
# Per-class one-vs-rest counts derived from the confusion matrix
# (rows = true class, columns = predicted class). Everything is kept as float
# so the ratios below are floating-point divisions.
TP = np.diag(conf_mat).astype(float)
FP = (conf_mat.sum(axis = 0) - np.diag(conf_mat)).astype(float)
FN = (conf_mat.sum(axis = 1) - np.diag(conf_mat)).astype(float)
TN = conf_mat.sum() - (FP + FN + TP)

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP / (TP + FN)

# Specificity or true negative rate
TNR = TN / (TN + FP)

# Precision or positive predictive value
PPV = TP / (TP + FP)

# Negative predictive value
NPV = TN / (TN + FN)

# Fall out or false positive rate
FPR = FP / (FP + TN)

# False negative rate
FNR = FN / (TP + FN)

# False discovery rate
FDR = FP / (TP + FP)

# Each printed array has one entry per class, in confusion-matrix order.
for rate_label, rate_values in (
    ("Sensitivity: ", TPR),
    ("Specificity: ", TNR),
    ("NPV: ", NPV),
    ("PPV: ", PPV),
    ("FPR: ", FPR),
):
    print(rate_label, rate_values)

Sensitivity:  [0.8244898 0.875    ]
Specificity:  [0.875     0.8244898]
NPV:  [0.49411765 0.97115385]
PPV:  [0.97115385 0.49411765]
FPR:  [0.125     0.1755102]


## Saving 

In [None]:
# Re-align features and labels on a fresh 0..n-1 index, then join them
# column-wise into one frame for saving.
X_train = X_train.reset_index(drop = True)
y_train = y_train.reset_index(drop = True)
train = pd.concat([X_train, y_train], axis = 1)

In [None]:
# Re-align features and labels on a fresh 0..n-1 index, then join them
# column-wise into one frame for saving.
X_valid = X_valid.reset_index(drop = True)
y_valid = y_valid.reset_index(drop = True)
valid = pd.concat([X_valid, y_valid], axis = 1)

In [None]:
# Re-align features and labels on a fresh 0..n-1 index, then join them
# column-wise into one frame for saving.
X_test = X_test.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)
test = pd.concat([X_test, y_test], axis = 1)

In [None]:
# Write the three splits to the CSV files that the "Using Saved CSVs" cells
# read back (no index column). Overwrites any existing files.
train.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\train_8_11.csv", index = False)
valid.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\valid_8_11.csv", index = False)
test.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\test_8_11.csv", index = False)

In [100]:
# Persist the fitted binary model; the context manager guarantees the file is
# flushed and closed (the bare open() handle was never closed).
with open("Storage/" + "model_8_12_binary_classification.sav", 'wb') as model_file:
    pickle.dump(lr, model_file)

## Getting Train and Test Set Predictions and Probabilities

In [30]:
# Reload the feature/correlation table from disk.
# NOTE(review): this reads "feature_correlation_binary_labels_8_12.csv" from a
# different user directory than the save cell earlier in this notebook, which
# writes "feature_correlation_multiclass_labels_8_12.csv" -- confirm the file
# matches the model loaded below.
tfidf_output_df = pd.read_csv(r"C:\Users\tanis\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Performance\feature_correlation_binary_labels_8_12.csv")

In [36]:
# Features above the 0.07 correlation threshold (the value chosen in the
# "Specific parameter setting performance" cell).
top_tfidf_features_df_sample = tfidf_output_df[tfidf_output_df['CorrCoef'] > 0.07]

In [38]:
# Load the saved binary model. The context manager closes the file handle.
# SECURITY: unpickling can execute arbitrary code -- only load model files
# that you created or otherwise trust.
with open(r"C:\Users\tanis\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\model_8_12_binary_classification.sav", 'rb') as model_file:
    model = pickle.load(model_file)

In [39]:
# Only the test split is loaded here; the train/valid loading (and the
# cross_validation_df built from it) is currently commented out.
#train = pd.read_csv(r"C:\Users\tanis\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\data\train_8_11.csv")
#valid = pd.read_csv(r"C:\Users\tanis\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\data\valid_8_11.csv")
#cross_validation_df = pd.concat([train, valid])

test = pd.read_csv(r"C:\Users\tanis\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\data\test_8_11.csv")

In [60]:
# Predicted class for every test row, using only the selected feature columns.
test_predictions = model.predict(tfidf_test.filter(items = top_tfidf_features_df_sample['Features']))
#cross_valid_predictions = model.predict(cross_validation.filter(items = top_tfidf_features_df_sample['Features']))

In [65]:
#cross_validation_df["predictions"] = cross_valid_predictions
test["predictions"] = test_predictions

In [66]:
# Class-probability matrix for the test rows (one column per model class).
test_proba = model.predict_proba(tfidf_test.filter(items = top_tfidf_features_df_sample['Features']))
#cross_valid_proba = model.predict_proba(cross_validation.filter(items = top_tfidf_features_df_sample['Features']))

In [68]:
# Keep only column 1 of the probability matrix.
# NOTE: rebinds test_proba from 2-D to 1-D; re-running this cell without
# recomputing predict_proba raises IndexError.
test_proba = test_proba[:, 1]

In [69]:
# Store the column-1 probability next to each test row.
test["probability"] = test_proba

In [72]:
# Test rows whose stored probability exceeds 0.5.
test[test["probability"] > 0.5]

Unnamed: 0,patient_id,sequence,annotator_label,predictions,probability
4,Z7580275,rts pedal edema is worsened recently. current ...,1,1,0.994096
10,Z11168263,"------- bms during the day, either. he reports...",1,1,0.998034
13,Z14070766,------- tion of his ?primary hyperparathyroidi...,1,1,0.999973
17,Z6539715,---------8 2 2017 4:30 pm john pacheco visit d...,0,1,0.821006
19,Z8668636,------- ipal problem: closed fracture of left ...,1,1,0.984569
...,...,...,...,...,...
287,Z7153000,------- ase involving native coronary artery o...,0,1,0.968426
288,Z7055023,rmin 2000 mg daily - loose stools every 4-5 da...,0,1,0.982332
289,Z9328738,rmittently with tenderness to palpation in shi...,0,1,0.624641
290,Z9191149,"r in jamaica plain, or his aunt in roxbury. he...",0,1,0.961395


In [43]:
# test_proba = np.array([[x] for x in np.max(test_proba, axis=1)])
# test_proba = [item for sublist in test_proba for item in sublist]
# test["probability"] = test_proba

# cross_valid_proba = np.array([[x] for x in np.max(cross_valid_proba, axis=1)])
# cross_valid_proba = [item for sublist in cross_valid_proba for item in sublist]
# cross_validation_df["probability"] = cross_valid_proba

In [46]:
def convert_to_binary(y):
    """Collapse three-way labels into binary labels.

    Mapping: 2 (yes) -> 1; 1 and 0 (no / ntr) -> 0; any other value is left
    unchanged.

    Parameters
    ----------
    y : pandas.Series, list or numpy array of labels.

    Returns
    -------
    For a pandas Series, a new Series with the same index (the input is not
    modified). Each element is mapped by position, so the index does not need
    to be 0..n-1; the previous label-based ``y[i]`` lookup raised KeyError
    for such indexes.
    For any other indexable sequence, the sequence itself, updated in place.

    Notes
    -----
    The mapping is not idempotent: applying it to labels that are already
    binary turns every 1 into 0.
    """
    def _to_binary(label):
        if label == 2:
            # convert yes from 2 to 1
            return 1
        if label == 1 or label == 0:
            # convert no/ntr from 0/1 to 0
            return 0
        return label

    if isinstance(y, pd.Series):
        return y.map(_to_binary)

    for i in range(len(y)):
        y[i] = _to_binary(y[i])
    return y

# Run once per load of `test`: re-running on already-binary labels maps every 1 to 0.
test["annotator_label"] = convert_to_binary(test["annotator_label"])

In [73]:
# Save the test rows together with their predictions and probabilities.
#cross_validation_df.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Baseline_Model\data\train_full.csv", index = False)
test.to_csv(r"C:\Users\tanis\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Baseline_Model\data\test_binary_classification.csv", index = False)

In [168]:
# Unique patients and row counts.
# NOTE(review): `cross_validation_df` is only assigned in commented-out code in
# this notebook, so this cell raises NameError on a fresh kernel unless it is
# defined elsewhere.
len(cross_validation_df["patient_id"].unique()), len(cross_validation_df), len(test)

(2417, 8363, 293)

## Running on Samples

In [127]:
# Predicted class for every sample row, using only the selected feature columns.
sample_predictions = model.predict(tfidf_sample.filter(items = top_tfidf_features_df_sample['Features']))

In [128]:
# Attach positionally; tfidf_sample was built from sample['sequence'], so the
# rows are in the same order.
sample["predictions"] = sample_predictions

In [131]:
# Class-probability matrix for the sample rows (one column per model class).
sample_proba = model.predict_proba(tfidf_sample.filter(items = top_tfidf_features_df_sample['Features']))

In [133]:
# Reduce the probability matrix to one value per row: the largest class
# probability in that row.
# NOTE(review): the test-set cells above keep column 1 of predict_proba
# instead of the row maximum, so 'probability' has a different meaning in the
# two outputs -- confirm this is intended.
row_max_proba = np.max(sample_proba, axis=1)
sample_proba = list(row_max_proba)

In [135]:
# Store the per-row maximum class probability next to each sample row.
sample["probability"] = sample_proba

In [164]:
# Preview the scored sample.
sample.head()

Unnamed: 0,patient_id,sequence,predictions,probability
0,Z6352398,ession alone in the meta-analysis. we discusse...,2,0.583588
1,Z6353136,------- l obstruction associated diagnoses sma...,2,0.986656
2,Z6353461,------- (156 lb) 09 19 17 71.4 kg (157 lb 6.4 ...,0,0.940278
3,Z6353764,ut answer. optho notes from recent outpatient ...,0,0.919579
4,Z6354111,hat would be more consistent. has been alert o...,0,0.573025


In [163]:
# Save the scored sample (no index column).
sample.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Baseline_Model\data\sample_8_12.csv", index = False)

## ROC-Curve

In [None]:
# One-vs-rest ROC curve for every class the fitted model knows about.
# The class count comes from the model rather than a hard-coded 3: with the
# binary model saved above, predict_proba has two columns and indexing a third
# raised IndexError.
pred_prob = model.predict_proba(filtered_tfidf_test)
model_classes = list(model.classes_)
n_class = len(model_classes)

# Legend label and colour per label code. Binary labels: 0 = No, 1 = Yes;
# three-way labels: 0 = No, 1 = Ntr, 2 = Yes.
if n_class == 2:
    class_style = {0: ('No', 'orange'), 1: ('Yes', 'blue')}
else:
    class_style = {0: ('No', 'orange'), 1: ('Ntr', 'green'), 2: ('Yes', 'blue')}

fpr = {}
tpr = {}
thresh = {}

# Column i of pred_prob corresponds to model.classes_[i].
for i, class_label in enumerate(model_classes):
    fpr[i], tpr[i], thresh[i] = metrics.roc_curve(y_test, pred_prob[:,i], pos_label = class_label)

# plotting
for i, class_label in enumerate(model_classes):
    legend_label, colour = class_style.get(class_label, (str(class_label), None))
    plt.plot(fpr[i], tpr[i], linestyle='--', color=colour, label=legend_label)
plt.title('AUROC Curve for TF-IDF')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
plt.savefig('AUROC Curve for TF-IDF', dpi = 300)

In [None]:
# Display the test frame.
test