In [None]:
import ast
import math
import pickle
import random
import time
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import regex as re
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, average_precision_score, classification_report
from sklearn.metrics import roc_auc_score, matthews_corrcoef, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.utils import shuffle
from tqdm import tqdm

warnings.filterwarnings('ignore')

## Import and Train-Test-Val-Split

In [None]:
train_df = pd.read_csv("Storage/Data/input_optimized.csv") 

train_df = train_df[['Unnamed: 0', 'patient_id', 'sequence','original', 'annotator_label']]
#train_df.columns = ['Unnamed: 0', 'patient_id', 'sequence','original', 'annotator_label']
train_df.head()

In [None]:
X = train_df[["sequence"]]
y = train_df["annotator_label"]
print(len(X))

y_label = y.to_numpy()
X_train, X_test_valid, y_train, y_test_valid = train_test_split(X,y,random_state=0,test_size=0.1, stratify=y_label)

y_test_valid_label = y_test_valid.to_numpy()
X_valid, X_test, y_valid, y_test = train_test_split(X_test_valid, y_test_valid, random_state=0, test_size=0.25, stratify=y_test_valid_label)

In [None]:
y_test.value_counts()

In [None]:
manual_review = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\test_and_validation.csv")

In [None]:
X_2 = manual_review[["sequence"]]
y_2 = manual_review["annotator_label"]

y_label_2 = y_2.to_numpy()
X_train_2, X_test_valid_2, y_train_2, y_test_valid_2 = train_test_split(X_2,y_2,random_state=0,test_size=0.6, stratify=y_label_2)

y_test_valid_label_2 = y_test_valid_2.to_numpy()
X_valid_2, X_test_2, y_valid_2, y_test_2 = train_test_split(X_test_valid_2, y_test_valid_2, random_state=0, test_size=(0.25/0.6), stratify=y_test_valid_label_2)

In [None]:
# Add the manual-review splits to the matching main splits.
# DataFrame.append / Series.append were removed in pandas 2.0; pd.concat is the
# supported equivalent and, like append, keeps each part's original index labels.
# NOTE: this cell is not idempotent — re-running it concatenates the
# manual-review rows a second time.
X_train = pd.concat([X_train, X_train_2])
y_train = pd.concat([y_train, y_train_2])

X_test = pd.concat([X_test, X_test_2])
y_test = pd.concat([y_test, y_test_2])

X_valid = pd.concat([X_valid, X_valid_2])
y_valid = pd.concat([y_valid, y_valid_2])

In [None]:
X_train["label"] = y_train.to_list()

In [None]:
X_valid["label"] = y_valid.to_list()
X_test["label"] = y_test.to_list()

In [None]:
train = pd.concat([X_train, X_valid], axis = 0)
test = pd.concat([X_test])

In [None]:
train.to_csv(r"Storage\Data\train_full.csv", index = False)

In [None]:
test.to_csv(r"Storage\Data\test_full.csv", index = False)

In [None]:
X_train = train_df["sequence"]
y_train = train_df["annotator_label"]

In [None]:
# train_df[train_df["always_pattern_match"].str.len() == 0]

In [None]:
test_val_df = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\test_and_validation.csv")

In [None]:
X = test_val_df["sequence"]
y = test_val_df["annotator_label"]
y_label = y.to_numpy()

X_valid, X_test, y_valid, y_test = train_test_split(X,y,random_state=0,test_size=0.4, stratify=y_label)

## Additional Sampling to Obtain Optimal Data Sample

### Investigating why TFIDF vectors are combining words together

In [None]:
pattern = re.compile(r'attentionconcentration', re.IGNORECASE)

In [None]:
# Collect every training sequence that contains the fused token, together with
# its position in train_df["sequence"], for manual inspection.
investigate = []
idxs = []
for counter, seq in enumerate(tqdm(train_df["sequence"])):
    if pattern.search(seq) is not None:
        investigate.append(seq)
        idxs.append(counter)

In [None]:
idxs

## Computing # of sequences that have always pattern matches to multiple classes

In [None]:
always_patterns = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\EDA\Data\always_patterns_8_1.csv")

In [None]:
always_pattern_regex = always_patterns["Pattern"].to_list()

In [None]:
# Build the compiled list from the source column instead of overwriting
# always_pattern_regex element by element. The in-place version was not
# re-runnable: a second run passed already-compiled patterns back into
# re.compile together with a flags argument, which the stdlib `re` module
# rejects with ValueError (the `regex` package imported here as `re` is
# expected to behave the same — verify).
always_pattern_regex = [
    re.compile(raw_pattern, re.IGNORECASE)
    for raw_pattern in always_patterns["Pattern"]
]

In [None]:
df = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\BigDataSets\Regex_match\Diversity_Sampling\20K_sample_8_9.csv")

In [None]:
# For every original note, record which "always" patterns matched and, once per
# matching pattern, the note's annotator label rendered as text.
conversion = {2: "Yes", 1: "Neither", 0: "No"}  # loop-invariant, built once

a = []  # per-row list of matched text (one entry per matching pattern)
c = []  # per-row list of class names (one entry per matching pattern)

# Labels are paired with notes positionally, so the result no longer depends on
# train_df having a 0..n-1 index (the old code used train_df.loc[counter]).
row_iter = zip(train_df["original"], train_df["annotator_label"])
for seq, raw_label in tqdm(row_iter, total=len(train_df)):
    curr = []
    for p in always_pattern_regex:
        # Distinct, lower-cased matches of this pattern; sorted so the joined
        # string is reproducible (set iteration order is arbitrary).
        m = sorted(set(map(str.lower, re.findall(p, seq))))
        if m:
            curr.append("".join(m))

    # Every matching pattern contributes the row's own label, so the label is
    # only converted when at least one pattern matched (as before).
    classes = [conversion[int(raw_label)]] * len(curr) if curr else []

    a.append(curr)
    c.append(classes)

train_df["always_pattern_match"] = a
train_df["always_pattern_classes"] = c

In [None]:
train_df.to_csv(r"input_with_always_and_without_8_1.csv", index = False)

In [None]:
# NOTE(review): `no_matches` was not defined anywhere in the visible notebook,
# so this cell raised NameError on a fresh kernel. The definition below is
# reconstructed from the commented-out filter earlier in the notebook (rows
# whose always_pattern_match list is empty) — confirm this is the intended split.
no_matches = train_df[train_df["always_pattern_match"].str.len() == 0]
train_df_optimal = train_df.drop(no_matches.index)

In [None]:
# Flag each row from the length of ITS OWN class list. The previous
# len(<Series>) > 1 compared the total row count with 1, a single scalar, so
# every row received the same flag.
# Assumes always_pattern_classes still holds Python lists (as built in this
# notebook); after a CSV round-trip the cells are strings and .str.len() would
# count characters instead.
# NOTE(review): the 'N' / 'Y' assignment is kept exactly as written (more than
# one entry -> 'N') — confirm that direction is intended.
has_multiple_classes = train_df_optimal["always_pattern_classes"].str.len() > 1
train_df_optimal["further_review"] = np.where(has_multiple_classes, 'N', 'Y')

In [None]:
further_review = train_df_optimal[train_df_optimal["further_review"] == 'Y']

In [None]:
further_review

In [None]:
train_df_optimal.to_csv(r"input_optimized.csv", index = False)

In [None]:
no_matches.to_csv(r"test_and_validation.csv", index = False)

### Computing Frequency per Keyword in dataset and comparing to overall 

In [None]:
regex = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\EDA\Getting_Data\keywords.csv")

In [None]:
# Compile one pattern per row of the keyword table (the `regex` DataFrame):
#   CASE == 0 -> compiled case-insensitively, after dropping the first five
#                characters of the pattern text
#   CASE == 1 -> compiled verbatim (case-sensitive)
# Rows with any other CASE value are skipped without a message.
k = regex["REGEX"].to_list()
c = regex["CASE"].to_list()
p_list = []

for i in range(len(k)):
    if (c[i] == 0):
        # NOTE(review): [5:] presumably strips a fixed-length prefix stored in
        # keywords.csv — confirm against the file
        p_list.append(re.compile(k[i][5:], re.IGNORECASE))
    elif (c[i] == 1):
        p_list.append(re.compile(k[i]))
print(p_list)

In [None]:
# For every original note, record which keyword patterns in p_list matched.
l = []
for note in tqdm(train_df["original"]):
    curr = []
    for p in (p_list):
        # De-duplicate the raw matches, lower-case them, then de-duplicate again
        m = list(set(re.findall(p, note)))
        m = list(set(map(str.lower, m)))
        if (m != []):
            # All distinct matches of one pattern are fused into a single string;
            # set order is arbitrary, so the order inside the fused string is too
            curr.append("".join(m))
    #print(curr)
    #print(l)
    # Stored as the list's string repr; a later cell parses it back into a list
    l.append(str(curr))
train_df["regex_match"] = l

In [None]:
train_df.head()

In [None]:
train_df.loc[i]['regex_match']

In [None]:
cols =  ['dementia', 'cognition',"\'cognition", 'memory', "memory\'", 'cdr', 'mmse', 'moca', 'alzheimer', 'cognitive impairment','cognitiveimpairment', 'mci', 'cerebellar', 'neurocognitive', 'lewy', "pick's", 'cortical', 'corticobasal', 'cerebral', 'cerebrovascular', 'amnesia', 'ad', 'lbd']
# One row per train_df row, one column per tracked keyword, initialised to NaN.
# The pd.np alias was removed in pandas 2.0, so numpy is used directly.
summary_stats = pd.DataFrame(np.full((len(train_df), len(cols)), np.nan), columns=cols)

In [None]:
# Count, for every row of train_df, how often each tracked keyword occurs in
# that row's regex_match list; one {keyword: count} dict is collected per row.
keyword_counts = []
for matches_repr in tqdm(train_df["regex_match"]):
    # regex_match holds the string repr of a list. ast.literal_eval parses
    # literals only, so unlike eval() it cannot execute code stored in the data.
    freq = dict.fromkeys(cols, 0)
    for keyword in ast.literal_eval(matches_repr):
        if keyword != '':
            # A keyword that is not one of `cols` raises KeyError, as before
            freq[keyword.lower()] += 1
    keyword_counts.append(freq)

# Build the table once instead of writing cell by cell with chained indexing
# (summary_stats[col][i] = v), which is unreliable under pandas copy-on-write.
# dtype=float keeps the exported CSV in the same "0.0"-style format as before.
summary_stats = pd.DataFrame(keyword_counts, columns=cols, dtype=float)

In [None]:
summary_stats.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Optimizing-data\keywords_distribution_train_set_8_1.csv", index = False)

In [None]:
X_train

## Count-Vectorizer

In [None]:
# Initialize a CountVectorizer object: count_vectorizer
count_vectorizer = CountVectorizer(stop_words="english", analyzer='word', token_pattern=r'\b[A-Za-z0-9]+\b')

# Transform the training data using only the 'text' column values: count_train 
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)
count_valid = count_vectorizer.transform(X_valid)
count = count_vectorizer.transform(X)

# Create the CountVectorizer DataFrame: count_train
count_train = pd.DataFrame(count_train.A, columns=count_vectorizer.get_feature_names())
count_test = pd.DataFrame(count_test.A, columns=count_vectorizer.get_feature_names())
count_valid = pd.DataFrame(count_valid.A, columns=count_vectorizer.get_feature_names())
count = pd.DataFrame(count.A, columns=count_vectorizer.get_feature_names())
count_train.head()

## TFIDF-Vectorizer

In [None]:
# Initialize a TfidfVectorizer object: tfidf_vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words="english",analyzer='word', token_pattern=r'\b[A-Za-z0-9]+\b')
tfidf_train= tfidf_vectorizer.fit_transform(X_train)
tfidf_valid = tfidf_vectorizer.transform(X_valid)
tfidf_test = tfidf_vectorizer.transform(X_test)
tfidf = tfidf_vectorizer.transform(X)

# Create the TfidfVectorizer DataFrame: tfidf_df
tfidf_train = pd.DataFrame(tfidf_train.A, columns=tfidf_vectorizer.get_feature_names())
tfidf_valid = pd.DataFrame(tfidf_valid.A, columns=tfidf_vectorizer.get_feature_names())
tfidf_test = pd.DataFrame(tfidf_test.A, columns=tfidf_vectorizer.get_feature_names())
tfidf = pd.DataFrame(tfidf.A, columns=tfidf_vectorizer.get_feature_names())
tfidf.head()

## Preparing Data for SMOTE

In [None]:
#ONLY run once
tfidf_train_features_df = pd.concat([tfidf_train, y_train.reset_index(drop=True)], axis=1)
tfidf_test_features_df = pd.concat([tfidf_test, y_test.reset_index(drop = True)], axis = 1)
tfidf_valid_features_df = pd.concat([tfidf_valid, y_valid.reset_index(drop = True)], axis = 1)

In [None]:
tfidf_train_features_df_data = tfidf_train_features_df.values
X_smote, y_smote = tfidf_train_features_df_data[:, :-1], tfidf_train_features_df_data[:, -1]

tfidf_test_features_df_data = tfidf_test_features_df.values
X_smote_test, y_smote_test = tfidf_test_features_df_data[:, :-1], tfidf_test_features_df_data[:, -1]

tfidf_valid_features_df_data = tfidf_valid_features_df.values
X_smote_valid, y_smote_valid = tfidf_valid_features_df_data[:, :-1], tfidf_valid_features_df_data[:, -1]

In [None]:
X_train_new = pd.DataFrame(X_train)
y_train_new = pd.Series(y_train)

X_test_new = pd.DataFrame(X_test)
y_test_new = pd.Series(y_test)

X_valid_new = pd.DataFrame(X_valid)
y_valid_new = pd.Series(y_valid)

In [None]:
x = list(tfidf_train_features_df.columns) 
x.append('annotator_label')
len(x)

In [None]:
X_train_new

In [None]:
tfidf_train_features_df_new = tfidf_train_features_df #pd.concat([tfidf_train_features_df, y_train_new.reset_index(drop = True)], axis = 1)#pd.concat([X_train_new, y_train_new.reset_index(drop=True)], axis=1)
tmp = list(tfidf_train_features_df.columns) 
tfidf_train_features_df_new.columns = tmp

tfidf_test_features_df_new = tfidf_test_features_df #pd.concat([tfidf_test_features_df, y_test_new.reset_index(drop = True)], axis = 1)#pd.concat([X_test_new, y_test_new.reset_index(drop=True)], axis=1)
tmp2 = list(tfidf_test_features_df.columns) 
tfidf_test_features_df_new.columns = tmp2

tfidf_valid_features_df_new = tfidf_valid_features_df #pd.concat([tfidf_valid_features_df, y_valid_new.reset_index(drop = True)], axis = 1)#pd.concat([X_valid_new, y_valid_new.reset_index(drop=True)], axis=1)
tmp3 = list(tfidf_valid_features_df.columns) 
tfidf_valid_features_df_new.columns = tmp3

In [None]:
mappings = {0.0 : 0, 1.0 : 1, 2.0 : 2}

tfidf_train_features_df_new.annotator_label = [mappings[item] for item in tfidf_train_features_df_new.annotator_label]
tfidf_test_features_df_new.annotator_label = [mappings[item] for item in tfidf_test_features_df_new.annotator_label]
tfidf_valid_features_df_new.annotator_label = [mappings[item] for item in tfidf_valid_features_df_new.annotator_label]

tfidf_features_master = pd.concat([tfidf_train_features_df_new,tfidf_test_features_df_new,tfidf_valid_features_df_new])

tfidf_features_master.head()

In [None]:
cross_validation = pd.concat([tfidf_train_features_df_new,tfidf_valid_features_df_new])
len(cross_validation), cross_validation.shape

In [None]:
# NOTE: DataFrame.drop returns a new frame and none of these results is
# assigned, so all three frames KEEP their 'annotator_label' column; only the
# last call's result is displayed as the cell output. Later cells still read
# that column from these frames (the *_new names are aliases of the
# tfidf_*_features_df frames), so do not turn these into in-place drops.
tfidf_train_features_df_new.drop(columns = 'annotator_label')
tfidf_test_features_df_new.drop(columns = 'annotator_label')
tfidf_valid_features_df_new.drop(columns = 'annotator_label')

In [None]:
tfidf_train_features_df_new

In [None]:
full_tfidf = pd.concat([tfidf_train_features_df_new, tfidf_test_features_df_new, tfidf_valid_features_df_new])

## Filtering by Pearson Correlation Coefficient

In [None]:
def filter_features_by_cor(df):
    """Rank every feature column by |Pearson r| with the target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns followed by the target as the LAST column.

    Returns
    -------
    pandas.DataFrame
        Columns 'Features' (column name) and 'CorrCoef' (absolute Pearson
        correlation with the target), sorted ascending by 'CorrCoef' with a
        fresh 0..n-1 index. A constant column has an undefined correlation
        and gets NaN, which sorts last.

    Notes
    -----
    Every column except the last is scored. The previous loop bound
    (``range(0, m-2)``) silently left out the final feature column.
    """
    n_features = len(df.columns) - 1  # everything but the trailing target column
    if n_features < 1:
        # No feature columns (or an empty frame): nothing to rank
        return pd.DataFrame({'Features': [], 'CorrCoef': []})

    target_values = df.iloc[:, -1].tolist()

    # Collected in a plain list; growing an array with np.append inside the
    # loop copied the whole array on every iteration.
    corrcoef_values = [
        abs(np.corrcoef([df.iloc[:, i].tolist(), target_values])[0, 1])
        for i in range(n_features)
    ]

    output_df = pd.DataFrame(list(df)[:n_features], columns=['Features'])
    output_df['CorrCoef'] = corrcoef_values
    return output_df.sort_values('CorrCoef').reset_index(drop=True)

In [None]:
tfidf_output_df = filter_features_by_cor(tfidf_train_features_df)
tfidf_output_df = tfidf_output_df.sort_values(by=['CorrCoef'],ascending = False)

In [None]:
tfidf_output_df.head(10)

In [None]:
tfidf_output_df.to_csv('Data/tfidf_vector_feature_corr_8_1.csv', index = False)

In [None]:
# Setting Correlation threshold
top_tfidf_features_df = tfidf_output_df[:500]
filtered_tfidf_train = tfidf_train_features_df_new.filter(items=top_tfidf_features_df['Features'])
filtered_tfidf_test = tfidf_test_features_df_new.filter(items=top_tfidf_features_df['Features'])
filtered_tfidf_valid   = tfidf_valid_features_df_new.filter(items=top_tfidf_features_df['Features'])
#filtered_tfidf_tanish = tfidf_tanish.filter(items = top_tfidf_features_df['Features'])

In [None]:
filtered_tfidf_train.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Data\tfidf_train_smote.csv", index = False)
filtered_tfidf_test.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Data\tfidf_test_smote.csv", index = False)
filtered_tfidf_valid.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Data\tfidf_validation_smote.csv", index = False)

In [None]:
y_train.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Data\y_train_smote.csv", index = False)
y_test.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Data\y_test_smote.csv", index = False)
y_valid.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Data\y_validation_smote.csv", index = False)

## Finding Optimal Correlation Threshold Value by running Logistic Regression model on Validation Set

In [None]:
def filter_by_corr(corr, tfidf_output_df):
    """Keep the features whose correlation exceeds ``corr`` and subset each split.

    Parameters
    ----------
    corr : float
        Threshold; only rows with 'CorrCoef' strictly greater than this are kept.
    tfidf_output_df : pandas.DataFrame
        Ranking table with 'Features' and 'CorrCoef' columns.

    Returns
    -------
    tuple
        (selected ranking rows, filtered train, filtered test, filtered valid).
        The three frames are the module-level tfidf_*_features_df frames
        restricted to the selected feature columns.
    """
    above_threshold = tfidf_output_df['CorrCoef'] > corr
    top_tfidf_features_df = tfidf_output_df[above_threshold]
    selected_features = top_tfidf_features_df['Features']

    # Same column selection applied to each split, in train / test / valid order
    filtered_tfidf_train, filtered_tfidf_test, filtered_tfidf_valid = (
        split_df.filter(items=selected_features)
        for split_df in (tfidf_train_features_df, tfidf_test_features_df, tfidf_valid_features_df)
    )

    return top_tfidf_features_df, filtered_tfidf_train, filtered_tfidf_test, filtered_tfidf_valid

In [None]:
def logisitic_regression(X_train, y_train, X_test, y_test, c, want_report, want_conf_mat, testing = False):
    """Fit an L1-penalised logistic regression and score it on ``X_test``.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : evaluation features and labels.
    c : float
        Value passed to LogisticRegression as ``C``.
    want_report : bool
        If truthy, append the classification report (as a transposed
        DataFrame, classes named NO / NTR / YES) to the result.
    want_conf_mat : bool
        If truthy, append the confusion matrix to the result.
    testing : bool, default False
        If truthy, pickle the fitted model to 'Storage/lr_8_9.sav'.

    Returns
    -------
    tuple
        ``(lr, acc, auc, c)``, followed by the report when ``want_report`` is
        set, followed by the confusion matrix when ``want_conf_mat`` is set.
        ``auc`` is the weighted one-vs-rest ROC AUC.
    """
    # fitting model
    lr = LogisticRegression(penalty='l1', solver='liblinear', C = c, random_state=0)
    lr.fit(X_train, y_train)

    if testing:
        filename = 'Storage/lr_8_9.sav'
        # Context manager so the handle is flushed and closed even if pickling fails
        with open(filename, 'wb') as model_file:
            pickle.dump(lr, model_file)

    # predictions
    y_pred = lr.predict(X_test)
    y_prob = lr.predict_proba(X_test)

    # collecting results
    acc = metrics.accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob, average='weighted', multi_class='ovr')

    # The optional extras are appended in a fixed order, which reproduces the
    # four tuple shapes callers unpack.
    results = [lr, acc, auc, c]

    if want_report:
        target_names = ['NO', 'NTR', 'YES']
        results_lgr = classification_report(y_test, y_pred, target_names = target_names, output_dict=True)
        results.append(pd.DataFrame(results_lgr).transpose())

    if want_conf_mat:
        results.append(confusion_matrix(y_test, y_pred))

    return tuple(results)

## Cross Validation

In [None]:
def cross_validation_split(dataset, n_folds, random_state=None):
    """Split ``dataset`` into ``n_folds`` label-stratified folds.

    Rows are grouped by 'annotator_label' (2, 0 and 1), each group is shuffled,
    and every fold receives ``len(group) // n_folds`` rows from each group.
    Rows left over after that integer division are not placed in any fold.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain an 'annotator_label' column with values 0, 1 and 2.
    n_folds : int
        Number of folds to create (at least 1).
    random_state : int or numpy.random.RandomState, optional
        Forwarded to ``DataFrame.sample`` so the shuffle can be reproduced.
        The default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    list of pandas.DataFrame
        One frame per fold, rows ordered label 2, then 0, then 1.

    Raises
    ------
    ValueError
        If ``n_folds`` is smaller than 1.
    """
    if n_folds < 1:
        raise ValueError("n_folds must be at least 1")

    # ensuring stratification across labels: shuffle each class separately.
    # The frame passed in is used (not the module-level `cross_validation`),
    # and the per-fold size follows n_folds instead of a hard-coded 10.
    class_groups = []
    for label in (2, 0, 1):
        rows = dataset[dataset["annotator_label"] == label]
        rows = rows.sample(frac=1, random_state=random_state).reset_index(drop=True)
        class_groups.append((rows, len(rows) // n_folds))

    # creating folds: the i-th slice of every shuffled class, stacked with
    # pd.concat (DataFrame.append was removed in pandas 2.0)
    split = []
    for i in range(n_folds):
        pieces = [rows.iloc[per_fold * i : per_fold * (i + 1)] for rows, per_fold in class_groups]
        split.append(pd.concat(pieces))

    return split

In [None]:
cross_validation = cross_validation.reset_index(drop = True)

In [None]:
splits = cross_validation_split(cross_validation, 10)

In [None]:
#cross_validation.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Data\cross_validation_tfidf_8_3.csv", index = False)

In [None]:
def evaluate_algorithm(dataset, n_folds):
    """Grid-search correlation threshold and C with stratified cross-validation.

    For every fold, every correlation threshold 0.01 .. 0.29 and every C in
    (0.01, 0.1, 1, 10, 100), a logistic regression is fitted on the other
    folds (restricted to features whose 'CorrCoef' in the module-level
    ``tfidf_output_df`` exceeds the threshold) and scored on the held-out fold.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature columns plus 'annotator_label'; passed to cross_validation_split.
    n_folds : int
        Number of folds.

    Returns
    -------
    list of pandas.DataFrame
        One frame per (fold, threshold) with columns 'c_value', 'acc', 'auc',
        'corr_thres' and 'fold_number' (1-based), one row per C value.
    """
    splits = cross_validation_split(dataset, n_folds)

    corr_list = list(np.arange(1,30) * 0.01)
    c_values = [0.01, 0.1, 1, 10, 100]
    df_list = []

    for fold_number, fold in enumerate(splits, start=1):
        # Training data = every fold except the current one
        train = pd.concat(splits[:fold_number - 1] + splits[fold_number:])

        y_train = train["annotator_label"].reset_index(drop = True).astype(int)
        y_test = fold["annotator_label"].reset_index(drop = True).astype(int)

        train = train.drop(columns = ["annotator_label"])
        fold = fold.drop(columns = ["annotator_label"])

        for corr in corr_list:
            # filtering by correlation coefficient (train and held-out fold
            # only; the test set is deliberately not touched during CV)
            top_tfidf_features_df = tfidf_output_df[tfidf_output_df['CorrCoef'] > corr]
            filtered_tfidf_train = train.filter(items=top_tfidf_features_df['Features'])
            filtered_tfidf_fold = fold.filter(items=top_tfidf_features_df['Features'])

            # tuning for optimal C value
            records = []
            for c in c_values:
                _, acc, auc, c_used = logisitic_regression(filtered_tfidf_train, y_train, filtered_tfidf_fold, y_test, c, False, False, False)
                records.append({"c_value": c_used, "acc": acc, "auc": auc})

            # gathering model stats
            iter_df = pd.DataFrame(records, columns=["c_value", "acc", "auc"])
            iter_df['corr_thres'] = corr
            iter_df['fold_number'] = fold_number
            df_list.append(iter_df)

        print("Completed Fold #: ", fold_number)
        print("Stats DF has", len(df_list), "records")

    return df_list

In [None]:
tfidf_all_df = evaluate_algorithm(cross_validation, 10)

In [None]:
tfidf_all_df = pd.concat(tfidf_all_df)

In [None]:
tfidf_all_df = tfidf_all_df.reset_index(drop = True)

In [None]:
tfidf_all_df.to_csv('tfidf_vect_performance_l1_regularization_10_fold_cross_validation_8_4.csv')

In [None]:
corr_list = list(np.arange(1,30) * 0.01)

# Mean acc / auc over the folds for every (corr_thres, c_value) combination.
# groupby replaces the nested loops, which filtered on float equality, wrote
# floats into an integer "filler" frame, and overwrote the global `df`.
# Result rows come out sorted by corr_thres, then c_value.
fold_means = (
    tfidf_all_df
    .groupby(["corr_thres", "c_value"], as_index=False)[["acc", "auc"]]
    .mean()
)

# Kept as a one-element list because the next cell calls pd.concat on it.
average_results_df = [fold_means[["c_value", "acc", "auc", "corr_thres"]]]

In [None]:
average_results_df = pd.concat(average_results_df)
average_results_df

In [None]:
average_results_df[average_results_df['auc'] == max(average_results_df['auc'])]

In [None]:
average_results_df[average_results_df['auc'] == min(average_results_df['auc'])]

In [None]:
len(tfidf_all_df[tfidf_all_df['auc'] > 0.95])

## Running Logistic Regression model with Optimal Parameters (identified from validation set) on Test Set

In [None]:
# Specific parameter setting performance
corr = 0.01
c = 10

y_train = cross_validation["annotator_label"]
y_test = tfidf_test_features_df["annotator_label"]

cross_validation.drop(columns = ["annotator_label"])
tfidf_test_features_df.drop(columns = ["annotator_label"])

# Setting Correlation threshold
top_tfidf_features_df = tfidf_output_df[tfidf_output_df['CorrCoef'] > corr]
filtered_tfidf_train = cross_validation.filter(items=top_tfidf_features_df['Features'])
filtered_tfidf_test = tfidf_test_features_df.filter(items=top_tfidf_features_df['Features'])

# Running model
lr, acc_optimized, auc_optimized, c_list, report, conf_mat = logisitic_regression(filtered_tfidf_train, y_train, filtered_tfidf_test, y_test, c, True, True)

print("\nC: ", c, "\n", report)
print("\nAUC: ", auc_optimized)
print("ACC: ", acc_optimized)
print("\nConfusion Matrix: \n", conf_mat)

In [None]:
FP = conf_mat.sum(axis = 0) - np.diag(conf_mat) 
FN = conf_mat.sum(axis = 1) - np.diag(conf_mat)
TP = np.diag(conf_mat)
TN = conf_mat.sum() - (FP + FN + TP)
FP = FP.astype(float)
FN = FN.astype(float)
TP = TP.astype(float)
TN = TN.astype(float)

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)

# Specificity or true negative rate
TNR = TN/(TN+FP) 

# Precision or positive predictive value
PPV = TP/(TP+FP)

# Negative predictive value
NPV = TN/(TN+FN)

# Fall out or false positive rate
FPR = FP/(FP+TN)

# False negative rate
FNR = FN/(TP+FN)

# False discovery rate
FDR = FP/(TP+FP)

print("Sensitivity: ", TPR)
print("Specificity: ", TNR)
print("NPV: ", NPV)
print("PPV: ", PPV)

## Testing on 20K 

In [None]:
df = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\BigDataSets\Regex_match\Diversity_Sampling\20K_sample_8_9.csv")

In [None]:
def clean_sequence(seq):
    """Normalise a text sequence for vectorisation.

    Every '/' becomes a space, any run of whitespace collapses to one space
    (leading and trailing whitespace is removed), and the result is lower-cased.

    Parameters
    ----------
    seq : str
        Raw sequence text.

    Returns
    -------
    str
        The cleaned, lower-cased sequence.
    """
    # Characters to blank out; extend this string to strip more of them
    specials = '/'
    blank_table = {ord(ch): ' ' for ch in specials}

    # split() with no argument drops all whitespace runs; re-join with one space
    words = seq.translate(blank_table).split()
    return ' '.join(words).lower()

In [None]:
# Drop the first 7 and last 5 characters of every regex_sent value, then
# normalise what is left with clean_sequence.
# NOTE: regex_sent is overwritten in place, so running this cell a second time
# trims the already-cleaned text again — reload df before re-running.
for i in range(len(df)):
    df.loc[i, "regex_sent"] = clean_sequence(df.loc[i]["regex_sent"][7:len(df.loc[i]["regex_sent"]) - 5])

In [None]:
df.loc[2]["regex_sent"]

In [None]:
X = df["regex_sent"].to_list()

In [None]:
model = pickle.load(open(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\lr_8_9.sav", 'rb'))

In [None]:
tfidf_valid

In [None]:
model.score(tfidf_valid,y_valid)

In [None]:
# Counts how many entries of `predictions` equal 2.
# NOTE(review): `predictions` is never assigned in the visible notebook, so this
# cell raises NameError on a fresh kernel — presumably it held the output of
# model.predict(...); confirm and add the missing cell.
filtered = filter(lambda x : x == 2, predictions)
print(len(list(filtered)))

In [None]:
# NOTE(review): pd.concat needs at least one object, so this call raises
# ValueError as written — the frames to combine are missing.
initial_set = pd.concat([])

In [None]:
"""   
      NO   NTR YES
NO  [[439  9  21]
NTR [  5 457   7]
YES [ 26  19 424]]

NO - 439/469 TP, 9/469 NTR when should be NO, and 21/469 YES when should be NO
    Precision: 439/(439+5+26)
    Recall: 439/(439+21+9)
    f1: (439*2)/(470+469)

NTR - 457/469 TP, 5/469 NO when should be NTR, and 7/469 YES when should be NTR
YES - 424/469 TP, 19/469 NTR when should be YES, and 26/469 are NO when should be YES
"""

In [None]:
"""
ROC-AUC:
    TPR = TP / P = TP / (TP+FN) = number of true positives / number of positives
    FPR = FP / N = FP / (FP+TN) = number of false positives / number of negatives
    plots TPR (y-axis) against FPR (x-axis)
    
classifier achieves the good performance on the positive class (high AUC) 
at the cost of a high false negatives rate (or a low number of true negative), resulting in low ACC.
"""

In [None]:
show_number = 20
lr_feature_names = list(filtered_tfidf_train.columns)
# coef_ holds one row of coefficients per class for a multi-class model; only
# row 0 (the first entry of lr.classes_) is ranked and plotted here.
lr_feature_coef = lr.coef_[0].tolist()
lr_feature_importance_df = pd.DataFrame(list(zip(lr_feature_names,lr_feature_coef)), columns = ['feature','lr_coef'])
lr_feature_importance_df = lr_feature_importance_df.sort_values('lr_coef',ascending=False)
top_features = lr_feature_importance_df[:show_number]

# Explicit figure/axes so the plot does not depend on pyplot's global state
fig, ax = plt.subplots(figsize=(15,15))
ax.barh(top_features['feature'], top_features['lr_coef'], color='g')
ax.invert_yaxis()  # largest coefficient at the top
ax.set_title("LR Top Feature Importance Rank")
ax.set_xlabel("lr_coef")
ax.set_ylabel("feature");

## Analysis on Predictions

In [None]:
sample = pd.DataFrame(columns = ['patient_id', 'empi', 'label', 'apoe'])

In [None]:
sample['patient_id'] = df['PatientID']
sample['empi'] = df['EMPI']
sample['label'] = predictions

In [None]:
len(sample["patient_id"].unique())

In [None]:
bib1 = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\BioBank\sd587_03212118253982866_6375194824947826181_Bib.csv")
bib2 = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\BioBank\sd587_03212118253982866_6375194824947826182_Bib.csv")

In [None]:
nodup = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\BioBank\Partners_biobank_APOE_nodup.csv")

In [None]:
bib = pd.concat([bib1, bib2])

In [None]:
bib = bib.reset_index(drop = True)
nodup = nodup.reset_index(drop = True)

In [None]:
allele = str((nodup[nodup["subject_id"] == bib.loc[i]["Subject_Id"]]["APOE"].values))
allele = allele[2:len(allele)-2]
allele

In [None]:
bib = bib[bib['Subject_Id'].notna()]

In [None]:
# Look up the APOE value(s) recorded in `nodup` for every subject in `bib`.
apoe = []
# Iterate over the column values themselves: bib has had rows filtered out
# without an index reset, so label lookups via bib.loc[i] over range(len(bib))
# can hit missing labels (KeyError) and never reach the trailing rows.
for subject_id in tqdm(bib["Subject_Id"]):
    allele = str(nodup.loc[nodup["subject_id"] == subject_id, "APOE"].values)
    # Strip the first two and last two characters of the array's text form,
    # i.e. the brackets and quotes around a single string value
    allele = allele[2:len(allele)-2]
    apoe.append(allele)

In [None]:
len(bib)

In [None]:
bib["APOE"] = apoe

In [None]:
bib.head()

In [None]:
# Look up, for every row of `sample`, the APOE value(s) of the matching EMPI in `bib`.
apoe = []
# The key must be the SAMPLE row's EMPI. The previous code used
# bib.loc[i]["EMPI"], which matched the i-th bib row against itself and ignored
# the sample entirely.
for empi in sample["empi"]:
    # NOTE(review): each entry is a numpy array (possibly empty or with several
    # values); arrays are unhashable, so value_counts() on this column may need
    # them converted to scalars first. Also confirm that sample['empi'] and
    # bib['EMPI'] share a dtype, otherwise nothing will match.
    allele = bib.loc[bib["EMPI"] == empi, "APOE"].values
    apoe.append(allele)

In [None]:
sample["apoe"] = apoe

In [None]:
sample.head()

In [None]:
convert = {2 : "Yes", 1 : "NTR", 0: "NO"}

In [None]:
# sample.label = [convert[i] for i in sample.label]

In [None]:
# Split the sample by predicted class name.
# The cell that converts sample.label to names is commented out, so the labels
# may still be the integer codes; comparing those with "Yes" / "NTR" / "NO"
# gave three empty frames. Codes are mapped through `convert` here, and values
# that are already names pass through unchanged, so this works either way.
label_names = sample["label"].map(lambda value: convert.get(value, value))

y = sample[label_names == "Yes"]
ntr = sample[label_names == "NTR"]
n = sample[label_names == "NO"]

In [None]:
y

In [None]:
y["apoe"].value_counts()

In [None]:
n["apoe"].value_counts()

In [None]:
ntr["apoe"].value_counts()

In [None]:
ntr

In [None]:
sample["apoe"].value_counts()

In [None]:
model