In [129]:
import json
import math
import pickle
from sqlite3 import connect

import numpy as np
import pandas as pd
import regex as re
import seaborn as sns
from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, average_precision_score, classification_report
from sklearn.metrics import roc_auc_score, matthews_corrcoef, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.utils import shuffle
from tqdm import tqdm

In [103]:
entire = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\nlp_annotator-dev\app\load_data\slat_8_16.csv", header = None)
entire.columns = ["sequence_number", "sequence", "note_number"]

## Testing Model Generalizability

## Running Multi-Class Model on Test Set to examine whether sequence level predictions are sound

In [14]:
tfidf_train = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\TFIDF\tfidf_train_8_12.csv")

In [15]:
y_train = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\TFIDF\y_train_8_12.csv")

In [16]:
tfidf_train_features_df = pd.concat([tfidf_train, y_train.reset_index(drop=True)], axis = 1)

In [18]:
def filter_features_by_cor(df, n_trailing_non_features=2):
    """Rank feature columns by absolute Pearson correlation with the label column.

    The label is the last column of ``df``. The first
    ``len(df.columns) - n_trailing_non_features`` columns are scored as features.

    NOTE(review): with the default of 2 the column immediately before the label
    is never scored. That is the original behaviour and is right if that column
    is an index column carried in with the labels (e.g. "Unnamed: 0" from a CSV
    round-trip) -- TODO confirm. Pass ``n_trailing_non_features=1`` if every
    column before the label is a real feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns followed by the label column (last).
    n_trailing_non_features : int, default 2
        Number of columns at the end of ``df`` (label included) that are not
        scored. Should be at least 1 so the label is not correlated with itself.

    Returns
    -------
    pandas.DataFrame
        Columns "Features" (column name) and "CorrCoef" (absolute correlation
        with the label), sorted ascending by "CorrCoef" with a fresh 0..n-1
        index. Columns whose correlation is undefined (e.g. constant columns)
        get NaN, which ``sort_values`` places last.
    """
    n_columns = len(df.columns)
    n_features = max(n_columns - n_trailing_non_features, 0)
    target = df.iloc[:, n_columns - 1].tolist()

    # Collect into a plain list; np.append inside the loop re-copied the whole
    # array on every iteration.
    corrcoefs = [
        abs(np.corrcoef([df.iloc[:, i].tolist(), target]))[0, 1]
        for i in range(n_features)
    ]

    output_df = pd.DataFrame(list(df)[:n_features], columns=['Features'])
    output_df['CorrCoef'] = corrcoefs

    return output_df.sort_values('CorrCoef').reset_index(drop=True)

In [19]:
tfidf_output_df = filter_features_by_cor(tfidf_train_features_df)
tfidf_output_df = tfidf_output_df.sort_values(by = ['CorrCoef'], ascending = False)

In [21]:
tfidf_output_df.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\Feature Selection\tfidf_output_df_8_12.csv", index = False)

In [22]:
top_tfidf_features_df_sample = tfidf_output_df[tfidf_output_df['CorrCoef'] > 0.01]

In [24]:
tfidf_test = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\TFIDF\tfidf_test_8_12.csv")

In [26]:
# NOTE: pickle.load can execute arbitrary code from the file; only load model
# files that come from a trusted source.
# The context manager closes the file handle once the model is loaded.
with open(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\model_8_11.sav", 'rb') as model_file:
    model = pickle.load(model_file)

In [27]:
test_predictions = model.predict(tfidf_test.filter(items = top_tfidf_features_df_sample['Features']))

In [29]:
test_sequences = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\Train Test Split\test_full.csv")

In [33]:
test_sequences = pd.concat([test_sequences, pd.Series(test_predictions)], axis = 1)

## Checking for discrepancies in accuracy (ACC) between sequences that match an always pattern and those that do not

In [85]:
always_patterns = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\EDA\Data\always_patterns_8_1.csv")

In [86]:
always_pattern_regex = always_patterns["Pattern"].to_list()

In [87]:
# Compile case-insensitively straight from the source column instead of
# rewriting the list in place, so re-running this cell never feeds
# already-compiled patterns back into re.compile.
always_pattern_regex = [
    re.compile(pattern, re.IGNORECASE) for pattern in always_patterns["Pattern"]
]

In [88]:
def find_always_pattern_matches(df, col, patterns=None):
    """Record, for every row of ``df``, which always-patterns match the text in ``col``.

    Adds (or overwrites) the column "always_pattern_match" on ``df`` in place.
    Each cell holds a list with one entry per pattern that matched: the
    distinct, lower-cased matches of that pattern concatenated into one string.
    Rows where no pattern matches get an empty list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate; modified in place.
    col : str
        Name of the column holding the text to search.
    patterns : iterable, optional
        Patterns accepted by ``re.findall``. Defaults to the notebook-level
        ``always_pattern_regex`` list, which was the only option originally.

    Returns
    -------
    None
    """
    if patterns is None:
        patterns = always_pattern_regex
    # Materialise once so a generator argument survives the per-row loop.
    patterns = list(patterns)

    matches_per_row = []
    for seq in tqdm(df[col]):
        row_matches = []
        for pattern in patterns:
            # Sort the distinct matches: iteration order of a set of strings is
            # not stable between interpreter runs, which made the joined text
            # differ from run to run.
            found = sorted(set(map(str.lower, re.findall(pattern, seq))))
            if found:
                row_matches.append("".join(found))
        matches_per_row.append(row_matches)

    df["always_pattern_match"] = matches_per_row

In [89]:
find_always_pattern_matches(test_sequences, "sequence")

100%|██████████████████████████████████████████████████████████████████████████████| 293/293 [00:00<00:00, 2989.27it/s]


In [90]:
len(test_sequences[test_sequences["always_pattern_match"].str.len() == 0])

98

In [93]:
# Rename every column of test_sequences positionally.
# NOTE(review): six names are assigned, but no visible cell creates an
# 'always_pattern_classes' column (find_always_pattern_matches only adds
# 'always_pattern_match'). If test_sequences does not already have six columns
# here, this assignment raises a length-mismatch error on a fresh kernel --
# TODO confirm and restore the missing cell or drop the extra name.
test_sequences.columns = ['Unnamed: 0', 'sequence', 'annotator_label', 'predictions', 'always_pattern_match', 'always_pattern_classes']

In [94]:
always_pattern_match = test_sequences[test_sequences["always_pattern_match"].str.len() != 0]
no_always_pattern_match = test_sequences[test_sequences["always_pattern_match"].str.len() == 0]

In [95]:
# Shuffle both groups with a fixed seed so the 50-row samples sliced from them
# (and the CSVs written from those samples) are the same on every run.
always_pattern_match = always_pattern_match.sample(frac = 1, random_state = 42).reset_index(drop = True)
no_always_pattern_match = no_always_pattern_match.sample(frac = 1, random_state = 42).reset_index(drop = True)

In [96]:
always_pattern_sample = always_pattern_match[:50]
no_always_pattern_match_sample = no_always_pattern_match[:50]

In [98]:
always_pattern_sample.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Analysis\AP_sample.csv", index = False)
no_always_pattern_match_sample.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Analysis\NO_AP_sample.csv", index = False)

In [99]:
accuracy_score(test_sequences["annotator_label"], test_sequences["predictions"])

0.8395904436860068

In [100]:
accuracy_score(always_pattern_match["annotator_label"], always_pattern_match["predictions"])

0.9794871794871794

In [101]:
accuracy_score(no_always_pattern_match["annotator_label"], no_always_pattern_match["predictions"])

0.5612244897959183

In [102]:
accuracy_score(no_always_pattern_match_sample["annotator_label"], no_always_pattern_match_sample["predictions"])

0.54

In [123]:
find_always_pattern_matches(entire, "sequence")

100%|████████████████████████████████████████████████████████████████████████| 279224/279224 [01:30<00:00, 3096.00it/s]


In [124]:
len(entire[entire["always_pattern_match"].str.len() != 0]) / len(entire)

0.13389250207718534

## Testing Model Generalizability

In [126]:
match = entire[entire["always_pattern_match"].str.len() != 0]
no_match = entire[entire["always_pattern_match"].str.len() == 0]

In [147]:
percent = (round((len(match) / len(entire)) * 100) + 1) / 100

In [150]:
seq_count = 100

In [174]:
# Size the matched slice first and give the remainder to the unmatched slice.
# Truncating percent * seq_count and (1 - percent) * seq_count independently
# can drop a row to floating-point error (e.g. 0.29 * 100 -> 28.999...), so the
# two slices would no longer add up to seq_count.
match_count = int(round(percent * seq_count))
match_slice = match[:match_count]
no_match_slice = no_match[:seq_count - match_count]
len(match_slice), len(no_match_slice)

(14, 86)

In [175]:
real_world_sample = pd.concat([match_slice, no_match_slice])

In [176]:
def clean_sequence(seq):
    """Normalise a text sequence.

    Every '/' is turned into a space, runs of whitespace are collapsed to a
    single space (leading/trailing whitespace is removed), and the result is
    lower-cased.
    """
    without_slashes = seq.replace('/', ' ')
    tokens = without_slashes.split()
    return ' '.join(tokens).lower()

In [177]:
real_world_sample = real_world_sample.reset_index(drop = True)

In [178]:
# Drop the first and last three characters of every sequence, then normalise it
# with clean_sequence. (Presumably the three characters are wrapper/quoting
# residue from the export -- TODO confirm.)
# Rewriting the whole column handles however many rows the sample actually has,
# instead of assuming exactly seq_count rows labelled 0..seq_count-1, and avoids
# the chained .loc[i]["sequence"] lookups.
real_world_sample["sequence"] = [
    clean_sequence(sequence[3:-3]) for sequence in tqdm(real_world_sample["sequence"])
]

100%|███████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 929.98it/s]


In [180]:
real_world_sample.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Analysis\real_world_sample.csv", index = False)

In [181]:
tfidf_real_world = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\tfidf_real_sample.csv")

In [182]:
real_world_predictions = model.predict(tfidf_real_world.filter(items = top_tfidf_features_df_sample['Features']))

In [183]:
real_world_sample["predictions"] = real_world_predictions

In [184]:
real_world_sample.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Predictions\real_world_sample.csv", index = False)

## Poster Confusion Matrix

Analyzing Results of 13,941 Patients

In [109]:
df = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\data\20K_sample_8_11.csv")
df2 = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Baseline_Model\data\tanish_predictions_with_structured_features.csv")

In [78]:
# Attach the per-patient CI values to df2.
# NOTE(review): p_CI is not defined in any visible cell, so this line raises
# NameError on a fresh kernel. The same CSV is later read into `predictions`
# and indexed by "patient_CI", so the column may already be stored in the
# file -- TODO confirm, then either restore the cell that builds p_CI or
# remove this assignment.
df2["patient_CI"] = p_CI

In [110]:
# Split df2 into three genotype groups by the value of the "APOE" column:
#   e2 -> "e2/e2", "e2/e3"
#   e3 -> "e3/e3"
#   e4 -> "e2/e4 or e1/e3", "e3/e4", "e4/e4"
e2 = df2[df2["APOE"].isin(["e2/e2", "e2/e3"])]
e3 = df2[df2["APOE"].eq("e3/e3")]
e4 = df2[df2["APOE"].isin(["e2/e4 or e1/e3", "e3/e4", "e4/e4"])]

In [111]:
(len(e2) + len(e3) + len(e4)) - len(df2)

0

In [112]:
df2["APOE"].value_counts()

e3/e3             8751
e3/e4             2862
e2/e3             1675
e2/e4 or e1/e3     302
e4/e4              272
e2/e2               79
Name: APOE, dtype: int64

In [113]:
len(e2), len(e3), len(e4)

(1754, 8751, 3436)

In [114]:
e2["AD_Med_or_ICD_Code"].value_counts()

0    1562
1     192
Name: AD_Med_or_ICD_Code, dtype: int64

In [115]:
e3["AD_Med_or_ICD_Code"].value_counts()

0    7771
1     980
Name: AD_Med_or_ICD_Code, dtype: int64

In [116]:
e4["AD_Med_or_ICD_Code"].value_counts()

0    2859
1     577
Name: AD_Med_or_ICD_Code, dtype: int64

In [117]:
e2["patient_CI"].value_counts()

1    1080
0     674
Name: patient_CI, dtype: int64

In [118]:
e3["patient_CI"].value_counts()

1    5415
0    3336
Name: patient_CI, dtype: int64

In [119]:
e4["patient_CI"].value_counts()

1    2169
0    1267
Name: patient_CI, dtype: int64

In [126]:
len(df2[(df2["patient_CI"] == 1) & (df2["AD_Med_or_ICD_Code"] == 1)])

1665

In [127]:
len(df2[(df2["patient_CI"] == 1) & (df2["AD_Med_or_ICD_Code"] == 0)])

6999

In [128]:
len(df2[(df2["patient_CI"] == 0) & (df2["AD_Med_or_ICD_Code"] == 1)])

84

In [129]:
len(df2[(df2["patient_CI"] == 0) & (df2["AD_Med_or_ICD_Code"] == 0)])

5193

## Analyzing False Positives
Model predicts CI, but no Med/ICD code is present <br>
What portion of these are cases where the model is correct, and what portion are cases where the model is wrong?

In [131]:
predictions = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Baseline_Model\data\tanish_predictions_with_structured_features.csv")

In [134]:
FP = predictions[(predictions["patient_CI"] == 1) & (predictions["AD_Med_or_ICD_Code"] == 0)]

In [136]:
sample_set = FP["patient_id"].to_list()[:50]

In [138]:
dataset = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\nlp_annotator-dev\app\load_data\SLAT_production_7_24.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [271]:
pred_proba = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Predictions\lr_binary_classification_8_17.csv")

In [264]:
# For every sampled patient, gather their rows from `dataset` plus the matching
# prediction / probability values from `pred_proba`, in sample_set order.
# Frames are collected in a list and concatenated once: DataFrame.append copied
# the whole frame on every iteration and no longer exists in pandas 2.x.
patient_frames = []
pred = []
proba = []

for patient_id in tqdm(sample_set):
    patient_frames.append(dataset[dataset["PatientID"] == patient_id])
    patient_scores = pred_proba[pred_proba["patient_id"] == patient_id]
    pred += patient_scores["predictions"].to_list()
    proba += patient_scores["probability"].to_list()

# pd.concat rejects an empty list, so keep the original empty-frame result.
sampling_df = pd.concat(patient_frames) if patient_frames else pd.DataFrame()

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:03<00:00, 13.17it/s]


In [268]:
sampling_df["predictions"] = pred
sampling_df["probability"] = proba

In [270]:
sampling_df.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Analysis\50_FP_sample.csv", index = False)

## Fixing Error of Sequences not being padded

In [145]:
not_padded = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\BigDataSets\Regex_match\FINAL\not_padded_matches_7_14.csv")

In [147]:
padded = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\BigDataSets\Regex_match\FINAL\padded_matches_7_14.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [151]:
len(padded[padded["padded_merged_regex_sent"].str.len() < 500])

3339

In [166]:
not_padded.columns = ['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1',
       'EMPI', 'MRN', 'PatientID', 'PatientEncounterID', 'ContactDTS',
       'EncounterTypeDSC', 'NoteID', 'InpatientNoteTypeDSC', 'buckets',
       'NoteCSNID', 'LineNBR', 'NoteTXT', 'regex_location', 'regex_sent',
       'regex_match', 'match_count', 'pruned_regex_location',
       'merged_row_location', 'padded_merged_regex_sent', 'merged_regex_match',
       'sequence_level_regex_location', 'sequence_level_regex_match',
       'char_count', 'note_char_count']

In [168]:
entire_matches = pd.concat([padded, not_padded])

In [172]:
len(entire_matches[entire_matches["padded_merged_regex_sent"].str.len() < 500])

3339

In [204]:
# Build `sequences`: for each row of `dataset`, the "padded_merged_regex_sent"
# of the single row in `entire_matches` that belongs to the same note.
sequences = []
counter = 0

for index, row in dataset.iterrows():
    # Candidate rows: same NoteID. (A separate name is used so the earlier
    # `match` DataFrame is not overwritten.)
    note_matches = entire_matches[entire_matches["NoteID"] == row["NoteID"]]

    # Several rows share this NoteID: narrow down by the note text.
    if (len(note_matches) > 1):
        note_matches = note_matches[note_matches["NoteTXT"] == str(row["NoteTXT"])]

    # Exactly one row must remain. The old check `len(sequences) == counter`
    # ran after an unconditional append and could never fire, so a note with
    # no match crashed with IndexError instead of being reported.
    if (len(note_matches) != 1):
        print("Issue")
        break

    sequences.append(note_matches["padded_merged_regex_sent"].values[0])

    counter += 1
    if (counter % 100000 == 0):
        print("Finished ", counter, " sequences", len(sequences))

Finished  100000  sequences 100000
Finished  200000  sequences 200000


In [207]:
len(sequences)

279224

In [None]:
# dataset["regex_sent"] = sequences

In [208]:
# Read the whole gui_sentence table from the annotator's SQLite database.
# The connection is closed as soon as the table is loaded, even if the read
# fails, so the database file is not left open for the rest of the session.
conn = connect(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\nlp_annotator-dev\app\db.sqlite3")
try:
    gui_sentence = pd.read_sql("SELECT * FROM gui_sentence", conn)
finally:
    conn.close()

In [223]:
new = pd.DataFrame()
new["id"] = gui_sentence["id"]
new["Contents"] = sequences
new["Note_id"] = gui_sentence["Note_id"]

In [226]:
new["Contents"] = new["Contents"].apply(json.dumps)

In [227]:
import json

def is_json(myjson):
    """Return True if ``myjson`` can be parsed as JSON, False otherwise.

    Parameters
    ----------
    myjson : object
        Usually a str; anything ``json.loads`` rejects yields False.

    Returns
    -------
    bool
    """
    try:
        json.loads(myjson)
    except (ValueError, TypeError):
        # ValueError (incl. json.JSONDecodeError): malformed JSON text.
        # TypeError: the value is not str/bytes/bytearray (e.g. None or NaN),
        # which previously escaped as an exception instead of returning False.
        return False
    return True

is_json(new.loc[7658]["Contents"])

True

In [230]:
new.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\nlp_annotator-dev\app\slat_8_16.csv", index = False, header = False)

In [232]:
dataset["regex_sent"] = new["Contents"]

In [234]:
dataset.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\BigDataSets\SLAT\SLAT_8_17.csv", index = False)