In [1]:
import pandas as pd
from tqdm import tqdm
import pickle
import numpy as np
import regex as re
import math 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix, average_precision_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.utils import shuffle
from sklearn import preprocessing
import seaborn as sns
from sklearn.metrics import roc_auc_score, matthews_corrcoef, accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import cross_val_score

In [3]:
entire = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\Sequences\slat_8_16.csv", header = None)
entire.columns = ["sequence_number", "sequence", "note_number"]

## Testing Model Generability

## Running Multi-Class Model on Test Set to examine whether sequence level predictions are sound

In [18]:
tfidf_train = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\TFIDF\tfidf_train_8_12.csv")

In [19]:
y_train = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\TFIDF\y_train_8_12.csv")

In [20]:
tfidf_train_features_df = pd.concat([tfidf_train, y_train.reset_index(drop=True)], axis = 1)

In [21]:
def filter_features_by_cor(df):
    m = len(df.columns)
    output = df.iloc[:,m-1] 
    output_list = output.tolist()
    corrcoef_array = []

    for i in range(0,m-2):
        input_list = df.iloc[:,i].tolist()
        cols = [input_list, output_list]
        corrcoef = abs(np.corrcoef(cols)) 
        corrcoef_array = np.append(corrcoef_array,corrcoef[0,1])

    feature_names = list(df)
    feature_names = feature_names[0:m-2]
    
    output_df = pd.DataFrame(feature_names, columns=['Features'])
    output_df['CorrCoef'] = corrcoef_array
    output_df = output_df.sort_values('CorrCoef')
    output_df = output_df.reset_index()
    output_df = output_df.drop(columns = "index")
    
    return output_df

In [22]:
tfidf_output_df = filter_features_by_cor(tfidf_train_features_df)
tfidf_output_df = tfidf_output_df.sort_values(by = ['CorrCoef'], ascending = False)

In [None]:
tfidf_output_df.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\Feature Selection\tfidf_output_df_8_12.csv", index = False)

In [23]:
top_tfidf_features_df_sample = tfidf_output_df[tfidf_output_df['CorrCoef'] > 0.01]

In [24]:
tfidf_test = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\TFIDF\tfidf_test_8_12.csv")

In [25]:
model = pickle.load(open(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\model_8_11.sav", 'rb'))

In [26]:
test_predictions = model.predict(tfidf_test.filter(items = top_tfidf_features_df_sample['Features']))

In [27]:
test_sequences = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\Train Test Split\test_full.csv")

In [28]:
test_sequences = pd.concat([test_sequences, pd.Series(test_predictions)], axis = 1)

In [29]:
test_sequences.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Predictions\test_set_preds_8_25.csv", index = False)

## Seeing if there are discrepancies between ACC of sequences that have matched with and without always patterns

In [30]:
test_sequences = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Predictions\test_set_preds_8_25.csv")

In [31]:
always_patterns = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\EDA\Data\always_patterns_8_1.csv")

In [32]:
always_pattern_regex = always_patterns["Pattern"].to_list()

In [33]:
for i in range(len(always_pattern_regex)):
    always_pattern_regex[i] = re.compile(always_pattern_regex[i], re.IGNORECASE)

In [34]:
len(always_pattern_regex)

35

In [341]:
def find_always_pattern_matches(df, col):
    a = []
    counter = 0

    for seq in tqdm(df[col]):
        curr = []
        classes = []

        for p in (always_pattern_regex):
            m = list(set(re.findall(p, seq)))
            m = list(set(map(str.lower, m)))
            
            if (m != []):
                curr.append("".join(m))

        a.append(curr)
        counter += 1
    
    df["always_pattern_match"] = a
    
    return df

In [36]:
find_always_pattern_matches(test_sequences, "sequence")

100%|██████████████████████████████████████████████████████████████████████████████| 293/293 [00:00<00:00, 2339.64it/s]


In [37]:
len(test_sequences[test_sequences["always_pattern_match"].str.len() == 0])

98

In [40]:
test_sequences.columns = ['Unnamed: 0', 'sequence', 'annotator_label', 'predictions', 'always_pattern_match']

In [41]:
always_pattern_match = test_sequences[test_sequences["always_pattern_match"].str.len() != 0]
no_always_pattern_match = test_sequences[test_sequences["always_pattern_match"].str.len() == 0]

In [42]:
always_pattern_match = always_pattern_match.sample(frac = 1).reset_index(drop = True) 
no_always_pattern_match = no_always_pattern_match.sample(frac = 1).reset_index(drop = True) 

In [43]:
always_pattern_sample = always_pattern_match[:50]
no_always_pattern_match_sample = no_always_pattern_match[:50]

In [44]:
always_pattern_sample.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Analysis\AP_sample_8_25.csv", index = False)
no_always_pattern_match_sample.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Analysis\NO_AP_sample_8_25.csv", index = False)

In [45]:
accuracy_score(test_sequences["annotator_label"], test_sequences["predictions"])

0.8395904436860068

In [46]:
accuracy_score(always_pattern_match["annotator_label"], always_pattern_match["predictions"])

0.9794871794871794

In [47]:
accuracy_score(no_always_pattern_match["annotator_label"], no_always_pattern_match["predictions"])

0.5612244897959183

In [48]:
accuracy_score(no_always_pattern_match_sample["annotator_label"], no_always_pattern_match_sample["predictions"])

0.52

In [49]:
find_always_pattern_matches(entire, "sequence")

100%|████████████████████████████████████████████████████████████████████████| 279224/279224 [01:32<00:00, 3009.94it/s]


In [50]:
len(entire[entire["always_pattern_match"].str.len() != 0]) / len(entire)

0.1456071111365785

## Testing Model Generability

In [51]:
match = entire[entire["always_pattern_match"].str.len() != 0]
no_match = entire[entire["always_pattern_match"].str.len() == 0]

In [52]:
percent = (round((len(match) / len(entire)) * 100) + 1) / 100

In [53]:
seq_count = 100

In [54]:
match_slice = match[:int(percent * seq_count)] 
no_match_slice = no_match[:int((1 - percent) * seq_count)]
len(match_slice), len(no_match_slice)

(16, 84)

In [55]:
real_world_sample = pd.concat([match_slice, no_match_slice])

In [56]:
def clean_sequence(seq):
    #getting rid of special characters
    specials = '/' #etc
    seq_no_special_chars = seq.translate(str.maketrans(specials, ' '*len(specials)))
    
    #having only 1 space between words
    n = 1
    seq_no_spaces = (' '*n).join(seq_no_special_chars.split())
    
    return seq_no_spaces.lower()

In [57]:
real_world_sample = real_world_sample.reset_index(drop = True)

In [58]:
for i in tqdm(range((seq_count))):
    real_world_sample.loc[i, "sequence"] = clean_sequence(real_world_sample.loc[i]["sequence"][3:len(real_world_sample.loc[i]["sequence"]) - 3])

100%|██████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 1369.05it/s]


In [180]:
real_world_sample.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Analysis\real_world_sample.csv", index = False)

In [80]:
tfidf_real_world = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\TFIDF\tfidf_real_sample.csv")

In [81]:
real_world_predictions = model.predict(tfidf_real_world.filter(items = top_tfidf_features_df_sample['Features']))

In [92]:
real_world_predictions[:8]

array([2, 1, 2, 1, 0, 0, 1, 2], dtype=int64)

In [93]:
real_world_predictions[15:56]

array([2, 0, 0, 0, 0, 1, 0, 2, 1, 1, 2, 2, 0, 2, 1, 0, 2, 1, 0, 0, 2, 0,
       0, 2, 2, 0, 2, 0, 1, 0, 2, 1, 1, 2, 2, 1, 2, 0, 0, 2, 0],
      dtype=int64)

In [94]:
tfidf_real_world.shape

(100, 24098)

In [126]:
fifty_seq = pd.read_excel(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Predictions\50_seq_real_world_sample.xlsx")

In [127]:
tfidf_50_seq = tfidf_real_world[:8]

In [128]:
tfidf_50_seq = tfidf_50_seq.append(tfidf_real_world[15:56])

In [129]:
fifty_proba = model.predict_proba(tfidf_50_seq.filter(items = top_tfidf_features_df_sample['Features']))

In [130]:
max_proba = []
for arr in fifty_proba:
    max_proba.append(max(arr))

In [131]:
len(max_proba), len(fifty_seq)

(49, 49)

In [132]:
fifty_seq["proba"] = max_proba

In [137]:
auc = roc_auc_score(fifty_seq["label"], fifty_proba, multi_class = "ovr", average = "weighted")

In [146]:
acc = metrics.accuracy_score(fifty_seq["predictions"], fifty_seq["label"])

In [147]:
auc, acc

(0.7777106030382077, 0.6530612244897959)

In [184]:
real_world_sample.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Predictions\real_world_sample.csv", index = False)

## Poster Confusion Matrix

Analyzing Results of 13,941 Patients

In [109]:
df = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\data\20K_sample_8_11.csv")
df2 = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Baseline_Model\data\tanish_predictions_with_structured_features.csv")

In [78]:
df2["patient_CI"] = p_CI

In [110]:
e2 = df2[(df2["APOE"] == "e2/e2") | (df2["APOE"] == "e2/e3")]
e3 = df2[df2["APOE"] == "e3/e3"]
e4 = df2[(df2["APOE"] == "e2/e4 or e1/e3") | (df2["APOE"] == "e3/e4") | (df2["APOE"] == "e4/e4")]

In [111]:
(len(e2) + len(e3) + len(e4)) - len(df2)

0

In [112]:
df2["APOE"].value_counts()

e3/e3             8751
e3/e4             2862
e2/e3             1675
e2/e4 or e1/e3     302
e4/e4              272
e2/e2               79
Name: APOE, dtype: int64

In [113]:
len(e2), len(e3), len(e4)

(1754, 8751, 3436)

In [114]:
e2["AD_Med_or_ICD_Code"].value_counts()

0    1562
1     192
Name: AD_Med_or_ICD_Code, dtype: int64

In [115]:
e3["AD_Med_or_ICD_Code"].value_counts()

0    7771
1     980
Name: AD_Med_or_ICD_Code, dtype: int64

In [116]:
e4["AD_Med_or_ICD_Code"].value_counts()

0    2859
1     577
Name: AD_Med_or_ICD_Code, dtype: int64

In [117]:
e2["patient_CI"].value_counts()

1    1080
0     674
Name: patient_CI, dtype: int64

In [118]:
e3["patient_CI"].value_counts()

1    5415
0    3336
Name: patient_CI, dtype: int64

In [119]:
e4["patient_CI"].value_counts()

1    2169
0    1267
Name: patient_CI, dtype: int64

In [126]:
len(df2[(df2["patient_CI"] == 1) & (df2["AD_Med_or_ICD_Code"] == 1)])

1665

In [127]:
len(df2[(df2["patient_CI"] == 1) & (df2["AD_Med_or_ICD_Code"] == 0)])

6999

In [128]:
len(df2[(df2["patient_CI"] == 0) & (df2["AD_Med_or_ICD_Code"] == 1)])

84

In [129]:
len(df2[(df2["patient_CI"] == 0) & (df2["AD_Med_or_ICD_Code"] == 0)])

5193

## Analyzing False Positives
Model Predicts CI, no prescence of Med/ICD Code <br>
What portion of these are where models if correct, and what portion is where model is wrong

In [131]:
predictions = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Baseline_Model\data\tanish_predictions_with_structured_features.csv")

In [134]:
FP = predictions[(predictions["patient_CI"] == 1) & (predictions["AD_Med_or_ICD_Code"] == 0)]

In [136]:
sample_set = FP["patient_id"].to_list()[:50]

In [138]:
dataset = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\nlp_annotator-dev\app\load_data\SLAT_production_7_24.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [271]:
pred_proba = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Predictions\lr_binary_classification_8_17.csv")

In [264]:
sampling_df = pd.DataFrame()
pred = []
proba = []

for i in tqdm(range(len(sample_set))):
    sampling_df = sampling_df.append(dataset[dataset["PatientID"] == sample_set[i]])
    pred += (pred_proba[pred_proba["patient_id"] == sample_set[i]]["predictions"].to_list())
    proba += (pred_proba[pred_proba["patient_id"] == sample_set[i]]["probability"].to_list())

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:03<00:00, 13.17it/s]


In [268]:
sampling_df["predictions"] = pred
sampling_df["probability"] = proba

In [270]:
sampling_df.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Analysis\50_FP_sample.csv", index = False)

## Fixing Error of Sequences not being padded

In [145]:
not_padded = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\BigDataSets\Regex_match\FINAL\not_padded_matches_7_14.csv")

In [147]:
padded = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\BigDataSets\Regex_match\FINAL\padded_matches_7_14.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [151]:
len(padded[padded["padded_merged_regex_sent"].str.len() < 500])

3339

In [166]:
not_padded.columns = ['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1',
       'EMPI', 'MRN', 'PatientID', 'PatientEncounterID', 'ContactDTS',
       'EncounterTypeDSC', 'NoteID', 'InpatientNoteTypeDSC', 'buckets',
       'NoteCSNID', 'LineNBR', 'NoteTXT', 'regex_location', 'regex_sent',
       'regex_match', 'match_count', 'pruned_regex_location',
       'merged_row_location', 'padded_merged_regex_sent', 'merged_regex_match',
       'sequence_level_regex_location', 'sequence_level_regex_match',
       'char_count', 'note_char_count']

In [168]:
entire_matches = pd.concat([padded, not_padded])

In [172]:
len(entire_matches[entire_matches["padded_merged_regex_sent"].str.len() < 500])

3339

In [204]:
sequences = []
counter = 0

for index, row in dataset.iterrows():
    match = entire_matches[entire_matches["NoteID"] == row["NoteID"]]
    
    if (len(match) > 1):
        match = match[match["NoteTXT"] == str(row["NoteTXT"])]
        if (len(match) > 1):
            print("Issue")
            break
        sequences.append(match["padded_merged_regex_sent"].values[0])
    else:
        sequences.append(match["padded_merged_regex_sent"].values[0])
        
    if (len(sequences) == counter):
        print("Issue")
        break
        
    counter += 1
    if (counter % 100000 == 0):
        print("Finished ", counter, " sequences", len(sequences))

Finished  100000  sequences 100000
Finished  200000  sequences 200000


In [207]:
len(sequences)

279224

In [None]:
# dataset["regex_sent"] = sequences

In [208]:
from sqlite3 import connect
conn = connect(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\nlp_annotator-dev\app\db.sqlite3")
gui_sentence = pd.read_sql("SELECT * FROM gui_sentence", conn)

In [223]:
new = pd.DataFrame()
new["id"] = gui_sentence["id"]
new["Contents"] = sequences
new["Note_id"] = gui_sentence["Note_id"]

In [226]:
new["Contents"] = new["Contents"].apply(json.dumps)

In [227]:
import json

def is_json(myjson):
    try:
        json_object = json.loads(myjson)
    except ValueError as e:
        return False
    return True

is_json(new.loc[7658]["Contents"])

True

In [230]:
new.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\nlp_annotator-dev\app\slat_8_16.csv", index = False, header = False)

In [232]:
dataset["regex_sent"] = new["Contents"]

In [234]:
dataset.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\BigDataSets\SLAT\SLAT_8_17.csv", index = False)

## Getting Straftification of Sequences that are matched by always patterns by keyword

In [361]:
slat = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\BigDataSets\SLAT\SLAT_8_17.csv")

In [362]:
regex = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\EDA\Getting_Data\keywords.csv")

In [363]:
k = regex["REGEX"].to_list()
c = regex["CASE"].to_list()
keywords = []

for i in range(len(k)):
    if (c[i] == 0):
        keywords.append(re.compile(k[i][5:], re.IGNORECASE))
    elif (c[i] == 1):
        keywords.append(re.compile(k[i]))
len(keywords)

18

In [365]:
def matches(df, col):
    l = []
    for note in tqdm(df[col]):
        curr = []
        for p in (keywords):
            m = list(set(re.findall(p, note)))
            m = list(set(map(str.lower, m)))
            if (m != []):
                curr.append("".join(m))
        #print(curr)
        #print(l)
        l.append(str(curr))
    
    df["match"] = l

In [366]:
matches(slat, "regex_sent")

100%|████████████████████████████████████████████████████████████████████████| 279224/279224 [00:49<00:00, 5673.05it/s]


In [368]:
freq = {"dementia":0
,"cognition":0
,"cognition":0
,"memory":0
,"mmse":0
,"moca":0
,"alzheimer":0
,'cognitive impairment':0
,"mci":0
,"cerebellar":0
,"neurocognitive":0
,"lewy":0
,"pick's":0
,"corticobasal":0
,"cerebral":0
,"cerebrovascular":0
,"amnesia":0
,"ad": 0
,"lbd": 0
}

In [369]:
import warnings
warnings.filterwarnings("ignore")

In [370]:
slat_copy = slat.copy()

In [379]:
ap_by_keyword = pd.DataFrame(columns = ["keyword", "percentage"])

In [380]:
counter = 0
for k in keywords:
    subset = slat_copy[slat_copy["match"].str.contains(list(freq.keys())[counter])]
    subset = find_always_pattern_matches(subset, "regex_sent")
    ap_match = len(subset[subset["always_pattern_match"].str.len() != 0])
    ap_by_keyword.loc[counter] = (list(freq.keys())[counter], ap_match/ len(subset))
    print("Percent of ", list(freq.keys())[counter], " with AP match: ", ((ap_match) / len(subset)), " Count: ", ap_match, "Len Df: ", len(ap_by_keyword))
    counter += 1

100%|██████████████████████████████████████████████████████████████████████████| 51091/51091 [00:18<00:00, 2702.99it/s]


Percent of  dementia  with AP match:  0.16809222759390108  Count:  8588 Len Df:  1


100%|██████████████████████████████████████████████████████████████████████████| 87581/87581 [00:30<00:00, 2907.55it/s]


Percent of  cognition  with AP match:  0.11967207499343464  Count:  10481 Len Df:  2


100%|████████████████████████████████████████████████████████████████████████| 109472/109472 [00:37<00:00, 2913.99it/s]
  0%|                                                                                         | 0/2154 [00:00<?, ?it/s]

Percent of  memory  with AP match:  0.27036137094416834  Count:  29597 Len Df:  3


100%|████████████████████████████████████████████████████████████████████████████| 2154/2154 [00:00<00:00, 2269.78it/s]
  0%|                                                                                         | 0/9799 [00:00<?, ?it/s]

Percent of  mmse  with AP match:  0.5297121634168988  Count:  1141 Len Df:  4


100%|████████████████████████████████████████████████████████████████████████████| 9799/9799 [00:03<00:00, 2540.33it/s]
  0%|                                                                                        | 0/20772 [00:00<?, ?it/s]

Percent of  moca  with AP match:  0.47637514032044087  Count:  4668 Len Df:  5


100%|██████████████████████████████████████████████████████████████████████████| 20772/20772 [00:07<00:00, 2843.90it/s]
  0%|                                                                                        | 0/20468 [00:00<?, ?it/s]

Percent of  alzheimer  with AP match:  0.5875698055074138  Count:  12205 Len Df:  6


100%|██████████████████████████████████████████████████████████████████████████| 20468/20468 [00:07<00:00, 2759.01it/s]
  0%|                                                                                         | 0/3933 [00:00<?, ?it/s]

Percent of  cognitive impairment  with AP match:  0.21003517686144224  Count:  4299 Len Df:  7


100%|████████████████████████████████████████████████████████████████████████████| 3933/3933 [00:01<00:00, 2475.97it/s]
  0%|                                                                                        | 0/26890 [00:00<?, ?it/s]

Percent of  mci  with AP match:  0.2929061784897025  Count:  1152 Len Df:  8


100%|██████████████████████████████████████████████████████████████████████████| 26890/26890 [00:08<00:00, 3047.20it/s]
  0%|                                                                                         | 0/7782 [00:00<?, ?it/s]

Percent of  cerebellar  with AP match:  0.21904053551506136  Count:  5890 Len Df:  9


100%|████████████████████████████████████████████████████████████████████████████| 7782/7782 [00:03<00:00, 2449.03it/s]
  0%|                                                                                         | 0/2599 [00:00<?, ?it/s]

Percent of  neurocognitive  with AP match:  0.3652017476227191  Count:  2842 Len Df:  10


100%|████████████████████████████████████████████████████████████████████████████| 2599/2599 [00:00<00:00, 3239.87it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 44/44 [00:00<?, ?it/s]


Percent of  lewy  with AP match:  0.1789149672951135  Count:  465 Len Df:  11
Percent of  pick's  with AP match:  0.25  Count:  11 Len Df:  12


100%|██████████████████████████████████████████████████████████████████████████████| 150/150 [00:00<00:00, 1140.29it/s]


Percent of  corticobasal  with AP match:  0.43333333333333335  Count:  65 Len Df:  13


100%|██████████████████████████████████████████████████████████████████████████| 45937/45937 [00:16<00:00, 2827.18it/s]
  0%|                                                                                        | 0/36437 [00:00<?, ?it/s]

Percent of  cerebral  with AP match:  0.12131832727431047  Count:  5573 Len Df:  14


100%|██████████████████████████████████████████████████████████████████████████| 36437/36437 [00:11<00:00, 3045.32it/s]
  0%|                                                                                         | 0/3725 [00:00<?, ?it/s]

Percent of  cerebrovascular  with AP match:  0.09440952877569503  Count:  3440 Len Df:  15


100%|████████████████████████████████████████████████████████████████████████████| 3725/3725 [00:01<00:00, 2878.00it/s]
  0%|                                                                                         | 0/2707 [00:00<?, ?it/s]

Percent of  amnesia  with AP match:  0.15892617449664428  Count:  592 Len Df:  16


100%|████████████████████████████████████████████████████████████████████████████| 2707/2707 [00:01<00:00, 2368.40it/s]
  0%|                                                                                          | 0/228 [00:00<?, ?it/s]

Percent of  ad  with AP match:  0.41189508681196896  Count:  1115 Len Df:  17


100%|██████████████████████████████████████████████████████████████████████████████| 228/228 [00:00<00:00, 3578.10it/s]

Percent of  lbd  with AP match:  0.4166666666666667  Count:  95 Len Df:  18





In [382]:
ap_by_keyword.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Documentation\always_pattern_rates_by_keyword.csv", index = False)

## Fine Tuning Always Patterns

In [398]:
pattern1 = r"\bincluding\s*but\s*not\s*limited\s*to\s*[^.]*(dementia|MCI|mild\s*cognitive\s*impairment|memory\s*loss|memory\s*issues|cognitive\s*impairment|memory\s*problem)"

pattern2 = r"\bside\s*effects?\s*[^.]*(dementia|MCI|mild\s*cognitive\s*impairment|memory\s*loss|memory\s*issues|cognitive\s*impairment|memory\s*problem)"

# pattern3 = r"\brisk\s*of?\s*[^.]*(dementia|MCI|mild\s*cognitive\s*impairment|memory\s*loss|memory\s*issues|cognitive\s*impairment|memory\s*problem)"
# pattern4 = r"\bconcerns?\s*(for|regarding)\s*[^.]*(dementia|MCI|mild\s*cognitive\s*impairment|memory\s*loss|memory\s*issues|cognitive\s*impairment|memory\s*problem)"

pattern5 = r"\bpatients\s*with\s*[^\.]*(cognitive\s*impairments?|cognitive\s*disabilities|cognitive\s*concerns?|dementia|Alzheimer(s|’s)?)"

pattern6 = r"\bpatients\s*requiring\s*an\s*additional\s*person\s*to\s*understand\s*their\s*clinical\s*diagnosis"

### Getting sequences that have not been annotated yet in SLAT

In [221]:
model_matrix = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\model_matrix.csv")

In [232]:
not_in_slat = slat.copy()
counter = 0

for i in tqdm(range(len(model_matrix))):
    match = not_in_slat[not_in_slat["regex_sent"] == str(model_matrix.loc[i]["original"])]
    not_in_slat = not_in_slat.drop(match.index)
    counter += 1
    if (counter % 1000 == 0):
        print("Finished ", counter, " sequences left", len(not_in_slat)) 

 11%|████████▏                                                                     | 1001/9527 [02:30<20:23,  6.97it/s]

Finished  1000  sequences left 278121


 21%|████████████████▍                                                             | 2001/9527 [05:02<18:52,  6.64it/s]

Finished  2000  sequences left 277030


 31%|████████████████████████▌                                                     | 3001/9527 [07:29<14:00,  7.77it/s]

Finished  3000  sequences left 275957


 42%|████████████████████████████████▊                                             | 4001/9527 [09:55<12:26,  7.40it/s]

Finished  4000  sequences left 274882


 52%|████████████████████████████████████████▉                                     | 5001/9527 [12:23<10:24,  7.25it/s]

Finished  5000  sequences left 273833


 63%|█████████████████████████████████████████████████▏                            | 6001/9527 [14:47<08:46,  6.70it/s]

Finished  6000  sequences left 272801


 73%|█████████████████████████████████████████████████████████▎                    | 7001/9527 [17:18<06:07,  6.88it/s]

Finished  7000  sequences left 271785


 84%|█████████████████████████████████████████████████████████████████▌            | 8001/9527 [19:47<03:54,  6.51it/s]

Finished  8000  sequences left 270777


 94%|█████████████████████████████████████████████████████████████████████████▋    | 9001/9527 [22:11<01:04,  8.12it/s]

Finished  9000  sequences left 269810


100%|██████████████████████████████████████████████████████████████████████████████| 9527/9527 [23:26<00:00,  6.77it/s]


In [234]:
not_in_slat.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\BigDataSets\SLAT\not_annotated_8_27.csv", index = False)

### Getting Matches for Potential Always Patterns

In [474]:
def fine_tune_always_patterns(df, col, regex):
    regex_pattern = re.compile(regex, re.IGNORECASE)
    a = []

    for seq in tqdm(df[col]):
        match = regex_pattern.search(seq)
        if (match is not None):
            a.append(match.group())
        else:
            a.append("")
#         curr = []
#         classes = []

#         m = list(set(re.findall(regex_pattern, seq)))
#         # in case m is a list of tuples
#         if (len(m) >= 1 and type(m[0]) is tuple):
#             converted_m = []
#             for t in m:
#                 for x in t:
#                     if (x != ""):
#                         converted_m.append(x)
#             converted_m = list(set(map(str.lower, converted_m)))
#             if (converted_m != []):
#                 curr.append("".join(converted_m))
#         else:
#             m = list(set(map(str.lower, m)))
#             if (m != []):
#                 curr.append("".join(m))
#         a.append(curr)
    
    df["match"] = a
    return (df[df["match"].str.len() != 0], len(df[df["match"].str.len() != 0]))

#### Colin Patterns

In [511]:
pattern1_df, pattern1_count = fine_tune_always_patterns(not_in_slat, "regex_sent", pattern1)

100%|██████████████████████████████████████████████████████████████████████| 269311/269311 [00:00<00:00, 332306.48it/s]


In [512]:
(pattern1_count)

17

In [513]:
pattern1_df.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Analysis\Always Patterns\colin_pattern1_matches.csv", index = False)

In [514]:
pattern2_df, pattern2_count = fine_tune_always_patterns(not_in_slat, "regex_sent", pattern2)

100%|██████████████████████████████████████████████████████████████████████| 269311/269311 [00:01<00:00, 148415.67it/s]


In [515]:
pattern2_count

399

In [516]:
pattern2_df.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Analysis\colin_pattern2_matches.csv", index = False)

In [517]:
pattern5_df, pattern5_count = fine_tune_always_patterns(not_in_slat, "regex_sent", pattern5)

100%|██████████████████████████████████████████████████████████████████████| 269311/269311 [00:00<00:00, 285033.10it/s]


In [518]:
pattern5_count

259

In [519]:
pattern5_df.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Analysis\colin_pattern5_matches.csv", index = False)

In [520]:
pattern6_df, pattern6_count = fine_tune_always_patterns(not_in_slat, "regex_sent", pattern6)

100%|██████████████████████████████████████████████████████████████████████| 269311/269311 [00:00<00:00, 328272.88it/s]


In [521]:
pattern6_count

12

In [522]:
pattern6_df.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Analysis\colin_pattern6_matches.csv", index = False)

#### My Patterns

In [500]:
fifty_seq_real_world = pd.read_excel(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Predictions\50_seq_real_world_sample.xlsx")

In [501]:
fifty_seq_real_world = fifty_seq_real_world[fifty_seq_real_world["regex"].str.len() > 0]

In [502]:
len(fifty_seq_real_world)

12

In [510]:
counts = []
counter = 0
for regex in fifty_seq_real_world["regex"]:
    match_df, count = fine_tune_always_patterns(not_in_slat, "regex_sent", regex)
    match_df.to_csv(r"pattern_{}_real_world.csv".format(counter), index = False)
    counter += 1
    counts.append(count)

100%|██████████████████████████████████████████████████████████████████████| 269311/269311 [00:01<00:00, 213083.17it/s]
100%|██████████████████████████████████████████████████████████████████████| 269311/269311 [00:01<00:00, 212660.34it/s]
100%|██████████████████████████████████████████████████████████████████████| 269311/269311 [00:00<00:00, 383241.40it/s]
100%|██████████████████████████████████████████████████████████████████████| 269311/269311 [00:01<00:00, 182040.17it/s]
100%|███████████████████████████████████████████████████████████████████████| 269311/269311 [00:03<00:00, 78324.98it/s]
100%|███████████████████████████████████████████████████████████████████████| 269311/269311 [00:03<00:00, 76904.17it/s]
100%|██████████████████████████████████████████████████████████████████████| 269311/269311 [00:01<00:00, 215059.77it/s]
100%|██████████████████████████████████████████████████████████████████████| 269311/269311 [00:01<00:00, 165260.40it/s]
100%|███████████████████████████████████

In [507]:
fifty_seq_real_world["match_counts"] = counts

In [509]:
fifty_seq_real_world.to_excel(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Analysis\Always Patterns\12_real_world_errors.xlsx", index = False)

In [523]:
family_pattern = r"(husband|wife|son|daughter|aunt|uncle)"

In [528]:
family_matches, family_match_count = fine_tune_always_patterns(not_in_slat, "regex_sent", family_pattern)

100%|███████████████████████████████████████████████████████████████████████| 269311/269311 [00:18<00:00, 14266.35it/s]


In [530]:
family_dementia_pattern = r"(husband|wife|son|daughter|aunt|uncle)\s*[^.]*dementia"

In [533]:
family_dementia_matches, family_dementia_match_count = fine_tune_always_patterns(not_in_slat, "regex_sent", family_dementia_pattern)

100%|███████████████████████████████████████████████████████████████████████| 269311/269311 [00:03<00:00, 73034.37it/s]


In [534]:
family_dementia_match_count

8903

### Current Always Patterns in SLAT

In [536]:
curr_ap = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\EDA\Data\always_patterns_8_1.csv")

In [538]:
curr_ap = curr_ap["Pattern"].to_list()

In [539]:
counts = []
counter = 0
for regex in curr_ap:
    match_df, count = fine_tune_always_patterns(not_in_slat, "regex_sent", regex)
    match_df.to_csv(r"Storage/Analysis/Always Patterns/Current/curr_pattern_{}_real_world.csv".format(counter), index = False)
    counter += 1
    counts.append(count)

100%|██████████████████████████████████████████████████████████████████████| 269311/269311 [00:01<00:00, 180317.14it/s]
100%|██████████████████████████████████████████████████████████████████████| 269311/269311 [00:00<00:00, 379379.15it/s]
100%|██████████████████████████████████████████████████████████████████████| 269311/269311 [00:01<00:00, 247632.02it/s]
100%|██████████████████████████████████████████████████████████████████████| 269311/269311 [00:01<00:00, 225212.95it/s]
100%|██████████████████████████████████████████████████████████████████████| 269311/269311 [00:00<00:00, 316044.35it/s]
100%|██████████████████████████████████████████████████████████████████████| 269311/269311 [00:00<00:00, 393241.69it/s]
100%|██████████████████████████████████████████████████████████████████████| 269311/269311 [00:00<00:00, 389679.65it/s]
100%|██████████████████████████████████████████████████████████████████████| 269311/269311 [00:01<00:00, 232096.09it/s]
100%|███████████████████████████████████

In [540]:
curr_ap = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\EDA\Data\always_patterns_8_1.csv")

In [541]:
curr_ap["match_counts"] = counts

In [543]:
curr_ap["match_counts"].sum()

51874

In [546]:
curr_ap.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Analysis\Always Patterns\curr_ap_matches.csv", index = False)