In [None]:
import pandas as pd
from tqdm import tqdm
import pickle
import numpy as np
import regex as re
import math 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix, average_precision_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.utils import shuffle
from sklearn import preprocessing
import seaborn as sns
from sklearn.metrics import roc_auc_score, matthews_corrcoef, accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import cross_val_score

In [None]:
entire = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\Sequences\slat_8_16.csv", header = None)
entire.columns = ["sequence_number", "sequence", "note_number"]

## Testing Model Generalizability

## Running Multi-Class Model on Test Set to examine whether sequence level predictions are sound

In [None]:
tfidf_train = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\TFIDF\tfidf_train_8_12.csv")

In [None]:
y_train = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\TFIDF\y_train_8_12.csv")

In [None]:
tfidf_train_features_df = pd.concat([tfidf_train, y_train.reset_index(drop=True)], axis = 1)

In [None]:
def filter_features_by_cor(df):
    """Rank feature columns by absolute Pearson correlation with the label.

    The label is read from the last column of ``df``. Only the first
    ``len(df.columns) - 2`` columns are scored, so the column immediately
    before the label is skipped as well.
    NOTE(review): presumably that skipped column is an index column carried in
    from the label CSV (e.g. ``Unnamed: 0``) - confirm. If it is a genuine
    feature, it is silently left out of the ranking.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns followed by the label column (last position).

    Returns
    -------
    pandas.DataFrame
        Two columns, ``Features`` (column name) and ``CorrCoef`` (absolute
        correlation with the label), sorted ascending by ``CorrCoef`` with a
        fresh 0..n-1 index.
    """
    n_columns = len(df.columns)
    # Kept from the original behaviour: the last two columns are never scored.
    n_features = max(n_columns - 2, 0)
    label_values = df.iloc[:, n_columns - 1].tolist()

    # Collect the coefficients in a plain list; growing an array with
    # np.append inside the loop re-copies it on every iteration.
    corrcoefs = [
        abs(np.corrcoef([df.iloc[:, i].tolist(), label_values]))[0, 1]
        for i in range(n_features)
    ]

    output_df = pd.DataFrame(list(df)[:n_features], columns=['Features'])
    output_df['CorrCoef'] = corrcoefs

    return output_df.sort_values('CorrCoef').reset_index(drop=True)

In [None]:
tfidf_output_df = filter_features_by_cor(tfidf_train_features_df)
tfidf_output_df = tfidf_output_df.sort_values(by = ['CorrCoef'], ascending = False)

In [None]:
tfidf_output_df.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\Feature Selection\tfidf_output_df_8_12.csv", index = False)

In [None]:
top_tfidf_features_df_sample = tfidf_output_df[tfidf_output_df['CorrCoef'] > 0.01]

In [None]:
tfidf_test = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\TFIDF\tfidf_test_8_12.csv")

In [None]:
# NOTE: unpickling can execute arbitrary code - only load model files that come from a trusted source.
# The context manager closes the file handle, which the bare open() call never did.
with open(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\model_8_11.sav", 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
test_predictions = model.predict(tfidf_test.filter(items = top_tfidf_features_df_sample['Features']))

In [None]:
test_sequences = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\Train Test Split\test_full.csv")

In [None]:
test_sequences = pd.concat([test_sequences, pd.Series(test_predictions)], axis = 1)

In [None]:
test_sequences.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Predictions\test_set_preds_8_25.csv", index = False)

## Checking for discrepancies in accuracy (ACC) between sequences that match always patterns and sequences that do not

In [None]:
test_sequences = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Predictions\test_set_preds_8_25.csv")

In [None]:
always_patterns = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\EDA\Data\always_patterns_8_1.csv")

In [None]:
always_pattern_regex = always_patterns["Pattern"].to_list()

In [None]:
# Compile every always-pattern once, case-insensitively.
# The list is rebuilt from the DataFrame column instead of being overwritten
# element by element, so the cell can be re-run safely: passing an already
# compiled pattern back to re.compile together with flags is rejected.
always_pattern_regex = [
    re.compile(pattern, re.IGNORECASE) for pattern in always_patterns["Pattern"]
]

In [None]:
len(always_pattern_regex)

In [None]:
def find_always_pattern_matches(df, col):
    """Annotate ``df`` with the always-pattern matches found in each sequence.

    Every pattern in the module-level ``always_pattern_regex`` list is applied
    to each value of ``df[col]`` with ``findall``. For a pattern that matches,
    the distinct lower-cased matched strings are concatenated into one string
    (in sorted order, so the result is reproducible between runs) and appended
    to that row's list. Patterns without a match contribute nothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text. It is modified in place.
    col : str
        Name of the column of ``df`` that contains the sequence text.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with an added or overwritten
        ``always_pattern_match`` column: one list of strings per row, empty
        when no pattern matched.
    """
    all_matches = []

    for seq in tqdm(df[col]):
        curr = []

        for p in always_pattern_regex:
            found = re.findall(p, seq)
            if not found:
                continue

            # findall returns tuples when a pattern has several capture
            # groups; flatten them so lower() is only ever applied to strings.
            fragments = set()
            for hit in found:
                parts = hit if isinstance(hit, tuple) else (hit,)
                fragments.update(part.lower() for part in parts)

            # sorted(): iteration order of a set of strings is not stable
            # across interpreter runs, which made the joined text vary.
            curr.append("".join(sorted(fragments)))

        all_matches.append(curr)

    df["always_pattern_match"] = all_matches

    return df

In [None]:
find_always_pattern_matches(test_sequences, "sequence")

In [None]:
len(test_sequences[test_sequences["always_pattern_match"].str.len() == 0])

In [None]:
# Rename all five columns positionally.
# NOTE(review): this assumes the frame has exactly five columns in this order
# (a list of the wrong length raises) - presumably index, sequence text,
# annotator label, model prediction, and the always_pattern_match column added
# by find_always_pattern_matches; verify against the CSV header.
test_sequences.columns = ['Unnamed: 0', 'sequence', 'annotator_label', 'predictions', 'always_pattern_match']

In [None]:
always_pattern_match = test_sequences[test_sequences["always_pattern_match"].str.len() != 0]
no_always_pattern_match = test_sequences[test_sequences["always_pattern_match"].str.len() == 0]

In [None]:
# Shuffle both groups before the first 50 rows of each are taken as samples.
# A fixed random_state makes the shuffle (and therefore the exported samples)
# reproducible across kernel restarts.
always_pattern_match = always_pattern_match.sample(frac = 1, random_state = 42).reset_index(drop = True)
no_always_pattern_match = no_always_pattern_match.sample(frac = 1, random_state = 42).reset_index(drop = True)

In [None]:
always_pattern_sample = always_pattern_match[:50]
no_always_pattern_match_sample = no_always_pattern_match[:50]

In [None]:
always_pattern_sample.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Analysis\AP_sample_8_25.csv", index = False)
no_always_pattern_match_sample.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Analysis\NO_AP_sample_8_25.csv", index = False)

In [None]:
accuracy_score(test_sequences["annotator_label"], test_sequences["predictions"])

In [None]:
accuracy_score(always_pattern_match["annotator_label"], always_pattern_match["predictions"])

In [None]:
accuracy_score(no_always_pattern_match["annotator_label"], no_always_pattern_match["predictions"])

In [None]:
accuracy_score(no_always_pattern_match_sample["annotator_label"], no_always_pattern_match_sample["predictions"])

In [None]:
find_always_pattern_matches(entire, "sequence")

In [None]:
len(entire[entire["always_pattern_match"].str.len() != 0]) / len(entire)

## Testing Model Generalizability

In [None]:
match = entire[entire["always_pattern_match"].str.len() != 0]
no_match = entire[entire["always_pattern_match"].str.len() == 0]

In [None]:
# Share of sequences with an always-pattern match: rounded to a whole percent,
# increased by one percentage point, then expressed as a fraction.
# NOTE(review): the reason for the "+ 1" is not evident from this notebook - confirm it is intentional.
percent = (round((len(match) / len(entire)) * 100) + 1) / 100

In [None]:
seq_count = 100

In [None]:
# Split the seq_count budget between matched and unmatched sequences.
# The unmatched share is derived from the matched one so the two always add up
# to seq_count; truncating both products independently with int() can lose a
# row to floating-point error, and the cleaning loop further down indexes
# exactly seq_count rows.
n_match = round(percent * seq_count)
match_slice = match[:n_match]
no_match_slice = no_match[:seq_count - n_match]
len(match_slice), len(no_match_slice)

In [None]:
real_world_sample = pd.concat([match_slice, no_match_slice])

In [None]:
def clean_sequence(seq):
    """Normalise a sequence: blank out special characters, collapse whitespace, lower-case.

    Each character listed in ``specials`` (currently only ``/``) is replaced by
    a space, every run of whitespace is reduced to a single space with leading
    and trailing whitespace removed, and the result is lower-cased.
    """
    specials = '/' #etc

    without_specials = seq
    for special_char in specials:
        without_specials = without_specials.replace(special_char, ' ')

    # split() with no argument drops empty pieces, so joining on one space
    # leaves exactly one space between words.
    words = without_specials.split()
    return ' '.join(words).lower()

In [None]:
real_world_sample = real_world_sample.reset_index(drop = True)

In [None]:
# Drop the first and last three characters of every sequence, then normalise it.
# NOTE(review): presumably those three characters are quoting left over from the
# JSON/CSV export of the sequences - confirm.
# Iterating over the column itself (not range(seq_count)) keeps this working
# when the sample holds fewer than seq_count rows and does not depend on the
# index being 0..n-1.
real_world_sample["sequence"] = [
    clean_sequence(seq[3:len(seq) - 3]) for seq in tqdm(real_world_sample["sequence"])
]

In [None]:
real_world_sample.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Analysis\real_world_sample.csv", index = False)

In [None]:
tfidf_real_world = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\TFIDF\tfidf_real_sample.csv")

In [None]:
real_world_predictions = model.predict(tfidf_real_world.filter(items = top_tfidf_features_df_sample['Features']))

In [None]:
real_world_predictions[:8]

In [None]:
real_world_predictions[15:56]

In [None]:
tfidf_real_world.shape

In [None]:
fifty_seq = pd.read_excel(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Predictions\50_seq_real_world_sample.xlsx")

In [None]:
tfidf_50_seq = tfidf_real_world[:8]

In [None]:
# Rows 0-7 followed by rows 15-55 of the real-world TF-IDF matrix.
# NOTE(review): that is at most 8 + 41 = 49 rows - presumably the rows that
# correspond to 50_seq_real_world_sample.xlsx; confirm the lengths agree.
# Built directly from tfidf_real_world with pd.concat: DataFrame.append was
# removed in pandas 2.0, and this form gives the same frame if the cell is re-run.
tfidf_50_seq = pd.concat([tfidf_real_world[:8], tfidf_real_world[15:56]])

In [None]:
fifty_proba = model.predict_proba(tfidf_50_seq.filter(items = top_tfidf_features_df_sample['Features']))

In [None]:
# Highest class probability for each row of the predict_proba output.
max_proba = [max(class_probabilities) for class_probabilities in fifty_proba]

In [None]:
len(max_proba), len(fifty_seq)

In [None]:
fifty_seq["proba"] = max_proba

In [None]:
auc = roc_auc_score(fifty_seq["label"], fifty_proba, multi_class = "ovr", average = "weighted")

In [None]:
acc = metrics.accuracy_score(fifty_seq["predictions"], fifty_seq["label"])

In [None]:
auc, acc

In [None]:
real_world_sample.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Predictions\real_world_sample.csv", index = False)

## Poster Confusion Matrix

Analyzing Results of 13,941 Patients

In [None]:
df = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\data\20K_sample_8_11.csv")
df2 = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Baseline_Model\data\tanish_predictions_with_structured_features.csv")

In [None]:
# NOTE(review): `p_CI` is not defined in any cell visible in this notebook, so
# this cell raises NameError on a fresh kernel unless it is created elsewhere.
# A later cell reads the same CSV and uses its "patient_CI" column directly,
# so presumably the file already contains it - confirm and drop this cell if so.
df2["patient_CI"] = p_CI

In [None]:
# Partition patients into three groups by the value of the APOE column.
apoe = df2["APOE"]
e2 = df2[apoe.isin(["e2/e2", "e2/e3"])]
e3 = df2[apoe == "e3/e3"]
e4 = df2[apoe.isin(["e2/e4 or e1/e3", "e3/e4", "e4/e4"])]

In [None]:
(len(e2) + len(e3) + len(e4)) - len(df2)

In [None]:
df2["APOE"].value_counts()

In [None]:
len(e2), len(e3), len(e4)

In [None]:
e2["AD_Med_or_ICD_Code"].value_counts()

In [None]:
e3["AD_Med_or_ICD_Code"].value_counts()

In [None]:
e4["AD_Med_or_ICD_Code"].value_counts()

In [None]:
e2["patient_CI"].value_counts()

In [None]:
e3["patient_CI"].value_counts()

In [None]:
e4["patient_CI"].value_counts()

In [None]:
len(df2[(df2["patient_CI"] == 1) & (df2["AD_Med_or_ICD_Code"] == 1)])

In [None]:
len(df2[(df2["patient_CI"] == 1) & (df2["AD_Med_or_ICD_Code"] == 0)])

In [None]:
len(df2[(df2["patient_CI"] == 0) & (df2["AD_Med_or_ICD_Code"] == 1)])

In [None]:
len(df2[(df2["patient_CI"] == 0) & (df2["AD_Med_or_ICD_Code"] == 0)])

## Analyzing False Positives
Model predicts CI, but no presence of Med/ICD Code <br>
What portion of these are cases where the model is correct, and what portion are cases where the model is wrong?

In [None]:
predictions = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Baseline_Model\data\tanish_predictions_with_structured_features.csv")

In [None]:
FP = predictions[(predictions["patient_CI"] == 1) & (predictions["AD_Med_or_ICD_Code"] == 0)]

In [None]:
sample_set = FP["patient_id"].to_list()[:50]

In [None]:
dataset = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\nlp_annotator-dev\app\load_data\SLAT_production_7_24.csv")

In [None]:
pred_proba = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Predictions\lr_binary_classification_8_17.csv")

In [None]:
# Collect, in sample_set order, the note rows of every sampled patient together
# with that patient's predictions and probabilities.
patient_frames = []
pred = []
proba = []

for patient_id in tqdm(sample_set):
    patient_frames.append(dataset[dataset["PatientID"] == patient_id])
    patient_predictions = pred_proba[pred_proba["patient_id"] == patient_id]
    pred += patient_predictions["predictions"].to_list()
    proba += patient_predictions["probability"].to_list()

# One concat at the end: DataFrame.append was removed in pandas 2.0, and
# appending inside the loop re-copied the growing frame on every iteration.
sampling_df = pd.concat(patient_frames) if patient_frames else pd.DataFrame()

In [None]:
sampling_df["predictions"] = pred
sampling_df["probability"] = proba

In [None]:
sampling_df.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Analysis\50_FP_sample.csv", index = False)

## Fixing Error of Sequences not being padded

In [None]:
not_padded = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\BigDataSets\Regex_match\FINAL\not_padded_matches_7_14.csv")

In [None]:
padded = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\BigDataSets\Regex_match\FINAL\padded_matches_7_14.csv")

In [None]:
len(padded[padded["padded_merged_regex_sent"].str.len() < 500])

In [None]:
not_padded.columns = ['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1',
       'EMPI', 'MRN', 'PatientID', 'PatientEncounterID', 'ContactDTS',
       'EncounterTypeDSC', 'NoteID', 'InpatientNoteTypeDSC', 'buckets',
       'NoteCSNID', 'LineNBR', 'NoteTXT', 'regex_location', 'regex_sent',
       'regex_match', 'match_count', 'pruned_regex_location',
       'merged_row_location', 'padded_merged_regex_sent', 'merged_regex_match',
       'sequence_level_regex_location', 'sequence_level_regex_match',
       'char_count', 'note_char_count']

In [None]:
entire_matches = pd.concat([padded, not_padded])

In [None]:
len(entire_matches[entire_matches["padded_merged_regex_sent"].str.len() < 500])

In [None]:
sequences = []
counter = 0

# Index the match rows by NoteID once, so each dataset row is a dictionary
# lookup instead of a scan over the whole entire_matches table.
# Positions (not labels) are stored because entire_matches comes from a
# pd.concat and its index labels are not unique.
note_positions = entire_matches.groupby("NoteID", sort=False).indices

for _, row in dataset.iterrows():
    note_matches = entire_matches.iloc[note_positions.get(row["NoteID"], [])]

    if len(note_matches) > 1:
        # Several candidates for this note: disambiguate on the note text.
        note_matches = note_matches[note_matches["NoteTXT"] == str(row["NoteTXT"])]
        if len(note_matches) > 1:
            print("Issue")
            break

    if len(note_matches) == 0:
        # Same exception type the original .values[0] produced, with context.
        raise IndexError("No regex match found for NoteID {!r}".format(row["NoteID"]))

    sequences.append(note_matches["padded_merged_regex_sent"].values[0])

    counter += 1
    if counter % 100000 == 0:
        print("Finished ", counter, " sequences", len(sequences))

In [None]:
len(sequences)

In [None]:
# dataset["regex_sent"] = sequences

In [None]:
from sqlite3 import connect

# Read the whole gui_sentence table from the annotator's SQLite database.
conn = connect(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\nlp_annotator-dev\app\db.sqlite3")
try:
    gui_sentence = pd.read_sql("SELECT * FROM gui_sentence", conn)
finally:
    # Release the database file even if the query fails.
    conn.close()

In [None]:
# Table of (id, Contents, Note_id): ids and note ids come from gui_sentence,
# the text from the `sequences` list.
# NOTE(review): `sequences` was built by iterating `dataset`, so this assumes
# gui_sentence has the same number of rows in the same order as `dataset` - confirm.
new = pd.DataFrame()
new["id"] = gui_sentence["id"]
new["Contents"] = sequences
new["Note_id"] = gui_sentence["Note_id"]

In [None]:
# `json` is imported here because this cell runs before the cell that
# imports it; without this the cell raises NameError on a fresh kernel.
import json

# Encode every sequence as a JSON string literal.
# NOTE: re-running this cell encodes the already encoded text a second time.
new["Contents"] = new["Contents"].apply(json.dumps)

In [None]:
import json

def is_json(myjson):
    """Return True if ``myjson`` parses as a JSON document, otherwise False.

    Malformed text makes ``json.loads`` raise ``ValueError`` (its
    ``JSONDecodeError`` is a subclass); a value that is not str/bytes at all,
    such as ``None`` or a float NaN from a DataFrame, makes it raise
    ``TypeError``. Both cases are reported as "not JSON".
    """
    try:
        json.loads(myjson)
    except (ValueError, TypeError):
        return False
    return True

is_json(new.loc[7658]["Contents"])

In [None]:
new.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\nlp_annotator-dev\app\slat_8_16.csv", index = False, header = False)

In [None]:
dataset["regex_sent"] = new["Contents"]

In [None]:
dataset.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\BigDataSets\SLAT\SLAT_8_17.csv", index = False)

## Getting Stratification of Sequences that are matched by always patterns by keyword

In [None]:
slat = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\BigDataSets\SLAT\SLAT_8_17.csv")

In [None]:
regex = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\EDA\Getting_Data\keywords.csv")

In [None]:
# Compile the keyword patterns listed in the keywords table.
k = regex["REGEX"].to_list()
c = regex["CASE"].to_list()
keywords = []

for raw_pattern, case_flag in zip(k, c):
    if case_flag == 0:
        # CASE == 0: compile case-insensitively after dropping the first five characters.
        # NOTE(review): presumably those five characters are a prefix that is
        # redundant once IGNORECASE is passed - confirm against keywords.csv.
        keywords.append(re.compile(raw_pattern[5:], re.IGNORECASE))
    elif case_flag == 1:
        # CASE == 1: compile as written (case-sensitive).
        keywords.append(re.compile(raw_pattern))
    # Rows with any other CASE value are skipped.
len(keywords)

In [None]:
def matches(df, col):
    """Record which keyword patterns occur in each note, in a ``match`` column.

    Every pattern in the module-level ``keywords`` list is applied to each
    value of ``df[col]`` with ``findall``. For a pattern that matches, the
    distinct lower-cased matched strings are concatenated (in sorted order, so
    the text is reproducible between runs) into one entry. The per-row list of
    entries is stored as its ``str()`` representation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text. It is modified in place; nothing is returned.
    col : str
        Name of the column of ``df`` that contains the note text.
    """
    per_note = []
    for note in tqdm(df[col]):
        curr = []
        for p in keywords:
            found = re.findall(p, note)
            if not found:
                continue

            # findall returns tuples when a pattern has several capture
            # groups; flatten them so lower() is only ever applied to strings.
            fragments = set()
            for hit in found:
                parts = hit if isinstance(hit, tuple) else (hit,)
                fragments.update(part.lower() for part in parts)

            # sorted(): iteration order of a set of strings is not stable
            # across interpreter runs.
            curr.append("".join(sorted(fragments)))
        per_note.append(str(curr))

    df["match"] = per_note

In [None]:
matches(slat, "regex_sent")

In [None]:
# Keyword labels, each mapped to 0. Only the keys are used in the cells below
# (list(freq.keys()) is indexed to pick the label for each keyword).
freq = {"dementia":0
,"cognition":0
# NOTE(review): duplicate of the key above. A dict keeps a single "cognition"
# entry, so this literal yields 18 keys from 19 lines. Possibly a different
# keyword was intended here - confirm against keywords.csv.
,"cognition":0
,"memory":0
,"mmse":0
,"moca":0
,"alzheimer":0
,'cognitive impairment':0
,"mci":0
,"cerebellar":0
,"neurocognitive":0
,"lewy":0
,"pick's":0
,"corticobasal":0
,"cerebral":0
,"cerebrovascular":0
,"amnesia":0
,"ad": 0
,"lbd": 0
}

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
slat_copy = slat.copy()

In [None]:
ap_by_keyword = pd.DataFrame(columns = ["keyword", "percentage"])

In [None]:
# For every keyword label, measure what share of the sequences mentioning it
# are also matched by at least one always pattern.
# The labels in `freq` are iterated directly: looping over `keywords` while
# indexing list(freq.keys()) with a counter raises IndexError as soon as there
# are more compiled keywords than distinct labels.
for counter, keyword in enumerate(freq):
    # regex=False: the labels are literal substrings, not patterns.
    # .copy(): find_always_pattern_matches adds a column, which must not be
    # written into a slice of slat_copy.
    subset = slat_copy[slat_copy["match"].str.contains(keyword, regex=False)].copy()
    subset = find_always_pattern_matches(subset, "regex_sent")
    ap_match = len(subset[subset["always_pattern_match"].str.len() != 0])
    # A keyword with no sequences at all would otherwise divide by zero.
    percentage = ap_match / len(subset) if len(subset) else float("nan")
    ap_by_keyword.loc[counter] = (keyword, percentage)
    print("Percent of ", keyword, " with AP match: ", percentage, " Count: ", ap_match, "Len Df: ", len(ap_by_keyword))

In [None]:
ap_by_keyword.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Documentation\always_pattern_rates_by_keyword.csv", index = False)

## Fine Tuning Always Patterns

In [None]:
pattern1 = r"\bincluding\s*but\s*not\s*limited\s*to\s*[^.]*(dementia|MCI|mild\s*cognitive\s*impairment|memory\s*loss|memory\s*issues|cognitive\s*impairment|memory\s*problem)"

pattern2 = r"\bside\s*effects?\s*[^.]*(dementia|MCI|mild\s*cognitive\s*impairment|memory\s*loss|memory\s*issues|cognitive\s*impairment|memory\s*problem)"

# pattern3 = r"\brisk\s*of?\s*[^.]*(dementia|MCI|mild\s*cognitive\s*impairment|memory\s*loss|memory\s*issues|cognitive\s*impairment|memory\s*problem)"
# pattern4 = r"\bconcerns?\s*(for|regarding)\s*[^.]*(dementia|MCI|mild\s*cognitive\s*impairment|memory\s*loss|memory\s*issues|cognitive\s*impairment|memory\s*problem)"

pattern5 = r"\bpatients\s*with\s*[^\.]*(cognitive\s*impairments?|cognitive\s*disabilities|cognitive\s*concerns?|dementia|Alzheimer(s|’s)?)"

pattern6 = r"\bpatients\s*requiring\s*an\s*additional\s*person\s*to\s*understand\s*their\s*clinical\s*diagnosis"

### Getting sequences that have not been annotated yet in SLAT

In [None]:
model_matrix = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Data\model_matrix.csv")

In [None]:
# Keep only the SLAT rows whose sequence text does not appear in the
# "original" column of the model matrix, i.e. sequences not yet annotated.
# A single vectorised isin() replaces the per-row scan-and-drop loop, which
# compared every model_matrix row against the whole remaining frame.
annotated_sequences = set(model_matrix["original"].map(str))
not_in_slat = slat[~slat["regex_sent"].isin(annotated_sequences)].copy()
# Left at the value the original loop finished with.
counter = len(model_matrix)

In [None]:
not_in_slat.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\BigDataSets\SLAT\not_annotated_8_27.csv", index = False)

### Getting Matches for Potential Always Patterns

In [None]:
def fine_tune_always_patterns(df, col, regex):
    """Find the rows of ``df`` whose text matches a candidate always pattern.

    ``regex`` is compiled case-insensitively and searched in every value of
    ``df[col]``. The text of the first match (or ``""`` when there is none) is
    written to ``df["match"]``, overwriting any existing ``match`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text. It is modified in place.
    col : str
        Name of the column of ``df`` that contains the sequence text.
    regex : str
        Pattern to search for.

    Returns
    -------
    tuple
        ``(matching_rows, count)``: the rows with a non-empty ``match`` value
        and how many of them there are.
    """
    compiled = re.compile(regex, re.IGNORECASE)

    first_hits = []
    for seq in tqdm(df[col]):
        hit = compiled.search(seq)
        first_hits.append("" if hit is None else hit.group())

    df["match"] = first_hits

    matched_rows = df[df["match"].str.len() != 0]
    return (matched_rows, len(matched_rows))

#### Colin Patterns

In [None]:
pattern1_df, pattern1_count = fine_tune_always_patterns(not_in_slat, "regex_sent", pattern1)

In [None]:
(pattern1_count)

In [None]:
pattern1_df.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Analysis\Always Patterns\colin_pattern1_matches.csv", index = False)

In [None]:
pattern2_df, pattern2_count = fine_tune_always_patterns(not_in_slat, "regex_sent", pattern2)

In [None]:
pattern2_count

In [None]:
pattern2_df.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Analysis\colin_pattern2_matches.csv", index = False)

In [None]:
pattern5_df, pattern5_count = fine_tune_always_patterns(not_in_slat, "regex_sent", pattern5)

In [None]:
pattern5_count

In [None]:
pattern5_df.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Analysis\colin_pattern5_matches.csv", index = False)

In [None]:
pattern6_df, pattern6_count = fine_tune_always_patterns(not_in_slat, "regex_sent", pattern6)

In [None]:
pattern6_count

In [None]:
pattern6_df.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Analysis\colin_pattern6_matches.csv", index = False)

#### My Patterns

In [None]:
fifty_seq_real_world = pd.read_excel(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Predictions\50_seq_real_world_sample.xlsx")

In [None]:
fifty_seq_real_world = fifty_seq_real_world[fifty_seq_real_world["regex"].str.len() > 0]

In [None]:
len(fifty_seq_real_world)

In [None]:
# Run every candidate pattern from the real-world error sheet against the
# not-yet-annotated sequences; save the matching rows and keep the match count.
counts = []
# The loop variable is named `pattern`, not `regex`, so the `regex` keywords
# DataFrame loaded earlier in the notebook is not overwritten by a string.
for counter, pattern in enumerate(fifty_seq_real_world["regex"]):
    match_df, count = fine_tune_always_patterns(not_in_slat, "regex_sent", pattern)
    # NOTE(review): relative path - the file lands in the kernel's working directory.
    match_df.to_csv(r"pattern_{}_real_world.csv".format(counter), index = False)
    counts.append(count)

In [None]:
fifty_seq_real_world["match_counts"] = counts

In [None]:
fifty_seq_real_world.to_excel(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Analysis\Always Patterns\12_real_world_errors.xlsx", index = False)

In [None]:
family_pattern = r"(husband|wife|son|daughter|aunt|uncle)"

In [None]:
family_matches, family_match_count = fine_tune_always_patterns(not_in_slat, "regex_sent", family_pattern)

In [None]:
family_dementia_pattern = r"(husband|wife|son|daughter|aunt|uncle)\s*[^.]*dementia"

In [None]:
family_dementia_matches, family_dementia_match_count = fine_tune_always_patterns(not_in_slat, "regex_sent", family_dementia_pattern)

In [None]:
family_dementia_match_count

### Current Always Patterns in SLAT

In [None]:
curr_ap = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\EDA\Data\always_patterns_8_1.csv")

In [None]:
curr_ap = curr_ap["Pattern"].to_list()

In [None]:
# Run every current always pattern against the not-yet-annotated sequences;
# save the matching rows and keep the match count.
counts = []
# The loop variable is named `pattern`, not `regex`, so the `regex` keywords
# DataFrame loaded earlier in the notebook is not overwritten by a string.
for counter, pattern in enumerate(curr_ap):
    match_df, count = fine_tune_always_patterns(not_in_slat, "regex_sent", pattern)
    # NOTE(review): relative path - resolved against the kernel's working directory.
    match_df.to_csv(r"Storage/Analysis/Always Patterns/Current/curr_pattern_{}_real_world.csv".format(counter), index = False)
    counts.append(count)

In [None]:
curr_ap = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\EDA\Data\always_patterns_8_1.csv")

In [None]:
curr_ap["match_counts"] = counts

In [None]:
curr_ap["match_counts"].sum()

In [None]:
curr_ap.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\Storage\Analysis\Always Patterns\curr_ap_matches.csv", index = False)