In [1]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import re

In [2]:
import os
import shutil
import smart_open
from sys import platform
import gensim

In [3]:
def prepend_line(infile, outfile, line):
    """ 
    Function use to prepend lines using bash utilities in Linux. 
    (source: http://stackoverflow.com/a/10850588/610569)
    """
    with open(infile, 'r', encoding="utf8") as old:
        with open(outfile, 'w', encoding="utf8") as new:
            new.write(str(line) + "\n")
            shutil.copyfileobj(old, new)

def prepend_slow(infile, outfile, line):
    """
    Slower way to prepend the line by re-creating the inputfile.
    """
    with open(infile, 'r', encoding="utf8") as fin:
        with open(outfile, 'w', encoding="utf8") as fout:
            fout.write(line + "\n")
            for line in fin:
                fout.write(line)

def get_lines(glove_file_name):
    """Return the number of vectors and dimensions in a file in GloVe format."""
    with smart_open.smart_open(glove_file_name, 'r', encoding="utf8") as f:
        num_lines = sum(1 for line in f)
    with smart_open.smart_open(glove_file_name, 'r', encoding="utf8") as f:
        num_dims = len(f.readline().split()) - 1
    return num_lines, num_dims

# Input: GloVe Model File
# More models can be downloaded from http://nlp.stanford.edu/projects/glove/
#glove_file="glove.6B.300d.txt"
glove_file= r'glove.42B.300d.txt'

num_lines, dims = get_lines(glove_file)

# Output: Gensim Model text format.
gensim_file='glove_model2.txt'
gensim_first_line = "{} {}".format(num_lines, dims)

# Prepends the line.
if platform == "linux" or platform == "linux2":
    prepend_line(glove_file, gensim_file, gensim_first_line)
else:
    prepend_slow(glove_file, gensim_file, gensim_first_line)

# Demo: Loads the newly created glove_model.txt into gensim API.
model=gensim.models.KeyedVectors.load_word2vec_format(gensim_file,binary=False) #GloVe Model

In [4]:
exp = "[a-zA-Z0-9]+"

In [5]:
def vector_avg(vec_list):
        return sum(vec_list)/len(vec_list)

    #return np.mean(vec_list)


def get_vector(med):   #"kirk vit"
    split_med=med.split()   #["kirk","vit"]
    
    vec_list=[]
    for item in med:
        try:
            vec_list.append(model[item])  #converting each word in the medicine name to a 
        except:
            #print("exception!!")
            continue
        
        
        '''if model[item]:
            vec_list.append(model[item])  #converting each word in the medicine name to a 
        else:
            continue'''
        
    #calc vector avg of all words in medicine name    
    vector=vector_avg(vec_list)    
        
    return vector
    
    
def get_similarity(vector):
    sim=model.similar_by_vector(vector, topn=11, restrict_vocab=None)
    return sim
    

In [6]:
def medstring_to_vector(medstring):
    meds=re.findall(exp,medstring)
    #print(meds)    #['kirkland multivitamin', 'kirkland calcium vitamin', 'vitamin d', 'fish oil']

    #list of vectors of the medicine names in each row
    #[v(kirkland multivitamin), v(kirkland calcium vitamin), v(vitamin d), v(fish oil)]
    vec_list=[]  

    for item in meds:
        #try:             
        v=get_vector(item)
        vec_list.append(v)                            

    #avg of all med vectors in a row (for a user/entry)
    if vec_list:
        meds_avg=vector_avg(vec_list)
        return meds_avg
    else:
        return -1

In [7]:
def add_vector_column(df): 
    df["meds_vector"] = ""
    df['allergies_vector']=""
    df['history_vector']=""

    for i,j in df["other_meds_filtered"].iteritems():
        #j = "['kirkland multivitamin', 'kirkland calcium vitamin', 'vitamin d', 'fish oil']"

        #if not pd.isna(j):
        if not pd.isnull(j):
            meds_avg = medstring_to_vector(j)
            if type(meds_avg) != int:
                df["meds_vector"].loc[i]= meds_avg
                
    for i,j in df["allergies_filtered"].iteritems():
              #if not pd.isna(j):
        if not pd.isnull(j):
            aller_avg = medstring_to_vector(j)
            if type(aller_avg) != int:
                df["allergies_vector"].loc[i]= aller_avg
                
    for i,j in df["history_filtered"].iteritems():
              #if not pd.isna(j):
        if not pd.isnull(j):
            his_avg = medstring_to_vector(j)
            if type(his_avg) != int:
                df["history_vector"].loc[i]= aller_avg
    
                

In [8]:
data = pd.read_csv(r'dfs.csv')


In [9]:
data.head()

Unnamed: 0.1,Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,...,SPLTTYPE,FORM_VERS,TODAYS_DATE,BIRTH_DEFECT,OFC_VISIT,ER_ED_VISIT,ALLERGIES,other_meds_filtered,allergies_filtered,history_filtered
0,0,916600,01-01-2021,TX,33.0,33.0,,F,,Right side of epiglottis swelled up and hinder...,...,,2,01-01-2021,,Y,,Pcn and bee venom,,"['pcn', 'bee venom']",
1,1,916601,01-01-2021,CA,73.0,73.0,,F,,Approximately 30 min post vaccination administ...,...,,2,01-01-2021,,Y,,"""Dairy""",['residing nursing facility . patients chart .'],"[""`` dairy ''""]",['residing nursing facility . patients chart .']
2,2,916602,01-01-2021,WA,23.0,23.0,,F,,"About 15 minutes after receiving the vaccine, ...",...,,2,01-01-2021,,,Y,Shellfish,,['shellfish'],
3,3,916603,01-01-2021,WA,58.0,58.0,,F,,"extreme fatigue, dizziness,. could not lift my...",...,,2,01-01-2021,,,,"Diclofenac, novacaine, lidocaine, pickles, tom...",,"['diclofenac', 'novacaine', 'lidocaine', 'pick...","['diverticulitis', 'mitral valve prolapse', 'o..."
4,4,916604,01-01-2021,TX,47.0,47.0,,F,,"Injection site swelling, redness, warm to the ...",...,,2,01-01-2021,,,,,,,


In [10]:
add_vector_column(data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [11]:
data.head()

Unnamed: 0.1,Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,...,BIRTH_DEFECT,OFC_VISIT,ER_ED_VISIT,ALLERGIES,other_meds_filtered,allergies_filtered,history_filtered,meds_vector,allergies_vector,history_vector
0,0,916600,01-01-2021,TX,33.0,33.0,,F,,Right side of epiglottis swelled up and hinder...,...,,Y,,Pcn and bee venom,,"['pcn', 'bee venom']",,,"[0.17354767, 0.27377817, 0.100952215, 0.273204...",
1,1,916601,01-01-2021,CA,73.0,73.0,,F,,Approximately 30 min post vaccination administ...,...,,Y,,"""Dairy""",['residing nursing facility . patients chart .'],"[""`` dairy ''""]",['residing nursing facility . patients chart .'],"[0.14097442, 0.32214308, -0.032744993, 0.26034...","[0.20041339, 0.331478, -0.1741894, 0.21092239,...","[0.13993715, 0.3670162, 0.026343018, 0.3592622..."
2,2,916602,01-01-2021,WA,23.0,23.0,,F,,"About 15 minutes after receiving the vaccine, ...",...,,,Y,Shellfish,,['shellfish'],,,"[0.23856923, 0.15417689, 0.16545779, 0.3678997...",
3,3,916603,01-01-2021,WA,58.0,58.0,,F,,"extreme fatigue, dizziness,. could not lift my...",...,,,,"Diclofenac, novacaine, lidocaine, pickles, tom...",,"['diclofenac', 'novacaine', 'lidocaine', 'pick...","['diverticulitis', 'mitral valve prolapse', 'o...",,"[0.153226, 0.19185863, 0.06232959, 0.23845184,...","[0.13993715, 0.3670162, 0.026343018, 0.3592622..."
4,4,916604,01-01-2021,TX,47.0,47.0,,F,,"Injection site swelling, redness, warm to the ...",...,,,,,,,,,,


In [12]:
data.to_csv('TOWvectors.csv')

In [17]:
data["DIED"]=data["DIED"].replace(np.nan, 0)
data["L_THREAT"]=data["L_THREAT"].replace(np.nan, 0)
data["ER_VISIT"]=data["ER_VISIT"].replace(np.nan, 0)
data["HOSPITAL"]=data["HOSPITAL"].replace(np.nan, 0)
#num_data["HOSPDAYS"]=num_data["HOSPDAYS"].replace(np.nan, 0)
data["DISABLE"]=data["DISABLE"].replace(np.nan, 0)
data["BIRTH_DEFECT"]=data["BIRTH_DEFECT"].replace(np.nan, 0)

data=data.replace("Y",1)

data

  mask = arr == x


Unnamed: 0.1,Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,...,OFC_VISIT,ER_ED_VISIT,ALLERGIES,other_meds_filtered,allergies_filtered,history_filtered,meds_vector,allergies_vector,history_vector,risk
0,0,916600,01-01-2021,TX,33.0,33.0,,F,,Right side of epiglottis swelled up and hinder...,...,1.0,,Pcn and bee venom,,"['pcn', 'bee venom']",,,"[0.17354767, 0.27377817, 0.100952215, 0.273204...",,
1,1,916601,01-01-2021,CA,73.0,73.0,,F,,Approximately 30 min post vaccination administ...,...,1.0,,"""Dairy""",['residing nursing facility . patients chart .'],"[""`` dairy ''""]",['residing nursing facility . patients chart .'],"[0.14097442, 0.32214308, -0.032744993, 0.26034...","[0.20041339, 0.331478, -0.1741894, 0.21092239,...","[0.13993715, 0.3670162, 0.026343018, 0.3592622...",
2,2,916602,01-01-2021,WA,23.0,23.0,,F,,"About 15 minutes after receiving the vaccine, ...",...,,1.0,Shellfish,,['shellfish'],,,"[0.23856923, 0.15417689, 0.16545779, 0.3678997...",,
3,3,916603,01-01-2021,WA,58.0,58.0,,F,,"extreme fatigue, dizziness,. could not lift my...",...,,,"Diclofenac, novacaine, lidocaine, pickles, tom...",,"['diclofenac', 'novacaine', 'lidocaine', 'pick...","['diverticulitis', 'mitral valve prolapse', 'o...",,"[0.153226, 0.19185863, 0.06232959, 0.23845184,...","[0.13993715, 0.3670162, 0.026343018, 0.3592622...",
4,4,916604,01-01-2021,TX,47.0,47.0,,F,,"Injection site swelling, redness, warm to the ...",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44426,44426,966061,01-22-2021,WV,65.0,65.0,,F,,"HEAD COGESTION, FEVERISH AND ACHY STARTED 12-3...",...,1.0,,,,,,,,,
44427,44427,966062,01-22-2021,NJ,61.0,61.0,,F,,"SEVERE HEADACHE, BODY ACHES, LOW GRADE TEMP (9...",...,,,,"['bystolic mg-od', 'multi-vit']",,,"[0.22648835, 0.14664122, 0.14537087, 0.2346280...",,,
44428,44428,966063,01-22-2021,CA,40.0,40.0,,F,,Exactly 7 days after I received the vaccine my...,...,,,,"['generic aleve', 'butran patch', 'nortriptyli...",,"['osteoarthritis', 'bulged discs spine', 'anxi...","[0.22605449, 0.28097165, 0.03322323, 0.2979094...",,"[0.13993715, 0.3670162, 0.026343018, 0.3592622...",
44429,44429,966064,01-22-2021,OH,39.0,39.0,,F,,"High Fever >102, dizziness, fatigue, body ache...",...,,,,"['lexapro', 'prevacid', 'multivitamin']",,['e'],"[0.15310867, 0.28272304, 0.00552508, 0.2554333...",,"[0.13993715, 0.3670162, 0.026343018, 0.3592622...",


In [16]:
sympt_df = pd.read_csv(r'C:\Users\nikij\Desktop\college\capstone\vaccine\Data\2021cleaned\2021VAERSSYMPTOMS_clean.csv')

vax_df = pd.read_csv(r'C:\Users\nikij\Desktop\college\capstone\vaccine\Data\2021cleaned\2021VAERSVAX_clean.csv')
janssen_df = vax_df[vax_df['VAX_NAME'] == 'COVID19 (COVID19 (JANSSEN))']
moderna_df = vax_df[vax_df['VAX_NAME'] == 'COVID19 (COVID19 (MODERNA))']
pfeizer_df = vax_df[vax_df['VAX_NAME'] == 'COVID19 (COVID19 (PFIZER-BIONTECH))']

In [39]:
def get_symptoms(vax_df):
    age_vax_df = pd.merge(vax_df, age_grp_df, how = 'inner', on = 'VAERS_ID')
    age_vax_df = age_vax_df[['VAERS_ID','VAX_NAME','meds_vector']]
    
    age_vax_df['similarity_score']=""
    for i,j in age_vax_df["meds_vector"].iteritems():
        if type(j) == np.ndarray:
            #print(type(j))
            sim_score = np.dot(user_med_vector, j)/(np.linalg.norm(user_med_vector)* np.linalg.norm(j))
            age_vax_df["similarity_score"].loc[i]= sim_score
            #print(i,type(sim_score))
        else:
            #print(type(sim_score))
            type(j)
            age_vax_df["similarity_score"].loc[i]=None

    age_vax_df.sort_values(by=['similarity_score'], ascending=False)

    #sympt_match_df = pd.merge(age_vax_df, sympt_df, how = 'left', on = 'VAERS_ID')
    sympt_match_df = pd.merge(age_vax_df, sympt_df, how = 'inner', on = 'VAERS_ID')
    sympt_match_df = sympt_match_df[['VAERS_ID','meds_vector','similarity_score','SYMPTOM1','SYMPTOM2','SYMPTOM3','SYMPTOM4','SYMPTOM5']]


    pred_symptoms = []
    for i in range(len(sympt_match_df)):
        if len(pred_symptoms) >=5:
            break
        if not pd.isnull(sympt_match_df.at[i,'SYMPTOM1']) and sympt_match_df['SYMPTOM1'][i] not in pred_symptoms:
            pred_symptoms.append(sympt_match_df['SYMPTOM1'][i])   
        if not pd.isnull(sympt_match_df.at[i,'SYMPTOM2']) and sympt_match_df['SYMPTOM2'][i] not in pred_symptoms:
            pred_symptoms.append(sympt_match_df['SYMPTOM2'][i])
        if not pd.isnull(sympt_match_df.at[i,'SYMPTOM3']) and sympt_match_df['SYMPTOM3'][i] not in pred_symptoms:
            pred_symptoms.append(sympt_match_df['SYMPTOM3'][i])
        if not pd.isnull(sympt_match_df.at[i,'SYMPTOM4']) and sympt_match_df['SYMPTOM4'][i] not in pred_symptoms:
            pred_symptoms.append(sympt_match_df['SYMPTOM4'][i])
        if not pd.isnull(sympt_match_df.at[i,'SYMPTOM5']) and sympt_match_df['SYMPTOM5'][i] not in pred_symptoms:
            pred_symptoms.append(sympt_match_df['SYMPTOM5'][i])

    return pred_symptoms

In [None]:
#On the basis of age group and medications taken, 
#similarity between users is found

#Assumptions
#Meds is a compulsory field; not empty
#Meds selected from a searchable drop down menu, 
#so it will be an exact match to what's been used as sample input

In [42]:
#user example
#40 #50 #71 #35
try:
    age = int(input("Enter age: "))

except:
    print("Invalid age input. Please try again.")
    exit()

#'kirkland multivitamin', 'kirkland calcium vitamin', 'vitamin d', 'fish oil'
#'amlodipine', 'ambien', 'benicar/hctz', 'invokana', 'metformin', 'levothyroxine', 'bydureon', 'metoprolol'
#'levothyroxine', 'estradiol'
#fluoxetine qd, cetirizine qd apple cider vinegar pills
med_entry = input("Enter list of medications:\n")


age_grp_df = data.loc[(data['AGE_YRS'] >= age-5) & (data['AGE_YRS'] <= age+5)]
#CONVERT INPUT MEDS INTO VECTOR
user_med_vector=medstring_to_vector(med_entry)

#COVID19 (COVID19 (JANSSEN)) #COVID19 (COVID19 (MODERNA)) #COVID19 (COVID19 (PFIZER-BIONTECH))
vax_list = ['COVID19 (COVID19 (MODERNA))','COVID19 (COVID19 (PFIZER-BIONTECH))'] 

for vax in vax_list:
    print("\n**********************************************************")
    print("VACCINE NAME:",vax,"\n")
    if vax == 'COVID19 (COVID19 (MODERNA))':
        symptom_set = get_symptoms(moderna_df)
    elif vax == 'COVID19 (COVID19 (PFIZER-BIONTECH))':
        symptom_set = get_symptoms(pfeizer_df)
        
    print("POTENTIAL SYMPTOMS:")
    for i in range(len(symptom_set)):
        print(i+1,".",symptom_set[i],sep="")
    print("**********************************************************\n")


Enter age: 35
Enter list of medications:
fluoxetine qd, cetirizine qd apple cider vinegar pills

**********************************************************
VACCINE NAME: COVID19 (COVID19 (MODERNA)) 

POTENTIAL SYMPTOMS:
1.Dysphagia
2.Epiglottitis
3.Diarrhoea
4.Nasal congestion
5.Blood pressure decreased
6.Chest pain
7.Chills
8.Confusional state
9.Decreased appetite
**********************************************************


**********************************************************
VACCINE NAME: COVID19 (COVID19 (PFIZER-BIONTECH)) 

POTENTIAL SYMPTOMS:
1.Chills
2.Dizziness
3.Injection site pain
4.Myalgia
5.Pyrexia
**********************************************************



In [None]:
#rank the symptoms according to likelihood

#Color code the more likely symptoms -> will be improving the UI look for the Dec review. 
#For now, we are still determining the appropriate thresholds for color coding

In [None]:
Give use case, explain the dataset
Young male, elder female, etc., diff med histories
Specify that it is real-time data
Highlight the pretrained