In [1]:
import pandas as pd
import numpy as np

In [2]:
# -*- coding: utf-8 -*-
"""
word2vec embeddings start with a line with the number of lines (tokens?) and 
the number of dimensions of the file. This allows gensim to allocate memory 
accordingly for querying the model. Larger dimensions mean larger memory is 
held captive. Accordingly, this line has to be inserted into the GloVe 
embeddings file.
"""

import os
import shutil
import smart_open
from sys import platform

import gensim


def prepend_line(infile, outfile, line):
    """ 
    Function use to prepend lines using bash utilities in Linux. 
    (source: http://stackoverflow.com/a/10850588/610569)
    """
    with open(infile, 'r', encoding="utf8") as old:
        with open(outfile, 'w', encoding="utf8") as new:
            new.write(str(line) + "\n")
            shutil.copyfileobj(old, new)

def prepend_slow(infile, outfile, line):
    """
    Slower way to prepend the line by re-creating the inputfile.
    """
    with open(infile, 'r', encoding="utf8") as fin:
        with open(outfile, 'w', encoding="utf8") as fout:
            fout.write(line + "\n")
            for line in fin:
                fout.write(line)

def get_lines(glove_file_name):
    """Return the number of vectors and dimensions in a file in GloVe format."""
    with smart_open.smart_open(glove_file_name, 'r', encoding="utf8") as f:
        num_lines = sum(1 for line in f)
    with smart_open.smart_open(glove_file_name, 'r', encoding="utf8") as f:
        num_dims = len(f.readline().split()) - 1
    return num_lines, num_dims

# Input: GloVe Model File
# More models can be downloaded from http://nlp.stanford.edu/projects/glove/
#glove_file="glove.6B.300d.txt"
glove_file=r'glove.42B.300d.txt'

num_lines, dims = get_lines(glove_file)

# Output: Gensim Model text format.
gensim_file='glove_model2.txt'
gensim_first_line = "{} {}".format(num_lines, dims)

# Prepends the line.
if platform == "linux" or platform == "linux2":
    prepend_line(glove_file, gensim_file, gensim_first_line)
else:
    prepend_slow(glove_file, gensim_file, gensim_first_line)

# Demo: Loads the newly created glove_model.txt into gensim API.

# save the gensim model to disk
#model = gensim.models.KeyedVectors.load_word2vec_format(gensim_file, binary=False)






In [3]:
model=gensim.models.KeyedVectors.load_word2vec_format(gensim_file,binary=False) #GloVe Model

In [4]:
import re


def vector_avg(vec_list):
    return sum(vec_list)/len(vec_list)
    #return np.mean(vec_list)


def get_vector(med):   #"kirk vit"
    split_med=med.split()   #["kirk","vit"]
    
    vec_list=[]
    for item in med:
        try:
            vec_list.append(model[item])  #converting each word in the medicine name to a 
        except:
            print("itemmmmmmmmm ",item, len(item))
            print("exception!!")
            continue
        
        
        '''if model[item]:
            vec_list.append(model[item])  #converting each word in the medicine name to a 
        else:
            continue'''
        
    #calc vector avg of all words in medicine name
    vector=vector_avg(vec_list)
    return vector
    
    
def get_similarity(vector):
    sim=model.similar_by_vector(vector, topn=11, restrict_vocab=None)
    return sim
    

def add_vector_column(df): 
    df["meds_vector"] = ""

    for i,j in df["other_meds_filtered"].iteritems():
        #j = "['kirkland multivitamin', 'kirkland calcium vitamin', 'vitamin d', 'fish oil']"

        #if not pd.isna(j):
        if not pd.isnull(j):
            meds_avg = medstring_to_vector(j)
            if type(meds_avg) != int:
                df["meds_vector"].loc[i]= meds_avg

exp = "[a-zA-Z0-9]+"
def medstring_to_vector(medstring):
    meds=re.findall(exp,medstring)
    #print(meds)    #['kirkland multivitamin', 'kirkland calcium vitamin', 'vitamin d', 'fish oil']

    #list of vectors of the medicine names in each row
    #[v(kirkland multivitamin), v(kirkland calcium vitamin), v(vitamin d), v(fish oil)]
    vec_list=[]  

    for item in meds:
        #try:             
        v=get_vector(item)
        vec_list.append(v)                            




In [5]:
data = pd.read_csv(r'filtered_cols.csv')
add_vector_column(data)

  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [6]:
vax_df = pd.read_csv(r'2021VAERSVAX_clean3.csv')
janssen_df = vax_df[vax_df['VAX_NAME'] == 'COVID19 (COVID19 (JANSSEN))']
moderna_df = vax_df[vax_df['VAX_NAME'] == 'COVID19 (COVID19 (MODERNA))']
pfeizer_df = vax_df[vax_df['VAX_NAME'] == 'COVID19 (COVID19 (PFIZER-BIONTECH))']

In [7]:
def get_symptoms(vax_df):
    #age_vax_df = pd.merge(age_grp_df, vax_df, how = 'left', on = 'VAERS_ID')
    age_vax_df = pd.merge(vax_df, age_grp_df, how = 'inner', on = 'VAERS_ID')
    age_vax_df = age_vax_df[['VAERS_ID','VAX_NAME','meds_vector']]
    #print(age_vax_df.head())
    #return[1,2,3]

    age_vax_df['similarity_score']=""
    for i,j in age_vax_df["meds_vector"].iteritems():
        if type(j) == np.ndarray:
            #print(type(j))
            sim_score = np.dot(user_med_vector, j)/(np.linalg.norm(user_med_vector)* np.linalg.norm(j))
            age_vax_df["similarity_score"].loc[i]= sim_score
            #print(i,type(sim_score))
        else:
            #print(type(sim_score))
            type(j)
            age_vax_df["similarity_score"].loc[i]=None

    age_vax_df.sort_values(by=['similarity_score'], ascending=False)
    sympt_df=pd.read_csv("2021VAERSSYMPTOMS_clean3.csv")

    #sympt_match_df = pd.merge(age_vax_df, sympt_df, how = 'left', on = 'VAERS_ID')
    sympt_match_df = pd.merge(age_vax_df, sympt_df, how = 'inner', on = 'VAERS_ID')
    sympt_match_df = sympt_match_df[['VAERS_ID','meds_vector','similarity_score','SYMPTOM1','SYMPTOM2','SYMPTOM3','SYMPTOM4','SYMPTOM5']]


    pred_symptoms = set()
    for i in range(len(sympt_match_df)):
        if len(pred_symptoms) >=5:
            break
        if not pd.isnull(sympt_match_df.at[i,'SYMPTOM1']):
            pred_symptoms.add(sympt_match_df['SYMPTOM1'][i])   
        if not pd.isnull(sympt_match_df.at[i,'SYMPTOM2']):
            #pred_symptoms.append(sympt_match_df['SYMPTOM2'][i])
            pred_symptoms.add(sympt_match_df['SYMPTOM2'][i])
        if not pd.isnull(sympt_match_df.at[i,'SYMPTOM3']):
            pred_symptoms.add(sympt_match_df['SYMPTOM3'][i])
        if not pd.isnull(sympt_match_df.at[i,'SYMPTOM4']):
            pred_symptoms.add(sympt_match_df['SYMPTOM4'][i])

    return pred_symptoms

In [8]:
#user1 
#40
age = int(input("Enter age: "))

#"'kirkland multivitamin', 'kirkland calcium vitamin', 'vitamin d', 'fish oil'"
med_entry = input("Enter list of medications: ")


age_grp_df = data.loc[(data['AGE_YRS'] >= age-5) & (data['AGE_YRS'] <= age+5)]
#CONVERT INPUT MEDS INTO VECTOR
user_med_vector=medstring_to_vector(med_entry)

#COVID19 (COVID19 (JANSSEN)) #COVID19 (COVID19 (MODERNA)) #COVID19 (COVID19 (PFIZER-BIONTECH))
vax_list = ['COVID19 (COVID19 (MODERNA))','COVID19 (COVID19 (PFIZER-BIONTECH))','COVID19 (JANSSEN))']

for vax in vax_list:
    print("\nVACCINE NAME:",vax,"\n")
    if vax == 'COVID19 (COVID19 (MODERNA))':
        symptom_set = get_symptoms(moderna_df)
    elif vax == 'COVID19 (COVID19 (PFIZER-BIONTECH))':
        symptom_set = get_symptoms(pfeizer_df)
    elif vax == 'COVID19 COVID19 (JANSSEN))':
        symptom_set = get_symptoms(janssen_df)
    
    print("POTENTIAL SYMPTOMS:")
    for symp in symptom_set:
        print("*",symp)
    print("***********\n")


VACCINE NAME: COVID19 (COVID19 (MODERNA)) 



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


POTENTIAL SYMPTOMS:
* Injection site erythema
* Injection site swelling
* Pharyngeal swelling
* Injection site pruritus
* Injection site warmth
***********


VACCINE NAME: COVID19 (COVID19 (PFIZER-BIONTECH)) 

POTENTIAL SYMPTOMS:
* Dizziness
* Pain
* Nausea
* Tremor
* Chills
***********


VACCINE NAME: COVID19 (JANSSEN)) 

POTENTIAL SYMPTOMS:
* Dizziness
* Pain
* Nausea
* Tremor
* Chills
***********



In [9]:
data.to_csv("someeee.csv")

In [10]:
data

Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DIED,...,FORM_VERS,TODAYS_DATE,BIRTH_DEFECT,OFC_VISIT,ER_ED_VISIT,ALLERGIES,other_meds_filtered,allergies_filtered,history_filtered,meds_vector
0,916600,01-01-2021,TX,33.0,33.0,,F,,Right side of epiglottis swelled up and hinder...,,...,2,01-01-2021,,Y,,Pcn and bee venom,,"['pcn', 'bee venom']",,
1,916601,01-01-2021,CA,73.0,73.0,,F,,Approximately 30 min post vaccination administ...,,...,2,01-01-2021,,Y,,"""Dairy""",['residing nursing facility . patients chart .'],"[""`` dairy ''""]",['residing nursing facility . patients chart .'],
2,916602,01-01-2021,WA,23.0,23.0,,F,,"About 15 minutes after receiving the vaccine, ...",,...,2,01-01-2021,,,Y,Shellfish,,['shellfish'],,
3,916603,01-01-2021,WA,58.0,58.0,,F,,"extreme fatigue, dizziness,. could not lift my...",,...,2,01-01-2021,,,,"Diclofenac, novacaine, lidocaine, pickles, tom...",,"['diclofenac', 'novacaine', 'lidocaine', 'pick...","['diverticulitis', 'mitral valve prolapse', 'o...",
4,916604,01-01-2021,TX,47.0,47.0,,F,,"Injection site swelling, redness, warm to the ...",,...,2,01-01-2021,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444314,1553408,08-13-2021,CT,,78.0,,M,,Very very full/Very stuffed; Not feeling the g...,,...,2,08-10-2021,,,,,['vitamins nos'],,['medical history/concurrent conditions bypass...,
444315,1553414,08-13-2021,IL,,,,M,,Real puffy; Tuesday night shoulder was terribl...,,...,2,08-10-2021,,,,,,,,
444316,1553416,08-13-2021,VA,81.0,81.0,,F,,Right arm at the injection site got pink; Righ...,,...,2,08-10-2021,,,,,['losartan'],,['medical history/concurrent conditions macula...,
444317,1553436,08-13-2021,PR,19.0,19.0,,M,,Patient goes to emergency room with chest pain...,,...,2,08-13-2021,,,Y,Penicillin,,['penicillin'],,
