In [1]:
#__________________________________________________________________________________________________________________

In [2]:
# -*- coding: utf-8 -*-
"""
word2vec embeddings start with a line with the number of lines (tokens?) and 
the number of dimensions of the file. This allows gensim to allocate memory 
accordingly for querying the model. Larger dimensions mean larger memory is 
held captive. Accordingly, this line has to be inserted into the GloVe 
embeddings file.
"""

import os
import shutil
import smart_open
from sys import platform

import gensim


def prepend_line(infile, outfile, line):
    """ 
    Function use to prepend lines using bash utilities in Linux. 
    (source: http://stackoverflow.com/a/10850588/610569)
    """
    with open(infile, 'r', encoding="utf8") as old:
        with open(outfile, 'w', encoding="utf8") as new:
            new.write(str(line) + "\n")
            shutil.copyfileobj(old, new)

def prepend_slow(infile, outfile, line):
    """
    Slower way to prepend the line by re-creating the inputfile.
    """
    with open(infile, 'r', encoding="utf8") as fin:
        with open(outfile, 'w', encoding="utf8") as fout:
            fout.write(line + "\n")
            for line in fin:
                fout.write(line)

def get_lines(glove_file_name):
    """Return the number of vectors and dimensions in a file in GloVe format."""
    with smart_open.smart_open(glove_file_name, 'r', encoding="utf8") as f:
        num_lines = sum(1 for line in f)
    with smart_open.smart_open(glove_file_name, 'r', encoding="utf8") as f:
        num_dims = len(f.readline().split()) - 1
    return num_lines, num_dims

# Input: GloVe Model File
# More models can be downloaded from http://nlp.stanford.edu/projects/glove/
#glove_file="glove.6B.300d.txt"
glove_file=r'glove.42B.300d.txt'

num_lines, dims = get_lines(glove_file)

# Output: Gensim Model text format.
gensim_file='glove_model2.txt'
gensim_first_line = "{} {}".format(num_lines, dims)

# Prepends the line.
if platform == "linux" or platform == "linux2":
    prepend_line(glove_file, gensim_file, gensim_first_line)
else:
    prepend_slow(glove_file, gensim_file, gensim_first_line)

# Demo: Loads the newly created glove_model.txt into gensim API.




In [3]:
model=gensim.models.KeyedVectors.load_word2vec_format(gensim_file,binary=False) #GloVe Model

In [4]:
try:
    word="elephant"
    word=word.lower()
    print(model.most_similar(positive=[word], topn=20))
    
except:
    print("not found")
#print(model.similarity('woman', 'man'))

not found


In [5]:
print(model.similarity('paracetamol', 'ibuprofen'))
print(model.similarity('crocin', 'paracetamol'))
print(model.similarity('banana', 'energy'))

0.7885133
0.262924
0.23695178


In [6]:
print(model["paracetamol"])

[ 0.057777  -0.065165   0.46952    0.074488   0.45392   -0.40041
 -0.39756    0.35879    0.32403   -0.51244    0.10301    0.24029
 -0.74578    0.10445   -0.63752    0.47939    0.51449    0.58795
 -0.68408    0.31802    0.076313  -0.29599   -0.38381   -0.062685
  0.48119   -0.41661    0.52723   -0.80899    0.30443   -0.022983
  0.090743  -0.35872   -0.11547   -0.54067    0.32223   -0.31708
 -0.26967   -0.58827    0.030474  -0.42215   -0.6473    -0.24239
 -0.66428   -0.71383   -0.056201   0.13471    0.17543    0.2095
  0.28686   -0.20854   -0.36009    0.5571     0.031856  -0.29795
 -0.022248   0.37339    0.010434   0.017707  -0.06727   -0.69088
 -0.5294    -0.29336   -0.74367    0.16231   -0.32833    0.3725
  0.60718    0.0039399 -0.40087   -0.59991    0.089434   0.35092
  0.64336    0.53996    0.6468     0.16951    0.26801   -0.05911
 -0.74138    0.13343    0.25689   -0.058544   0.22232   -0.022667
  0.37799    0.43947   -0.39901    0.53077   -0.83845    0.49801
  0.55239   -0.11924    

In [7]:
print(model[0])

[ 1.8378e-01 -1.2123e-01 -1.1987e-01  1.5227e-02 -1.9121e-01 -6.6074e-02
 -2.9876e+00  8.0795e-01  6.7338e-02 -1.3184e-01 -5.2740e-01  4.4521e-01
  1.2982e-01 -2.1823e-01 -4.5080e-01 -2.2478e-01 -3.0766e-01 -1.1137e-01
 -1.6200e-01 -2.1294e-01 -4.6022e-01 -8.6593e-02 -2.4902e-01  4.6729e-01
 -6.0230e-01 -4.4972e-01  4.3946e-01  1.4738e-02  2.7498e-01 -7.8421e-02
  3.6009e-01  1.2172e-01  4.2980e-01 -5.5345e-02  4.4950e-01 -7.4444e-01
 -2.6702e-01  1.6431e-01 -1.9335e-01  1.3468e-01  2.8870e-01  2.3924e-01
 -2.3579e-01 -2.8972e-01  2.0149e-01  4.8135e-02 -1.8322e-01 -1.5492e-01
 -1.9255e-01  4.0271e-01  1.6051e-01  1.7721e-01  3.2557e-01  1.1625e-02
 -4.2572e-01  3.4205e-01 -4.5865e-01 -2.4860e-01  3.4128e-02  3.3060e-02
 -5.7065e-02  1.8136e-01 -4.3638e-01  5.7090e-04 -1.1935e-01 -2.1950e-01
  1.6429e-01 -1.8119e-01 -1.9145e-01 -8.1672e-02 -2.9620e-01  2.5803e-01
  7.3848e-02  5.4213e-01 -1.5405e-01 -4.9256e-01  9.1719e-02  1.3329e-01
 -5.2530e-02 -2.0518e-01  3.4576e-01 -1.0449e+00  7

In [8]:
print(v1.shape)
print(v2.shape)

NameError: name 'v1' is not defined

In [None]:
print("hi")

hi


In [9]:
#get avg of para and ibu
vector= (model["paracetamol"] + model["ibuprofen"])/2

#fond the words most similar to both ibu and para (midpoint of ibu and para)
model.similar_by_vector(vector, topn=11, restrict_vocab=None)

[('ibuprofen', 0.950954258441925),
 ('paracetamol', 0.9400843977928162),
 ('acetaminophen', 0.8674831986427307),
 ('tylenol', 0.788176953792572),
 ('aspirin', 0.7668460607528687),
 ('naproxen', 0.76459139585495),
 ('motrin', 0.7616320252418518),
 ('advil', 0.7219496965408325),
 ('nsaids', 0.7087039351463318),
 ('codeine', 0.7057143449783325),
 ('aleve', 0.6704925298690796)]

In [None]:
######################################################################################################################

In [10]:
import pandas as pd
import numpy as np

In [11]:
df=pd.read_csv("filtered_cols.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [12]:
slice_len = int(0.1*len(df))
print(slice_len)

df_slice1=df[0 : slice_len ]
len(df_slice1)

44431


44431

In [13]:
df_slice1.head()

Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DIED,...,SPLTTYPE,FORM_VERS,TODAYS_DATE,BIRTH_DEFECT,OFC_VISIT,ER_ED_VISIT,ALLERGIES,other_meds_filtered,allergies_filtered,history_filtered
0,916600,01-01-2021,TX,33.0,33.0,,F,,Right side of epiglottis swelled up and hinder...,,...,,2,01-01-2021,,Y,,Pcn and bee venom,,"['pcn', 'bee venom']",
1,916601,01-01-2021,CA,73.0,73.0,,F,,Approximately 30 min post vaccination administ...,,...,,2,01-01-2021,,Y,,"""Dairy""",['residing nursing facility . patients chart .'],"[""`` dairy ''""]",['residing nursing facility . patients chart .']
2,916602,01-01-2021,WA,23.0,23.0,,F,,"About 15 minutes after receiving the vaccine, ...",,...,,2,01-01-2021,,,Y,Shellfish,,['shellfish'],
3,916603,01-01-2021,WA,58.0,58.0,,F,,"extreme fatigue, dizziness,. could not lift my...",,...,,2,01-01-2021,,,,"Diclofenac, novacaine, lidocaine, pickles, tom...",,"['diclofenac', 'novacaine', 'lidocaine', 'pick...","['diverticulitis', 'mitral valve prolapse', 'o..."
4,916604,01-01-2021,TX,47.0,47.0,,F,,"Injection site swelling, redness, warm to the ...",,...,,2,01-01-2021,,,,,,,


In [14]:
df_slice1.to_csv(r"vecs.csv")

In [None]:
####################################################################

In [15]:

dfs=df_slice1
len(dfs)

44431

In [23]:
def vector_avg(vec_list):
    print("len of vec list in vector_avg",len(vec_list))
    if len(vec_list)!=0:
        return sum(vec_list)/len(vec_list)
    else:
        print("empty list")
    #return np.mean(vec_list)


def get_vector(med):   #"kirk vit"
    split_med=med.split()   #["kirk","vit"]
    
    vec_list=[]
    for item in split_med:
        try:
            vec_list.append(model[item])  #converting each word in the medicine name to a 
        except:
            print("itemmmmmmmmm ",item, len(item))
            print("exception!!")
            continue
        
        
        '''if model[item]:
            vec_list.append(model[item])  #converting each word in the medicine name to a 
        else:
            continue'''
        
    #calc vector avg of all words in medicine name
    print(len(vec_list),vec_list)
    vector=vector_avg(vec_list)
    return vector
    
    
def get_similarity(vector):
    sim=model.similar_by_vector(vector, topn=11, restrict_vocab=None)
    return sim
    

    
#_--------------------------------------------------------------------------------------------
'''    
#get avg of para and ibu
vector= (model["paracetamol"] + model["ibuprofen"])/2

#fond the words most similar to both ibu and para (midpoint of ibu and para)
model.similar_by_vector(vector, topn=11, restrict_vocab=None)
'''

v=get_vector("kirkland multivitamin")
# print(len(v), v)
# avg=vector_avg(v)
# s=get_similarity(v)

# print(v)
 

2 [array([ 8.1622e-01,  4.5686e-01, -1.5478e-02, -2.9026e-01,  2.6184e-02,
        1.3996e-01,  2.3389e-01, -5.0764e-01, -1.8106e-01, -2.2790e-01,
       -4.5547e-02,  2.3326e-01, -2.2971e-01,  7.7212e-03, -1.7437e-01,
       -3.0994e-01, -1.9378e-01,  2.9733e-01,  2.3628e-01, -4.2627e-01,
       -2.2908e-01, -1.9814e-01, -4.6180e-01, -2.8569e-01,  2.4981e-01,
       -8.8686e-02,  2.0597e-01, -8.2386e-02,  4.5985e-01, -4.5648e-01,
       -2.5694e-01,  2.3182e-01, -9.6735e-02,  3.6997e-01,  2.7062e-02,
        2.4928e-01, -1.8081e-01,  5.5592e-02,  2.0340e-01, -2.8519e-03,
       -4.3224e-01,  2.9709e-02,  3.2677e-01, -5.5606e-01,  5.4551e-01,
        3.1817e-01,  1.6103e-01, -1.4200e-02,  3.1019e-01,  5.6615e-01,
        6.1689e-02, -2.4332e-01, -3.3956e-01, -5.8627e-01,  3.9618e-01,
        2.3468e-01,  3.0587e-01,  4.5858e-01, -6.9215e-01, -1.9803e-01,
       -9.0520e-02, -1.3074e-01, -1.3347e-01, -1.4577e-01, -1.2897e-01,
        2.6453e-01, -1.8585e-02, -8.9045e-02, -7.3477e-01, -5

In [24]:
dfs["meds_vector"] = ""

for i,j in dfs["other_meds_filtered"].iteritems():
    #j = "['kirkland multivitamin', 'kirkland calcium vitamin', 'vitamin d', 'fish oil']"

    if not pd.isna(j):
        print("j ka maal",j)

        meds= j[1:-1].split(", ")
        meds=[x.strip("'") for x in meds]
        #print(meds)    #['kirkland multivitamin', 'kirkland calcium vitamin', 'vitamin d', 'fish oil']


        #list of vectors of the medicine names in each row
        #[v(kirkland multivitamin), v(kirkland calcium vitamin), v(vitamin d), v(fish oil)]
        vec_list=[]  

        

        for item in meds:
            v=get_vector(item)
            vec_list.append(v)


        #avg of all med vectors in a row (for a user/entry)
        meds_avg=vector_avg(vec_list)
        meds_avg=(list(meds_avg))
        dfs["meds_vector"].loc[i]= meds_avg

        #print(meds_avg)
    else:
        print("maal", i)

            

maal 0
j ka maal ['residing nursing facility . patients chart .']
7 [array([-2.1064e-01,  1.8325e-01, -2.0508e-01, -1.1611e-01, -1.0890e-01,
       -1.4002e-01, -1.7231e+00, -1.3783e-01,  1.9281e-01,  1.0687e+00,
       -5.3759e-02, -1.7653e-01,  4.0949e-01, -3.4732e-01, -1.4624e-01,
       -1.7634e-01,  2.7534e-01,  4.8917e-02, -1.5345e-01,  2.2464e-01,
       -2.3529e-01,  1.4528e-02, -1.1577e-01, -5.1340e-02,  1.0447e-01,
       -2.2284e-01, -9.8540e-02, -1.7416e-01, -6.2839e-02, -5.4830e-01,
       -2.6338e-01,  3.9396e-01, -2.2260e-02,  6.1233e-02, -3.3078e-01,
       -1.3278e-01, -9.2612e-03, -6.6711e-03, -3.6506e-02, -3.0020e-01,
        4.3845e-01, -3.8654e-01,  1.5631e-01, -2.2729e-02, -4.1961e-01,
        4.1579e-03,  5.6667e-01, -1.2049e-01, -1.2376e-01,  1.6814e-01,
        5.5886e-02,  3.7579e-01,  8.7498e-04, -4.1596e-02,  9.4462e-02,
        4.2468e-01,  3.1664e-01, -2.3990e-01, -4.3171e-01,  5.4600e-02,
        2.5284e-01,  4.2322e-01,  2.1754e-01, -6.1097e-01,  1.0940e

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


TypeError: unsupported operand type(s) for +: 'float' and 'NoneType'

In [None]:
dfs["meds_vector"]

1        [0.13214307, 0.29208905, -0.017764108, 0.22982...
5        [0.15177006, 0.19313937, -0.032748237, 0.20284...
7        [0.15571064, 0.26085582, 0.040806342, 0.234450...
9        [0.21023898, 0.21132958, 0.093709856, 0.272576...
11       [0.15030146, 0.3432966, 0.081075, 0.3176437, 0...
                               ...                        
44425                                                     
44427                                                     
44428                                                     
44429                                                     
44430                                                     
Name: meds_vector, Length: 22666, dtype: object

In [None]:
dfs = dfs[dfs['other_meds_filtered'].notna()]

dfs["meds_vector"].iloc[22000]

''

In [None]:
small_df=dfs

small_df["DIED"]=small_df["DIED"].replace(np.nan, 0)
small_df["L_THREAT"]=small_df["L_THREAT"].replace(np.nan, 0)
small_df["ER_VISIT"]=small_df["ER_VISIT"].replace(np.nan, 0)
small_df["HOSPITAL"]=small_df["HOSPITAL"].replace(np.nan, 0)
small_df["X_STAY"]=small_df["X_STAY"].replace(np.nan, 0)
small_df["DISABLE"]=small_df["DISABLE"].replace(np.nan, 0)

small_df=small_df.replace("Y",1)

def risk_score(row):
    if row["DIED"] or row["L_THREAT"] or row["ER_VISIT"] or row["HOSPITAL"] or row["X_STAY"] or row["DISABLE"]:
        return 1
    else:
        return 0

small_df["risk"]=small_df.apply(risk_score, axis=1)

sex_cols=pd.get_dummies(small_df.SEX)

big_df=pd.concat([small_df,sex_cols], axis=1)


In [None]:
big_df=big_df.drop(columns=['RECVDATE','STATE','CAGE_YR', 'CAGE_MO', 'SEX',
       'RPT_DATE', 'SYMPTOM_TEXT', 'DIED', 'DATEDIED', 'L_THREAT', 'ER_VISIT',
       'HOSPITAL', 'HOSPDAYS', 'X_STAY', 'DISABLE', 'RECOVD', 'VAX_DATE',
       'ONSET_DATE', 'NUMDAYS', 'LAB_DATA', 'V_ADMINBY', 'V_FUNDBY',
       'OTHER_MEDS', 'CUR_ILL', 'HISTORY', 'PRIOR_VAX', 'SPLTTYPE',
       'FORM_VERS', 'TODAYS_DATE', 'BIRTH_DEFECT', 'OFC_VISIT', 'ER_ED_VISIT',
       'ALLERGIES', 'other_meds_filtered', 'allergies_filtered',
       'history_filtered'],axis=1)
big_df.columns

Index(['VAERS_ID', 'AGE_YRS', 'meds_vector', 'med_vec_list', 'risk', 'F', 'M'], dtype='object')

In [None]:
vax=pd.read_csv("2021VAERSVAX_clean3.csv")
vax=vax.drop(columns=["VAX_TYPE","VAX_MANU","VAX_LOT","VAX_DOSE_SERIES","VAX_SITE"], axis=1)
merdf=pd.merge(big_df, vax, how="inner",on="VAERS_ID")

merdf=merdf[(merdf["VAX_NAME"]=="COVID19 (COVID19 (MODERNA))") | (merdf["VAX_NAME"]=="COVID19 (COVID19 (PFIZER-BIONTECH))") | (merdf["VAX_NAME"]=="COVID19 (COVID19 (JANSSEN))")]
merdf=merdf[0:600]
merdf

Unnamed: 0,VAERS_ID,AGE_YRS,meds_vector,med_vec_list,risk,F,M,VAX_NAME
0,916601,73.0,"[0.13214307, 0.29208905, -0.017764108, 0.22982...","[0.13214307, 0.29208905, -0.017764108, 0.22982...",0,1,0,COVID19 (COVID19 (MODERNA))
2,916607,50.0,"[0.15571064, 0.26085582, 0.040806342, 0.234450...","[0.15571064, 0.26085582, 0.040806342, 0.234450...",0,0,1,COVID19 (COVID19 (MODERNA))
3,916609,71.0,"[0.21023898, 0.21132958, 0.093709856, 0.272576...","[0.21023898, 0.21132958, 0.093709856, 0.272576...",0,1,0,COVID19 (COVID19 (MODERNA))
4,916611,33.0,"[0.15030146, 0.3432966, 0.081075, 0.3176437, 0...","[0.15030146, 0.3432966, 0.081075, 0.3176437, 0...",0,1,0,COVID19 (COVID19 (MODERNA))
5,916612,71.0,"[0.1926234, 0.22672129, 0.14428662, 0.26885587...","[0.1926234, 0.22672129, 0.14428662, 0.26885587...",0,1,0,COVID19 (COVID19 (MODERNA))
...,...,...,...,...,...,...,...,...
604,917593,79.0,"[0.13920067, 0.27728057, 0.05038308, 0.299194,...","[0.13920067, 0.27728057, 0.05038308, 0.299194,...",0,1,0,COVID19 (COVID19 (MODERNA))
605,917594,52.0,"[0.1379336, 0.27807236, 0.027472204, 0.2662143...","[0.1379336, 0.27807236, 0.027472204, 0.2662143...",0,1,0,COVID19 (COVID19 (PFIZER-BIONTECH))
606,917595,31.0,"[0.06718363, 0.3217772, -0.12017094, 0.1964165...","[0.06718363, 0.3217772, -0.12017094, 0.1964165...",0,1,0,COVID19 (COVID19 (PFIZER-BIONTECH))
607,917597,39.0,"[0.12360231, 0.345573, 0.0315716, 0.3244145, 0...","[0.12360231, 0.345573, 0.0315716, 0.3244145, 0...",0,1,0,COVID19 (COVID19 (MODERNA))


In [None]:
finaldf=merdf.drop(columns=['VAERS_ID','med_vec_list'], axis=1)
finaldf.to_csv("finall.csv")

In [None]:
big_df.to_csv("afterw2v.csv")

In [None]:
def li(i):
    return list(i)

In [None]:
dfs["med_vec_list"]=dfs['meds_vector'].apply(lambda x:li(x))

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
dfs['med_vec_list']

1        [0.13214307, 0.29208905, -0.017764108, 0.22982...
5        [0.15177006, 0.19313937, -0.032748237, 0.20284...
7        [0.15571064, 0.26085582, 0.040806342, 0.234450...
9        [0.21023898, 0.21132958, 0.093709856, 0.272576...
11       [0.15030146, 0.3432966, 0.081075, 0.3176437, 0...
                               ...                        
44425                                                   []
44427                                                   []
44428                                                   []
44429                                                   []
44430                                                   []
Name: med_vec_list, Length: 22666, dtype: object

In [None]:
for i in dfs['med_vec_list']:
    
   
    if len(i)>0:
        print(type(i[1]), i[0])
    print(type(i))

<class 'numpy.float32'> 0.13214307
<class 'list'>
<class 'numpy.float32'> 0.15177006
<class 'list'>
<class 'numpy.float32'> 0.15571064
<class 'list'>
<class 'numpy.float32'> 0.21023898
<class 'list'>
<class 'numpy.float32'> 0.15030146
<class 'list'>
<class 'numpy.float32'> 0.1926234
<class 'list'>
<class 'numpy.float32'> 0.2056778
<class 'list'>
<class 'numpy.float32'> 0.14350978
<class 'list'>
<class 'numpy.float32'> 0.15260942
<class 'list'>
<class 'numpy.float32'> 0.112639114
<class 'list'>
<class 'numpy.float32'> 0.09308522
<class 'list'>
<class 'numpy.float32'> 0.14290318
<class 'list'>
<class 'numpy.float32'> 0.12914723
<class 'list'>
<class 'numpy.float32'> 0.11108448
<class 'list'>
<class 'numpy.float32'> 0.11294017
<class 'list'>
<class 'numpy.float32'> 0.18161567
<class 'list'>
<class 'numpy.float32'> 0.15982562
<class 'list'>
<class 'numpy.float32'> 0.1488475
<class 'list'>
<class 'numpy.float32'> 0.14750631
<class 'list'>
<class 'numpy.float32'> 0.10876
<class 'list'>
<clas

In [None]:
type(dfs['med_vec_list'])


pandas.core.series.Series

In [None]:
from scipy import sparse

In [None]:
=dfs['med_vec_list'].apply(lambda x:spmat(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
print(dfs["spm_meds"][36])

  (0, 0)	0.12639941
  (0, 1)	0.26741454
  (0, 2)	0.057454064
  (0, 3)	0.21737182
  (0, 4)	0.3728368
  (0, 5)	0.12257677
  (0, 6)	-2.359727
  (0, 7)	0.42046157
  (0, 8)	0.3363525
  (0, 9)	-0.31790915
  (0, 10)	0.34768817
  (0, 11)	0.359756
  (0, 12)	0.025754333
  (0, 13)	-0.1507298
  (0, 14)	-0.23550992
  (0, 15)	-0.20812528
  (0, 16)	-0.011508589
  (0, 17)	-0.15946065
  (0, 18)	0.24542473
  (0, 19)	-0.31271842
  (0, 20)	0.4531549
  (0, 21)	-0.1072404
  (0, 22)	-0.11615706
  (0, 23)	0.33994198
  (0, 24)	-0.2903546
  :	:
  (0, 275)	-0.06270593
  (0, 276)	0.14658089
  (0, 277)	0.17889857
  (0, 278)	0.3713663
  (0, 279)	-0.15917946
  (0, 280)	-0.015718598
  (0, 281)	0.037189234
  (0, 282)	0.02221403
  (0, 283)	0.20061421
  (0, 284)	0.18277614
  (0, 285)	-0.0035479683
  (0, 286)	-0.13708554
  (0, 287)	0.18316329
  (0, 288)	0.054588
  (0, 289)	-0.028049434
  (0, 290)	0.054924995
  (0, 291)	-0.045850758
  (0, 292)	-0.12475854
  (0, 293)	0.28018466
  (0, 294)	0.05704054
  (0, 295)	-0.07794879


In [None]:
dfs.to_csv(r"dfs_list.csv")

In [None]:
moderna=pd.read_csv("moderna.csv", encoding="ISO-8859-1")

In [None]:
moderna.drop(columns=['meds_vector'])

Unnamed: 0.1,Unnamed: 0,VAERS_ID,AGE_YRS,DIED,L_THREAT,ER_VISIT,HOSPITAL,BIRTH_DEFECT,DISABLE,VAX_NAME,SEX_F,SEX_M,risk
0,1,916601,73.0,0,0,0,0,0,0,1,1,0,0.0
1,6,916607,50.0,0,0,0,0,0,0,1,0,1,0.0
2,8,916609,71.0,0,0,0,0,0,0,1,1,0,0.0
3,10,916611,33.0,0,0,0,0,0,0,1,1,0,0.0
4,11,916612,71.0,0,0,0,0,0,0,1,1,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
434,926,917619,36.0,0,0,0,0,0,0,1,1,0,0.0
435,927,917620,29.0,0,0,0,0,0,0,1,1,0,0.0
436,928,917621,22.0,0,0,0,0,0,0,1,0,1,0.0
437,929,917622,54.0,0,0,0,0,0,0,1,1,0,0.0


In [None]:
dfinal=pd.merge(dfs,moderna,on='VAERS_ID',how="inner")
dfinal.head()

Unnamed: 0.1,VAERS_ID,RECVDATE,STATE,AGE_YRS_x,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DIED_x,DATEDIED,L_THREAT_x,ER_VISIT_x,HOSPITAL_x,HOSPDAYS,X_STAY,DISABLE_x,RECOVD,VAX_DATE,ONSET_DATE,NUMDAYS,LAB_DATA,V_ADMINBY,V_FUNDBY,OTHER_MEDS,CUR_ILL,HISTORY,PRIOR_VAX,SPLTTYPE,FORM_VERS,TODAYS_DATE,BIRTH_DEFECT_x,OFC_VISIT,ER_ED_VISIT,ALLERGIES,other_meds_filtered,allergies_filtered,history_filtered,meds_vector_x,med_vec_list,spm_meds,Unnamed: 0,meds_vector_y,AGE_YRS_y,DIED_y,L_THREAT_y,ER_VISIT_y,HOSPITAL_y,BIRTH_DEFECT_y,DISABLE_y,VAX_NAME,SEX_F,SEX_M,risk
0,916601,01-01-2021,CA,73.0,73.0,,F,,Approximately 30 min post vaccination administ...,,,,,,,,,Y,12-31-2020,12-31-2020,0.0,,SEN,,Patient residing at nursing facility. See pati...,Patient residing at nursing facility. See pati...,Patient residing at nursing facility. See pati...,,,2,01-01-2021,,Y,,"""Dairy""",['residing nursing facility . patients chart .'],"[""`` dairy ''""]",['residing nursing facility . patients chart .'],"[0.13214307, 0.29208905, -0.017764108, 0.22982...","[0.13214307, 0.29208905, -0.017764108, 0.22982...","(0, 0)\t0.13214307\n (0, 1)\t0.29208905\n ...",1,[ 1.32143065e-01 2.92089045e-01 -1.77641083e-...,73.0,0,0,0,0,0,0,1,1,0,0.0
1,916607,01-01-2021,KS,50.0,50.0,,M,,SEVERE chills approximately 13-14 hours after ...,,,,,,,,,Y,12-28-2020,12-29-2020,1.0,,PUB,,"Amlodipine, Ambien, Benicar/HCTZ, Invokana, Me...",,"High blood pressure, high cholesterol, sleep a...",,,2,01-01-2021,,,,Penicillin,"['amlodipine', 'ambien', 'benicar/hctz', 'invo...",['penicillin'],"['high blood pressure', 'high cholesterol', 's...","[0.15571064, 0.26085582, 0.040806342, 0.234450...","[0.15571064, 0.26085582, 0.040806342, 0.234450...","(0, 0)\t0.15571064\n (0, 1)\t0.26085582\n ...",6,[ 1.55710638e-01 2.60855824e-01 4.08063419e-...,50.0,0,0,0,0,0,0,1,0,1,0.0
2,916609,01-01-2021,TN,71.0,71.0,,F,,On day 9 following the vaccination I noticed a...,,,,,,,,,N,12-23-2020,12-31-2020,8.0,,PUB,,Medication Summary 1/1/21 Name of Medica...,,"Hashimoto's thyroiditis, Hypertension, depression",,,2,01-01-2021,,,,"Sulfa antibiotics, azithromycin, adhesive in ...",['medication summary // medication rx otc dose...,"['sulfa antibiotics', 'azithromycin', 'adhesiv...","[""hashimoto 's thyroiditis"", 'hypertension', '...","[0.21023898, 0.21132958, 0.093709856, 0.272576...","[0.21023898, 0.21132958, 0.093709856, 0.272576...","(0, 0)\t0.21023898\n (0, 1)\t0.21132958\n ...",8,[ 2.10238978e-01 2.11329579e-01 9.37098563e-...,71.0,0,0,0,0,0,0,1,1,0,0.0
3,916611,01-01-2021,NC,33.0,33.0,,F,,12pm Received shot 1 pm Sore arm at injection ...,,,,,,,,,N,12-29-2020,12-29-2020,0.0,12-29-20 (at 12:30pm before I started feeling...,OTH,,Flonase Zyrtec,Covid-19 (symptom onset 12-16-20 negative test...,Obesity Anxiety,,,2,01-01-2021,,Y,,Dust mites Zoloft Wellbutrin Buspar,['flonase zyrtec'],['dust mites zoloft wellbutrin buspar'],['obesity anxiety'],"[0.15030146, 0.3432966, 0.081075, 0.3176437, 0...","[0.15030146, 0.3432966, 0.081075, 0.3176437, 0...","(0, 0)\t0.15030146\n (0, 1)\t0.3432966\n (...",10,[ 1.50301456e-01 3.43296587e-01 8.10749978e-...,33.0,0,0,0,0,0,0,1,1,0,0.0
4,916612,01-01-2021,CA,71.0,71.0,,F,,"Left side of face became numb, including to be...",,,,,,,,,U,12-30-2020,12-30-2020,0.0,None yet,PVT,,"levothyroxine 100mcg/day, estradiol 1mg/day",,Graves Disease,,,2,01-01-2021,,,,"penicillin, toradol, methimazole","['levothyroxine', 'estradiol']","['penicillin', 'toradol', 'methimazole']",['graves disease'],"[0.1926234, 0.22672129, 0.14428662, 0.26885587...","[0.1926234, 0.22672129, 0.14428662, 0.26885587...","(0, 0)\t0.1926234\n (0, 1)\t0.22672129\n (...",11,[ 0.1926234 0.22672129 0.14428662 0.268855...,71.0,0,0,0,0,0,0,1,1,0,0.0


In [None]:
dfinal=dfinal.drop(columns=['RECVDATE','SEX','STATE',"CAGE_YR",'CAGE_MO','RPT_DATE','SYMPTOM_TEXT','DIED_x','DATEDIED','L_THREAT_x','ER_VISIT_x','HOSPITAL_x','HOSPDAYS','X_STAY','DISABLE_x',	'RECOVD',	'VAX_DATE',	'ONSET_DATE',	'NUMDAYS',	'LAB_DATA',	'V_ADMINBY',	'V_FUNDBY',	'OTHER_MEDS',	'CUR_ILL',	'HISTORY',	'PRIOR_VAX',	'SPLTTYPE',	'FORM_VERS',	'TODAYS_DATE',	'BIRTH_DEFECT_x',	'OFC_VISIT',	'ER_ED_VISIT',	'ALLERGIES',	'other_meds_filtered',	'allergies_filtered',	'history_filtered',	'meds_vector_x',	'spm_meds','Unnamed: 0',	'meds_vector_y',	'AGE_YRS_y'	,'DIED_y'	,'L_THREAT_y',	'ER_VISIT_y',	'HOSPITAL_y'	])

In [None]:
dfinal

NameError: name 'dfinal' is not defined

In [None]:
dfinal['risk'].unique()

array([0., 7., 3., 4., 5.])

In [None]:
X=dfinal[['AGE_YRS_x','BIRTH_DEFECT_y','DISABLE_y','SEX_F', 'SEX_M']]
X_sp=sparse.csr_matrix(X)
y=dfinal[['risk']]

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
X_train

Unnamed: 0,AGE_YRS_x,BIRTH_DEFECT_y,DISABLE_y,SEX_F,SEX_M
72,56.0,0,0,1,0
282,68.0,0,0,1,0
17,44.0,0,0,1,0
162,44.0,0,0,0,1
200,28.0,0,0,1,0
...,...,...,...,...,...
247,74.0,0,0,1,0
244,66.0,0,0,1,0
177,40.0,0,0,1,0
283,66.0,0,0,1,0


In [None]:
clf = MultinomialNB()
clf.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
y_predict=clf.predict(X_test)

from sklearn.metrics import classification_report

print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99        87
         4.0       0.00      0.00      0.00         1

    accuracy                           0.99        88
   macro avg       0.49      0.50      0.50        88
weighted avg       0.98      0.99      0.98        88



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Santosh---------------
dfinal

NameError: name 'dfinal' is not defined