In [2]:
import numpy as np
import pandas as pd
import math
import base64


In [78]:
# IMPORT DATA: nrows limited to 10,000 to save memory; remove argument to load full datasets
data = pd.read_csv("data/2021VAERSDATA.csv", nrows=10000)
sym = pd.read_csv("data/2021VAERSSYMPTOMS.csv", nrows=10000)
vax = pd.read_csv("data/2021VAERSVAX.csv", nrows=10000)

# PRELIMINARY CLEANING: dropping unnecessary columns and rows with incomplete values.
# PRELIMINARY CLEANING: vaersdata
data_drop = ["CAGE_YR", 
             "CAGE_MO", 
             "RPT_DATE", 
             "SYMPTOM_TEXT", 
             "DATEDIED", 
             "L_THREAT", 
             "ER_VISIT", 
             "HOSPITAL", 
             "HOSPDAYS", 
             "X_STAY", 
             "DISABLE", 
             "NUMDAYS", 
             "OTHER_MEDS", 
             "CUR_ILL", 
             "HISTORY", 
             "PRIOR_VAX", 
             "SPLTTYPE", 
             "TODAYS_DATE", 
             "BIRTH_DEFECT", 
             "ER_ED_VISIT", 
             "ALLERGIES", 
             "LAB_DATA", 
             "V_ADMINBY", 
             "V_FUNDBY",
             "FORM_VERS", 
             "OFC_VISIT", 
             "DIED", 
             "RECOVD"]
data = data.drop(data_drop, axis = 1)
# remove rows with blank input
data = data.replace(r'^\s*$', np.nan, regex = True)
data.dropna()
# remove rows with incomplete SEX values
data = data[(data["SEX"] != "U")]
# PRELIMINARY CLEANING: vaerssymptoms
sym_drop = ["SYMPTOMVERSION1", 
            "SYMPTOMVERSION2", 
            "SYMPTOMVERSION3", 
            "SYMPTOMVERSION4", 
            "SYMPTOMVERSION5"]
sym = sym.drop(sym_drop, axis = 1)
# PRELIMINARY CLEANING: vaersvax
vax_drop = ["VAX_ROUTE", 
            "VAX_SITE", 
            "VAX_LOT"]
vax = vax.drop(vax_drop, axis = 1)
# remove rows reporting non-COVID-19 related symptoms and unknown dosage series
# NOTE:  VAX_DOSE_SERIES reflects patients with a number of administered doses above 3
vax = vax[(vax["VAX_TYPE"] == "COVID19") & 
          (vax["VAX_DOSE_SERIES"] != "UNK") & 
          (vax["VAX_DOSE_SERIES"] != "N/A")]

# MERGING DATAFRAMES: merged dataframes assigned to variable "vaers"
vaers = pd.merge(vax, data)
vaers = pd.merge(vaers, sym)

# STATE FILTERING: New York, California, and Florida
vaers_filtered = vaers[(vaers["STATE"].isin(["CA", "FL", "NY"]))]

In [79]:
vaers_filtered

Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_DOSE_SERIES,VAX_NAME,RECVDATE,STATE,AGE_YRS,SEX,VAX_DATE,ONSET_DATE,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5
1,916601,COVID19,MODERNA,1,COVID19 (COVID19 (MODERNA)),01/01/2021,CA,73.0,F,12/31/2020,12/31/2020,Anxiety,Dyspnoea,,,
13,916612,COVID19,MODERNA,1,COVID19 (COVID19 (MODERNA)),01/01/2021,CA,71.0,F,12/30/2020,12/30/2020,Ear pain,Hypoaesthesia,,,
14,916613,COVID19,MODERNA,1,COVID19 (COVID19 (MODERNA)),01/01/2021,CA,40.0,F,12/30/2020,12/30/2020,Abdominal pain upper,Dizziness,Dysgeusia,,
15,916614,COVID19,MODERNA,1,COVID19 (COVID19 (MODERNA)),01/01/2021,NY,29.0,F,12/22/2020,12/22/2020,Blood pressure increased,Chest discomfort,Heart rate increased,,
16,916615,COVID19,MODERNA,1,COVID19 (COVID19 (MODERNA)),01/01/2021,NY,38.0,F,12/23/2020,12/31/2020,Injection site erythema,Injection site pruritus,Injection site swelling,Lymph node pain,Lymphadenopathy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7613,924532,COVID19,MODERNA,1,COVID19 (COVID19 (MODERNA)),01/06/2021,FL,59.0,F,01/04/2021,01/05/2021,Injection site erythema,Injection site mass,Injection site pain,Injection site pruritus,
7614,924533,COVID19,MODERNA,1,COVID19 (COVID19 (MODERNA)),01/06/2021,CA,30.0,F,01/05/2021,01/05/2021,Dizziness,Dysphagia,Headache,Heart rate increased,Hypertension
7615,924533,COVID19,MODERNA,1,COVID19 (COVID19 (MODERNA)),01/06/2021,CA,30.0,F,01/05/2021,01/05/2021,Nausea,Oropharyngeal pain,,,
7616,924534,COVID19,MODERNA,1,COVID19 (COVID19 (MODERNA)),01/06/2021,CA,29.0,F,12/31/2020,01/05/2021,Headache,Injection site rash,Pain,Pruritus,Tenderness
