Jupyter notebook for doing the interview task for UMC:

**Analyse the drug side-effect reports (def:_narratives_)**

- look at differences between COVID and non-COVID cases
- what can we say about the text?
- suggestions in assignment:
    - first person? why?
    - redundant narrative? why?
- self investigations
    - text lengths and distributions wrt cases -- classifier. Histogram as feature for a Naive Bayes
        - Statistics
    - What features make it first person?
        - Statistics
    - redundancy detection:
        - duplication detection regexp within narrative
        - duplication of information from other columns and data files?
        - fluff content vs. entropy content
    - vocabulary differences: word frequency distributions TF-IDF 
    - content itself: keywords?
    - timing of narrative wrt symptoms?
    - look at all possibilities and their relevant narratives: FN, FP, TP, TN
    - *Sentiment analysis:* TextBlob, Vader, LLM?
    - *Topic models with LDA*


In [5]:
import pickle
import pandas as pd
from IPython.display import display
import numpy as np

data_filename = "2020VAERSDATA"
vax_filename = "2020VAERSVAX"
symptoms_filename = "2020VAERSSYMPTOMS"

Read in data and display relevant columns 

In [8]:
with open(data_filename+".pickle", 'rb') as fh:
    df_data, cols_data = pickle.load(fh)
    display(df_data[cols_data].head(5))
print("\n\n")
with open(vax_filename+".pickle", 'rb') as fh:
    df_vax, cols_vax = pickle.load(fh)
    display(df_vax[cols_vax].head(5))
print("\n\n")
with open(symptoms_filename+".pickle", 'rb') as fh:
    df_symptoms, cols_symptoms = pickle.load(fh)
    display(df_symptoms[cols_symptoms].head(5))


Unnamed: 0,VAERS_ID,SYMPTOM_TEXT
0,855017,Symptoms occurred almost immediately- aching j...
1,855018,"Extreme pain, muscle weakness in right arm so ..."
2,855019,SORENESS IN THE AREA. ITCHING AND RASH
3,855020,"sore arm, fever 101 , chills, and aching musc..."
4,855021,Patient presented to Clinic today 1/1/2020. Sh...







Unnamed: 0,VAERS_ID,VAX_NAME
0,855017,ZOSTER (SHINGRIX)
1,855018,VACCINE NOT SPECIFIED (OTHER)
2,855019,ZOSTER (SHINGRIX)
3,855020,ZOSTER (SHINGRIX)
4,855021,ZOSTER (SHINGRIX)







Unnamed: 0,VAERS_ID,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5
0,855017,Arthralgia,Chills,Injection site pain,Pyrexia,
1,855018,Chills,Fatigue,Hypertension,Hypoaesthesia,Injected limb mobility decreased
2,855018,Muscular weakness,Pain in extremity,Pyrexia,Tremor,Vertigo
3,855019,Pain,Pruritus,Rash,,
4,855020,Chills,Influenza like illness,Myalgia,Pain in extremity,Pyrexia


Check the total number of patient entries:

In [23]:
# Data Check
print(df_data[cols_data[0]].nunique())
print(df_vax[cols_data[0]].nunique())
print(df_symptoms[cols_data[0]].nunique())

print(set(df_data["VAERS_ID"]).symmetric_difference(set(df_vax["VAERS_ID"])))
print(set(df_data["VAERS_ID"]).symmetric_difference(set(df_symptoms["VAERS_ID"])))

49688
49688
49688
set()
set()


In [25]:
# temp
ids = [1123, 1345, 2345, 8876]
names = ["Karaboudjan", "covid-19", "co vid", "COVID-19" ]
dict4df = {
    "VAERS_ID": ids,
    "VAERS_VAX":names
}
df_toy = pd.DataFrame(dict4df)
display(df_toy)

Unnamed: 0,VAERS_ID,VAERS_VAX
0,1123,Karaboudjan
1,1345,covid-19
2,2345,co vid
3,8876,COVID-19


In [70]:
# get all rows where column is related to covid
query_keys = ["covid", "cov"]
df_covid = df_toy[df_toy["VAERS_VAX"].str.replace(" ", "").str.contains('|'.join(query_keys), case=False, regex=True)]
display(df_covid)

Unnamed: 0,VAERS_ID,VAERS_VAX
1,1345,covid-19
2,2345,co vid
3,8876,COVID-19


In [78]:
df_covid = df_vax[df_vax[cols_vax[1]].str.replace(" ", "").str.contains('|'.join(query_keys), case=False, regex=True)]
display(df_covid)

Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_ROUTE,VAX_SITE,VAX_NAME
46482,902418,COVID19,PFIZER\BIONTECH,EH9899,1,IM,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
46501,902440,COVID19,PFIZER\BIONTECH,EH 9899,1,SYR,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
46508,902446,COVID19,PFIZER\BIONTECH,EH9899,1,IM,RA,COVID19 (COVID19 (PFIZER-BIONTECH))
46527,902464,COVID19,PFIZER\BIONTECH,EH9899,UNK,IM,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
46528,902465,COVID19,PFIZER\BIONTECH,EH9899,1,IM,RA,COVID19 (COVID19 (PFIZER-BIONTECH))
...,...,...,...,...,...,...,...,...
59624,918440,COVID19,UNKNOWN MANUFACTURER,,UNK,,,COVID19 (COVID19 (UNKNOWN))
59625,918441,COVID19,MODERNA,025J20-2A,1,IM,RA,COVID19 (COVID19 (MODERNA))
59626,918442,COVID19,UNKNOWN MANUFACTURER,,UNK,,,COVID19 (COVID19 (UNKNOWN))
59627,918503,COVID19,MODERNA,039K20A,1,IM,RA,COVID19 (COVID19 (MODERNA))


In [89]:
idx_start = 46480
idx_end = 46800
df_d = df_data.iloc[idx_start:idx_end]
df_v = df_vax.iloc[idx_start:idx_end]
display(df_d)
df_covid = df_v[df_v[cols_vax[1]].str.replace(" ", "").str.contains('|'.join(query_keys), case=False, regex=True)]
display(df_covid.shape)

Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DIED,...,CUR_ILL,HISTORY,PRIOR_VAX,SPLTTYPE,FORM_VERS,TODAYS_DATE,BIRTH_DEFECT,OFC_VISIT,ER_ED_VISIT,ALLERGIES
46480,913201,12/29/2020,AZ,58.0,58.0,,F,,"Pt experienced Weakness, dizziness, vomiting, ...",,...,,"HTN, Type 2 DM, Vit D Def.",,,2,12/29/2020,,,,NKDA
46481,913202,12/29/2020,AZ,37.0,37.0,,F,,Pt developed a rash about an 30mins to an hour...,,...,,,,,2,12/29/2020,,Y,,NKDA
46482,913206,12/29/2020,TX,49.0,49.0,,M,,Bell?s Palsey about 4.5 hours after injection.,,...,,"Diabetes, Hypertension, High Cholesterol",,,2,12/29/2020,,,Y,
46483,913207,12/29/2020,OR,20.0,20.0,,F,,First about 15 minutes after I received the va...,,...,,,,,2,12/29/2020,,,,
46484,913208,12/29/2020,UT,19.0,19.0,,F,,Chills and muscle aches two hours after the va...,,...,,Asthma and environmental (seasonal) allergies,,,2,12/29/2020,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46795,913493,12/30/2020,TN,39.0,39.0,,F,,Started having throat swelling approximately 5...,,...,,Mass Cell Activation Syndrome - was diagnosed ...,,,2,12/30/2020,,,,"Any foods high in histamine - wine, tomatoes, ..."
46796,913494,12/30/2020,TX,40.0,40.0,,M,,Patient started with sore arm the day of the v...,,...,none,Hypertension and heart disease,,,2,12/30/2020,,,,none
46797,913495,12/30/2020,CT,20.0,20.0,,F,,"Nausea, vomiting, vertigo, syncope, chills, he...",,...,,,,,2,12/30/2020,,,,Penicillin allergy
46798,913496,12/30/2020,NC,57.0,57.0,,F,,"dizziness, pounding headache for most of night...",,...,none,none,,,2,12/30/2020,,,,none


(46, 8)