Setup

In [None]:
# Install the packages via Pkg, one `Pkg.add` call per package name, in list order.
# NOTE(review): "Pandas" is installed here but not loaded by the visible `using`
# cell — confirm it is still needed.
import Pkg
foreach(Pkg.add, ["CSV", "DataFrames", "Pandas", "StatsBase"])

In [1]:
# load packages
using CSV;
using DataFrames;
using StatsBase;

In [3]:
# Set up the source and dev folder paths relative to `@__DIR__`.
# `joinpath` inserts the platform's own separator, so the same cell produces a valid
# path on Windows, Linux and macOS (a hard-coded "\\" separator only works on Windows).
path_source = joinpath(@__DIR__, "..", "source");
path_dev = joinpath(@__DIR__, "..", "dev");

Process VAERS symptoms file

In [4]:
# Read 19-21VAERSSYMPTOMS.csv from the dev folder into a DataFrame.
symptoms_csv_path = joinpath(path_dev, "19-21VAERSSYMPTOMS.csv")
df_symp_appended = CSV.read(symptoms_csv_path, DataFrame)

In [5]:
# Each row in VAERSSYMPTOMS.csv is limited to 5 MedDRA symptom terms.
# This means VAERS reports with > 5 symptoms will have multiple rows.
# Collapse the 5 MedDRA symptom columns so that we have a single column containing
# an array of symptoms for each row.

# Symptom term columns, addressed by name rather than by column position so the
# cell keeps working if the column order of the input table changes.
symptom_columns = [Symbol("SYMPTOM", i) for i in 1:5]

"""
    collapse_symptom_columns(df, cols)

Return a `Vector{Vector{String}}` with one entry per row of `df`. Each entry holds
that row's non-missing values from the columns `cols`, in the order given by `cols`.

A row whose `cols` are all `missing` yields an empty vector, so no row is dropped.
"""
function collapse_symptom_columns(df::AbstractDataFrame, cols)
    columns = [df[!, col] for col in cols]
    # zip walks the columns in lockstep, yielding one tuple of values per row;
    # skipmissing drops the unused symptom slots of that row.
    return [String[term for term in skipmissing(row_values)] for row_values in zip(columns...)]
end

# Build a fresh two-column table; df_symp_appended itself is left untouched.
test = DataFrame(
    VAERS_ID = df_symp_appended[!, :VAERS_ID],
    SYMPTOMS = collapse_symptom_columns(df_symp_appended, symptom_columns),
);

# sort by report ID
sort!(test, :VAERS_ID)

Unnamed: 0_level_0,VAERS_ID,SYMPTOMS
Unnamed: 0_level_1,Int64,Array…
1,794156,"[""Injected limb mobility decreased"", ""Injection site joint pain""]"
2,794157,"[""Apathy"", ""Arthralgia"", ""Asthenia"", ""Injection site erythema"", ""Injection site pain""]"
3,794157,"[""Injection site pruritus"", ""Injection site swelling"", ""Injection site warmth"", ""Listless"", ""Night sweats""]"
4,794158,"[""Chills"", ""Headache"", ""Nausea"", ""Pain"", ""Pyrexia""]"
5,794159,"[""Injection site erythema"", ""Injection site swelling"", ""Injection site warmth"", ""Pain""]"
6,794160,"[""Injection site swelling"", ""Lip blister"", ""Lip swelling"", ""Pain""]"
7,794160,"[""Asthenia"", ""Chills"", ""Fatigue"", ""Influenza like illness"", ""Injection site erythema""]"
8,794161,"[""Pyrexia""]"
9,794162,"[""Injection site erythema"", ""Injection site rash"", ""Injection site swelling"", ""Macule"", ""Rash papular""]"
10,794163,"[""Myalgia"", ""Nausea"", ""Pyrexia""]"


In [33]:
# Collapse the rows so that we get a dict of symptoms for each VAERS_ID.
# The same VAERS_ID can appear on several rows of `test`; the symptoms of all of
# those rows are merged into a single Set, so a term is stored once per VAERS_ID.
vaers_id_to_symptoms_dict = Dict{Int, Set{String}}()
for (vaers_id, symptoms) in zip(test[!, :VAERS_ID], test[!, :SYMPTOMS])
    # get! returns the Set already stored for this ID, or stores and returns a new
    # empty one the first time the ID is seen.
    symptoms_for_id = get!(vaers_id_to_symptoms_dict, vaers_id) do
        Set{String}()
    end
    union!(symptoms_for_id, symptoms)
end
# View dict
vaers_id_to_symptoms_dict

Dict{Int64, Set{String}} with 135054 entries:
  870067  => Set(["Platelet count decreased"])
  1043880 => Set(["Aspiration", "Death", "Insomnia", "Seizure"])
  900301  => Set(["Coronavirus infection", "Cough", "Dyspnoea"])
  905057  => Set(["Influenza"])
  818452  => Set(["Abdominal distension", "Flatulence", "Abdominal pain upper",…
  870391  => Set(["Product storage error"])
  1051588 => Set(["Paraesthesia"])
  1048923 => Set(["Pain", "Pyrexia", "Pain in extremity"])
  802732  => Set(["Pain", "Injection site pain"])
  876141  => Set(["Expired product administered"])
  816542  => Set(["Herpes zoster"])
  864692  => Set(["Malaise", "Pain", "Pyrexia", "Pruritus", "Skin warm", "Eryth…
  888531  => Set(["Tenderness", "Pain in extremity", "Peripheral swelling"])
  925270  => Set(["Muscle fatigue"])
  926039  => Set(["Nausea", "Skin warm", "Arthralgia", "Erythema", "Neck pain",…
  844734  => Set(["Cough", "Diarrhoea", "Headache", "Lethargy"])
  898563  => Set(["Mobility decreased", "Product

In [35]:
# Count how often each symptom term occurs across all rows of `test`.
# First flatten the per-row symptom vectors into one long vector (repeats kept),
# then tally the occurrences of each distinct term.
symptoms_all_dupes = reduce(vcat, test[!, :SYMPTOMS])
symptoms_freq_dict = countmap(symptoms_all_dupes)

Dict{String, Int64} with 6470 entries:
  "Idiopathic urticaria"                              => 1
  "Skin test positive"                                => 11
  "Intercepted medication error"                      => 2
  "Inappropriate release of product for distribution" => 2
  "Sinus rhythm"                                      => 8
  "Tendinous contracture"                             => 1
  "Hyperhidrosis"                                     => 2882
  "Subdural haematoma"                                => 21
  "Tuberculosis"                                      => 2
  "Blood creatine increased"                          => 9
  "Magnetic resonance elastography"                   => 2
  "Antineutrophil cytoplasmic antibody"               => 9
  "Prothrombin level normal"                          => 8
  "Biliary tract infection fungal"                    => 1
  "Cough"                                             => 2508
  "Coronavirus infection"                             => 12
  "Aller

In [38]:
# Rank the distinct symptom terms from most to least frequently reported.
list_symptoms = unique(reduce(vcat, test[!, :SYMPTOMS]))
# Look up each term's count; Ref keeps the dict from being broadcast over.
freq_per_symptom = getindex.(Ref(symptoms_freq_dict), list_symptoms)
# Permutation that orders the counts from largest to smallest.
perm = sortperm(freq_per_symptom, rev = true)
list_symptoms_mostlikely = list_symptoms[perm]

6470-element Vector{String}:
 "Pyrexia"
 "Headache"
 "Pain"
 "Chills"
 "Fatigue"
 "Injection site pain"
 "Pain in extremity"
 "Nausea"
 "Injection site erythema"
 "No adverse event"
 "Dizziness"
 "Injection site swelling"
 "Myalgia"
 ⋮
 "Splenic vein thrombosis"
 "Therapy cessation"
 "Nephrostomy"
 "Gastrointestinal bacterial infection"
 "Nail injury"
 "Vertebral artery stenosis"
 "Bladder mass"
 "Gallbladder mass"
 "Mini-tracheostomy"
 "Scan myocardial perfusion abnormal"
 "Eosinophils urine"
 "Flight of ideas"

In [39]:
# Write the VAERS_ID => symptoms dict to 19-21VAERSSYMPTOMS_dev.csv in the dev folder.
# NOTE(review): a Dict is passed to CSV.write directly, so the column names, the row
# order and the text written for each Set{String} value are whatever CSV.jl/Tables.jl
# produce for a Dict — confirm the file can be read back in the form downstream
# code expects.
CSV.write(joinpath(path_dev,"19-21VAERSSYMPTOMS_dev.csv"), vaers_id_to_symptoms_dict);