Setup

In [None]:
# install packages
# One-time environment setup: adds the packages to the active Pkg environment.
# NOTE(review): only CSV and DataFrames are loaded with `using` in the visible cells;
# Pandas is installed here but not loaded in this part of the notebook — confirm it is still needed.
import Pkg;
Pkg.add("CSV");
Pkg.add("DataFrames");
Pkg.add("Pandas")

In [1]:
# load packages
using CSV;
using DataFrames;

In [2]:
# setup filepaths
# Both folders are siblings of the directory containing this file/notebook.
# joinpath inserts the separator of the current OS, so the paths work on
# Linux/macOS as well as Windows (on Windows the result is unchanged: "<dir>\..\source").
path_source = joinpath(@__DIR__, "..", "source");
path_dev = joinpath(@__DIR__, "..", "dev");

Process VAERS symptoms file

In [3]:
# Read the symptoms CSV located in the dev folder (path_dev) into a DataFrame.
# NOTE(review): the file name suggests the 2019-2021 VAERSSYMPTOMS files appended together — confirm.
df_symp_appended = CSV.read(joinpath(path_dev,"19-21VAERSSYMPTOMS.csv"), DataFrame);

In [None]:
# Each symptom has a MedDRA term and a MedDRA dictionary version number
# Concatenate the term and version number into a single column
#=
insertcols!(test, 2, :SYMP1 =>[string(test[!,:SYMPTOM1][i])*"_"*string(test[!,:SYMPTOMVERSION1][i]) for i=1:nrow(test)])
insertcols!(test, 3, :SYMP2 =>[string(test[!,:SYMPTOM2][i])*"_"*string(test[!,:SYMPTOMVERSION2][i]) for i=1:nrow(test)])
insertcols!(test, 4, :SYMP3 =>[string(test[!,:SYMPTOM3][i])*"_"*string(test[!,:SYMPTOMVERSION3][i]) for i=1:nrow(test)])
insertcols!(test, 5, :SYMP4 =>[string(test[!,:SYMPTOM4][i])*"_"*string(test[!,:SYMPTOMVERSION4][i]) for i=1:nrow(test)])
insertcols!(test, 6, :SYMP5 =>[string(test[!,:SYMPTOM5][i])*"_"*string(test[!,:SYMPTOMVERSION5][i]) for i=1:nrow(test)])
=#

In [7]:
# Each row in VAERSSYMPTOMS.csv is limited to 5 MedDRA terms, so there can be multiple rows per VAERS ID.
# Collapse the SYMPTOM1..SYMPTOM5 columns into a single :SYMPTOMS column holding a vector of the
# terms present in that row. The result keeps one output row per input row (rows sharing a
# VAERS_ID are NOT merged) and only the columns :VAERS_ID and :SYMPTOMS.

# Address the term columns by name so the result does not depend on column positions in the CSV.
symptom_cols = [Symbol("SYMPTOM", i) for i in 1:5];

# Return the non-missing terms of one row (received as a NamedTuple), in SYMPTOM1..SYMPTOM5 order.
# A row without any term yields an empty vector instead of being dropped or producing `missing`.
collect_symptoms(row) = collect(skipmissing(values(row)));

# Single pass over the data: every input row is handled, whatever its pattern of missing values.
# `select` builds a new data frame, so df_symp_appended itself is left untouched.
test = select(df_symp_appended,
              :VAERS_ID,
              AsTable(symptom_cols) => ByRow(collect_symptoms) => :SYMPTOMS);

# Order by report id; with a stable sort (the DataFrames default) rows sharing a VAERS_ID
# keep the order they had in the input file.
sort!(test, :VAERS_ID)

Unnamed: 0_level_0,VAERS_ID,SYMPTOMS
Unnamed: 0_level_1,Int64,Array…
1,794156,"[""Injected limb mobility decreased"", ""Injection site joint pain""]"
2,794157,"[""Apathy"", ""Arthralgia"", ""Asthenia"", ""Injection site erythema"", ""Injection site pain""]"
3,794157,"[""Injection site pruritus"", ""Injection site swelling"", ""Injection site warmth"", ""Listless"", ""Night sweats""]"
4,794158,"[""Chills"", ""Headache"", ""Nausea"", ""Pain"", ""Pyrexia""]"
5,794159,"[""Injection site erythema"", ""Injection site swelling"", ""Injection site warmth"", ""Pain""]"
6,794160,"[""Injection site swelling"", ""Lip blister"", ""Lip swelling"", ""Pain""]"
7,794160,"[""Asthenia"", ""Chills"", ""Fatigue"", ""Influenza like illness"", ""Injection site erythema""]"
8,794161,"[""Pyrexia""]"
9,794162,"[""Injection site erythema"", ""Injection site rash"", ""Injection site swelling"", ""Macule"", ""Rash papular""]"
10,794163,"[""Myalgia"", ""Nausea"", ""Pyrexia""]"


In [None]:
#combine(groupby(test, :VAERS_ID), :SYMPTOMS => join)

In [5]:
# Distinct symptom terms over every row of `test`, in order of first appearance.
# The per-row vectors are walked lazily instead of being concatenated first.
list_symptoms = unique(Iterators.flatten(test.SYMPTOMS))

6470-element Vector{String}:
 "Injected limb mobility decreased"
 "Injection site joint pain"
 "Apathy"
 "Arthralgia"
 "Asthenia"
 "Injection site erythema"
 "Injection site pain"
 "Injection site pruritus"
 "Injection site swelling"
 "Injection site warmth"
 "Listless"
 "Night sweats"
 "Chills"
 ⋮
 "Therapy cessation"
 "Cheyne-Stokes respiration"
 "Nephrostomy"
 "Gastrointestinal bacterial infection"
 "Nail injury"
 "Vertebral artery stenosis"
 "Bladder mass"
 "Gallbladder mass"
 "Mini-tracheostomy"
 "Scan myocardial perfusion abnormal"
 "Eosinophils urine"
 "Flight of ideas"

In [6]:
# One-hot encode the :SYMPTOMS vectors: for every term in list_symptoms add a Bool column
# named SYMPTOMS_<term> that is true when the row's vector contains that term.
# Base.Fix1(in, term) is the function v -> term in v.
test_onehot = transform(
    test,
    [:SYMPTOMS => ByRow(Base.Fix1(in, term)) => Symbol(:SYMPTOMS_, term)
     for term in list_symptoms],
)

Unnamed: 0_level_0,VAERS_ID,SYMPTOMS
Unnamed: 0_level_1,Int64,Array…
1,794156,"[""Injected limb mobility decreased"", ""Injection site joint pain""]"
2,794157,"[""Apathy"", ""Arthralgia"", ""Asthenia"", ""Injection site erythema"", ""Injection site pain""]"
3,794157,"[""Injection site pruritus"", ""Injection site swelling"", ""Injection site warmth"", ""Listless"", ""Night sweats""]"
4,794158,"[""Chills"", ""Headache"", ""Nausea"", ""Pain"", ""Pyrexia""]"
5,794159,"[""Injection site erythema"", ""Injection site swelling"", ""Injection site warmth"", ""Pain""]"
6,794160,"[""Injection site swelling"", ""Lip blister"", ""Lip swelling"", ""Pain""]"
7,794160,"[""Asthenia"", ""Chills"", ""Fatigue"", ""Influenza like illness"", ""Injection site erythema""]"
8,794161,"[""Pyrexia""]"
9,794162,"[""Injection site erythema"", ""Injection site rash"", ""Injection site swelling"", ""Macule"", ""Rash papular""]"
10,794163,"[""Myalgia"", ""Nausea"", ""Pyrexia""]"


In [8]:
# write to dev
# Persist both tables as CSV files in the dev folder (path_dev):
#   - `test`: one row per input row with :VAERS_ID and the collapsed :SYMPTOMS vector
#   - `test_onehot`: the same rows plus one SYMPTOMS_<term> column per entry of list_symptoms
# NOTE(review): :SYMPTOMS holds vectors; check how CSV.write serializes that column and
# whether downstream readers can parse it back.
CSV.write(joinpath(path_dev,"19-21VAERSSYMPTOMS_dev.csv"), test);
CSV.write(joinpath(path_dev,"19-21VAERSSYMPTOMS_dev_onehot.csv"), test_onehot);