In [57]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

import categorical_embedder as ce

# All data

In [73]:
df = pd.read_csv('../Machine_Learning/data/df_all.csv')

In [74]:
df_all = df.copy()

In [75]:
def scale_multi_dim_var(cols):
    max_sum = df_all[cols].sum(axis=1).max()
    df_other_copy[cols] = df_all[cols] / max_sum

In [76]:
age_cols = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90-99', '100-109', '110-119']
scale_multi_dim_var(age_cols)

numdays_cols = ['NUMDAYS_0-1', 'NUMDAYS_2-3', 'NUMDAYS_4-5', 'NUMDAYS_6-7', 'NUMDAYS_8-9', 'NUMDAYS_10-11',
               'NUMDAYS_12-13', 'NUMDAYS_14+']
scale_multi_dim_var(numdays_cols)

reaction_cols = ['DIED', 'L_THREAT', 'HOSPITAL', 'X_STAY', 'DISABLE', 'BIRTH_DEFECT']
scale_multi_dim_var(reaction_cols)

de_cols = ['DE', 'De', 'dE', 'de']
scale_multi_dim_var(de_cols)

serious_cols = ['SERIOUS Y', 'SERIOUS N']
scale_multi_dim_var(serious_cols)

sex_cols = ['SEX_M', 'SEX_F', 'SEX_U']
scale_multi_dim_var(sex_cols)


In [77]:
y = df_all['y_true']
X = df_all.drop('y_true', axis=1)

print(X.shape)
print(y.shape)

(1125, 40)
(1125,)


In [78]:
X

Unnamed: 0,symptom,0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80-89,...,DE,De,dE,de,SERIOUS Y,SERIOUS N,manufacturer_JANSSEN,manufacturer_MODERNA,manufacturer_NOVAVAX,manufacturer_PFIZER\BIONTECH
0,Angioedema,23,95,161,304,308,287,191,105,45,...,2878.0,943022.0,2351.0,1530710.0,564.0,2314.0,0,0,0,1
1,Hypoaesthesia,39,732,2295,4248,4426,3566,1960,862,241,...,26529.0,919371.0,27845.0,1505216.0,7405.0,19126.0,0,0,0,1
2,Erythema multiforme,20,12,31,28,41,30,34,30,20,...,733.0,945167.0,2228.0,1530833.0,131.0,602.0,0,0,0,1
3,Insomnia,33,205,732,1345,1345,1253,1012,581,180,...,11215.0,934685.0,14609.0,1518452.0,2674.0,8541.0,0,0,0,1
4,Myalgia,106,935,4266,7133,6992,6599,4653,2335,583,...,52182.0,893718.0,75910.0,1457151.0,12832.0,39351.0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,Increased insulin requirement,0,0,1,1,0,0,1,0,0,...,3.0,98389.0,33.0,2380536.0,2.0,1.0,1,0,0,0
1121,Blood lactate dehydrogenase decreased,0,0,0,0,1,0,0,0,0,...,7.0,945893.0,17.0,1533044.0,4.0,3.0,0,0,0,1
1122,Breast cellulitis,0,0,0,0,2,0,0,2,0,...,4.0,537275.0,16.0,1941666.0,1.0,3.0,0,1,0,0
1123,Vascular graft thrombosis,0,0,0,0,0,2,0,1,0,...,3.0,537276.0,10.0,1941672.0,3.0,0.0,0,1,0,0


In [79]:
y.values

array([1, 1, 1, ..., 0, 0, 0])

In [98]:
embedding_info = ce.get_embedding_info(X, categorical_variables=['symptom'])
X_encoded, encoders = ce.get_label_encoded_data(X, categorical_variables=['symptom'])

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y)

In [99]:
X_test

Unnamed: 0,symptom,0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80-89,...,DE,De,dE,de,SERIOUS Y,SERIOUS N,manufacturer_JANSSEN,manufacturer_MODERNA,manufacturer_NOVAVAX,manufacturer_PFIZER\BIONTECH
4,609,106,935,4266,7133,6992,6599,4653,2335,583,...,52182.0,893718.0,75910.0,1457151.0,12832.0,39351.0,0,0,0,1
350,596,0,0,1,0,0,1,0,0,0,...,4.0,945896.0,0.0,1533061.0,1.0,3.0,0,0,0,1
997,548,0,0,0,2,0,0,1,0,0,...,3.0,537276.0,21.0,1941661.0,1.0,2.0,0,1,0,0
224,422,1,0,0,0,0,0,0,0,0,...,4.0,945896.0,0.0,1533061.0,1.0,3.0,0,0,0,1
564,478,0,0,1,1,1,0,0,2,0,...,6.0,98386.0,4279.0,2376290.0,0.0,6.0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,986,0,0,2,0,1,2,2,0,1,...,10.0,537269.0,61.0,1941621.0,8.0,2.0,0,1,0,0
709,359,0,0,0,0,2,0,2,0,0,...,4.0,537275.0,1.0,1941681.0,4.0,0.0,0,1,0,0
169,207,0,0,0,0,0,2,0,0,0,...,4.0,945896.0,1.0,1533060.0,4.0,0.0,0,0,0,1
835,521,0,0,0,0,0,3,2,2,0,...,7.0,98385.0,69.0,2380500.0,7.0,0.0,1,0,0,0


In [100]:
df_all

Unnamed: 0,y_true,symptom,0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,...,DE,De,dE,de,SERIOUS Y,SERIOUS N,manufacturer_JANSSEN,manufacturer_MODERNA,manufacturer_NOVAVAX,manufacturer_PFIZER\BIONTECH
0,1,Angioedema,23,95,161,304,308,287,191,105,...,2878.0,943022.0,2351.0,1530710.0,564.0,2314.0,0,0,0,1
1,1,Hypoaesthesia,39,732,2295,4248,4426,3566,1960,862,...,26529.0,919371.0,27845.0,1505216.0,7405.0,19126.0,0,0,0,1
2,1,Erythema multiforme,20,12,31,28,41,30,34,30,...,733.0,945167.0,2228.0,1530833.0,131.0,602.0,0,0,0,1
3,1,Insomnia,33,205,732,1345,1345,1253,1012,581,...,11215.0,934685.0,14609.0,1518452.0,2674.0,8541.0,0,0,0,1
4,1,Myalgia,106,935,4266,7133,6992,6599,4653,2335,...,52182.0,893718.0,75910.0,1457151.0,12832.0,39351.0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,0,Increased insulin requirement,0,0,1,1,0,0,1,0,...,3.0,98389.0,33.0,2380536.0,2.0,1.0,1,0,0,0
1121,0,Blood lactate dehydrogenase decreased,0,0,0,0,1,0,0,0,...,7.0,945893.0,17.0,1533044.0,4.0,3.0,0,0,0,1
1122,0,Breast cellulitis,0,0,0,0,2,0,0,2,...,4.0,537275.0,16.0,1941666.0,1.0,3.0,0,1,0,0
1123,0,Vascular graft thrombosis,0,0,0,0,0,2,0,1,...,3.0,537276.0,10.0,1941672.0,3.0,0.0,0,1,0,0


In [None]:
# Get embeddings
embeddings = ce.get_embeddings(X_train, y_train, categorical_embedding_info=embedding_info,
                               is_classification=True, epochs=100, batch_size=256)

In [103]:
df_embedded = ce.fit_transform(df_all, embeddings=embeddings, encoders=encoders, drop_categorical_vars=True)
df_embedded.head()

Unnamed: 0,y_true,0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80-89,...,symptom_embedding_40,symptom_embedding_41,symptom_embedding_42,symptom_embedding_43,symptom_embedding_44,symptom_embedding_45,symptom_embedding_46,symptom_embedding_47,symptom_embedding_48,symptom_embedding_49
0,1,23,95,161,304,308,287,191,105,45,...,0.02735,0.028329,-0.003347,-0.034938,0.042003,0.045872,-0.006744,-0.031622,0.001456,-0.034739
1,1,39,732,2295,4248,4426,3566,1960,862,241,...,0.153152,-0.116624,-0.107445,0.144514,0.144928,0.172942,-0.094439,0.147882,0.108925,0.109101
2,1,20,12,31,28,41,30,34,30,20,...,0.25197,-0.262678,-0.312541,0.316623,0.294394,0.234246,-0.263375,0.316704,0.293659,0.318284
3,1,33,205,732,1345,1345,1253,1012,581,180,...,0.156545,-0.111521,-0.147374,0.133255,0.172962,0.144399,-0.103453,0.107303,0.0815,0.136353
4,1,106,935,4266,7133,6992,6599,4653,2335,583,...,0.037217,-0.095543,-0.088111,0.111776,0.089034,0.050345,-0.07744,0.018539,0.011462,0.0803


In [104]:
df_embedded.to_pickle('data/df_all_embedded')  

In [109]:
X_train, X_test, y_train, y_test = train_test_split(df_embedded, y)

X_train = X_train.drop('y_true', axis=1)
X_test = X_test.drop('y_true', axis=1)

X_train.to_csv('../Machine_Learning/data/X_train_all.csv', index=False)
X_test.to_csv('../Machine_Learning/data/X_test_all.csv', index=False)
y_train.to_csv('../Machine_Learning/data/y_train_all.csv', index=False)
y_test.to_csv('../Machine_Learning/data/y_test_all.csv', index=False)

Unnamed: 0,0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80-89,90-99,...,symptom_embedding_40,symptom_embedding_41,symptom_embedding_42,symptom_embedding_43,symptom_embedding_44,symptom_embedding_45,symptom_embedding_46,symptom_embedding_47,symptom_embedding_48,symptom_embedding_49
962,1,1,6,4,10,8,14,3,1,0,...,-0.047681,0.033711,0.028897,-0.034901,-0.016367,-0.026803,0.055199,0.018942,0.006267,-0.064565
1074,6,21,9,17,25,33,14,18,14,10,...,-0.061667,0.009911,0.021424,-0.048089,-0.006167,-0.053823,0.054360,-0.068293,-0.060405,0.012780
242,0,0,0,0,0,0,2,1,0,0,...,0.010892,-0.000011,-0.015189,-0.025711,0.032403,-0.023947,0.023531,0.006700,0.025493,-0.019118
635,0,0,9,17,15,12,5,2,0,0,...,-0.045004,0.022856,-0.009831,-0.049107,0.024326,0.015817,-0.015032,0.037623,-0.025589,-0.031306
449,0,0,5,7,8,13,15,8,0,1,...,0.015930,0.002875,-0.019599,0.021653,0.010241,-0.040859,-0.035966,-0.014652,0.019251,-0.008537
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
721,0,0,0,0,0,0,0,2,1,0,...,-0.006594,-0.017532,0.023981,-0.038084,0.028413,0.006607,0.033995,0.033905,0.039429,0.001958
1050,0,0,0,0,0,0,0,1,0,0,...,0.032927,-0.021964,-0.020495,0.001351,-0.023536,0.047498,0.004269,0.036613,-0.012369,0.034669
638,4,20,22,99,95,86,86,46,8,4,...,-0.041769,0.059161,0.000283,-0.043642,-0.008216,-0.004000,0.003054,-0.004404,0.002884,-0.068487
676,0,0,0,0,0,0,0,0,0,0,...,0.007009,-0.023230,0.005398,-0.005451,0.011303,-0.021753,0.022699,0.007492,0.038061,0.021275


# Generate splits with Moderna = test

In [11]:
df_embedded['y_true'] = y

In [12]:
df_embedded

Unnamed: 0,0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80-89,90-99,...,symptom_embedding_41,symptom_embedding_42,symptom_embedding_43,symptom_embedding_44,symptom_embedding_45,symptom_embedding_46,symptom_embedding_47,symptom_embedding_48,symptom_embedding_49,y_true
0,0.020193,0.016728,0.014843,0.017429,0.017247,0.016329,0.012564,0.012005,0.013682,0.006903,...,-0.038803,0.031955,-0.013366,0.006447,0.082953,0.059780,-0.082158,-0.033914,0.011456,1
1,0.034241,0.128896,0.211579,0.243550,0.247844,0.202890,0.128930,0.098559,0.073275,0.034517,...,-0.002073,0.025551,-0.050798,0.006093,0.075400,0.073157,-0.099012,-0.017383,0.000349,1
2,0.017559,0.002113,0.002858,0.001605,0.002296,0.001707,0.002237,0.003430,0.006081,0.003945,...,-0.023250,-0.000355,-0.020413,0.035789,-0.032599,-0.004464,0.015447,0.040009,0.041251,1
3,0.028973,0.036098,0.067484,0.077113,0.075316,0.071290,0.066570,0.066430,0.054728,0.023669,...,0.004094,0.027650,-0.049678,-0.039587,0.058385,0.030067,-0.059521,-0.070503,-0.013063,1
4,0.093064,0.164642,0.393288,0.408955,0.391533,0.375455,0.306078,0.266979,0.177258,0.074951,...,-0.031789,0.028889,-0.047447,-0.007253,0.101274,0.050122,-0.026549,-0.051247,-0.007401,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,0.000000,0.000000,0.000092,0.000057,0.000000,0.000000,0.000066,0.000000,0.000000,0.000000,...,0.032908,-0.051052,-0.013448,-0.031592,0.058226,-0.018147,0.039280,0.011030,-0.027882,0
1121,0.000000,0.000000,0.000000,0.000000,0.000056,0.000000,0.000000,0.000000,0.000000,0.000000,...,-0.040171,-0.017807,-0.028698,-0.025172,-0.025134,-0.015200,-0.036714,-0.047624,0.026188,0
1122,0.000000,0.000000,0.000000,0.000000,0.000112,0.000000,0.000000,0.000229,0.000000,0.000000,...,0.032199,-0.003319,0.031135,0.060899,-0.070487,-0.024701,-0.056912,0.026682,-0.029919,0
1123,0.000000,0.000000,0.000000,0.000000,0.000000,0.000114,0.000000,0.000114,0.000000,0.000000,...,0.038854,0.024804,0.028361,0.009756,-0.046723,-0.036647,-0.060717,0.015059,0.039474,0


In [13]:
X_test = df_embedded[(df_embedded['manufacturer_MODERNA'] == 1)]
y_test = X_test['y_true']
X_test = X_test.drop(columns=['y_true'])

X_train = df_embedded[(df_embedded['manufacturer_MODERNA'] == 0)]
y_train = X_train['y_true']
X_train = X_train.drop(columns=['y_true'])

In [14]:
X_train.to_csv('../Machine_Learning/X_train_Moderna.csv', index=False)
X_test.to_csv('../Machine_Learning/X_test_Moderna.csv', index=False)
y_train.to_csv('../Machine_Learning/y_train_Moderna.csv', index=False)
y_test.to_csv('../Machine_Learning/y_test_Moderna.csv', index=False)

# With Janssen as Test

In [15]:
X_test = df_embedded[(df_embedded['manufacturer_JANSSEN'] == 1)]
y_test = X_test['y_true']
X_test = X_test.drop(columns=['y_true'])

X_train = df_embedded[(df_embedded['manufacturer_JANSSEN'] == 0)]
y_train = X_train['y_true']
X_train = X_train.drop(columns=['y_true'])

In [16]:
X_train.to_csv('../Machine_Learning/X_train_Janssen.csv', index=False)
X_test.to_csv('../Machine_Learning/X_test_Janssen.csv', index=False)
y_train.to_csv('../Machine_Learning/y_train_Janssen.csv', index=False)
y_test.to_csv('../Machine_Learning/y_test_Janssen.csv', index=False)

# Create Embeddings for Dataset where a symptom only appears in test or train

### fist we want to know if there are any unique adverse reaction across the vainnes to split the dataset accoringly

In [1]:
Pfizer_adverse_reactions = [
    "Vomiting",
    "Lymphadenopathy",
    "Rash", 
    "Pruritus", 
    "Urticaria", 
    "Angioedema",
    "Anaphylaxis",
    "Decreased appetite",
    "Insomnia",
    "Headache",
    "Dizziness",
    "Lethargy",
    "Facial paralysis", 
    "Paraesthesia",
    "Hypoaesthesia",
    "Myocarditis",
    "Pericarditis",
    "Diarrhoea",
    "Nausea",
    "Hyperhidrosis",
    "Night sweats",
    "Erythema multiforme",
    "Arthralgia",
    "Myalgia",
    "Pain in extremity",
    "Heavy menstrual bleeding",
    "Injection site pain",
    "Fatigue",
    "Chills",
    "Pyrexia",
    "Injection site swelling",
    "Injection site redness",
    "Asthenia",
    "Malaise",
    "Injection site pruritus",
    "Extensive swelling of vaccinated limb",
    "Facial swelling"
]
Moderna_adverse_reactions = [
    "Lymphadenopathy",
    "Anaphylaxis",
    "Hypersensitivity",
    "Decreased appetite",
    "Irritability", 
    "Crying",
    "Headache",
    "Sleepiness",
    "Dizziness",
    "Facial paralysis", 
    "Hypoaesthesia",
    "Paraesthesia",
    "Myocarditis",
    "Pericarditis",
    "Nausea", 
    "Vomiting",
    "Diarrhoea",
    "Abdominal pain",
    "Rash",
    "Urticaria",
    "Erythema multiforme",
    "Mechanical urticaria",
    "Chronic urticaria",
    "Myalgia",
    "Arthralgia",
    "Heavy menstrual bleeding",
    "Injection site pain",
    "Fatigue",
    "Chills",
    "Pyrexia",
    "Injection site swelling",
    "Injection site erythema",
    "Injection site urticaria",
    "Injection site rash",
    "Delayed injection site reaction",
    "Injection site pruritus",
    "Facial swelling",
    "Extensive swelling of vaccinated limb"
]

Janssen_adverse_reactions = [
    "Lymphadenopathy",
    "Immune thrombocytopenia",
    "Anaphylaxis",
    "Headache",
    "Dizziness",
    "Tremor",
    "Urticaria",
    "Hypersensitivity",
    "Paraesthesia",
    "Hypoaesthesia",
    "Facial paralysis",
    "Tinnitus",
    "Guillain-Barre syndrome",
    "Myelitis", 
    "Venous thromboembolism",
    "Thrombosis in combination with thrombocytopenia",
    "Myocarditis", 
    "Pericarditis",
    "Capillary leak syndrome",
    "Cutaneous vasculitis", 
    "Nausea",
    "Cough", 
    "Oropharyngeal pain",
    "Sneezing",
    "Diarrhoea",
    "Vomiting",
    "Rash",
    "Hyperhidrosis",
    "Myalgia",
    "Arthralgia",
    "Muscular weakness",
    "Back pain",
    "Pain in extremity",
    "Fatigue",
    "Injection site pain",
    "Injection site swelling",
    "Chills",
    "Pyrexia"
]

Novavax_adverse_reactions = [
    "Headache",
    "Nausea",
    "Vomiting",
    "Myalgia",
    "Arthralgia",
    "Injection site tenderness",
    "Injection site pain",
    "Fatigue",
    "Malaise",
    "Injection site redness",
    "Injection site swelling",
    "Pyrexia",
    "Pain in extremity",
    "Injection site pruritus",
    "Chills",
    "Injection site warmth",
    "Lymphadenopathy",
    "Anaphylaxis",
    "Paraesthesia",
    "Hypoaesthesia",
    "Myocarditis",
    "Pericarditis",
    "Hypertension",
    "Rash",
    "Erythema",
    "Pruritus",
    "Urticaria"
]

In [2]:
from collections import Counter, defaultdict

named_lists = [
    ("Pfizer", Pfizer_adverse_reactions),
    ("Moderna", Moderna_adverse_reactions),
    ("Janssen", Janssen_adverse_reactions),
    ("Novavax", Novavax_adverse_reactions)
]

# Combine all lists into one
combined_list = []
for name, lst in named_lists:
    combined_list.extend(lst)

# Count the frequency of each word
word_counts = Counter(combined_list)

# Track the lists each word appears in
word_occurrences = defaultdict(set)

for name, lst in named_lists:
    for word in lst:
        word_occurrences[word].add(name)

# Print the word counts and the lists they appear in
for word, count in word_counts.items():
    lists_containing_word = ", ".join(word_occurrences[word])
    print(f"{word}: {count} times, appears in {lists_containing_word}")

Vomiting: 4 times, appears in Moderna, Pfizer, Novavax, Janssen
Lymphadenopathy: 4 times, appears in Moderna, Pfizer, Novavax, Janssen
Rash: 4 times, appears in Moderna, Pfizer, Novavax, Janssen
Pruritus: 2 times, appears in Pfizer, Novavax
Urticaria: 4 times, appears in Moderna, Pfizer, Novavax, Janssen
Angioedema: 1 times, appears in Pfizer
Anaphylaxis: 4 times, appears in Moderna, Pfizer, Novavax, Janssen
Decreased appetite: 2 times, appears in Moderna, Pfizer
Insomnia: 1 times, appears in Pfizer
Headache: 4 times, appears in Moderna, Pfizer, Novavax, Janssen
Dizziness: 3 times, appears in Moderna, Pfizer, Janssen
Lethargy: 1 times, appears in Pfizer
Facial paralysis: 3 times, appears in Moderna, Pfizer, Janssen
Paraesthesia: 4 times, appears in Moderna, Pfizer, Novavax, Janssen
Hypoaesthesia: 4 times, appears in Moderna, Pfizer, Novavax, Janssen
Myocarditis: 4 times, appears in Moderna, Pfizer, Novavax, Janssen
Pericarditis: 4 times, appears in Moderna, Pfizer, Novavax, Janssen
Dia

In [4]:
len(combined_list)

140

In [5]:
# Combine all lists into one
combined_list = []
for name, lst in named_lists:
    combined_list.extend(lst)

# Count the frequency of each word
word_counts = Counter(combined_list)

# Track the lists each word appears in
word_occurrences = defaultdict(set)

for name, lst in named_lists:
    for word in lst:
        word_occurrences[word].add(name)

# Print the word counts and the lists they appear in
print("Word counts and lists they appear in:")
for word, count in word_counts.items():
    lists_containing_word = ", ".join(word_occurrences[word])
    print(f"{word}: {count} times, appears in {lists_containing_word}")

# Show the total number of symptoms that appear one, two, three, and four times
appear_once = sum(1 for word in word_occurrences if len(word_occurrences[word]) == 1)
appear_twice = sum(1 for word in word_occurrences if len(word_occurrences[word]) == 2)
appear_thrice = sum(1 for word in word_occurrences if len(word_occurrences[word]) == 3)
appear_four_times = sum(1 for word in word_occurrences if len(word_occurrences[word]) == 4)

print("\nTotal number of symptoms appearing in lists:")
print(f"Appear in exactly one list: {appear_once}")
print(f"Appear in exactly two lists: {appear_twice}")
print(f"Appear in exactly three lists: {appear_thrice}")
print(f"Appear in exactly four lists: {appear_four_times}")

Word counts and lists they appear in:
Vomiting: 4 times, appears in Moderna, Pfizer, Novavax, Janssen
Lymphadenopathy: 4 times, appears in Moderna, Pfizer, Novavax, Janssen
Rash: 4 times, appears in Moderna, Pfizer, Novavax, Janssen
Pruritus: 2 times, appears in Pfizer, Novavax
Urticaria: 4 times, appears in Moderna, Pfizer, Novavax, Janssen
Angioedema: 1 times, appears in Pfizer
Anaphylaxis: 4 times, appears in Moderna, Pfizer, Novavax, Janssen
Decreased appetite: 2 times, appears in Moderna, Pfizer
Insomnia: 1 times, appears in Pfizer
Headache: 4 times, appears in Moderna, Pfizer, Novavax, Janssen
Dizziness: 3 times, appears in Moderna, Pfizer, Janssen
Lethargy: 1 times, appears in Pfizer
Facial paralysis: 3 times, appears in Moderna, Pfizer, Janssen
Paraesthesia: 4 times, appears in Moderna, Pfizer, Novavax, Janssen
Hypoaesthesia: 4 times, appears in Moderna, Pfizer, Novavax, Janssen
Myocarditis: 4 times, appears in Moderna, Pfizer, Novavax, Janssen
Pericarditis: 4 times, appears in

## 5% True labels

In [18]:
df = pd.read_csv('data/df2_unique_5.csv')

In [19]:
df_test = df[df['manufacturer_JANSSEN'] == 1]
df_other = df[df['manufacturer_JANSSEN'] != 1]

df_other_copy = df_other.copy()
df_test_copy = df_test.copy()

In [20]:
def scale_multi_dim_var(cols):
    max_sum = df_other_copy[cols].sum(axis=1).max()
    df_other_copy[cols] = df_other_copy[cols] / max_sum
    
    max_sum_test = df_test_copy[cols].sum(axis=1).max()
    df_test_copy[cols] = df_test_copy[cols] / max_sum_test

In [21]:
age_cols = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90-99', '100-109', '110-119']
scale_multi_dim_var(age_cols)

numdays_cols = ['NUMDAYS_0-1', 'NUMDAYS_2-3', 'NUMDAYS_4-5', 'NUMDAYS_6-7', 'NUMDAYS_8-9', 'NUMDAYS_10-11',
               'NUMDAYS_12-13', 'NUMDAYS_14+']
scale_multi_dim_var(numdays_cols)

reaction_cols = ['DIED', 'L_THREAT', 'HOSPITAL', 'X_STAY', 'DISABLE', 'BIRTH_DEFECT']
scale_multi_dim_var(reaction_cols)

de_cols = ['DE', 'De', 'dE', 'de']
scale_multi_dim_var(de_cols)

serious_cols = ['SERIOUS Y', 'SERIOUS N']
scale_multi_dim_var(serious_cols)

sex_cols = ['SEX_M', 'SEX_F', 'SEX_U']
scale_multi_dim_var(sex_cols)

df_other_copy

Unnamed: 0,y_true,symptom,0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,...,DE,De,dE,de,SERIOUS Y,SERIOUS N,manufacturer_JANSSEN,manufacturer_MODERNA,manufacturer_NOVAVAX,manufacturer_PFIZER\BIONTECH
560,1,Angioedema,0.000753,0.003112,0.005274,0.009959,0.010090,0.009402,0.006257,0.003440,...,0.001161,0.380410,9.483812e-04,0.617480,0.014029,0.057559,0,0,0,1
561,1,Erythema multiforme,0.000655,0.000393,0.001016,0.000917,0.001343,0.000983,0.001114,0.000983,...,0.000296,0.381275,8.987636e-04,0.617530,0.003259,0.014974,0,0,0,1
562,1,Insomnia,0.001081,0.006716,0.023980,0.044062,0.044062,0.041048,0.033153,0.019034,...,0.004524,0.377047,5.893195e-03,0.612536,0.066514,0.212452,0,0,0,1
563,1,Vomiting,0.027289,0.086257,0.102310,0.121867,0.110270,0.111941,0.089140,0.055332,...,0.012823,0.368748,2.516982e-02,0.593259,0.226606,0.564076,0,0,0,1
564,1,Malaise,0.005438,0.040753,0.092318,0.133825,0.135495,0.139623,0.117215,0.081016,...,0.016216,0.365355,1.812776e-02,0.600301,0.326327,0.673673,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1475,0,Calculus urinary,0.000000,0.000066,0.000066,0.000033,0.000066,0.000098,0.000197,0.000033,...,0.000009,0.381562,7.261107e-06,0.618422,0.000373,0.000199,0,0,0,1
1476,0,CSF virus no organisms observed,0.000000,0.000000,0.000033,0.000000,0.000066,0.000000,0.000033,0.000000,...,0.000002,0.381570,3.267498e-05,0.618396,0.000099,0.000000,0,0,0,1
1477,0,Emergency care,0.000000,0.000000,0.000098,0.000098,0.000066,0.000066,0.000000,0.000000,...,0.000013,0.381558,1.129505e-05,0.618418,0.000572,0.000249,0,0,0,1
1478,0,Periventricular leukomalacia,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000033,...,0.000003,0.381568,2.016974e-06,0.618427,0.000124,0.000050,0,0,0,1


In [22]:
X = pd.concat([df_other_copy, df_test_copy], ignore_index=True)
y = X['y_true']
X = X.drop('y_true', axis=1)

In [23]:
print(X.shape)
print(y.shape)

(1480, 40)
(1480,)


In [24]:
from sklearn.preprocessing import LabelEncoder

embedding_info = ce.get_embedding_info(X, categorical_variables=['symptom'])
X_encoded, encoders = ce.get_label_encoded_data(X, categorical_variables=['symptom'])

X_train_encoded = X_encoded.iloc[:len(df_other)]
y_train_encoded = pd.Series(LabelEncoder().fit_transform(y.iloc[:len(df_other)]))

# Get embeddings
embeddings = ce.get_embeddings(X_train_encoded, y_train_encoded, categorical_embedding_info=embedding_info,
                               is_classification=True, epochs=100, batch_size=256)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [25]:
df_other_embedded = ce.fit_transform(df_other_copy, embeddings=embeddings, encoders=encoders, drop_categorical_vars=True)
df_test_embedded = ce.fit_transform(df_test_copy, embeddings=embeddings, encoders=encoders, drop_categorical_vars=True)

In [26]:
print(df_other_embedded.shape)
print(df_test_embedded.shape)

(920, 90)
(560, 90)


In [30]:
df_other_embedded.to_pickle('data/df_other_embedded5')  

In [101]:
df_other_embedded

Unnamed: 0,y_true,0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80-89,...,symptom_embedding_40,symptom_embedding_41,symptom_embedding_42,symptom_embedding_43,symptom_embedding_44,symptom_embedding_45,symptom_embedding_46,symptom_embedding_47,symptom_embedding_48,symptom_embedding_49
280,1,0.000753,0.003112,0.005274,0.009959,0.010090,0.009402,0.006257,0.003440,0.001474,...,0.072883,0.051374,-0.011422,0.070715,0.002122,-0.039608,-0.005973,-0.022606,0.006071,-0.026018
281,1,0.000655,0.000393,0.001016,0.000917,0.001343,0.000983,0.001114,0.000983,0.000655,...,0.030801,0.073768,-0.054399,0.070028,-0.056906,-0.068597,-0.009662,0.077479,0.051174,-0.038502
282,1,0.001081,0.006716,0.023980,0.044062,0.044062,0.041048,0.033153,0.019034,0.005897,...,-0.020722,-0.024217,-0.049730,0.069132,0.018435,-0.003915,-0.038062,-0.015338,0.051287,0.008760
283,1,0.027289,0.086257,0.102310,0.121867,0.110270,0.111941,0.089140,0.055332,0.026601,...,0.001843,0.016675,-0.007700,0.047092,0.030687,-0.057993,-0.026730,0.038080,0.022538,-0.051694
284,1,0.005438,0.040753,0.092318,0.133825,0.135495,0.139623,0.117215,0.081016,0.035348,...,0.007967,0.007533,-0.002606,-0.016519,0.051346,-0.057507,0.018874,0.017172,0.037908,-0.055132
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
735,0,0.000000,0.000000,0.000066,0.000000,0.000033,0.000000,0.000066,0.000000,0.000033,...,-0.011311,0.040461,0.017210,-0.001321,0.014348,-0.022530,0.005763,-0.046514,-0.007625,0.022377
736,0,0.000033,0.000033,0.000131,0.000131,0.000328,0.000295,0.000197,0.000131,0.000098,...,0.044157,-0.020538,0.038682,-0.039177,-0.003087,0.046062,-0.007926,-0.026610,0.015146,0.014391
737,0,0.000000,0.000000,0.000000,0.000066,0.000098,0.000262,0.000098,0.000066,0.000000,...,0.010036,-0.040909,0.014446,-0.013675,0.042600,0.007885,0.022552,-0.042215,0.045749,-0.020172
738,0,0.000000,0.000098,0.000295,0.000229,0.000197,0.000262,0.000098,0.000000,0.000000,...,0.047258,0.002857,0.020705,-0.000889,0.025060,-0.007529,-0.000564,0.044165,-0.044915,0.031951


In [27]:
df_other_embedded_copy = df_other_embedded.copy()
df_test_embedded_copy = df_test_embedded.copy()

In [28]:
df_transformed = pd.concat([df_other_embedded_copy, df_test_embedded_copy])

In [29]:
Janssen = df_transformed[df_transformed['manufacturer_JANSSEN'] == 1]
Others  = df_transformed[df_transformed['manufacturer_JANSSEN'] == 0]

In [15]:
target_column = 'y_true'

# Split the OTHERS_df into X_train and y_train
X_train = Others.drop(columns=[target_column])
y_train = Others[target_column]

# Split the Janssen_df into X_test and y_test
X_test = Janssen.drop(columns=[target_column])
y_test = Janssen[target_column]

In [16]:
X_train.to_csv('../Machine_Learning/data/X_train_unique_5.csv', index=False)
X_test.to_csv('../Machine_Learning/data/X_test_unique_5.csv', index=False)
y_train.to_csv('../Machine_Learning/data/y_train_unique_5.csv', index=False)
y_test.to_csv('../Machine_Learning/data/y_test_unique_5.csv', index=False)

In [17]:
X_test

Unnamed: 0,0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80-89,90-99,...,symptom_embedding_40,symptom_embedding_41,symptom_embedding_42,symptom_embedding_43,symptom_embedding_44,symptom_embedding_45,symptom_embedding_46,symptom_embedding_47,symptom_embedding_48,symptom_embedding_49
0,0.000063,0.003519,0.017782,0.023060,0.024128,0.024065,0.015646,0.005027,0.001068,0.000251,...,-0.042935,0.024282,0.027139,0.018163,-0.031918,-0.037963,-0.017418,0.041997,0.009878,-0.026965
1,0.000440,0.023311,0.114169,0.114797,0.094314,0.096324,0.057870,0.016525,0.004273,0.001131,...,-0.043609,-0.018931,0.038629,-0.014921,-0.000083,-0.033385,-0.019592,0.039406,0.035829,0.042126
2,0.000000,0.001445,0.005969,0.012315,0.012441,0.015394,0.010556,0.004901,0.001382,0.000189,...,0.018669,0.032954,0.013276,0.038167,0.015422,0.038285,-0.012219,-0.029095,-0.003571,0.029091
3,0.000000,0.000189,0.001634,0.003456,0.004210,0.005215,0.002639,0.001885,0.000754,0.000126,...,0.043867,0.008406,-0.020070,-0.016257,0.040577,-0.012176,0.026340,0.018533,-0.046016,0.005898
4,0.000000,0.000000,0.000000,0.000251,0.000000,0.000063,0.000251,0.000126,0.000063,0.000000,...,-0.042985,0.041822,-0.005313,-0.042079,-0.019278,-0.048225,-0.010921,0.003654,-0.047210,0.045176
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555,0.000000,0.000000,0.000000,0.000000,0.000377,0.000189,0.000000,0.000000,0.000000,0.000000,...,-0.022529,-0.024073,0.001430,-0.049488,0.047921,0.035508,0.034056,-0.049806,0.019126,0.028682
556,0.000000,0.000189,0.000251,0.000314,0.000251,0.000440,0.000126,0.000000,0.000000,0.000000,...,0.020714,0.063511,-0.068713,-0.047055,-0.060927,0.065810,0.009347,-0.028130,0.011958,0.053153
557,0.000000,0.000000,0.000063,0.000126,0.000000,0.000063,0.000000,0.000000,0.000000,0.000000,...,0.020208,-0.020178,0.014814,0.007526,-0.015843,0.024825,0.030616,-0.049860,0.021291,-0.017540
558,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,-0.044443,0.039335,-0.014685,0.048108,0.001086,-0.035684,0.032410,-0.009408,0.036268,-0.036339


## 10% True labels

In [31]:
df = pd.read_csv('data/df2_unique_10.csv')

In [32]:
df_test = df[df['manufacturer_JANSSEN'] == 1]

In [33]:
df_test

Unnamed: 0,y_true,symptom,0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,...,DE,De,dE,de,SERIOUS Y,SERIOUS N,manufacturer_JANSSEN,manufacturer_MODERNA,manufacturer_NOVAVAX,manufacturer_PFIZER\BIONTECH
0,1,Back pain,1,56,283,367,384,383,249,80,...,2125.0,96267.0,32151.0,2348418.0,375.0,1750.0,1,0,0,0
1,1,Nausea,7,371,1817,1827,1501,1533,921,263,...,9791.0,88601.0,156847.0,2223722.0,1272.0,8519.0,1,0,0,0
2,1,Muscular weakness,0,23,95,196,198,245,168,78,...,1198.0,97194.0,24180.0,2356389.0,345.0,853.0,1,0,0,0
3,1,Facial paralysis,0,3,26,55,67,83,42,30,...,445.0,97947.0,10246.0,2370323.0,223.0,222.0,1,0,0,0
4,1,Cutaneous vasculitis,0,0,0,4,0,1,4,2,...,32.0,98360.0,461.0,2380108.0,13.0,19.0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275,0,Coagulation factor V level,0,0,0,0,1,0,1,0,...,4.0,98388.0,89.0,2380480.0,3.0,1.0,1,0,0,0
276,0,Abdominal rigidity,0,0,2,2,2,3,1,1,...,17.0,98375.0,229.0,2380340.0,7.0,10.0,1,0,0,0
277,0,Thrombocytosis,0,0,0,1,1,1,1,1,...,8.0,98384.0,241.0,2380328.0,7.0,1.0,1,0,0,0
278,0,Thyroxine free,0,0,2,5,12,4,2,1,...,33.0,98359.0,808.0,2379761.0,12.0,21.0,1,0,0,0


In [34]:
df_other = df[df['manufacturer_JANSSEN'] != 1]

In [35]:
df_other

Unnamed: 0,y_true,symptom,0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,...,DE,De,dE,de,SERIOUS Y,SERIOUS N,manufacturer_JANSSEN,manufacturer_MODERNA,manufacturer_NOVAVAX,manufacturer_PFIZER\BIONTECH
280,1,Angioedema,23,95,161,304,308,287,191,105,...,2878.0,943022.0,2351.0,1530710.0,564.0,2314.0,0,0,0,1
281,1,Erythema multiforme,20,12,31,28,41,30,34,30,...,733.0,945167.0,2228.0,1530833.0,131.0,602.0,0,0,0,1
282,1,Insomnia,33,205,732,1345,1345,1253,1012,581,...,11215.0,934685.0,14609.0,1518452.0,2674.0,8541.0,0,0,0,1
283,1,Vomiting,833,2633,3123,3720,3366,3417,2721,1689,...,31787.0,914113.0,62395.0,1470666.0,9110.0,22677.0,0,0,0,1
284,1,Malaise,166,1244,2818,4085,4136,4262,3578,2473,...,40198.0,905702.0,44938.0,1488123.0,13119.0,27083.0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
735,0,JC polyomavirus test,0,0,2,0,1,0,2,0,...,22.0,945878.0,2.0,1533059.0,19.0,3.0,0,0,0,1
736,0,Cardiovascular function test,1,1,4,4,10,9,6,4,...,122.0,945778.0,20.0,1533041.0,75.0,47.0,0,0,0,1
737,0,Injection site streaking,0,0,0,2,3,8,3,2,...,18.0,945882.0,525.0,1532536.0,1.0,17.0,0,0,0,1
738,0,Vitamin B6,0,3,9,7,6,8,3,0,...,79.0,945821.0,30.0,1533031.0,56.0,23.0,0,0,0,1


In [36]:
df_other_copy = df_other.copy()
df_test_copy = df_test.copy()

In [37]:
def scale_multi_dim_var(cols):
    max_sum = df_other_copy[cols].sum(axis=1).max()
    df_other_copy[cols] = df_other_copy[cols] / max_sum
    
    max_sum_test = df_test_copy[cols].sum(axis=1).max()
    df_test_copy[cols] = df_test_copy[cols] / max_sum_test

In [38]:
age_cols = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90-99', '100-109', '110-119']
scale_multi_dim_var(age_cols)

numdays_cols = ['NUMDAYS_0-1', 'NUMDAYS_2-3', 'NUMDAYS_4-5', 'NUMDAYS_6-7', 'NUMDAYS_8-9', 'NUMDAYS_10-11',
               'NUMDAYS_12-13', 'NUMDAYS_14+']
scale_multi_dim_var(numdays_cols)

reaction_cols = ['DIED', 'L_THREAT', 'HOSPITAL', 'X_STAY', 'DISABLE', 'BIRTH_DEFECT']
scale_multi_dim_var(reaction_cols)

de_cols = ['DE', 'De', 'dE', 'de']
scale_multi_dim_var(de_cols)

serious_cols = ['SERIOUS Y', 'SERIOUS N']
scale_multi_dim_var(serious_cols)

sex_cols = ['SEX_M', 'SEX_F', 'SEX_U']
scale_multi_dim_var(sex_cols)

df_other_copy

Unnamed: 0,y_true,symptom,0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,...,DE,De,dE,de,SERIOUS Y,SERIOUS N,manufacturer_JANSSEN,manufacturer_MODERNA,manufacturer_NOVAVAX,manufacturer_PFIZER\BIONTECH
280,1,Angioedema,0.000753,0.003112,0.005274,0.009959,0.010090,0.009402,0.006257,0.003440,...,0.001161,0.380410,9.483812e-04,0.617480,0.014029,0.057559,0,0,0,1
281,1,Erythema multiforme,0.000655,0.000393,0.001016,0.000917,0.001343,0.000983,0.001114,0.000983,...,0.000296,0.381275,8.987636e-04,0.617530,0.003259,0.014974,0,0,0,1
282,1,Insomnia,0.001081,0.006716,0.023980,0.044062,0.044062,0.041048,0.033153,0.019034,...,0.004524,0.377047,5.893195e-03,0.612536,0.066514,0.212452,0,0,0,1
283,1,Vomiting,0.027289,0.086257,0.102310,0.121867,0.110270,0.111941,0.089140,0.055332,...,0.012823,0.368748,2.516982e-02,0.593259,0.226606,0.564076,0,0,0,1
284,1,Malaise,0.005438,0.040753,0.092318,0.133825,0.135495,0.139623,0.117215,0.081016,...,0.016216,0.365355,1.812776e-02,0.600301,0.326327,0.673673,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
735,0,JC polyomavirus test,0.000000,0.000000,0.000066,0.000000,0.000033,0.000000,0.000066,0.000000,...,0.000009,0.381562,8.067896e-07,0.618428,0.000473,0.000075,0,0,0,1
736,0,Cardiovascular function test,0.000033,0.000033,0.000131,0.000131,0.000328,0.000295,0.000197,0.000131,...,0.000049,0.381522,8.067896e-06,0.618421,0.001866,0.001169,0,0,0,1
737,0,Injection site streaking,0.000000,0.000000,0.000000,0.000066,0.000098,0.000262,0.000098,0.000066,...,0.000007,0.381564,2.117823e-04,0.618217,0.000025,0.000423,0,0,0,1
738,0,Vitamin B6,0.000000,0.000098,0.000295,0.000229,0.000197,0.000262,0.000098,0.000000,...,0.000032,0.381539,1.210184e-05,0.618417,0.001393,0.000572,0,0,0,1


In [39]:
X = pd.concat([df_other_copy, df_test_copy], ignore_index=True)
y = X['y_true']
X = X.drop('y_true', axis=1)

In [40]:
print(X.shape)
print(y.shape)

(740, 40)
(740,)


In [49]:
X

Unnamed: 0,symptom,0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80-89,...,DE,De,dE,de,SERIOUS Y,SERIOUS N,manufacturer_JANSSEN,manufacturer_MODERNA,manufacturer_NOVAVAX,manufacturer_PFIZER\BIONTECH
0,Angioedema,0.000753,0.003112,0.005274,0.009959,0.010090,0.009402,0.006257,0.003440,0.001474,...,0.001161,0.380410,0.000948,0.617480,0.014029,0.057559,0,0,0,1
1,Erythema multiforme,0.000655,0.000393,0.001016,0.000917,0.001343,0.000983,0.001114,0.000983,0.000655,...,0.000296,0.381275,0.000899,0.617530,0.003259,0.014974,0,0,0,1
2,Insomnia,0.001081,0.006716,0.023980,0.044062,0.044062,0.041048,0.033153,0.019034,0.005897,...,0.004524,0.377047,0.005893,0.612536,0.066514,0.212452,0,0,0,1
3,Vomiting,0.027289,0.086257,0.102310,0.121867,0.110270,0.111941,0.089140,0.055332,0.026601,...,0.012823,0.368748,0.025170,0.593259,0.226606,0.564076,0,0,0,1
4,Malaise,0.005438,0.040753,0.092318,0.133825,0.135495,0.139623,0.117215,0.081016,0.035348,...,0.016216,0.365355,0.018128,0.600301,0.326327,0.673673,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
735,Coagulation factor V level,0.000000,0.000000,0.000000,0.000000,0.000063,0.000000,0.000063,0.000000,0.000000,...,0.000002,0.039689,0.000036,0.960273,0.000153,0.000051,1,0,0,0
736,Abdominal rigidity,0.000000,0.000000,0.000126,0.000126,0.000126,0.000189,0.000063,0.000063,0.000000,...,0.000007,0.039684,0.000092,0.960217,0.000357,0.000510,1,0,0,0
737,Thrombocytosis,0.000000,0.000000,0.000000,0.000063,0.000063,0.000063,0.000063,0.000063,0.000000,...,0.000003,0.039688,0.000097,0.960212,0.000357,0.000051,1,0,0,0
738,Thyroxine free,0.000000,0.000000,0.000126,0.000314,0.000754,0.000251,0.000126,0.000063,0.000000,...,0.000013,0.039678,0.000326,0.959983,0.000612,0.001071,1,0,0,0


In [53]:
from sklearn.preprocessing import LabelEncoder

embedding_info = ce.get_embedding_info(X, categorical_variables=['symptom'])
X_encoded, encoders = ce.get_label_encoded_data(X, categorical_variables=['symptom'])

X_train_encoded = X_encoded.iloc[:len(df_other)]
y_train_encoded = pd.Series(LabelEncoder().fit_transform(y.iloc[:len(df_other)]))

# Get embeddings
embeddings = ce.get_embeddings(X_train_encoded, y_train_encoded, categorical_embedding_info=embedding_info,
                               is_classification=True, epochs=100, batch_size=256)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [54]:
df_other_embedded = ce.fit_transform(df_other_copy, embeddings=embeddings, encoders=encoders, drop_categorical_vars=True)
df_test_embedded = ce.fit_transform(df_test_copy, embeddings=embeddings, encoders=encoders, drop_categorical_vars=True)

In [55]:
print(df_other_embedded.shape)
print(df_test_embedded.shape)

(740, 89)


In [44]:

df_other_embedded

Unnamed: 0,y_true,0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80-89,...,symptom_embedding_40,symptom_embedding_41,symptom_embedding_42,symptom_embedding_43,symptom_embedding_44,symptom_embedding_45,symptom_embedding_46,symptom_embedding_47,symptom_embedding_48,symptom_embedding_49
280,1,0.000753,0.003112,0.005274,0.009959,0.010090,0.009402,0.006257,0.003440,0.001474,...,0.072883,0.051374,-0.011422,0.070715,0.002122,-0.039608,-0.005973,-0.022606,0.006071,-0.026018
281,1,0.000655,0.000393,0.001016,0.000917,0.001343,0.000983,0.001114,0.000983,0.000655,...,0.030801,0.073768,-0.054399,0.070028,-0.056906,-0.068597,-0.009662,0.077479,0.051174,-0.038502
282,1,0.001081,0.006716,0.023980,0.044062,0.044062,0.041048,0.033153,0.019034,0.005897,...,-0.020722,-0.024217,-0.049730,0.069132,0.018435,-0.003915,-0.038062,-0.015338,0.051287,0.008760
283,1,0.027289,0.086257,0.102310,0.121867,0.110270,0.111941,0.089140,0.055332,0.026601,...,0.001843,0.016675,-0.007700,0.047092,0.030687,-0.057993,-0.026730,0.038080,0.022538,-0.051694
284,1,0.005438,0.040753,0.092318,0.133825,0.135495,0.139623,0.117215,0.081016,0.035348,...,0.007967,0.007533,-0.002606,-0.016519,0.051346,-0.057507,0.018874,0.017172,0.037908,-0.055132
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
735,0,0.000000,0.000000,0.000066,0.000000,0.000033,0.000000,0.000066,0.000000,0.000033,...,-0.011311,0.040461,0.017210,-0.001321,0.014348,-0.022530,0.005763,-0.046514,-0.007625,0.022377
736,0,0.000033,0.000033,0.000131,0.000131,0.000328,0.000295,0.000197,0.000131,0.000098,...,0.044157,-0.020538,0.038682,-0.039177,-0.003087,0.046062,-0.007926,-0.026610,0.015146,0.014391
737,0,0.000000,0.000000,0.000000,0.000066,0.000098,0.000262,0.000098,0.000066,0.000000,...,0.010036,-0.040909,0.014446,-0.013675,0.042600,0.007885,0.022552,-0.042215,0.045749,-0.020172
738,0,0.000000,0.000098,0.000295,0.000229,0.000197,0.000262,0.000098,0.000000,0.000000,...,0.047258,0.002857,0.020705,-0.000889,0.025060,-0.007529,-0.000564,0.044165,-0.044915,0.031951


In [45]:
df_test_embedded

Unnamed: 0,y_true,0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80-89,...,symptom_embedding_40,symptom_embedding_41,symptom_embedding_42,symptom_embedding_43,symptom_embedding_44,symptom_embedding_45,symptom_embedding_46,symptom_embedding_47,symptom_embedding_48,symptom_embedding_49
0,1,0.000063,0.003519,0.017782,0.023060,0.024128,0.024065,0.015646,0.005027,0.001068,...,-0.014778,0.003964,-0.013519,-0.023167,0.034921,0.019186,0.040619,0.003764,0.027214,0.037214
1,1,0.000440,0.023311,0.114169,0.114797,0.094314,0.096324,0.057870,0.016525,0.004273,...,-0.010946,-0.012091,-0.017636,-0.039410,0.003663,-0.001802,0.001173,-0.008259,0.038269,-0.031059
2,1,0.000000,0.001445,0.005969,0.012315,0.012441,0.015394,0.010556,0.004901,0.001382,...,0.014271,-0.048803,-0.046415,0.015356,-0.010391,0.033851,-0.015576,0.031135,0.007100,-0.014178
3,1,0.000000,0.000189,0.001634,0.003456,0.004210,0.005215,0.002639,0.001885,0.000754,...,-0.037220,-0.026920,-0.024913,0.016636,-0.034564,-0.013669,0.018321,0.027398,0.037199,-0.022153
4,1,0.000000,0.000000,0.000000,0.000251,0.000000,0.000063,0.000251,0.000126,0.000063,...,-0.004202,-0.043140,-0.037415,0.027186,-0.022032,0.036764,0.043671,0.009447,0.041735,-0.018646
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275,0,0.000000,0.000000,0.000000,0.000000,0.000063,0.000000,0.000063,0.000000,0.000000,...,0.043899,-0.036809,-0.049759,0.001039,-0.031974,0.004296,0.030820,-0.027318,-0.015533,0.026365
276,0,0.000000,0.000000,0.000126,0.000126,0.000126,0.000189,0.000063,0.000063,0.000000,...,-0.023108,0.024305,-0.033302,0.034569,-0.033942,-0.018293,0.023146,-0.044471,-0.034490,-0.020048
277,0,0.000000,0.000000,0.000000,0.000063,0.000063,0.000063,0.000063,0.000063,0.000000,...,-0.028591,-0.016502,0.013877,0.001378,-0.036516,-0.023037,-0.030250,0.011566,0.011248,-0.020112
278,0,0.000000,0.000000,0.000126,0.000314,0.000754,0.000251,0.000126,0.000063,0.000000,...,0.017034,0.044163,0.024014,0.028514,-0.025663,0.003121,-0.031262,0.041755,0.009540,-0.029858


In [46]:
df_other_embedded.to_pickle('data/df_other_embedded10')  

In [374]:
df_other_embedded_copy = df_other_embedded.copy()
df_test_embedded_copy = df_test_embedded.copy()

In [376]:
df_transformed = pd.concat([df_other_embedded_copy, df_test_embedded_copy])

In [377]:
df_transformed

Unnamed: 0,y_true,0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80-89,...,symptom_embedding_40,symptom_embedding_41,symptom_embedding_42,symptom_embedding_43,symptom_embedding_44,symptom_embedding_45,symptom_embedding_46,symptom_embedding_47,symptom_embedding_48,symptom_embedding_49
280,1,0.000753,0.003112,0.005274,0.009959,0.010090,0.009402,0.006257,0.003440,0.001474,...,0.036859,0.012987,-0.061234,-0.004186,-0.055662,-0.014458,-0.062480,0.011197,0.065043,0.077136
281,1,0.000655,0.000393,0.001016,0.000917,0.001343,0.000983,0.001114,0.000983,0.000655,...,0.057951,-0.011189,-0.071340,0.024492,-0.013903,0.029624,-0.052215,0.061242,0.041733,0.016940
282,1,0.001081,0.006716,0.023980,0.044062,0.044062,0.041048,0.033153,0.019034,0.005897,...,-0.015781,-0.060033,-0.039953,0.021752,-0.013534,0.021844,-0.012677,0.073315,0.038553,0.058060
283,1,0.027289,0.086257,0.102310,0.121867,0.110270,0.111941,0.089140,0.055332,0.026601,...,0.009515,-0.011975,-0.050807,0.058137,-0.042506,0.047001,-0.013710,-0.036974,0.007904,0.045888
284,1,0.005438,0.040753,0.092318,0.133825,0.135495,0.139623,0.117215,0.081016,0.035348,...,0.046891,0.016539,-0.015454,0.054719,-0.051151,0.042944,0.014782,-0.048226,-0.034339,-0.002191
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275,0,0.000000,0.000000,0.000000,0.000000,0.000063,0.000000,0.000063,0.000000,0.000000,...,-0.046338,-0.038258,-0.023938,-0.006976,0.014785,-0.012902,-0.035204,-0.004362,-0.028583,0.002080
276,0,0.000000,0.000000,0.000126,0.000126,0.000126,0.000189,0.000063,0.000063,0.000000,...,-0.015611,0.045626,-0.014261,-0.003219,-0.030572,-0.013642,-0.003378,0.027487,0.004843,-0.042961
277,0,0.000000,0.000000,0.000000,0.000063,0.000063,0.000063,0.000063,0.000063,0.000000,...,-0.048767,-0.028058,0.041895,-0.017405,-0.026763,0.039854,0.044538,-0.048612,-0.027922,0.014829
278,0,0.000000,0.000000,0.000126,0.000314,0.000754,0.000251,0.000126,0.000063,0.000000,...,0.006679,-0.006114,0.007646,0.017000,-0.004471,-0.013921,0.005473,0.049189,-0.047253,0.045166


In [378]:
Janssen = df_transformed[df_transformed['manufacturer_JANSSEN'] == 1]
Others  = df_transformed[df_transformed['manufacturer_JANSSEN'] == 0]

In [379]:
target_column = 'y_true'

# Split the OTHERS_df into X_train and y_train
X_train = Others.drop(columns=[target_column])
y_train = Others[target_column]

# Split the Janssen_df into X_test and y_test
X_test = Janssen.drop(columns=[target_column])
y_test = Janssen[target_column]

In [380]:
y_test

0      1
1      1
2      1
3      1
4      1
      ..
275    0
276    0
277    0
278    0
279    0
Name: y_true, Length: 280, dtype: int64

In [381]:
X_train.to_csv('../Machine_Learning/data/X_train_unique_10.csv', index=False)
X_test.to_csv('../Machine_Learning/data/X_test_unique_10.csv', index=False)
y_train.to_csv('../Machine_Learning/data/y_train_unique_10.csv', index=False)
y_test.to_csv('../Machine_Learning/data/y_test_unique_10.csv', index=False)

In [382]:
X_test

Unnamed: 0,0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80-89,90-99,...,symptom_embedding_40,symptom_embedding_41,symptom_embedding_42,symptom_embedding_43,symptom_embedding_44,symptom_embedding_45,symptom_embedding_46,symptom_embedding_47,symptom_embedding_48,symptom_embedding_49
0,0.000063,0.003519,0.017782,0.023060,0.024128,0.024065,0.015646,0.005027,0.001068,0.000251,...,-0.031792,-0.021895,-0.002600,-0.007345,0.047701,0.040471,-0.046026,-0.021012,0.027435,0.000237
1,0.000440,0.023311,0.114169,0.114797,0.094314,0.096324,0.057870,0.016525,0.004273,0.001131,...,0.034355,-0.025342,0.004985,-0.004853,-0.001912,-0.036803,0.012387,-0.020887,-0.027174,0.045523
2,0.000000,0.001445,0.005969,0.012315,0.012441,0.015394,0.010556,0.004901,0.001382,0.000189,...,-0.005448,-0.023026,-0.045969,-0.021206,-0.000887,-0.049768,0.043007,-0.036687,0.012983,0.034753
3,0.000000,0.000189,0.001634,0.003456,0.004210,0.005215,0.002639,0.001885,0.000754,0.000126,...,0.031358,-0.034581,0.008295,0.048916,0.000776,-0.018169,-0.023784,0.023504,-0.035434,-0.003846
4,0.000000,0.000000,0.000000,0.000251,0.000000,0.000063,0.000251,0.000126,0.000063,0.000000,...,0.019673,0.013351,0.023705,0.010016,0.000077,-0.019458,-0.039591,0.004215,-0.001771,-0.022175
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275,0.000000,0.000000,0.000000,0.000000,0.000063,0.000000,0.000063,0.000000,0.000000,0.000000,...,-0.046338,-0.038258,-0.023938,-0.006976,0.014785,-0.012902,-0.035204,-0.004362,-0.028583,0.002080
276,0.000000,0.000000,0.000126,0.000126,0.000126,0.000189,0.000063,0.000063,0.000000,0.000000,...,-0.015611,0.045626,-0.014261,-0.003219,-0.030572,-0.013642,-0.003378,0.027487,0.004843,-0.042961
277,0.000000,0.000000,0.000000,0.000063,0.000063,0.000063,0.000063,0.000063,0.000000,0.000000,...,-0.048767,-0.028058,0.041895,-0.017405,-0.026763,0.039854,0.044538,-0.048612,-0.027922,0.014829
278,0.000000,0.000000,0.000126,0.000314,0.000754,0.000251,0.000126,0.000063,0.000000,0.000000,...,0.006679,-0.006114,0.007646,0.017000,-0.004471,-0.013921,0.005473,0.049189,-0.047253,0.045166
