In [10]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
from tqdm import tqdm
from collections import Counter

# Connect to the local MongoDB server and open the 'vaers' database.
client = MongoClient('mongodb://localhost:27017/')
db = client['vaers']

# `collection` is queried by symptom / manufacturer / vaccine for the DE/De/dE/de
# values; `collection2` is queried for the individual reports.
collection, collection2 = db['combinations cleaned'], db['reports cleaned']

In [25]:
# Load, for each manufacturer, the label array (`y_true_*`) and the symptom-name
# array (`order_*`), then stack them into three parallel arrays:
#   y_true - labels, order - symptom names, manu - manufacturer string per entry.
#
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects.
# Only load .npy files generated by this project, never files from untrusted sources.
array_path = '../Array_Generation/All_data/Arrays_{name}/{kind}_{name}.npy'
short_names = ('Pfizer', 'Moderna', 'Janssen', 'Novavax')

y_true_Pfizer, y_true_Moderna, y_true_Janssen, y_true_Novavax = [
    np.load(array_path.format(name=name, kind='y_true'), allow_pickle=True)
    for name in short_names
]
order_Pfizer, order_Moderna, order_Janssen, order_Novavax = [
    np.load(array_path.format(name=name, kind='order'), allow_pickle=True)
    for name in short_names
]

# The three concatenated arrays are indexed together further down, so a length
# mismatch inside any manufacturer would silently shift every later entry.
# Fail fast instead.
for name, labels, symptoms in zip(
    short_names,
    (y_true_Pfizer, y_true_Moderna, y_true_Janssen, y_true_Novavax),
    (order_Pfizer, order_Moderna, order_Janssen, order_Novavax),
):
    if len(labels) != len(symptoms):
        raise ValueError(
            f"{name}: y_true has {len(labels)} entries but order has {len(symptoms)}"
        )

# One manufacturer string per label, using the spelling expected by the MongoDB queries.
manufacturer_array_Pfizer = np.full(len(y_true_Pfizer), 'PFIZER\\BIONTECH')
manufacturer_array_Moderna = np.full(len(y_true_Moderna), 'MODERNA')
manufacturer_array_Janssen = np.full(len(y_true_Janssen), 'JANSSEN')
manufacturer_array_Novavax = np.full(len(y_true_Novavax), 'NOVAVAX')

y_true = np.concatenate([y_true_Pfizer, y_true_Moderna, y_true_Janssen, y_true_Novavax])
manu = np.concatenate([manufacturer_array_Pfizer, manufacturer_array_Moderna, manufacturer_array_Janssen, manufacturer_array_Novavax])
order = np.concatenate([order_Pfizer, order_Moderna, order_Janssen, order_Novavax])

In [26]:
# Build the modelling subset: every positive entry (y_true == 1) plus a random
# sample of up to 1000 negative entries (y_true == 0).
indexes_y_true_1 = np.where(y_true == 1)[0]
indexes_y_true_0 = np.where(y_true == 0)[0]

np.random.seed(42)  # Set seed for reproducibility
# Sampling without replacement raises if more items are requested than exist,
# so cap the request at the number of available negatives.
sample_size = min(1000, len(indexes_y_true_0))
sampled_indexes_y_true_0 = np.random.choice(indexes_y_true_0, size=sample_size, replace=False)

# Positives first, sampled negatives after; the three arrays stay aligned
# because they are all indexed with the same index vector.
combined_indexes = np.concatenate([indexes_y_true_1, sampled_indexes_y_true_0])

filtered_y_true = y_true[combined_indexes]
filtered_order = order[combined_indexes]
filtered_manu = manu[combined_indexes]

In [27]:
# One row per sampled entry: its label, symptom name and manufacturer.
df = pd.DataFrame(
    dict(
        y_true=filtered_y_true,
        symptom=filtered_order,
        manufacturer=filtered_manu,
    )
)

In [28]:
# Build the per-(symptom, manufacturer) feature table and write it to disk.
# For each row of `df` this cell adds:
#   * DE / De / dE / de, copied from the matching 'combinations cleaned' document;
#   * counts over the matching 'reports cleaned' documents: one column per age
#     decade, per NUMDAYS interval, per sex and per outcome flag, plus
#     'SERIOUS Y' / 'SERIOUS N'.

# Report fields to fetch, each listed once.
# NOTE(review): 'RECOVD' is fetched but never read below - drop it or add a feature.
columns = ['AGE', 'SEX', 'SERIOUS', 'NUMDAYS', 'DIED', 'L_THREAT', 'HOSPITAL',
           'X_STAY', 'DISABLE', 'BIRTH_DEFECT', 'RECOVD']

# Left-closed decade bins [0, 10), [10, 20), ..., [110, 120), labelled '0-9' ... '110-119'.
# Values outside the bins (or missing) fall into no bin and are not counted.
age_bins = list(range(0, 130, 10))
age_labels = [f"{i}-{i+9}" for i in age_bins[:-1]]

# Left-closed two-day bins [0, 2), ..., [12, 14), then an open-ended [14, inf) bin.
numdays_bins = list(range(0, 16, 2)) + [float('inf')]
numdays_labels = [f"{i}-{i+1}" for i in numdays_bins[:-2]] + ['14+']

sex_values = ['M', 'F', 'U']
count_columns = ['DIED', 'L_THREAT', 'HOSPITAL', 'X_STAY', 'DISABLE', 'BIRTH_DEFECT']

# Pre-create the integer count columns so their order is fixed before the loop.
for label in age_labels:
    df[label] = 0
for label in numdays_labels:
    df[f'NUMDAYS_{label}'] = 0
for sex in sex_values:
    df[f'SEX_{sex}'] = 0
for col in count_columns:
    df[col] = 0

# The projection is the same for every row, so build it once outside the loop.
projection = {field: 1 for field in columns}
projection['_id'] = 0  # Exclude _id from the result

# (symptom, manufacturer) pairs for which no combination document was found.
missing_combinations = []

# `total` lets tqdm draw a real progress bar instead of a bare iteration counter.
for index, row in tqdm(df.iterrows(), total=len(df)):
    # First query: the combination document for this symptom / manufacturer.
    query1 = {
        'symptom': row['symptom'],
        'manufacturer': row['manufacturer'],
        'vaccine': 'COVID19'
    }
    # Only the first match is used, so fetch a single document.
    combination = collection.find_one(query1)
    if combination is None:
        # Record the gap and leave the four values as NaN instead of crashing.
        missing_combinations.append((row['symptom'], row['manufacturer']))
        combination = {}
    for field in ('DE', 'De', 'dE', 'de'):
        df.loc[index, field] = combination.get(field, np.nan)

    # Second query: all individual reports for this symptom / manufacturer.
    # NOTE(review): if 'vax_data' is an array of sub-documents, the two dotted
    # conditions may be satisfied by different array elements; an $elemMatch
    # would be needed to tie them to the same vaccine entry - confirm the schema.
    query2 = {
        'symptoms': row['symptom'],
        'vax_data.VAX_MANU': row['manufacturer'],
        'vax_data.VAX_TYPE': 'COVID19'
    }
    # Passing `columns` guarantees every field exists as a column (NaN where a
    # report lacks it), including when no report matches at all.
    reports = pd.DataFrame(list(collection2.find(query2, projection)), columns=columns)

    # Age distribution; non-numeric or missing ages become NaN and are not counted.
    ages = pd.to_numeric(reports['AGE'], errors='coerce')
    age_groups = pd.cut(ages, bins=age_bins, labels=age_labels, right=False)
    for label in age_labels:
        df.loc[index, label] = (age_groups == label).sum()

    # Seriousness counts. These two columns are created on first assignment.
    df.loc[index, 'SERIOUS Y'] = (reports['SERIOUS'] == 'Y').sum()
    df.loc[index, 'SERIOUS N'] = (reports['SERIOUS'] == 'N').sum()

    # Number of reports with each outcome flag set to 'Y'.
    for col in count_columns:
        df.loc[index, col] = (reports[col] == 'Y').sum()

    # Distribution of NUMDAYS over the two-day bins.
    numdays = pd.to_numeric(reports['NUMDAYS'], errors='coerce')
    numdays_groups = pd.cut(numdays, bins=numdays_bins, labels=numdays_labels, right=False)
    for label in numdays_labels:
        df.loc[index, f'NUMDAYS_{label}'] = (numdays_groups == label).sum()

    # Sex distribution.
    for sex in sex_values:
        df.loc[index, f'SEX_{sex}'] = (reports['SEX'] == sex).sum()

if missing_combinations:
    print(f"{len(missing_combinations)} row(s) had no matching combination document; "
          "DE/De/dE/de were left as NaN for them.")

df.to_csv('../Machine_Learning/df.csv', index=False)
df.to_json('../Machine_Learning/df.json', orient='records', lines=True)

1125it [03:00,  6.23it/s]


In [4]:
import pandas as pd
# Reload the feature table that the enrichment cell wrote to df.csv.
df = pd.read_csv('../Machine_Learning/df.csv')

In [5]:
# Alias only: df2 refers to the same DataFrame object as df (no copy is made).
df2 = df

In [8]:
# Show the column names of the reloaded feature table.
df2.columns


Index(['y_true', 'symptom', 'manufacturer', '0-9', '10-19', '20-29', '30-39',
       '40-49', '50-59', '60-69', '70-79', '80-89', '90-99', '100-109',
       '110-119', 'NUMDAYS_0-1', 'NUMDAYS_2-3', 'NUMDAYS_4-5', 'NUMDAYS_6-7',
       'NUMDAYS_8-9', 'NUMDAYS_10-11', 'NUMDAYS_12-13', 'NUMDAYS_14+', 'SEX_M',
       'SEX_F', 'SEX_U', 'DIED', 'L_THREAT', 'HOSPITAL', 'X_STAY', 'DISABLE',
       'BIRTH_DEFECT', 'DE', 'De', 'dE', 'de', 'SERIOUS Y', 'SERIOUS N'],
      dtype='object')

In [32]:
# One-hot encode 'manufacturer': the column is replaced by one integer (0/1)
# indicator column per manufacturer value, named 'manufacturer_<value>'.
df2 = pd.get_dummies(df2, columns=['manufacturer'], dtype=int)



# Display the encoded DataFrame
df2

Unnamed: 0,y_true,symptom,0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,...,DE,De,dE,de,SERIOUS Y,SERIOUS N,manufacturer_JANSSEN,manufacturer_MODERNA,manufacturer_NOVAVAX,manufacturer_PFIZER\BIONTECH
0,1,Angioedema,23,95,161,304,308,287,191,105,...,2878.0,943022.0,2351.0,1530710.0,564.0,2314.0,0,0,0,1
1,1,Hypoaesthesia,39,732,2295,4248,4426,3566,1960,862,...,26529.0,919371.0,27845.0,1505216.0,7405.0,19126.0,0,0,0,1
2,1,Erythema multiforme,20,12,31,28,41,30,34,30,...,733.0,945167.0,2228.0,1530833.0,131.0,602.0,0,0,0,1
3,1,Insomnia,33,205,732,1345,1345,1253,1012,581,...,11215.0,934685.0,14609.0,1518452.0,2674.0,8541.0,0,0,0,1
4,1,Myalgia,106,935,4266,7133,6992,6599,4653,2335,...,52182.0,893718.0,75910.0,1457151.0,12832.0,39351.0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,0,Increased insulin requirement,0,0,1,1,0,0,1,0,...,3.0,98389.0,33.0,2380536.0,2.0,1.0,1,0,0,0
1121,0,Blood lactate dehydrogenase decreased,0,0,0,0,1,0,0,0,...,7.0,945893.0,17.0,1533044.0,4.0,3.0,0,0,0,1
1122,0,Breast cellulitis,0,0,0,0,2,0,0,2,...,4.0,537275.0,16.0,1941666.0,1.0,3.0,0,1,0,0
1123,0,Vascular graft thrombosis,0,0,0,0,0,2,0,1,...,3.0,537276.0,10.0,1941672.0,3.0,0.0,0,1,0,0


In [33]:
# Save the one-hot-encoded table; index=False keeps the row index out of the file.
df2.to_csv('../Machine_Learning/df2.csv', index=False)

# Dataset where no negative symptom appears twice

In [12]:
# Load, for each manufacturer, the label array (`y_true_*`) and the symptom-name
# array (`order_*`), then stack them into three parallel arrays:
#   y_true - labels, order - symptom names, manu - manufacturer string per entry.
#
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects.
# Only load .npy files generated by this project, never files from untrusted sources.
array_path = '../Array_Generation/All_data/Arrays_{name}/{kind}_{name}.npy'
short_names = ('Pfizer', 'Moderna', 'Janssen', 'Novavax')

y_true_Pfizer, y_true_Moderna, y_true_Janssen, y_true_Novavax = [
    np.load(array_path.format(name=name, kind='y_true'), allow_pickle=True)
    for name in short_names
]
order_Pfizer, order_Moderna, order_Janssen, order_Novavax = [
    np.load(array_path.format(name=name, kind='order'), allow_pickle=True)
    for name in short_names
]

# The three concatenated arrays are indexed together further down, so a length
# mismatch inside any manufacturer would silently shift every later entry.
# Fail fast instead.
for name, labels, symptoms in zip(
    short_names,
    (y_true_Pfizer, y_true_Moderna, y_true_Janssen, y_true_Novavax),
    (order_Pfizer, order_Moderna, order_Janssen, order_Novavax),
):
    if len(labels) != len(symptoms):
        raise ValueError(
            f"{name}: y_true has {len(labels)} entries but order has {len(symptoms)}"
        )

# One manufacturer string per label, using the spelling expected by the MongoDB queries.
manufacturer_array_Pfizer = np.full(len(y_true_Pfizer), 'PFIZER\\BIONTECH')
manufacturer_array_Moderna = np.full(len(y_true_Moderna), 'MODERNA')
manufacturer_array_Janssen = np.full(len(y_true_Janssen), 'JANSSEN')
manufacturer_array_Novavax = np.full(len(y_true_Novavax), 'NOVAVAX')

y_true = np.concatenate([y_true_Pfizer, y_true_Moderna, y_true_Janssen, y_true_Novavax])
manu = np.concatenate([manufacturer_array_Pfizer, manufacturer_array_Moderna, manufacturer_array_Janssen, manufacturer_array_Novavax])
order = np.concatenate([order_Pfizer, order_Moderna, order_Janssen, order_Novavax])

In [16]:
# Put the three parallel arrays side by side so the full, unsampled data can be inspected.
data = dict(y_true=y_true, order=order, manufacturer=manu)
df = pd.DataFrame(data)

In [17]:
# Display the full label / symptom / manufacturer table.
df

Unnamed: 0,y_true,order,manufacturer
0,0,Blood lead,PFIZER\BIONTECH
1,0,Magnetic resonance imaging,PFIZER\BIONTECH
2,0,Hypothyroidism,PFIZER\BIONTECH
3,0,Vitamin B6,PFIZER\BIONTECH
4,0,Anxiety disorder,PFIZER\BIONTECH
...,...,...,...
20423,0,Incorrect product formulation administered,NOVAVAX
20424,0,Head discomfort,NOVAVAX
20425,0,Blood pressure increased,NOVAVAX
20426,0,Muscle spasms,NOVAVAX


In [18]:
# Positions of the positive (1) and negative (0) entries in the stacked arrays.
indexes_y_true_1 = np.where(y_true == 1)[0]
indexes_y_true_0 = np.where(y_true == 0)[0]

# Keep only the first negative entry seen for each symptom name. A dict keeps
# insertion order, so the surviving indexes stay in their original order.
first_index_by_order = {}
for idx in indexes_y_true_0:
    first_index_by_order.setdefault(order[idx], idx)

unique_orders = set(first_index_by_order)
unique_order_indexes = np.array(list(first_index_by_order.values()))

np.random.seed(42)  # Set seed for reproducibility
# Never ask for more negatives than there are distinct symptom names.
sample_size = min(1000, len(unique_order_indexes))
sampled_indexes_y_true_0 = np.random.choice(unique_order_indexes, size=sample_size, replace=False)

# All positives followed by the sampled, de-duplicated negatives.
combined_indexes = np.concatenate([indexes_y_true_1, sampled_indexes_y_true_0])

filtered_y_true, filtered_order, filtered_manu = (
    y_true[combined_indexes],
    order[combined_indexes],
    manu[combined_indexes],
)


In [20]:
# One row per sampled entry: its label, symptom name and manufacturer.
df = pd.DataFrame(
    dict(
        y_true=filtered_y_true,
        symptom=filtered_order,
        manufacturer=filtered_manu,
    )
)

In [21]:
# Build the per-(symptom, manufacturer) feature table and write it to disk.
# For each row of `df` this cell adds:
#   * DE / De / dE / de, copied from the matching 'combinations cleaned' document;
#   * counts over the matching 'reports cleaned' documents: one column per age
#     decade, per NUMDAYS interval, per sex and per outcome flag, plus
#     'SERIOUS Y' / 'SERIOUS N'.

# Report fields to fetch, each listed once.
# NOTE(review): 'RECOVD' is fetched but never read below - drop it or add a feature.
columns = ['AGE', 'SEX', 'SERIOUS', 'NUMDAYS', 'DIED', 'L_THREAT', 'HOSPITAL',
           'X_STAY', 'DISABLE', 'BIRTH_DEFECT', 'RECOVD']

# Left-closed decade bins [0, 10), [10, 20), ..., [110, 120), labelled '0-9' ... '110-119'.
# Values outside the bins (or missing) fall into no bin and are not counted.
age_bins = list(range(0, 130, 10))
age_labels = [f"{i}-{i+9}" for i in age_bins[:-1]]

# Left-closed two-day bins [0, 2), ..., [12, 14), then an open-ended [14, inf) bin.
numdays_bins = list(range(0, 16, 2)) + [float('inf')]
numdays_labels = [f"{i}-{i+1}" for i in numdays_bins[:-2]] + ['14+']

sex_values = ['M', 'F', 'U']
count_columns = ['DIED', 'L_THREAT', 'HOSPITAL', 'X_STAY', 'DISABLE', 'BIRTH_DEFECT']

# Pre-create the integer count columns so their order is fixed before the loop.
for label in age_labels:
    df[label] = 0
for label in numdays_labels:
    df[f'NUMDAYS_{label}'] = 0
for sex in sex_values:
    df[f'SEX_{sex}'] = 0
for col in count_columns:
    df[col] = 0

# The projection is the same for every row, so build it once outside the loop.
projection = {field: 1 for field in columns}
projection['_id'] = 0  # Exclude _id from the result

# (symptom, manufacturer) pairs for which no combination document was found.
missing_combinations = []

# `total` lets tqdm draw a real progress bar instead of a bare iteration counter.
for index, row in tqdm(df.iterrows(), total=len(df)):
    # First query: the combination document for this symptom / manufacturer.
    query1 = {
        'symptom': row['symptom'],
        'manufacturer': row['manufacturer'],
        'vaccine': 'COVID19'
    }
    # Only the first match is used, so fetch a single document.
    combination = collection.find_one(query1)
    if combination is None:
        # Record the gap and leave the four values as NaN instead of crashing.
        missing_combinations.append((row['symptom'], row['manufacturer']))
        combination = {}
    for field in ('DE', 'De', 'dE', 'de'):
        df.loc[index, field] = combination.get(field, np.nan)

    # Second query: all individual reports for this symptom / manufacturer.
    # NOTE(review): if 'vax_data' is an array of sub-documents, the two dotted
    # conditions may be satisfied by different array elements; an $elemMatch
    # would be needed to tie them to the same vaccine entry - confirm the schema.
    query2 = {
        'symptoms': row['symptom'],
        'vax_data.VAX_MANU': row['manufacturer'],
        'vax_data.VAX_TYPE': 'COVID19'
    }
    # Passing `columns` guarantees every field exists as a column (NaN where a
    # report lacks it), including when no report matches at all.
    reports = pd.DataFrame(list(collection2.find(query2, projection)), columns=columns)

    # Age distribution; non-numeric or missing ages become NaN and are not counted.
    ages = pd.to_numeric(reports['AGE'], errors='coerce')
    age_groups = pd.cut(ages, bins=age_bins, labels=age_labels, right=False)
    for label in age_labels:
        df.loc[index, label] = (age_groups == label).sum()

    # Seriousness counts. These two columns are created on first assignment.
    df.loc[index, 'SERIOUS Y'] = (reports['SERIOUS'] == 'Y').sum()
    df.loc[index, 'SERIOUS N'] = (reports['SERIOUS'] == 'N').sum()

    # Number of reports with each outcome flag set to 'Y'.
    for col in count_columns:
        df.loc[index, col] = (reports[col] == 'Y').sum()

    # Distribution of NUMDAYS over the two-day bins.
    numdays = pd.to_numeric(reports['NUMDAYS'], errors='coerce')
    numdays_groups = pd.cut(numdays, bins=numdays_bins, labels=numdays_labels, right=False)
    for label in numdays_labels:
        df.loc[index, f'NUMDAYS_{label}'] = (numdays_groups == label).sum()

    # Sex distribution.
    for sex in sex_values:
        df.loc[index, f'SEX_{sex}'] = (reports['SEX'] == sex).sum()

if missing_combinations:
    print(f"{len(missing_combinations)} row(s) had no matching combination document; "
          "DE/De/dE/de were left as NaN for them.")

df.to_csv('../Machine_Learning/df_unique.csv', index=False)
df.to_json('../Machine_Learning/df_unique.json', orient='records', lines=True)

1125it [02:59,  6.28it/s]


In [22]:
import pandas as pd
# Reload the feature table that the enrichment cell wrote to df_unique.csv.
df = pd.read_csv('../Machine_Learning/df_unique.csv')

In [23]:
# Alias only: df2 refers to the same DataFrame object as df (no copy is made).
df2 = df

In [24]:
# Show the column names of the reloaded feature table.
df2.columns

Index(['y_true', 'symptom', 'manufacturer', '0-9', '10-19', '20-29', '30-39',
       '40-49', '50-59', '60-69', '70-79', '80-89', '90-99', '100-109',
       '110-119', 'NUMDAYS_0-1', 'NUMDAYS_2-3', 'NUMDAYS_4-5', 'NUMDAYS_6-7',
       'NUMDAYS_8-9', 'NUMDAYS_10-11', 'NUMDAYS_12-13', 'NUMDAYS_14+', 'SEX_M',
       'SEX_F', 'SEX_U', 'DIED', 'L_THREAT', 'HOSPITAL', 'X_STAY', 'DISABLE',
       'BIRTH_DEFECT', 'DE', 'De', 'dE', 'de', 'SERIOUS Y', 'SERIOUS N'],
      dtype='object')

In [25]:
# One-hot encode 'manufacturer': the column is replaced by one integer (0/1)
# indicator column per manufacturer value, named 'manufacturer_<value>'.
df2 = pd.get_dummies(df2, columns=['manufacturer'], dtype=int)

In [26]:
# Display the encoded DataFrame.
df2

Unnamed: 0,y_true,symptom,0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,...,DE,De,dE,de,SERIOUS Y,SERIOUS N,manufacturer_JANSSEN,manufacturer_MODERNA,manufacturer_NOVAVAX,manufacturer_PFIZER\BIONTECH
0,1,Angioedema,23,95,161,304,308,287,191,105,...,2878.0,943022.0,2351.0,1530710.0,564.0,2314.0,0,0,0,1
1,1,Hypoaesthesia,39,732,2295,4248,4426,3566,1960,862,...,26529.0,919371.0,27845.0,1505216.0,7405.0,19126.0,0,0,0,1
2,1,Erythema multiforme,20,12,31,28,41,30,34,30,...,733.0,945167.0,2228.0,1530833.0,131.0,602.0,0,0,0,1
3,1,Insomnia,33,205,732,1345,1345,1253,1012,581,...,11215.0,934685.0,14609.0,1518452.0,2674.0,8541.0,0,0,0,1
4,1,Myalgia,106,935,4266,7133,6992,6599,4653,2335,...,52182.0,893718.0,75910.0,1457151.0,12832.0,39351.0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,0,Hyperalbuminaemia,0,0,0,0,1,0,0,0,...,3.0,945897.0,0.0,1533061.0,1.0,2.0,0,0,0,1
1121,0,Injection site streaking,0,0,0,2,3,8,3,2,...,18.0,945882.0,525.0,1532536.0,1.0,17.0,0,0,0,1
1122,0,Retinopexy,0,0,0,1,1,0,3,1,...,8.0,945892.0,16.0,1533045.0,7.0,1.0,0,0,0,1
1123,0,Wound haemorrhage,0,1,5,4,8,12,3,7,...,79.0,945821.0,52.0,1533009.0,21.0,58.0,0,0,0,1


In [27]:
# Save the one-hot-encoded table; index=False keeps the row index out of the file.
df2.to_csv('../Machine_Learning/df2_unique.csv', index=False)