In [3]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
from tqdm import tqdm
from collections import Counter

# Connect to the local MongoDB instance that hosts the VAERS data.
client = MongoClient('mongodb://localhost:27017/')
db = client['vaers']  # database handle used by both collections below
# Queried later by symptom / manufacturer / vaccine to read the DE, De, dE, de fields.
collection = db['combinations cleaned']
# Queried later by symptoms / vax_data.* to read per-report fields (AGE, SEX, SERIOUS, ...).
collection2 = db['reports cleaned']

# All data

In [4]:
# Per-manufacturer inputs: the label vector (y_true_*), the symptom names (order_*,
# assumed to be index-aligned with y_true_* -- TODO confirm against Array_Generation),
# and a constant manufacturer tag with one entry per label.
# NOTE: allow_pickle=True lets NumPy unpickle object arrays, which can execute
# arbitrary code -- only load .npy files from a trusted source.

# Pfizer
y_true_Pfizer = np.load('../Array_Generation/All_data/Arrays_Pfizer/y_true_Pfizer.npy', allow_pickle=True)
order_Pfizer = np.load('../Array_Generation/All_data/Arrays_Pfizer/order_Pfizer.npy', allow_pickle=True)
manufacturer_array_Pfizer = np.full(shape=len(y_true_Pfizer), fill_value='PFIZER\\BIONTECH')

# Moderna
y_true_Moderna = np.load('../Array_Generation/All_data/Arrays_Moderna/y_true_Moderna.npy', allow_pickle=True)
order_Moderna = np.load('../Array_Generation/All_data/Arrays_Moderna/order_Moderna.npy', allow_pickle=True)
manufacturer_array_Moderna = np.full(shape=len(y_true_Moderna), fill_value='MODERNA')

# Janssen
y_true_Janssen = np.load('../Array_Generation/All_data/Arrays_Janssen/y_true_Janssen.npy', allow_pickle=True)
order_Janssen = np.load('../Array_Generation/All_data/Arrays_Janssen/order_Janssen.npy', allow_pickle=True)
manufacturer_array_Janssen = np.full(shape=len(y_true_Janssen), fill_value='JANSSEN')

# Novavax
y_true_Novavax = np.load('../Array_Generation/All_data/Arrays_Novavax/y_true_Novavax.npy', allow_pickle=True)
order_Novavax = np.load('../Array_Generation/All_data/Arrays_Novavax/order_Novavax.npy', allow_pickle=True)
manufacturer_array_Novavax = np.full(shape=len(y_true_Novavax), fill_value='NOVAVAX')

# Stack everything in one fixed manufacturer order (Pfizer, Moderna, Janssen, Novavax)
# so that position i refers to the same (symptom, manufacturer) pair in all three arrays.
y_true = np.concatenate((y_true_Pfizer, y_true_Moderna, y_true_Janssen, y_true_Novavax))
manu = np.concatenate((manufacturer_array_Pfizer, manufacturer_array_Moderna, manufacturer_array_Janssen, manufacturer_array_Novavax))
order = np.concatenate((order_Pfizer, order_Moderna, order_Janssen, order_Novavax))

In [5]:
# Keep every positive (y_true == 1) and a fixed-size random subset of the negatives.
indexes_y_true_1 = np.nonzero(y_true == 1)[0]
indexes_y_true_0 = np.nonzero(y_true == 0)[0]

# Seed the global NumPy RNG so the same 1000 negatives are drawn on every run.
np.random.seed(42)
sampled_indexes_y_true_0 = np.random.choice(
    indexes_y_true_0,
    size=1000,
    replace=False,  # without replacement: raises if fewer than 1000 negatives exist
)

# Positives first, sampled negatives after.
combined_indexes = np.concatenate((indexes_y_true_1, sampled_indexes_y_true_0))

# Apply the same selection to all three arrays so they stay index-aligned.
filtered_y_true, filtered_order, filtered_manu = (
    array[combined_indexes] for array in (y_true, order, manu)
)

In [6]:
# One row per selected (symptom, manufacturer) pair together with its label.
df = pd.DataFrame(
    dict(
        zip(
            ('y_true', 'symptom', 'manufacturer'),
            (filtered_y_true, filtered_order, filtered_manu),
        )
    )
)

In [7]:
# Report fields projected from the 'reports cleaned' collection. The earlier list
# repeated 'SEX' and 'NUMDAYS'; the duplicates collapsed in the projection dict
# anyway, so they are listed once here. 'RECOVD' is fetched but not aggregated below.
columns = ['AGE', 'SEX', 'SERIOUS', 'NUMDAYS', 'DIED', 'L_THREAT', 'HOSPITAL', 'X_STAY', 'DISABLE', 'BIRTH_DEFECT', 'RECOVD']

# Age bins: left-closed decades [0, 10), [10, 20), ..., [110, 120).
age_bins = list(range(0, 130, 10))
age_labels = [f"{i}-{i+9}" for i in age_bins[:-1]]

# NUMDAYS bins: left-closed two-day windows [0, 2), ..., [12, 14), then [14, inf).
numdays_bins = list(range(0, 16, 2)) + [float('inf')]
numdays_labels = [f"{i}-{i+1}" for i in numdays_bins[:-2]] + ['14+']

# Initialize the age group columns in the DataFrame
for label in age_labels:
    df[label] = 0

# Initialize the NUMDAYS bins columns in the DataFrame
for label in numdays_labels:
    df[f'NUMDAYS_{label}'] = 0

sex_values = ['M', 'F', 'U']
for sex in sex_values:
    df[f'SEX_{sex}'] = 0

count_columns = ['DIED', 'L_THREAT', 'HOSPITAL', 'X_STAY', 'DISABLE', 'BIRTH_DEFECT']
for col in count_columns:
    df[col] = 0

# Columns that used to be created implicitly by the first `df.loc[...] = ...` inside
# the loop. Creating them up front (as float NaN, in the same order) keeps the column
# layout and dtype of the saved files while making a missing value explicit.
contingency_columns = ['DE', 'De', 'dE', 'de']
for col in contingency_columns + ['SERIOUS Y', 'SERIOUS N']:
    df[col] = float('nan')


def _count_per_bin(values, bins, labels):
    """Count how many of `values` fall into each left-closed bin.

    Parameters
    ----------
    values : list
        Raw field values; entries that are None or not numeric are ignored.
    bins : list of numbers
        Bin edges passed to `pd.cut` (with `right=False`).
    labels : list of str
        One label per bin.

    Returns
    -------
    dict
        Mapping label -> number of values in that bin.
    """
    # Coerce first so that None (field absent from a report) becomes NaN instead of
    # breaking pd.cut on an object array.
    numeric = pd.to_numeric(pd.Series(values, dtype='object'), errors='coerce')
    binned = pd.cut(numeric, bins=bins, labels=labels, right=False)
    return {label: int((binned == label).sum()) for label in labels}


# The projection is identical for every row, so build it once outside the loop.
projection = {field: 1 for field in columns}
projection['_id'] = 0  # Exclude _id from the result

# total= lets tqdm show percentage / ETA instead of a bare iteration counter.
for index, row in tqdm(df.iterrows(), total=len(df)):
    # First query: the pre-computed DE / De / dE / de values for this pair.
    # find_one returns None when nothing matches (result1[0] used to raise
    # IndexError); the four columns then simply stay NaN.
    combination = collection.find_one({
        'symptom': row['symptom'],
        'manufacturer': row['manufacturer'],
        'vaccine': 'COVID19'
    }) or {}
    for col in contingency_columns:
        value = combination.get(col)
        df.loc[index, col] = float('nan') if value is None else value

    # Second query: all reports mentioning this symptom for this manufacturer.
    # NOTE(review): if vax_data is an array of sub-documents, the two dotted
    # conditions can be satisfied by different array elements; $elemMatch would be
    # needed to require one element matching both -- confirm against the schema.
    reports = list(collection2.find({
        'symptoms': row['symptom'],
        'vax_data.VAX_MANU': row['manufacturer'],
        'vax_data.VAX_TYPE': 'COVID19'
    }, projection))

    # MongoDB omits projected fields that a document does not have, so every field
    # is read with .get() rather than [] to avoid a KeyError on sparse reports.

    # Age distribution
    age_counts = _count_per_bin([report.get('AGE') for report in reports], age_bins, age_labels)
    for label, count in age_counts.items():
        df.loc[index, label] = count

    # Serious / non-serious report counts
    serious_values = [report.get('SERIOUS') for report in reports]
    df.loc[index, 'SERIOUS Y'] = serious_values.count('Y')
    df.loc[index, 'SERIOUS N'] = serious_values.count('N')

    # Number of reports flagged 'Y' for each outcome column
    for col in count_columns:
        df.loc[index, col] = sum(1 for report in reports if report.get(col) == 'Y')

    # Time from vaccination to symptom onset
    numdays_counts = _count_per_bin([report.get('NUMDAYS') for report in reports], numdays_bins, numdays_labels)
    for label, count in numdays_counts.items():
        df.loc[index, f'NUMDAYS_{label}'] = count

    # Sex distribution
    sex_values_list = [report.get('SEX') for report in reports]
    for sex in sex_values:
        df.loc[index, f'SEX_{sex}'] = sex_values_list.count(sex)

df.to_csv('../Machine_Learning/data/df_all.csv', index=False)
df.to_json('../Machine_Learning/data/df_all.json', orient='records', lines=True)

1125it [02:47,  6.70it/s]


In [12]:
# pandas is already imported in the first cell; the repeated import only matters
# when this cell is run on a fresh kernel without re-running the extraction above.
import pandas as pd
# Reload the feature table saved by the extraction cell.
df = pd.read_csv('../Machine_Learning/data/df_all.csv')

In [13]:
# Work on a copy so the frame loaded from disk stays untouched.
df_all = df.copy()

In [14]:
# Apply one-hot encoding on the 'manufacturer' column.
# Guarded so the cell is idempotent: the encoded frame is later written back to the
# same df_all.csv that the reload cell reads, so on a second pass the 'manufacturer'
# column is already gone and pd.get_dummies would raise a KeyError.
if 'manufacturer' in df_all.columns:
    df_all = pd.get_dummies(df_all, columns=['manufacturer'], dtype=int)

# Display the final DataFrame
df_all

Unnamed: 0,y_true,symptom,0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,...,DE,De,dE,de,SERIOUS Y,SERIOUS N,manufacturer_JANSSEN,manufacturer_MODERNA,manufacturer_NOVAVAX,manufacturer_PFIZER\BIONTECH
0,1,Angioedema,23,95,161,304,308,287,191,105,...,2878.0,943022.0,2351.0,1530710.0,564.0,2314.0,0,0,0,1
1,1,Hypoaesthesia,39,732,2295,4248,4426,3566,1960,862,...,26529.0,919371.0,27845.0,1505216.0,7405.0,19126.0,0,0,0,1
2,1,Erythema multiforme,20,12,31,28,41,30,34,30,...,733.0,945167.0,2228.0,1530833.0,131.0,602.0,0,0,0,1
3,1,Insomnia,33,205,732,1345,1345,1253,1012,581,...,11215.0,934685.0,14609.0,1518452.0,2674.0,8541.0,0,0,0,1
4,1,Myalgia,106,935,4266,7133,6992,6599,4653,2335,...,52182.0,893718.0,75910.0,1457151.0,12832.0,39351.0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,0,Increased insulin requirement,0,0,1,1,0,0,1,0,...,3.0,98389.0,33.0,2380536.0,2.0,1.0,1,0,0,0
1121,0,Blood lactate dehydrogenase decreased,0,0,0,0,1,0,0,0,...,7.0,945893.0,17.0,1533044.0,4.0,3.0,0,0,0,1
1122,0,Breast cellulitis,0,0,0,0,2,0,0,2,...,4.0,537275.0,16.0,1941666.0,1.0,3.0,0,1,0,0
1123,0,Vascular graft thrombosis,0,0,0,0,0,2,0,1,...,3.0,537276.0,10.0,1941672.0,3.0,0.0,0,1,0,0


In [15]:
# NOTE: this overwrites the un-encoded df_all.csv written by the extraction cell
# (same path), so the raw 'manufacturer' column is no longer available on disk.
df_all.to_csv('../Machine_Learning/data/df_all.csv', index=False)

# Dataset where no negative symptom appears twice

In [115]:
# Reload the per-manufacturer arrays (same files as in the "All data" section) so this
# section can run from the full, unsampled data.
# NOTE: allow_pickle=True lets NumPy unpickle object arrays, which can execute
# arbitrary code -- only load .npy files from a trusted source.

# Pfizer: labels, symptom names, constant manufacturer tag
y_true_Pfizer = np.load('../Array_Generation/All_data/Arrays_Pfizer/y_true_Pfizer.npy', allow_pickle=True)
order_Pfizer = np.load('../Array_Generation/All_data/Arrays_Pfizer/order_Pfizer.npy', allow_pickle=True)
manufacturer_array_Pfizer = np.full(shape=len(y_true_Pfizer), fill_value='PFIZER\\BIONTECH')

# Moderna
y_true_Moderna = np.load('../Array_Generation/All_data/Arrays_Moderna/y_true_Moderna.npy', allow_pickle=True)
order_Moderna = np.load('../Array_Generation/All_data/Arrays_Moderna/order_Moderna.npy', allow_pickle=True)
manufacturer_array_Moderna = np.full(shape=len(y_true_Moderna), fill_value='MODERNA')

# Janssen
y_true_Janssen = np.load('../Array_Generation/All_data/Arrays_Janssen/y_true_Janssen.npy', allow_pickle=True)
order_Janssen = np.load('../Array_Generation/All_data/Arrays_Janssen/order_Janssen.npy', allow_pickle=True)
manufacturer_array_Janssen = np.full(shape=len(y_true_Janssen), fill_value='JANSSEN')

# Novavax
y_true_Novavax = np.load('../Array_Generation/All_data/Arrays_Novavax/y_true_Novavax.npy', allow_pickle=True)
order_Novavax = np.load('../Array_Generation/All_data/Arrays_Novavax/order_Novavax.npy', allow_pickle=True)
manufacturer_array_Novavax = np.full(shape=len(y_true_Novavax), fill_value='NOVAVAX')

# Concatenate in one fixed manufacturer order (Pfizer, Moderna, Janssen, Novavax) so
# that position i refers to the same (symptom, manufacturer) pair in all three arrays.
y_true = np.concatenate((y_true_Pfizer, y_true_Moderna, y_true_Janssen, y_true_Novavax))
manu = np.concatenate((manufacturer_array_Pfizer, manufacturer_array_Moderna, manufacturer_array_Janssen, manufacturer_array_Novavax))
order = np.concatenate((order_Pfizer, order_Moderna, order_Janssen, order_Novavax))

In [116]:
# Full (unsampled) table: one row per (symptom, manufacturer) pair with its label.
data = dict(
    zip(
        ('y_true', 'order', 'manufacturer'),
        (y_true, order, manu),
    )
)
df = pd.DataFrame(data)

In [117]:
# Inspect the full label / order / manufacturer table built in the previous cell.
df

Unnamed: 0,y_true,order,manufacturer
0,0,Blood lead,PFIZER\BIONTECH
1,0,Magnetic resonance imaging,PFIZER\BIONTECH
2,0,Hypothyroidism,PFIZER\BIONTECH
3,0,Vitamin B6,PFIZER\BIONTECH
4,0,Anxiety disorder,PFIZER\BIONTECH
...,...,...,...
20423,0,Incorrect product formulation administered,NOVAVAX
20424,0,Head discomfort,NOVAVAX
20425,0,Blood pressure increased,NOVAVAX
20426,0,Muscle spasms,NOVAVAX


In [118]:
# Split the positive rows (y_true == 1) into Janssen and every other manufacturer.
JANSSEN_df = df.loc[df['y_true'].eq(1) & df['manufacturer'].eq("JANSSEN")]
OTHERS_df = df.loc[df['y_true'].eq(1) & df['manufacturer'].ne("JANSSEN")]

In [119]:
# Orders to drop from the Janssen positives.
# NOTE(review): the reason for excluding exactly these orders is not recorded here.
orders_to_remove = [
    "Vomiting",
    "Lymphadenopathy",
    "Rash",
    "Urticaria",
    "Hypersensitivity",
    "Hyperhidrosis",
    "Diarrhoea",
]

# Keep only the Janssen rows whose order is not in the list above.
JANSSEN_df = JANSSEN_df.loc[~JANSSEN_df['order'].isin(orders_to_remove)]

In [120]:
# Janssen positives remaining after the exclusion list was applied.
JANSSEN_df

Unnamed: 0,y_true,order,manufacturer
16675,1,Back pain,JANSSEN
16710,1,Nausea,JANSSEN
16804,1,Muscular weakness,JANSSEN
17016,1,Facial paralysis,JANSSEN
17254,1,Cutaneous vasculitis,JANSSEN
17266,1,Injection site pain,JANSSEN
17473,1,Pyrexia,JANSSEN
17510,1,Myalgia,JANSSEN
17780,1,Tinnitus,JANSSEN
17906,1,Paraesthesia,JANSSEN


In [121]:
# Orders that are already present among the Janssen positives.
janssen_orders = JANSSEN_df['order'].unique()

# Drop those orders from the other manufacturers' positives, so an order kept for
# Janssen does not also appear as a non-Janssen positive.
OTHERS_df = OTHERS_df.loc[~OTHERS_df['order'].isin(janssen_orders)]

In [122]:
# JANSSEN_df is not modified by the previous cell (only OTHERS_df is filtered);
# shown again for comparison with OTHERS_df below.
JANSSEN_df

Unnamed: 0,y_true,order,manufacturer
16675,1,Back pain,JANSSEN
16710,1,Nausea,JANSSEN
16804,1,Muscular weakness,JANSSEN
17016,1,Facial paralysis,JANSSEN
17254,1,Cutaneous vasculitis,JANSSEN
17266,1,Injection site pain,JANSSEN
17473,1,Pyrexia,JANSSEN
17510,1,Myalgia,JANSSEN
17780,1,Tinnitus,JANSSEN
17906,1,Paraesthesia,JANSSEN


In [123]:
# Non-Janssen positives after removing orders shared with the Janssen positives.
OTHERS_df

Unnamed: 0,y_true,order,manufacturer
165,1,Angioedema,PFIZER\BIONTECH
1659,1,Erythema multiforme,PFIZER\BIONTECH
2022,1,Insomnia,PFIZER\BIONTECH
2906,1,Vomiting,PFIZER\BIONTECH
2913,1,Malaise,PFIZER\BIONTECH
2964,1,Extensive swelling of vaccinated limb,PFIZER\BIONTECH
3498,1,Decreased appetite,PFIZER\BIONTECH
4091,1,Asthenia,PFIZER\BIONTECH
4349,1,Injection site pruritus,PFIZER\BIONTECH
4695,1,Heavy menstrual bleeding,PFIZER\BIONTECH


In [124]:
# Candidate negatives: Janssen rows with y_true == 0.
janssen_entries_to_add = df.loc[df['y_true'].eq(0) & df['manufacturer'].eq("JANSSEN")]

# Randomly sample 532 of them (fixed seed for reproducibility).
sampled_janssen_entries = janssen_entries_to_add.sample(n=532, random_state=42)

# Append the sampled negatives to the Janssen positives.
# NOTE: JANSSEN_df is reassigned from itself, so re-running this cell appends again.
JANSSEN_df = pd.concat(
    [JANSSEN_df, sampled_janssen_entries],
    ignore_index=True,
)


In [125]:
# Janssen positives followed by the sampled Janssen negatives.
JANSSEN_df

Unnamed: 0,y_true,order,manufacturer
0,1,Back pain,JANSSEN
1,1,Nausea,JANSSEN
2,1,Muscular weakness,JANSSEN
3,1,Facial paralysis,JANSSEN
4,1,Cutaneous vasculitis,JANSSEN
...,...,...,...
555,0,Compression garment application,JANSSEN
556,0,Injection site urticaria,JANSSEN
557,0,Tuberculin test,JANSSEN
558,0,Jaundice neonatal,JANSSEN


In [126]:
# Candidate negatives: rows with y_true == 0 from every manufacturer except Janssen.
others_entries_to_add = df.loc[df['y_true'].eq(0) & df['manufacturer'].ne("JANSSEN")]

# Keep one row per order (drop_duplicates keeps the first occurrence by default).
# NOTE(review): uniqueness is only enforced within this non-Janssen pool; the Janssen
# negatives are sampled separately, so an order may still occur in both -- confirm
# this is intended.
unique_others_entries = others_entries_to_add.drop_duplicates(subset='order')

# Randomly sample 874 unique entries (fixed seed for reproducibility).
sampled_others_entries = unique_others_entries.sample(n=874, random_state=42)

# Append the sampled negatives to the non-Janssen positives.
# NOTE: OTHERS_df is reassigned from itself, so re-running this cell appends again.
OTHERS_df = pd.concat(
    [OTHERS_df, sampled_others_entries],
    ignore_index=True,
)

In [127]:
# Non-Janssen positives followed by the sampled non-Janssen negatives.
OTHERS_df

Unnamed: 0,y_true,order,manufacturer
0,1,Angioedema,PFIZER\BIONTECH
1,1,Erythema multiforme,PFIZER\BIONTECH
2,1,Insomnia,PFIZER\BIONTECH
3,1,Vomiting,PFIZER\BIONTECH
4,1,Malaise,PFIZER\BIONTECH
...,...,...,...
915,0,Calculus urinary,PFIZER\BIONTECH
916,0,CSF virus no organisms observed,PFIZER\BIONTECH
917,0,Emergency care,PFIZER\BIONTECH
918,0,Periventricular leukomalacia,PFIZER\BIONTECH


In [128]:
# Stack the Janssen and non-Janssen subsets and rename 'order' to 'symptom', the
# column name the feature-extraction cell below reads.
df = (
    pd.concat([JANSSEN_df, OTHERS_df], ignore_index=True)
    .rename(columns={'order': 'symptom'})
)

In [129]:
# Combined dataset (Janssen rows first, then the other manufacturers).
df

Unnamed: 0,y_true,symptom,manufacturer
0,1,Back pain,JANSSEN
1,1,Nausea,JANSSEN
2,1,Muscular weakness,JANSSEN
3,1,Facial paralysis,JANSSEN
4,1,Cutaneous vasculitis,JANSSEN
...,...,...,...
1475,0,Calculus urinary,PFIZER\BIONTECH
1476,0,CSF virus no organisms observed,PFIZER\BIONTECH
1477,0,Emergency care,PFIZER\BIONTECH
1478,0,Periventricular leukomalacia,PFIZER\BIONTECH


In [130]:
# Report fields projected from the 'reports cleaned' collection. The earlier list
# repeated 'SEX' and 'NUMDAYS'; the duplicates collapsed in the projection dict
# anyway, so they are listed once here. 'RECOVD' is fetched but not aggregated below.
columns = ['AGE', 'SEX', 'SERIOUS', 'NUMDAYS', 'DIED', 'L_THREAT', 'HOSPITAL', 'X_STAY', 'DISABLE', 'BIRTH_DEFECT', 'RECOVD']

# Age bins: left-closed decades [0, 10), [10, 20), ..., [110, 120).
age_bins = list(range(0, 130, 10))
age_labels = [f"{i}-{i+9}" for i in age_bins[:-1]]

# NUMDAYS bins: left-closed two-day windows [0, 2), ..., [12, 14), then [14, inf).
numdays_bins = list(range(0, 16, 2)) + [float('inf')]
numdays_labels = [f"{i}-{i+1}" for i in numdays_bins[:-2]] + ['14+']

# Initialize the age group columns in the DataFrame
for label in age_labels:
    df[label] = 0

# Initialize the NUMDAYS bins columns in the DataFrame
for label in numdays_labels:
    df[f'NUMDAYS_{label}'] = 0

sex_values = ['M', 'F', 'U']
for sex in sex_values:
    df[f'SEX_{sex}'] = 0

count_columns = ['DIED', 'L_THREAT', 'HOSPITAL', 'X_STAY', 'DISABLE', 'BIRTH_DEFECT']
for col in count_columns:
    df[col] = 0

# Columns that used to be created implicitly by the first `df.loc[...] = ...` inside
# the loop. Creating them up front (as float NaN, in the same order) keeps the column
# layout and dtype of the saved files while making a missing value explicit.
contingency_columns = ['DE', 'De', 'dE', 'de']
for col in contingency_columns + ['SERIOUS Y', 'SERIOUS N']:
    df[col] = float('nan')


def _count_per_bin(values, bins, labels):
    """Count how many of `values` fall into each left-closed bin.

    Parameters
    ----------
    values : list
        Raw field values; entries that are None or not numeric are ignored.
    bins : list of numbers
        Bin edges passed to `pd.cut` (with `right=False`).
    labels : list of str
        One label per bin.

    Returns
    -------
    dict
        Mapping label -> number of values in that bin.
    """
    # Coerce first so that None (field absent from a report) becomes NaN instead of
    # breaking pd.cut on an object array.
    numeric = pd.to_numeric(pd.Series(values, dtype='object'), errors='coerce')
    binned = pd.cut(numeric, bins=bins, labels=labels, right=False)
    return {label: int((binned == label).sum()) for label in labels}


# The projection is identical for every row, so build it once outside the loop.
projection = {field: 1 for field in columns}
projection['_id'] = 0  # Exclude _id from the result

# total= lets tqdm show percentage / ETA instead of a bare iteration counter.
for index, row in tqdm(df.iterrows(), total=len(df)):
    # First query: the pre-computed DE / De / dE / de values for this pair.
    # find_one returns None when nothing matches (result1[0] used to raise
    # IndexError); the four columns then simply stay NaN.
    combination = collection.find_one({
        'symptom': row['symptom'],
        'manufacturer': row['manufacturer'],
        'vaccine': 'COVID19'
    }) or {}
    for col in contingency_columns:
        value = combination.get(col)
        df.loc[index, col] = float('nan') if value is None else value

    # Second query: all reports mentioning this symptom for this manufacturer.
    # NOTE(review): if vax_data is an array of sub-documents, the two dotted
    # conditions can be satisfied by different array elements; $elemMatch would be
    # needed to require one element matching both -- confirm against the schema.
    reports = list(collection2.find({
        'symptoms': row['symptom'],
        'vax_data.VAX_MANU': row['manufacturer'],
        'vax_data.VAX_TYPE': 'COVID19'
    }, projection))

    # MongoDB omits projected fields that a document does not have, so every field
    # is read with .get() rather than [] to avoid a KeyError on sparse reports.

    # Age distribution
    age_counts = _count_per_bin([report.get('AGE') for report in reports], age_bins, age_labels)
    for label, count in age_counts.items():
        df.loc[index, label] = count

    # Serious / non-serious report counts
    serious_values = [report.get('SERIOUS') for report in reports]
    df.loc[index, 'SERIOUS Y'] = serious_values.count('Y')
    df.loc[index, 'SERIOUS N'] = serious_values.count('N')

    # Number of reports flagged 'Y' for each outcome column
    for col in count_columns:
        df.loc[index, col] = sum(1 for report in reports if report.get(col) == 'Y')

    # Time from vaccination to symptom onset
    numdays_counts = _count_per_bin([report.get('NUMDAYS') for report in reports], numdays_bins, numdays_labels)
    for label, count in numdays_counts.items():
        df.loc[index, f'NUMDAYS_{label}'] = count

    # Sex distribution
    sex_values_list = [report.get('SEX') for report in reports]
    for sex in sex_values:
        df.loc[index, f'SEX_{sex}'] = sex_values_list.count(sex)

df.to_csv('../Machine_Learning/data/df_unique_5.csv', index=False)
# NOTE(review): 'df_unique.json_5' looks like a typo for 'df_unique_5.json' (compare
# the CSV name above); left unchanged in case something downstream reads this path.
df.to_json('../Machine_Learning/data/df_unique.json_5', orient='records', lines=True)

1480it [02:22, 10.40it/s]


In [131]:
# Spot-check the first rows of the feature table that was just written to disk.
df.head()

Unnamed: 0,y_true,symptom,manufacturer,0-9,10-19,20-29,30-39,40-49,50-59,60-69,...,HOSPITAL,X_STAY,DISABLE,BIRTH_DEFECT,DE,De,dE,de,SERIOUS Y,SERIOUS N
0,1,Back pain,JANSSEN,1,56,283,367,384,383,249,...,241,0,116,1,2125.0,96267.0,32151.0,2348418.0,375.0,1750.0
1,1,Nausea,JANSSEN,7,371,1817,1827,1501,1533,921,...,752,5,456,2,9791.0,88601.0,156847.0,2223722.0,1272.0,8519.0
2,1,Muscular weakness,JANSSEN,0,23,95,196,198,245,168,...,237,0,156,2,1198.0,97194.0,24180.0,2356389.0,345.0,853.0
3,1,Facial paralysis,JANSSEN,0,3,26,55,67,83,42,...,177,2,76,1,445.0,97947.0,10246.0,2370323.0,223.0,222.0
4,1,Cutaneous vasculitis,JANSSEN,0,0,0,4,0,1,4,...,12,0,1,0,32.0,98360.0,461.0,2380108.0,13.0,19.0


In [132]:
# pandas is already imported in the first cell; the repeated import only matters
# when this cell is run on a fresh kernel without re-running the extraction above.
import pandas as pd
# Reload the feature table from the exact path the extraction cell wrote it to.
# The previous relative path 'data/df_unique_5.csv' only pointed at the same file
# when the notebook's working directory was the Machine_Learning folder itself.
df = pd.read_csv('../Machine_Learning/data/df_unique_5.csv')

In [133]:
# Work on a copy so the frame loaded from disk stays untouched.
df2 = df.copy()

In [134]:
# Feature table before one-hot encoding (still has the 'manufacturer' column).
df2

Unnamed: 0,y_true,symptom,manufacturer,0-9,10-19,20-29,30-39,40-49,50-59,60-69,...,HOSPITAL,X_STAY,DISABLE,BIRTH_DEFECT,DE,De,dE,de,SERIOUS Y,SERIOUS N
0,1,Back pain,JANSSEN,1,56,283,367,384,383,249,...,241,0,116,1,2125.0,96267.0,32151.0,2348418.0,375.0,1750.0
1,1,Nausea,JANSSEN,7,371,1817,1827,1501,1533,921,...,752,5,456,2,9791.0,88601.0,156847.0,2223722.0,1272.0,8519.0
2,1,Muscular weakness,JANSSEN,0,23,95,196,198,245,168,...,237,0,156,2,1198.0,97194.0,24180.0,2356389.0,345.0,853.0
3,1,Facial paralysis,JANSSEN,0,3,26,55,67,83,42,...,177,2,76,1,445.0,97947.0,10246.0,2370323.0,223.0,222.0
4,1,Cutaneous vasculitis,JANSSEN,0,0,0,4,0,1,4,...,12,0,1,0,32.0,98360.0,461.0,2380108.0,13.0,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1475,0,Calculus urinary,PFIZER\BIONTECH,0,2,2,1,2,3,6,...,14,0,1,0,23.0,945877.0,18.0,1533043.0,15.0,8.0
1476,0,CSF virus no organisms observed,PFIZER\BIONTECH,0,0,1,0,2,0,1,...,4,0,1,0,4.0,945896.0,81.0,1532980.0,4.0,0.0
1477,0,Emergency care,PFIZER\BIONTECH,0,0,3,3,2,2,0,...,21,0,3,0,33.0,945867.0,28.0,1533033.0,23.0,10.0
1478,0,Periventricular leukomalacia,PFIZER\BIONTECH,0,0,0,0,0,0,0,...,3,0,1,1,7.0,945893.0,5.0,1533056.0,5.0,2.0


In [135]:
# One-hot encode 'manufacturer' into integer (0/1) manufacturer_* columns.
# Re-running this cell alone fails because the column is consumed; re-run the
# df.copy() cell above first.
df2 = pd.get_dummies(df2, columns=['manufacturer'], dtype=int)

In [136]:
# Feature table after one-hot encoding of the manufacturer.
df2

Unnamed: 0,y_true,symptom,0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,...,DE,De,dE,de,SERIOUS Y,SERIOUS N,manufacturer_JANSSEN,manufacturer_MODERNA,manufacturer_NOVAVAX,manufacturer_PFIZER\BIONTECH
0,1,Back pain,1,56,283,367,384,383,249,80,...,2125.0,96267.0,32151.0,2348418.0,375.0,1750.0,1,0,0,0
1,1,Nausea,7,371,1817,1827,1501,1533,921,263,...,9791.0,88601.0,156847.0,2223722.0,1272.0,8519.0,1,0,0,0
2,1,Muscular weakness,0,23,95,196,198,245,168,78,...,1198.0,97194.0,24180.0,2356389.0,345.0,853.0,1,0,0,0
3,1,Facial paralysis,0,3,26,55,67,83,42,30,...,445.0,97947.0,10246.0,2370323.0,223.0,222.0,1,0,0,0
4,1,Cutaneous vasculitis,0,0,0,4,0,1,4,2,...,32.0,98360.0,461.0,2380108.0,13.0,19.0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1475,0,Calculus urinary,0,2,2,1,2,3,6,1,...,23.0,945877.0,18.0,1533043.0,15.0,8.0,0,0,0,1
1476,0,CSF virus no organisms observed,0,0,1,0,2,0,1,0,...,4.0,945896.0,81.0,1532980.0,4.0,0.0,0,0,0,1
1477,0,Emergency care,0,0,3,3,2,2,0,0,...,33.0,945867.0,28.0,1533033.0,23.0,10.0,0,0,0,1
1478,0,Periventricular leukomalacia,0,0,0,0,0,0,0,1,...,7.0,945893.0,5.0,1533056.0,5.0,2.0,0,0,0,1


In [137]:
# Save the encoded table under a separate name, leaving df_unique_5.csv un-encoded.
df2.to_csv('../Machine_Learning/data/df2_unique_5.csv', index=False)