# Connect to MongoDB

In [24]:
import re
from datetime import datetime

import numpy as np
import pandas as pd
import pymongo
# Open a client against the local MongoDB server, then bind the database
# and collection handles that the cells below write to.
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client.get_database("vaers")
collection = db.get_collection("reports cleaned")

# Define global helper functions

In [17]:
from datetime import datetime

def convert_date(date_string, date_format="%m/%d/%Y"):
    """Parse a date string into a ``datetime``.

    Parameters
    ----------
    date_string : str or missing value
        Text to parse. Anything ``pd.isna`` reports as missing (``None``,
        ``NaN``, ``NaT``) yields ``None``.
    date_format : str, optional
        ``strptime`` format of ``date_string``. Defaults to month/day/year
        (``"%m/%d/%Y"``), e.g. ``"11/24/2023"`` -> 24 November 2023.

    Returns
    -------
    datetime or None
        The parsed date, or ``None`` for a missing input.

    Raises
    ------
    ValueError
        If ``date_string`` does not match ``date_format``.
    """
    if pd.isna(date_string):
        return None
    # The default format is month-first; a day-first string such as
    # "24/11/2023" raises ValueError rather than being silently misread.
    return datetime.strptime(date_string, date_format)

def calculate_age(row):
    """Pick a single age value for one report row.

    Precedence:
      1. ``AGE_YRS`` when it is present.
      2. ``CAGE_YR`` plus ``CAGE_MO`` (``CAGE_MO`` only when present).
      3. ``CAGE_MO`` on its own.
      4. ``np.nan`` when all three fields are missing.

    ``row`` may be any mapping-like object indexable by those three keys
    (e.g. a DataFrame row passed by ``DataFrame.apply(..., axis=1)``).
    """
    if pd.notna(row['AGE_YRS']):
        return row['AGE_YRS']

    years = row['CAGE_YR']
    months = row['CAGE_MO']
    has_months = pd.notna(months)

    # NOTE(review): CAGE_MO is added as-is, with no division by 12. That is
    # only right if the column already holds a fraction of a year -- confirm
    # against the VAERS data guide.
    if pd.notna(years):
        return years + months if has_months else years
    return months if has_months else np.nan
    
def form_completed(row):
    """Return the form-completion date of one report row.

    ``TODAYS_DATE`` wins when it is present; otherwise ``RPT_DATE`` is used.
    ``np.nan`` is returned when both fields are missing.
    """
    for column in ('TODAYS_DATE', 'RPT_DATE'):
        candidate = row[column]
        if pd.notna(candidate):
            return candidate
    return np.nan
    
# Lower-case spelling variants (matched as whole words, case-insensitively)
# mapped to the canonical term that replaces them.
# NOTE(review): every sulfa/sulfite/sulfide variant is folded into
# 'Sulfates' -- confirm that merging these terms is intended.
corrections = {
    **dict.fromkeys(
        ('penecellin', 'penecillin', 'penecilin', 'penicillins'),
        'Penicillin',
    ),
    **dict.fromkeys(
        ('sulfa', 'sulpha', 'sulfides', 'sulfite', 'sulfate'),
        'Sulfates',
    ),
}

def correct_spelling(text):
    """Replace every known variant in ``text`` with its canonical spelling.

    Each key of ``corrections`` is applied in turn as a whole-word,
    case-insensitive pattern. Words that merely contain a variant
    (e.g. ``"sulfates"``) are left untouched.
    """
    fixed = text
    for variation, correction in corrections.items():
        whole_word = re.compile(rf'\b{variation}\b', flags=re.IGNORECASE)
        fixed = whole_word.sub(correction, fixed)
    return fixed

# Normalise symptom terms: "Vaccination site ..." becomes "Injection site ...".
def replace_vaccination_site(symptom):
    """Return ``symptom`` (a string Series) with every case-insensitive
    occurrence of "Vaccination site" replaced by "Injection site".

    Missing values stay missing.
    """
    return symptom.str.replace(
        "Vaccination site",
        "Injection site",
        case=False,
        regex=True,
    )

# Domestic Reports

In [13]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm  # Import tqdm for progress bar

# NOTE: machine-specific absolute path; adjust for your environment.
base_path = "/home/sebastian/Documents/Masterarbeit/Daten/"

# Free-text metadata columns stored as lists of trimmed, non-empty items
# (split on "," or ";").
list_columns = ('OTHER_MEDS', 'CUR_ILL', 'HISTORY')

# Domestic Documents
# For every year: load the three VAERS CSVs, clean the metadata, then write
# one MongoDB document per report (metadata + vaccines + symptoms).
# Every run calls insert_one for each report, so re-running this cell
# inserts the same reports again unless the collection enforces uniqueness.
for year in range(1990, 2024):
    metadata_file_path = f"{base_path}{year}VAERSDATA.csv"
    vax_file_path = f"{base_path}{year}VAERSVAX.csv"
    symptoms_file_path = f"{base_path}{year}VAERSSYMPTOMS.csv"

    metadata_df = pd.read_csv(metadata_file_path, encoding='ISO-8859-1', low_memory=False)
    vax_df = pd.read_csv(vax_file_path, encoding='ISO-8859-1', low_memory=False, usecols=['VAERS_ID', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES', 'VAX_ROUTE', 'VAX_SITE' ,'VAX_NAME'])
    symptoms_df = pd.read_csv(symptoms_file_path, encoding='ISO-8859-1', low_memory=False, usecols=['VAERS_ID', 'SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5'])

    metadata_df = metadata_df.drop(['V_FUNDBY', 'FORM_VERS'], axis=1)

    # Consolidated age, moved to the 4th column position.
    metadata_df['AGE'] = metadata_df.apply(calculate_age, axis=1)
    metadata_df.insert(3, 'AGE', metadata_df.pop('AGE'))

    # TODAYS_DATE falls back to RPT_DATE; then every date column is parsed.
    metadata_df['TODAYS_DATE'] = metadata_df.apply(form_completed, axis=1)
    metadata_df[['RECVDATE', 'DATEDIED', 'VAX_DATE', 'ONSET_DATE', 'TODAYS_DATE']] = metadata_df[['RECVDATE', 'DATEDIED', 'VAX_DATE', 'ONSET_DATE', 'TODAYS_DATE']].apply(lambda x: x.map(convert_date))
    metadata_df = metadata_df.drop(['AGE_YRS', 'CAGE_YR', 'CAGE_MO', 'RPT_DATE'], axis=1)
    metadata_df.insert(2, 'TODAYS_DATE', metadata_df.pop('TODAYS_DATE'))

    # Spelling normalisation; missing allergies (stringified to 'nan') stay NaN.
    metadata_df['ALLERGIES'] = metadata_df['ALLERGIES'].astype(str)
    metadata_df['ALLERGIES'] = metadata_df['ALLERGIES'].apply(lambda x: correct_spelling(x) if x != 'nan' else np.nan)

    # SERIOUS is 'Y' when any of the listed outcome flags equals 'Y'.
    metadata_df['SERIOUS'] = np.where(metadata_df[['DIED', 'L_THREAT', 'HOSPITAL', 'X_STAY', 'DISABLE', 'BIRTH_DEFECT']].eq('Y').any(axis=1), 'Y', 'N')
    metadata_df.insert(6, 'SERIOUS', metadata_df.pop('SERIOUS'))

    columns = ['SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5']
    for col in columns:
        symptoms_df[col] = replace_vaccination_site(symptoms_df[col])

    # Index the vaccine rows by VAERS_ID once per year instead of
    # boolean-filtering the whole vax_df for every report. Rows from an
    # unknown manufacturer are dropped here, so a report left without any
    # vaccine row is simply absent from the index.
    vax_columns = [column for column in vax_df.columns if column != 'VAERS_ID']
    vax_by_id = {}
    for vax_row in vax_df.itertuples(index=False):
        vax_entry_i = {}
        for column in vax_columns:
            value = getattr(vax_row, column)
            vax_entry_i[column] = value if pd.notna(value) else np.nan
        if vax_entry_i["VAX_MANU"] != "UNKNOWN MANUFACTURER":
            vax_by_id.setdefault(vax_row.VAERS_ID, []).append(vax_entry_i)

    # Same for symptoms: one flat list of non-missing symptom terms per
    # report, in file row order and column order. A report that has rows but
    # no terms maps to an empty list.
    symptom_columns = [column for column in symptoms_df.columns if column != 'VAERS_ID']
    symptoms_by_id = {}
    for vaers_id, *symptom_values in symptoms_df[['VAERS_ID'] + symptom_columns].itertuples(index=False, name=None):
        symptoms_by_id.setdefault(vaers_id, []).extend(
            value for value in symptom_values if pd.notna(value)
        )

    for index, row in tqdm(metadata_df.iterrows(), total=len(metadata_df), desc=f"Processing Year {year}"):  # Use tqdm for progress bar
        entry = {}

        # Copy the metadata columns into the entry: missing -> NaN,
        # list columns -> list of items, everything else unchanged.
        for column in metadata_df.columns:
            value = row[column]
            if pd.isna(value):
                entry[column] = np.nan
            elif column in list_columns:
                entry[column] = [item.strip() for item in re.split(',|;', value) if item.strip()]
            else:
                entry[column] = value

        # Skip reports without at least one vaccine from a known manufacturer.
        vax_entries = vax_by_id.get(row['VAERS_ID'], [])
        if len(vax_entries) == 0:
            continue

        entry['vax_data'] = vax_entries  # Add the list of vaccine entries to the main entry

        # NaN when the report has no row at all in the symptoms file.
        entry['symptoms'] = symptoms_by_id.get(row['VAERS_ID'], np.nan)

        # Upload entry to MongoDB
        collection.insert_one(entry)
        

Processing Year 1990: 100%|██████████| 2102/2102 [00:09<00:00, 214.15it/s]
Processing Year 1991: 100%|██████████| 9933/9933 [00:49<00:00, 199.03it/s]
Processing Year 1992: 100%|██████████| 10692/10692 [00:53<00:00, 199.42it/s]
Processing Year 1993: 100%|██████████| 10147/10147 [00:50<00:00, 202.56it/s]
Processing Year 1994: 100%|██████████| 10193/10193 [00:50<00:00, 203.43it/s]
Processing Year 1995: 100%|██████████| 10001/10001 [00:47<00:00, 210.11it/s]
Processing Year 1996: 100%|██████████| 10771/10771 [00:52<00:00, 206.40it/s]
Processing Year 1997: 100%|██████████| 11006/11006 [00:49<00:00, 224.40it/s]
Processing Year 1998: 100%|██████████| 9949/9949 [00:47<00:00, 207.31it/s]
Processing Year 1999: 100%|██████████| 12123/12123 [00:56<00:00, 216.12it/s]
Processing Year 2000: 100%|██████████| 14105/14105 [01:08<00:00, 206.10it/s]
Processing Year 2001: 100%|██████████| 13359/13359 [01:06<00:00, 202.29it/s]
Processing Year 2002: 100%|██████████| 14074/14074 [01:08<00:00, 206.95it/s]
Proce

In [16]:
# Inspect the `entry` dict left in the kernel by whichever processing loop ran last.
entry  

{'VAERS_ID': 2717352,
 'RECVDATE': Timestamp('2023-11-24 00:00:00'),
 'TODAYS_DATE': Timestamp('2023-11-22 00:00:00'),
 'STATE': 'FR',
 'AGE': nan,
 'SEX': 'M',
 'SERIOUS': 'N',
 'SYMPTOM_TEXT': nan,
 'DIED': nan,
 'DATEDIED': nan,
 'L_THREAT': nan,
 'ER_VISIT': nan,
 'HOSPITAL': nan,
 'HOSPDAYS': nan,
 'X_STAY': nan,
 'DISABLE': nan,
 'RECOVD': 'N',
 'VAX_DATE': Timestamp('2021-03-03 00:00:00'),
 'ONSET_DATE': nan,
 'NUMDAYS': nan,
 'LAB_DATA': nan,
 'V_ADMINBY': 'OTH',
 'OTHER_MEDS': nan,
 'CUR_ILL': nan,
 'HISTORY': nan,
 'PRIOR_VAX': nan,
 'SPLTTYPE': nan,
 'BIRTH_DEFECT': nan,
 'OFC_VISIT': 'Y',
 'ER_ED_VISIT': nan,
 'ALLERGIES': nan,
 'vax_data': [{'VAX_TYPE': 'COVID19',
   'VAX_MANU': 'PFIZER\\BIONTECH',
   'VAX_LOT': nan,
   'VAX_DOSE_SERIES': '2',
   'VAX_ROUTE': nan,
   'VAX_SITE': nan,
   'VAX_NAME': 'COVID19 (COVID19 (PFIZER-BIONTECH))'}],
 'symptoms': ['Blindness unilateral', 'Visual impairment'],
 '_id': ObjectId('66477fa6f132998008686063')}

# Non-domestic reports

In [26]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm

# Set base path
# NOTE: machine-specific absolute path; adjust for your environment.
base_path = "/home/sebastian/Documents/Masterarbeit/Daten/"

# Free-text metadata columns stored as lists of trimmed, non-empty items
# (split on "," or ";").
list_columns = ('OTHER_MEDS', 'CUR_ILL', 'HISTORY')

# File paths
metadata_file_path = f"{base_path}NonDomesticVAERSDATA.csv"
vax_file_path = f"{base_path}NonDomesticVAERSVAX.csv"
symptoms_file_path = f"{base_path}NonDomesticVAERSSYMPTOMS.csv"

metadata_df = pd.read_csv(metadata_file_path, encoding='ISO-8859-1', low_memory=False)
vax_df = pd.read_csv(vax_file_path, encoding='ISO-8859-1', low_memory=False, usecols=['VAERS_ID', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES', 'VAX_ROUTE', 'VAX_SITE' ,'VAX_NAME'])
symptoms_df = pd.read_csv(symptoms_file_path, encoding='ISO-8859-1', low_memory=False, usecols=['VAERS_ID', 'SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5'])

metadata_df = metadata_df.drop(['V_FUNDBY', 'FORM_VERS'], axis=1)

# Consolidated age, moved to the 4th column position.
metadata_df['AGE'] = metadata_df.apply(calculate_age, axis=1)
metadata_df.insert(3, 'AGE', metadata_df.pop('AGE'))

# TODAYS_DATE falls back to RPT_DATE; then every date column is parsed.
metadata_df['TODAYS_DATE'] = metadata_df.apply(form_completed, axis=1)
metadata_df[['RECVDATE', 'DATEDIED', 'VAX_DATE', 'ONSET_DATE', 'TODAYS_DATE']] = metadata_df[['RECVDATE', 'DATEDIED', 'VAX_DATE', 'ONSET_DATE', 'TODAYS_DATE']].apply(lambda x: x.map(convert_date))
metadata_df = metadata_df.drop(['AGE_YRS', 'CAGE_YR', 'CAGE_MO', 'RPT_DATE'], axis=1)
metadata_df.insert(2, 'TODAYS_DATE', metadata_df.pop('TODAYS_DATE'))

# Spelling normalisation; missing allergies (stringified to 'nan') stay NaN.
metadata_df['ALLERGIES'] = metadata_df['ALLERGIES'].astype(str)
metadata_df['ALLERGIES'] = metadata_df['ALLERGIES'].apply(lambda x: correct_spelling(x) if x != 'nan' else np.nan)

# SERIOUS is 'Y' when any of the listed outcome flags equals 'Y'.
metadata_df['SERIOUS'] = np.where(metadata_df[['DIED', 'L_THREAT', 'HOSPITAL', 'X_STAY', 'DISABLE', 'BIRTH_DEFECT']].eq('Y').any(axis=1), 'Y', 'N')
metadata_df.insert(6, 'SERIOUS', metadata_df.pop('SERIOUS'))

# Define the cutoff date
cutoff_date = pd.Timestamp('2023-12-31')

# Identify the VAERS_IDs to remove (reports received after the cutoff)
removed_vaers_ids = metadata_df.loc[metadata_df['RECVDATE'] > cutoff_date, 'VAERS_ID']

# Save removed VAERS_IDs to a DataFrame (optional)
removed_vaers_df = pd.DataFrame(removed_vaers_ids, columns=['VAERS_ID'])

# Remove these entries from metadata_df. Rows whose RECVDATE is NaT fail
# both comparisons, so they are dropped here without being listed above.
metadata_df = metadata_df[metadata_df['RECVDATE'] <= cutoff_date]

# Remove corresponding entries from vax_df and symptoms_df.
# symptoms_df gets an explicit copy because its columns are reassigned
# below; assigning into a filtered slice triggers SettingWithCopyWarning.
vax_df = vax_df[~vax_df['VAERS_ID'].isin(removed_vaers_ids)]
symptoms_df = symptoms_df[~symptoms_df['VAERS_ID'].isin(removed_vaers_ids)].copy()

columns = ['SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5']
for col in columns:
    symptoms_df[col] = replace_vaccination_site(symptoms_df[col])

# Index the vaccine rows by VAERS_ID once instead of boolean-filtering the
# whole vax_df for every report. Rows from an unknown manufacturer are
# dropped here, so a report left without any vaccine row is simply absent
# from the index.
vax_columns = [column for column in vax_df.columns if column != 'VAERS_ID']
vax_by_id = {}
for vax_row in vax_df.itertuples(index=False):
    vax_entry_i = {}
    for column in vax_columns:
        value = getattr(vax_row, column)
        vax_entry_i[column] = value if pd.notna(value) else np.nan
    if vax_entry_i["VAX_MANU"] != "UNKNOWN MANUFACTURER":
        vax_by_id.setdefault(vax_row.VAERS_ID, []).append(vax_entry_i)

# Same for symptoms: one flat list of non-missing symptom terms per report,
# in file row order and column order. A report that has rows but no terms
# maps to an empty list.
symptom_columns = [column for column in symptoms_df.columns if column != 'VAERS_ID']
symptoms_by_id = {}
for vaers_id, *symptom_values in symptoms_df[['VAERS_ID'] + symptom_columns].itertuples(index=False, name=None):
    symptoms_by_id.setdefault(vaers_id, []).extend(
        value for value in symptom_values if pd.notna(value)
    )

# Iterate through rows of metadata_df and write one document per report.
# Every run calls insert_one for each report, so re-running this cell
# inserts the same reports again unless the collection enforces uniqueness.
for index, row in tqdm(metadata_df.iterrows(), total=len(metadata_df), desc="Processing non-domestic reports"):
    entry = {}

    # Copy the metadata columns into the entry: missing -> NaN,
    # list columns -> list of items, everything else unchanged.
    for column in metadata_df.columns:
        value = row[column]
        if pd.isna(value):
            entry[column] = np.nan
        elif column in list_columns:
            entry[column] = [item.strip() for item in re.split(',|;', value) if item.strip()]
        else:
            entry[column] = value

    # Skip reports without at least one vaccine from a known manufacturer.
    vax_entries = vax_by_id.get(row['VAERS_ID'], [])
    if len(vax_entries) == 0:
        continue

    entry['vax_data'] = vax_entries  # Add the list of vaccine entries to the main entry

    # NaN when the report has no row at all in the symptoms file.
    entry['symptoms'] = symptoms_by_id.get(row['VAERS_ID'], np.nan)

    # Upload entry to MongoDB
    collection.insert_one(entry)




744330it [1:19:00, 157.03it/s]


In [22]:
# Filter matching every document whose STATE field equals "FR".
query = dict(STATE="FR")

In [23]:
# Permanently remove every document that matches `query` from the collection.
result = collection.delete_many(query)

# Report how many documents the delete removed.
deleted_total = result.deleted_count
print(f"Number of documents deleted: {deleted_total}")

Number of documents deleted: 705105


In [27]:
# Display the metadata frame currently held in the kernel.
metadata_df

Unnamed: 0,VAERS_ID,RECVDATE,TODAYS_DATE,STATE,AGE,SEX,SERIOUS,SYMPTOM_TEXT,DIED,DATEDIED,...,V_ADMINBY,OTHER_MEDS,CUR_ILL,HISTORY,PRIOR_VAX,SPLTTYPE,BIRTH_DEFECT,OFC_VISIT,ER_ED_VISIT,ALLERGIES
0,25002,1990-07-02,NaT,FR,82.0,M,Y,"23 hrs post vaccination, developed seizures fo...",,NaT,...,UNK,"Thioridazine, Triazolam,",,"Senile dementia, Diabetes mellitus, seizures",~ ()~~~In patient,WAES90040535,,,,
1,25010,1990-07-02,NaT,FR,1.7,M,Y,"17 mon. male, received 29Oct89 MMR vaccine 1 d...",Y,NaT,...,PVT,Promethanzine HCL given 21Dec89-21Dec89,,,~ ()~~~In patient,WAES90060362,,,,
2,25011,1990-07-02,NaT,FR,5.0,F,Y,Approx. 1 hr /p Engerix-B vaccine given became...,,NaT,...,PVT,,,"No hx in the pregnacy, neonatal period, family...",~ ()~~~In patient,EBWWMA010932,,,,
3,25064,1990-07-02,NaT,FR,,F,Y,mother vaccinated w/ Rubella 1979. Pregancey u...,,NaT,...,UNK,,,,~ ()~~~In patient,WAES90060087,,,,
4,25055,1990-07-09,NaT,FR,54.0,F,Y,Hepatic pain. Increase liver fx test. Nause c...,,NaT,...,UNK,,,On medication for the treatment of tuberculosi...,~ ()~~~In patient,EBWWMA010375,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
744325,2728778,2023-12-29,2023-12-29,FR,2.0,M,N,,,NaT,...,OTH,,,,,,,,,
744326,2728880,2023-12-30,2023-12-29,FR,3.0,F,Y,a nodule was detected in the right fat layer o...,,NaT,...,UNK,,,Medical History/Concurrent Conditions: Body te...,,CN0095075132307CHN006089,,,,
744327,2728881,2023-12-30,2023-12-29,FR,12.0,F,N,,,NaT,...,UNK,,,,,,,,,
744328,2728882,2023-12-30,2023-12-29,FR,1.7,F,Y,,,NaT,...,UNK,,,,,,,,,


In [8]:
# Reload the raw, unprocessed non-domestic metadata CSV.
# Plain string literal: the path contains no placeholders, so the f-prefix
# was unnecessary.
metadata_file_path = "/home/sebastian/Documents/Masterarbeit/Daten/NonDomesticVAERSDATA.csv"
metadata_df = pd.read_csv(metadata_file_path, encoding='ISO-8859-1', low_memory=False)

In [5]:
import pandas as pd