# Connect to MongoDB

In [192]:
import pymongo

# Connection settings for the local MongoDB instance. Both the database and
# the collection used by this notebook are named "test".
MONGO_URI = "mongodb://localhost:27017/"
DATABASE_NAME = "test"
COLLECTION_NAME = "test"

# Open the client and resolve the target database and collection handles.
client = pymongo.MongoClient(MONGO_URI)
db = client[DATABASE_NAME]
collection = db[COLLECTION_NAME]



# Define global functions

In [193]:
from datetime import datetime
import pandas as pd
import numpy as np
import re

def convert_date(date_string):
    """Parse a date string into a ``datetime``.

    The expected format is month/day/year (``%m/%d/%Y``), e.g. ``"12/31/1999"``.

    Args:
        date_string: The text to parse, or a missing value (``None``/NaN).

    Returns:
        The parsed ``datetime``, or ``None`` when ``pd.isna(date_string)``
        is true.

    Raises:
        ValueError: If the string does not match the month/day/year format.
    """
    is_missing = pd.isna(date_string)
    return None if is_missing else datetime.strptime(date_string, "%m/%d/%Y")

def calculate_age(row):
    """Pick the best available age value for one report row.

    Priority:
      1. ``AGE_YRS`` when present.
      2. ``CAGE_YR`` plus ``CAGE_MO`` (or ``CAGE_YR`` alone when ``CAGE_MO``
         is missing).
      3. ``CAGE_MO`` alone.
      4. NaN when all three are missing.

    ``CAGE_MO`` is used as-is, without dividing by 12.
    NOTE(review): this is only correct if ``CAGE_MO`` is already expressed as
    a fraction of a year -- confirm against the data dictionary.

    Args:
        row: Mapping-like row exposing ``AGE_YRS``, ``CAGE_YR`` and ``CAGE_MO``.

    Returns:
        The selected age value, or ``np.nan`` if none is available.
    """
    if pd.notna(row['AGE_YRS']):
        return row['AGE_YRS']

    has_years = pd.notna(row['CAGE_YR'])
    has_months = pd.notna(row['CAGE_MO'])

    if has_years and has_months:
        return row['CAGE_YR'] + row['CAGE_MO']
    if has_years:
        return row['CAGE_YR']
    if has_months:
        return row['CAGE_MO']
    return np.nan
    
def form_completed(row):
    """Return the form-completion date for one report row.

    ``TODAYS_DATE`` is preferred; ``RPT_DATE`` is used as the fallback when
    ``TODAYS_DATE`` is missing.

    Args:
        row: Mapping-like row exposing ``TODAYS_DATE`` and ``RPT_DATE``.

    Returns:
        The first non-missing value of the two fields, or ``np.nan`` when
        both are missing.
    """
    for field in ('TODAYS_DATE', 'RPT_DATE'):
        value = row[field]
        if not pd.isna(value):
            return value
    return np.nan
    
# Maps a known misspelling / variant (lower-case, matched case-insensitively
# as a whole word) to the canonical term it should be replaced with.
corrections = {
    'penecellin': 'Penicillin',
    'penecillin': 'Penicillin',
    'penecilin': 'Penicillin',
    'penicillins': 'Penicillin',
    'sulfa': 'Sulfates',
    'sulpha': 'Sulfates',
    'sulfides': 'Sulfates',
    'sulfite': 'Sulfates',
    'sulfate': 'Sulfates'
}

def correct_spelling(text):
    """Replace every known variant in ``text`` with its canonical spelling.

    Each key of ``corrections`` is matched case-insensitively and only as a
    whole word (``\\b`` on both sides), then replaced by the mapped value.
    The variants are applied one after another in dictionary order.

    Args:
        text: The string to normalise.

    Returns:
        The string with all known variants replaced.
    """
    for variation, correction in corrections.items():
        # Escape the variant so that an entry containing regex metacharacters
        # (e.g. '.') is matched literally instead of as a pattern.
        pattern = r'\b{}\b'.format(re.escape(variation))
        text = re.sub(pattern, correction, text, flags=re.IGNORECASE)
    return text

In [200]:
# Load the three VAERS CSV exports (report data, vaccines, symptoms) for each
# year in the range and clean them in place.
base_path = "/Users/sep/Documents/HAW/Masterarbeit/Daten.nosync/"

vax_columns = ['VAERS_ID', 'VAX_TYPE', 'VAX_MANU', 'VAX_DOSE_SERIES', 'VAX_NAME']
symptom_columns = ['SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5']
unused_metadata_columns = ['SYMPTOM_TEXT', 'LAB_DATA', 'V_ADMINBY', 'V_FUNDBY', 'SPLTTYPE', 'FORM_VERS']
date_columns = ['RECVDATE', 'DATEDIED', 'VAX_DATE', 'ONSET_DATE', 'TODAYS_DATE']
merged_source_columns = ['AGE_YRS', 'CAGE_YR', 'CAGE_MO', 'RPT_DATE']
serious_flag_columns = ['DIED', 'L_THREAT', 'HOSPITAL', 'X_STAY', 'DISABLE', 'BIRTH_DEFECT']

for year in range(1999, 2000):
    metadata_file_path = f"{base_path}{year}VAERSDATA.csv"
    vax_file_path = f"{base_path}{year}VAERSVAX.csv"
    symptoms_file_path = f"{base_path}{year}VAERSSYMPTOMS.csv"

    metadata_df = pd.read_csv(metadata_file_path, encoding='ISO-8859-1', low_memory=False)
    vax_df = pd.read_csv(
        vax_file_path,
        encoding='ISO-8859-1',
        low_memory=False,
        usecols=vax_columns,
    )
    symptoms_df = pd.read_csv(
        symptoms_file_path,
        encoding='ISO-8859-1',
        low_memory=False,
        usecols=['VAERS_ID'] + symptom_columns,
    )

    # Drop the columns that are not carried over into the documents.
    metadata_df = metadata_df.drop(columns=unused_metadata_columns)

    # Derive a single AGE column, then move it so it becomes the 4th column.
    metadata_df['AGE'] = metadata_df.apply(calculate_age, axis=1)
    metadata_df.insert(3, 'AGE', metadata_df.pop('AGE'))

    # Fill TODAYS_DATE from RPT_DATE where needed, then parse all date columns.
    metadata_df['TODAYS_DATE'] = metadata_df.apply(form_completed, axis=1)
    metadata_df[date_columns] = metadata_df[date_columns].apply(lambda x: x.map(convert_date))

    # The source columns are now merged into AGE / TODAYS_DATE.
    metadata_df = metadata_df.drop(columns=merged_source_columns)
    metadata_df.insert(2, 'TODAYS_DATE', metadata_df.pop('TODAYS_DATE'))

    # Normalise allergy spellings; the string 'nan' (produced by astype(str)
    # for missing values) is turned back into a real NaN.
    metadata_df['ALLERGIES'] = (
        metadata_df['ALLERGIES']
        .astype(str)
        .apply(lambda x: correct_spelling(x) if x != 'nan' else np.nan)
    )

    # A report is SERIOUS ('Y') when any of the outcome flags equals 'Y'.
    any_serious_flag = metadata_df[serious_flag_columns].eq('Y').any(axis=1)
    metadata_df['SERIOUS'] = np.where(any_serious_flag, 'Y', 'N')
    metadata_df.insert(6, 'SERIOUS', metadata_df.pop('SERIOUS'))

    # Unify the wording of site-related symptom terms across all symptom columns.
    for symptom_column in symptom_columns:
        symptoms_df[symptom_column] = symptoms_df[symptom_column].str.replace(
            "Vaccination site", "Injection site", flags=re.IGNORECASE
        )
    

# Build one nested document per report: {'metadata', 'vax_data', 'symptoms'}.
# NOTE: this loop runs after the per-year loading loop has finished, so it
# only sees the frames of the last year that was loaded.

# Row positions per VAERS_ID, computed once so that each report does not
# trigger a full scan of the vaccine and symptom frames.
vax_positions = vax_df.groupby('VAERS_ID').indices
symptom_positions = symptoms_df.groupby('VAERS_ID').indices

for index, row in metadata_df.iterrows():
    vaers_id = row['VAERS_ID']

    # Copy every metadata column; any missing value is stored as NaN.
    metadata_entry = {
        column: row[column] if pd.notna(row[column]) else np.nan
        for column in metadata_df.columns
    }

    # Vaccine rows belonging to this report (empty frame when there are none).
    vax_data = vax_df.iloc[vax_positions.get(vaers_id, [])]
    vax_data = vax_data.drop(['VAERS_ID'], axis=1)

    vax_entry = {}
    if not vax_data.empty:
        # One group of keys per vaccine row, suffixed with its 1-based position.
        for i, (_, vax_row) in enumerate(vax_data.iterrows()):
            for column in vax_data.columns:
                value = vax_row[column]
                vax_entry[f"{column}_{i+1}"] = value if pd.notna(value) else np.nan
    else:
        # No vaccine rows: emit the vaccine columns with NaN values. VAERS_ID
        # is excluded here, exactly as it is for reports that do have rows.
        for column in vax_data.columns:
            vax_entry[column] = np.nan

    # Symptom rows belonging to this report.
    symptoms_data = symptoms_df.iloc[symptom_positions.get(vaers_id, [])]
    symptoms_data = symptoms_data.drop(['VAERS_ID'], axis=1)

    # Always a dict (empty when the report has no symptoms). Keys are numbered
    # by position in the flattened symptom rows; missing values are skipped.
    symptoms_entry = {
        f"SYMPTOM{i+1}": value
        for i, value in enumerate(symptoms_data.values.flatten())
        if pd.notna(value)
    }

    entry = {
        'metadata': metadata_entry,
        'vax_data': vax_entry,
        'symptoms': symptoms_entry,
    }

    # Upload entry to MongoDB (currently disabled)
    #collection.insert_one(entry)

  metadata_df[['RECVDATE', 'DATEDIED', 'VAX_DATE', 'ONSET_DATE', 'TODAYS_DATE']] = metadata_df[['RECVDATE', 'DATEDIED', 'VAX_DATE', 'ONSET_DATE', 'TODAYS_DATE']].applymap(convert_date)


In [199]:
entry

{'metadata': {'VAERS_ID': 132814,
  'RECVDATE': Timestamp('1999-12-31 00:00:00'),
  'TODAYS_DATE': Timestamp('1999-08-18 00:00:00'),
  'STATE': nan,
  'AGE': 24.0,
  'SEX': 'M',
  'SERIOUS': 'N',
  'DIED': nan,
  'DATEDIED': nan,
  'L_THREAT': nan,
  'ER_VISIT': 'Y',
  'HOSPITAL': nan,
  'HOSPDAYS': nan,
  'X_STAY': nan,
  'DISABLE': nan,
  'RECOVD': 'N',
  'VAX_DATE': Timestamp('1999-01-29 00:00:00'),
  'ONSET_DATE': Timestamp('1999-01-29 00:00:00'),
  'NUMDAYS': 0.0,
  'OTHER_MEDS': nan,
  'CUR_ILL': 'NONE',
  'HISTORY': 'NONE',
  'PRIOR_VAX': nan,
  'BIRTH_DEFECT': nan,
  'OFC_VISIT': nan,
  'ER_ED_VISIT': nan,
  'ALLERGIES': nan},
 'vax_data': {'VAX_TYPE_1': 'ANTH',
  'VAX_MANU_1': 'MICHIGAN DEPT PUB HLTH',
  'VAX_DOSE_SERIES_1': '3',
  'VAX_NAME_1': 'ANTHRAX (NO BRAND NAME)'},
 'symptoms': {'SYMPTOM1': 'Pyrexia',
  'SYMPTOM2': 'Rash',
  'SYMPTOM3': 'Urticaria'}}

In [201]:
entry

{'metadata': {'VAERS_ID': 132814,
  'RECVDATE': Timestamp('1999-12-31 00:00:00'),
  'TODAYS_DATE': Timestamp('1999-08-18 00:00:00'),
  'STATE': nan,
  'AGE': 24.0,
  'SEX': 'M',
  'SERIOUS': 'N',
  'DIED': nan,
  'DATEDIED': nan,
  'L_THREAT': nan,
  'ER_VISIT': 'Y',
  'HOSPITAL': nan,
  'HOSPDAYS': nan,
  'X_STAY': nan,
  'DISABLE': nan,
  'RECOVD': 'N',
  'VAX_DATE': Timestamp('1999-01-29 00:00:00'),
  'ONSET_DATE': Timestamp('1999-01-29 00:00:00'),
  'NUMDAYS': 0.0,
  'OTHER_MEDS': nan,
  'CUR_ILL': 'NONE',
  'HISTORY': 'NONE',
  'PRIOR_VAX': nan,
  'BIRTH_DEFECT': nan,
  'OFC_VISIT': nan,
  'ER_ED_VISIT': nan,
  'ALLERGIES': nan},
 'vax_data': {'VAX_TYPE_1': 'ANTH',
  'VAX_MANU_1': 'MICHIGAN DEPT PUB HLTH',
  'VAX_DOSE_SERIES_1': '3',
  'VAX_NAME_1': 'ANTHRAX (NO BRAND NAME)'},
 'symptoms': {'SYMPTOM1': 'Pyrexia',
  'SYMPTOM2': 'Rash',
  'SYMPTOM3': 'Urticaria'}}