In [40]:
from pymongo import MongoClient
import pandas as pd
# Connect to your MongoDB instance
client = MongoClient('mongodb://localhost:27017/')
db = client['vaers']  
collection = db['reports']  

In [64]:
# Get one report
report = collection.find_one()

# Get one report
report = collection.find_one()

# Extract attribute names from vax_data
vax_data_attributes = set()
for vax_entry in report.get('vax_data', []):
    vax_data_attributes.update([f"vax_data.{key}" for key in vax_entry.keys()])

# Extract all other attribute names, excluding _id
other_attributes = set(report.keys()) - {'_id', 'vax_data'}

# Combine all attributes into one list
all_attributes = list(vax_data_attributes | other_attributes)

# Print all attributes
print("All attributes:")
for attribute_name in vax_data_attributes:
    print(attribute_name)

All attributes:
vax_data.VAX_TYPE
vax_data.VAX_DOSE_SERIES
vax_data.VAX_LOT
vax_data.VAX_SITE
vax_data.VAX_MANU
vax_data.VAX_ROUTE
vax_data.VAX_NAME


### For all first level attributes (code not efficient as it loops through all attributes)

In [77]:
# Initialize dictionary to store results
attribute_nan_percentages = {}

# Pipeline to project all attributes and count NaN values
pipeline = [
    {
        "$project": {
            "is_nan": {
                "$objectToArray": {
                    "$arrayToObject": {
                        "$map": {
                            "input": {"$objectToArray": "$$ROOT"},
                            "in": {
                                "k": "$$this.k",
                                "v": {
                                    "$cond": [
                                        {"$eq": ["$$this.v", float('nan')]},
                                        1,
                                        0
                                    ]
                                }
                            }
                        }
                    }
                }
            }
        }
    },
    {"$unwind": "$is_nan"},
    {
        "$group": {
            "_id": "$is_nan.k",
            "total_nan": {"$sum": "$is_nan.v"}
        }
    }
]

# Execute pipeline
results = list(collection.aggregate(pipeline))

# Get total document count
total_documents = collection.count_documents({})

# Initialize dictionary to store results
attribute_reporting_ratios = {}

# Calculate reporting ratio for each attribute
for result in results:
    attr = result["_id"]
    if attr != "_id":  # Exclude "_id" field
        nan_count = result.get("total_nan", 0)
        percentage_nan = (nan_count / total_documents) * 100
        reporting_ratio = 100 - percentage_nan
        attribute_reporting_ratios[attr] = reporting_ratio

# Print reporting ratio for each attribute
print("Reporting ratio for each attribute:")
for attr, reporting_ratio in attribute_reporting_ratios.items():
    print(f"{attr}: {reporting_ratio:.2f}%")

Reporting ratio for each attribute:
DATEDIED: 1.39%
BIRTH_DEFECT: 0.06%
DIED: 1.83%
RECOVD: 92.94%
OTHER_MEDS: 40.91%
VAX_DATE: 89.00%
ER_ED_VISIT: 6.86%
NUMDAYS: 81.93%
HISTORY: 36.74%
SERIOUS: 100.00%
vax_data: 100.00%
CUR_ILL: 26.76%
HOSPDAYS: 4.27%
TODAYS_DATE: 98.36%
symptoms: 99.99%
SPLTTYPE: 32.23%
VAERS_ID: 100.00%
LAB_DATA: 25.97%
V_ADMINBY: 100.00%
SEX: 100.00%
L_THREAT: 2.11%
OFC_VISIT: 11.76%
HOSPITAL: 11.73%
SYMPTOM_TEXT: 77.89%
DISABLE: 3.55%
ALLERGIES: 18.18%
STATE: 89.14%
ONSET_DATE: 86.31%
ER_VISIT: 7.59%
PRIOR_VAX: 6.77%
RECVDATE: 100.00%
AGE: 73.92%
X_STAY: 0.27%


In [81]:
vax_attribute_reporting_ratios = {}

# Get total document count
total_documents = collection.count_documents({})

# Iterate over each attribute
for attr in vax_data_attributes:
    # Pipeline to project attribute and count NaN values
    pipeline = [
        {"$project": {"vax_data": 1}},
        {"$unwind": "$vax_data"},
        {"$project": {"is_nan": {"$cond": [{"$eq": [f"${attr}", float('nan')]}, 1, 0]}}},  # Project attribute and count NaN values
        {"$group": {"_id": None, "total_nan": {"$sum": "$is_nan"}}},  # Calculate total NaN count for attribute
    ]

    # Execute pipeline
    results = list(collection.aggregate(pipeline))

    # Calculate percentage of NaN values for attribute
    if results:
        nan_count = results[0].get("total_nan", 0)
        percentage_nan = (nan_count / total_documents) * 100
        reporting_ratio = 100 - percentage_nan
        vax_attribute_reporting_ratios[attr] = reporting_ratio
    else:
        vax_attribute_reporting_ratios[attr] = 0.0

# Print percentage of NaN values for each attribute
print("Percentage of NaN values for each attribute:")
for attr, percentage_nan in vax_attribute_reporting_ratios.items():
    print(f"{attr}: {percentage_nan:.2f}%")

Percentage of NaN values for each attribute:
vax_data.VAX_TYPE: 100.00%
vax_data.VAX_DOSE_SERIES: 99.36%
vax_data.VAX_LOT: 64.60%
vax_data.VAX_SITE: 53.58%
vax_data.VAX_MANU: 100.00%
vax_data.VAX_ROUTE: 67.96%
vax_data.VAX_NAME: 100.00%


In [82]:
# Combine the dictionaries
combined_results = {**vax_attribute_reporting_ratios, **attribute_reporting_ratios}

# Print the combined results
print("Combined results:")
for attr, percentage_nan in combined_results.items():
    print(f"{attr}: {percentage_nan:.2f}%")


Combined results:
vax_data.VAX_TYPE: 100.00%
vax_data.VAX_DOSE_SERIES: 99.36%
vax_data.VAX_LOT: 64.60%
vax_data.VAX_SITE: 53.58%
vax_data.VAX_MANU: 100.00%
vax_data.VAX_ROUTE: 67.96%
vax_data.VAX_NAME: 100.00%
DATEDIED: 1.39%
BIRTH_DEFECT: 0.06%
DIED: 1.83%
RECOVD: 92.94%
OTHER_MEDS: 40.91%
VAX_DATE: 89.00%
ER_ED_VISIT: 6.86%
NUMDAYS: 81.93%
HISTORY: 36.74%
SERIOUS: 100.00%
vax_data: 100.00%
CUR_ILL: 26.76%
HOSPDAYS: 4.27%
TODAYS_DATE: 98.36%
symptoms: 99.99%
SPLTTYPE: 32.23%
VAERS_ID: 100.00%
LAB_DATA: 25.97%
V_ADMINBY: 100.00%
SEX: 100.00%
L_THREAT: 2.11%
OFC_VISIT: 11.76%
HOSPITAL: 11.73%
SYMPTOM_TEXT: 77.89%
DISABLE: 3.55%
ALLERGIES: 18.18%
STATE: 89.14%
ONSET_DATE: 86.31%
ER_VISIT: 7.59%
PRIOR_VAX: 6.77%
RECVDATE: 100.00%
AGE: 73.92%
X_STAY: 0.27%


In [83]:
len(combined_results)

40

In [69]:
# Pipeline to unwind vax_data and count NaN vax_data.VAX_LOT
pipeline = [
    {"$unwind": "$vax_data"},
    {"$project": {"is_nan": {"$cond": [{"$eq": ["$vax_data.VAX_LOT", float('nan')]}, 1, 0]}}},
    {"$group": {"_id": None, "total_nan_vax_lot": {"$sum": "$is_nan"}}}
]

# Execute pipeline
results = list(collection.aggregate(pipeline))

# Get total document count
total_documents = collection.count_documents({})

# Calculate percentage of NaN values for vax_data.VAX_LOT
if results:
    nan_count_vax_lot = results[0].get("total_nan_vax_lot", 0)
    percentage_nan_vax_lot = (nan_count_vax_lot / total_documents) * 100
else:
    percentage_nan_vax_lot = 0.0

# Print percentage of NaN values for vax_data.VAX_LOT
print(f"Percentage of NaN values for vax_data.VAX_LOT: {percentage_nan_vax_lot:.2f}%")

Percentage of NaN values for vax_data.VAX_LOT: 35.40%


In [ ]:
# Pipeline to project NUMDAYS field and count NaN values
pipeline = [
    {"$project": {"is_nan": {"$cond": [{"$eq": ["$DIED", float('nan')]}, 1, 0]}}},  # Project NUMDAYS field and count NaN values
    {"$group": {"_id": None, "total_nan": {"$sum": "$is_nan"}}},  # Calculate total NaN count for NUMDAYS
]

# Execute pipeline
results = list(collection.aggregate(pipeline))

# Get total document count
total_documents = collection.count_documents({})

# Calculate percentage of NaN values for NUMDAYS
percentage_nan = (results[0]["total_nan"] / total_documents) * 100

# Print percentage of NaN values for NUMDAYS
print(f"Percentage of NaN values for NUMDAYS: {percentage_nan:.2f}%")

In [ ]:
Combined results:
vax_data.VAX_TYPE: 0.00%
vax_data.VAX_DOSE_SERIES: 0.64%
vax_data.VAX_LOT: 35.40%
vax_data.VAX_SITE: 46.42%
vax_data.VAX_MANU: 0.00%
vax_data.VAX_ROUTE: 32.04%
vax_data.VAX_NAME: 0.00%
NUMDAYS: 18.07%
_id: 0.00%
HISTORY: 63.26%
RECOVD: 7.06%
DIED: 98.17%
VAX_DATE: 11.00%
OTHER_MEDS: 59.09%
HOSPDAYS: 95.73%
ER_ED_VISIT: 93.14%
TODAYS_DATE: 1.64%
symptoms: 0.01%
SPLTTYPE: 67.77%
SERIOUS: 0.00%
CUR_ILL: 73.24%
vax_data: 0.00%
LAB_DATA: 74.03%
V_ADMINBY: 0.00%
SEX: 0.00%
OFC_VISIT: 88.24%
L_THREAT: 97.89%
VAERS_ID: 0.00%
ONSET_DATE: 13.69%
PRIOR_VAX: 93.23%
STATE: 10.86%
ER_VISIT: 92.41%
AGE: 26.08%
RECVDATE: 0.00%
DATEDIED: 98.61%
X_STAY: 99.73%
BIRTH_DEFECT: 99.94%
SYMPTOM_TEXT: 22.11%
HOSPITAL: 88.27%
DISABLE: 96.45%
ALLERGIES: 81.82%
