In [1]:
from pymongo import MongoClient
import pandas as pd
# Connect to your MongoDB instance
client = MongoClient('mongodb://localhost:27017/')
db = client['vaers']  
collection = db['reports']  

In [2]:
# Get one report
report = collection.find_one()

# Get one report
report = collection.find_one()

# Extract attribute names from vax_data
vax_data_attributes = set()
for vax_entry in report.get('vax_data', []):
    vax_data_attributes.update([f"vax_data.{key}" for key in vax_entry.keys()])

# Extract all other attribute names, excluding _id
other_attributes = set(report.keys()) - {'_id', 'vax_data'}

# Combine all attributes into one list
all_attributes = list(vax_data_attributes | other_attributes)

# Print all attributes
print("All attributes:")
for attribute_name in vax_data_attributes:
    print(attribute_name)

All attributes:
vax_data.VAX_TYPE
vax_data.VAX_LOT
vax_data.VAX_MANU
vax_data.VAX_DOSE_SERIES
vax_data.VAX_SITE
vax_data.VAX_ROUTE
vax_data.VAX_NAME


### For all first level attributes (code not efficient as it loops through all attributes)

In [10]:
# Initialize dictionary to store results
attribute_nan_percentages = {}

# Pipeline to project all attributes and count NaN values
pipeline = [
    {
        "$project": {
            "is_nan": {
                "$objectToArray": {
                    "$arrayToObject": {
                        "$map": {
                            "input": {"$objectToArray": "$$ROOT"},
                            "in": {
                                "k": "$$this.k",
                                "v": {
                                    "$cond": [
                                        {"$eq": ["$$this.v", float('nan')]},
                                        1,
                                        0
                                    ]
                                }
                            }
                        }
                    }
                }
            }
        }
    },
    {"$unwind": "$is_nan"},
    {
        "$group": {
            "_id": "$is_nan.k",
            "total_nan": {"$sum": "$is_nan.v"}
        }
    }
]

# Execute pipeline
results = list(collection.aggregate(pipeline))

# Get total document count
total_documents = collection.count_documents({})

# Initialize dictionary to store results
attribute_reporting_ratios = {}

# Calculate reporting ratio for each attribute
for result in results:
    attr = result["_id"]
    if attr != "_id":  # Exclude "_id" field
        nan_count = result.get("total_nan", 0)
        percentage_nan = (nan_count / total_documents) * 100
        reporting_ratio = 100 - percentage_nan
        attribute_reporting_ratios[attr] = reporting_ratio

# Print reporting ratio for each attribute
print("Reporting ratio for each attribute:")
for attr, reporting_ratio in attribute_reporting_ratios.items():
    print(f"{attr}: {reporting_ratio:.2f}%")

Reporting ratio for each attribute:
TODAYS_DATE: 98.36%
DISABLE: 3.56%
vax_data: 100.00%
SEX: 100.00%
SPLTTYPE: 32.22%
L_THREAT: 2.11%
VAX_DATE: 88.99%
HISTORY: 36.70%
RECOVD: 92.95%
HOSPITAL: 11.75%
CUR_ILL: 26.72%
ER_VISIT: 7.58%
STATE: 89.15%
BIRTH_DEFECT: 0.06%
VAERS_ID: 100.00%
X_STAY: 0.27%
PRIOR_VAX: 6.76%
HOSPDAYS: 4.27%
SERIOUS: 100.00%
DIED: 1.84%
DATEDIED: 1.39%
NUMDAYS: 81.90%
symptoms: 99.99%
LAB_DATA: 25.95%
V_ADMINBY: 100.00%
ALLERGIES: 18.15%
OTHER_MEDS: 40.88%
ER_ED_VISIT: 6.86%
AGE: 73.86%
SYMPTOM_TEXT: 77.82%
ONSET_DATE: 86.29%
OFC_VISIT: 11.76%
RECVDATE: 100.00%


In [11]:
vax_attribute_reporting_ratios = {}

# Get total document count
total_documents = collection.count_documents({})

# Iterate over each attribute
for attr in vax_data_attributes:
    # Pipeline to project attribute and count NaN values
    pipeline = [
        {"$project": {"vax_data": 1}},
        {"$unwind": "$vax_data"},
        {"$project": {"is_nan": {"$cond": [{"$eq": [f"${attr}", float('nan')]}, 1, 0]}}},  # Project attribute and count NaN values
        {"$group": {"_id": None, "total_nan": {"$sum": "$is_nan"}}},  # Calculate total NaN count for attribute
    ]

    # Execute pipeline
    results = list(collection.aggregate(pipeline))

    # Calculate percentage of NaN values for attribute
    if results:
        nan_count = results[0].get("total_nan", 0)
        percentage_nan = (nan_count / total_documents) * 100
        reporting_ratio = 100 - percentage_nan
        vax_attribute_reporting_ratios[attr] = reporting_ratio
    else:
        vax_attribute_reporting_ratios[attr] = 0.0

# Print reporting ratio for each attribute
print("Reporting ratio for each attribute:")
for attr, reporting_ratio in vax_attribute_reporting_ratios.items():
    print(f"{attr}: {reporting_ratio:.2f}%")

Reporting ratio for each attribute:
vax_data.VAX_TYPE: 100.00%
vax_data.VAX_LOT: 64.58%
vax_data.VAX_MANU: 100.00%
vax_data.VAX_DOSE_SERIES: 99.36%
vax_data.VAX_SITE: 53.51%
vax_data.VAX_ROUTE: 67.90%
vax_data.VAX_NAME: 100.00%


In [12]:
# Combine the dictionaries
combined_results = {**vax_attribute_reporting_ratios, **attribute_reporting_ratios}

# Print the combined results
print("Combined results:")
for attr, percentage_nan in combined_results.items():
    print(f"{attr}: {percentage_nan:.2f}%")


Combined results:
vax_data.VAX_TYPE: 100.00%
vax_data.VAX_LOT: 64.58%
vax_data.VAX_MANU: 100.00%
vax_data.VAX_DOSE_SERIES: 99.36%
vax_data.VAX_SITE: 53.51%
vax_data.VAX_ROUTE: 67.90%
vax_data.VAX_NAME: 100.00%
TODAYS_DATE: 98.36%
DISABLE: 3.56%
vax_data: 100.00%
SEX: 100.00%
SPLTTYPE: 32.22%
L_THREAT: 2.11%
VAX_DATE: 88.99%
HISTORY: 36.70%
RECOVD: 92.95%
HOSPITAL: 11.75%
CUR_ILL: 26.72%
ER_VISIT: 7.58%
STATE: 89.15%
BIRTH_DEFECT: 0.06%
VAERS_ID: 100.00%
X_STAY: 0.27%
PRIOR_VAX: 6.76%
HOSPDAYS: 4.27%
SERIOUS: 100.00%
DIED: 1.84%
DATEDIED: 1.39%
NUMDAYS: 81.90%
symptoms: 99.99%
LAB_DATA: 25.95%
V_ADMINBY: 100.00%
ALLERGIES: 18.15%
OTHER_MEDS: 40.88%
ER_ED_VISIT: 6.86%
AGE: 73.86%
SYMPTOM_TEXT: 77.82%
ONSET_DATE: 86.29%
OFC_VISIT: 11.76%
RECVDATE: 100.00%


In [6]:
len(combined_results)

40

In [69]:
# Pipeline to unwind vax_data and count NaN vax_data.VAX_LOT
pipeline = [
    {"$unwind": "$vax_data"},
    {"$project": {"is_nan": {"$cond": [{"$eq": ["$vax_data.VAX_LOT", float('nan')]}, 1, 0]}}},
    {"$group": {"_id": None, "total_nan_vax_lot": {"$sum": "$is_nan"}}}
]

# Execute pipeline
results = list(collection.aggregate(pipeline))

# Get total document count
total_documents = collection.count_documents({})

# Calculate percentage of NaN values for vax_data.VAX_LOT
if results:
    nan_count_vax_lot = results[0].get("total_nan_vax_lot", 0)
    percentage_nan_vax_lot = (nan_count_vax_lot / total_documents) * 100
else:
    percentage_nan_vax_lot = 0.0

# Print percentage of NaN values for vax_data.VAX_LOT
print(f"Percentage of NaN values for vax_data.VAX_LOT: {percentage_nan_vax_lot:.2f}%")

Percentage of NaN values for vax_data.VAX_LOT: 35.40%


In [None]:
# Pipeline to project NUMDAYS field and count NaN values
pipeline = [
    {"$project": {"is_nan": {"$cond": [{"$eq": ["$DIED", float('nan')]}, 1, 0]}}},  # Project NUMDAYS field and count NaN values
    {"$group": {"_id": None, "total_nan": {"$sum": "$is_nan"}}},  # Calculate total NaN count for NUMDAYS
]

# Execute pipeline
results = list(collection.aggregate(pipeline))

# Get total document count
total_documents = collection.count_documents({})

# Calculate percentage of NaN values for NUMDAYS
percentage_nan = (results[0]["total_nan"] / total_documents) * 100

# Print percentage of NaN values for NUMDAYS
print(f"Percentage of NaN values for NUMDAYS: {percentage_nan:.2f}%")

In [None]:
Combined results:
vax_data.VAX_TYPE: 100.00%
vax_data.VAX_LOT: 64.58%
vax_data.VAX_MANU: 100.00%
vax_data.VAX_DOSE_SERIES: 99.36%
vax_data.VAX_SITE: 53.51%
vax_data.VAX_ROUTE: 67.90%
vax_data.VAX_NAME: 100.00%
TODAYS_DATE: 98.36%
DISABLE: 3.56%
vax_data: 100.00%
SEX: 100.00%
SPLTTYPE: 32.22%
L_THREAT: 2.11%
VAX_DATE: 88.99%
HISTORY: 36.70%
RECOVD: 92.95%
HOSPITAL: 11.75%
CUR_ILL: 26.72%
ER_VISIT: 7.58%
STATE: 89.15%
BIRTH_DEFECT: 0.06%
VAERS_ID: 100.00%
X_STAY: 0.27%
PRIOR_VAX: 6.76%
HOSPDAYS: 4.27%
SERIOUS: 100.00%
DIED: 1.84%
DATEDIED: 1.39%
NUMDAYS: 81.90%
symptoms: 99.99%
LAB_DATA: 25.95%
V_ADMINBY: 100.00%
ALLERGIES: 18.15%
OTHER_MEDS: 40.88%
ER_ED_VISIT: 6.86%
AGE: 73.86%
SYMPTOM_TEXT: 77.82%
ONSET_DATE: 86.29%
OFC_VISIT: 11.76%
RECVDATE: 100.00%

Combined results:
vax_data.VAX_TYPE: 0.00%
vax_data.VAX_LOT: 35.42%
vax_data.VAX_MANU: 0.00%
vax_data.VAX_DOSE_SERIES: 0.64%
vax_data.VAX_SITE: 46.49%
vax_data.VAX_ROUTE: 32.10%
vax_data.VAX_NAME: 0.00%
RECOVD: 7.05%
HOSPITAL: 88.25%
ER_VISIT: 92.42%
STATE: 10.85%
CUR_ILL: 73.28%
BIRTH_DEFECT: 99.94%
X_STAY: 99.73%
VAERS_ID: 0.00%
HOSPDAYS: 95.73%
PRIOR_VAX: 93.24%
TODAYS_DATE: 1.64%
DISABLE: 96.44%
vax_data: 0.00%
SEX: 0.00%
SPLTTYPE: 67.78%
L_THREAT: 97.89%
VAX_DATE: 11.01%
HISTORY: 63.30%
LAB_DATA: 74.05%
ALLERGIES: 81.85%
V_ADMINBY: 0.00%
OTHER_MEDS: 59.12%
ER_ED_VISIT: 93.14%
AGE: 26.14%
SYMPTOM_TEXT: 22.18%
ONSET_DATE: 13.71%
OFC_VISIT: 88.24%
RECVDATE: 0.00%
SERIOUS: 0.00%
DATEDIED: 98.61%
DIED: 98.16%
symptoms: 0.01%
NUMDAYS: 18.10%