# NORS Dataset Cleaning and Preperation
- This notebook performs initial exploration, cleaning, and preparation of the NORS outbreak dataset prior to analysis.

## 1. Import Libraries

In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

## 2. Load Raw Data

In [3]:
# Loading Data
df = pd.read_csv("../Data/NORS_original.csv")
df

  df = pd.read_csv("../Data/NORS_original.csv")


Unnamed: 0,Year,Month,State,Primary Mode,Etiology,Serotype or Genotype,Etiology Status,Setting,Illnesses,Hospitalizations,Info On Hospitalizations,Deaths,Info On Deaths,Food Vehicle,Food Contaminated Ingredient,IFSAC Category,Water Exposure,Water Type,Animal Type
0,1971,2,California,Water,Copper,,Confirmed,Restaurant,2,,,0.0,,,,,Drinking water,Community,
1,1971,6,Arkansas,Water,Hepatitis A,,Confirmed,Store,98,,,0.0,,,,,Drinking water,Other,
2,1971,6,Missouri,Water,Unknown,,Suspected,Subdivision/Neighborhood,2,,,0.0,,,,,Drinking water,Community,
3,1971,6,Alabama,Water,Selenium,,Confirmed,Unknown,3,,,0.0,,,,,Drinking water,Individual/Private,
4,1971,6,Vermont,Water,Unknown,,Suspected,Community/municipality,3,,,0.0,,,,,Drinking water,Community,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66708,2023,12,Tennessee,Person-to-person,Norovirus unknown,,Confirmed,Long-term care/nursing home/assisted living fa...,19,,0.0,0.0,19.0,,,,,,
66709,2023,12,Tennessee,Indeterminate/unknown,Norovirus Genogroup II,GII.P untypeable GII.4 untypeable,Suspected,Event space,18,0.0,18.0,0.0,18.0,,,,,,
66710,2023,12,Minnesota,Person-to-person,Norovirus unknown,,Suspected,School/college/university,7,,0.0,,0.0,,,,,,
66711,2023,12,Oregon,Person-to-person,,,,School/college/university,97,0.0,97.0,0.0,97.0,,,,,,


## 3. Initial Data Overview

In [19]:
# Data Overview
def overview(df, name="df"):
    print(f"\n=== {name} Overview ===")
    display(df.head(10))
    print("\nShape:", df.shape)
    print("\nData types:")
    display(df.dtypes.to_frame("dtype"))
    print("\nMissing values (sorted):")
    missing = df.isna().sum().sort_values(ascending=False)
    display(missing[missing > 0])
    print(f"\nDuplicate rows: {df.duplicated().sum()}")

overview(df, "NORS Outbreak Data")


=== NORS Outbreak Data Overview ===


Unnamed: 0,Year,Month,State,Primary Mode,Etiology,Serotype or Genotype,Etiology Status,Setting,Illnesses,Hospitalizations,Info On Hospitalizations,Deaths,Info On Deaths,Food Vehicle,Food Contaminated Ingredient,IFSAC Category,Water Exposure,Water Type,Animal Type
0,1971,2,California,Water,Copper,,Confirmed,Restaurant,2,,,0.0,,,,,Drinking water,Community,
1,1971,6,Arkansas,Water,Hepatitis A,,Confirmed,Store,98,,,0.0,,,,,Drinking water,Other,
2,1971,6,Missouri,Water,Unknown,,Suspected,Subdivision/Neighborhood,2,,,0.0,,,,,Drinking water,Community,
3,1971,6,Alabama,Water,Selenium,,Confirmed,Unknown,3,,,0.0,,,,,Drinking water,Individual/Private,
4,1971,6,Vermont,Water,Unknown,,Suspected,Community/municipality,3,,,0.0,,,,,Drinking water,Community,
5,1971,6,Oregon,Water,Unknown,,Suspected,Restaurant,200,,,0.0,,,,,Drinking water,Other,
6,1971,7,New Jersey,Water,Hepatitis A,,Confirmed,Camp/cabin,22,,,0.0,,,,,Drinking water,Other,
7,1971,7,Mississippi,Water,Shigella sonnei,,Confirmed,Camp/cabin,187,,,0.0,,,,,Drinking water,Community,
8,1971,7,Kentucky,Water,Unknown,,Suspected,Park - State Park,68,,,0.0,,,,,Drinking water,Other,
9,1971,7,California,Water,Unknown,,Suspected,Community/municipality,3500,,,0.0,,,,,Drinking water,Community,



Shape: (66713, 19)

Data types:


Unnamed: 0,dtype
Year,int64
Month,int64
State,object
Primary Mode,object
Etiology,object
Serotype or Genotype,object
Etiology Status,object
Setting,object
Illnesses,object
Hospitalizations,float64



Missing values (sorted):


Animal Type                     66126
Water Type                      63983
Food Contaminated Ingredient    63963
Water Exposure                  63607
IFSAC Category                  55008
Food Vehicle                    54267
Serotype or Genotype            50243
Etiology Status                 16338
Etiology                        16338
Hospitalizations                 8558
Info On Deaths                   8250
Info On Hospitalizations         8233
Deaths                           7928
Setting                          5909
dtype: int64


Duplicate rows: 1421


## 4. Data Cleaning

In [None]:
# Drop unnecessary columns
drop_cols = ['Info On Hospitalizations', 'Info On Deaths']
df = df.drop(columns=[col for col in drop_cols if col in df.columns])

# Convert numeric columns
numeric_cols = ['Illnesses', 'Hospitalizations', 'Deaths']
for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Create datetime column for monthly data
if all(x in df.columns for x in ['Year','Month']):
    df['Date'] = pd.to_datetime(df[['Year','Month']].assign(DAY=1))

# Handling NaN in outbreak columns 
primary_mode_mapping = {
    'Food': ['Food Vehicle', 'Food Contaminated Ingredient', 'IFSAC Category'],
    'Water': ['Water Exposure', 'Water Type'],
    'Person-to-person': [],
    'Animal': ['Animal Type'],
    'Indeterminate/unknown': []
}

all_inapplicable_cols = [
    'Food Vehicle', 'Food Contaminated Ingredient', 'IFSAC Category',
    'Water Exposure', 'Water Type', 'Animal Type'
]

def recode_na(row):
    primary_mode = row.get('Primary Mode', None)
    relevant_cols = primary_mode_mapping.get(primary_mode, [])

    # Handle outbreak-related columns
    for col in all_inapplicable_cols:
        if col not in relevant_cols and pd.isna(row[col]):
            row[col] = 'Inapplicable'
        elif col in relevant_cols and pd.isna(row[col]):
            row[col] = 'Unknown'

    # Handle Etiology-related columns
    for col in ['Etiology', 'Etiology Status', 'Setting']:
        if pd.isna(row[col]):
            row[col] = 'Unknown'

    etio = str(row['Etiology']).strip().lower()
    sero = str(row['Serotype or Genotype']).strip().lower() if not pd.isna(row['Serotype or Genotype']) else ''

    # Handle serotype/genotype rules
    if etio in ['', 'unknown', 'nan', 'none']:
        row['Serotype or Genotype'] = 'Unknown'
    elif sero in ['', 'unknown', 'nan', 'none']:
        row['Serotype or Genotype'] = 'Not Typed'
    else:
        row['Serotype or Genotype'] = row['Serotype or Genotype']

    return row

# Apply recoding
df = df.apply(recode_na, axis=1)

# Validation
print("\n=== Post-Cleaning Types ===")
display(df.dtypes.to_frame("dtype"))

print("\n=== Missing Values After Cleaning ===")
display(df.isna().mean())

df.head(5)


=== Post-Cleaning Types ===


Unnamed: 0,dtype
Year,int64
Month,int64
State,object
Primary Mode,object
Etiology,object
Serotype or Genotype,object
Etiology Status,object
Setting,object
Illnesses,float64
Hospitalizations,float64



=== Missing Values After Cleaning ===


Year                            0.000000
Month                           0.000000
State                           0.000000
Primary Mode                    0.000000
Etiology                        0.000000
Serotype or Genotype            0.000000
Etiology Status                 0.000000
Setting                         0.000000
Illnesses                       0.000809
Hospitalizations                0.128281
Deaths                          0.118837
Food Vehicle                    0.000000
Food Contaminated Ingredient    0.000000
IFSAC Category                  0.000000
Water Exposure                  0.000000
Water Type                      0.000000
Animal Type                     0.000000
Date                            0.000000
dtype: float64

Unnamed: 0,Year,Month,State,Primary Mode,Etiology,Serotype or Genotype,Etiology Status,Setting,Illnesses,Hospitalizations,Deaths,Food Vehicle,Food Contaminated Ingredient,IFSAC Category,Water Exposure,Water Type,Animal Type,Date
0,1971,2,California,Water,Copper,Not Typed,Confirmed,Restaurant,2.0,,0.0,Inapplicable,Inapplicable,Inapplicable,Drinking water,Community,Inapplicable,1971-02-01
1,1971,6,Arkansas,Water,Hepatitis A,Not Typed,Confirmed,Store,98.0,,0.0,Inapplicable,Inapplicable,Inapplicable,Drinking water,Other,Inapplicable,1971-06-01
2,1971,6,Missouri,Water,Unknown,Unknown,Suspected,Subdivision/Neighborhood,2.0,,0.0,Inapplicable,Inapplicable,Inapplicable,Drinking water,Community,Inapplicable,1971-06-01
3,1971,6,Alabama,Water,Selenium,Not Typed,Confirmed,Unknown,3.0,,0.0,Inapplicable,Inapplicable,Inapplicable,Drinking water,Individual/Private,Inapplicable,1971-06-01
4,1971,6,Vermont,Water,Unknown,Unknown,Suspected,Community/municipality,3.0,,0.0,Inapplicable,Inapplicable,Inapplicable,Drinking water,Community,Inapplicable,1971-06-01


In [27]:
df[(df['Hospitalizations'].isna()) & (df['Deaths'].isna())].sample(15)

Unnamed: 0,Year,Month,State,Primary Mode,Etiology,Serotype or Genotype,Etiology Status,Setting,Illnesses,Hospitalizations,Deaths,Food Vehicle,Food Contaminated Ingredient,IFSAC Category,Water Exposure,Water Type,Animal Type,Date
25110,2012,3,New York,Person-to-person,Norovirus,Not Typed,Suspected,Unknown,22.0,,,Inapplicable,Inapplicable,Inapplicable,Inapplicable,Inapplicable,Inapplicable,2012-03-01
24740,2012,2,New York,Person-to-person,Norovirus,Not Typed,Suspected,Unknown,64.0,,,Inapplicable,Inapplicable,Inapplicable,Inapplicable,Inapplicable,Inapplicable,2012-02-01
1343,1998,4,Maryland,Food,Unknown,Unknown,Unknown,Restaurant: Other,2.0,,,macaroni and cheese,Unknown,Multiple,Inapplicable,Inapplicable,Inapplicable,1998-04-01
5017,2000,11,New York,Food,Unknown,Unknown,Unknown,Restaurant: Other,8.0,,,Unknown,Unknown,Unknown,Inapplicable,Inapplicable,Inapplicable,2000-11-01
35191,2014,12,New York,Person-to-person,Unknown,Unknown,Unknown,Unknown,47.0,,,Inapplicable,Inapplicable,Inapplicable,Inapplicable,Inapplicable,Inapplicable,2014-12-01
23446,2012,1,New York,Person-to-person,Norovirus,Not Typed,Suspected,Unknown,79.0,,,Inapplicable,Inapplicable,Inapplicable,Inapplicable,Inapplicable,Inapplicable,2012-01-01
65959,2023,10,Illinois,Water,Legionella pneumophila,serogroup 1,Confirmed,Hotel/motel,2.0,,,Inapplicable,Inapplicable,Inapplicable,Undetermined water,Unknown,Inapplicable,2023-10-01
52766,2019,2,Minnesota,Person-to-person,Norovirus,Not Typed,Suspected,Long-term care/nursing home/assisted living fa...,32.0,,,Inapplicable,Inapplicable,Inapplicable,Inapplicable,Inapplicable,Inapplicable,2019-02-01
63924,2023,3,Pennsylvania,Indeterminate/unknown,Unknown,Unknown,Unknown,Long-term care/nursing home/assisted living fa...,40.0,,,Inapplicable,Inapplicable,Inapplicable,Inapplicable,Inapplicable,Inapplicable,2023-03-01
12380,2006,10,Ohio,Food,Listeria monocytogenes,Not Typed,Suspected,Private home/residence;Other,3.0,,,"ham, unspecified",Unknown,Pork,Inapplicable,Inapplicable,Inapplicable,2006-10-01


## 5. Export Clean Dataset

In [22]:
# Save Cleaned Dataset
df.to_csv("NORS_cleaned.csv", index=False)