# Cleaned Data Validation 

This notebook performs a quality assurance check on the datasets `states_daily_cleaned` and `us_daily_cleaned`. Its purpose is to confirm that both datasets are fully clean, correctly formatted, and ready for the analysis phase.

In [1]:
import pandas as pd
import numpy as np
import os

# If the kernel was started from a directory whose path ends in "notebooks",
# move up one level so relative paths (e.g. data/processed/...) are resolved
# from the parent directory instead.
cwd = os.getcwd()
started_in_notebooks_dir = cwd.endswith("notebooks")
if started_in_notebooks_dir:
    os.chdir(os.pardir)

In [None]:
# Locations of the two cleaned CSV files, relative to the current working directory.
states_csv_path = 'data/processed/states_daily_cleaned.csv'
us_csv_path = 'data/processed/us_daily_cleaned.csv'

# Read each file into its own DataFrame with pandas' default parsing options.
states_daily_cleaned = pd.read_csv(states_csv_path)
us_daily_cleaned = pd.read_csv(us_csv_path)

## Check `states_daily_cleaned`

In [3]:
# Check data types.
# DataFrame.info() writes its summary straight to stdout and returns None, so it
# is called on its own: wrapping it in print() would append a stray "None" line
# after the report.
states_daily_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20780 entries, 0 to 20779
Data columns (total 45 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   date                         20780 non-null  object 
 1   state                        20780 non-null  object 
 2   positive                     20780 non-null  float64
 3   probableCases                20780 non-null  float64
 4   negative                     20780 non-null  float64
 5   pending                      20780 non-null  float64
 6   totalTestResultsSource       20780 non-null  object 
 7   totalTestResults             20780 non-null  float64
 8   hospitalizedCurrently        20780 non-null  float64
 9   hospitalizedCumulative       20780 non-null  float64
 10  inIcuCurrently               20780 non-null  float64
 11  inIcuCumulative              20780 non-null  float64
 12  onVentilatorCurrently        20780 non-null  float64
 13  onVentilatorCumu

In [4]:
# Preview the first five entries of the date column to eyeball its format.
states_daily_cleaned['date'].head(5)

0    2020-03-06
1    2020-03-07
2    2020-03-08
3    2020-03-09
4    2020-03-10
Name: date, dtype: object

In [5]:
# Preview the first five entries of the lastUpdateEt column to eyeball its format.
states_daily_cleaned['lastUpdateEt'].head(5)

0    2020-03-06 00:00:00
1    2020-03-07 00:00:00
2    2020-03-08 00:00:00
3    2020-03-09 00:00:00
4    2020-03-10 00:00:00
Name: lastUpdateEt, dtype: object

In [6]:
# Count missing values: first per column, then summed into one figure for the
# whole states dataset.
states_nulls_per_column = states_daily_cleaned.isna().sum()
total_nulls = states_nulls_per_column.sum()
print(f"Total null values in the entire dataset: {total_nulls}")

Total null values in the entire dataset: 0


In [7]:
# Look for +/-inf entries. np.isinf is only applied to the numeric columns,
# which are selected by dtype first.
numeric_cols = states_daily_cleaned.select_dtypes(include=[np.number]).columns
states_inf_mask = np.isinf(states_daily_cleaned[numeric_cols])
inf_count = states_inf_mask.sum().sum()
print(f"Total infinite values in the dataset: {inf_count}")

Total infinite values in the dataset: 0


In [8]:
# Sanity check: a "currently" count should never exceed its cumulative counterpart.
# Each *_errors value is the number of rows where current > cumulative.
hosp_errors = states_daily_cleaned['hospitalizedCurrently'].gt(
    states_daily_cleaned['hospitalizedCumulative']
).sum()
icu_errors = states_daily_cleaned['inIcuCurrently'].gt(
    states_daily_cleaned['inIcuCumulative']
).sum()
vent_errors = states_daily_cleaned['onVentilatorCurrently'].gt(
    states_daily_cleaned['onVentilatorCumulative']
).sum()
total_errors = sum([hosp_errors, icu_errors, vent_errors])

# Report a single PASS/FAIL line covering all three comparisons.
if total_errors != 0:
    print(f"FAIL: Found {total_errors} cumulative logic errors.")
else:
    print("PASS: All cumulative logic is correct.")

PASS: All cumulative logic is correct.


## Check `us_daily_cleaned`

In [None]:
# Check data types.
# DataFrame.info() writes its summary straight to stdout and returns None, so it
# is called on its own: wrapping it in print() would append a stray "None" line
# after the report.
us_daily_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   date                      420 non-null    object 
 1   states                    420 non-null    int64  
 2   positive                  420 non-null    float64
 3   negative                  420 non-null    float64
 4   hospitalizedCurrently     420 non-null    float64
 5   hospitalizedCumulative    420 non-null    float64
 6   inIcuCurrently            420 non-null    float64
 7   inIcuCumulative           420 non-null    float64
 8   onVentilatorCurrently     420 non-null    float64
 9   onVentilatorCumulative    420 non-null    float64
 10  death                     420 non-null    float64
 11  hospitalized              420 non-null    float64
 12  totalTestResults          420 non-null    float64
 13  deathIncrease             420 non-null    float64
 14  hospitaliz

In [None]:
# Preview the first five entries of the date column to eyeball its format.
us_daily_cleaned['date'].head(5)

0    2020-01-13
1    2020-01-14
2    2020-01-15
3    2020-01-16
4    2020-01-17
Name: date, dtype: object

In [None]:
# Count missing values: first per column, then summed into one figure for the
# whole US dataset.
us_nulls_per_column = us_daily_cleaned.isna().sum()
total_nulls = us_nulls_per_column.sum()
print(f"Total null values in the entire dataset: {total_nulls}")

Total null values in the entire dataset: 0


In [None]:
# Look for +/-inf entries. np.isinf is only applied to the numeric columns,
# which are selected by dtype first.
numeric_cols = us_daily_cleaned.select_dtypes(include=[np.number]).columns
us_inf_mask = np.isinf(us_daily_cleaned[numeric_cols])
inf_count = us_inf_mask.sum().sum()
print(f"Total infinite values in the dataset: {inf_count}")

Total infinite values in the dataset: 0


In [None]:
# Sanity check: a "currently" count should never exceed its cumulative counterpart.
# Each *_errors value is the number of rows where current > cumulative.
hosp_errors = us_daily_cleaned['hospitalizedCurrently'].gt(
    us_daily_cleaned['hospitalizedCumulative']
).sum()
icu_errors = us_daily_cleaned['inIcuCurrently'].gt(
    us_daily_cleaned['inIcuCumulative']
).sum()
vent_errors = us_daily_cleaned['onVentilatorCurrently'].gt(
    us_daily_cleaned['onVentilatorCumulative']
).sum()
total_errors = sum([hosp_errors, icu_errors, vent_errors])

# Report a single PASS/FAIL line covering all three comparisons.
if total_errors != 0:
    print(f"FAIL: Found {total_errors} cumulative logic errors.")
else:
    print("PASS: All cumulative logic is correct.")

PASS: All cumulative logic is correct.


In [None]:
# totalTestResults should be at least positive + negative on every row.
comp = us_daily_cleaned['positive'] + us_daily_cleaned['negative']
total_tests = us_daily_cleaned['totalTestResults']
# Row-over-row change; the first row has no predecessor, so its change is taken as 0.
total_tests_change = total_tests.diff().fillna(0)
print("totalTestResults < positive+negative:", total_tests.lt(comp).sum())
print("Non-monotonic totalTestResults:", total_tests_change.lt(0).sum())

# totalTestResultsIncrease should equal the row-over-row change, floored at 0.
expected_ttri = total_tests_change.clip(lower=0)
print("totalTestResultsIncrease mismatches:", expected_ttri.ne(us_daily_cleaned['totalTestResultsIncrease']).sum())

# Apply the same comparison to each (increase column, cumulative column) pair.
increase_and_cumulative_columns = [
    ('positiveIncrease', 'positive'),
    ('negativeIncrease', 'negative'),
    ('deathIncrease', 'death'),
    ('hospitalizedIncrease', 'hospitalized'),
]
for inc, cum in increase_and_cumulative_columns:
    expected = us_daily_cleaned[cum].diff().fillna(0).clip(lower=0)
    print(f"{inc} mismatches:", expected.ne(us_daily_cleaned[inc]).sum())

totalTestResults < positive+negative: 0
Non-monotonic totalTestResults: 0
totalTestResultsIncrease mismatches: 0
positiveIncrease mismatches: 0
negativeIncrease mismatches: 0
deathIncrease mismatches: 0
hospitalizedIncrease mismatches: 0
