# A. Predictive Analytics Case Study: Anticipating Dengue Outbreaks - Data Cleaning

In [3]:
# Author: Brian Gray
# Date: 7 May 2025
# Purpose: Clean and prepare dataset for predictive modeling in BigML
# Dataset source file: Dengue_training_fille_Spring_25.csv
# Output dataset: cleaned_dengue_data_combined.csv - Both San Juan and Iquitos
# Dependencies: pandas and numpy
# Notes: Removes 'year' column, handles missing values, and optimizes data for BigML upload

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset and reading the CSV file into a pandas DataFrame
df = pd.read_csv('Dengue_training_fille_Spring 25.csv')


In [3]:
# Initial data review, checking for missing values and data types
print("Initial Data Info:")
print(df.info())
print("\nMissing Values Count:")
print(df.isnull().sum())

Initial Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1456 entries, 0 to 1455
Data columns (total 26 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   city                                   1456 non-null   object 
 1   year                                   1456 non-null   int64  
 2   weekofyear                             1456 non-null   int64  
 3   ndvi_ne                                1262 non-null   float64
 4   ndvi_nw                                1404 non-null   float64
 5   ndvi_se                                1434 non-null   float64
 6   ndvi_sw                                1434 non-null   float64
 7   ndvi_ce                                1456 non-null   float64
 8   precipitation_amt_mm                   1443 non-null   float64
 9   precipitation_ground_mm                1443 non-null   float64
 10  reanalysis_air_temp_k                  1446 non-null 

In [4]:
# Remove unnecessary column
# Drop 'year' column, as it is not required for predictive modeling

df = df.drop(['year'], axis=1)

In [5]:
# Handle missing values
# Simple approach: for numerical columns, impute missing values with median to avoid skewing the data

numerical_columns = df.select_dtypes(include=[np.number]).columns
for col in numerical_columns:
    df[col] = df[col].fillna(df[col].median())
    
# Verify no missing values remain

print("n\Missing Values After Imputation:")
print(df.isnull().sum())

n\Missing Values After Imputation:
city                                     0
weekofyear                               0
ndvi_ne                                  0
ndvi_nw                                  0
ndvi_se                                  0
ndvi_sw                                  0
ndvi_ce                                  0
precipitation_amt_mm                     0
precipitation_ground_mm                  0
reanalysis_air_temp_k                    0
reanalysis_avg_temp_k                    0
reanalysis_dew_point_temp_k              0
reanalysis_max_air_temp_k                0
reanalysis_min_air_temp_k                0
reanalysis_precip_amt_kg_per_m2          0
reanalysis_relative_humidity_percent     0
reanalysis_sat_precip_amt_mm             0
reanalysis_specific_humidity_g_per_kg    0
reanalysis_tdtr_k                        0
station_avg_temp_c                       0
station_diur_temp_rng_c                  0
station_max_temp_c                       0
station_min_temp_c 

In [6]:
# Round numerical columns to 2 decimal places

df[numerical_columns] = df[numerical_columns].round(2)

In [7]:
# Check for outliers
# Cap extreme values at the 1st and 99th percentiles to reduce outlier impact

for col in numerical_columns:
    lower_bound = df[col].quantile(0.01)
    upper_bound = df[col].quantile(0.99)
    df[col] = df[col].clip(lower=lower_bound,upper=upper_bound)

In [8]:
# Confirm data consistency
# Convert 'total_cases' to integer as it represents a count

df['total_cases'] = df['total_cases'].astype(int)

In [9]:
# Save the cleaned dataset as CSV for BigML upload

output_file ='cleaned_dengue_data.csv'
df.to_csv(output_file,index=False)
print(f"\nCleaned dataset saved as:{output_file}")


Cleaned dataset saved as:cleaned_dengue_data.csv


In [10]:
# Preview the cleaned dataset

print("\nPreview of Cleaned Dataset")
print(df.head())


Preview of Cleaned Dataset
  city  weekofyear  ndvi_ne  ndvi_nw  ndvi_se  ndvi_sw  ndvi_ce  \
0   sj        18.0     0.12     0.10     0.20     0.18     3.16   
1   sj        19.0     0.17     0.14     0.16     0.16     4.33   
2   sj        20.0     0.03     0.17     0.16     0.17    -5.49   
3   sj        21.0     0.13     0.25     0.23     0.24    -3.83   
4   sj        22.0     0.20     0.26     0.25     0.25    -5.81   

   precipitation_amt_mm  precipitation_ground_mm  reanalysis_air_temp_k  ...  \
0                 12.42                    13.06                 297.57  ...   
1                 22.82                    30.44                 298.21  ...   
2                 34.54                    35.77                 298.78  ...   
3                 15.36                    23.45                 298.99  ...   
4                  7.52                    18.16                 299.52  ...   

   reanalysis_relative_humidity_percent  reanalysis_sat_precip_amt_mm  \
0              