# C. Feature Re-engineering - Refined

In [11]:
# Author: Brian Gray, with Grok for data insight and debugging support
# Date: 21 May 2025
# Purpose: Refine the re-engineered features to improve model accuracy, based on model testing results
# Dataset source file: dengue_reengineered.csv
# Output dataset: dengue_refined.csv
# Dependencies: pandas and numpy
# Notes:
# 1. Outlier handling through Winsorisation
# 2. Cyclical encoding to capture seasonality
# 3. Adjust weight of the composite precipitation feature
# 4. Reduce redundant features, reducing model complexity
# 5. Data validation of the dominant feature (ndvi_ce)

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the re-engineered dataset into the working DataFrame `df`

source_csv = 'dengue_reengineered.csv'
df = pd.read_csv(source_csv)

In [4]:
# Enhance outlier handling through Winsorisation

# Clip ndvi_ce to its own 1st-99th percentile range: values beyond either
# bound are pulled in to the bound rather than dropped, so the row count
# is unchanged.
ndvi_ce_lower = df['ndvi_ce'].quantile(0.01)
ndvi_ce_upper = df['ndvi_ce'].quantile(0.99)
df['ndvi_ce'] = df['ndvi_ce'].clip(lower=ndvi_ce_lower, upper=ndvi_ce_upper)

# The heading starts with a real newline ("\n"); it was previously typed as
# "n\", which printed a literal "n\" and relied on the invalid escape "\A".
print("\nAfter Winsorisation of ndvi_ce:")
print(df['ndvi_ce'].describe())

n\After Winsorisation of ndvi_ce:
count    1456.000000
mean        0.277275
std         0.152230
min         0.001607
25%         0.190218
50%         0.242911
75%         0.330908
max         0.995016
Name: ndvi_ce, dtype: float64


In [5]:
# Seasonality: project 'weekofyear' onto a circle (sin/cos pair) so that the
# end of one year sits next to the start of the next.
# NOTE(review): with a period of 52, a week value of 53 (if present in the
# data) lands on the same angle as week 1 - confirm this is intended.

week_angle = 2 * np.pi * df['weekofyear'] / 52
df['weekofyear_sin'] = np.sin(week_angle)
df['weekofyear_cos'] = np.cos(week_angle)

# Seasonal interaction terms: sin component with temperature,
# cos component with relative humidity.

seasonal_sin = df['weekofyear_sin']
seasonal_cos = df['weekofyear_cos']
df['week_temp_interaction'] = seasonal_sin * df['reanalysis_avg_temp_k']
df['week_humidity_interaction'] = seasonal_cos * df['reanalysis_relative_humidity_percent']

In [6]:
# Re-weight composite precipitation feature

# Equal-weight (0.5 / 0.5) blend of the two precipitation columns.
df['precip_composite'] = 0.5 * df['precipitation_ground_mm'] + 0.5 * df['reanalysis_sat_precip_amt_mm']

# Lagged copies of the composite, 1 to 4 rows back. When a 'city' column is
# present the shift is done within each city, so the first rows of one city
# never inherit values from the last rows of another.
# NOTE(review): assumes rows are in chronological order within each city - confirm.
if 'city' in df.columns:
    precip_source = df.groupby('city', sort=False)['precip_composite']
else:
    precip_source = df['precip_composite']

for lag in [1, 2, 3, 4]:
    df[f'precip_composite_lag{lag}'] = precip_source.shift(lag)

# Median-impute remaining gaps, including the leading NaNs created by the lags.
# - Only numeric columns are visited: a median is undefined for text columns
#   such as 'city'.
# - The filled Series is assigned back to the frame; the previous chained
#   `df[col].fillna(..., inplace=True)` acts on a temporary object and does
#   not update `df` under pandas Copy-on-Write.
for col in df.select_dtypes(include='number').columns:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())

In [7]:
# Identify and drop redundant features

# Absolute pairwise correlations, computed on numeric/boolean columns only.
# Passing the full frame to DataFrame.corr() raises on pandas >= 2.0 because
# of text columns such as 'city'; older versions dropped them silently, so
# selecting the columns explicitly gives the same matrix everywhere.
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr().abs()

# Ordered pairs of distinct columns whose |correlation| exceeds 0.8.
# Each pair is listed in both directions: (a, b) and (b, a).
high_corr = [(col1, col2) for col1 in corr_matrix.columns for col2 in corr_matrix.index
             if col1 != col2 and corr_matrix.loc[col2, col1] > 0.8]
# "\n" was previously mistyped as "n\", printing a literal "n\" before the heading.
print("\nHighly Correlated Features:", high_corr)

# Drop the max/min air temperature columns; errors='ignore' keeps the cell
# re-runnable once the columns are already gone.
drop_cols = ['reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k']
df = df.drop(columns=drop_cols, errors='ignore')



n\Highly Correlated Features: [('ndvi_ne', 'ndvi_nw'), ('ndvi_nw', 'ndvi_ne'), ('ndvi_se', 'ndvi_sw'), ('ndvi_sw', 'ndvi_se'), ('ndvi_ce', 'total_cases'), ('precipitation_amt_mm', 'reanalysis_sat_precip_amt_mm'), ('precipitation_ground_mm', 'total_cases'), ('precipitation_ground_mm', 'precip_composite'), ('precipitation_ground_mm', 'precip_temp_interaction'), ('reanalysis_max_air_temp_k', 'reanalysis_tdtr_k'), ('reanalysis_max_air_temp_k', 'station_diur_temp_rng_c'), ('reanalysis_min_air_temp_k', 'reanalysis_tdtr_k'), ('reanalysis_sat_precip_amt_mm', 'precipitation_amt_mm'), ('reanalysis_tdtr_k', 'reanalysis_max_air_temp_k'), ('reanalysis_tdtr_k', 'reanalysis_min_air_temp_k'), ('reanalysis_tdtr_k', 'station_diur_temp_rng_c'), ('station_diur_temp_rng_c', 'reanalysis_max_air_temp_k'), ('station_diur_temp_rng_c', 'reanalysis_tdtr_k'), ('total_cases', 'ndvi_ce'), ('total_cases', 'precipitation_ground_mm'), ('total_cases', 'precip_temp_interaction'), ('precip_composite', 'precipitation_grou

In [8]:
# Validation of the dominant feature - ndvi_ce

# Rows whose ndvi_ce falls outside the [-1, 1] range checked here.
invalid_ndvi = (df['ndvi_ce'] < -1) | (df['ndvi_ce'] > 1)

# Replace each invalid value with the row mean of the four quadrant NDVI
# columns. The replacement previously averaged 'ndvi_ce' itself (in place of
# 'ndvi_ne'), so the out-of-range value contaminated its own substitute.
# NOTE(review): presumably 'ndvi_ce' in that list was a typo for 'ndvi_ne' - confirm.
neighbour_cols = ['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw']
df.loc[invalid_ndvi, 'ndvi_ce'] = df.loc[invalid_ndvi, neighbour_cols].mean(axis=1)

# "\n" was previously mistyped as "n\", printing a literal "n\" before the heading.
print("\nAfter ndvi_ce Validation:")
print(df['ndvi_ce'].describe())

n\After ndvi_ce Validation:
count    1456.000000
mean        0.277275
std         0.152230
min         0.001607
25%         0.190218
50%         0.242911
75%         0.330908
max         0.995016
Name: ndvi_ce, dtype: float64


In [9]:
# Save refined dataset

# index=False keeps the DataFrame index out of the CSV, so no extra
# unnamed column is written to the file.
df.to_csv('dengue_refined.csv', index=False)

# "\n" was previously mistyped as "n\", which printed a literal "n\" and
# relied on the invalid escape "\R".
print("\nRefined dataset saved as 'dengue_refined.csv'")

n\Refined dataset saved as 'dengue_refined.csv'


In [10]:
# Show the leading rows of the refined frame as a quick sanity check

preview_rows = df.head()
print("\nPreview of Cleaned Dataset", preview_rows, sep="\n")


Preview of Cleaned Dataset
  city  weekofyear   ndvi_ne   ndvi_nw   ndvi_se   ndvi_sw   ndvi_ce  \
0   sj        18.0  0.495525  0.442308  0.406386  0.333333  0.271200   
1   sj        19.0  0.576892  0.519231  0.290276  0.282051  0.298329   
2   sj        20.0  0.349064  0.576923  0.290276  0.307692  0.070629   
3   sj        21.0  0.511798  0.730769  0.493469  0.487179  0.109120   
4   sj        22.0  0.625712  0.750000  0.551524  0.512821  0.063209   

   precipitation_amt_mm  precipitation_ground_mm  reanalysis_avg_temp_k  ...  \
0                 12.42                    13.06               0.262732  ...   
1                 22.82                    30.44               0.396002  ...   
2                 34.54                    35.77               0.479772  ...   
3                 15.36                    23.45               0.546406  ...   
4                  7.52                    18.16               0.628272  ...   

   reanalysis_air_temp_k_lag3  reanalysis_relative_humidit