# B. Predictive Analytics - Feature Re-engineering

In [17]:
# Author: Brian Gray, with Grok for data insight and debugging support
# Date: 21 May 2025
# Purpose: Re-engineer features to improve model accuracy
# Dataset source file: cleaned_dengue_data.csv
# Output dataset: dengue_reengineered.csv
# Dependencies: pandas and numpy
# Notes:
# 1. Outlier handling (clip at the 99th percentile) to control for potentially erroneous sensor values
# 2. Create composite precipitation to enhance scalability
# 3. Add lagged feature to capture epidemiological factors to improve forecasting for real-world application
# 4. Create interaction feature to capture synergy of warm and humid conditions that promote mosquito population growth
# 5. Reduce redundant features, reducing model complexity
# 6. Normalize NDVI and temperature features to support equal feature weighting when modeling

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the cleaned dengue dataset from the working directory into `df`

df = pd.read_csv(filepath_or_buffer='cleaned_dengue_data.csv')

In [3]:
# Verify data integrity: schema, missing-value counts, and summary statistics

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\nMissing Values:")
print(df.isnull().sum())
print("\nFeature Statistics:")
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1456 entries, 0 to 1455
Data columns (total 25 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   city                                   1456 non-null   object 
 1   weekofyear                             1456 non-null   float64
 2   ndvi_ne                                1456 non-null   float64
 3   ndvi_nw                                1456 non-null   float64
 4   ndvi_se                                1456 non-null   float64
 5   ndvi_sw                                1456 non-null   float64
 6   ndvi_ce                                1456 non-null   float64
 7   precipitation_amt_mm                   1456 non-null   float64
 8   precipitation_ground_mm                1456 non-null   float64
 9   reanalysis_air_temp_k                  1456 non-null   float64
 10  reanalysis_avg_temp_k                  1456 non-null   float64
 11  rean

In [4]:
# Outlier Handling
# Cap the upper tail of each listed column at its own 99th percentile.
# Only the upper bound is clipped; low values are left untouched.
# NOTE(review): the recorded output shows ndvi_ce spanning roughly -8.5 to 34.6,
# which is outside the usual NDVI range of [-1, 1] — confirm the column's contents.

outlier_cols = ['ndvi_ce','precipitation_ground_mm','reanalysis_sat_precip_amt_mm']
for col in outlier_cols:
    df[col] = df[col].clip(upper=df[col].quantile(0.99))
print("\nAfter Outlier Handling (99th Percentile):")
print(df[outlier_cols].describe())

n\After Outlier Handling (99th Percentile):
           ndvi_ce  precipitation_ground_mm  reanalysis_sat_precip_amt_mm
count  1456.000000              1456.000000                   1456.000000
mean      3.423498                81.046504                     45.056733
std       6.576952                83.282130                     40.514337
min      -8.536000                -6.503500                      0.000000
25%      -0.332500                31.817500                      9.960000
50%       1.940000                56.980000                     38.340000
75%       5.735000               101.170000                     70.047500
max      34.590825               507.607500                    179.612925


In [5]:
# Blend the two precipitation measurements into a single feature:
# 70% ground precipitation plus 30% reanalysis satellite precipitation

df['precip_composite'] = (
    df['precipitation_ground_mm'].mul(0.7)
    .add(df['reanalysis_sat_precip_amt_mm'].mul(0.3))
)

In [6]:
# Add lagged features (1 to 4 rows back) for precipitation, temperature and humidity
# NOTE(review): assumes rows are in chronological order within each city — confirm.

lag_sources = ['precip_composite','reanalysis_air_temp_k','reanalysis_relative_humidity_percent']

for lag in [1,2,3,4]:
    for src in lag_sources:
        # Shift within each city so the first rows of one city never inherit
        # values from the last rows of another city's series.
        df[f'{src}_lag{lag}'] = df.groupby('city', sort=False)[src].shift(lag)

# Impute remaining gaps (e.g. the leading rows of each lagged column) with the
# column median. The result is assigned back rather than using a chained
# `inplace=True` fillna, which pandas deprecates and which has no effect under
# copy-on-write. Only numeric columns are considered, since a median is
# undefined for string columns such as `city`.
for col in df.select_dtypes(include='number').columns:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())

In [7]:
# Interaction feature: element-wise product of composite precipitation and air temperature

df['precip_temp_interaction'] = df['precip_composite'].mul(df['reanalysis_air_temp_k'])

In [8]:
# Identify and drop redundant features

# Correlate numeric columns only: `city` is a string column, and pandas >= 2.0
# raises on non-numeric data in DataFrame.corr() by default.
corr_matrix = df.select_dtypes(include='number').corr().abs()

# Every ordered pair of distinct columns whose absolute correlation exceeds 0.8
# (the matrix is symmetric, so each pair is listed in both orders).
high_corr = [(col1,col2) for col1 in corr_matrix.columns for col2 in corr_matrix.index
             if col1!= col2 and corr_matrix.loc[col2,col1] > 0.8]
print("\nHighly Correlated Features:",high_corr)

# Hand-picked columns to remove; errors='ignore' skips any that are already absent.
low_importance = ['reanalysis_specific_humidity_g_per_kg','reanalysis_air_temp_k']
df = df.drop(columns = low_importance, errors = 'ignore')

n\Highly Correlated Features: [('ndvi_ne', 'ndvi_nw'), ('ndvi_nw', 'ndvi_ne'), ('ndvi_se', 'ndvi_sw'), ('ndvi_sw', 'ndvi_se'), ('ndvi_ce', 'total_cases'), ('precipitation_amt_mm', 'reanalysis_sat_precip_amt_mm'), ('precipitation_ground_mm', 'total_cases'), ('precipitation_ground_mm', 'precip_composite'), ('precipitation_ground_mm', 'precip_temp_interaction'), ('reanalysis_air_temp_k', 'reanalysis_avg_temp_k'), ('reanalysis_air_temp_k', 'reanalysis_air_temp_k_lag1'), ('reanalysis_avg_temp_k', 'reanalysis_air_temp_k'), ('reanalysis_dew_point_temp_k', 'reanalysis_specific_humidity_g_per_kg'), ('reanalysis_max_air_temp_k', 'reanalysis_tdtr_k'), ('reanalysis_max_air_temp_k', 'station_diur_temp_rng_c'), ('reanalysis_min_air_temp_k', 'reanalysis_tdtr_k'), ('reanalysis_sat_precip_amt_mm', 'precipitation_amt_mm'), ('reanalysis_specific_humidity_g_per_kg', 'reanalysis_dew_point_temp_k'), ('reanalysis_tdtr_k', 'reanalysis_max_air_temp_k'), ('reanalysis_tdtr_k', 'reanalysis_min_air_temp_k'), ('rea

In [9]:
# Normalize NDVI and temperature features to the [0, 1] range (min-max scaling)

norm_cols = ['ndvi_ce','ndvi_ne','ndvi_nw','ndvi_se','ndvi_sw','reanalysis_avg_temp_k','reanalysis_max_air_temp_k','reanalysis_min_air_temp_k','station_avg_temp_c','station_max_temp_c','station_min_temp_c']

for col in norm_cols:
    col_min = df[col].min()
    col_range = df[col].max() - col_min
    if col_range == 0:
        # Constant column: dividing by a zero range would turn every value into
        # NaN, so only subtract the minimum (every value becomes 0.0).
        df[col] = df[col] - col_min
    else:
        df[col] = (df[col] - col_min) / col_range
print("\nAfter Normalization:")
print(df[norm_cols].describe())

n\After Normalization:
           ndvi_ce      ndvi_ne      ndvi_nw      ndvi_se      ndvi_sw  \
count  1456.000000  1456.000000  1456.000000  1456.000000  1456.000000   
mean      0.277310     0.529650     0.501162     0.416628     0.390075   
std       0.152503     0.208041     0.221745     0.208095     0.208963   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       0.190218     0.397884     0.346154     0.290276     0.230769   
50%       0.242911     0.511798     0.480769     0.406386     0.358974   
75%       0.330908     0.674532     0.653846     0.551524     0.512821   
max       1.000000     1.000000     1.000000     1.000000     1.000000   

       reanalysis_avg_temp_k  reanalysis_max_air_temp_k  \
count            1456.000000                1456.000000   
mean                0.545640                   0.359633   
std                 0.235428                   0.254292   
min                 0.000000                   0.000000   
25%            

In [10]:
# Save re-engineered dataset (the row index is not written to the CSV)

df.to_csv('dengue_reengineered.csv',index = False)
print("\nRe-engineered dataset saved as 'dengue_reengineered.csv'")

n\Re-engineered dataset saved as 'dengue_reengineered.csv'


In [11]:
# Preview re-engineered dataset (first five rows)

print("\nRe-engineered Dataset Preview:")
print(df.head())

n\Re-engineered Dataset Preview:
  city  weekofyear   ndvi_ne   ndvi_nw   ndvi_se   ndvi_sw   ndvi_ce  \
0   sj        18.0  0.495525  0.442308  0.406386  0.333333  0.271200   
1   sj        19.0  0.576892  0.519231  0.290276  0.282051  0.298329   
2   sj        20.0  0.349064  0.576923  0.290276  0.307692  0.070629   
3   sj        21.0  0.511798  0.730769  0.493469  0.487179  0.109120   
4   sj        22.0  0.625712  0.750000  0.551524  0.512821  0.063209   

   precipitation_amt_mm  precipitation_ground_mm  reanalysis_avg_temp_k  ...  \
0                 12.42                    13.06               0.262732  ...   
1                 22.82                    30.44               0.396002  ...   
2                 34.54                    35.77               0.479772  ...   
3                 15.36                    23.45               0.546406  ...   
4                  7.52                    18.16               0.628272  ...   

   precip_composite_lag2  reanalysis_air_temp_k_lag2 