# D. Feature Re-engineering - Advanced

In [2]:
# Author: Brian Gray, with Grok for data insight and debugging support
# Date: 21 May 2025
# Purpose: Apply advanced feature re-engineer features to the refined dataset to improve model accuracy based on model testing results
# Dataset source files: 'dengue_refined.csv'
# Output datasets: dengue_advanced.csv, sj_advanced.csv, and iq_advanced.csv
# Dependencies: pandas, numpy, and Scikit-Learn
# Notes:
# 1. Re-run of data cleaning, feature re-engineering, and refined feature re-engineering to add cities to the dataset
# 2. Advanced ndvi log transformation and scaling
# 3. Boost temperature and humidity features
# 4. Drop redundant features
# 5. Validate and Re-Create Binned Temperature Features
# 6. Adaptive feature pruning
# 7. Data quality validation

In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler

In [14]:
# Load datasets

df = pd.read_csv('dengue_refined.csv')

In [15]:
# Initial data review, checking for missing values and data types

print("Initial Data Info:")
print(df.info())

Initial Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1456 entries, 0 to 1455
Data columns (total 39 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   city                                       1456 non-null   object 
 1   weekofyear                                 1456 non-null   float64
 2   ndvi_ne                                    1456 non-null   float64
 3   ndvi_nw                                    1456 non-null   float64
 4   ndvi_se                                    1456 non-null   float64
 5   ndvi_sw                                    1456 non-null   float64
 6   ndvi_ce                                    1456 non-null   float64
 7   precipitation_amt_mm                       1456 non-null   float64
 8   precipitation_ground_mm                    1456 non-null   float64
 9   reanalysis_avg_temp_k                      1456 non-null   float64
 10  reana

In [16]:
# Advanced ndvi log transformation and scaling

df['ndvi_ce'] = df['ndvi_ce'] + abs(df['ndvi_ce'].min()) + 1 
df['ndvi_ce_log'] = np.log(df['ndvi_ce'])

scaler = RobustScaler()

df['ndvi_ce_log'] = scaler.fit_transform(df[['ndvi_ce_log']])

print("\nAfter NDVI Transformation:")
print(df['ndvi_ce_log'].describe())


After NDVI Transformation:
count    1456.000000
mean        0.187362
std         0.984186
min        -1.931670
25%        -0.387722
50%         0.000000
75%         0.612278
max         4.236416
Name: ndvi_ce_log, dtype: float64


In [5]:
# Seasonal feature engineering

df['weekofyear_sin_sq'] = df['weekofyear_sin'] ** 2
df['weekofyear_cos_sq'] = df['weekofyear_cos'] ** 2
df['week_temp_lag_interaction'] = df['weekofyear_sin'] * df['reanalysis_air_temp_k_lag1']
df['week_humidity_lag_interaction'] = df['weekofyear_cos'] * df['reanalysis_relative_humidity_percent_lag1']

print("\nAfter Seasonal Feature Engineering:")
print(df[['weekofyear_sin_sq', 'weekofyear_cos_sq', 'week_temp_lag_interaction', 'week_humidity_lag_interaction']].describe())


After Seasonal Feature Engineering:
       weekofyear_sin_sq  weekofyear_cos_sq  week_temp_lag_interaction  \
count       1.456000e+03       1.456000e+03               1.456000e+03   
mean        5.000000e-01       5.000000e-01              -5.121216e-01   
std         3.536749e-01       3.536749e-01               2.113332e+02   
min         1.034423e-31       2.586058e-32              -3.012490e+02   
25%         1.257446e-01       1.257446e-01              -2.053190e+02   
50%         5.000000e-01       5.000000e-01               4.780627e-14   
75%         8.742554e-01       8.742554e-01               2.049800e+02   
max         1.000000e+00       1.000000e+00               2.990000e+02   

       week_humidity_lag_interaction  
count                   1.456000e+03  
mean                   -3.222482e-01  
std                     5.865851e+01  
min                    -9.617364e+01  
25%                    -5.799141e+01  
50%                    -1.429438e-14  
75%                    

In [17]:
# Boost temperature and humidity features

df['temp_binned'] = pd.cut(df['reanalysis_avg_temp_k'], bins=[0, 298, 303, float('inf')],
                           labels=['low', 'optimal', 'high'], include_lowest=True)
df = pd.get_dummies(df, columns=['temp_binned'], prefix='temp_binned')
df['humidity_precip_interaction'] = df['reanalysis_relative_humidity_percent'] * df['precip_composite']
print("\nAfter Temperature and Humidity Feature Boosting:")
print(df[['humidity_precip_interaction'] + [col for col in df.columns if 'temp_binned' in col]].describe())


After Temperature and Humidity Feature Boosting:
       humidity_precip_interaction  temp_binned_low  temp_binned_optimal  \
count                  1456.000000           1456.0               1456.0   
mean                   5249.879848              1.0                  0.0   
std                    4333.847491              0.0                  0.0   
min                    -243.274400              1.0                  0.0   
25%                    2165.715400              1.0                  0.0   
50%                    4240.456250              1.0                  0.0   
75%                    7050.985875              1.0                  0.0   
max                   27910.998286              1.0                  0.0   

       temp_binned_high  
count            1456.0  
mean                0.0  
std                 0.0  
min                 0.0  
25%                 0.0  
50%                 0.0  
75%                 0.0  
max                 0.0  


In [18]:
# Drop redundant features

numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
corr_matrix = df[numeric_cols].corr().abs()
high_corr = [(col1, col2) for col1 in corr_matrix.columns for col2 in corr_matrix.index
             if col1 != col2 and corr_matrix.loc[col2, col1] > 0.8]
print("\nHighly Correlated Features (>0.8):", high_corr)
# Prioritize dropping lower-importance features (based on prior report)
drop_candidates = ['reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k']
drop_cols = [col for col in drop_candidates if col in df.columns and col in numeric_cols]
df = df.drop(columns=drop_cols, errors='ignore')
print("\nDropped Columns:", drop_cols)


Highly Correlated Features (>0.8): [('ndvi_ne', 'ndvi_nw'), ('ndvi_nw', 'ndvi_ne'), ('ndvi_se', 'ndvi_sw'), ('ndvi_sw', 'ndvi_se'), ('ndvi_ce', 'total_cases'), ('ndvi_ce', 'ndvi_ce_log'), ('precipitation_amt_mm', 'reanalysis_sat_precip_amt_mm'), ('precipitation_ground_mm', 'total_cases'), ('precipitation_ground_mm', 'precip_composite'), ('precipitation_ground_mm', 'precip_temp_interaction'), ('precipitation_ground_mm', 'humidity_precip_interaction'), ('reanalysis_sat_precip_amt_mm', 'precipitation_amt_mm'), ('reanalysis_tdtr_k', 'station_diur_temp_rng_c'), ('station_diur_temp_rng_c', 'reanalysis_tdtr_k'), ('total_cases', 'ndvi_ce'), ('total_cases', 'precipitation_ground_mm'), ('total_cases', 'precip_temp_interaction'), ('precip_composite', 'precipitation_ground_mm'), ('precip_composite', 'precip_temp_interaction'), ('precip_composite', 'humidity_precip_interaction'), ('reanalysis_air_temp_k_lag1', 'reanalysis_air_temp_k_lag2'), ('reanalysis_air_temp_k_lag2', 'reanalysis_air_temp_k_lag

In [21]:
# Validate and Re-Create Binned Temperature Features

if 'reanalysis_avg_temp_k' not in df.columns:
    raise ValueError("Missing 'reanalysis_avg_temp_k' column required for binning.")
try:
    df['temp_binned'] = pd.cut(df['reanalysis_avg_temp_k'], bins=[0, 298, 303, float('inf')],
                               labels=['low', 'optimal', 'high'], include_lowest=True)
    df = pd.get_dummies(df, columns=['temp_binned'], prefix='temp_binned')
except Exception as e:
    print(f"Error creating temp_binned features: {e}")
    df['temp_binned_optimal'] = (df['reanalysis_avg_temp_k'] >= 298) & (df['reanalysis_avg_temp_k'] <= 303).astype(float)
print("\nAfter Binned Temperature Feature Creation:")
print(df[[col for col in df.columns if 'temp_binned' in col]].describe())


After Binned Temperature Feature Creation:
       temp_binned_low  temp_binned_low  temp_binned_low  temp_binned_optimal  \
count           1456.0           1456.0           1456.0               1456.0   
mean               1.0              1.0              1.0                  0.0   
std                0.0              0.0              0.0                  0.0   
min                1.0              1.0              1.0                  0.0   
25%                1.0              1.0              1.0                  0.0   
50%                1.0              1.0              1.0                  0.0   
75%                1.0              1.0              1.0                  0.0   
max                1.0              1.0              1.0                  0.0   

       temp_binned_optimal  temp_binned_optimal  temp_binned_high  \
count               1456.0               1456.0            1456.0   
mean                   0.0                  0.0               0.0   
std                

In [24]:
# Adaptive feature pruning

numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
corr_matrix = df[numeric_cols].corr().abs()
high_corr = [(col1, col2) for col1 in corr_matrix.columns for col2 in corr_matrix.index
             if col1 != col2 and corr_matrix.loc[col2, col1] > 0.8]
print("\nHighly Correlated Features (>0.8):", high_corr)
drop_candidates = ['reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k']
drop_cols = [col for col in drop_candidates if col in df.columns and col in numeric_cols]
df = df.drop(columns=drop_cols, errors='ignore')
print("\nDropped Columns:", drop_cols)


Highly Correlated Features (>0.8): [('ndvi_ne', 'ndvi_nw'), ('ndvi_nw', 'ndvi_ne'), ('ndvi_se', 'ndvi_sw'), ('ndvi_sw', 'ndvi_se'), ('ndvi_ce', 'total_cases'), ('ndvi_ce', 'ndvi_ce_log'), ('precipitation_amt_mm', 'reanalysis_sat_precip_amt_mm'), ('precipitation_ground_mm', 'total_cases'), ('precipitation_ground_mm', 'precip_composite'), ('precipitation_ground_mm', 'precip_temp_interaction'), ('precipitation_ground_mm', 'humidity_precip_interaction'), ('reanalysis_sat_precip_amt_mm', 'precipitation_amt_mm'), ('reanalysis_tdtr_k', 'station_diur_temp_rng_c'), ('station_diur_temp_rng_c', 'reanalysis_tdtr_k'), ('total_cases', 'ndvi_ce'), ('total_cases', 'precipitation_ground_mm'), ('total_cases', 'precip_temp_interaction'), ('precip_composite', 'precipitation_ground_mm'), ('precip_composite', 'precip_temp_interaction'), ('precip_composite', 'humidity_precip_interaction'), ('reanalysis_air_temp_k_lag1', 'reanalysis_air_temp_k_lag2'), ('reanalysis_air_temp_k_lag2', 'reanalysis_air_temp_k_lag

In [25]:
# Feature importance weighting

climatic_cols = ['reanalysis_relative_humidity_percent', 'reanalysis_avg_temp_k'] + \
                [f'reanalysis_relative_humidity_percent_lag{lag}' for lag in [1, 2, 3, 4]] + \
                [f'reanalysis_air_temp_k_lag{lag}' for lag in [1, 2, 3, 4]] + \
                [f'precip_temp_binned_lag{lag}_interaction' for lag in [1, 2, 3, 4]]
for col in climatic_cols:
    if col in df.columns:
        df[col] = df[col] * 1.5
if 'weekofyear_sin' in df.columns:
    df['weekofyear_sin'] = df['weekofyear_sin'] * 1.5
    df['weekofyear_cos'] = df['weekofyear_cos'] * 1.5
scaler = RobustScaler()
existing_climatic_cols = [col for col in climatic_cols if col in df.columns]
if existing_climatic_cols:
    df[existing_climatic_cols] = scaler.fit_transform(df[existing_climatic_cols])
print("\nAfter Feature Importance Weighting:")
print(df[['weekofyear_sin', 'weekofyear_cos'] + existing_climatic_cols].describe())


After Feature Importance Weighting:
       weekofyear_sin  weekofyear_cos  reanalysis_relative_humidity_percent  \
count    1.456000e+03    1.456000e+03                           1456.000000   
mean    -1.220025e-18   -4.361590e-17                              0.212623   
std      1.591537e+00    1.591537e+00                              0.800171   
min     -2.250000e+00   -2.250000e+00                             -1.318502   
25%     -1.540057e+00   -1.540057e+00                             -0.348468   
50%      3.618276e-16   -3.875729e-16                              0.000000   
75%      1.540057e+00    1.540057e+00                              0.651532   
max      2.250000e+00    2.250000e+00                              1.934960   

       reanalysis_avg_temp_k  reanalysis_relative_humidity_percent_lag1  \
count            1456.000000                                1456.000000   
mean               -0.032834                                   0.212542   
std                 0.6341

In [26]:
# Data quality validation

if 'ndvi_ce_log' in df.columns:
    invalid_ndvi = (df['ndvi_ce_log'].abs() > 5)
    df.loc[invalid_ndvi, 'ndvi_ce_log'] = df.loc[invalid_ndvi, ['ndvi_nw', 'ndvi_se', 'ndvi_sw']].mean(axis=1)
for col in ['reanalysis_relative_humidity_percent'] + [f'reanalysis_relative_humidity_percent_lag{lag}' for lag in [1, 2, 3, 4]]:
    if col in df.columns:
        invalid_humidity = (df[col] < 0) | (df[col] > 100)
        df.loc[invalid_humidity, col] = df[col].median()

print("\nAfter Data Quality Validation:")
print(df[['ndvi_ce_log', 'reanalysis_relative_humidity_percent']].describe())


After Data Quality Validation:
       ndvi_ce_log  reanalysis_relative_humidity_percent
count  1456.000000                           1456.000000
mean      0.187362                              0.416691
std       0.984186                              0.619157
min      -1.931670                              0.000000
25%      -0.387722                              0.000000
50%       0.000000                              0.000000
75%       0.612278                              0.651532
max       4.236416                              1.934960


In [27]:
# Save Advanced Feature Re-engineered dataset

df.to_csv('dengue_advanced.csv', index=False)
print("\nCorrected dataset saved as 'dengue_advanced.csv'")


Corrected dataset saved as 'dengue_advanced.csv'


In [28]:
# Split dataset by city and create datasets for each

if df['city'].nunique() > 1 and 'unknown' not in df['city'].unique():
    sj_df = df[df['city'] == 'sj'].drop(columns=['city'])
    iq_df = df[df['city'] == 'iq'].drop(columns=['city'])
    sj_df.to_csv('sj_advanced.csv', index=False)
    iq_df.to_csv('iq_advanced.csv', index=False)
    print("\nCity-specific datasets saved as 'sj_advanced.csv' and 'iq_advanced.csv'")
    datasets = [('San Juan', 'sj_advanced.csv'), ('Iquitos', 'iq_advanced.csv')]
else:
    datasets = [('Combined', 'dengue_advanced.csv')]
    print("\nProceeding with combined dataset due to missing city labels.")


City-specific datasets saved as 'sj_advanced.csv' and 'iq_advanced.csv'


In [29]:
# Preview the cleaned dataset

print("\nPreview of Cleaned Dataset")
print(df.head())


Preview of Cleaned Dataset
  city  weekofyear   ndvi_ne   ndvi_nw   ndvi_se   ndvi_sw   ndvi_ce  \
0   sj        18.0  0.495525  0.442308  0.406386  0.333333  1.272807   
1   sj        19.0  0.576892  0.519231  0.290276  0.282051  1.299936   
2   sj        20.0  0.349064  0.576923  0.290276  0.307692  1.072236   
3   sj        21.0  0.511798  0.730769  0.493469  0.487179  1.110727   
4   sj        22.0  0.625712  0.750000  0.551524  0.512821  1.064816   

   precipitation_amt_mm  precipitation_ground_mm  reanalysis_avg_temp_k  ...  \
0                 12.42                    13.06              -0.794872  ...   
1                 22.82                    30.44              -0.435897  ...   
2                 34.54                    35.77              -0.210256  ...   
3                 15.36                    23.45              -0.030769  ...   
4                  7.52                    18.16               0.189744  ...   

   temp_binned_low  temp_binned_optimal  temp_binned_high 