# D. Predictive Analytics - Dataset optimization

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler

In [2]:
df = pd.read_csv('dengue_advanced.csv')

In [3]:
climatic_cols = ['reanalysis_relative_humidity_percent', 'reanalysis_avg_temp_k'] + \
                [f'reanalysis_relative_humidity_percent_lag{lag}' for lag in [1, 2, 3, 4]] + \
                [f'reanalysis_air_temp_k_lag{lag}' for lag in [1, 2, 3, 4]] + \
                [f'precip_temp_binned_lag{lag}_interaction' for lag in [1, 2, 3, 4]]
for col in climatic_cols:
    if col in df.columns:
        df[col] = df[col] * 2.0  # Higher weight for climatic features
if 'weekofyear_sin' in df.columns:
    df['weekofyear_sin'] = df['weekofyear_sin'] * 1.5
    df['weekofyear_cos'] = df['weekofyear_cos'] * 1.5
if 'ndvi_ce_log' in df.columns:
    df['ndvi_ce_log'] = df['ndvi_ce_log'] * 0.5  # Reduce NDVI dominance
scaler = RobustScaler()
existing_cols = [col for col in climatic_cols + ['ndvi_ce_log', 'weekofyear_sin', 'weekofyear_cos'] if col in df.columns]
if existing_cols:
    df[existing_cols] = scaler.fit_transform(df[existing_cols])
print("\nAfter Feature Importance Re-Balancing:")
print(df[['weekofyear_sin', 'weekofyear_cos', 'reanalysis_relative_humidity_percent']].describe())


After Feature Importance Re-Balancing:
       weekofyear_sin  weekofyear_cos  reanalysis_relative_humidity_percent
count    1.456000e+03    1.456000e+03                           1456.000000
mean    -1.230605e-16    6.717764e-17                              0.639555
std      5.167137e-01    5.167137e-01                              0.950309
min     -7.304925e-01   -7.304925e-01                              0.000000
25%     -5.000000e-01   -5.000000e-01                              0.000000
50%      0.000000e+00    0.000000e+00                              0.000000
75%      5.000000e-01    5.000000e-01                              1.000000
max      7.304925e-01    7.304925e-01                              2.969861


In [4]:
if 'ndvi_ce_log' in df.columns:
    invalid_ndvi = (df['ndvi_ce_log'].abs() > 5)
    df.loc[invalid_ndvi, 'ndvi_ce_log'] = df.loc[invalid_ndvi, ['ndvi_nw', 'ndvi_se', 'ndvi_sw']].mean(axis=1)
for col in ['reanalysis_relative_humidity_percent'] + [f'reanalysis_relative_humidity_percent_lag{lag}' for lag in [1, 2, 3, 4]]:
    if col in df.columns:
        invalid_humidity = (df[col] < 0) | (df[col] > 100)
        df.loc[invalid_humidity, col] = df[col].median()
print("\nAfter Data Quality Validation:")
print(df[['ndvi_ce_log', 'reanalysis_relative_humidity_percent']].describe())


After Data Quality Validation:
       ndvi_ce_log  reanalysis_relative_humidity_percent
count  1456.000000                           1456.000000
mean      0.187362                              0.639555
std       0.984186                              0.950309
min      -1.931670                              0.000000
25%      -0.387722                              0.000000
50%       0.000000                              0.000000
75%       0.612278                              1.000000
max       4.236416                              2.969861


In [5]:
df.to_csv('dengue_optimized.csv', index=False)
print("\nOptimized dataset saved as 'dengue_optimized.csv'")


Optimized dataset saved as 'dengue_optimized.csv'
