In [57]:
import pandas as pd

df = pd.read_csv('/content/dengue-dataset-with-alert-epidemic.csv')
display(df.head())

Unnamed: 0,CITY,YEAR_WEEK,CASES,DEATHS,RAINFALL,TMAX,TMIN,TMEAN,RH,SUNSHINE,...,TMEAN_roll2_sum,TMEAN_roll4_sum,RH_roll2_mean,RH_roll4_mean,RH_roll2_sum,RH_roll4_sum,INCIDENCE_per_100k,RISK_LEVEL,ALERT,EPIDEMIC
0,CALOOCAN CITY,2016-W02,27,0,0.0,32.0,21.8,26.9,73.0,6.4,...,,,,,,,1.690776,Low,False,False
1,CALOOCAN CITY,2016-W03,19,0,0.0,32.3,23.0,27.65,67.0,8.3,...,,,,,,,1.189623,Low,False,False
2,CALOOCAN CITY,2016-W04,43,0,0.0,30.6,23.8,27.2,65.0,3.9,...,54.55,,70.0,,140.0,,2.691891,Moderate,False,False
3,CALOOCAN CITY,2016-W05,30,0,0.0,32.2,22.6,27.4,67.0,6.4,...,54.85,,66.0,,132.0,,1.877776,Low,False,False
4,CALOOCAN CITY,2016-W06,28,0,0.0,28.3,19.4,23.85,70.0,1.6,...,54.6,109.15,66.0,68.0,132.0,272.0,1.752322,Low,False,False


In [58]:
df.shape

(4403, 65)

In [59]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd # Ensure pandas is imported

# 1. Identify and handle missing values
# Check for missing values
print("Missing values before handling:")
print(df.isnull().sum())

# Impute missing numerical values with the mean
numerical_cols = df.select_dtypes(include=np.number).columns
for col in numerical_cols:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].mean())

# For categorical columns with missing values, imputation with mode is a common strategy.
categorical_cols = df.select_dtypes(include='object').columns
for col in categorical_cols:
     if df[col].isnull().any():
         df[col] = df[col].fillna(df[col].mode()[0])

print("\nMissing values after handling all types:")
print(df.isnull().sum())


# 2. Identify categorical features and apply appropriate encoding
# 'CITY' is categorical. 'RISK_LEVEL' is categorical with defined levels. 'ALERT' and 'EPIDEMIC' are boolean.

categorical_cols_to_encode = []
# Define the categories for 'RISK_LEVEL' including 'High' and 'Very High' as observed in the data
risk_level_categories = ['Low', 'Moderate', 'High', 'Very High']

if 'RISK_LEVEL' in df.columns:
    df['RISK_LEVEL'] = pd.Categorical(df['RISK_LEVEL'], categories=risk_level_categories, ordered=False)
    categorical_cols_to_encode.append('RISK_LEVEL')
else:
    print("'RISK_LEVEL' column not found. Assuming it has already been one-hot encoded.")

if 'CITY' in df.columns:
    categorical_cols_to_encode.append('CITY')
else:
    print("'CITY' column not found. Assuming it has already been one-hot encoded.")


if categorical_cols_to_encode:
    # Use get_dummies without drop_first=True to keep all risk level columns for classification
    df = pd.get_dummies(df, columns=categorical_cols_to_encode, drop_first=False)


# Convert boolean columns to integer (0 or 1)
# Keep ALERT and EPIDEMIC as features, not targets
if 'ALERT' in df.columns:
    df['ALERT'] = df['ALERT'].astype(int)
else:
     print("'ALERT' column not found. Assuming it has already been converted to int.")

if 'EPIDEMIC' in df.columns:
    df['EPIDEMIC'] = df['EPIDEMIC'].astype(int)
else:
    print("'EPIDEMIC' column not found. Assuming it has already been converted to int.")

# Convert 'YEAR_WEEK' to numerical format YYYYww
# Handle potential errors during conversion
def convert_year_week_to_numerical(year_week_str):
    try:
        year_str, week_str = year_week_str.split('-W')
        return int(year_str) * 100 + int(week_str)
    except:
        return np.nan # Return NaN for any conversion errors

if 'YEAR_WEEK' in df.columns:
    df['YEAR_WEEK_numerical'] = df['YEAR_WEEK'].apply(convert_year_week_to_numerical)
    # Drop the original 'YEAR_WEEK' column
    df = df.drop('YEAR_WEEK', axis=1, errors='ignore') # Add errors='ignore'

    # Impute any NaNs created during numerical conversion of YEAR_WEEK
    if df['YEAR_WEEK_numerical'].isnull().any():
        df['YEAR_WEEK_numerical'] = df['YEAR_WEEK_numerical'].fillna(df['YEAR_WEEK_numerical'].mean())
else:
    print("'YEAR_WEEK' column not found. Assuming 'YEAR_WEEK_numerical' is available.")


# 3. Separate the target variable from the features
# Set only the one-hot encoded 'RISK_LEVEL' columns as the target for classification.
risk_level_cols = [col for col in df.columns if 'RISK_LEVEL_' in col]
target_cols = risk_level_cols # Only include RISK_LEVEL columns in target

# Ensure all target columns exist in the DataFrame
target_cols_existing = [col for col in target_cols if col in df.columns]
if len(target_cols_existing) != len(target_cols):
    missing_targets = list(set(target_cols) - set(target_cols_existing))
    print(f"Warning: The following target columns were not found in the DataFrame: {missing_targets}")
    target_cols = target_cols_existing # Update target_cols to only include existing columns

y_classification = df[target_cols]

# Separate the features for classification
# Drop the original 'CASES', the target columns, 'ALERT', and 'EPIDEMIC' from features
cols_to_drop_from_features = ['CASES'] + target_cols + ['ALERT', 'EPIDEMIC']
cols_to_drop_from_features_existing = [col for col in cols_to_drop_from_features if col in df.columns]
X_classification = df.drop(cols_to_drop_from_features_existing, axis=1)


# Identify and remove any remaining non-numerical columns from X_classification,
# except for the boolean columns from 'CITY' one-hot encoding which are already numerical (0 or 1).
non_numerical_cols_in_X_classification = X_classification.select_dtypes(exclude=np.number).columns
if len(non_numerical_cols_in_X_classification) > 0:
    print(f"\nRemoving non-numerical columns from features: {list(non_numerical_cols_in_X_classification)}")
    X_classification = X_classification.drop(non_numerical_cols_in_X_classification, axis=1, errors='ignore') # Add errors='ignore'


# Print columns in X_classification after removing non-numerical ones
print("\nColumns in X_classification after removing non-numerical columns:")
print(X_classification.columns)


# Scale numerical features in X_classification
# Identify numerical columns in X_classification after dropping any non-numerical ones
numerical_cols_classification = X_classification.select_dtypes(include=np.number).columns
if len(numerical_cols_classification) > 0:
    scaler_classification = StandardScaler()
    X_classification[numerical_cols_classification] = scaler_classification.fit_transform(X_classification[numerical_cols_classification])
else:
    print("No numerical columns found in X_classification to scale.")


# 4. Split the data into training and testing sets
X_train_classification, X_test_classification, y_train_classification, y_test_classification = train_test_split(
    X_classification, y_classification, test_size=0.2, random_state=42
)

# Reshape data for CNN-LSTM input (samples, timesteps, features)
# Since we are treating each row as a single timestep with multiple features,
# we reshape to (samples, 1, features)
X_train_classification_reshaped = X_train_classification.values.reshape((X_train_classification.shape[0], 1, X_train_classification.shape[1]))
X_test_classification_reshaped = X_test_classification.values.reshape((X_test_classification.shape[0], 1, X_test_classification.shape[1]))


print("\nShape of training features for classification:", X_train_classification_reshaped.shape)
print("Shape of testing features for classification:", X_test_classification_reshaped.shape)
print("Shape of training target for classification:", y_train_classification.shape)
print("Shape of testing target for classification:", y_test_classification.shape)
print("\nTarget columns after one-hot encoding:", y_classification.columns.tolist())

Missing values before handling:
CITY                   0
YEAR_WEEK              0
CASES                  0
DEATHS                 0
RAINFALL               0
                      ..
RH_roll4_sum          68
INCIDENCE_per_100k     0
RISK_LEVEL             0
ALERT                  0
EPIDEMIC               0
Length: 65, dtype: int64

Missing values after handling all types:
CITY                  0
YEAR_WEEK             0
CASES                 0
DEATHS                0
RAINFALL              0
                     ..
RH_roll4_sum          0
INCIDENCE_per_100k    0
RISK_LEVEL            0
ALERT                 0
EPIDEMIC              0
Length: 65, dtype: int64

Removing non-numerical columns from features: ['CITY_CALOOCAN CITY', 'CITY_LAS PINAS CITY', 'CITY_MAKATI CITY', 'CITY_MALABON CITY', 'CITY_MANDALUYONG CITY', 'CITY_MANILA CITY', 'CITY_MARIKINA CITY', 'CITY_MUNTINLUPA CITY', 'CITY_NAVOTAS CITY', 'CITY_PARANAQUE CITY', 'CITY_PASAY CITY', 'CITY_PASIG CITY', 'CITY_PATEROS', 'CITY_QUEZON C

In [60]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, LSTM, Dense, Dropout, MaxPooling1D, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Define the CNN-LSTM model for classification
model_classification = Sequential()

# CNN layers - Input shape should match X_train_classification_reshaped
model_classification.add(Conv1D(filters=64, kernel_size=1, activation='relu', input_shape=(X_train_classification_reshaped.shape[1], X_train_classification_reshaped.shape[2])))
model_classification.add(MaxPooling1D(pool_size=1))
model_classification.add(Dropout(0.2))

# LSTM layers
model_classification.add(LSTM(50, return_sequences=True))
model_classification.add(Dropout(0.2))
model_classification.add(LSTM(50))
model_classification.add(Dropout(0.2))

# Dense layers
model_classification.add(Dense(50, activation='relu'))
# Output layer for multi-output classification (4 for RISK_LEVEL)
# Use 'sigmoid' activation for multi-label classification
model_classification.add(Dense(y_classification.shape[1], activation='sigmoid'))

# Print the model summary
model_classification.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [61]:
print(X_classification.columns.tolist())

['DEATHS', 'RAINFALL', 'TMAX', 'TMIN', 'TMEAN', 'RH', 'SUNSHINE', 'POPULATION', 'LAND AREA', 'POP_DENSITY', 'CASES_lag1', 'CASES_lag2', 'CASES_lag3', 'CASES_lag4', 'DEATHS_lag1', 'DEATHS_lag2', 'DEATHS_lag3', 'DEATHS_lag4', 'RAINFALL_lag1', 'RAINFALL_lag2', 'RAINFALL_lag3', 'RAINFALL_lag4', 'TMAX_lag1', 'TMAX_lag2', 'TMAX_lag3', 'TMAX_lag4', 'TMIN_lag1', 'TMIN_lag2', 'TMIN_lag3', 'TMIN_lag4', 'TMEAN_lag1', 'TMEAN_lag2', 'TMEAN_lag3', 'TMEAN_lag4', 'RH_lag1', 'RH_lag2', 'RH_lag3', 'RH_lag4', 'SUNSHINE_lag1', 'SUNSHINE_lag2', 'SUNSHINE_lag3', 'SUNSHINE_lag4', 'CASES_roll2_mean', 'CASES_roll4_mean', 'CASES_roll2_sum', 'CASES_roll4_sum', 'RAINFALL_roll2_mean', 'RAINFALL_roll4_mean', 'RAINFALL_roll2_sum', 'RAINFALL_roll4_sum', 'TMEAN_roll2_mean', 'TMEAN_roll4_mean', 'TMEAN_roll2_sum', 'TMEAN_roll4_sum', 'RH_roll2_mean', 'RH_roll4_mean', 'RH_roll2_sum', 'RH_roll4_sum', 'INCIDENCE_per_100k', 'YEAR_WEEK_numerical']


In [62]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Compile the classification model
# Use binary_crossentropy for multi-label classification with sigmoid activation
model_classification.compile(optimizer=Adam(learning_rate=0.001),
                             loss='binary_crossentropy',
                             metrics=['accuracy'])

print("Classification model compilation complete.")

# Train the classification model
# Using EarlyStopping to prevent overfitting
early_stopping_classification = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

history_classification = model_classification.fit(X_train_classification_reshaped, y_train_classification,
                                                  epochs=300, batch_size=32, validation_split=0.2,
                                                  callbacks=[early_stopping_classification])

print("Classification model training complete.")

Classification model compilation complete.
Epoch 1/300
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - accuracy: 0.4422 - loss: 0.6135 - val_accuracy: 0.5461 - val_loss: 0.4074
Epoch 2/300
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.6047 - loss: 0.3745 - val_accuracy: 0.6922 - val_loss: 0.3114
Epoch 3/300
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.7122 - loss: 0.3036 - val_accuracy: 0.7376 - val_loss: 0.2675
Epoch 4/300
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7275 - loss: 0.2786 - val_accuracy: 0.7546 - val_loss: 0.2487
Epoch 5/300
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.7630 - loss: 0.2489 - val_accuracy: 0.7915 - val_loss: 0.2275
Epoch 6/300
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.7878 - loss: 0.2330 - val_accuracy: 0.7957 - val_los

In [63]:
df.head()

Unnamed: 0,CASES,DEATHS,RAINFALL,TMAX,TMIN,TMEAN,RH,SUNSHINE,POPULATION,LAND AREA,...,CITY_NAVOTAS CITY,CITY_PARANAQUE CITY,CITY_PASAY CITY,CITY_PASIG CITY,CITY_PATEROS,CITY_QUEZON CITY,CITY_SAN JUAN CITY,CITY_TAGUIG CITY,CITY_VALENZUELA CITY,YEAR_WEEK_numerical
0,27,0,0.0,32.0,21.8,26.9,73.0,6.4,1596900,55.8,...,False,False,False,False,False,False,False,False,False,201602
1,19,0,0.0,32.3,23.0,27.65,67.0,8.3,1597145,55.8,...,False,False,False,False,False,False,False,False,False,201603
2,43,0,0.0,30.6,23.8,27.2,65.0,3.9,1597390,55.8,...,False,False,False,False,False,False,False,False,False,201604
3,30,0,0.0,32.2,22.6,27.4,67.0,6.4,1597635,55.8,...,False,False,False,False,False,False,False,False,False,201605
4,28,0,0.0,28.3,19.4,23.85,70.0,1.6,1597880,55.8,...,False,False,False,False,False,False,False,False,False,201606


In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4403 entries, 0 to 4402
Data columns (total 84 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   CASES                  4403 non-null   int64  
 1   DEATHS                 4403 non-null   int64  
 2   RAINFALL               4403 non-null   float64
 3   TMAX                   4403 non-null   float64
 4   TMIN                   4403 non-null   float64
 5   TMEAN                  4403 non-null   float64
 6   RH                     4403 non-null   float64
 7   SUNSHINE               4403 non-null   float64
 8   POPULATION             4403 non-null   int64  
 9   LAND AREA              4403 non-null   float64
 10  POP_DENSITY            4403 non-null   int64  
 11  CASES_lag1             4403 non-null   float64
 12  CASES_lag2             4403 non-null   float64
 13  CASES_lag3             4403 non-null   float64
 14  CASES_lag4             4403 non-null   float64
 15  DEAT

In [65]:
# Train the classification model
# Using EarlyStopping to prevent overfitting
early_stopping_classification = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

history_classification = model_classification.fit(X_train_classification_reshaped, y_train_classification,
                                                  epochs=300, batch_size=32, validation_split=0.2,
                                                  callbacks=[early_stopping_classification])

print("Classification model training complete.")

Epoch 1/300
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8764 - loss: 0.1460 - val_accuracy: 0.8397 - val_loss: 0.1651
Epoch 2/300
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8694 - loss: 0.1467 - val_accuracy: 0.8525 - val_loss: 0.1596
Epoch 3/300
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8843 - loss: 0.1370 - val_accuracy: 0.8482 - val_loss: 0.1704
Epoch 4/300
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.8778 - loss: 0.1472 - val_accuracy: 0.8340 - val_loss: 0.1779
Epoch 5/300
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.8771 - loss: 0.1426 - val_accuracy: 0.8426 - val_loss: 0.1650
Epoch 6/300
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8880 - loss: 0.1273 - val_accuracy: 0.8369 - val_loss: 0.1756
Epoch 7/300
[1m89/89[0m [32m━

In [66]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Evaluate the classification model
loss_classification, accuracy_classification = model_classification.evaluate(X_test_classification_reshaped, y_test_classification, verbose=0)
print(f'Test Loss (Categorical Crossentropy): {loss_classification:.4f}')
print(f'Test Accuracy: {accuracy_classification:.4f}')

# Make predictions on the testing data
predictions_classification = model_classification.predict(X_test_classification_reshaped)

# Convert predicted probabilities to binary labels using a threshold (e.g., 0.5)
predicted_labels = (predictions_classification > 0.5).astype(int)

# The actual labels are already in the correct format
actual_labels = y_test_classification.values

target_names_classification = y_test_classification.columns.tolist()
print("\nClassification Report:")
print(classification_report(actual_labels, predicted_labels, target_names=target_names_classification))


print("\nConfusion Matrix:")
# Confusion matrix for multi-label classification can be computed for each label
# separately or as a single matrix if flattened. Let's compute for each label.
# Reshape actual_labels and predicted_labels to be 1D for confusion matrix
# Or, iterate through each label column to get individual confusion matrices
print("Confusion matrices for each target label:")
for i, target_name in enumerate(target_names_classification):
    print(f"\nConfusion Matrix for {target_name}:")
    print(confusion_matrix(actual_labels[:, i], predicted_labels[:, i]))

Test Loss (Categorical Crossentropy): 0.1649
Test Accuracy: 0.8593
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step

Classification Report:
                      precision    recall  f1-score   support

      RISK_LEVEL_Low       0.94      0.94      0.94       452
 RISK_LEVEL_Moderate       0.77      0.77      0.77       217
     RISK_LEVEL_High       0.74      0.81      0.78       158
RISK_LEVEL_Very High       0.83      0.65      0.73        54

           micro avg       0.86      0.86      0.86       881
           macro avg       0.82      0.79      0.80       881
        weighted avg       0.86      0.86      0.86       881
         samples avg       0.86      0.86      0.86       881


Confusion Matrix:
Confusion matrices for each target label:

Confusion Matrix for RISK_LEVEL_Low:
[[403  26]
 [ 25 427]]

Confusion Matrix for RISK_LEVEL_Moderate:
[[613  51]
 [ 50 167]]

Confusion Matrix for RISK_LEVEL_High:
[[679  44]
 [ 30 128]]

Confusion Matrix for RI

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [69]:
# Display sample predictions
print("\nSample Classification Predictions vs Actuals:")
sample_size = 10
for i in range(sample_size):
    print(f"\nSample {i+1}:")
    predicted_sample = predicted_labels[i]
    actual_sample = actual_labels[i]

    print("Predicted:")
    for j, target_name in enumerate(target_names_classification):
        print(f"  {target_name}: {predicted_sample[j]}")

    print("Actual:")
    for j, target_name in enumerate(target_names_classification):
        print(f"  {target_name}: {actual_sample[j]}")



Sample Classification Predictions vs Actuals:

Sample 1:
Predicted:
  RISK_LEVEL_Low: 1
  RISK_LEVEL_Moderate: 0
  RISK_LEVEL_High: 0
  RISK_LEVEL_Very High: 0
Actual:
  RISK_LEVEL_Low: True
  RISK_LEVEL_Moderate: False
  RISK_LEVEL_High: False
  RISK_LEVEL_Very High: False

Sample 2:
Predicted:
  RISK_LEVEL_Low: 1
  RISK_LEVEL_Moderate: 0
  RISK_LEVEL_High: 0
  RISK_LEVEL_Very High: 0
Actual:
  RISK_LEVEL_Low: True
  RISK_LEVEL_Moderate: False
  RISK_LEVEL_High: False
  RISK_LEVEL_Very High: False

Sample 3:
Predicted:
  RISK_LEVEL_Low: 0
  RISK_LEVEL_Moderate: 1
  RISK_LEVEL_High: 0
  RISK_LEVEL_Very High: 0
Actual:
  RISK_LEVEL_Low: False
  RISK_LEVEL_Moderate: False
  RISK_LEVEL_High: True
  RISK_LEVEL_Very High: False

Sample 4:
Predicted:
  RISK_LEVEL_Low: 1
  RISK_LEVEL_Moderate: 0
  RISK_LEVEL_High: 0
  RISK_LEVEL_Very High: 0
Actual:
  RISK_LEVEL_Low: True
  RISK_LEVEL_Moderate: False
  RISK_LEVEL_High: False
  RISK_LEVEL_Very High: False

Sample 5:
Predicted:
  RISK_LEVEL_Lo

In [73]:
import os

# Define the path to save the model as a .keras file
model_save_path = "/content/dengue_classification_model.keras"

# Save the classification model in .keras format
model_classification.save(model_save_path)

print(f"Classification model saved successfully to: {model_save_path}")

Classification model saved successfully to: /content/dengue_classification_model.keras
