In [55]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from sklearn.feature_selection import SelectFromModel

# Load the data
train_features = pd.read_csv('dengue_features_train.csv')
train_labels = pd.read_csv('dengue_labels_train.csv')
test_features = pd.read_csv('dengue_features_test.csv')

# Merge training features and labels
train_data = pd.merge(train_features, train_labels, on=['city', 'year', 'weekofyear'])

# Function to preprocess data
def preprocess_data(data):
    # Convert date to datetime
    data['week_start_date'] = pd.to_datetime(data['week_start_date'])
    
    # Create new time-based features
    data['month'] = data['week_start_date'].dt.month
    data['day_of_year'] = data['week_start_date'].dt.dayofyear
    data['is_high_season'] = data['month'].isin([6, 7, 8, 9, 10, 11]).astype(int)
    
    # Create lag features
    for col in ['precipitation_amt_mm', 'reanalysis_specific_humidity_g_per_kg', 'reanalysis_dew_point_temp_k', 'station_avg_temp_c', 'station_min_temp_c']:
        data[f'{col}_lag1'] = data.groupby('city')[col].shift(1)
        data[f'{col}_lag2'] = data.groupby('city')[col].shift(2)
    
    # Create rolling mean features
    for col in ['precipitation_amt_mm', 'reanalysis_specific_humidity_g_per_kg', 'reanalysis_dew_point_temp_k', 'station_avg_temp_c', 'station_min_temp_c']:
        data[f'{col}_rolling_mean'] = data.groupby('city')[col].rolling(window=4, min_periods=1).mean().reset_index(0, drop=True)
    
    # Drop non-numeric columns
    columns_to_drop = ['week_start_date']
    data = data.drop(columns=columns_to_drop)
    
    # Create a copy of 'city' column before encoding
    data['city_original'] = data['city']
    
    # Convert 'city' to numeric
    data['city'] = data['city'].map({'sj': 0, 'iq': 1})
    
    return data

# Preprocess train and test data
train_data = preprocess_data(train_data)
test_features = preprocess_data(test_features)

# Separate features and target
X = train_data.drop(['total_cases', 'city_original'], axis=1)
y = train_data['total_cases']

print("Done")

Done


In [57]:
# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
test_imputed = pd.DataFrame(imputer.transform(test_features.drop('city_original', axis=1)), columns=X.columns)

# Scale features
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X_imputed), columns=X_imputed.columns)
test_scaled = pd.DataFrame(scaler.transform(test_imputed), columns=test_imputed.columns)

In [59]:
# Feature selection
selector = SelectFromModel(estimator=RandomForestRegressor(n_estimators=100, random_state=42))
selector.fit(X_scaled, y)
X_selected = selector.transform(X_scaled)
test_selected = selector.transform(test_scaled)

# Split the data
X_train, X_val, y_train, y_val = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Define models
rf_model = RandomForestRegressor(random_state=42)
xgb_model = XGBRegressor(random_state=42)

# Define parameter grids
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5]
}

xgb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.1]
}

print("Done")

Done


In [60]:
# Perform grid search
rf_grid_search = GridSearchCV(rf_model, rf_param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
rf_grid_search.fit(X_train, y_train)
print("Done")

xgb_grid_search = GridSearchCV(xgb_model, xgb_param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
xgb_grid_search.fit(X_train, y_train)
print("Done")

Done
Done


In [63]:
# Make predictions on validation set
rf_val_predictions = rf_grid_search.best_estimator_.predict(X_val)
xgb_val_predictions = xgb_grid_search.best_estimator_.predict(X_val)
print("Done")

Done


In [65]:
# Calculate MAE for both models
rf_mae = mean_absolute_error(y_val, rf_val_predictions)
xgb_mae = mean_absolute_error(y_val, xgb_val_predictions)

print(f"Random Forest Validation MAE: {rf_mae}")
print(f"XGBoost Validation MAE: {xgb_mae}")
print("Done")

Random Forest Validation MAE: 13.626986301369863
XGBoost Validation MAE: 11.720913735460746
Done


In [70]:
# Choose the best model
best_model = rf_grid_search.best_estimator_ if rf_mae < xgb_mae else xgb_grid_search.best_estimator_

# Make predictions on test data
test_predictions = best_model.predict(test_selected)

# Create submission dataframe
submission = pd.DataFrame({
    'city': test_features['city_original'],  # Use the original city names
    'year': test_features['year'],
    'weekofyear': test_features['weekofyear'],
    'total_cases': np.round(test_predictions).astype(int)
})

# Save predictions to CSV
submission.to_csv('dengue_predictions.csv', index=False)

print("Predictions saved to 'dengue_predictions.csv'")

Predictions saved to 'dengue_predictions.csv'
