In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Load the data
train_features = pd.read_csv('dengue_features_train.csv')
train_labels = pd.read_csv('dengue_labels_train.csv')
test_features = pd.read_csv('dengue_features_test.csv')

# Merge training features and labels
train_data = pd.merge(train_features, train_labels, on=['city', 'year', 'weekofyear'])

# Function to preprocess data
def preprocess_data(data):
    # Convert date to datetime
    data['week_start_date'] = pd.to_datetime(data['week_start_date'])
    
    # Create new time-based features
    data['month'] = data['week_start_date'].dt.month
    data['day_of_year'] = data['week_start_date'].dt.dayofyear
    
    # Drop non-numeric columns
    columns_to_drop = ['week_start_date']
    data = data.drop(columns=columns_to_drop)
    
    # Create a copy of 'city' column before encoding
    data['city_name'] = data['city']
    
    # Convert 'city' to numeric
    data['city'] = data['city'].map({'sj': 0, 'iq': 1})
    
    return data

# Preprocess train and test data
train_data = preprocess_data(train_data)
test_features = preprocess_data(test_features)

# Separate features and target
X = train_data.drop(['total_cases', 'city_name'], axis=1)
y = train_data['total_cases']

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
test_imputed = pd.DataFrame(imputer.transform(test_features.drop('city_name', axis=1)), columns=X.columns)

# Scale features
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X_imputed), columns=X_imputed.columns)
test_scaled = pd.DataFrame(scaler.transform(test_imputed), columns=test_imputed.columns)

# Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_scaled, y)

# Make predictions on test data
predictions = model.predict(test_scaled)

# Create submission dataframe
submission = pd.DataFrame({
    'city': test_features['city_name'],
    'year': test_features['year'],
    'weekofyear': test_features['weekofyear'],
    'total_cases': predictions.astype(int)
})

# Save predictions to CSV
submission.to_csv('dengue_predictions.csv', index=False)

print("Predictions saved to 'dengue_predictions.csv'")

# Optional: Evaluate model on a validation set
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
model_val = RandomForestRegressor(n_estimators=100, random_state=42)
model_val.fit(X_train, y_train)
val_predictions = model_val.predict(X_val)
mae = mean_absolute_error(y_val, val_predictions)
print(f"Validation Mean Absolute Error: {mae}")


Predictions saved to 'dengue_predictions.csv'
Validation Mean Absolute Error: 14.45791095890411
