In [17]:
# Data manipulation and analysis
# Import necessary libraries
import pandas as pd            # For data handling
from sklearn.model_selection import train_test_split  # To split data into train and test sets
from sklearn.ensemble import RandomForestRegressor   # RandomForest model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score  # For model evaluation


In [18]:
# Read the full incident table from the Excel workbook
data = pd.read_excel('Sample_Input_Data_2000_2010.xlsx')

# Chronological hold-out: 2000-2008 (inclusive) for training, 2009-2010 (inclusive) for testing
train_data = data[data['Year'].ge(2000) & data['Year'].le(2008)]
test_data = data[data['Year'].ge(2009) & data['Year'].le(2010)]

# Name of the column the model is asked to predict
TARGET_COLUMN = 'Traffic Incidents'

# Features are every column except the target; the target is kept as its own Series.
# NOTE(review): 'Incident_ID' and 'Year' stay in the feature matrix. 'Incident_ID'
# looks like a row identifier and the test years lie outside the training range --
# confirm whether these columns should really be used as predictors.
X_train, y_train = train_data.drop(columns=[TARGET_COLUMN]), train_data[TARGET_COLUMN]
X_test, y_test = test_data.drop(columns=[TARGET_COLUMN]), test_data[TARGET_COLUMN]


In [19]:
# Report the per-column count of missing values for each feature matrix.
# The second header starts with a newline so the two reports are visually separated.
for header, features in (
    ("Missing values in X_train:", X_train),
    ("\nMissing values in X_test:", X_test),
):
    print(header)
    print(features.isnull().sum())



Missing values in X_train:
Incident_ID                   0
Month                         0
Day                           0
Hour                          0
Latitude                      0
Longitude                     0
Sector                        0
Year                          0
Traffic Signals               0
Street Centreline             0
RTM Road Network Scenarios    0
Land Use Designation Codes    0
Citywide Land Cover           0
Public Trees                  0
Climate/Weather Data          0
Community Services            0
Incident_Type                 0
Severity                      0
dtype: int64

Missing values in X_test:
Incident_ID                   0
Month                         0
Day                           0
Hour                          0
Latitude                      0
Longitude                     0
Sector                        0
Year                          0
Traffic Signals               0
Street Centreline             0
RTM Road Network Scenarios    0
Land 

In [20]:
# One-hot encode categorical variables.
# The training set defines the feature space: drop_first=True removes the first
# level of every categorical column so the indicator columns are not redundant.
X_train = pd.get_dummies(X_train, drop_first=True)

# Encode the test set WITHOUT drop_first, then align it to the training columns.
# Dropping the first level independently on the test set is only correct when both
# splits contain exactly the same levels. Otherwise a different level is dropped
# in the test set, reindex re-creates that column as all zeros, and the affected
# rows are silently encoded as the training baseline level.
# Aligning to X_train.columns discards the training baseline column and any level
# never seen during training, and zero-fills levels that are absent from the test set.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)


In [21]:
# Hyper-parameters for the forest: 100 trees, fixed seed so repeated runs match
forest_params = {"n_estimators": 100, "random_state": 42}

# Build the regressor from the parameter set above
rf_model = RandomForestRegressor(**forest_params)

# Fit on the encoded training features and the training target
rf_model.fit(X_train, y_train)


In [22]:
# Predict the target for every row of the held-out test features
y_pred = rf_model.predict(X_test)

# Score the predictions against the true test targets with each metric in turn
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Emit the report, one metric per line.
# Note: a negative R² (as in the recorded run) means the predictions score worse
# than always predicting the mean of y_test.
print(
    "Model Performance on Test Data:",
    f"Mean Absolute Error (MAE): {mae}",
    f"Mean Squared Error (MSE): {mse}",
    f"R² Score: {r2}",
    sep="\n",
)



Model Performance on Test Data:
Mean Absolute Error (MAE): 5.232708333333334
Mean Squared Error (MSE): 34.58362916666667
R² Score: -0.035005589995551034


In [23]:
# Line up year, true value and model prediction side by side.
# Raw arrays are used so the frame gets a fresh 0..n-1 index instead of the
# original row labels carried by test_data / y_test.
results = pd.DataFrame(
    dict(
        Year=test_data['Year'].values,
        Actual=y_test.values,
        Predicted=y_pred,
    )
)

# Show the first ten rows of the comparison
print(results.iloc[:10])


   Year  Actual  Predicted
0  2009       5      10.06
1  2009      11      10.00
2  2009      18      10.21
3  2009      13       9.56
4  2009      18       9.62
5  2009      17       9.15
6  2009      19      10.02
7  2009      12      11.46
8  2009      13       9.71
9  2009       3      10.33
