# random forest model for homeless rate prediction

In [None]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

In [None]:
# Load the training data (VERSION 2)
train_data = pd.read_csv("train.csv")

# The target is pulled out first; every column other than the row
# identifier and the target is kept as a model feature.
target_column = 'HOMELESS_RATE'
y = train_data[target_column]
X = train_data.drop(columns=['ID', target_column])

print("Data shape:", X.shape)
print("\nTarget statistics:")

# Summary statistics of the target, printed in a fixed order
target_summary = (
    ("Mean", y.mean()),
    ("Std", y.std()),
    ("Min", y.min()),
    ("Max", y.max()),
    ("Median", y.median()),
)
for stat_name, stat_value in target_summary:
    print(f"  {stat_name}: {stat_value:.6f}")

In [None]:
# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

for caption, feature_frame in (
    ("\nTraining set size:", X_train),
    ("Validation set size:", X_val),
):
    print(caption, feature_frame.shape)

In [None]:
# Random forest baseline, scored with 5-fold cross-validation on the training split
print("\nTraining Random Forest Model...")

rf_model = RandomForestRegressor(n_estimators=50, random_state=42)

# Shuffled folds with a fixed seed so the fold assignment is reproducible
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    rf_model,
    X_train,
    y_train,
    cv=cv,
    scoring="neg_mean_squared_error",
)

# The scorer returns negated MSE, so flip the sign to get per-fold MSE
cv_mse = -cv_scores
mean_cv_mse = np.mean(cv_mse)

cv_report = [
    "\nRandom Forest Cross-validation:",
    f"  CV MSE scores: {cv_mse}",
    f"  Average CV MSE: {mean_cv_mse:.10f}",
    # RMSE here is the square root of the fold-averaged MSE
    f"  Average CV RMSE: {np.sqrt(mean_cv_mse):.10f}",
]
print("\n".join(cv_report))

In [None]:
# Fit on the training split, then predict on both splits
rf_model.fit(X_train, y_train)
rf_train_pred = rf_model.predict(X_train)
rf_val_pred = rf_model.predict(X_val)

# MSE is kept in named variables (not just printed) for reuse outside this cell
rf_train_mse = mean_squared_error(y_train, rf_train_pred)
rf_val_mse = mean_squared_error(y_val, rf_val_pred)

print("\nRandom Forest Performance:")

# One entry per split: heading, true values, predictions, precomputed MSE
metric_sections = (
    ("Training Metrics:", y_train, rf_train_pred, rf_train_mse),
    ("\nValidation Metrics:", y_val, rf_val_pred, rf_val_mse),
)
for heading, actual, predicted, split_mse in metric_sections:
    print(heading)
    print(f"  MSE:  {split_mse:.10f}")
    print(f"  RMSE: {np.sqrt(split_mse):.10f}")
    print(f"  MAE:  {mean_absolute_error(actual, predicted):.10f}")
    print(f"  R2:   {r2_score(actual, predicted):.6f}")

In [None]:
# Rank features by the fitted forest's importance scores
print("\n\nTop 15 Most Important Features:")

# Pair each feature column with its importance, then order from most to
# least important
importance_table = pd.DataFrame(
    {
        'Feature': X.columns,
        'RF_Importance': rf_model.feature_importances_,
    }
)
rf_feature_importance = importance_table.sort_values(
    by='RF_Importance', ascending=False
)

print("\nRandom Forest Top 15:")
top_fifteen = rf_feature_importance.head(15)
print(top_fifteen.to_string(index=False))

In [None]:
# Diagnostic figure: a 2x2 grid of panels saved to disk and then displayed
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_target, ax_fit), (ax_resid, ax_importance) = axes

# Panel 1 (top-left): histogram of the full target column
ax_target.hist(y, bins=50, edgecolor='black')
ax_target.set_xlabel('Homeless Rate')
ax_target.set_ylabel('Frequency')
ax_target.set_title('Target Distribution')
ax_target.grid(True, alpha=0.3)

# Panel 2 (top-right): validation predictions against actual values, with a
# dashed y = x reference line spanning the range of the actual values
actual_low, actual_high = y_val.min(), y_val.max()
ax_fit.scatter(y_val, rf_val_pred, alpha=0.5, s=30)
ax_fit.plot([actual_low, actual_high], [actual_low, actual_high], 'r--', lw=2)
ax_fit.set_xlabel('Actual Homeless Rate')
ax_fit.set_ylabel('Predicted Homeless Rate')
ax_fit.set_title(f'Random Forest: Predictions vs Actual\nMSE = {rf_val_mse:.8f}')
ax_fit.grid(True, alpha=0.3)

# Panel 3 (bottom-left): residuals (actual minus predicted) against the
# predicted value, with a dashed zero line
rf_residuals = y_val - rf_val_pred
ax_resid.scatter(rf_val_pred, rf_residuals, alpha=0.5, s=30)
ax_resid.axhline(y=0, color='r', linestyle='--', lw=2)
ax_resid.set_xlabel('Predicted Homeless Rate')
ax_resid.set_ylabel('Residuals')
ax_resid.set_title('Random Forest: Residual Plot')
ax_resid.grid(True, alpha=0.3)

# Panel 4 (bottom-right): horizontal bars for the ten highest-ranked features
top_features = rf_feature_importance.head(10)
bar_positions = range(len(top_features))
ax_importance.barh(bar_positions, top_features['RF_Importance'])
ax_importance.set_yticks(bar_positions)
ax_importance.set_yticklabels(top_features['Feature'], fontsize=8)
# Flip the y-axis so the first (highest-ranked) row is drawn at the top
ax_importance.invert_yaxis()
ax_importance.set_xlabel('Importance')
ax_importance.set_title('Random Forest: Top 10 Features')
ax_importance.grid(True, alpha=0.3, axis='x')

plt.tight_layout()

# Write the figure to disk before showing it
output_image = 'rf_model_analysis.png'
plt.savefig(output_image, dpi=300, bbox_inches='tight')
print("\nVisualization saved as 'rf_model_analysis.png'")
plt.show()

In [None]:
# Retrain on the full training data (train + validation rows) for final predictions
print("\n\nRetraining models on full training data...")

# Build the final model from the hyperparameters of the model that was
# cross-validated and evaluated, rather than repeating them by hand, so the
# two configurations cannot silently drift apart. get_params() returns only
# constructor arguments, so this creates a fresh, unfitted estimator.
final_rf_model = RandomForestRegressor(**rf_model.get_params())
final_rf_model.fit(X, y)

In [None]:
# Generate predictions for the test set and write the submission file
print("\nGenerating Test Set Predictions...")

# Only the read of test.csv is guarded. A FileNotFoundError raised later
# (for example while writing the submission) must surface as a real error
# instead of being reported as a missing test file.
try:
    test_data = pd.read_csv("test.csv")
except FileNotFoundError:
    print("Test file not found. Skipping test predictions.")
else:
    # Keep the identifiers for the submission; everything else is a feature
    test_ids = test_data['ID']
    X_test = test_data.drop(['ID'], axis=1)

    # Predict with the model refit on the full training data
    rf_test_pred = final_rf_model.predict(X_test)

    # Clip at zero so no negative predictions are written out
    rf_test_pred = np.maximum(rf_test_pred, 0)

    # Create the submission file: one row per test ID
    rf_submission = pd.DataFrame({
        'ID': test_ids,
        'HOMELESS_RATE': rf_test_pred
    })
    rf_submission.to_csv('rf_submission.csv', index=False)
    print("Random Forest predictions saved to 'rf_submission.csv'")

    print("\nTest prediction statistics:")
    print(f"Random Forest - Min: {rf_test_pred.min():.8f}, Max: {rf_test_pred.max():.8f}, Mean: {rf_test_pred.mean():.8f}")

print("\nModel training complete.")