# Random Forest Model for Movie Score Prediction

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from scipy.stats import randint, uniform


## Load Datasets
Both the training and the test datasets are loaded.
The test dataset is used in this notebook only for a quick check of the models; the final model is evaluated in the testing notebook.


In [4]:
# The training data is a semicolon-separated CSV stored under ./data.
trainingDatasetPath = os.path.join('.', 'data', 'training_dataset.csv')
dataset_training = pd.read_csv(
    filepath_or_buffer=trainingDatasetPath,
    sep=';',
)

In [5]:
# The test data is a semicolon-separated CSV stored under ./data.
testDatasetPath = os.path.join('.', 'data', 'test_dataset.csv')
df_test = pd.read_csv(
    filepath_or_buffer=testDatasetPath,
    sep=';',
)

## Preparing the Data
The training and test datasets are prepared for the random forest model.

In [6]:
# Split the training frame into the regression target and the feature matrix.
# The target and the listed helper/rating columns are removed so that they are
# not fed to the model as features.
train_non_feature_columns = ['movie_score', 'Unnamed: 0', 'averageRating', 'numVotes', '_orig_order']

y_train = dataset_training['movie_score'].values
# The frame is re-bound (not copied under a new name) because its remaining
# columns are the feature names of the model.
dataset_training = dataset_training.drop(train_non_feature_columns, axis=1)
x_train = dataset_training.values


In [7]:
# Split the test frame into the regression target and the feature matrix,
# removing the same columns that are removed from the training frame.
test_non_feature_columns = ['movie_score', 'Unnamed: 0', 'averageRating', 'numVotes', '_orig_order']

y_test = df_test['movie_score'].values
df_test = df_test.drop(test_non_feature_columns, axis=1)
# NOTE(review): x_test is positional, so the remaining columns must be in the
# same order as in the training frame — confirm against the dataset export.
x_test = df_test.values


## Create and Train Basic Random Forest Model
Here a basic random forest model is trained, without any hyperparameter tuning. This will act as a baseline for the performance of the optimized model.

In [None]:
# Baseline model: an untuned forest of 100 trees with a fixed seed,
# built on all available cores (n_jobs=-1) with progress output (verbose=1).
baseline_settings = {
    'n_estimators': 100,
    'random_state': 42,
    'n_jobs': -1,
    'verbose': 1,
}
rf_basic = RandomForestRegressor(**baseline_settings)
rf_basic.fit(x_train, y_train)


In [None]:
# Evaluate basic model
# R² is reported on the scale the model was trained on; RMSE and MAE are reported
# after mapping targets and predictions through np.expm1
# (assumes movie_score is stored as log1p of the real score — TODO confirm).
y_train_pred = rf_basic.predict(x_train)
y_test_pred = rf_basic.predict(x_test)

# Undo the transform once per array instead of inside every metric call.
y_train_orig, y_train_pred_orig = np.expm1(y_train), np.expm1(y_train_pred)
y_test_orig, y_test_pred_orig = np.expm1(y_test), np.expm1(y_test_pred)

train_rmse = np.sqrt(mean_squared_error(y_train_orig, y_train_pred_orig))
test_rmse = np.sqrt(mean_squared_error(y_test_orig, y_test_pred_orig))
train_mae = mean_absolute_error(y_train_orig, y_train_pred_orig)
test_mae = mean_absolute_error(y_test_orig, y_test_pred_orig)

print("Basic Random Forest Model Performance:")
print(f"Training R²: {r2_score(y_train, y_train_pred):.3f}")
print(f"Test R²: {r2_score(y_test, y_test_pred):.3f}")
print(f"Training RMSE: {train_rmse:.3f}")
print(f"Test RMSE: {test_rmse:.3f}")
print(f"Training MAE: {train_mae:.3f}")
print(f"Test MAE: {test_mae:.3f}")


## Hyperparameter Tuning with RandomizedSearchCV


In [None]:
# Hyperparameter search for the random forest with RandomizedSearchCV, which
# evaluates a fixed number of sampled candidates instead of a full grid.

# Base estimator shared by all candidates: fixed seed, all cores per forest,
# and max_samples=0.8 (each tree is fitted on a bootstrap sample of 80% of the rows).
rf = RandomForestRegressor(n_jobs=-1, max_samples=0.8, random_state=42, verbose=0)

# Search space. Lists are sampled uniformly; scipy's randint(low, high) draws
# integers from low up to, but not including, high.
param_dist_rf = dict(
    # Number of trees in the ensemble; more trees cost more training time.
    n_estimators=[300, 400],
    # Upper bound on the depth of every tree; deeper trees can fit more complex
    # patterns but are more prone to overfitting.
    max_depth=[10, 20, 30, 40],
    # Minimum number of samples a node needs before it may be split (2..19).
    min_samples_split=randint(2, 20),
    # Minimum number of samples that must end up in each leaf (2..14).
    min_samples_leaf=randint(2, 15),
    # Size of the random feature subset examined at each split:
    # sqrt(n_features) or log2(n_features).
    max_features=['sqrt', 'log2'],
)

# Forward-chaining splits: every validation fold comes after its training rows.
cv = TimeSeriesSplit(n_splits=5)

search_settings = {
    'param_distributions': param_dist_rf,
    'n_iter': 30,          # number of sampled parameter combinations
    'cv': cv,
    'scoring': 'r2',
    'n_jobs': 1,           # presumably sequential because each forest already uses all cores
    'verbose': 2,
    'random_state': 42,
}
rf_random_search = RandomizedSearchCV(rf, **search_settings)
rf_random_search.fit(x_train, y_train)


In [None]:
# RandomizedSearchCV is created with its default refit=True, so best_estimator_
# has already been retrained on the complete training set with the winning
# parameters. Calling fit() on it again would only repeat that expensive training
# run and, with the fixed random_state, reproduce the same forest.
best_model = rf_random_search.best_estimator_

# Save the model
joblib.dump(best_model, "../random_forest_reg_movie_log_transformed.joblib")

# Save every sampled parameter combination together with its CV scores to a CSV file
cv_results = pd.DataFrame(rf_random_search.cv_results_)
saveRandomForestRegressorCvResultsAt = os.path.join('..','..', 'data', 'random_forest_reg_movie_log_transformed.csv')
cv_results.to_csv(saveRandomForestRegressorCvResultsAt, index=False)

In [None]:
# Inspect best Random Forest parameters and scores
best_params_rf = rf_random_search.best_params_
best_rf = rf_random_search.best_estimator_

# Predict once and reuse the predictions for every metric below.
y_train_pred_best = best_rf.predict(x_train)
y_test_pred_best = best_rf.predict(x_test)

print(f"Best Random Forest parameters: {best_params_rf}")
print(f"Best Random Forest CV R-squared: {rf_random_search.best_score_:.3f}")
# R² is computed on the scale the model was trained on (same value as best_rf.score).
print(f"Best Random Forest training R-squared: {r2_score(y_train, y_train_pred_best):.3f}")
print(f"Best Random Forest test R-squared: {r2_score(y_test, y_test_pred_best):.3f}")

# Additional metrics
# RMSE and MAE are computed after np.expm1, exactly like in the baseline
# evaluation, so that the tuned and the baseline numbers are comparable
# (assumes movie_score is stored as log1p of the real score — TODO confirm).
y_train_orig, y_train_pred_best_orig = np.expm1(y_train), np.expm1(y_train_pred_best)
y_test_orig, y_test_pred_best_orig = np.expm1(y_test), np.expm1(y_test_pred_best)

print("\nBest Random Forest Model Performance:")
print(f"Training RMSE: {np.sqrt(mean_squared_error(y_train_orig, y_train_pred_best_orig)):.3f}")
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test_orig, y_test_pred_best_orig)):.3f}")
print(f"Training MAE: {mean_absolute_error(y_train_orig, y_train_pred_best_orig):.3f}")
print(f"Test MAE: {mean_absolute_error(y_test_orig, y_test_pred_best_orig):.3f}")
#Best Random Forest parameters: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 6, 'n_estimators': 400}


## Feature Importance Analysis


In [None]:
# Pair every training column with the importance the best model assigned to it
feature_names = dataset_training.columns
feature_importances = best_rf.feature_importances_

# Tabulate the pairs and rank them from most to least important
importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
importance_df = importance_df.sort_values(by='importance', ascending=False)

# Display top 20 most important features
print("Top 20 Most Important Features:")
print(importance_df.head(20))


In [None]:
# Horizontal bar chart of the 20 highest-ranked features
top_features = importance_df.head(20)
bar_positions = range(len(top_features))

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(bar_positions, top_features['importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Feature Importance')
ax.set_title('Top 20 Feature Importances - Random Forest')
# Flip the axis so that the most important feature is drawn at the top.
ax.invert_yaxis()
fig.tight_layout()
plt.show()


## Cross-Validation Score


In [None]:
# Perform cross-validation on the best model.
# The folds are time-ordered (TimeSeriesSplit), as in the hyperparameter search.
# A plain 10-fold split (cv=10) would fit on rows that come after the validation
# fold and report an optimistically biased score.
# NOTE(review): assumes the rows of x_train are in chronological order, as the
# TimeSeriesSplit used for tuning implies — confirm.
ts_cv = TimeSeriesSplit(n_splits=10)
cv_scores = cross_val_score(best_rf, x_train, y_train, cv=ts_cv, scoring='r2', n_jobs=-1)
print(f"Cross-validation R² scores: {cv_scores}")
print(f"Mean CV R²: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
