# Random Forest Model for Movie Score Prediction


In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from scipy.stats import randint, uniform


## Load Training Dataset


In [18]:
# Read the semicolon-delimited training set from the sibling `data` directory.
# `useBackslash` switches between the backslash- and forward-slash-separated
# spelling of the same relative path.
useBackslash = False
if useBackslash:
    trainingDatasetPath = r'..\data\training_dataset.csv'
else:
    trainingDatasetPath = r'../data/training_dataset.csv'

dataset_training = pd.read_csv(trainingDatasetPath, sep=';')


In [19]:
# Show every column name of the raw training frame as a plain Python list.
print(list(dataset_training.columns))


['Unnamed: 0', 'isAdult', 'startYear', 'runtimeMinutes', 'averageRating', 'numVotes', 'movie_score', '_orig_order', 'prior_movie_director_sentiment', 'prior1_rating_director', 'prior2_rating_director', 'prior3_rating_director', 'actor1_prior1_rating_actor', 'actor1_prior2_rating_actor', 'actor1_prior3_rating_actor', 'actor1_prior_movie_actor_sentiment', 'actor2_prior1_rating_actor', 'actor2_prior2_rating_actor', 'actor2_prior3_rating_actor', 'actor2_prior_movie_actor_sentiment', 'actor3_prior1_rating_actor', 'actor3_prior2_rating_actor', 'actor3_prior3_rating_actor', 'actor3_prior_movie_actor_sentiment', 'actor4_prior1_rating_actor', 'actor4_prior2_rating_actor', 'actor4_prior3_rating_actor', 'actor4_prior_movie_actor_sentiment', 'actor5_prior1_rating_actor', 'actor5_prior2_rating_actor', 'actor5_prior3_rating_actor', 'actor5_prior_movie_actor_sentiment', 'actor6_prior1_rating_actor', 'actor6_prior2_rating_actor', 'actor6_prior3_rating_actor', 'actor6_prior_movie_actor_sentiment', 'act

## Load Test Dataset


In [20]:
# Read the semicolon-delimited test set from the sibling `data` directory.
# `useBackslash` switches between the backslash- and forward-slash-separated
# spelling of the same relative path.
useBackslash = False
if useBackslash:
    testDatasetPath = r'..\data\test_dataset.csv'
else:
    testDatasetPath = r'../data/test_dataset.csv'

df_test = pd.read_csv(testDatasetPath, sep=";")


## Prepare Training Data


In [21]:
# Target: movie_score on a log1p scale.
raw_train_scores = dataset_training['movie_score'].values
y_train = np.log1p(raw_train_scores)

# Drop the target together with the other columns that are not used as model
# inputs. `dataset_training` is rebound to the feature-only frame, so this cell
# cannot be re-run without reloading the CSV first.
dataset_training = dataset_training.drop(
    ['movie_score', 'Unnamed: 0', 'averageRating', 'numVotes', '_orig_order'],
    axis=1,
)
x_train = dataset_training.values


## Prepare Test Data


In [22]:
# Target: movie_score on the same log1p scale used for y_train, so train and
# test metrics are directly comparable.
y_test = np.log1p(df_test['movie_score'].values)

# Drop the target together with the other columns that are not model inputs.
# `df_test` is rebound to the feature-only frame, so this cell cannot be re-run
# without reloading the CSV first.
df_test = df_test.drop(columns=['movie_score', 'Unnamed: 0', 'averageRating', 'numVotes', '_orig_order'])

# Select the test features by name, in the training frame's column order.
# `.values` discards the labels, so a test CSV whose columns are ordered
# differently would otherwise feed misaligned features to the model without
# any error. A column missing from the test set raises KeyError here.
df_test = df_test[dataset_training.columns]
x_test = df_test.values


## Create and Train Basic Random Forest Model


In [23]:
# Baseline random forest: library defaults apart from the settings listed here.
basic_rf_settings = {
    "n_estimators": 100,
    "random_state": 42,  # fixed seed so the baseline is repeatable
    "n_jobs": -1,        # build trees in parallel
    "verbose": 1,        # emit joblib progress lines while fitting/predicting
}
rf_basic = RandomForestRegressor(**basic_rf_settings)

# Last expression of the cell: fits the model and displays the fitted estimator.
rf_basic.fit(x_train, y_train)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:   27.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.3min finished


0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [24]:
# Score the baseline forest on both splits. The targets are log1p-transformed,
# so RMSE and MAE are expressed in log1p(movie_score) units.
y_train_pred = rf_basic.predict(x_train)
y_test_pred = rf_basic.predict(x_test)

# Compute every metric up front, then report them grouped by metric.
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)

print("Basic Random Forest Model Performance:")
print(f"Training R²: {train_r2:.3f}")
print(f"Test R²: {test_r2:.3f}")
print(f"Training RMSE: {train_rmse:.3f}")
print(f"Test RMSE: {test_rmse:.3f}")
print(f"Training MAE: {train_mae:.3f}")
print(f"Test MAE: {test_mae:.3f}")


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.4s


Basic Random Forest Model Performance:
Training R²: 0.932
Test R²: 0.437
Training RMSE: 0.466
Test RMSE: 1.256
Training MAE: 0.355
Test MAE: 0.996


[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.9s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.2s finished


In [14]:
# (rows, feature columns) of the training matrix.
np.shape(x_train)

(114962, 126)

## Hyperparameter Tuning with RandomizedSearchCV


In [None]:
# Randomized hyperparameter search for the random forest.
# A fixed number of sampled candidates is evaluated instead of a full grid.

# Search space: list entries are candidate values to pick from, and
# randint(low, high) draws integers from [low, high).
param_dist_rf = dict(
    n_estimators=[300, 400],
    max_depth=[10, 20, 30, 40],
    min_samples_split=randint(2, 20),
    min_samples_leaf=randint(2, 15),
    max_features=['sqrt', 'log2'],
)

# Base estimator: each tree is fitted on 80% of the rows, with tree building
# parallelised inside the forest itself.
rf = RandomForestRegressor(
    max_samples=0.8,
    n_jobs=-1,
    random_state=42,
    verbose=0,
)

rf_random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist_rf,
    n_iter=30,        # Reduced from 50 for faster execution
    scoring='r2',
    cv=5,
    n_jobs=1,         # candidates run one at a time; the forest already uses n_jobs=-1
    random_state=42,
    verbose=2,
)

# Last expression of the cell: runs the search and displays the fitted object.
rf_random_search.fit(x_train, y_train)


Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] END max_depth=30, max_features=log2, min_samples_leaf=14, min_samples_split=16, n_estimators=300; total time=   6.6s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=14, min_samples_split=16, n_estimators=300; total time=   7.1s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=14, min_samples_split=16, n_estimators=300; total time=   6.9s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=14, min_samples_split=16, n_estimators=300; total time=   6.6s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=14, min_samples_split=16, n_estimators=300; total time=   6.5s
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=6, min_samples_split=8, n_estimators=400; total time=  14.8s
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=6, min_samples_split=8, n_estimators=400; total time=  17.0s
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=6, min_samples_split=8, n_est

In [None]:
# Inspect the best configuration found by the search and score it on both splits.
# The targets are log1p-transformed, so RMSE and MAE are expressed in
# log1p(movie_score) units.
best_params_rf = rf_random_search.best_params_
best_rf = rf_random_search.best_estimator_

# Predict once per split and reuse the predictions for every metric.
# `best_rf.score(X, y)` would run a second full prediction pass only to compute
# the same R² that `r2_score` yields from these predictions.
y_train_pred_best = best_rf.predict(x_train)
y_test_pred_best = best_rf.predict(x_test)

print(f"Best Random Forest parameters: {best_params_rf}")
print(f"Best Random Forest CV R-squared: {rf_random_search.best_score_:.3f}")
print(f"Best Random Forest training R-squared: {r2_score(y_train, y_train_pred_best):.3f}")
print(f"Best Random Forest test R-squared: {r2_score(y_test, y_test_pred_best):.3f}")

# Additional metrics
print("\nBest Random Forest Model Performance:")
print(f"Training RMSE: {np.sqrt(mean_squared_error(y_train, y_train_pred_best)):.3f}")
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_test_pred_best)):.3f}")
print(f"Training MAE: {mean_absolute_error(y_train, y_train_pred_best):.3f}")
print(f"Test MAE: {mean_absolute_error(y_test, y_test_pred_best):.3f}")


Best Random Forest parameters: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 6, 'n_estimators': 400}
Best Random Forest CV R-squared: 0.268
Best Random Forest training R-squared: 0.688
Best Random Forest test R-squared: -0.101

Best Random Forest Model Performance:
Training RMSE: 274693.094
Test RMSE: 226519.410
Training MAE: 46918.245
Test MAE: 60953.228


## Feature Importance Analysis


In [None]:
# Pair each training column with the importance the tuned forest assigned to it.
# `dataset_training` holds only the feature columns at this point, so its column
# labels line up one-to-one with `feature_importances_`.
feature_names = dataset_training.columns
feature_importances = best_rf.feature_importances_

# Tabulate, then rank from most to least important.
importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
importance_df = importance_df.sort_values(by='importance', ascending=False)

# Report the 20 highest-ranked features.
print("Top 20 Most Important Features:")
print(importance_df.head(20))


In [None]:
# Horizontal bar chart of the 20 highest-ranked features, most important on top.
top_features = importance_df.head(20)
bar_positions = range(len(top_features))

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(bar_positions, top_features['importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Feature Importance')
ax.set_title('Top 20 Feature Importances - Random Forest')
ax.invert_yaxis()  # first row of the table (largest importance) is drawn at the top
fig.tight_layout()
plt.show()


## Save the Best Model


In [None]:
# Persist the tuned forest to a joblib file in the current working directory.
model_output_path = "random_forest_movie_score.joblib"
joblib.dump(best_rf, model_output_path)
print("Model saved as 'random_forest_movie_score.joblib'")


## Cross-Validation Score


In [None]:
# 10-fold cross-validation of the best configuration on the training data.
# cross_val_score fits a fresh clone of `best_rf` on each fold.
#
# The folds run sequentially (n_jobs=1): `best_rf` carries n_jobs=-1 from the
# search's base estimator and therefore already parallelises over its trees.
# Requesting all cores at the fold level as well would stack two layers of
# parallelism and oversubscribe the CPU. This mirrors the n_jobs=1 used for
# RandomizedSearchCV. The scores do not depend on this setting.
cv_scores = cross_val_score(best_rf, x_train, y_train, cv=10, scoring='r2', n_jobs=1)
print(f"Cross-validation R² scores: {cv_scores}")
# Spread is reported as two standard deviations of the per-fold scores.
print(f"Mean CV R²: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
