In [1]:
import pandas as pd
import numpy as np
import sklearn.model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
# import warnings # For handling error messages.
# warnings.simplefilter(action="ignore", category=FutureWarning)

In [2]:
movie_data = pd.read_csv('data/preprocessed_data.csv')

In [3]:
movie_data.head()

Unnamed: 0,title,vote_average,year,month,budget,revenue,runtime,Action,Adventure,Animation,...,director_12995,director_15217,director_16938,director_17494,director_17825,director_18878,director_19303,director_20907,director_36602,director_90367
0,Psycho,8.434,1960,6,-0.841548,-0.38865,0.012591,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Spartacus,7.5,1960,10,-0.57851,-0.236352,4.473263,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,The Magnificent Seven,7.5,1960,10,-0.813511,-0.536026,0.925002,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,The Apartment,8.214,1960,6,-0.790011,-0.426725,0.823623,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,La Dolce Vita,8.121,1960,2,-0.82753,-0.455841,3.408785,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
X = movie_data.drop(columns=['title', 'vote_average'])
y = movie_data['vote_average']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Linear Regression

In [20]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
lr_rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Squared Error (RMSE): {lr_rmse}")
print(f"R-squared Score: {r2}")

Root Mean Squared Error (RMSE): 0.5897698821467545
R-squared Score: 0.3735782078735075


## XGBoost Regression

### Default hyperparameters

In [5]:
reg = xgb.XGBRegressor(random_state=42)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
xgb_default_rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Squared Error (RMSE): {xgb_default_rmse}")
print(f"R-squared Score: {r2}")

Root Mean Squared Error (RMSE): 0.555646544106851
R-squared Score: 0.44396911984664


### Finding best parameters with GridSearchCV

In [6]:
# Define the grid of hyperparameters
param_grid = {
    'n_estimators': [200, 300, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

# Set up the grid search
grid_search = GridSearchCV(estimator=reg, param_grid=param_grid, scoring='neg_root_mean_squared_error')

# Fit the grid search
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best parameters: {best_params}")
print(f"Best score: {best_score}")

Best parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Best score: -0.571530824309222


### Applying to test set

In [7]:
reg = xgb.XGBRegressor(**best_params, random_state=42)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
xgb_gridsearch_rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Squared Error (RMSE): {xgb_gridsearch_rmse}")
print(f"R-squared Score: {r2}")

Root Mean Squared Error (RMSE): 0.541439765805673
R-squared Score: 0.4720388395195805


In [9]:
xgb_default_rmse - xgb_gridsearch_rmse

0.014206778301178002

We can see an improvement over the default hyperparameters, as measured by RMSE and R^2 values. This model's predicted ratings are 0.014 closer than the predicted ratings of the XGBoost default hyperparameters.

## Comparison: Random Forest

### Default hyperparameters

In [10]:
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rf_default_rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Squared Error (RMSE): {rf_default_rmse}")
print(f"R-squared Score: {r2}")

Root Mean Squared Error (RMSE): 0.554469489460672
R-squared Score: 0.44632236231656164


### Finding best parameters with GridSearchCV

In [11]:
# Define the grid of hyperparameters
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5, 10]
}

# Set up the grid search
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, verbose=1, scoring='neg_root_mean_squared_error')

# Fit the grid search
grid_search.fit(X_train, y_train)

# Get the best model and parameters
best_rf_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print(f'Best Parameters: {best_params}')

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Parameters: {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 300}


### Applying to test set

In [12]:
rf_model = RandomForestRegressor(**best_params, random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rf_gridsearch_rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Squared Error (RMSE): {rf_gridsearch_rmse}")
print(f"R-squared Score: {r2}")

Root Mean Squared Error (RMSE): 0.5508870719846513
R-squared Score: 0.4534538521081134


In [13]:
rf_default_rmse - rf_gridsearch_rmse

0.0035824174760207272

This particular grid search yields less improvement over the original model, as compared to the tests done with XGBoost.

## Comparison of methods

In [21]:
rmse_values = [
    ('lr_rmse', lr_rmse),
    ('rf_default_rmse', rf_default_rmse),
    ('xgb_default_rmse', xgb_default_rmse),
    ('rf_gridsearch_rmse', rf_gridsearch_rmse),
    ('xgb_gridsearch_rmse', xgb_gridsearch_rmse)
]

sorted_rmse_values = sorted(rmse_values, key=lambda x: x[1])

for label, value in sorted_rmse_values:
    print(f'{label}: {value}')

xgb_gridsearch_rmse: 0.541439765805673
rf_gridsearch_rmse: 0.5508870719846513
rf_default_rmse: 0.554469489460672
xgb_default_rmse: 0.555646544106851
lr_rmse: 0.5897698821467545
