In [1]:
import pandas as pd
import catboost as cb
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [3]:
df = pd.read_csv('../../data/cleaned/data_clean_v6.csv')

In [3]:
features = ['actor', 'actress', 'director', 'writer', 'Production_Company', 'runtimeMinutes', 'genres', 'isAdult']
target = 'averageRating'

In [4]:
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.3, random_state=42)
categorical_features_indices = [X_train.columns.get_loc(col) for col in ['actor', 'actress', 'director', 'Production_Company', 'writer', 'genres']]

In [5]:
model = cb.CatBoostRegressor(
    iterations=1500,
    learning_rate=0.1,
    depth=8,
    eval_metric='RMSE',
    l2_leaf_reg = 5,
    cat_features=categorical_features_indices,
    use_best_model=True
)


In [6]:
model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50, verbose=100)

0:	learn: 0.9148800	test: 0.9121256	best: 0.9121256 (0)	total: 844ms	remaining: 21m 4s
100:	learn: 0.5329261	test: 0.5122857	best: 0.5122857 (100)	total: 45.3s	remaining: 10m 28s
200:	learn: 0.4947259	test: 0.4727584	best: 0.4727584 (200)	total: 1m 36s	remaining: 10m 22s
300:	learn: 0.4651607	test: 0.4426011	best: 0.4426011 (300)	total: 2m 42s	remaining: 10m 46s
400:	learn: 0.4442971	test: 0.4209614	best: 0.4209614 (400)	total: 5m 2s	remaining: 13m 47s
500:	learn: 0.4245773	test: 0.4009492	best: 0.4009492 (500)	total: 5m 58s	remaining: 11m 53s
600:	learn: 0.4079913	test: 0.3839516	best: 0.3839516 (600)	total: 6m 47s	remaining: 10m 9s
700:	learn: 0.3935638	test: 0.3690219	best: 0.3690219 (700)	total: 7m 30s	remaining: 8m 33s
800:	learn: 0.3818138	test: 0.3568211	best: 0.3568211 (800)	total: 8m 17s	remaining: 7m 14s
900:	learn: 0.3716192	test: 0.3464861	best: 0.3464861 (900)	total: 9m 2s	remaining: 6m
1000:	learn: 0.3608386	test: 0.3354235	best: 0.3354235 (1000)	total: 9m 48s	remaining: 

<catboost.core.CatBoostRegressor at 0x207cb07bc10>

In [None]:
#run only once to get optimal params
param_grid = {
    'iterations': [1000, 1500, 2000],
    'depth': [6, 7, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'l2_leaf_reg': [1, 3, 5]  # Values for L2 regularization
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50, verbose=100)
print(grid_search.best_params_)
print(grid_search.cv_results_)

In [None]:
#run only once to check overfitting. If ran gridsearch, no need to evaluate seperately
scores = cross_val_score(model, X_train, y_train, cv=KFold(n_splits=5), scoring="neg_mean_squared_error", fit_params={'eval_set': [(X_test, y_test)]})
print(scores)

In [7]:
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Root Mean Squared Error: {rmse}')

Root Mean Squared Error: 0.3003133990829797


In [8]:
r2_score(y_pred=y_pred, y_true=y_test)

0.9010392577580292

In [19]:
hypothetical_movie = {
    'actor': 'Leonardo Dicaprio',
    'actress': 'Kate Winslet',
    'director': 'Christopher Nolan',
    'writer': 'Steven Spielberg',
    "Production_Company":"Twentieth Century Fox",
    'runtimeMinutes': 150,
    'genres': 'Thriller',
    'isAdult': 0
}

new_data = pd.DataFrame([hypothetical_movie])

predicted_rating = model.predict(new_data)
print(f'Predicted Rating: {predicted_rating[0]}')


Predicted Rating: 7.000901735878847


In [20]:
#get movies from dataset based on Production Company pattern. Only for testing TODO: remove later
import re
df[df['Production_Company'].apply(lambda x: bool(re.search('Fox', str(x))))]

Unnamed: 0,tconst,actor,actress,director,producer,writer,averageRating,numVotes,titleType,primaryTitle,isAdult,startYear,runtimeMinutes,genres,inflation_corrector,Revenue_InflationCorrected,Production_Company
9,tt0118589,Eric Benét,Mariah Carey,Vondie Curtis-Hall,Laurence Mark,Cheryl L. West,2.4,24015,movie,Glitter,0,2001,104,Drama,1.720508,9069944,Twentieth Century Fox
10,tt0118589,Eric Benét,Mariah Carey,Vondie Curtis-Hall,Laurence Mark,Cheryl L. West,2.4,24015,movie,Glitter,0,2001,104,Music,1.720508,9069944,Twentieth Century Fox
11,tt0118589,Eric Benét,Mariah Carey,Vondie Curtis-Hall,Laurence Mark,Cheryl L. West,2.4,24015,movie,Glitter,0,2001,104,Romance,1.720508,9069944,Twentieth Century Fox
12,tt0118589,Eric Benét,Mariah Carey,Vondie Curtis-Hall,Laurence Mark,Kate Lanier,2.4,24015,movie,Glitter,0,2001,104,Drama,1.720508,9069944,Twentieth Century Fox
13,tt0118589,Eric Benét,Mariah Carey,Vondie Curtis-Hall,Laurence Mark,Kate Lanier,2.4,24015,movie,Glitter,0,2001,104,Music,1.720508,9069944,Twentieth Century Fox
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285465,tt6628394,Jon Hamm,Cynthia Erivo,Drew Goddard,Jeremy Latcham,missing,7.1,163092,movie,Bad Times at the El Royale,0,2018,141,Drama,1.213435,38687610,Twentieth Century Fox
285466,tt6628394,Jon Hamm,Cynthia Erivo,Drew Goddard,Jeremy Latcham,missing,7.1,163092,movie,Bad Times at the El Royale,0,2018,141,Mystery,1.213435,38687610,Twentieth Century Fox
285467,tt6628394,Jon Hamm,Dakota Johnson,Drew Goddard,Jeremy Latcham,missing,7.1,163092,movie,Bad Times at the El Royale,0,2018,141,Crime,1.213435,38687610,Twentieth Century Fox
285468,tt6628394,Jon Hamm,Dakota Johnson,Drew Goddard,Jeremy Latcham,missing,7.1,163092,movie,Bad Times at the El Royale,0,2018,141,Drama,1.213435,38687610,Twentieth Century Fox


In [22]:
joblib.dump(model, 'ratingModelv1.joblib')

['ratingModelv1.joblib']