In [2]:
import pandas as pd
import catboost as cb
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import uniform, randint
import joblib

In [3]:
df = pd.read_csv('../../data/cleaned/data_clean_v6.csv')

In [4]:
df = df.rename(columns={'Revenue_InflationCorrected':'Revenue'})

In [5]:
features = ['actor', 'actress', 'director', 'writer', 'Production_Company', 'runtimeMinutes', 'genres', 'isAdult', 'averageRating']
target = 'Revenue'
X = df[features]
y = df[target]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
categorical_features_indices = [X_train.columns.get_loc(col) for col in ['actor', 'actress', 'director', 'Production_Company', 'writer', 'genres']]

In [None]:
#hyperparameter tuning, to be run only once
param_grid = {
    'depth': np.arange(6,11,1),
    'learning_rate': [0.01, 0.05, 0.1],
    'l2_leaf_reg': [1, 3,5,7]  # Values for L2 regularization
}
cbobject = cb.CatBoostRegressor(
    eval_metric='RMSE',
    cat_features=  categorical_features_indices
)
cbobject.randomized_search(param_grid, X=X_train, y=y_train, cv=5, n_iter=10, plot=True)

In [17]:
model = cb.CatBoostRegressor(
    iterations=1500,
    learning_rate=0.1,
    depth=9,
    eval_metric='RMSE',
    l2_leaf_reg = 5,
    cat_features=categorical_features_indices,
    use_best_model=True
)


In [19]:
model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50, verbose=100)

0:	learn: 252117886.0478669	test: 251373543.1464243	best: 251373543.1464243 (0)	total: 830ms	remaining: 20m 44s
100:	learn: 118536037.6075000	test: 117714801.8140936	best: 117714801.8140936 (100)	total: 1m 9s	remaining: 16m 7s
200:	learn: 97824393.3001943	test: 96087575.8326453	best: 96087575.8326453 (200)	total: 2m 7s	remaining: 13m 41s
300:	learn: 85382717.2852062	test: 83022506.5234020	best: 83022506.5234020 (300)	total: 3m 1s	remaining: 12m 3s
400:	learn: 77088814.6457026	test: 74504924.5929157	best: 74504924.5929157 (400)	total: 3m 58s	remaining: 10m 54s
500:	learn: 70462416.3833655	test: 67969311.3713836	best: 67969311.3713836 (500)	total: 4m 54s	remaining: 9m 47s
600:	learn: 65949835.7435987	test: 63560058.5135126	best: 63560058.5135126 (600)	total: 5m 50s	remaining: 8m 44s
700:	learn: 61805558.7125540	test: 59515372.7066914	best: 59515372.7066914 (700)	total: 6m 47s	remaining: 7m 44s
800:	learn: 58210682.2150159	test: 56180826.1269464	best: 56180826.1269464 (800)	total: 7m 45s	

<catboost.core.CatBoostRegressor at 0x25fc23bb390>

In [None]:
#run only once to check overfitting. If ran gridsearch, no need to evaluate seperately
scores = cross_val_score(model, X_train, y_train, cv=KFold(n_splits=5), scoring="neg_mean_squared_error", fit_params={'eval_set': [(X_test, y_test)]})
print(scores)

In [21]:
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Root Mean Squared Error: {rmse}')

Root Mean Squared Error: 41884083.50209742


In [22]:
r2_score(y_pred=y_pred, y_true=y_test)

0.9746421721416366

In [53]:
hypothetical_movie = {
    'actor': 'christian bale',
    'actress': 'Anne Hathaway',
    'director': 'Christopher Nolan',
    'writer': 'Chris Terrio',
    "Production_Company":"Focus Features",
    'runtimeMinutes': 180,
    'genres': 'Horror',
    'isAdult': 1,
    "averageRating": 6.0
}

new_data = pd.DataFrame([hypothetical_movie])

predicted_revenue = model.predict(new_data)
print(f'Predicted Revenue: {predicted_revenue[0]}')


Predicted Revenue: 303935257.36119413


In [36]:
#get movies from dataset based on Production Company pattern. Only for testing TODO: remove later
import re
df[df['Production_Company'].apply(lambda x: bool(re.search('Warner', str(x))))]

Unnamed: 0,tconst,actor,actress,director,producer,writer,averageRating,numVotes,titleType,primaryTitle,isAdult,startYear,runtimeMinutes,genres,inflation_corrector,Revenue,Production_Company
339,tt0132245,Sylvester Stallone,missing,Renny Harlin,Elie Samaha,Jan Skrentny,4.6,42859,movie,Driven,0,2001,116,Action,1.720508,94188769,Warner Bros.
340,tt0132245,Sylvester Stallone,missing,Renny Harlin,Elie Samaha,Jan Skrentny,4.6,42859,movie,Driven,0,2001,116,Drama,1.720508,94188769,Warner Bros.
341,tt0132245,Sylvester Stallone,missing,Renny Harlin,Elie Samaha,Jan Skrentny,4.6,42859,movie,Driven,0,2001,116,Sport,1.720508,94188769,Warner Bros.
342,tt0132245,Sylvester Stallone,missing,Renny Harlin,Elie Samaha,Neal Tabachnick,4.6,42859,movie,Driven,0,2001,116,Action,1.720508,94188769,Warner Bros.
343,tt0132245,Sylvester Stallone,missing,Renny Harlin,Elie Samaha,Neal Tabachnick,4.6,42859,movie,Driven,0,2001,116,Drama,1.720508,94188769,Warner Bros.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295337,tt8912936,John Krasinski,Kate McKinnon,Sam J. Levine,Dany Garcia,John Whittington,7.1,74647,movie,DC League of Super-Pets,0,2022,105,Adventure,1.041165,216101104,Warner Bros.
295338,tt8912936,John Krasinski,Kate McKinnon,Sam J. Levine,Dany Garcia,John Whittington,7.1,74647,movie,DC League of Super-Pets,0,2022,105,Animation,1.041165,216101104,Warner Bros.
295339,tt8912936,John Krasinski,Kate McKinnon,Sam J. Levine,Hiram Garcia,John Whittington,7.1,74647,movie,DC League of Super-Pets,0,2022,105,Action,1.041165,216101104,Warner Bros.
295340,tt8912936,John Krasinski,Kate McKinnon,Sam J. Levine,Hiram Garcia,John Whittington,7.1,74647,movie,DC League of Super-Pets,0,2022,105,Adventure,1.041165,216101104,Warner Bros.


In [22]:
joblib.dump(model, 'ratingModelv1.joblib')

['ratingModelv1.joblib']