In [2]:
import pandas as pd
import catboost as cb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [8]:
df = pd.read_csv('data_clean_v2.csv')
for col in ['actor', 'actress', 'director', 'writer', 'genres']:
    df[col] = df[col].fillna('missing')

In [9]:
features = ['actor', 'actress', 'director', 'writer', 'runtimeMinutes', 'genres', 'isAdult']
target = 'averageRating'

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.3, random_state=42)

categorical_features_indices = [X_train.columns.get_loc(col) for col in ['actor', 'actress', 'director', 'writer', 'genres']]

In [13]:
model = cb.CatBoostRegressor(
    iterations=2000,
    learning_rate=0.1,
    depth=7,
    eval_metric='RMSE',
    cat_features=categorical_features_indices,
    use_best_model=True
)

model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50, verbose=100)


0:	learn: 0.9466028	test: 0.9470474	best: 0.9470474 (0)	total: 33.4ms	remaining: 1m 6s
100:	learn: 0.5809680	test: 0.5617582	best: 0.5617582 (100)	total: 2.91s	remaining: 54.7s
200:	learn: 0.5606009	test: 0.5405922	best: 0.5405922 (200)	total: 6.24s	remaining: 55.9s
300:	learn: 0.5466476	test: 0.5250280	best: 0.5250280 (300)	total: 9.72s	remaining: 54.9s
400:	learn: 0.5364768	test: 0.5138385	best: 0.5138385 (400)	total: 13.2s	remaining: 52.7s
500:	learn: 0.5277987	test: 0.5046548	best: 0.5046548 (500)	total: 16.9s	remaining: 50.4s
600:	learn: 0.5205155	test: 0.4970481	best: 0.4970481 (600)	total: 20.5s	remaining: 47.7s
700:	learn: 0.5132077	test: 0.4895220	best: 0.4895220 (700)	total: 24.1s	remaining: 44.6s
800:	learn: 0.5070755	test: 0.4830030	best: 0.4830030 (800)	total: 27.7s	remaining: 41.4s
900:	learn: 0.4999560	test: 0.4757590	best: 0.4757590 (900)	total: 31.4s	remaining: 38.3s
1000:	learn: 0.4947839	test: 0.4702814	best: 0.4702814 (1000)	total: 35s	remaining: 35s
1100:	learn: 0.

<catboost.core.CatBoostRegressor at 0x15f9f66d0>

In [14]:

y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Root Mean Squared Error: {rmse}')

Root Mean Squared Error: 0.42929409168921634


In [37]:
hypothetical_movie = {
    'actor': 'Robert Downey Jr.',
    'actress': '',
    'director': '',
    'writer': '',
    'runtimeMinutes': 150,
    'genres': 'Horror',
    'isAdult': 0
}

new_data = pd.DataFrame([hypothetical_movie])

predicted_rating = model.predict(new_data)
print(f'Predicted Rating: {predicted_rating[0]}')


Predicted Rating: 6.47580558878023


In [None]:
model.save_model("CatBoosy")