In [1]:
import pandas as pd
import numpy as np

In [2]:
# Import datasets (each table is indexed by its id column).
imdb_df = pd.read_csv("data/imdb.csv", index_col="id")
tvdb_df = pd.read_csv("data/tvdb.csv", index_col="imdb_id")
my_ratings_df = pd.read_csv("data/my_ratings.csv", index_col="imdb_id")

# Merge datasets together with outer joins on the index. Only the columns the
# left-hand table does not already have are taken from the right-hand table,
# so the merge never has to disambiguate clashing column names.
cols_to_use = tvdb_df.columns.difference(imdb_df.columns)
df1 = imdb_df.merge(tvdb_df[cols_to_use], how="outer", left_index=True, right_index=True)

cols_to_use = my_ratings_df.columns.difference(df1.columns)
tv_df = df1.merge(my_ratings_df[cols_to_use], how="outer", left_index=True, right_index=True)

# Drop prediction feature if present (a missing column is silently ignored).
tv_df = tv_df.drop(columns="prediction", errors="ignore")

# Save a copy to display results after prediction.
main_df = tv_df.copy()

In [3]:
# Create new feature: episodes per season.
# A season count of 0 would turn the division into +/-inf (or NaN for 0/0).
# Infinite values are not accepted by the KNN imputer that later receives this
# column, so a zero season count is treated as missing instead.
tv_df['ep_per_season'] = tv_df['n_episodes'] / tv_df['n_seasons'].replace(0, np.nan)
tv_df[['name', 'ep_per_season', 'n_episodes', 'n_seasons']].head(10)

Unnamed: 0,name,ep_per_season,n_episodes,n_seasons
tt0092337,Dekalog,10.0,10.0,1.0
tt0094525,Poirot,5.384615,70.0,13.0
tt0096542,Baywatch,22.0,242.0,11.0
tt0096548,Blackadder Goes Forth,6.0,6.0,1.0
tt0096579,Family Matters,23.888889,215.0,9.0
tt0096639,Lonesome Dove,4.0,4.0,1.0
tt0096657,Mr. Bean,15.0,15.0,1.0
tt0096684,Quantum Leap,19.4,97.0,5.0
tt0096694,Saved by the Bell,17.4,87.0,5.0
tt0096697,The Simpsons,21.53125,689.0,32.0


In [4]:
# NOTE: this cell overwrites `network` in place, so run it only once per
# kernel session (re-running would group by the already-encoded numbers).

# Fill NaN values in network so that shows without one form their own group.
tv_df['network'] = tv_df['network'].fillna('Unknown')
# Calculate my avg rating for each network.
# "my_rating" is selected *before* aggregating: averaging the whole frame
# first raises TypeError on pandas >= 2.0 because of the string columns, and
# needlessly averages every other numeric column on older versions.
networks_rat = tv_df.groupby(by="network")["my_rating"].mean().sort_values(ascending=False)
# Replace network name with my avg rating for its shows. A network none of
# whose shows I rated has a NaN mean and therefore maps to NaN.
# NOTE(review): the averages are computed over every rated show, including the
# ones that later end up in the test split, so the target leaks into this
# feature -- consider computing it from the training rows only.
tv_df['network'] = tv_df["network"].map(networks_rat)

In [5]:
# Summary (count / unique / top / freq) of the columns still stored as
# object dtype, i.e. the remaining categorical / text features.
object_summary = tv_df.describe(include=["object"])
object_summary

Unnamed: 0,name,type,poster,banner,fanart,first_aired,overview,rating,series_name,status
count,1231,1231,1231,1213,1213,1213,1210,1153,1212,1213
unique,1218,2,1231,1202,1210,1080,1210,6,1212,2
top,Life,TV Series,https://m.media-amazon.com/images/M/MV5BOGViOW...,https://artworks.thetvdb.com/banners/,https://artworks.thetvdb.com/banners/,2014-02-06,A fantasy drama set in a world of legendary he...,TV-14,The Fosters (2013),Ended
freq,2,1096,1,12,4,4,1,462,1,942


In [6]:
from sklearn.model_selection import train_test_split

# Separate the shows I have rated (used for modelling) from the ones I have not
# (to be predicted later).
has_my_rating = tv_df["my_rating"].notna()
rated_df = tv_df[has_my_rating]

# Unrated shows lose the empty target column; rows without a "type" value are
# discarded as well.
unrated_df = tv_df[~has_my_rating].drop(columns="my_rating")
unrated_df = unrated_df.loc[unrated_df["type"].notna()]

# Features / target for the rated shows.
X = rated_df.drop(columns="my_rating")
y = rated_df["my_rating"]

# Hold out 20% of the rated shows as a test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=17)

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
import xgboost as xgb

# --- One small preprocessing pipeline per group of features ------------------

# Years: a missing year is replaced by the latest start year found in the
# data, then min-max scaled.
year_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=tv_df["start_year"].max()),
    MinMaxScaler())

# Genre columns: a missing value becomes 0; no scaling.
genre_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=0))

# "rating_*" columns: missing values are estimated from the 5 nearest
# neighbours (uniform weights), then standardised.
avg_ratings_pipe = make_pipeline(
    KNNImputer(n_neighbors=5, weights="uniform"),
    StandardScaler())

# Popularity rank: a missing rank is replaced by the largest rank in the data;
# add_indicator=True appends a column flagging the rows that were imputed.
popularity_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=tv_df["popularity_rank"].max(), add_indicator=True),
    MinMaxScaler())

# Two-valued string columns: fill with the most frequent value, then encode
# the categories as integers.
ordinal_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder())

# Content rating (e.g. "TV-14"): missing values become the explicit category
# "unknown", then the column is one-hot encoded.
# The category list is built from the non-null values only: `unique()` on the
# raw column also returns NaN, which (a) can never occur after the imputer has
# run and (b) is rejected by recent scikit-learn versions when NaN is not the
# last listed category.
rating_categories = np.append(tv_df["rating"].dropna().unique(), "unknown")
rating_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="unknown"),
    OneHotEncoder(categories=[rating_categories]))

# Network (already replaced by my average rating per network): mean-impute
# and standardise.
network_pipe = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler())

# Remaining numeric features: distance-weighted 3-nearest-neighbour
# imputation, then min-max scaling.
knn_pipe = make_pipeline(
    KNNImputer(n_neighbors=3, weights="distance"),
    MinMaxScaler()
)

# --- Columns handled by each pipeline ----------------------------------------

year_cat = ["start_year", "end_year"]
genre_cat = [name for name in tv_df.columns if name.startswith("genre")]
avg_ratings_cat = [name for name in tv_df.columns if name.startswith("rating_")]
popularity_cat = ["popularity_rank"]
ordinal_cat = ["type", "status"]
rating_cat = ["rating"]
network_cat = ["network"]
# NOTE(review): 'num_seasons' here is a different column from the 'n_seasons'
# used to build 'ep_per_season' -- presumably one comes from each source
# table; confirm that using this one is intended.
knn_cat = ['n_episodes', 'n_ratings', 'tvdb_ratings', 'num_seasons', 'ep_length', 'tvdb_avg_rating',
           'ep_per_season']

# (name, pipeline, columns) triples consumed by the ColumnTransformer.
transformers = [
    ('year', year_pipe, year_cat),
    ('genres', genre_pipe, genre_cat),
    ('avg_ratings', avg_ratings_pipe, avg_ratings_cat),
    ('popularity', popularity_pipe, popularity_cat),
    ('ordinal', ordinal_pipe, ordinal_cat),
    ('rating', rating_pipe, rating_cat),
    ('network', network_pipe, network_cat),
    ('knn', knn_pipe, knn_cat)
]

# Columns not listed above (name, poster, overview, ...) are dropped.
preprocessor = ColumnTransformer(transformers, remainder='drop')

# Full model: preprocessing followed by a gradient-boosted tree regressor.
model = make_pipeline(
    preprocessor,
    xgb.XGBRegressor(random_state=17))

In [8]:
from sklearn.model_selection import cross_val_score, GridSearchCV

# Tune parameters.
# Keys follow the "<pipeline step>__<parameter>" convention. Every list
# currently holds a single value, so the search fits exactly one candidate.
param_grid = {
    'xgbregressor__n_estimators': [40],
    'xgbregressor__max_depth': [7],
    'xgbregressor__min_child_weight': [8],
    'xgbregressor__eta': [.1],
    'xgbregressor__eval_metric': ['mae']
}

# Rank candidates by mean absolute error -- the metric reported for this model
# in the evaluation cells -- instead of the regressor's default R^2 score, so
# that tuning and evaluation agree once the grid holds more than one candidate.
grid_clf = GridSearchCV(
    model,
    param_grid,
    scoring="neg_mean_absolute_error",
    cv=10,
    n_jobs=4,
    verbose=1)
grid_clf.fit(X_train, y_train)

print(grid_clf.best_params_)

# Keep the pipeline refitted with the best parameters for the following cells.
model = grid_clf.best_estimator_

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


{'xgbregressor__eta': 0.1, 'xgbregressor__eval_metric': 'mae', 'xgbregressor__max_depth': 7, 'xgbregressor__min_child_weight': 8, 'xgbregressor__n_estimators': 40}


[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    1.9s finished


In [9]:
# 10-fold cross-validated Mean Absolute Error on the training set.
# The scorer returns *negated* MAE, hence the sign flip on the mean.
scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring="neg_mean_absolute_error",
    cv=10,
)
mae_mean = -scores.mean()
mae_spread = scores.std() * 2  # two standard deviations across the folds
print(f"MAE on training set: {mae_mean:.2f} (+/- {mae_spread:.2f})")

MAE on training set: 1.21 (+/- 0.61)


In [10]:
# Predict results for test set.
predictions = model.predict(X_test)

# Build a table comparing each prediction with my real rating, labelled with
# the show name taken from the untouched copy of the data.
real_ratings = y_test.to_numpy()
results = pd.DataFrame(
    {
        "prediction": predictions,
        "real": real_ratings,
        "difference": predictions - real_ratings,
    },
    index=main_df.loc[y_test.index, "name"],
)

# Show only rows that have a name, largest over-prediction first.
named_results = results.loc[results.index.notnull()]
print(named_results.sort_values(by="difference", ascending=False).round(2))

                                   prediction  real  difference
name                                                           
Legion                                   7.18   4.0        3.18
Suits                                    7.16   4.0        3.16
Cowboy Bebop                             8.06   5.0        3.06
The 100                                  3.63   1.0        2.63
3%                                       3.71   2.0        1.71
Chuck                                    6.61   5.0        1.61
The Simpsons                             6.90   6.0        0.90
Altered Carbon                           4.88   4.0        0.88
The Last Dance                           8.72   8.0        0.72
The Shield                               7.58   7.0        0.58
Counterpart                              5.52   5.0        0.52
It's Always Sunny in Philadelphia        8.43   8.0        0.43
Ash vs Evil Dead                         6.13   6.0        0.13
Freaks and Geeks                        

In [11]:
from sklearn.metrics import mean_absolute_error

# Mean Absolute Error of the test-set predictions against my real ratings.
test_mae = mean_absolute_error(y_test, predictions)
print(f"MAE on test set: {test_mae:.2f}")

MAE on test set: 1.30


In [12]:
# Predict ratings for new tv series.
predictions = model.predict(unrated_df)

# Attach the predictions to the untouched copy of the data. The Series is
# aligned on the index, so shows that were not predicted get NaN.
predicted_ratings = pd.Series(predictions, index=unrated_df.index)
predictions_df = main_df.dropna(how="all").assign(prediction=predicted_ratings)

# Display best tv series to watch: highest predicted rating first.
ranked = predictions_df.sort_values(by="prediction", ascending=False)
predictions_df = ranked[["name", "prediction", "overview"]].round(decimals=2)
predictions_df.head(20)

Unnamed: 0,name,prediction,overview
tt1355642,Fullmetal Alchemist: Brotherhood,8.34,Edward and Alphonse Elric's reckless disregard...
tt5555260,This Is Us,8.24,This refreshingly honest and provocative serie...
tt0118421,Oz,8.13,OZ is set deep inside the Oswald Maximum Secur...
tt0417299,Avatar: The Last Airbender,8.12,With the Fire Nation on the brink of global do...
tt9059760,Normal People,8.11,Adapted from Sally Rooney's best-selling novel...
tt0092337,Dekalog,8.07,"Originally made for Polish television, “Dekalo..."
tt2560140,Attack on Titan,8.03,"Several hundred years ago, humans were nearly ..."
tt1870479,The Newsroom,7.96,The Newsroom is an American drama television s...
tt0388629,One Piece,7.94,It was a time when pirates ruled the seas. Sev...
tt2701582,Endeavour,7.94,"Set in the mid-Sixties, this prequel centers a..."
