In [1]:
import pandas as pd
import numpy as np

In [2]:
# Import datasets.
tvdb_series_df = pd.read_csv("tvdb.csv", index_col="imdb_id")
my_ratings_df = pd.read_csv("my_ratings.csv", index_col="imdb_id")
imdb_series_df = pd.read_csv("imdb.csv", index_col="id")

# Merge datasets together.
cols_to_use = tvdb_series_df.columns.difference(imdb_series_df.columns)
df1 = pd.merge(imdb_series_df, tvdb_series_df[cols_to_use], how="outer", left_index=True, right_index=True)

cols_to_use = my_ratings_df.columns.difference(df1.columns)
tv_df = pd.merge(df1, my_ratings_df[cols_to_use], how="outer", left_index=True, right_index=True)

# Display final dataset obtained.
main_df = tv_df.copy()
tv_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3157 entries, tt0092337 to nan
Data columns (total 68 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name               3155 non-null   object 
 1   type               3155 non-null   object 
 2   start_year         3155 non-null   float64
 3   end_year           1597 non-null   float64
 4   ep_length          3044 non-null   float64
 5   n_seasons          3150 non-null   float64
 6   n_episodes         3155 non-null   float64
 7   popularity_rank    2451 non-null   float64
 8   n_ratings          3155 non-null   float64
 9   rating_avg         3155 non-null   float64
 10  rating_top1000     3155 non-null   float64
 11  rating_us          3155 non-null   float64
 12  rating_row         3155 non-null   float64
 13  rating_M           3155 non-null   float64
 14  rating_F           3155 non-null   float64
 15  rating_0to18       2649 non-null   float64
 16  rating_M_0to18     230

In [3]:
# Remove useless columns.
cols_to_remove = ["name", "series_name", "banner", "fanart", "poster", "first_aired", "tvdb_id", 'prediction']
tv_df.drop(cols_to_remove, axis=1, inplace=True)

In [4]:
# Identify popular networks.
popular_networks = tv_df.groupby(by="network").count().sort_values(by="my_rating", ascending=False)[:20].index.to_list()
# Change network value of unpopular networks as "unpopular".
tv_df["network"] = tv_df["network"].map(lambda x: x if x in popular_networks else "unpopular")

In [5]:
# Let's have a look at the remaining categorical features.
tv_df.describe(include="object")

Unnamed: 0,type,network,rating,status
count,3155,3157,2572,3077
unique,2,21,6,2
top,TV Series,unpopular,TV-14,Ended
freq,2728,1633,1027,2433


In [6]:
# Split training set and test set.
train_df = tv_df.dropna(axis=0, subset=["my_rating"])

test_df = tv_df[tv_df["my_rating"].isna()].drop("my_rating", axis=1)
test_df = test_df[test_df["type"].notna()]

In [7]:
from sklearn.model_selection import train_test_split

X = train_df.drop(["my_rating"], axis=1)
y = train_df["my_rating"]

# Break off validation set from training data.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
import multiprocessing

year_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=tv_df["start_year"].max()), 
    StandardScaler())

genre_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=0))

string_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"), 
    OrdinalEncoder())

cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="unknown"),
    OneHotEncoder(categories=[np.append(tv_df["rating"].unique(), "unknown")]))

rating_pipe = make_pipeline(
    SimpleImputer(strategy="median", add_indicator=True), 
    StandardScaler())

popularity_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=tv_df["popularity_rank"].max(), add_indicator=True),
    StandardScaler())

network_pipe = make_pipeline(
    OneHotEncoder(categories=[tv_df["network"].unique()]))

year_cat = ["start_year", "end_year"]
genre_cat = [name for name in tv_df.columns if name.startswith("genre")]
rating_cat = [name for name in tv_df.columns if name.startswith("rating_")]
popularity_cat = ["popularity_rank"]
ordinal_cat = ["type", "status"]

transformers = [
    ("year", year_pipe, year_cat),
    ("genre", genre_pipe, genre_cat),
    ("ratings", rating_pipe, rating_cat),
    ("popularity", popularity_pipe, popularity_cat),
    ("ordinal", string_pipe, ordinal_cat),
    ("cat", cat_pipe, ["rating"]),
    ("network", network_pipe, ["network"])
]

combined_pipe = ColumnTransformer(transformers, n_jobs=multiprocessing.cpu_count(), remainder=SimpleImputer(add_indicator=True))

In [9]:
tr_X_train = combined_pipe.fit_transform(X_train)
tr_X_valid = combined_pipe.transform(X_valid)

In [10]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Fit model.
model = XGBRegressor(n_estimators=500, random_state=17)
model.fit(tr_X_train, y_train, early_stopping_rounds=5, eval_set=[(tr_X_valid, y_valid)], verbose=False)
n_estimators = model.get_booster().best_iteration

# Predict results for validation set.
valid_predictions = model.predict(tr_X_valid)

# Display results comparing them to real personal ratings.
valid_results = pd.DataFrame(data=dict(prediction=valid_predictions, real=y_valid.to_list(), difference=valid_predictions - y_valid.to_list()), index=imdb_series_df.loc[y_valid.index, "name"])
print(valid_results.sort_values(by="difference", ascending=False).round(2))

                                         prediction  real  difference
name                                                                 
Tom Clancy's Jack Ryan                         5.49   2.0        3.49
The Big Bang Theory                            5.16   2.0        3.16
Atlanta                                        7.90   7.0        0.90
Orphan Black                                   5.90   5.0        0.90
Counterpart                                    5.48   5.0        0.48
It's Always Sunny in Philadelphia              8.45   8.0        0.45
The Office                                     7.35   7.0        0.35
How I Met Your Mother                          7.11   7.0        0.11
Band of Brothers                               8.06   8.0        0.06
The Marvelous Mrs. Maisel                      7.80   8.0       -0.20
Veep                                           7.74   8.0       -0.26
Penny Dreadful                                 6.71   7.0       -0.29
Friends             

In [11]:
from sklearn.metrics import mean_squared_error, r2_score

print(f"MRSE: {np.sqrt(mean_squared_error(y_valid, valid_predictions)):.2f}")
print(f"R2 score: {r2_score(y_valid, valid_predictions):.2f}")

MRSE: 1.26
R2 score: 0.47


In [12]:
# Train model on the whole training data.
tr_X = combined_pipe.fit_transform(X)
model = XGBRegressor(n_estimators=n_estimators, random_state=17)
model.fit(tr_X, y)

# Predict ratings for new tv series.
X_test = combined_pipe.transform(test_df)
test_predictions = model.predict(X_test)



In [13]:
# Display best tv series to watch, removing documentary because I do not care about them.
to_watch_df = main_df.copy()
to_watch_df["prediction"] = pd.Series(data=test_predictions, index=test_df.index)
to_watch_df = to_watch_df[to_watch_df["genre_documentary"] == 0]

to_watch_df.sort_values(by="prediction", ascending=False).round(2)[["name", "prediction"]].head(20)

Unnamed: 0,name,prediction
tt0944947,Game of Thrones,8.7
tt0417299,Avatar: The Last Airbender,8.65
tt1355642,Fullmetal Alchemist: Brotherhood,8.36
tt0092337,Dekalog,8.16
tt5753856,Dark,8.03
tt7562112,Pose,8.01
tt0979432,Boardwalk Empire,7.93
tt7660850,Succession,7.91
tt0118421,Oz,7.87
tt0103359,Batman: The Animated Series,7.86
