In [1]:
import pandas as pd
import numpy as np

In [2]:
# Import datasets.
tvdb_series_df = pd.read_csv("tvdb.csv", index_col="imdb_id")
my_ratings_df = pd.read_csv("my_ratings.csv", index_col="imdb_id")
imdb_series_df = pd.read_csv("imdb.csv", index_col="id")

# Merge datasets together.
cols_to_use = tvdb_series_df.columns.difference(imdb_series_df.columns)
df1 = pd.merge(imdb_series_df, tvdb_series_df[cols_to_use], how="outer", left_index=True, right_index=True)

cols_to_use = my_ratings_df.columns.difference(df1.columns)
tv_df = pd.merge(df1, my_ratings_df[cols_to_use], how="outer", left_index=True, right_index=True)

# Display final dataset obtained.
main_df = tv_df.copy()
tv_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3156 entries, tt0092337 to tt9900092
Data columns (total 85 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    3156 non-null   object 
 1   type                    3156 non-null   object 
 2   start_year              3156 non-null   int64  
 3   end_year                1595 non-null   float64
 4   ep_length               3045 non-null   float64
 5   n_seasons               3151 non-null   float64
 6   n_episodes              3156 non-null   int64  
 7   popularity_rank         2452 non-null   float64
 8   n_ratings               3156 non-null   int64  
 9   rating_avg              3156 non-null   float64
 10  rating_top1000          3156 non-null   float64
 11  rating_us               3156 non-null   float64
 12  rating_row              3156 non-null   float64
 13  rating_M                3156 non-null   float64
 14  rating_F                3156 non

In [3]:
# Remove useless columns.
cols_to_remove = ["name", "series_name", "banner", "fanart", "poster", "first_aired", "tvdb_id", 'prediction', 'overview']
tv_df.drop(cols_to_remove, axis=1, inplace=True)

In [4]:
# Identify popular networks.
popular_networks = tv_df.groupby(by="network").count().sort_values(by="my_rating", ascending=False)[:20].index.to_list()
# Change network value of unpopular networks as "unpopular".
tv_df["network"] = tv_df["network"].map(lambda x: x if x in popular_networks else "unpopular")

In [5]:
# Let's have a look at the remaining categorical features.
tv_df.describe(include="object")

Unnamed: 0,type,network,rating,status
count,3156,3156,2572,3078
unique,2,21,6,2
top,TV Series,unpopular,TV-14,Ended
freq,2729,1631,1027,2435


In [6]:
from sklearn.model_selection import train_test_split

# Split training set and test set.
rated_df = tv_df.dropna(axis=0, subset=["my_rating"])

unrated_df = tv_df[tv_df["my_rating"].isna()].drop("my_rating", axis=1)
unrated_df = unrated_df[unrated_df["type"].notna()]

X = rated_df.drop(["my_rating"], axis=1)
y = rated_df["my_rating"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from xgboost import XGBRegressor

year_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=tv_df["start_year"].max()),
    StandardScaler())

genre_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=0))

avg_ratings_pipe = make_pipeline(
    KNNImputer(n_neighbors=2, weights="uniform"),
    StandardScaler())

popularity_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=tv_df["popularity_rank"].max(), add_indicator=True),
    StandardScaler())

ordinal_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder())

rating_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="unknown"),
    OneHotEncoder(categories=[np.append(tv_df["rating"].unique(), "unknown")]))

network_pipe = make_pipeline(
    OneHotEncoder(categories=[tv_df["network"].unique()]))

median_pipe = make_pipeline(
    KNNImputer(n_neighbors=2, weights="uniform"),
    StandardScaler()
)

mean_pipe = make_pipeline(
    KNNImputer(n_neighbors=2, weights="uniform"),
    StandardScaler()
)

year_cat = ["start_year", "end_year"]
genre_cat = [name for name in tv_df.columns if name.startswith("genre")]
avg_ratings_cat = [name for name in tv_df.columns if name.startswith("rating_")]
popularity_cat = ["popularity_rank"]
ordinal_cat = ["type", "status"]
rating_cat = ["rating"]
network_cat = ["network"]
median_cat = ['n_episodes', 'n_ratings', 'tvdb_ratings', 'num_seasons']
mean_cat = ['ep_length', 'tvdb_avg_rating']

transformers = [
    ('year', year_pipe, year_cat),
    ('genres', genre_pipe, genre_cat),
    ('avg_ratings', avg_ratings_pipe, avg_ratings_cat),
    ('popularity', popularity_pipe, popularity_cat),
    ('ordinal', ordinal_pipe, ordinal_cat),
    ('rating', rating_pipe, rating_cat),
    ('network', network_pipe, network_cat),
    ('median', median_pipe, median_cat),
    ('mean', mean_pipe, mean_cat)
]

preprocess = ColumnTransformer(transformers, remainder='drop')

model = make_pipeline(
    preprocess,
    XGBRegressor(n_estimators=50, random_state=17))

In [8]:
# Fit model.
model.fit(X_train, y_train)

# Predict results for test set.
predictions = model.predict(X_test)

# Display results comparing them to real personal ratings.
results = pd.DataFrame(
    data=dict(
        prediction = predictions, 
        real = y_test.to_list(), 
        difference = predictions - y_test.to_list()),
    index=imdb_series_df.loc[y_test.index, "name"])

print(results.sort_values(by="difference", ascending=False).round(2))

                                         prediction  real  difference
name                                                                 
Tom Clancy's Jack Ryan                         5.86   2.0        3.86
The Big Bang Theory                            4.95   2.0        2.95
Orphan Black                                   6.20   5.0        1.20
Atlanta                                        7.80   7.0        0.80
It's Always Sunny in Philadelphia              8.64   8.0        0.64
Counterpart                                    5.59   5.0        0.59
The Office                                     7.28   7.0        0.28
How I Met Your Mother                          6.84   7.0       -0.16
The Marvelous Mrs. Maisel                      7.77   8.0       -0.23
Band of Brothers                               7.66   8.0       -0.34
Black Mirror                                   7.53   8.0       -0.47
Penny Dreadful                                 6.48   7.0       -0.52
The Office          

In [9]:
from sklearn.metrics import mean_absolute_error, r2_score

print(f"MAE: {np.sqrt(mean_absolute_error(y_test, predictions)):.2f}")
print(f"R2 score: {r2_score(y_test, predictions):.2f}")

MAE: 1.04
R2 score: 0.39


In [10]:
# Predict ratings for new tv series.
predictions = model.predict(unrated_df)

# Display best tv series to watch, removing documentary because I do not care about them.
to_watch_df = main_df.copy()
to_watch_df["prediction"] = pd.Series(data=predictions, index=unrated_df.index)
to_watch_df = to_watch_df[to_watch_df["genre_documentary"] == 0]

to_watch_df.sort_values(by="prediction", ascending=False).round(2)[["name", "prediction"]].head(20)

Unnamed: 0,name,prediction
tt1355642,Fullmetal Alchemist: Brotherhood,8.42
tt0944947,Game of Thrones,8.4
tt0417299,Avatar: The Last Airbender,8.24
tt0979432,Boardwalk Empire,8.19
tt1587000,Rake,8.15
tt0200276,The West Wing,8.12
tt2560140,Attack on Titan,8.06
tt3322312,Daredevil,8.02
tt0092337,Dekalog,8.0
tt0290988,Trailer Park Boys,7.97
