In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the three source tables. The repository layout is tried first; when any
# file is missing there (e.g. on Google Colab) all three are read from the
# current working directory instead.
try:
  tvdb_series_df, my_ratings_df, imdb_series_df = (
      pd.read_csv(csv_path, index_col=id_col)
      for csv_path, id_col in [
          ("data/output/tvdb_series.csv", "imdb_id"),
          ("data/input/my_ratings.csv", "imdb_id"),
          ("data/output/imdb_series.csv", "id"),
      ]
  )
except FileNotFoundError:
  tvdb_series_df, my_ratings_df, imdb_series_df = (
      pd.read_csv(csv_path, index_col=id_col)
      for csv_path, id_col in [
          ("tvdb_series.csv", "imdb_id"),
          ("my_ratings.csv", "imdb_id"),
          ("imdb_series.csv", "id"),
      ]
  )

# Outer-join TVDB onto IMDb by index. Only the TVDB columns that IMDb does not
# already provide are brought in, so no suffixed duplicate columns appear.
cols_to_use = tvdb_series_df.columns.difference(imdb_series_df.columns)
df1 = imdb_series_df.merge(
    tvdb_series_df[cols_to_use],
    how="outer",
    left_index=True,
    right_index=True,
)

# Same approach for the personal ratings table.
cols_to_use = my_ratings_df.columns.difference(df1.columns)
tv_df = df1.merge(
    my_ratings_df[cols_to_use],
    how="outer",
    left_index=True,
    right_index=True,
)

# Keep an untouched copy of the merged data (later cells read display columns
# from it), then summarise the merged frame.
main_df = tv_df.copy()
tv_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3614 entries, tt0092337 to tt9900092
Data columns (total 90 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name               3614 non-null   object 
 1   type               3614 non-null   object 
 2   start_year         3614 non-null   int64  
 3   end_year           1762 non-null   float64
 4   ep_length          3463 non-null   float64
 5   n_seasons          3612 non-null   float64
 6   n_episodes         3614 non-null   int64  
 7   popularity_rank    2651 non-null   float64
 8   n_ratings          3614 non-null   int64  
 9   rating_avg         3614 non-null   float64
 10  rating_top1000     3614 non-null   float64
 11  rating_us          3614 non-null   float64
 12  rating_row         3614 non-null   float64
 13  rating_M           3614 non-null   float64
 14  rating_F           3614 non-null   float64
 15  rating_0to18       2937 non-null   float64
 16  rating_M_0to18  

In [3]:
# Get features correlation.
# Only numeric/boolean columns are correlated. Selecting them explicitly keeps
# the cell working on pandas >= 2.0, where DataFrame.corr() no longer drops
# object columns silently and raises instead.
corr_matrix = tv_df.select_dtypes(include=["number", "bool"]).corr()

# Select upper triangle of correlation matrix. k=1 excludes the diagonal, so
# every feature pair shows up once and self-correlations are ignored.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find features with correlation greater than 0.95.
suspicious_features = upper[upper > 0.95]
# Discard rows and columns that contain no value above the threshold.
suspicious_features = suspicious_features.dropna(axis=0, how="all").dropna(axis=1, how="all")
print(suspicious_features.to_string())

                rating_row  rating_M  rating_18to29  rating_M_18to29  rating_F_18to29  rating_29to45  rating_M_29to45  rating_F_29to45  rating_M_45to100  genre_garden  genre_home  genre_martial  genre_science  genre_special
rating_avg        0.975057  0.975652       0.972724         0.959650              NaN       0.977839         0.953676              NaN               NaN           NaN         NaN            NaN            NaN            NaN
rating_row             NaN  0.967782       0.953501              NaN              NaN       0.969083         0.953262              NaN               NaN           NaN         NaN            NaN            NaN            NaN
rating_M               NaN       NaN            NaN         0.972698              NaN       0.971682         0.985652              NaN               NaN           NaN         NaN            NaN            NaN            NaN
rating_F               NaN       NaN            NaN              NaN         0.953453            NaN    

In [4]:
# Rename some columns.
# DataFrame.rename returns a new frame; the result must be assigned, otherwise
# the new column names are silently discarded.
tv_df = tv_df.rename(columns={
    "genre_arts": "genre_martial_arts",
    "genre_fiction": "genre_science_fiction",
    "genre_interest": "genre_special_interest"
})

# Remove useless columns.
cols_to_remove = ["name", "genre_martial", "genre_science", "genre_special", "genre_", "series_name", "banner", "fanart", "overview", "poster", "first_aired", "tvdb_id"]
# Reassign rather than mutate in place so the frame's lineage stays explicit.
tv_df = tv_df.drop(columns=cols_to_remove)

In [5]:
# Rank networks by the number of series that carry a personal rating
# (count() ignores missing values) and keep the 20 best-represented ones.
network_counts = tv_df.groupby(by="network").count()
ranked_networks = network_counts.sort_values(by="my_rating", ascending=False)
popular_networks = ranked_networks.head(20).index.to_list()

# Every other network, including a missing one, is collapsed into "unpopular".
is_popular_network = tv_df["network"].isin(popular_networks)
tv_df["network"] = tv_df["network"].where(is_popular_network, "unpopular")

In [6]:
# Let's have a look at the remaining categorical features.
# include="object" restricts the summary to object-dtype (string) columns and
# reports count, unique, top and freq for each of them.
tv_df.describe(include="object")

Unnamed: 0,type,network,rating,status
count,3614,3614,2846,3506
unique,2,21,6,2
top,TV Series,unpopular,TV-14,Ended
freq,3119,1910,1128,2775


In [7]:
# Series that already have a personal rating make up the labelled training data.
has_my_rating = tv_df["my_rating"].notna()
train_df = tv_df[has_my_rating]

# Unrated series are the ones to score later. Rows without a "type" are left
# out, and the (all-missing) target column is removed.
test_df = tv_df[~has_my_rating & tv_df["type"].notna()].drop(columns="my_rating")

In [8]:
from sklearn.model_selection import train_test_split

# Separate the target (my personal rating) from the predictors.
y = train_df["my_rating"]
X = train_df.drop(columns="my_rating")

# Hold out 20% of the rated series for validation; the fixed seed keeps the
# split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
import multiprocessing

# Years: a missing value is replaced by the latest start year in the dataset,
# then the columns are standardised.
year_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=tv_df["start_year"].max()), 
    StandardScaler())

# Genre flags: a missing flag is treated as "not this genre" (0).
genre_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=0))

# Low-cardinality string columns: fill with the mode, then integer-encode.
string_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"), 
    OrdinalEncoder())

# Content rating: missing values become the explicit "unknown" category before
# one-hot encoding. NaN is dropped from the category list because
#   * the imputer has already replaced every NaN, so a NaN category could never
#     match and would only add an all-zero column, and
#   * current scikit-learn rejects user-provided categories in which NaN is not
#     the last element.
rating_levels = tv_df["rating"].dropna().unique()
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="unknown"),
    OneHotEncoder(categories=[np.append(rating_levels, "unknown")]))

# Demographic rating columns: median imputation plus a "was missing" indicator.
rating_pipe = make_pipeline(
    SimpleImputer(strategy="median", add_indicator=True), 
    StandardScaler())

# Popularity rank: a missing rank is replaced by the largest rank present in
# the dataset, and the missingness itself is kept as an indicator feature.
popularity_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=tv_df["popularity_rank"].max(), add_indicator=True),
    StandardScaler())

# Network: one-hot encoded over the network values present in tv_df.
network_pipe = make_pipeline(
    OneHotEncoder(categories=[tv_df["network"].unique()]))

# Column groups routed to each pipeline. "rating_" (with underscore) matches
# the numeric rating_* columns but not the categorical "rating" column.
year_cat = ["start_year", "end_year"]
genre_cat = [name for name in tv_df.columns if name.startswith("genre")]
rating_cat = [name for name in tv_df.columns if name.startswith("rating_")]
popularity_cat = ["popularity_rank"]
ordinal_cat = ["type", "status"]

transformers = [
    ("year", year_pipe, year_cat),
    ("genre", genre_pipe, genre_cat),
    ("ratings", rating_pipe, rating_cat),
    ("popularity", popularity_pipe, popularity_cat),
    ("ordinal", string_pipe, ordinal_cat),
    ("cat", cat_pipe, ["rating"]),
    ("network", network_pipe, ["network"])
]

# Columns not listed above go through a default SimpleImputer (mean strategy)
# with missing-value indicators.
combined_pipe = ColumnTransformer(transformers, n_jobs=multiprocessing.cpu_count(), remainder=SimpleImputer(add_indicator=True))

In [10]:
# Fit the preprocessing on the training split only, then apply the already
# fitted transformer to the validation split so that no statistics are learned
# from validation rows. (Constant fill values and one-hot categories were taken
# from the full tv_df when the pipelines were defined.)
tr_X_train = combined_pipe.fit_transform(X_train)
tr_X_valid = combined_pipe.transform(X_valid)

In [11]:
from sklearn.ensemble import RandomForestRegressor
# xgboost is optional. Only a failed import is tolerated (a bare `except` would
# also hide unrelated errors). The name XGBRegressor is deliberately left
# undefined in that case, because the NameError checks below and in the final
# training cell use it to select the fallback model.
try:
    from xgboost import XGBRegressor
except ImportError:
    pass

# Fit model.
# The NameError guard covers the constructor only, so a genuine NameError
# raised while fitting is no longer mistaken for "xgboost is not installed".
try:
    model = XGBRegressor(n_estimators=500, random_state=17)
except NameError:
    model = RandomForestRegressor(random_state=17)
    model.fit(tr_X_train, y_train)
else:
    # NOTE(review): the validation split is used for early stopping and for the
    # reported metrics, so the scores are somewhat optimistic.
    model.fit(tr_X_train, y_train, early_stopping_rounds=5, eval_set=[(tr_X_valid, y_valid)], verbose=False)

# Predict results for validation set.
valid_predictions = model.predict(tr_X_valid)

# Display results comparing them to real personal ratings.
# The rows are labelled with the series names looked up by IMDb id; a positive
# difference means the model over-estimated my rating.
valid_results = pd.DataFrame(data=dict(prediction=valid_predictions, real=y_valid.to_list(), difference=valid_predictions - y_valid.to_list()), index=imdb_series_df.loc[y_valid.index, "name"])
print(valid_results.sort_values(by="difference", ascending=False))

                                         prediction  real  difference
name                                                                 
The Big Bang Theory                        5.240856   2.0    3.240856
Tom Clancy's Jack Ryan                     5.151359   2.0    3.151359
It's Always Sunny in Philadelphia          8.893214   8.0    0.893214
Atlanta                                    7.741800   7.0    0.741800
Counterpart                                5.614441   5.0    0.614441
Orphan Black                               5.581559   5.0    0.581559
The Marvelous Mrs. Maisel                  8.389423   8.0    0.389423
How I Met Your Mother                      7.077860   7.0    0.077860
Band of Brothers                           8.066974   8.0    0.066974
The Office                                 6.986377   7.0   -0.013623
Friends                                    7.940108   8.0   -0.059892
The Office                                 8.849109   9.0   -0.150891
The Man in the High 

In [12]:
from sklearn.metrics import mean_squared_error, r2_score

# Root mean squared error (square root of the MSE), in rating points, and the
# coefficient of determination on the validation split.
# The label previously read "MRSE"; the quantity printed is the RMSE.
print(f"RMSE: {np.sqrt(mean_squared_error(y_valid, valid_predictions)):.2f}")
print(f"R2 score: {r2_score(y_valid, valid_predictions):.2f}")

MRSE: 1.26
R2 score: 0.47


In [13]:
# Fit model on the whole training data.
# The preprocessing is re-fitted on every rated series, not just the 80% split.
tr_X = combined_pipe.fit_transform(X)
try:
    # NOTE(review): 75 trees is presumably the round count found by early
    # stopping in the validation cell - confirm. The seed matches that cell.
    model = XGBRegressor(n_estimators=75, random_state=17)
except NameError:
    # xgboost is not available: build the fallback explicitly instead of
    # silently reusing whatever `model` an earlier cell left behind.
    model = RandomForestRegressor(random_state=17)
model.fit(tr_X, y)

# Predict unseen ratings of unseen tv series.
X_test = combined_pipe.transform(test_df)
test_predictions = model.predict(X_test)



In [14]:
# Let long overview texts be shown (almost) in full.
pd.set_option("max_colwidth", 250)

# Attach the predictions to the untouched merged data. Alignment is by index,
# so series that were not in the test set get a missing prediction.
predicted_ratings = pd.Series(data=test_predictions, index=test_df.index)
to_watch_df = main_df.assign(prediction=predicted_ratings)

# Documentaries are of no interest to me, so only non-documentaries are kept.
to_watch_df = to_watch_df[to_watch_df["genre_documentary"] == 0]

# Show the 20 series with the highest predicted rating.
ranked_to_watch = to_watch_df.sort_values(by="prediction", ascending=False)
ranked_to_watch[["name", "prediction", "overview"]].head(20)

Unnamed: 0,name,prediction,overview
tt0944947,Game of Thrones,8.836425,"Seven noble families fight for control of the mythical land of Westeros. Friction between the houses leads to full-scale war. All while a very ancient evil awakens in the farthest north. Amidst the war, a neglected military order of misfits, the ..."
tt0417299,Avatar: The Last Airbender,8.671462,"With the Fire Nation on the brink of global domination, a young boy reawakens after a hundred-year absence. To save his war-torn world he must accept his destiny as the Avatar before it's too late."
tt0092337,Dekalog,8.329264,"Originally made for Polish television, “Dekalog” focuses on the residents of a housing complex in late-Communist Poland, whose lives become subtly intertwined as they face emotional dilemmas that are at once deeply personal and universally human...."
tt1355642,Fullmetal Alchemist: Brotherhood,8.325186,"Edward and Alphonse Elric's reckless disregard for alchemy's fun­damental laws ripped half of Ed's limbs from his body and left Al's soul clinging to a cold suit of armor. To restore what was lost, the brothers scour a war-torn land for the Philo..."
tt0096697,The Simpsons,8.288667,"Set in Springfield, the average American town, the show focuses on the antics and everyday adventures of the Simpson family; Homer, Marge, Bart, Lisa and Maggie, as well as a virtual cast of thousands. Since the beginning, the series has been a p..."
tt9471404,The Chosen,8.021186,"The Chosen is the first-ever-multi-season TV show about the life of Jesus. Created outside of the Hollywood system, The Chosen allows us to see Him through the eyes of those who knew him. No matter where you are at in your journey with Jesus Chri..."
tt7660850,Succession,7.903399,"A drama about a dysfunctional media family dynasty in the 21st century.\r\nThe Roy family – Logan Roy and his four children – controls one of the biggest media and entertainment conglomerates in the world. ""Succession"" tracks their lives as they ..."
tt4647692,Letterkenny,7.897769,"The residents of Letterkenny belong to one of three groups: Hicks, Skids, and Hockey Players. The three groups are constantly feuding with each other over seemingly trivial matters; often ending with someone getting their ass kicked."
tt7562112,Pose,7.881162,"Pose is a drama spotlighting the legends, icons and ferocious house mothers of New York’s underground ball culture, a movement that first gained notice in the 1980s."
tt5421602,Anne with an E,7.86459,"A coming-of-age story about an outsider who, against all odds and numerous challenges, fights for love and acceptance and for her place in the world. The series centers on a young orphaned girl in the late 1890’s, who, after an abusive childhood ..."
