In [1]:
import pandas as pd
import numpy as np

In [2]:
# Import datasets.
tvdb_series_df = pd.read_csv("data/output/tvdb_series.csv", index_col="imdb_id")
my_ratings_df = pd.read_csv("data/input/my_ratings.csv", index_col="imdb_id")
imdb_series_df = pd.read_csv("data/output/imdb_series.csv", index_col="id")

# Merge datasets together.
cols_to_use = tvdb_series_df.columns.difference(imdb_series_df.columns)
df1 = pd.merge(imdb_series_df, tvdb_series_df[cols_to_use], how="outer", left_index=True, right_index=True)

cols_to_use = my_ratings_df.columns.difference(df1.columns)
tv_df = pd.merge(df1, my_ratings_df[cols_to_use], how="outer", left_index=True, right_index=True)

tv_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1955 entries, tt0092337 to tt9900092
Data columns (total 86 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name               1953 non-null   object 
 1   type               1953 non-null   object 
 2   start_year         1953 non-null   float64
 3   end_year           1086 non-null   float64
 4   ep_length          1924 non-null   float64
 5   n_seasons          1952 non-null   float64
 6   n_episodes         1953 non-null   float64
 7   popularity_rank    1773 non-null   float64
 8   n_ratings          1953 non-null   float64
 9   rating_avg         1953 non-null   float64
 10  rating_top1000     1953 non-null   float64
 11  rating_us          1953 non-null   float64
 12  rating_row         1953 non-null   float64
 13  rating_M           1953 non-null   float64
 14  rating_F           1953 non-null   float64
 15  rating_0to18       1772 non-null   float64
 16  rating_M_0to18  

In [3]:
corr_matrix = tv_df.corr()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find features with correlation greater than 0.95
suspicious_features = upper[upper > 0.95]

# Print suspicious features.
for i, row in suspicious_features.iterrows():
    for j in range(len(row)):
        if not np.isnan(row.iloc[j]):
            print(f"{i} and {row.index[j]} have a correlation of {row.values[j]:.2f}")

rating_avg and rating_row have a correlation of 0.97
rating_avg and rating_M have a correlation of 0.98
rating_avg and rating_18to29 have a correlation of 0.98
rating_avg and rating_M_18to29 have a correlation of 0.96
rating_avg and rating_29to45 have a correlation of 0.98
rating_avg and rating_M_29to45 have a correlation of 0.96
rating_row and rating_M have a correlation of 0.96
rating_row and rating_18to29 have a correlation of 0.95
rating_row and rating_29to45 have a correlation of 0.97
rating_row and rating_M_29to45 have a correlation of 0.95
rating_M and rating_M_18to29 have a correlation of 0.98
rating_M and rating_29to45 have a correlation of 0.97
rating_M and rating_M_29to45 have a correlation of 0.99
rating_F and rating_F_18to29 have a correlation of 0.96
rating_F and rating_F_29to45 have a correlation of 0.97
rating_18to29 and rating_M_18to29 have a correlation of 0.98
rating_18to29 and rating_29to45 have a correlation of 0.95
rating_M_18to29 and rating_M_29to45 have a correl

In [4]:
# Rename some columns.
tv_df.rename(columns={
    "genre_arts": "genre_martial_arts",
    "genre_fiction": "genre_science_fiction",
    "genre_interest": "genre_special_interest"
})

# Remove useless columns.
cols_to_remove = ["name", "genre_martial", "genre_science", "genre_special", "genre_", "series_name", "banner", "fanart", "overview", "poster", "first_aired", "tvdb_id"]
tv_df.drop(cols_to_remove, axis=1, inplace=True)

# Drop Boris.
tv_df.drop("tt1020116", axis=0, inplace=True)

# If end_year is not set, set it to current year.
tv_df["end_year"] = tv_df.apply(
    lambda row: row["start_year"] if np.isnan(row["end_year"]) else row["end_year"],
    axis=1
)

# Drop network feature.
tv_df.drop("network", axis=1, inplace=True)

In [5]:
tv_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1954 entries, tt0092337 to tt9900092
Data columns (total 73 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   type               1953 non-null   object 
 1   start_year         1953 non-null   float64
 2   end_year           1953 non-null   float64
 3   ep_length          1924 non-null   float64
 4   n_seasons          1952 non-null   float64
 5   n_episodes         1953 non-null   float64
 6   popularity_rank    1773 non-null   float64
 7   n_ratings          1953 non-null   float64
 8   rating_avg         1953 non-null   float64
 9   rating_top1000     1953 non-null   float64
 10  rating_us          1953 non-null   float64
 11  rating_row         1953 non-null   float64
 12  rating_M           1953 non-null   float64
 13  rating_F           1953 non-null   float64
 14  rating_0to18       1772 non-null   float64
 15  rating_M_0to18     1595 non-null   float64
 16  rating_F_0to18  

In [6]:
# Let's have a look at the remaining categorical features.
tv_df.describe(include="object")

Unnamed: 0,type,rating,status
count,1953,1731,1904
unique,2,6,2
top,TV Series,TV-14,Ended
freq,1722,682,1482


In [7]:
train_df = tv_df.dropna(axis=0, subset=["my_rating"])

In [8]:
test_df = tv_df[tv_df["my_rating"].isna()].drop("my_rating", axis=1)
test_df.info()
test_df = test_df[test_df["type"].notna()]

<class 'pandas.core.frame.DataFrame'>
Index: 1815 entries, tt0092337 to tt9900092
Data columns (total 72 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   type               1814 non-null   object 
 1   start_year         1814 non-null   float64
 2   end_year           1814 non-null   float64
 3   ep_length          1785 non-null   float64
 4   n_seasons          1813 non-null   float64
 5   n_episodes         1814 non-null   float64
 6   popularity_rank    1634 non-null   float64
 7   n_ratings          1814 non-null   float64
 8   rating_avg         1814 non-null   float64
 9   rating_top1000     1814 non-null   float64
 10  rating_us          1814 non-null   float64
 11  rating_row         1814 non-null   float64
 12  rating_M           1814 non-null   float64
 13  rating_F           1814 non-null   float64
 14  rating_0to18       1633 non-null   float64
 15  rating_M_0to18     1456 non-null   float64
 16  rating_F_0to18  

In [9]:
from sklearn.model_selection import train_test_split

X = train_df.drop(["my_rating"], axis=1)
y = train_df["my_rating"]

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

year_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=tv_df["start_year"].max()), 
    StandardScaler())

genre_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=0))

string_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"), 
    OrdinalEncoder())

cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="unknown"),
    OneHotEncoder(categories=[np.append(tv_df["rating"].unique(), "unknown")]))

rating_pipe = make_pipeline(
    SimpleImputer(strategy="median"), 
    StandardScaler())

popularity_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=tv_df["popularity_rank"].max()),
    StandardScaler())

genre_cat = [name for name in tv_df.columns if name.startswith("genre")]
rating_cat = [name for name in tv_df.columns if name.startswith("rating_")]

transformers = [
    ("year", year_pipe, ["start_year", "end_year",]),
    ("genre", genre_pipe, genre_cat),
    ("ratings", rating_pipe, rating_cat),
    ("popularity", popularity_pipe, ["popularity_rank"]),
    ("ordinal", string_pipe, ["type", "status"]),
    ("cat", cat_pipe, ["rating"])
]

combined_pipe = ColumnTransformer(transformers, remainder=SimpleImputer())


In [11]:
tr_X_train = combined_pipe.fit_transform(X_train)
tr_X_valid = combined_pipe.transform(X_valid)

In [12]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(random_state=17)
model.fit(tr_X_train, y_train)
valid_predictions = model.predict(tr_X_valid)

valid_results = pd.Series(data=valid_predictions, index=imdb_series_df.loc[X_valid.index, "name"])
print(valid_results.sort_values(ascending=False))

name
The Wire                                   8.57
The Office                                 8.42
It's Always Sunny in Philadelphia          8.29
The Last Dance                             8.24
Hunter x Hunter                            8.03
Cowboy Bebop                               7.87
The Marvelous Mrs. Maisel                  7.69
Atlanta                                    7.62
Fleabag                                    7.60
Mindhunter                                 7.57
Freaks and Geeks                           7.56
Big Little Lies                            7.09
Friends                                    7.08
Ozark                                      6.80
GLOW                                       6.79
Orphan Black                               6.43
How I Met Your Mother                      6.38
American Horror Story                      6.23
Tom Clancy's Jack Ryan                     6.17
Dirk Gently's Holistic Detective Agency    6.15
Lost                               

In [24]:
from sklearn.metrics import mean_squared_error, r2_score

print(f"MRSE: {np.sqrt(mean_squared_error(y_valid, valid_predictions))}")
print(f"R2 score: {r2_score(y_valid, valid_predictions)}")
print(test_df.isna().sum().to_string())

MRSE: 1.6064034807162062
R2 score: 0.33515044364114366
type                   0
start_year             0
end_year               0
ep_length             29
n_seasons              1
n_episodes             0
popularity_rank      180
n_ratings              0
rating_avg             0
rating_top1000         0
rating_us              0
rating_row             0
rating_M               0
rating_F               0
rating_0to18         181
rating_M_0to18       358
rating_F_0to18       482
rating_18to29          0
rating_M_18to29        0
rating_F_18to29        0
rating_29to45          0
rating_M_29to45        0
rating_F_29to45        0
rating_45to100         0
rating_M_45to100       0
rating_F_45to100       1
genre_war             12
genre_documentary     12
genre_talk-show       12
genre_animation       12
genre_sci-fi          12
genre_crime           12
genre_musical         12
genre_short           12
genre_fantasy         12
genre_adventure       12
genre_thriller        12
genre_family        

In [14]:
X_test = combined_pipe.fit_transform(test_df)
np.isnan(X_test).sum()

0

In [15]:
test_predictions = model.predict(X_test)

In [22]:
pd.Series(data=test_predictions, index=imdb_series_df.loc[test_df.index, "name"]).sort_values(ascending=False)[:30]

name
Game of Thrones                       8.77
How the Universe Works                8.66
Critical Role                         8.62
Avatar: The Last Airbender            8.59
Planet Earth II                       8.56
This Is Us                            8.55
Batman: The Animated Series           8.53
Whose Line Is It Anyway?              8.52
Monster                               8.46
Daredevil                             8.46
Attack on Titan                       8.45
The Grand Tour                        8.44
Top Gear                              8.44
Oz                                    8.44
Fullmetal Alchemist: Brotherhood      8.42
House                                 8.42
Dark                                  8.42
Planet Earth                          8.41
Impractical Jokers                    8.37
Cosmos: A Spacetime Odyssey           8.35
Young Justice                         8.33
Rake                                  8.31
Nathan for You                        8.29
Succes