In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the three source tables; each one is indexed by the show's IMDb id column.
imdb_series_df = pd.read_csv("data/output/imdb_series.csv", index_col="id")
tvdb_series_df = pd.read_csv("data/output/tvdb_series.csv", index_col="imdb_id")
my_ratings_df = pd.read_csv("data/input/my_ratings.csv", index_col="imdb_id")

# Outer-join on the index. Only columns the left-hand frame does not already
# have are taken from the right-hand one, so no column name ever collides.
cols_to_use = tvdb_series_df.columns.difference(imdb_series_df.columns)
df1 = imdb_series_df.join(tvdb_series_df[cols_to_use], how="outer")

# Same procedure to attach the personal ratings to the IMDb + TVDB frame.
cols_to_use = my_ratings_df.columns.difference(df1.columns)
tv_df = df1.join(my_ratings_df[cols_to_use], how="outer")

# Summarise the merged dataset (row count, columns, dtypes, non-null counts).
tv_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3614 entries, tt0092337 to tt9900092
Data columns (total 90 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name               3614 non-null   object 
 1   type               3614 non-null   object 
 2   start_year         3614 non-null   int64  
 3   end_year           1762 non-null   float64
 4   ep_length          3463 non-null   float64
 5   n_seasons          3612 non-null   float64
 6   n_episodes         3614 non-null   int64  
 7   popularity_rank    2651 non-null   float64
 8   n_ratings          3614 non-null   int64  
 9   rating_avg         3614 non-null   float64
 10  rating_top1000     3614 non-null   float64
 11  rating_us          3614 non-null   float64
 12  rating_row         3614 non-null   float64
 13  rating_M           3614 non-null   float64
 14  rating_F           3614 non-null   float64
 15  rating_0to18       2937 non-null   float64
 16  rating_M_0to18  

In [3]:
# Pairwise Pearson correlation between the numeric (and boolean) features.
# The non-numeric columns are excluded explicitly: recent pandas versions no
# longer drop them silently and raise on string columns instead.
corr_matrix = tv_df.select_dtypes(include=["number", "bool"]).corr()

# Keep only the strict upper triangle (k=1 skips the diagonal) so each pair of
# features appears once. The builtin `bool` is used for the mask because the
# `np.bool` alias is not available in every NumPy release.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# Find features with correlation greater than 0.95.
# NOTE: only strong *positive* correlations are reported here.
suspicious_features = upper[upper > 0.95]
# Drop the rows and columns that contain no suspicious pair at all.
suspicious_features = suspicious_features.dropna(axis=0, how="all").dropna(axis=1, how="all")
print(suspicious_features.to_string())

                rating_row  rating_M  rating_18to29  rating_M_18to29  rating_F_18to29  rating_29to45  rating_M_29to45  rating_F_29to45  rating_M_45to100  genre_garden  genre_home  genre_martial  genre_science  genre_special
rating_avg        0.975057  0.975652       0.972724         0.959650              NaN       0.977839         0.953676              NaN               NaN           NaN         NaN            NaN            NaN            NaN
rating_row             NaN  0.967782       0.953501              NaN              NaN       0.969083         0.953262              NaN               NaN           NaN         NaN            NaN            NaN            NaN
rating_M               NaN       NaN            NaN         0.972698              NaN       0.971682         0.985652              NaN               NaN           NaN         NaN            NaN            NaN            NaN
rating_F               NaN       NaN            NaN              NaN         0.953453            NaN    

In [4]:
# Rename some columns.
# `DataFrame.rename` returns a new frame, so the result has to be assigned back;
# otherwise the renaming is silently lost.
tv_df = tv_df.rename(columns={
    "genre_arts": "genre_martial_arts",
    "genre_fiction": "genre_science_fiction",
    "genre_interest": "genre_special_interest"
})

# Remove useless columns: the redundant first halves of the two-word genres
# renamed above, the "genre_" column, and the descriptive fields
# (name, artwork, overview, dates, ids) that are not used as features.
cols_to_remove = ["name", "genre_martial", "genre_science", "genre_special", "genre_", "series_name", "banner", "fanart", "overview", "poster", "first_aired", "tvdb_id"]
tv_df = tv_df.drop(columns=cols_to_remove)

# Drop network feature.
tv_df = tv_df.drop(columns="network")

In [5]:
# Re-inspect the frame (columns, dtypes, non-null counts) after the column cleanup.
tv_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3614 entries, tt0092337 to tt9900092
Data columns (total 77 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   type               3614 non-null   object 
 1   start_year         3614 non-null   int64  
 2   end_year           1762 non-null   float64
 3   ep_length          3463 non-null   float64
 4   n_seasons          3612 non-null   float64
 5   n_episodes         3614 non-null   int64  
 6   popularity_rank    2651 non-null   float64
 7   n_ratings          3614 non-null   int64  
 8   rating_avg         3614 non-null   float64
 9   rating_top1000     3614 non-null   float64
 10  rating_us          3614 non-null   float64
 11  rating_row         3614 non-null   float64
 12  rating_M           3614 non-null   float64
 13  rating_F           3614 non-null   float64
 14  rating_0to18       2937 non-null   float64
 15  rating_M_0to18     2521 non-null   float64
 16  rating_F_0to18  

In [6]:
# Summarise the remaining object-dtype (string) columns: number of non-null
# values, number of distinct values, most frequent value and its frequency.
tv_df.describe(include="object")

Unnamed: 0,type,rating,status
count,3614,2846,3506
unique,2,6,2
top,TV Series,TV-14,Ended
freq,3119,1128,2775


In [7]:
# Labelled data: keep only the shows that have a personal rating.
train_df = tv_df[tv_df["my_rating"].notna()]

In [8]:
# Unlabelled data: shows without a personal rating; the empty target column is removed.
unrated_mask = tv_df["my_rating"].isna()
test_df = tv_df.loc[unrated_mask].drop(columns="my_rating")
test_df.info()
# Keep only the rows whose "type" value is present.
test_df = test_df.loc[test_df["type"].notna()]

<class 'pandas.core.frame.DataFrame'>
Index: 3474 entries, tt0092337 to tt9900092
Data columns (total 76 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   type               3474 non-null   object 
 1   start_year         3474 non-null   int64  
 2   end_year           1688 non-null   float64
 3   ep_length          3323 non-null   float64
 4   n_seasons          3472 non-null   float64
 5   n_episodes         3474 non-null   int64  
 6   popularity_rank    2512 non-null   float64
 7   n_ratings          3474 non-null   int64  
 8   rating_avg         3474 non-null   float64
 9   rating_top1000     3474 non-null   float64
 10  rating_us          3474 non-null   float64
 11  rating_row         3474 non-null   float64
 12  rating_M           3474 non-null   float64
 13  rating_F           3474 non-null   float64
 14  rating_0to18       2797 non-null   float64
 15  rating_M_0to18     2381 non-null   float64
 16  rating_F_0to18  

In [9]:
from sklearn.model_selection import train_test_split

# Separate the target (personal rating) from the feature columns.
y = train_df["my_rating"]
X = train_df.drop(columns="my_rating")

# Hold out 20% of the rated shows for validation; the seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# Years: a missing year is replaced by the latest start year found in the data,
# then the values are standardised.
# NOTE(review): presumably a missing end_year means the show is still running - confirm.
# NOTE(review): the fill value is computed on the whole of tv_df, not only on the
# training split.
year_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=tv_df["start_year"].max()),
    StandardScaler())

# Genre columns: a missing value is replaced by 0.
genre_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=0))

# String columns encoded as ordinal integers, after filling gaps with the most
# frequent value.
string_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder())

# Content rating: missing values become their own "unknown" category before
# one-hot encoding. NaN is removed from the explicit category list: the imputer
# has already replaced it by the time the encoder runs, so keeping it would only
# create a column that is always zero (and a NaN that is not the last category
# is rejected by some scikit-learn versions).
known_ratings = tv_df["rating"].dropna().unique()
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="unknown"),
    OneHotEncoder(categories=[np.append(known_ratings, "unknown")]))

# Per-demographic rating columns: median imputation followed by standardisation.
rating_pipe = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler())

# Popularity rank: a show without a rank gets the largest rank value present in
# the data, then the column is standardised.
popularity_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=tv_df["popularity_rank"].max()),
    StandardScaler())

# Column groups selected by name prefix. "my_rating" does not start with
# "rating_", so the target is never picked up as a feature here.
genre_cat = [name for name in tv_df.columns if name.startswith("genre")]
rating_cat = [name for name in tv_df.columns if name.startswith("rating_")]

transformers = [
    ("year", year_pipe, ["start_year", "end_year"]),
    ("genre", genre_pipe, genre_cat),
    ("ratings", rating_pipe, rating_cat),
    ("popularity", popularity_pipe, ["popularity_rank"]),
    ("ordinal", string_pipe, ["type", "status"]),
    ("cat", cat_pipe, ["rating"])
]

# Any column not listed above goes through a default SimpleImputer.
combined_pipe = ColumnTransformer(transformers, remainder=SimpleImputer())

In [11]:
# Fit the preprocessing on the training split only, then apply the already
# fitted transformers to the validation split.
tr_X_train = combined_pipe.fit_transform(X_train)
tr_X_valid = combined_pipe.transform(X_valid)

In [12]:
from sklearn.ensemble import RandomForestRegressor

# Train a random forest on the preprocessed training features (fixed seed for
# reproducibility) and predict the held-out validation shows.
model = RandomForestRegressor(random_state=17).fit(tr_X_train, y_train)
valid_predictions = model.predict(tr_X_valid)

# Label every prediction with the show's title and list them from best to worst.
valid_names = imdb_series_df.loc[X_valid.index, "name"]
valid_results = pd.Series(valid_predictions, index=valid_names)
print(valid_results.sort_values(ascending=False))

name
The Office                                 8.72
Band of Brothers                           8.63
Chernobyl                                  8.33
It's Always Sunny in Philadelphia          8.26
The Marvelous Mrs. Maisel                  7.82
Atlanta                                    7.77
The Crown                                  7.57
Black Mirror                               7.55
Freaks and Geeks                           7.52
Big Little Lies                            7.34
Friends                                    7.29
Ozark                                      6.95
The Office                                 6.77
Veep                                       6.69
Penny Dreadful                             6.53
Louie                                      6.52
Orphan Black                               6.39
Lost                                       6.16
The Handmaid's Tale                        6.10
The Young Pope                             5.88
How I Met Your Mother              

In [13]:
from sklearn.metrics import mean_squared_error, r2_score

# Root-mean-squared error (same unit as the ratings) and coefficient of
# determination of the validation predictions.
rmse = np.sqrt(mean_squared_error(y_valid, valid_predictions))
print(f"RMSE: {rmse:.2f}")
print(f"R2 score: {r2_score(y_valid, valid_predictions):.2f}")

MRSE: 1.34
R2 score: 0.40


In [14]:
# Apply the already fitted preprocessing to the unrated shows.
X_test = combined_pipe.transform(test_df)
# Sanity check: count the NaN values left after imputation.
np.isnan(X_test).sum()

0

In [15]:
# Predict a rating for every unrated show with the fitted model.
test_predictions = model.predict(X_test)

In [17]:
# Attach the predictions to the IMDb table; alignment is done on the index, so
# rows that are not part of test_df get NaN as prediction.
predicted_ratings = pd.Series(test_predictions, index=test_df.index)
to_watch_df = imdb_series_df.assign(prediction=predicted_ratings)

# Leave out the shows flagged as documentaries.
to_watch_df = to_watch_df[to_watch_df["genre_documentary"] == 0]

# Show the 20 highest predicted ratings.
top_picks = to_watch_df.sort_values(by="prediction", ascending=False)
print(top_picks[["name", "prediction"]].head(20))

                                        name  prediction
id                                                      
tt0944947                    Game of Thrones        8.96
tt0417299         Avatar: The Last Airbender        8.53
tt1355642   Fullmetal Alchemist: Brotherhood        8.44
tt0092337                            Dekalog        8.12
tt3322312                          Daredevil        8.08
tt4093826                         Twin Peaks        8.06
tt0103359        Batman: The Animated Series        7.98
tt5555260                         This Is Us        7.84
tt0412142                              House        7.84
tt0118421                                 Oz        7.82
tt0096697                       The Simpsons        7.81
tt5753856                               Dark        7.76
tt9471404                         The Chosen        7.76
tt7660850                         Succession        7.75
tt1587000                               Rake        7.73
tt12451520          It's Okay t