In [1]:
import pandas as pd
import numpy as np

In [2]:
# Import datasets.
tvdb_series_df = pd.read_csv("data/output/tvdb_series.csv", index_col="imdb_id")
my_ratings_df = pd.read_csv("data/input/my_ratings.csv", index_col="imdb_id")
imdb_series_df = pd.read_csv("data/output/imdb_series.csv", index_col="id")

# Merge datasets together.
cols_to_use = tvdb_series_df.columns.difference(imdb_series_df.columns)
df1 = pd.merge(imdb_series_df, tvdb_series_df[cols_to_use], how="outer", left_index=True, right_index=True)

cols_to_use = my_ratings_df.columns.difference(df1.columns)
tv_df = pd.merge(df1, my_ratings_df[cols_to_use], how="outer", left_index=True, right_index=True)

tv_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1955 entries, tt0092337 to tt9900092
Data columns (total 86 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name               1953 non-null   object 
 1   type               1953 non-null   object 
 2   start_year         1953 non-null   float64
 3   end_year           1086 non-null   float64
 4   ep_length          1924 non-null   float64
 5   n_seasons          1952 non-null   float64
 6   n_episodes         1953 non-null   float64
 7   popularity_rank    1773 non-null   float64
 8   n_ratings          1953 non-null   float64
 9   rating_avg         1953 non-null   float64
 10  rating_top1000     1953 non-null   float64
 11  rating_us          1953 non-null   float64
 12  rating_row         1953 non-null   float64
 13  rating_M           1953 non-null   float64
 14  rating_F           1953 non-null   float64
 15  rating_0to18       1772 non-null   float64
 16  rating_M_0to18  

In [3]:
corr_matrix = tv_df.corr()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find features with correlation greater than 0.95
suspicious_features = upper[upper > 0.95]

# Print suspicious features.
for i, row in suspicious_features.iterrows():
    for j in range(len(row)):
        if not np.isnan(row.iloc[j]):
            print(f"{i} and {row.index[j]} have a correlation of {row.values[j]:.2f}")

rating_avg and rating_row have a correlation of 0.97
rating_avg and rating_M have a correlation of 0.98
rating_avg and rating_18to29 have a correlation of 0.98
rating_avg and rating_M_18to29 have a correlation of 0.96
rating_avg and rating_29to45 have a correlation of 0.98
rating_avg and rating_M_29to45 have a correlation of 0.96
rating_row and rating_M have a correlation of 0.96
rating_row and rating_18to29 have a correlation of 0.95
rating_row and rating_29to45 have a correlation of 0.97
rating_row and rating_M_29to45 have a correlation of 0.95
rating_M and rating_M_18to29 have a correlation of 0.98
rating_M and rating_29to45 have a correlation of 0.97
rating_M and rating_M_29to45 have a correlation of 0.99
rating_F and rating_F_18to29 have a correlation of 0.96
rating_F and rating_F_29to45 have a correlation of 0.97
rating_18to29 and rating_M_18to29 have a correlation of 0.98
rating_18to29 and rating_29to45 have a correlation of 0.95
rating_M_18to29 and rating_M_29to45 have a correl

In [4]:
# Rename some columns.
tv_df.rename(columns={
    "genre_arts": "genre_martial_arts",
    "genre_fiction": "genre_science_fiction",
    "genre_interest": "genre_special_interest"
})

# Remove useless columns.
cols_to_remove = ["name", "genre_martial", "genre_science", "genre_special", "genre_", "series_name", "banner", "fanart", "overview", "poster", "first_aired"]
tv_df.drop(cols_to_remove, axis=1, inplace=True)

In [5]:
tv_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1955 entries, tt0092337 to tt9900092
Data columns (total 76 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name               1953 non-null   object 
 1   type               1953 non-null   object 
 2   start_year         1953 non-null   float64
 3   end_year           1086 non-null   float64
 4   ep_length          1924 non-null   float64
 5   n_seasons          1952 non-null   float64
 6   n_episodes         1953 non-null   float64
 7   popularity_rank    1773 non-null   float64
 8   n_ratings          1953 non-null   float64
 9   rating_avg         1953 non-null   float64
 10  rating_top1000     1953 non-null   float64
 11  rating_us          1953 non-null   float64
 12  rating_row         1953 non-null   float64
 13  rating_M           1953 non-null   float64
 14  rating_F           1953 non-null   float64
 15  rating_0to18       1772 non-null   float64
 16  rating_M_0to18  

In [6]:
# Let's have a look at the remaining categorical features.
tv_df.describe(include="object")

Unnamed: 0,name,type,network,rating,status
count,1953,1953,1897,1731,1904
unique,1924,2,196,6,2
top,Teenage Mutant Ninja Turtles,TV Series,Netflix,TV-14,Ended
freq,2,1722,183,682,1482


In [None]:
tv_df.drop("name", axis=1, inplace=True)

In [7]:
y = tv_df["my_rating"]
rated_tv_series = y[y.notna()]
train_dataset = tv_df.loc[rated_tv_series.index, :]

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

NameError: name 'numerical_cols' is not defined