The goal is to predict user's rating of the film

Data from: https://datasets.imdbws.com/ https://ieee-dataport.org/open-access/imdb-users-ratings-dataset

In [1]:
import numpy as np
import pandas as pd
# import imdb
import xgboost as xgb
import eli5
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error

# Read users' rating data

In [2]:
dataset = np.load("data/Dataset.npy")
splitted_dataset = []
for index, elem in enumerate(dataset):
    splitted_dataset.append(elem.split(','))

df = pd.DataFrame(splitted_dataset, columns = ['userID', 'movieID', 'rating', 'review date'])

df.rating = df.rating.astype(int)
df["review_datetime"] = pd.to_datetime(df["review date"])
df["review_year"] = df.review_datetime.map(lambda x: x.year)

In [3]:
df.sample(3)

Unnamed: 0,userID,movieID,rating,review date,review_datetime,review_year
1363817,ur12844960,tt0192162,3,21 September 2011,2011-09-21,2011
3128565,ur70420627,tt8413338,9,24 February 2019,2019-02-24,2019
1683875,ur2419668,tt2234155,4,3 October 2013,2013-10-03,2013


# Choose user with the highest number of ratings

In [4]:
ordered_df = df.groupby('userID').count().sort_values('movieID', ascending=False)
userID = ordered_df.index[0]
ordered_df.iloc[0, ]
df = df[df['userID'] == userID]
df.drop("userID", axis=1, inplace=True)

In [5]:
print("We'll predict for user", userID)
df.describe()

We'll predict for user ur2467618


Unnamed: 0,rating,review_year
count,24145.0,24145.0
mean,6.210147,2012.084241
std,2.254549,4.227211
min,1.0,2003.0
25%,5.0,2009.0
50%,7.0,2012.0
75%,8.0,2015.0
max,10.0,2020.0


In [6]:
# cut first two letters from titles
# 
# ia = imdb.IMDb()
# 
# df['movieID'] = df['movieID'].agg(lambda x: x[2:])
# df['movieID'] = df['movieID'].astype('uint16')
# df['genre'] = df['movieID'].agg(ia.get_movie)

# Read movies' data

In [7]:
df_movies = pd.read_csv("data/title.basics.tsv", sep='\t', 
    dtype={'tconst': str, 'titleType': 'category', 'originalTitle':str}, low_memory=False
)

In [8]:
df_movies.sample(3)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
7070885,tt8240994,tvEpisode,Episode #1.6,Episode #1.6,0,2018,\N,29,"Comedy,Game-Show"
5834625,tt5544708,tvEpisode,8 Weird Ways to Open a Bottle,8 Weird Ways to Open a Bottle,0,2014,\N,\N,"Comedy,Family,Talk-Show"
5667176,tt5170618,tvEpisode,Michelle Buteau,Michelle Buteau,0,2015,\N,\N,"Comedy,Talk-Show"


In [9]:
df_movies["titleType_cat"] = df_movies.titleType.factorize()[0]

In [10]:
df_movies["isAdult"] = df_movies["isAdult"].replace("\\N", -1).astype(int)
df_movies.isAdult.value_counts()

 0       7622378
 1        239672
 2019          3
 2020          2
 1981          1
 2005          1
 2014          1
 2017          1
-1             1
Name: isAdult, dtype: int64

Niektóre wiersze nie dzielą się poprawnie przy wczytywaniu, stąd rok przesuwa się do kolumny isAdult. Na razie usuwam te wiersze, bo nasz użytkownik i tak nie ocenił tych filmów. Ale w przyszłości trzeba coś z tym zrobić.

In [11]:
df_movies = df_movies[(df_movies.isAdult == 0) | (df_movies.isAdult == 1)]
df_movies[(df_movies.isAdult > 1) | (df_movies.isAdult == -1)].primaryTitle.map(lambda x: len(x.split("\t")))
 #todo: fix these rows - wrong splitting while reading (why?)

Series([], Name: primaryTitle, dtype: object)

In [12]:
for column in ["startYear", "endYear", "runtimeMinutes"]:
    df_movies[column] = df_movies[column].replace("\\N", "-1").astype(int)

In [13]:
df_movies = df_movies.rename(columns={"tconst": "movieID"})

In [14]:
df_movies[df_movies.endYear != -1].titleType.value_counts()

tvSeries        61191
tvMiniSeries    14793
movie               0
short               0
tvEpisode           0
tvMovie             0
tvShort             0
tvSpecial           0
video               0
videoGame           0
Name: titleType, dtype: int64

Only tvseries have endYear.

## Genres

split genres column into separate boolean columns

In [15]:
df_movies["genres_list"] = df_movies.genres.map(lambda x: x.split(','))
# https://towardsdatascience.com/dealing-with-list-values-in-pandas-dataframes-a177e534f173

def boolean_df(list_col):
    ### create truth table for items in list
    unique_items = pd.Series([x for _list in list_col for x in _list]).unique()
    bool_dict = {}
    
    for item in unique_items:
        bool_dict[item] = list_col.apply(lambda x: item in x)   # Apply boolean mask
            
    return pd.DataFrame(bool_dict)
    
df_genres = boolean_df(df_movies.genres_list)

df_movies = pd.concat([df_movies, df_genres], axis=1)

In [16]:
df_movies.head()

Unnamed: 0,movieID,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,titleType_cat,...,Mystery,\N,Sci-Fi,Thriller,Musical,Film-Noir,Game-Show,Talk-Show,Reality-TV,Adult
0,tt0000001,short,Carmencita,Carmencita,0,1894,-1,1,"Documentary,Short",0,...,False,False,False,False,False,False,False,False,False,False
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,-1,5,"Animation,Short",0,...,False,False,False,False,False,False,False,False,False,False
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,-1,4,"Animation,Comedy,Romance",0,...,False,False,False,False,False,False,False,False,False,False
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,-1,12,"Animation,Short",0,...,False,False,False,False,False,False,False,False,False,False
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,-1,1,"Comedy,Short",0,...,False,False,False,False,False,False,False,False,False,False


# Read title ratings
and merge them with movies data

In [17]:
df_ratings = pd.read_csv("data/title.ratings.tsv", sep="\t", dtype={"averageRating": float, "numVotes": int})
df_ratings = df_ratings.rename(columns={"tconst": "movieID"})

df_movies_all = pd.merge(df_movies, df_ratings, on='movieID')

In [18]:
df_movies_all.head(3)
len(df_movies)
df_movies.shape

(7862050, 40)

In [19]:
df_crew = pd.read_csv("data/title.crew.tsv", sep="\t")
df_crew = df_crew.rename(columns={"tconst": "movieID"})

In [28]:
# df_crew.directors.value_counts()
# df_crew.directors.map(lambda x:  x if len(x) > 11 else 0).value_counts()

df_crew["director1"] = df_crew.directors.map(lambda x: x.split(",")[0]).factorize()[0]
#df_crew
df_movies_all = pd.merge(df_movies, df_crew[["movieID", "director1"]], on='movieID')

# Join user with movies (SQL <3)

In [29]:
df_user_movies = pd.merge(df, df_movies_all, on='movieID')
# df_user_movies.drop(["movieID"], axis=1, inplace=True)

In [30]:
df_user_movies.sample(3)

Unnamed: 0,movieID,rating,review date,review_datetime,review_year,titleType,primaryTitle,originalTitle,isAdult,startYear,...,\N,Sci-Fi,Thriller,Musical,Film-Noir,Game-Show,Talk-Show,Reality-TV,Adult,director1
16951,tt0708554,9,7 January 2015,2015-01-07,2015,tvEpisode,In Purgatory's Shadow,In Purgatory's Shadow,0,1997,...,False,False,False,False,False,False,False,False,False,11943
20232,tt2771200,9,27 March 2017,2017-03-27,2017,movie,Beauty and the Beast,Beauty and the Beast,0,2017,...,False,False,False,True,False,False,False,False,False,15685
4285,tt0493418,9,18 February 2008,2008-02-18,2008,short,Fumi and the Bad Luck Foot,Fumi and the Bad Luck Foot,0,2005,...,False,False,False,False,False,False,False,False,False,100817


In [31]:
df_user_movies.loc[df_user_movies.movieID == "tt0769345", "startYear"] = 2009 # taken from https://www.imdb.com/title/tt0769345/releaseinfo?ref_=tt_dt_rdat

W isAdult są same 0 - nie ogląda filmów dla dorosłych.

W endYear  23569 brakujące wartości.

# Add new columns

In [32]:
df_user_movies["duration"] = df_user_movies.apply(lambda x: x.endYear - x.startYear if x.endYear != -1 else -1, axis=1)

In [33]:
df_user_movies.titleType.value_counts()

movie           12763
short            5476
tvEpisode        4521
tvMovie           717
video             275
tvSeries          254
tvMiniSeries       68
tvSpecial          41
tvShort            27
videoGame           0
Name: titleType, dtype: int64

# Train model
I don't really know which model to choose, so I chose the same as on the webinar. I have to read about it.

In [34]:
def check_model(model, df, feats: list, target: str, cv=5, scoring="neg_mean_absolute_error", show_eli5=True, export_preds=False, cols_to_export = ['movieID']):
    ### dzieli dane na treningowe i testowe, przeprowadza cross validation 
    
    # podział danych
    X_train, X_test, y_train, y_test = train_test_split(df[feats], list(df[target]), test_size=0.3, random_state=42)

    # testy
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring=scoring)
    result = np.mean(scores), np.std(scores)
    
    model.fit(X_train, y_train)

    if show_eli5: # pokaż dane do debugowania modelu
        print("result:", result)
        return eli5.show_weights(model, feature_names=feats)

    if export_preds:
        df["rating_predicted"] = model.predict(X_test)
        cols_to_export.append("rating_predicted")
        df[cols_to_export].to_csv("output/predictions.csv", index=False)
    
    return result

In [35]:
model = xgb.XGBRegressor()
unused_cols = ['movieID', 'rating', 'review date', 'review_datetime', 'titleType', 'primaryTitle', 'originalTitle', 'genres', 'genres_list', '\\N']
feats = [x for x in df_user_movies.columns if x not in unused_cols]
print(feats)

df_user_movies[feats]

check_model(model, df_user_movies, feats, 'rating')

['review_year', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'titleType_cat', 'Documentary', 'Short', 'Animation', 'Comedy', 'Romance', 'Sport', 'News', 'Drama', 'Fantasy', 'Horror', 'Biography', 'Music', 'War', 'Crime', 'Western', 'Family', 'Adventure', 'Action', 'History', 'Mystery', 'Sci-Fi', 'Thriller', 'Musical', 'Film-Noir', 'Game-Show', 'Talk-Show', 'Reality-TV', 'Adult', 'director1', 'duration']
result: (-1.653673303666015, 0.01656382448326479)


Weight,Feature
0.0775,titleType_cat
0.0645,Documentary
0.0599,Drama
0.0583,Sci-Fi
0.0522,Family
0.0516,Horror
0.0447,Film-Noir
0.0380,startYear
0.0325,Biography
0.0309,Short


# #todo:
- add columns: how many years passed from review, how old is the film, how long lasted the series, difference between mean and his rating
- maybe combine some genres together?
- choose proper model, adjust hyperparameters


['review_year', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'titleType_cat', 'Documentary', 'Short', 'Animation', 'Comedy', 'Romance', 'Sport', 'News', 'Drama', 'Fantasy', 'Horror', 'Biography', 'Music', 'War', 'Crime', 'Western', 'Family', 'Adventure', 'Action', 'History', 'Mystery', 'Sci-Fi', 'Thriller', 'Musical', 'Film-Noir', 'Game-Show', 'Talk-Show', 'Reality-TV', 'Adult', 'averageRating', 'numVotes']
result: (-1.4011508285221765, 0.009833111243021959)


['review_year', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'titleType_cat', 'Documentary', 'Short', 'Animation', 'Comedy', 'Romance', 'Sport', 'News', 'Drama', 'Fantasy', 'Horror', 'Biography', 'Music', 'War', 'Crime', 'Western', 'Family', 'Adventure', 'Action', 'History', 'Mystery', 'Sci-Fi', 'Thriller', 'Musical', 'Film-Noir', 'Game-Show', 'Talk-Show', 'Reality-TV', 'Adult', 'numVotes'] -1.618800038957057, 0.019937683298721393 (bez averageRating)


