In [1]:
import sqlite3
from keybert import KeyBERT

import numpy as np
import pandas as pd
import nltk

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

import optuna
import lightgbm as lgb
from sklearn.model_selection import KFold
import spacy

  from .autonotebook import tqdm as notebook_tqdm


### Helper functions to access sql database

In [2]:
sqldb_path = 'data/rating_database.db'

def insert_into_sql(statement, db_path = sqldb_path):
    """
    Insert or update sql database based on the statement parameter

    Parameters
    ----------
    statement : str
        The statement to execute
    db_path : str
        Path to sqlite database that you want to execute the statements

    Returns
    -------
    None
    """
    
    try:
        sqliteConnection = sqlite3.connect(db_path)
        cursor = sqliteConnection.cursor()
        cursor.execute(statement)
        sqliteConnection.commit()
        cursor.close()
    except sqlite3.Error as error:
        print("Error while connecting to sqlite", error)


def fetch_from_sql(statement, db_path = sqldb_path):
    """
    Fetch from sql database based on the statement parameter

    Parameters
    ----------
    statement : str
        The statement to execute
    db_path : str
        Path to sql database that you want to execute the statements

    Returns
    -------
    fetch_items : list
        A list of elements that match the statement from sql database
    """
    
    try:
        sqliteConnection = sqlite3.connect(db_path)
        cursor = sqliteConnection.cursor()
        cursor.execute(statement)
        fetch_items = cursor.fetchall()
        cursor.close()

        return fetch_items
    except sqlite3.Error as error:
        print("Error while connecting to sqlite", error)

## First, divide all movies into training and test set

In [3]:
all_movie_ids = 'SELECT movie_id FROM Movie'
movie_ids = [str(row[0]) for row in fetch_from_sql(all_movie_ids)]

np.random.seed(42)
train_ids = np.random.choice(movie_ids, size=int(len(movie_ids)*0.8), replace=False)
test_ids = list(set(movie_ids).difference(set(train_ids)))

## Then, create train and test sets using the ids

In [4]:
# Add stopwords and names to unwanted. Names are added because reviews are 
# likely to have a lot of different actors and actress names in them.
unwanted = nltk.corpus.stopwords.words("english")
unwanted.extend([w.lower() for w in nltk.corpus.names.words()])
print(f'Current unwanted length: {len(unwanted)}')

# Loading spacy for lemmatization
spacy_tokenization = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def get_data(ids, features):
    """ 
    Retrives features from Movie table that belongs to given ids.
    """

    ids = [str(one_id) for one_id in ids]
    df = pd.DataFrame(fetch_from_sql((
            f'SELECT {", ".join(features)} '
            f'FROM Movie WHERE movie_id IN ({",".join(ids)})'
        )))
    df.columns = features

    return df


def preprocess_data(train_df, test_df):
    """
    Returns train and test df after replacing minor categories in 
    movie_rating and movie_genres. For movie_rating, a minor category
    is when it appears fewer than 20 times in the train_df. For
    movie_genres, the threshold is 80. This is an arbitrary choice.
    """

    movie_rating_vc = train_df['movie_rating'].value_counts()
    minor_ratings = movie_rating_vc[movie_rating_vc < 20].index
    train_df = train_df.replace(minor_ratings, 'etc')
    test_df = test_df.replace(minor_ratings, 'etc')

    all_genres = train_df[['movie_genre',
                           'movie_subgenre',
                           'movie_sub2genre']].to_numpy().flatten()
    genre_vc = pd.Series(all_genres).value_counts()
    exclude_genres = genre_vc[genre_vc < 80].index
    train_df = train_df.replace(exclude_genres, 'None')
    test_df = test_df.replace(exclude_genres, 'None')

    return train_df, test_df


def skip_unwanted(pos_tuple):
    """
    Returns a boolean value depending on whether it should skip
    a word or not.
    """

    word, tag = pos_tuple
    if not word.isalpha() or word in unwanted:
        return False
    if tag.startswith('NN'):
        return False
    return True


def filter_and_lemmatize(text, nlp=spacy_tokenization):
    """
    Filters unwanted words and lemmatizes words in the 
    given text using the nlp parameter. The default is 
    spacy_tokenization.
    """

    text = text.lower()
    words = ' '.join([word for word, tag in filter(skip_unwanted, nltk.pos_tag(text.split(' ')))])
    nlp.max_length = len(words) + 100
    spacy_words = nlp(words)
    return ' '.join([token.lemma_ for token in spacy_words])


def extract_text_features(df):
    """
    Extract stored critic reviews and text sentiment analysis 
    result for each movie and store combined reviews and mean 
    and standard deviation of each sentiment analysis scores 
    in a dataframe. The scores are negative, neutral, positive, 
    and compound score. 
    """

    text_features = {}
    for _, row in df.iterrows():
        creviews = pd.DataFrame(fetch_from_sql((
                        'SELECT critic_review, neg, neu, pos, compound '
                        f'FROM Critics WHERE movie_id == {row["movie_id"]}'
                    )))
        try:
            creviews.columns = ['critic_review', 'neg', 'neu', 'pos', 'compound']
            text_features[row['movie_id']] = [' '.join(creviews['critic_review']),
                                            creviews['neg'].mean(),
                                            creviews['neg'].std(),
                                            creviews['neu'].mean(),
                                            creviews['neu'].std(),
                                            creviews['pos'].mean(),
                                            creviews['pos'].std(),
                                            creviews['compound'].mean(),
                                            creviews['compound'].std()]
        except:
            continue
    text_features_df = pd.DataFrame(text_features).transpose()
    text_features_df.columns = ['reviews', 'mean_neg', 'std_neg',
                                'mean_neu', 'std_neu', 'mean_pos', 'std_pos',
                                'mean_compound', 'std_compound']
    
    # lemmatize
    text_features_df['reviews'] = text_features_df['reviews'].apply(filter_and_lemmatize)

    return text_features_df

Current unwanted length: 8123


In [5]:
# features to extract from the Movie table
features = ['movie_id', 'movie_year', 'movie_critic_rating', 'movie_user_rating',
            'movie_genre', 'movie_subgenre', 'movie_sub2genre', 'movie_rating',
            'num_critic_pos', 'num_critic_mix', 'num_critic_neg']

# get features from the Movie table with train_ids and test_ids
train_df = get_data(train_ids, features)
test_df = get_data(test_ids, features)

# preprocess df and extract text features
train_df, test_df = preprocess_data(train_df, test_df)
test_text_fea = extract_text_features(test_df)
train_text_fea = extract_text_features(train_df)

# combine the two dataframes
train_df = train_df.merge(train_text_fea, how='inner', left_on='movie_id', right_index=True)
test_df = test_df.merge(test_text_fea, how = 'inner', left_on='movie_id', right_index=True)

## Keywords extraction using Keybert

In [6]:
def extract_keywords(corpora, kw_model):
    """
    For each corpus in corpora, the function extracts 20 
    diversified unigram keywords. Then, it returns 
    them in a list of tuples. Each tuple contains a keyword
    and its distance from a corpus.
    """

    result = []
    for corpus in corpora:
        kw = kw_model.extract_keywords(corpus, 
                                       keyphrase_ngram_range=(1, 1), 
                                       stop_words='english',
                                       use_maxsum=True,
                                       top_n=20)
        result.append(kw)

    return result

In [7]:
# Getting combined lemmatized critic reviews for each user rating
corpora = []
for rating in np.arange(0.0, 10.1, 0.1):
    corpus = ' '.join(train_df[train_df['movie_user_rating'] == np.round(rating, 1)]['reviews'])
    corpora.append(corpus)

# Extracting 20 unigram keywords using KeyBert
# https://github.com/MaartenGr/KeyBERT
kw_model = KeyBERT(model='all-MiniLM-L6-v2')
keywords = extract_keywords(corpora, kw_model)

# only keep keywords that appear more than twice in corpora
all_keywords = pd.Series([pair[0] for rating in keywords for pair in rating])
all_keywords_vc = all_keywords.value_counts()
words_to_care = all_keywords_vc[all_keywords_vc>2].index

In [8]:
def category_encoding(df, features, values_to_exclude):
    onehot = {}
    unique_categories = np.unique(df[features].to_numpy().flatten())
    unique_categories = np.delete(unique_categories, np.where(unique_categories == values_to_exclude))
    for category in unique_categories:
        result = []
        for _, row in df[features].iterrows():
            result.append(category in row.values)
        onehot[category] = result

    return pd.DataFrame(onehot).set_index(df['movie_id'])


def word_appearance(df, words_to_care, kw_model):
    def does_appear(row, word):
        try:
            return dict(row)[word]
        except:
            return 0

    reviews_keywords = df['reviews'].apply(lambda row: kw_model.extract_keywords(row, 
                                                        keyphrase_ngram_range=(1, 1), 
                                                        stop_words='english',
                                                        use_maxsum=True,
                                                        top_n=20))
    result = {}
    for word in words_to_care:
        result[word] = reviews_keywords.apply(lambda row: does_appear(row, word))

    return pd.DataFrame(result).set_index(df['movie_id'])


def create_set(df, kw_model, words_to_care):
    genre_df = category_encoding(df, ['movie_genre', 'movie_subgenre', 'movie_sub2genre'], 'None')
    rating_df = category_encoding(df, ['movie_rating'], 'etc')
    word_appearance_df = word_appearance(df, words_to_care, kw_model)

    df = df.merge(genre_df, left_on='movie_id', right_index=True)\
           .merge(rating_df, left_on='movie_id', right_index=True)\
           .merge(word_appearance_df, left_on='movie_id', right_index=True)

    return df


def create_set_tfidf(df, tfidf_df):
    genre_df = category_encoding(df, ['movie_genre', 'movie_subgenre', 'movie_sub2genre'], 'None')
    rating_df = category_encoding(df, ['movie_rating'], 'etc')

    df = df.merge(genre_df, left_on='movie_id', right_index=True)\
           .merge(rating_df, left_on='movie_id', right_index=True)\
           .merge(tfidf_df, left_on='movie_id', right_index=True)
    
    # convert values retrieved from sql database into float
    obj_cols = ['mean_neg', 'std_neg', 'mean_neu', 'std_neu',
                'mean_pos', 'std_pos', 'mean_compound', 'std_compound']
    df[obj_cols] = df[obj_cols].apply(pd.to_numeric)

    # drop already processed columns
    columns_to_drop = ['movie_id', 'reviews', 'movie_genre', 
                        'movie_subgenre', 'movie_sub2genre', 
                        'movie_rating']
    df = df.drop(columns_to_drop, axis=1)

    return df

In [9]:
train_final_df = create_set(train_df, kw_model, words_to_care)
test_final_df = create_set(test_df, kw_model, words_to_care)
test_final_df = test_final_df.drop(['Reality-TV'], axis=1)

In [10]:
train_final_df.head(3)

Unnamed: 0,movie_year,movie_critic_rating,movie_user_rating,num_critic_pos,num_critic_mix,num_critic_neg,mean_neg,std_neg,mean_neu,std_neu,...,weary,lackluster,meticulous,superficial,imaginative,extraordinary,meaningful,seductive,dullish,rambunctious
0,2014,57.0,7.4,4,10,0,0.171714,0.10043,0.689357,0.151849,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2006,61.0,8.1,5,2,1,0.2215,0.192903,0.68025,0.167403,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2009,31.0,5.7,2,7,9,0.104111,0.110915,0.723222,0.152855,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
test_final_df.head(3)

Unnamed: 0,movie_year,movie_critic_rating,movie_user_rating,num_critic_pos,num_critic_mix,num_critic_neg,mean_neg,std_neg,mean_neu,std_neu,...,weary,lackluster,meticulous,superficial,imaginative,extraordinary,meaningful,seductive,dullish,rambunctious
0,1950,74.0,7.8,11,3,0,0.144455,0.160569,0.711091,0.20661,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2022,45.0,6.2,7,33,4,0.070932,0.097436,0.800568,0.128292,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2019,84.0,4.8,49,3,0,0.049481,0.058409,0.736481,0.131844,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# saving for future use
train_final_df.to_csv('data/train_final.csv', index=False)
test_final_df.to_csv('data/test_final.csv', index=False)

## Split dataset into X and y

In [13]:
def split_dataset(df, target='movie_user_rating'):
    X = df.drop(columns=[target])
    y = df[target]

    return X, y

X_train, y_train = split_dataset(train_final_df)
X_test, y_test = split_dataset(test_final_df)

## Hyperparameter tuning

`n_estimators`: Controls the number of decision trees. Typically, high number of n_estimators is used with early stopping.

`learning_rate`: Controls the step size of the gradient descent. Need to tune this to avoid overfitting since LGBM builds tree to correct the errors of the previous trees. 

`num_leaves`: Controls the number of decision leaves in a single tree. According to LGBM documentation (check the link), the maximum limit of nuim_leaves should be 2^(max_depth)

`max_depth`: Controls the levels of tree. Higher max_depth means more complex tree and prone to overfit. Lower max_depth means less complex and prone to underfit.

`max_bin`: Controls the max number of bins when converting continuous variables into discrete.

`reg_alpha`: L1 regularization. A good search range is (0, 100). It adds the absolute value of magnitude of the coefficient as a penalty term to the loss function.

`reg_lambda`: L2 regularization. A good search range is (0, 100). It adds the squared magnitude of the coefficient as a penalty term to the loss function.

`min_split_gain`: Controls the minimum gain required to perform a split during the tree building process. It is used as a stopping criterion to prevent overfitting.

`subsample`: Controls the percentage of training samples to use when training each tree.

`colsample_bytree`: Controls the percentage of features to sample when training each tree.

`min_child_samples`: Controls the minimum number of data points needed in a leaf node, which helps to prevent overfitting.

In [14]:
def objective(trial, data = X_train, target = y_train):
    param_grid = {
        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 2000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "max_bin": trial.suggest_int("max_bin", 200, 300),
        "reg_alpha": trial.suggest_int("reg_alpha", 0, 100, step=5),
        "reg_lambda": trial.suggest_int("reg_lambda", 0, 100, step=5),
        "min_split_gain": trial.suggest_float("min_split_gain", 0, 15),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0, step=0.1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0, step=0.1),
        "min_child_samples": trial.suggest_int("min_child_samples", 1,500),
    }

    cv = KFold(n_splits=5, shuffle=True)
    cv_scores = np.empty(5)

    for idx, (train_idx, val_idx) in enumerate(cv.split(data, target)):
        X_t, X_v = data.iloc[train_idx], data.iloc[val_idx]
        y_t, y_v = target.iloc[train_idx], target.iloc[val_idx]

        model = lgb.LGBMRegressor(**param_grid)
        model.fit(X_t, y_t, eval_set=[(X_v, y_v)], eval_metric='mape',
                  callbacks=[lgb.early_stopping(stopping_rounds=100),
                             lgb.log_evaluation(period=0)])
        preds = model.predict(X_v)
        cv_scores[idx] = mean_absolute_error(y_v, preds)
    
    return np.mean(cv_scores)

In [None]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=300)

In [16]:
optuna.visualization.plot_optimization_history(study)

In [17]:
optuna.visualization.plot_param_importances(study)

In [18]:
best_trial = study.best_trial.params
best_trial["n_estimators"] = 10000
best_trial

{'n_estimators': 10000,
 'learning_rate': 0.1417570120141957,
 'num_leaves': 200,
 'max_depth': 4,
 'max_bin': 281,
 'reg_alpha': 0,
 'reg_lambda': 45,
 'min_split_gain': 1.4251688006312029,
 'subsample': 0.5,
 'colsample_bytree': 1.0,
 'min_child_samples': 43}

In [19]:
# final training with the best parameters and
# predict with the test set divided in the beginning
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True)

model = lgb.LGBMRegressor(**best_trial, early_stopping_rounds=100)
model.fit(X_train, y_train, eval_set=[(X_val, y_val)],
          callbacks=[lgb.log_evaluation(period=0)])
y_pred = model.predict(X_test)
test_mae = mean_absolute_error(y_test, y_pred)



In [20]:
feature_importance = model.feature_importances_

# Create a dataframe to display feature importance
importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
importance_df[importance_df['Importance'] > 0]

Unnamed: 0,Feature,Importance
0,movie_year,102
1,movie_critic_rating,83
2,num_critic_pos,43
4,num_critic_neg,28
9,mean_pos,26
7,mean_neu,23
10,std_pos,21
11,mean_compound,18
12,std_compound,16
8,std_neu,13


In [21]:
train_critic_user_ratings = fetch_from_sql(f'SELECT movie_critic_rating, movie_user_rating FROM Movie WHERE movie_id IN ({",".join(train_ids)})')
critic = [r[0]/10 for r in train_critic_user_ratings]
user = [r[1] for r in train_critic_user_ratings]
default_mae = mean_absolute_error(user, critic)

print(f"Default MAE if we assume critic rating equals to user rating: {default_mae}")
print(f"Test MAE : {test_mae}")
print(f"Improvement: {np.round((default_mae - test_mae) / default_mae, 4)*100}%")

Default MAE if we assume critic rating equals to user rating: 1.368877016551435
Test MAE : 0.8530118478423075
Improvement: 37.69%


### As a result of feature engineering and hyperparameter tuning, I'm able to predict the user rating of a movie with critic reviews with a 0.853 mean absolute error. It is a 37.69% improvement compared to believing the user rating is equal to the critic rating.