Authors:
    <br>Alejandro Alvarez (axa)
    <br>Brenda Palma (bpalmagu)

# <center>ML-Jokes: Content-Based Filtering</center>

In [2]:
# Path to ml-jokes folder
import os
if os.getcwd().split('/')[-2] == 'ml-jokes': os.chdir('..')
print(f'Current directory: {os.getcwd()}')
assert set(['data', 'mljokes', 'environment.yml', 'nbs']) <= set(os.listdir()), \
    'Wrong path; go to ./heinz-95729-project/api/ml-jokes'

Current directory: /home/alejandroxag/my_files/heinz-95729-project/api/ml-jokes


In [3]:
# imports
import time
import tqdm
import pickle
import numpy as np
import pandas as pd
from mljokes.topics import get_lda_topics
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from mljokes.data import read_jokes, read_ratings
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingRegressor

# Regression model inputs

In [4]:
def get_reg_model_inputs(n_topics=5, random_state=0):

    # Load jokes and ratings
    jokes_df = read_jokes()
    ratings_df = read_ratings()

    # Topics
    topics = pd.DataFrame(get_lda_topics(text=jokes_df.clean_text, 
                                         n_topics=n_topics, 
                                         random_state=random_state)) \
               .reset_index().rename(columns={'index': 'joke_id'})
    topics.loc[:, 'joke_id'] = topics.joke_id + 1

    # User-joke information
    user_joke_info = ratings_df.copy()
    user_joke_info.reset_index(inplace=True)
    user_joke_info.rename(columns={'index': 'user_id'}, inplace=True)
    user_joke_info.drop(columns='count_rated', inplace=True)
    user_joke_info = pd.melt(user_joke_info, 
                                id_vars=['user_id'],
                                var_name='joke_id',
                                value_vars=user_joke_info.columns[1:],
                                value_name='rating')
    user_joke_info = user_joke_info.loc[user_joke_info.rating != 99.0]
    user_joke_info.sort_values(by=['user_id', 'joke_id'], inplace=True)
    user_joke_info = user_joke_info.merge(topics, how='left', on='joke_id')
    user_joke_info.rename(columns=dict(zip(user_joke_info.columns[3:],
                                    ['topic_percentage_' + c  for c in np.array(user_joke_info.columns[3:], dtype=str)])), 
                        inplace=True)
    user_joke_info.reset_index(drop=True, inplace=True)

    del jokes_df, ratings_df

    # User profiles
    user_profiles = user_joke_info.iloc[:, :2]
    user_profiles.loc[:, list(range(len(user_joke_info.columns[3:])))] = \
        (user_joke_info.rating / 10).values.reshape((-1, 1)) * user_joke_info.iloc[:, 3:].values

    user_profiles = user_profiles.groupby(['user_id'])[user_profiles.columns[2:]].agg('mean').reset_index()
    user_profiles.rename(columns=dict(zip(user_profiles.columns[1:],
                                    ['topic_user_rating_' + c for c in np.array(user_profiles.columns[1:], dtype=str)])), 
                        inplace=True)

    # User-joke inputs
    user_joke_inputs = user_joke_info.merge(user_profiles, how='left', on='user_id')

    user_joke_idxs = user_joke_inputs.loc[:, ['user_id', 'joke_id']]
    y = user_joke_inputs.rating.values
    X = user_joke_inputs.iloc[:, 3:].values

    del user_joke_info, user_profiles, user_joke_inputs

    return X, y, user_joke_idxs, topics
    

# Regression model (HGB)

### Train-test split

In [5]:
# Data inputs
X, y, user_joke_idxs, topics = get_reg_model_inputs(n_topics=10)

# Data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, shuffle=True)

./data/ratings/jester-data-3.xls
./data/ratings/jester-data-2.xls
./data/ratings/jester-data-1.xls


### Optuna hypertuning

In [None]:
import optuna

def tune(objective, n_trials=10):
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials)

    params = study.best_params
    best_score = study.best_value
    print(f"Best score: {best_score}\n")
    print(f"Optimized parameters: {params}\n")
    return params


param_grid = {
              'learning_rate': [0.01, 0.1, 0.3],
              'max_depth': [5, 7, 13, None],
              'l2_regularization': [0.01, 0.01, 0.1, 1]
             }

def hgb_objective(trial):
    _learning_rate = trial.suggest_loguniform('learning_rate', low= )
    _n_estimators = trial.suggest_int("n_estimators", 50, 200)
    _max_depth = trial.suggest_int("max_depth", 5, 20)
    _min_samp_split = trial.suggest_int("min_samples_split", 2, 10)
    _min_samples_leaf = trial.suggest_int("min_samples_leaf", 2, 10)
    _max_features = trial.suggest_int("max_features", 10, 50)

    rf = RandomForestRegressor(
        max_depth=_max_depth,
        min_samples_split=_min_samp_split,
        min_samples_leaf=_min_samples_leaf,
        max_features=_max_features,
        n_estimators=_n_estimators,
        n_jobs=-1,
        random_state=RANDOM_SEED,
    )

    scores = cross_val_score(
        rf, X, y, cv=kfolds, scoring="neg_root_mean_squared_error"
    )
    return scores.mean()

In [13]:
subsample = np.random.randint(low=0, high=len(y_train), size=round(0.20 * len(y_train)))
X_gs, y_gs = X_train[subsample], y_train[subsample]

fixed_params = {
                'loss': 'absolute_error',
                'max_iter': 100_000,
                'early_stopping': True,
                'scoring': 'loss',
                'validation_fraction': 0.20,
                'tol': 1e-6,
                'n_iter_no_change': 20,
                'verbose': 4,
                'random_state': 0
               }

param_grid = {
              'learning_rate': [0.01, 0.1, 0.3],
              'max_depth': [5, 7, 13, None],
              'l2_regularization': [0.01, 0.01, 0.1, 1]
             }

hgb_gs = HistGradientBoostingRegressor().set_params(**fixed_params)

grid_search = GridSearchCV(hgb_gs, 
                           param_grid=param_grid, 
                           n_jobs=-1, 
                           refit=False, 
                           cv=[(slice(None), slice(None))],
                           verbose=4)

start_time = time.time()
grid_search.fit(X_gs, y_gs)
elapsed_time = time.time() - start_time
print(f'\nElapsed time: {elapsed_time:0.2f} seconds.\n')

with open('./results/hgb_26nov1100.pkl', 'wb') as f: 
  pickle.dump({'gs_object': grid_search, 'elapsed_time': elapsed_time}, f)

Fitting 1 folds for each of 48 candidates, totalling 48 fits


KeyboardInterrupt: 

### Model fitting

In [None]:
# GS parameters loading
with open('./results/hgb_26nov1100.pkl', 'rb') as f: 
  grid_search = pickle.load(f)['gs_object']

# Train
hgb = HistGradientBoostingRegressor(**{**fixed_params, **grid_search.best_params_})
hgb.fit(X_train, y_train)

# Test
y_true = y_test
y_pred = hgb.predict(X_test)
print(f'MAE: {mean_absolute_error(y_true, y_pred)}')
predictions_df = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred})
predictions_df.insert(2, 'error', np.abs(predictions_df.y_true - predictions_df.y_pred))
display(predictions_df)

Binning 0.212 GB of training data: 1.431 s
Binning 0.053 GB of validation data: 0.354 s
Fitting gradient boosted rounds:
[1/100000] 1 tree, 31 leaves, max depth = 7, train loss: 4.28046, val loss: 4.27800, in 0.568s
[2/100000] 1 tree, 31 leaves, max depth = 7, train loss: 4.16982, val loss: 4.16712, in 0.569s
[3/100000] 1 tree, 31 leaves, max depth = 7, train loss: 4.07496, val loss: 4.07216, in 0.597s
[4/100000] 1 tree, 31 leaves, max depth = 7, train loss: 3.99530, val loss: 3.99236, in 0.567s
[5/100000] 1 tree, 31 leaves, max depth = 7, train loss: 3.92595, val loss: 3.92283, in 0.576s
[6/100000] 1 tree, 31 leaves, max depth = 7, train loss: 3.86717, val loss: 3.86393, in 0.567s
[7/100000] 1 tree, 31 leaves, max depth = 7, train loss: 3.81588, val loss: 3.81260, in 0.566s
[8/100000] 1 tree, 31 leaves, max depth = 7, train loss: 3.76940, val loss: 3.76609, in 0.564s
[9/100000] 1 tree, 31 leaves, max depth = 7, train loss: 3.72876, val loss: 3.72548, in 0.590s
[10/100000] 1 tree, 31 l

Unnamed: 0,y_true,y_pred,error
0,6.65,6.120257,0.529743
1,-4.17,-3.416258,0.753742
2,-6.46,-0.624250,5.835750
3,0.87,2.503628,1.633628
4,6.75,5.880632,0.869368
...,...,...,...
827267,5.92,3.835198,2.084802
827268,-2.82,-1.521448,1.298552
827269,-8.30,-5.241829,3.058171
827270,-1.55,-2.634370,1.084370


# Recommendations to the users

### Unseen jokes per user

In [None]:
def unseen_jokes_predict_ratings(user_id, estimator, user_joke_idxs, topics, n_recommendations=10):
    full_combinations = user_joke_idxs.loc[:, ['user_id']].drop_duplicates() \
                                    .merge(pd.DataFrame({'joke_id': np.arange(1, 101)}), how='cross')
    
    unseen_jokes_idxs = full_combinations.merge(user_joke_idxs, how='outer', indicator=True)
    unseen_jokes_idxs = unseen_jokes_idxs[~(unseen_jokes_idxs._merge == 'both')].drop('_merge', axis=1)
    unseen_jokes_user_id = unseen_jokes_idxs[unseen_jokes_idxs.user_id == user_id]

    user_id_laugh_dist_per_topic = \
        np.unique(X[user_joke_idxs[user_joke_idxs.user_id == user_id].index][:, X.shape[1] // 2:], axis=0) 
    user_id_laugh_dist_per_topic = pd.DataFrame(user_id_laugh_dist_per_topic).reset_index()
    user_id_laugh_dist_per_topic.rename(columns={'index': 'user_id'}, inplace=True)

    unseen_jokes_user_id = unseen_jokes_user_id.merge(topics, how='left', on='joke_id')
    unseen_jokes_user_id.rename(columns=dict(zip(list(range(len(topics.columns[1:]))),
                                            [f'jt_{c}' for c in topics.columns[1:]])),
                                            inplace=True)
    
    unseen_jokes_user_id = unseen_jokes_user_id[unseen_jokes_user_id.user_id == user_id] \
                            .merge(user_id_laugh_dist_per_topic, how='left', on='user_id')
    unseen_jokes_user_id.rename(columns=dict(zip(list(range(len(topics.columns[1:]))),
                                            [f'ujr_{c}' for c in topics.columns[1:]])),
                                            inplace=True)

    unseen_jokes_rating_pred = \
        pd.concat([unseen_jokes_user_id.loc[:, ['user_id', 'joke_id']], 
                   pd.DataFrame({'rating_pred': estimator.predict(unseen_jokes_user_id.iloc[:, 2:].values)})],
                  axis=1)
    unseen_jokes_rating_pred.sort_values(by='rating_pred', ascending=False, inplace=True)

    return unseen_jokes_rating_pred.iloc[:10, :]

In [None]:
users_list = list(range(5))
unseen_jokes_predictions = pd.concat([unseen_jokes_predict_ratings(user_id=user_id, 
                                                 estimator=hgb,
                                                 user_joke_idxs=user_joke_idxs, 
                                                 topics=topics)\
                                      for user_id in tqdm.tqdm(users_list)],
                                     axis=0)

display(unseen_jokes_predictions)

100%|██████████| 5/5 [00:33<00:00,  6.71s/it]


Unnamed: 0,user_id,joke_id,rating_pred
64,0,91,3.495888
29,0,49,2.978571
62,0,89,2.763768
7,0,11,2.378543
27,0,46,2.204704
61,0,88,2.144989
59,0,86,1.920819
10,0,21,1.610546
32,0,54,1.446182
0,0,1,1.420417
