In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
import lightgbm as lgb

In [3]:
# Load the ratings table; the first CSV column is used as the row index.
ratings = pd.read_csv(
    "all_datasets/content_based/qualified_ratings.csv",
    index_col=0,
)
# NOTE: despite its name, `n_users` holds the array of distinct user ids
# (the result of np.unique), not a count of users.
n_users = np.unique(ratings["userId"].to_numpy())

In [4]:
# Bucket users by how many ratings they have, then hold out 20% of every bucket.
user_rating_counts = ratings['userId'].value_counts()


def _users_in_bucket(lower=None, upper=None):
    """Return ids of users whose rating count lies in (lower, upper].

    A bound of None leaves that side of the interval open.
    """
    if lower is None:
        in_bucket = user_rating_counts <= upper
    elif upper is None:
        in_bucket = user_rating_counts > lower
    else:
        in_bucket = (user_rating_counts > lower) & (user_rating_counts <= upper)
    return user_rating_counts[in_bucket].index.to_numpy()


def _holdout_split(user_ids):
    """Split user ids 80/20 into (train, test) with the fixed seed 24."""
    return train_test_split(user_ids, test_size=0.2, random_state=24)


# NOTE(review): the first bucket has no lower bound in code; the "50" in its
# name presumably reflects filtering done upstream - confirm.
users_50_100 = _users_in_bucket(upper=100)
users_101_200 = _users_in_bucket(lower=100, upper=200)
users_201_500 = _users_in_bucket(lower=200, upper=500)
users_501_1000 = _users_in_bucket(lower=500, upper=1000)
users_1001_12000 = _users_in_bucket(lower=1000)

# train test split for each group
users_50_100_train, users_50_100_test = _holdout_split(users_50_100)
users_101_200_train, users_101_200_test = _holdout_split(users_101_200)
users_201_500_train, users_201_500_test = _holdout_split(users_201_500)
users_501_1000_train, users_501_1000_test = _holdout_split(users_501_1000)
users_1001_12000_train, users_1001_12000_test = _holdout_split(users_1001_12000)

In [2]:
# Load the processed movie metadata with the first CSV column as the index,
# then move that index back into an ordinary column (fresh 0..n-1 row index).
data = pd.read_csv(
    'all_datasets/content_based/processed_movies_info.csv',
    index_col=0,
).reset_index()

In [6]:
# Feature matrix: every column of `data` except the ones listed here.
non_feature_columns = ['movieId', 'title', 'imdbId', '(no genres listed)']
items = data.drop(columns=non_feature_columns).to_numpy()

In [7]:
# Standardize all feature columns; the fitted scaler is kept for later reuse.
scaler = StandardScaler()
scaler.fit(items)
items_scaled = scaler.transform(items)

In [8]:
# Reduce dimensionality using TruncatedSVD after tuning n_components.
# random_state is pinned (same seed as the train/test splits in this notebook)
# so that `items_reduced` is identical after Restart & Run All; without it the
# default randomized solver yields slightly different components on every run.
items_sparse = csr_matrix(items_scaled)
items_reduced = TruncatedSVD(n_components=8, random_state=24).fit_transform(items_sparse)


In [9]:
# Re-attach movieId to both feature representations so rows can be looked up
# per movie. The scaled (non-reduced) frame is the one meant for Ridge, since
# Ridge has a built-in svd solver.
movie_ids = data['movieId']

data_scaled = pd.DataFrame(items_scaled)
data_scaled['movieId'] = movie_ids

data_reduced = pd.DataFrame(items_reduced)
data_reduced['movieId'] = movie_ids

In [16]:
# Get X, y for each user
def get_items_rated_by_user(data, filt_ratings, user_id):
    """Return the feature rows and matching ratings for one user's movies.

    Parameters
    ----------
    data : DataFrame with a 'movieId' column plus feature columns.
    filt_ratings : DataFrame with 'userId', 'movieId' and 'rating' columns.
    user_id : value compared against filt_ratings['userId'].

    Returns
    -------
    (feature_vector, scores)
        feature_vector: rows of `data` (in `data` order, 'movieId' dropped)
        whose movie was rated by the user.
        scores: 1-D array where scores[i] is the user's rating for the movie
        in row i of feature_vector.
    """
    is_user = filt_ratings['userId'] == user_id
    user_ratings = filt_ratings.loc[is_user, ['movieId', 'rating']]

    rated_items = data[data['movieId'].isin(user_ratings['movieId'])]

    # Look ratings up by movieId instead of relying on row order: `data` and
    # `filt_ratings` are sorted independently, so positional pairing would
    # attach ratings to the wrong movies. If a movie was rated more than once
    # the last row wins, and ratings for movies absent from `data` are dropped,
    # which keeps X and y the same length.
    rating_by_movie = (
        user_ratings
        .drop_duplicates(subset='movieId', keep='last')
        .set_index('movieId')['rating']
    )
    scores = rating_by_movie.reindex(rated_items['movieId'].to_numpy()).to_numpy()

    feature_vector = rated_items.drop(columns='movieId')
    return feature_vector, scores

In [17]:
def predict_known_ratings_for_user(data, userId, model):
    """Fit `model` on the user's rated movies and predict those same movies.

    Reads the notebook-level `ratings` frame. Predictions are clipped to the
    interval [0.5, 5] before being returned.
    """
    features, targets = get_items_rated_by_user(data, ratings, userId)
    model.fit(features, targets)
    in_sample_predictions = model.predict(features)
    return np.clip(in_sample_predictions, 0.5, 5)
def get_not_seen_movies_from_user(data, userId):
    """Return the feature rows of `data` for movies the user has not rated.

    Reads the notebook-level `ratings` frame; the 'movieId' column is dropped
    from the result.
    """
    rated_movie_ids = ratings.loc[ratings['userId'] == userId, 'movieId']
    already_rated = data['movieId'].isin(rated_movie_ids)
    unseen = data[~already_rated]
    return unseen.drop(columns='movieId')
def predict_unknown_ratings_for_user(data, userId, model):
    """Fit `model` on the user's rated movies and predict the unrated ones.

    Reads the notebook-level `ratings` frame. Returns a 1-D array of
    predictions clipped to [0.5, 5], one per row returned by
    get_not_seen_movies_from_user(data, userId); the array is empty when the
    user has already rated every movie in `data`.
    """
    X, y = get_items_rated_by_user(data, ratings, userId)
    # `data` must be forwarded: the helper takes (data, userId), and calling it
    # with only the user id raised TypeError on every invocation.
    X_test = get_not_seen_movies_from_user(data, userId)
    model.fit(X, y)
    if len(X_test) == 0:
        # Nothing left to predict; avoid asking the model to predict 0 rows.
        return np.empty(0)
    return np.clip(model.predict(X_test), 0.5, 5)

In [20]:
# demo of hyperparameters for tuning
# Each grid maps an estimator parameter name to the candidate values to try.
ridge_param_grid = dict(alpha=list(range(40, 100, 10)))  # 40, 50, ..., 90

knn_param_grid = dict(n_neighbors=[30])

rf_param_grid = dict(n_estimators=[300])

lgbm_param_grid = dict(
    num_leaves=[2, 5, 10],
    min_data_in_leaf=[10, 50, 100],
    boosting_type=['gbdt'],
)


In [None]:
def customized_grid_search(users, estimator, param_grid, param_grid_size, data):
    """Grid-search `estimator` per user and average the CV errors over users.

    For every user a 5-fold cross-validated grid search is run on that user's
    rated movies (read from the notebook-level `ratings` frame); the mean test
    RMSE and MAE of each parameter combination are then averaged over users.

    Parameters
    ----------
    users : sized iterable of user ids (must be non-empty).
    estimator : scikit-learn regressor to tune.
    param_grid : parameter grid passed to GridSearchCV.
    param_grid_size : number of parameter combinations in `param_grid`.
    data : DataFrame with 'movieId' plus feature columns.

    Returns
    -------
    DataFrame with one row per parameter combination, the parameter columns,
    and positive 'rmse' and 'mae' columns (lower is better).

    Raises
    ------
    ValueError
        If `users` is empty or `param_grid_size` does not match the number of
        combinations GridSearchCV actually evaluated.
    """
    if len(users) == 0:
        raise ValueError("users must contain at least one user id")

    final_rmse = np.zeros(param_grid_size)
    final_mae = np.zeros(param_grid_size)
    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        scoring=('neg_root_mean_squared_error', 'neg_mean_absolute_error'),
        refit=False,
        # Seeded so that repeated runs compare parameters on the same folds.
        cv=KFold(5, shuffle=True, random_state=24),
        n_jobs=-1,
    )

    for user in users:
        X, y = get_items_rated_by_user(data, ratings, user)
        grid_search.fit(X, y)
        results = grid_search.cv_results_
        neg_rmse = results['mean_test_neg_root_mean_squared_error']
        neg_mae = results['mean_test_neg_mean_absolute_error']
        if len(neg_rmse) != param_grid_size:
            raise ValueError(
                f"param_grid_size={param_grid_size} does not match the "
                f"{len(neg_rmse)} parameter combinations in param_grid"
            )
        # The 'neg_*' scorers return negated errors (higher is better);
        # subtracting them accumulates the real, positive RMSE / MAE.
        final_rmse -= neg_rmse
        final_mae -= neg_mae

    param_df = pd.DataFrame(grid_search.cv_results_['params'])
    param_df['rmse'] = final_rmse / len(users)
    param_df['mae'] = final_mae / len(users)
    return param_df

In [None]:
# demo of a parameter dataframe for tuning
# Tune Ridge on the heaviest raters using the scaled (non-reduced) features.
param_df = customized_grid_search(
    users=users_1001_12000_train,
    estimator=Ridge(),
    param_grid=ridge_param_grid,
    param_grid_size=6,  # number of alpha candidates in ridge_param_grid
    data=data_scaled,
)
param_df.to_csv('ridge_1001_12000_param.csv')

In [21]:
# get mean rmse and mae of a model on a dataset
def evaluate_model(model, users, data):
    """Return (mean RMSE, mean MAE) of `model` across the given users.

    For each user, that user's rated movies (looked up in the notebook-level
    `ratings` frame) are split 80/20 with seed 24; `model` is refit on the
    larger part and scored on the held-out part. The per-user errors are then
    averaged over `len(users)`.
    """
    rmse_per_user = []
    mae_per_user = []

    for user_id in users:
        features, targets = get_items_rated_by_user(data, ratings, user_id)
        fit_X, eval_X, fit_y, eval_y = train_test_split(
            features, targets, test_size=0.2, random_state=24
        )
        model.fit(fit_X, fit_y)
        predicted = model.predict(eval_X)
        rmse_per_user.append(root_mean_squared_error(eval_y, predicted))
        mae_per_user.append(mean_absolute_error(eval_y, predicted))

    user_count = len(users)
    return sum(rmse_per_user) / user_count, sum(mae_per_user) / user_count

In [23]:
# get mean rmse and mae of LightGBM on a dataset
def evaluate_lgbm(users, data):
    """Return (mean RMSE, mean MAE) of a per-user LightGBM regressor.

    For each user, that user's rated movies (looked up in the notebook-level
    `ratings` frame) are split 80/20 with seed 24; a booster is trained on the
    larger part with the fixed parameters below and scored on the held-out
    part. No number of boosting rounds is passed, so lgb.train's default
    applies. The per-user errors are averaged over `len(users)`.
    """
    base_params = {
        'objective': 'regression',
        'metric': 'l2',
        'boosting_type': 'gbdt',
        'learning_rate': 0.01,
        'num_leaves': 31,
        'max_depth': -1,
        'min_data_in_leaf': 5,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
    }
    rmse_per_user = []
    mae_per_user = []

    for user_id in users:
        features, targets = get_items_rated_by_user(data, ratings, user_id)
        fit_X, eval_X, fit_y, eval_y = train_test_split(
            features, targets, test_size=0.2, random_state=24
        )
        train_set = lgb.Dataset(fit_X, fit_y)
        # Hand each training run its own copy of the parameter dict.
        booster = lgb.train(dict(base_params), train_set)
        predicted = booster.predict(eval_X)
        rmse_per_user.append(root_mean_squared_error(eval_y, predicted))
        mae_per_user.append(mean_absolute_error(eval_y, predicted))

    user_count = len(users)
    return sum(rmse_per_user) / user_count, sum(mae_per_user) / user_count

In [31]:
# demo of evaluating model after finding optimal hyperparameter
# Both numbers come from evaluate_model's internal per-user 80/20 hold-out;
# "train" / "test" refer to which group of users is being evaluated.
rf_settings = dict(max_depth=20, n_jobs=-1)

train_rmse, train_mae = evaluate_model(
    model=RandomForestRegressor(**rf_settings),
    users=users_101_200_train,
    data=data_reduced,
)
print("done training")
test_rmse, test_mae = evaluate_model(
    model=RandomForestRegressor(**rf_settings),
    users=users_101_200_test,
    data=data_reduced,
)
print(f'Train MAE: {train_mae}')
print(f'Train RMSE: {train_rmse}')
print(f'Test MAE: {test_mae}')
print(f'Test RMSE: {test_rmse}')

done train
