In [1]:
import pandas as pd
from joblib import load
from numpy.random import RandomState
from numpy.random import choice
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR

In [2]:
# Ratings table read from the MovieLens CSV (one row per rating).
rating_df = pd.read_csv('all_datasets/movielens/ratings.csv')

# Previously saved user-item dataset object, restored with joblib.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load trusted files.
filename = 'all_datasets/collaborative/user_item_dataset_50_50.sav'
user_item_dataset = load(filename)

In [3]:
# Number of ratings per user, limited to the users of the loaded dataset
# and ordered from fewest to most ratings.
rating_histogram = (
    rating_df['userId']
    .value_counts()
    .loc[user_item_dataset.users]
    .sort_values()
)

In [4]:
def get_users_in_range(rating_histogram, low, high):
    """Keep the entries of ``rating_histogram`` whose value is in ``[low, high)``.

    Parameters
    ----------
    rating_histogram : pandas.Series
        Values to filter (in this notebook: rating counts indexed by user id).
    low : number
        Inclusive lower bound.
    high : number
        Exclusive upper bound.

    Returns
    -------
    pandas.Series
        The matching entries, in their original order and with their
        original index labels.
    """
    at_least_low = rating_histogram >= low
    below_high = rating_histogram < high
    return rating_histogram[at_least_low & below_high]

In [5]:
# Stacking regressor with KNN and SVR base learners, placed behind a
# TruncatedSVD step in a single pipeline.
stack = StackingRegressor(
    estimators=[('knn', KNeighborsRegressor()), ('svr', SVR())],
    n_jobs=-1,
)
svd = TruncatedSVD()
stack_pip = make_pipeline(svd, stack)

# Search space for the pipeline: the SVD size and the base learners are fixed,
# only the regularisation strength of the Ridge final estimator varies.
stack_param_grid = {
    'truncatedsvd__n_components': [10],
    'stackingregressor__final_estimator': [
        Ridge(alpha) for alpha in (1.4, 1.6, 1.8, 2, 2.2)
    ],
    'stackingregressor__estimators': [
        [
            ('knn', KNeighborsRegressor(n_neighbors=24, weights='distance', metric='cosine')),
            ('svr', SVR(C=2.2)),
        ],
    ],
}

In [6]:
# KNN regressor behind its own TruncatedSVD step.
knn = KNeighborsRegressor(n_jobs=-1)
svd = TruncatedSVD()
knn_pip = make_pipeline(svd, knn)

# Search space for the KNN pipeline: only the neighbour count varies
# (20, 22, 24, 26, 28); SVD size, weighting and metric are fixed.
knn_param_grid = {
    'truncatedsvd__n_components': [50],
    'kneighborsregressor__n_neighbors': list(range(20, 30, 2)),
    'kneighborsregressor__weights': ['distance'],
    'kneighborsregressor__metric': ['cosine'],
}

In [7]:
# Pick the users the hyper-parameter search is evaluated on: users with at
# least 100 and fewer than 200 ratings.
users_in_range = get_users_in_range(rating_histogram, 100, 200)

# Sample without replacement so the same user is never drawn twice, and cap
# the sample size at the pool size so a small pool does not raise ValueError.
# A dedicated, seeded RandomState makes the sample identical on every
# Restart & Run All without touching NumPy's global random state.
user_sample = RandomState(42).choice(
    users_in_range.index,
    size=min(10, len(users_in_range)),
    replace=False,
)
user_sample

array([323265,  34398, 244376,  36757, 215751, 119535, 138307, 275070,
       118523, 132574], dtype=int64)

In [8]:
# Run the dataset's custom grid search for the stacking pipeline on the sampled users.
# NOTE(review): param_grid_size is presumably the number of parameter combinations
# in stack_param_grid (5 Ridge alternatives x 1 x 1) -- confirm against
# customized_grid_search and keep it in sync when the grid changes.
param_df = user_item_dataset.customized_grid_search(
    users=user_sample,
    estimator=stack_pip,
    param_grid=stack_param_grid,
    param_grid_size=5,
)

Number of users evaluated:  1
Number of users evaluated:  2
Number of users evaluated:  3
Number of users evaluated:  4
Number of users evaluated:  5
Number of users evaluated:  6
Number of users evaluated:  7
Number of users evaluated:  8
Number of users evaluated:  9
Number of users evaluated:  10


In [9]:
# Show the grid-search results: one row per parameter combination with its
# rmse and mae columns.
# NOTE(review): the scores are negative -- presumably negated errors where
# values closer to zero are better; confirm in customized_grid_search.
param_df

Unnamed: 0,stackingregressor__estimators,stackingregressor__final_estimator,truncatedsvd__n_components,rmse,mae
0,"[(knn, KNeighborsRegressor(metric='cosine', n_...",Ridge(alpha=1.4),10,-0.97301,-0.784341
1,"[(knn, KNeighborsRegressor(metric='cosine', n_...",Ridge(alpha=1.6),10,-0.972561,-0.784451
2,"[(knn, KNeighborsRegressor(metric='cosine', n_...",Ridge(alpha=1.8),10,-0.974546,-0.786297
3,"[(knn, KNeighborsRegressor(metric='cosine', n_...",Ridge(alpha=2),10,-0.972426,-0.784362
4,"[(knn, KNeighborsRegressor(metric='cosine', n_...",Ridge(alpha=2.2),10,-0.970914,-0.783355


In [10]:
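# Uncomment to save the tuning results to disk.
# NOTE(review): the file name says '1000+', but the users sampled in this
# notebook come from the 100-200 ratings range -- confirm the target path
# before enabling this line.
#param_df.to_csv('result/stacking/1000+.csv', index=False)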