In [83]:
from dask.distributed import Client

client = Client("tcp://127.0.0.1:43217")
client

0,1
Client  Scheduler: tcp://127.0.0.1:43217  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 8.24 GB


In [84]:
import pandas as pd
import joblib

#Dask
import dask.dataframe as dd
from dask_ml.cluster import SpectralClustering
from dask_ml.cluster import KMeans
from dask_ml.model_selection import train_test_split, GridSearchCV, IncrementalSearchCV

#Sklearn
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn import svm, linear_model, tree
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor

#Local Files
import src.features_engineering as fte
import src.clustering as cl
import src.supervised_learning as sl

In [85]:
ratings = dd.read_csv('./input/ratings_small.csv')
genres_dummies = pd.read_csv('./input/genres_dummies.csv')

### Features Engineering Pipeline

In [86]:
%%time
ratings = (ratings.pipe(fte.addUserFeatures)
           .pipe(fte.addMoviesFeatures)
           .pipe(fte.filterbyRatingsAmount, min_rt=100, max_rt=500)
           .pipe(fte.addWeekdayColumns)
           .pipe(fte.addGenresDummies, genres_dummies=genres_dummies)
           #.pipe(fte.popularityNormalizer)
           #.pipe(fte.ratingsNormalizer)
          )

CPU times: user 234 ms, sys: 3.52 ms, total: 237 ms
Wall time: 791 ms


We're assuming that the indexes of each dataframes are 
 aligned. This assumption is not generally safe.
  "Concatenating dataframes with unknown divisions.\n"


In [87]:
ratings.head()

Unnamed: 0,userId,movieId,GT,timestamp,user_rt_count,user_rt_mean,movie_rt_mean,popularity,weekday,weekday_6,...,Romance,Science Fiction,Sentai Filmworks,TV Movie,Telescene Film Group Productions,The Cartel,Thriller,Vision View Entertainment,War,Western
0,4,1371,4.0,949810302,204,4.348039,3.053191,47,6,1,...,0,0,0,0,0,0,0,0,0,0
1,19,1371,4.0,855193404,423,3.534279,3.053191,47,3,0,...,0,0,0,0,0,0,0,0,0,0
2,21,1371,3.0,853852263,162,3.506173,3.053191,47,1,0,...,0,0,0,0,0,0,0,0,0,0
3,22,1371,2.0,1131662302,220,3.275,3.053191,47,3,0,...,0,0,0,0,0,0,0,0,0,0
4,41,1371,3.5,1093886662,199,3.866834,3.053191,47,0,0,...,0,0,0,0,0,0,0,0,0,0


# __Clustering__

In [88]:
users_genres = cl.dataScaling(cl.userGenresMatrix(ratings, genres_dummies))
users_genres_da = users_genres.to_dask_array(lengths=True)
users_genres.head()

Unnamed: 0_level_0,Action,Adventure,Animation,Aniplex,BROSTA TV,Carousel Productions,Comedy,Crime,Documentary,Drama,...,Romance,Science Fiction,Sentai Filmworks,TV Movie,Telescene Film Group Productions,The Cartel,Thriller,Vision View Entertainment,War,Western
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.428571,0.261905,0.375,0.0,0.0,0.0,0.403226,0.326531,0.090909,0.421053,...,0.5,0.545455,0.0,0.0,0.0,0.0,0.460317,0.0,0.333333,0.0
8,0.102041,0.047619,0.125,0.0,0.0,0.0,0.145161,0.040816,0.272727,0.097744,...,0.145833,0.090909,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.142857
17,0.469388,0.5,0.5,0.0,0.0,0.0,0.467742,0.530612,0.090909,0.518797,...,0.375,0.575758,0.0,0.75,0.0,0.0,0.587302,0.0,0.444444,0.857143
19,0.959184,0.857143,0.875,0.0,0.0,0.0,0.935484,1.0,0.545455,1.0,...,0.9375,0.848485,0.0,0.0,0.0,0.0,1.0,0.0,0.666667,0.428571
21,0.22449,0.190476,0.375,0.0,0.0,0.0,0.209677,0.326531,0.181818,0.270677,...,0.333333,0.242424,0.0,0.0,0.0,0.0,0.253968,0.0,0.111111,0.142857


In [7]:
%%time
#Spectral Clustering
clusters_number = 8
spcl = SpectralClustering(n_clusters=clusters_number, affinity='polynomial', n_jobs=-1)
clusters = spcl.fit_predict(users_genres_da)
clusters_index = cl.getClustersIndex(clusters, users_genres)
clusters_index.to_csv('./output/clusters-index/clusters-index-spcl-poly-8-*.csv')
clusters_index.compute()['cluster'].value_counts()

CPU times: user 16.7 s, sys: 1.33 s, total: 18 s
Wall time: 1min 14s


5    43
4    36
0    33
2    30
6    27
7    25
1    15
3    12
Name: cluster, dtype: int64

In [90]:
remote_df = client.scatter(ratings)
ratings = ratings.merge(clusters_index, left_on='userId', right_on='userId')

TypeError: Input must be a pandas DataFrame or Series

In [82]:
ratings.head()

Unnamed: 0,movieId,GT,timestamp,user_rt_count,user_rt_mean,movie_rt_mean,popularity,weekday,weekday_6,weekday_2,...,TV Movie,Telescene Film Group Productions,The Cartel,Thriller,Vision View Entertainment,War,Western,cluster_x,userId,cluster_y


In [18]:
#KMeans

In [80]:
for e in list(ratings['movieId'].unique().compute())
    

0       1371
1       2105
2       2193
3        153
4        185
        ... 
2102     563
2103    3563
2104    4368
2105    3784
2106     129
Name: movieId, Length: 2107, dtype: int64

# __Supervised Learning__

## __Full Dataset__

### X & y definition

In [6]:
X_columns = ['user_rt_mean', 'movie_rt_mean', 'popularity', 'weekday_6', 'weekday_2',
       'weekday_3', 'weekday_0', 'weekday_1', 'weekday_5', 'weekday_4',
       'Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary',
       'Drama', 'Family', 'Fantasy', 'Foreign', 'History', 'Horror', 'Music',
       'Mystery', 'Romance', 'Science Fiction', 'TV Movie', 'Thriller', 'War',
       'Western']
y_columns = ['GT']

In [7]:
X = ratings[X_columns].to_dask_array(lengths=True)
y = ratings[y_columns].to_dask_array(lengths=True)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y.ravel(),test_size=0.2)

### Model Selection

In [9]:
models = {
        "RandomForest": RandomForestRegressor(n_estimators=100, n_jobs=-1),
        "SGDRegressor": linear_model.SGDRegressor(max_iter=1000, tol=1e-3),
        "DecisionTree" : tree.DecisionTreeRegressor(random_state=0),
        "GradientBoostingRegressor": GradientBoostingRegressor(n_estimators=100),
        "AdaBoostRegressor" : AdaBoostRegressor(n_estimators=100)    
    }
with joblib.parallel_backend('dask'):
    %time _ = sl.mlmodelSelection(models, X_train, X_test, y_train, y_test)    


Training model: RandomForest
RSME 0.8639032022541673
MAE 0.6574806666442997
r2_score 0.28557938072939637

Training model: SGDRegressor
RSME 58929447823.63246
MAE 42892314108.609604
r2_score -3.3242108009659577e+21

Training model: DecisionTree
RSME 1.1813089522941551
MAE 0.8565054015969938
r2_score -0.33582880222785993

Training model: GradientBoostingRegressor
RSME 0.8328574794958212
MAE 0.6349964765599868
r2_score 0.33600441239554923

Training model: AdaBoostRegressor
RSME 0.8946888489906396
MAE 0.7140677712619514
r2_score 0.23375463258285534
CPU times: user 5.46 s, sys: 599 ms, total: 6.06 s
Wall time: 28.1 s


In [23]:
#GradientBoostingRegressor
#Hyperparameters search
model = GradientBoostingRegressor()
params = {
    'loss' : ['ls', 'lad', 'huber', 'quantile']
    'n_estimators' : [100, 400, 700],
    'max_depth' : [3, 10, 15]
}
'n_estimators' : [100, 400, 700],
'max_depth' : [3, 10, 15]

with joblib.parallel_backend('dask'):
    search = GridSearchCV(model, params)
    search.fit(X_train, y_train)

In [24]:
pd.DataFrame(search.cv_results_)

Unnamed: 0,params,mean_fit_time,std_fit_time,mean_score_time,std_score_time,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,param_loss
0,{'loss': 'ls'},3.129208,0.150514,0.0181,0.002876,0.277191,0.2792,0.391601,0.315993,0.053464,1,ls
1,{'loss': 'lad'},3.552742,0.086878,0.018127,0.000311,0.230807,0.257951,0.380722,0.289821,0.065219,3,lad
2,{'loss': 'huber'},3.740558,0.133385,0.013904,0.00389,0.272147,0.275398,0.383456,0.310329,0.051721,2,huber
3,{'loss': 'quantile'},0.303404,0.044609,0.004473,0.000927,-1.691302,-1.693457,-1.941155,-1.775295,0.117274,4,quantile


In [10]:
#Final Training
with joblib.parallel_backend('dask'):
    gbr = GradientBoostingRegressor()
    gbr.fit(X_train, y_train)
    y_pred = gbr.predict(X_test)
    print("RSME", (mean_squared_error(y_test, y_pred)**0.5))
    print("MAE", mean_absolute_error(y_test, y_pred))
    print("r2_score", r2_score(y_test, y_pred))

RSME 0.8328577578809736
MAE 0.6350000961603689
r2_score 0.33600396851037495


In [34]:
#saving the model to a pickle
with open('./output/models/gbrdefaultpickle_file.joblib', 'wb') as gbr_file:  
    joblib.dump(gbr, gbr_file)

'/home/castares/ironhack/movie-recommender'