In [1]:
import os

from dask.distributed import Client

# Connect to an already-running Dask scheduler. The address can be overridden
# through the DASK_SCHEDULER_ADDRESS environment variable so the notebook does
# not need editing when the scheduler listens elsewhere; when the variable is
# unset or empty, the original local address is used.
scheduler_address = os.environ.get("DASK_SCHEDULER_ADDRESS") or "tcp://127.0.0.1:46667"
client = Client(scheduler_address)
client

0,1
Client  Scheduler: tcp://127.0.0.1:46667  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 8.24 GB


In [2]:
import pandas as pd
import joblib
import os

#Dask
import dask.dataframe as dd
from dask_ml.cluster import SpectralClustering
from dask_ml.cluster import KMeans
from dask_ml.model_selection import train_test_split, GridSearchCV, IncrementalSearchCV

#Sklearn
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn import svm, linear_model, tree
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor

#Local Files

import src.features_engineering as fte
import src.clustering as cl
import src.supervised_learning as sl
import src.mongodb_database as mdb

In [3]:
# Genre indicator table is read with pandas; the ratings file is read through
# dask.dataframe.
genres_dummies = pd.read_csv("input/genres_dummies.csv")
ratings = dd.read_csv("input/ratings_small.csv")

In [4]:
# Preview the first five rows of the ratings as loaded.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


### Feature Engineering Pipeline

In [5]:
%%time
ratings = (ratings.pipe(fte.addUserFeatures)
           .pipe(fte.addMoviesFeatures)
           .pipe(fte.filterbyRatingsAmount, min_rt=100, max_rt=1000)
           .pipe(fte.addWeekdayColumns)
           .pipe(fte.addGenresDummies, genres_dummies=genres_dummies)
          )

CPU times: user 195 ms, sys: 2.79 ms, total: 198 ms
Wall time: 1.54 s


We're assuming that the indexes of each dataframes are 
 aligned. This assumption is not generally safe.
  "Concatenating dataframes with unknown divisions.\n"


In [6]:
# Preview the first five rows after the feature-engineering steps.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,user_rt_count,user_rt_mean,movie_rt_mean,popularity,weekday,weekday_6,...,Romance,Science Fiction,Sentai Filmworks,TV Movie,Telescene Film Group Productions,The Cartel,Thriller,Vision View Entertainment,War,Western
0,4,1371,4.0,949810302,204,4.348039,3.053191,47,6,1,...,0,0,0,0,0,0,0,0,0,0
1,19,1371,4.0,855193404,423,3.534279,3.053191,47,3,0,...,0,0,0,0,0,0,0,0,0,0
2,21,1371,3.0,853852263,162,3.506173,3.053191,47,1,0,...,0,0,0,0,0,0,0,0,0,0
3,22,1371,2.0,1131662302,220,3.275,3.053191,47,3,0,...,0,0,0,0,0,0,0,0,0,0
4,41,1371,3.5,1093886662,199,3.866834,3.053191,47,0,0,...,0,0,0,0,0,0,0,0,0,0


# __Clustering__

In [7]:
# Build the user x genre table with the src.clustering helpers, then scale it.
user_genre_matrix = cl.userGenresMatrix(ratings, genres_dummies)
users_genres = cl.dataScaling(user_genre_matrix)
# Array form of the same table for the clustering estimator; lengths=True
# computes the partition lengths so the array has known chunk sizes.
users_genres_da = users_genres.to_dask_array(lengths=True)
users_genres.head(n=5)

Unnamed: 0_level_0,Action,Adventure,Animation,Aniplex,BROSTA TV,Carousel Productions,Comedy,Crime,Documentary,Drama,...,Romance,Science Fiction,Sentai Filmworks,TV Movie,Telescene Film Group Productions,The Cartel,Thriller,Vision View Entertainment,War,Western
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.259259,0.183333,0.230769,0.0,0.0,0.0,0.211864,0.225352,0.076923,0.222222,...,0.266667,0.315789,0.0,0.0,0.0,0.0,0.254386,0.0,0.176471,0.0
8,0.061728,0.033333,0.076923,0.0,0.0,0.0,0.076271,0.028169,0.230769,0.051587,...,0.077778,0.052632,0.0,0.0,0.0,0.0,0.026316,0.0,0.0,0.090909
17,0.283951,0.35,0.307692,0.0,0.0,0.0,0.245763,0.366197,0.076923,0.27381,...,0.2,0.333333,0.0,0.5,0.0,0.0,0.324561,0.0,0.235294,0.545455
19,0.580247,0.6,0.538462,0.0,0.0,0.0,0.491525,0.690141,0.461538,0.527778,...,0.5,0.491228,0.0,0.0,0.0,0.0,0.552632,0.0,0.352941,0.272727
21,0.135802,0.133333,0.230769,0.0,0.0,0.0,0.110169,0.225352,0.153846,0.142857,...,0.177778,0.140351,0.0,0.0,0.0,0.0,0.140351,0.0,0.058824,0.090909


In [8]:
%%time
#Spectral Clustering
clusters_number = 4
spcl = SpectralClustering(n_clusters=clusters_number, affinity='polynomial', n_jobs=-1)
with joblib.parallel_backend('dask'):
    clusters = spcl.fit_predict(users_genres_da)
clusters_index = cl.getClustersIndex(clusters, users_genres)
#clusters_index.to_csv('./output/clusters-index/clusters-index-spcl-poly-4-*.csv')
clusters_index.compute()['cluster'].value_counts()

CPU times: user 10.7 s, sys: 528 ms, total: 11.2 s
Wall time: 1min 3s


3    93
0    86
1    45
2    24
Name: cluster, dtype: int64

In [9]:
# Attach each user's cluster label to their ratings, joining on userId.
ratings = ratings.merge(clusters_index, on='userId')

# Upload Users and Movies to MongoDB Database

In [10]:
# The MongoDB upload is currently disabled: every statement in this cell is
# commented out. Uncomment the statements below to run the upload.
#ratings_df = ratings.compute()
# Uploading users to MongoDB Cluster
#mdb.addUsersbulk(ratings_df)
# Uploading movies to MongoDB cluster.
# mdb.addMoviesBulk(ratings_df,users_genres.compute())

# __Supervised Learning__

## __Full Dataset__

### X & y definition

In [11]:
# Feature columns are assembled from three groups, in the order the model sees them:
# rating statistics, one-hot weekday flags, and genre indicator columns.
rating_stat_columns = ['user_rt_mean', 'movie_rt_mean', 'popularity']
weekday_columns = ['weekday_{}'.format(day) for day in range(7)]
genre_columns = [
    'Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Family', 'Fantasy', 'Foreign',
    'History', 'Horror', 'Music', 'Mystery', 'Romance',
    'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western',
]
X_columns = rating_stat_columns + weekday_columns + genre_columns

# Regression target.
y_columns = ['rating']

In [12]:
# Convert the selected feature and target columns to Dask arrays; lengths=True
# computes the partition lengths up front so the arrays have known chunk sizes.
X, y = (
    ratings[columns].to_dask_array(lengths=True)
    for columns in (X_columns, y_columns)
)

In [14]:
# Hold out 10% of the rows for testing. y comes from a single-column selection,
# so it is flattened to 1-D before splitting. The fixed seed keeps the split
# identical between runs, which makes the reported metrics comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y.ravel(), test_size=0.1, random_state=0
)

### Model Selection

In [15]:
#Model Search
models = {
        "RandomForest": RandomForestRegressor(n_estimators=100, n_jobs=-1),
        "SGDRegressor": linear_model.SGDRegressor(max_iter=1000, tol=1e-3),
        "DecisionTree" : tree.DecisionTreeRegressor(random_state=0),
        "GradientBoostingRegressor": GradientBoostingRegressor(n_estimators=100),
        "AdaBoostRegressor" : AdaBoostRegressor(n_estimators=100)    
    }
with joblib.parallel_backend('dask'):
    %time _ = sl.mlmodelSelection(models, X_train, X_test, y_train, y_test)  


Training model: RandomForest
RSME 0.853641420106993
MAE 0.6526187525914585
r2_score 0.32498446972824613

Training model: SGDRegressor
RSME 61582124771.68848
MAE 41659505778.62846
r2_score -3.512951421752185e+21

Training model: DecisionTree
RSME 1.180786403408996
MAE 0.8688881446751507
r2_score -0.29153295732646134

Training model: GradientBoostingRegressor
RSME 0.8100161958483189
MAE 0.6168940692478293
r2_score 0.3922146739695943

Training model: AdaBoostRegressor
RSME 0.8613554264257332
MAE 0.6824147638496613
r2_score 0.31272967272423224
CPU times: user 36.2 s, sys: 2.66 s, total: 38.9 s
Wall time: 3min 33s


In [16]:
#GradientBoostingRegressor
#Hyperparameters search
# The base estimator is seeded so that score differences between grid points
# come from the hyper-parameters rather than from run-to-run randomness.
model = GradientBoostingRegressor(random_state=0)
# 4 losses x 3 ensemble sizes x 3 depths = 36 candidate settings.
# NOTE(review): newer scikit-learn releases renamed 'ls' and 'lad' to
# 'squared_error' and 'absolute_error' - confirm against the installed version.
params = {
    'loss' : ['ls', 'lad', 'huber', 'quantile'],
    'n_estimators' : [100, 400, 700],
    'max_depth' : [3, 10, 15]
}

with joblib.parallel_backend('dask'):
    search = GridSearchCV(model, params)
    search.fit(X_train, y_train)

In [24]:
# Tabulate the grid-search results and show the five best-ranked settings.
ranking_columns = ['rank_test_score', 'param_loss', 'param_max_depth', 'param_n_estimators']
cv_results = pd.DataFrame(search.cv_results_)
cv_results[ranking_columns].sort_values(by='rank_test_score').head()

Unnamed: 0,rank_test_score,param_loss,param_max_depth,param_n_estimators
18,1,huber,3,100
0,2,ls,3,100
9,3,lad,3,100
10,4,lad,3,400
19,5,huber,3,400


In [25]:
#Final Training
# NOTE(review): this fits the default hyper-parameters rather than
# search.best_params_; the file name used when the model is saved contains
# "default", so this looks intentional - confirm.
with joblib.parallel_backend('dask'):
    # Seeded so the fitted (and later saved) model is reproducible between runs.
    gbr = GradientBoostingRegressor(random_state=0)
    gbr.fit(X_train, y_train)
    y_pred = gbr.predict(X_test)
    # Metric labels are kept as printed by the model-selection step so the
    # two outputs can be compared line by line.
    print("RSME", (mean_squared_error(y_test, y_pred)**0.5))
    print("MAE", mean_absolute_error(y_test, y_pred))
    print("r2_score", r2_score(y_test, y_pred))

RSME 0.8100183277061246
MAE 0.6169001603913072
r2_score 0.3922114747407315


In [26]:
#saving the model to a pickle
model_path = './output/models/gbrdefaultpickle_file.joblib'
# Create the target directory first; open() raises FileNotFoundError when a
# parent directory is missing (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, 'wb') as gbr_file:
    joblib.dump(gbr, gbr_file)