In [1]:
import src.features_engineering as fte
import src.clustering as cl
import pandas as pd
import dask.dataframe as dd
from dask_ml.cluster import SpectralClustering

In [2]:
# Load the two inputs. The genres dummies table is read eagerly with pandas;
# the ratings file is read through dask, so it stays lazy until computed.
genres_dummies = pd.read_csv('./input/genres_dummies.csv')
ratings = dd.read_csv('./input/ratings_small.csv')

### Feature Extraction Pipeline

In [3]:
%%time
ratings = (ratings.pipe(fte.addUserFeatures)
           .pipe(fte.addMoviesFeatures)
           .pipe(fte.filterbyRatingsAmount, min_rt=100, max_rt=500)
           .pipe(fte.addWeekdayColumns)
           .pipe(fte.addGenresDummies, genres_dummies=genres_dummies)
           .pipe(fte.popularityNormalizer)
           .pipe(fte.ratingsNormalizer)
          )
ratings.head()

We're assuming that the indexes of each dataframes are 
 aligned. This assumption is not generally safe.
  "Concatenating dataframes with unknown divisions.\n"


CPU times: user 2.03 s, sys: 226 ms, total: 2.25 s
Wall time: 2.11 s


Unnamed: 0,userId,movieId,GT,timestamp,user_rt_count,user_rt_mean,movie_rt_mean,popularity,weekday,weekday_6,...,Romance,Science Fiction,Sentai Filmworks,TV Movie,Telescene Film Group Productions,The Cartel,Thriller,Vision View Entertainment,War,Western
0,4,1371,4.0,949810302,204,0.927937,0.567376,0.142415,6,1,...,0,0,0,0,0,0,0,0,0,0
1,19,1371,4.0,855193404,423,0.608286,0.567376,0.142415,3,0,...,0,0,0,0,0,0,0,0,0,0
2,21,1371,3.0,853852263,162,0.597246,0.567376,0.142415,1,0,...,0,0,0,0,0,0,0,0,0,0
3,22,1371,2.0,1131662302,220,0.50644,0.567376,0.142415,3,0,...,0,0,0,0,0,0,0,0,0,0
4,41,1371,3.5,1093886662,199,0.738916,0.567376,0.142415,0,0,...,0,0,0,0,0,0,0,0,0,0


### X & y definition

In [4]:
%%time
X,y = fte.defineXy(ratings.compute())

CPU times: user 550 ms, sys: 45.4 ms, total: 595 ms
Wall time: 560 ms


# Clustering

In [5]:
# Build the per-user genre matrix from the ratings and the genres dummies,
# then pass it through the scaling helper.
genres_by_user = cl.userGenresMatrix(ratings, genres_dummies)
users_genres = cl.dataScaling(genres_by_user)
# lengths=True asks dask to compute the partition lengths up front, so the
# resulting array has known chunk sizes.
users_genres_da = users_genres.to_dask_array(lengths=True)
users_genres.head()

Unnamed: 0_level_0,Action,Adventure,Animation,Aniplex,BROSTA TV,Carousel Productions,Comedy,Crime,Documentary,Drama,...,Romance,Science Fiction,Sentai Filmworks,TV Movie,Telescene Film Group Productions,The Cartel,Thriller,Vision View Entertainment,War,Western
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.428571,0.261905,0.375,0.0,0.0,0.0,0.403226,0.326531,0.090909,0.421053,...,0.5,0.545455,0.0,0.0,0.0,0.0,0.460317,0.0,0.333333,0.0
8,0.102041,0.047619,0.125,0.0,0.0,0.0,0.145161,0.040816,0.272727,0.097744,...,0.145833,0.090909,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.142857
17,0.469388,0.5,0.5,0.0,0.0,0.0,0.467742,0.530612,0.090909,0.518797,...,0.375,0.575758,0.0,0.75,0.0,0.0,0.587302,0.0,0.444444,0.857143
19,0.959184,0.857143,0.875,0.0,0.0,0.0,0.935484,1.0,0.545455,1.0,...,0.9375,0.848485,0.0,0.0,0.0,0.0,1.0,0.0,0.666667,0.428571
21,0.22449,0.190476,0.375,0.0,0.0,0.0,0.209677,0.326531,0.181818,0.270677,...,0.333333,0.242424,0.0,0.0,0.0,0.0,0.253968,0.0,0.111111,0.142857


In [19]:
# Spectral clustering of the users on their scaled genre matrix.
SPCL_N_CLUSTERS = 8      # single source for the model and the output file name
SPCL_RANDOM_STATE = 42   # fixed seed so repeated runs give repeatable cluster assignments
spcl = SpectralClustering(
    n_clusters=SPCL_N_CLUSTERS,
    affinity='polynomial',
    random_state=SPCL_RANDOM_STATE,
    n_jobs=-1,
)
clusters = spcl.fit_predict(users_genres_da)
clusters_index = cl.getClustersIndex(clusters, users_genres)
# Persist the assignments. NOTE(review): `clusters_index` is presumably a dask
# DataFrame (it exposes `.compute()`), in which case the `*` in the path is
# expanded to one file per partition — confirm against `cl.getClustersIndex`.
clusters_index.to_csv(
    './output/clusters-index/clusters-index-spcl-poly-{}-*.csv'.format(SPCL_N_CLUSTERS)
)
# Number of users assigned to each cluster.
clusters_index.compute()['clusters'].value_counts()

1    41
5    38
6    26
4    26
0    25
3    23
7    21
2    21
Name: clusters, dtype: int64

In [7]:
#KMeans