In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import seaborn as sns

from sklearn.externals import joblib
from scipy.sparse import csr_matrix,lil_matrix
from sklearn.preprocessing import normalize
from sklearn.decomposition import TruncatedSVD
%matplotlib inline

# Loading data

In [0]:
# load ratings
ratings = pd.read_csv("ml-latest-small/ratings.csv")

In [0]:
# load movies

movies = pd.read_csv("ml-latest-small/movies.csv")

In [0]:
ratings["userId"].value_counts().head(10)

547    2391
564    1868
624    1735
15     1700
73     1610
452    1340
468    1291
380    1063
311    1019
30     1011
Name: userId, dtype: int64

In [0]:
ratings["userId"].value_counts().tail(10)

249    20
540    20
604    20
668    20
657    20
221    20
444    20
484    20
35     20
485    20
Name: userId, dtype: int64

As we can see we have realy high value of minimla number of transactions

Reason : This is just peace of real dataset

In [0]:
# split train-test

ratings_tr = ratings[ratings["timestamp"]<=1388534400]
ratings_test = ratings[ratings["timestamp"]>1388534400]

In [0]:
# users presented both in train and test

len(set(ratings_tr["userId"]).intersection(ratings_test["userId"]))

19

# Evaluation  of models

In [0]:
def apk(actual, predicted, k=20):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]
        

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

In [0]:
# let's check this function using sample from slides

actual = [1,4,6]
predicted = [0,1,2,3,4,5,6,7]
apk(actual, predicted)

0.44285714285714284

In [0]:
# let's play with order in predicted list to show that order here is important

actual = [1,4,6,12,11,13]
predicted = [1,0,2,3,4,5,6,7]
apk(actual, predicted)

0.30476190476190473

# CF model

From cf matrix

In [0]:
def form_cf_matrix(df, user_col = "userId", item_col = "movieId", rating_col = "rating"):
    
    df = df[[user_col, item_col, rating_col]]
    
    # normalize ratings
    df[rating_col] = df[rating_col].apply(lambda s : np.sign(s-3.5))
    

    
    # define arrays for user_col and item_col, form dict structures
    item_id = df[item_col].unique()
    user_id = df[user_col].unique()
    user_dict = {user:i for i,user in enumerate(user_id)}
    item_dict = {item:i for i,item in enumerate(item_id)}
    
    # process dataframe to speed-up matrix calculation
    df["x_rows"] = df[user_col].apply(lambda s : user_dict[s])
    df["x_cols"] = df[item_col].apply(lambda s : item_dict[s])
    
    matrix_data = np.array(df[rating_col]), np.array(df["x_rows"]),\
                  np.array(df["x_cols"]), user_id, item_id, user_dict, item_dict
        
    return matrix_data

Train svd

In [0]:
# define and train svd object

clf_svd = TruncatedSVD(n_components=20)
transactions, rows, cols, user_id, item_id, user_dict, item_dict = form_cf_matrix(ratings_tr)

X_user_item = csr_matrix((transactions,\
                        (rows, cols)),\
                        shape = (len(user_id), len(item_id)))
    
item_proj = clf_svd.fit_transform(X_user_item)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Some simple evalution samples

In [0]:
# check history for one user 
some_user_id = 343
one_user_df = ratings[ratings["userId"] == some_user_id]
one_user_df = one_user_df.sort_values(by="rating",ascending=False)

In [0]:
# form recommendation list for this user

dense_user_repr = clf_svd.transform(X_user_item)
comp_matr = clf_svd.components_
tr_res = np.matmul(dense_user_repr, comp_matr)

In [0]:
# prepare id <-->title mapping

title_map = {}
ids = movies["movieId"].unique()
for mov in ids:
    title_map[mov] = movies[movies["movieId"] == mov]["title"].values[0]

In [0]:
# form sorted list
arg_list = np.argsort(tr_res[user_dict[some_user_id]])
arg_list = arg_list[::-1]

In [0]:
# from top-20 predictions

predicted_list = []
for v in arg_list[:20]:
    predicted_list.append([item_id[v],tr_res[user_dict[some_user_id],v]])

In [0]:
# wrap up list in dataframe

predicted_df = pd.DataFrame.from_records(data = predicted_list,columns=["movieId", "rating"])
predicted_df["title"] = predicted_df["movieId"].apply(lambda s : title_map[s])

In [0]:
# get top movies for user

one_user_df["title"] = one_user_df["movieId"].apply(lambda s : title_map[s])

In [0]:
one_user_df.head(20)

Unnamed: 0,userId,movieId,rating,timestamp,title
46667,343,1,5.0,881166397,Toy Story (1995)
46668,343,6,5.0,881166915,Heat (1995)
46686,343,778,5.0,881166832,Trainspotting (1996)
46685,343,745,5.0,881166495,Wallace & Gromit: A Close Shave (1995)
46684,343,733,5.0,881166350,"Rock, The (1996)"
46692,343,858,5.0,881166452,"Godfather, The (1972)"
46694,343,994,5.0,881166495,Big Night (1996)
46696,343,1042,5.0,881167197,That Thing You Do! (1996)
46697,343,1060,5.0,881166452,Swingers (1996)
46698,343,1073,5.0,881167085,Willy Wonka & the Chocolate Factory (1971)


In [0]:
predicted_df.head(20)

Unnamed: 0,movieId,rating,title
0,1,0.179133,Toy Story (1995)
1,356,0.176078,Forrest Gump (1994)
2,296,0.151097,Pulp Fiction (1994)
3,364,0.150785,"Lion King, The (1994)"
4,318,0.150481,"Shawshank Redemption, The (1994)"
5,2571,0.14589,"Matrix, The (1999)"
6,457,0.139432,"Fugitive, The (1993)"
7,260,0.132865,Star Wars: Episode IV - A New Hope (1977)
8,593,0.130066,"Silence of the Lambs, The (1991)"
9,1210,0.125522,Star Wars: Episode VI - Return of the Jedi (1983)


# CF via keras

In [0]:
import keras
from keras.preprocessing import text
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras.layers import Conv1D
from keras.layers import GlobalAveragePooling1D,SpatialDropout1D, GlobalMaxPooling1D
from keras.models import Sequential, Model
from keras.layers import Reshape
from keras.layers import Dense, Dropout, Input, Lambda, Activation
from keras.optimizers import RMSprop, Adam
from keras.layers.normalization import BatchNormalization
from keras.layers.wrappers import Bidirectional
from keras import backend as K
from keras.layers import Input,GRU
from keras.layers.merge import add, concatenate,dot
from keras.layers import GaussianDropout
from keras.layers.core import *

In [0]:
def create_base_network(input_dim, output_dim=10):
    """ 
    Base network to be shared (eq. to feature extraction).
    """
    model = Sequential()
    model.add(Embedding(input_dim, output_dim))
    model.add(Dropout(0.1))
    model.add(GlobalMaxPooling1D())
    return model

In [0]:
# build simple deep learning model

embed_size = 20

# Create the 3 inputs
item_in = Input(shape=(1,))
user_in = Input(shape=(1,))


item_network = create_base_network(len(item_id), output_dim=embed_size)
item_out = item_network(item_in)

user_network = create_base_network(len(user_id), output_dim=embed_size)
user_out = user_network(user_in)

merged_vector = dot([item_out, user_out], axes=[-1,-1])

In [0]:
# Define the trainable model
model_cf = Model(inputs=[item_in, user_in], outputs=merged_vector)
model_cf.compile(optimizer=Adam(),
                  loss='mean_squared_error')

In [0]:
model_cf.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_23 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_24 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
sequential_70 (Sequential)      (None, 20)           157000      input_23[0][0]                   
__________________________________________________________________________________________________
sequential_71 (Sequential)      (None, 20)           11440       input_24[0][0]                   
__________________________________________________________________________________________________
dot_24 (Do

In [0]:
# split data
idxs = [i for i in range(len(transactions))]

train_idx, val_idx = train_test_split(idxs,test_size=0.2)

In [0]:
from keras.callbacks import ModelCheckpoint, LearningRateScheduler,EarlyStopping
lr_scheduler_callback = LearningRateScheduler(lambda epoch:
                                                      0.003 if epoch < 5
                                                      else 0.001)

In [0]:
filepath = './model_cpt/cf_keras.h5'
my_callback = keras.callbacks.ModelCheckpoint(filepath, monitor = 'val_loss', 
                                              verbose = 0, save_best_only = True, 
                                              save_weights_only=False, mode='auto')

model_cf.fit(x=[cols[train_idx],rows[train_idx]],y=transactions[train_idx],
             validation_data=([cols[val_idx], rows[val_idx]],transactions[val_idx]),epochs=10,
             callbacks = [my_callback, lr_scheduler_callback,EarlyStopping(monitor='val_loss', patience = 3, verbose = 0)])

Train on 67956 samples, validate on 16989 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


<keras.callbacks.History at 0x7feda48207b8>

In [0]:
from keras.models import load_model
model = load_model(filepath)

In [0]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_23 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_24 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
sequential_72 (Sequential)      (None, 20)           157000      input_23[0][0]                   
__________________________________________________________________________________________________
sequential_73 (Sequential)      (None, 20)           11440       input_24[0][0]                   
__________________________________________________________________________________________________
dot_24 (Do

In [0]:
#make some predictions for user

model.predict([[item_dict[260]], [user_dict[some_user_id]]])

array([[0.18454112]], dtype=float32)

# Content-based approach

In [0]:
# load movies

movies = pd.read_csv("ml-latest-small/movies.csv")

In [0]:
def explode(df, lst_cols, fill_value=''):
    # make sure `lst_cols` is a list
    if lst_cols and not isinstance(lst_cols, list):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)

    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()

    if (lens > 0).all():
        # ALL lists in cells aren't empty
        return pd.DataFrame({
            col:np.repeat(df[col].values, df[lst_cols[0]].str.len())
            for col in idx_cols
        }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \
          .loc[:, df.columns]
    else:
        # at least one list in cells is empty
        return pd.DataFrame({
            col:np.repeat(df[col].values, df[lst_cols[0]].str.len())
            for col in idx_cols
        }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \
          .append(df.loc[lens==0, idx_cols]).fillna(fill_value) \
          .loc[:, df.columns]

In [0]:
# split genres
movies_genres = movies
movies_genres.genres = movies_genres.genres.str.split('|')
movies_genres = explode(movies_genres, ['genres'])
movies_genres.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
1,1,Toy Story (1995),Animation
2,1,Toy Story (1995),Children
3,1,Toy Story (1995),Comedy
4,1,Toy Story (1995),Fantasy


In [0]:
# drop movie's title
movies_genres = movies_genres.drop('title', axis=1)
movies_genres.head()

Unnamed: 0,movieId,genres
0,1,Adventure
1,1,Animation
2,1,Children
3,1,Comedy
4,1,Fantasy


In [0]:
# merge for genres estimation
genre_ratings = ratings.merge(movies_genres, left_on='movieId', right_on='movieId', how='inner')
genre_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,genres
0,1,31,2.5,1260759144,Drama
1,7,31,3.0,851868750,Drama
2,31,31,4.0,1273541953,Drama
3,32,31,4.0,834828440,Drama
4,36,31,3.0,847057202,Drama


In [0]:
# group ranks for concreate user

ratings_user = genre_ratings.groupby(['userId', 'genres'], as_index=False)['rating'].aggregate(np.mean)
ratings_user.head()

Unnamed: 0,userId,genres,rating
0,1,Action,2.8
1,1,Adventure,2.166667
2,1,Animation,2.0
3,1,Children,2.5
4,1,Comedy,2.0


In [0]:
# find user representation
user_repr = {}
genres = ratings_user["genres"].unique()
genres_map = {}
for i,g in enumerate(genres):
    genres_map[g] = i 
for u in ratings_user["userId"].unique():
    user_repr[u] = np.zeros(len(genres))
    
def update_user_dict(s):
    user_repr[s["userId"]][genres_map[s["genres"]]] += s["rating"]
    
ratings_user.apply(lambda s : update_user_dict(s),axis=1)

0        None
1        None
2        None
3        None
4        None
5        None
6        None
7        None
8        None
9        None
10       None
11       None
12       None
13       None
14       None
15       None
16       None
17       None
18       None
19       None
20       None
21       None
22       None
23       None
24       None
25       None
26       None
27       None
28       None
29       None
         ... 
11002    None
11003    None
11004    None
11005    None
11006    None
11007    None
11008    None
11009    None
11010    None
11011    None
11012    None
11013    None
11014    None
11015    None
11016    None
11017    None
11018    None
11019    None
11020    None
11021    None
11022    None
11023    None
11024    None
11025    None
11026    None
11027    None
11028    None
11029    None
11030    None
11031    None
Length: 11032, dtype: object

In [0]:
# merging movie and ratings
merged_df = pd.merge(movies, ratings,on="movieId",how="inner")

In [0]:
merged_df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",7,3.0,851866703
1,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",9,4.0,938629179
2,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",13,5.0,1331380058
3,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",15,2.0,997938310
4,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",19,3.0,855190091


In [0]:
merged_df["index"] = merged_df.index

In [0]:
features = np.zeros((merged_df.shape[0],2*len(genres)))
y = merged_df["rating"]
def update_features(s):
    for g in s["genres"]:
        features[s["index"],genres_map[g]] = 1.0
    features[s["index"],len(genres):] = user_repr[s["userId"]]
    
merged_df.apply(lambda s : update_features(s),axis=1)

Train regression model

In [0]:
# split on train and test
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import log_loss, f1_score, r2_score,mean_squared_error

In [0]:
scores = []
kfold = KFold(5, random_state=42)
for train_idx, test_idx in kfold.split(features):
    X_train, X_test = features[train_idx], features[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    reg = RandomForestRegressor(n_estimators=100)
    reg.fit(X_train,y_train)
    y_pred = reg.predict(X_test)
    scores.append(mean_squared_error(y_test,y_pred))

In [0]:
scores

[0.9487305064143827,
 1.0514048191217693,
 1.0644210784036658,
 1.0127929052066091,
 0.9197477948705246]

In [0]:
def predict_on_new_movie(mid,userid):

    new_movie = movies[movies["movieId"] == mid]
    new_genres = np.array(new_movie["genres"])
    new_features = np.zeros((1,2*len(genres)))
    for g in new_genres[0]:
        new_features[0,genres_map[g]] = 1.0
        new_features[0,len(genres):] = user_repr[userid]
    return reg.predict(new_features)[0]

In [0]:
predict_on_new_movie(61160,343)

4.898

In [0]:
one_user_df.head(15)

Unnamed: 0,userId,movieId,rating,timestamp,title
46667,343,1,5.0,881166397,Toy Story (1995)
46668,343,6,5.0,881166915,Heat (1995)
46686,343,778,5.0,881166832,Trainspotting (1996)
46685,343,745,5.0,881166495,Wallace & Gromit: A Close Shave (1995)
46684,343,733,5.0,881166350,"Rock, The (1996)"
46692,343,858,5.0,881166452,"Godfather, The (1972)"
46694,343,994,5.0,881166495,Big Night (1996)
46696,343,1042,5.0,881167197,That Thing You Do! (1996)
46697,343,1060,5.0,881166452,Swingers (1996)
46698,343,1073,5.0,881167085,Willy Wonka & the Chocolate Factory (1971)
