In [1]:
from collections import defaultdict

from lightfm import LightFM
from lightfm import evaluation
import numpy as np
import pandas as pd
from sklearn import model_selection

import resources as re

In [2]:
# Data from https://grouplens.org/datasets/movielens/
ratings_df = pd.read_csv('data/ratings.tsv', delimiter='\t')
users_df = pd.read_csv('data/users.tsv', delimiter='\t')
items_df = pd.read_csv('data/items.tsv', delimiter='\t')

In [3]:
# Users and items IDs start in 1, let's fix this
ratings_df.user = ratings_df.user - 1
ratings_df.item = ratings_df.item - 1
users_df.user = users_df.user - 1
items_df.item = items_df.item - 1

In [4]:
ratings_df.head()

Unnamed: 0,user,item,rating,timestamp
0,195,241,3,881250949
1,185,301,3,891717742
2,21,376,1,878887116
3,243,50,2,880606923
4,165,345,1,886397596


In [5]:
users_df.head()

Unnamed: 0,user,age,gender,occupation,zip_code
0,0,24,M,technician,85711
1,1,53,F,other,94043
2,2,23,M,writer,32067
3,3,24,M,technician,43537
4,4,33,F,other,15213


In [6]:
items_df.head()

Unnamed: 0,item,title,release,url,unkown,action,adventure,animation,children,comedy,...,fantasy,noir,horror,musical,mystery,romance,scifi,thriller,war,western
0,0,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0.0
1,1,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0.0
2,2,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0.0
3,3,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0.0
4,4,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0.0


In [7]:
user_features_dict = re.build_dict(np.hstack(['age', 'female', 'male', users_df.occupation.unique()]))
user_features_df = re.build_users_dataframe(users_df, user_features_dict)

In [8]:
user_features_df.head()

Unnamed: 0,user,feature,value
0,0.0,0.0,24.0
1,0.0,2.0,1.0
2,0.0,3.0,1.0
3,1.0,0.0,53.0
4,1.0,1.0,1.0


In [9]:
item_features_dict = re.build_dict(np.hstack(['release', items_df.columns[-19:]]))
item_features_df = re.build_items_dataframe(items_df, item_features_dict)

In [10]:
item_features_df.head()

Unnamed: 0,fantasy,unkown,drama,thriller,children,war,animation,action,noir,mystery,musical,scifi,adventure,horror,comedy,documentary,western,release,romance,crime
0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0.0,1995,0,0
1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0.0,1995,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0.0,1995,0,0
3,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0.0,1995,0,0
4,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0.0,1995,0,1


In [11]:
# Getting stats
length = len(ratings_df)
num_users = len(ratings_df['user'].unique())
num_items = len(ratings_df['item'].unique())
num_user_features = len(user_features_dict)
num_item_features = len(item_features_dict)

In [12]:
length, num_users, num_items, num_user_features, num_item_features

(100000, 943, 1682, 24, 20)

In [13]:
# Getting a quasi-k-fold
ss = model_selection.ShuffleSplit(n_splits=1, random_state=19, test_size=0.1)
for train_index, test_index in ss.split(ratings_df):
    break

In [14]:
len(train_index), len(test_index)

(90000, 10000)

In [15]:
# Getting the training and testing datasets
train_df = ratings_df.iloc[train_index]
test_df = ratings_df.iloc[test_index]

In [16]:
# Transforming the final data to sparse matrices
train_data = re.build_interaction_matrix(num_users, num_items, train_df,
                                         re.collaborative_filter, {'min_rating': 3})
test_data = re.build_interaction_matrix(num_users, num_items,
                                        test_df, re.collaborative_filter, {'min_rating': 3})
user_features = re.build_interaction_matrix(num_users, num_user_features,
                                            user_features_df, re.content_filter, {'kind': 'user'})
item_features = re.build_interaction_matrix(item_features_df)

In [17]:
# Training the model
model = LightFM(no_components=5, loss='warp', learning_schedule='adagrad',
                random_state=19)

model.fit(train_data, # user_features=user_features, item_features=item_features,
          epochs=5, num_threads=2)

<lightfm.lightfm.LightFM at 0x7f3b2efa2c88>

In [18]:
# Scoring...
auc_score_train = evaluation.auc_score(model, train_data,
                                       # user_features=user_features, item_features=item_features,
                                       num_threads=2).mean()
auc_score_test = evaluation.auc_score(model, test_data,
                                      # user_features=user_features, item_features=item_features,
                                      num_threads=2).mean()

In [19]:
auc_score_train, auc_score_test

(0.91944134, 0.89603335)

In [20]:
# And getting a recommendation!!
user_id = 2  # The user for recommending movies
year = 1995  # A filter

scores = model.predict(user_id, np.arange(num_items), user_features=user_features,
                       item_features=item_features, num_threads=2)

scores_index_sorted = scores.argsort()[::-1]  # Sorting, getting indeces and reversing
after_95 = item_features_df.release.values > year  # Filtering by year
after_95_sorted = after_95[scores_index_sorted]  # Sorting filter
recommendation = scores_index_sorted[after_95_sorted]  # Getting recommendation

items_df.title.values[recommendation][:100]  # Getting recommendation tittles

array(['From Dusk Till Dawn (1996)', 'Muppet Treasure Island (1996)',
       'Escape from L.A. (1996)', 'Lost World: Jurassic Park, The (1997)',
       'Spawn (1997)', 'Arrival, The (1996)', 'Solo (1996)',
       'Event Horizon (1997)', 'Hellraiser: Bloodline (1996)',
       'Men in Black (1997)', 'Face/Off (1997)',
       'Alien: Resurrection (1997)', 'Lost in Space (1998)',
       'Star Trek: First Contact (1996)', 'Deep Rising (1998)',
       'Space Jam (1996)', 'Nutty Professor, The (1996)',
       'Dragonheart (1996)', 'Dark City (1998)', 'Maximum Risk (1996)',
       'Daylight (1996)', 'Twister (1996)', 'Rock, The (1996)',
       'Chain Reaction (1996)', 'Hercules (1997)', 'Barb Wire (1996)',
       'Mars Attacks! (1996)', 'Close Shave, A (1995)', 'Anaconda (1997)',
       'Con Air (1997)', 'Mulholland Falls (1996)',
       'Rumble in the Bronx (1995)',
       'Ghost in the Shell (Kokaku kidotai) (1995)',
       'Warriors of Virtue (1997)', 'Fifth Element, The (1997)',
       'Re