In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

LightFM is a Python implementation of a number of popular recommendation algorithms. LightFM includes implementations of BPR and WARP ranking losses(A loss function is a measure of how good a prediction model does in terms of being able to predict the expected outcome.).LightFm also contains a large set of datasets related to the movie rating. 

BPR: Bayesian Personalised Ranking pairwise loss: It maximizes the prediction difference between a positive example and a randomly chosen negative example. It is useful when only positive interactions are present.

WARP: Weighted Approximate-Rank Pairwise loss: Maximises the rank of positive examples by repeatedly sampling negative examples until rank violating one is found

In [2]:
pip install lightfm

[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np
from lightfm.datasets import fetch_movielens
from lightfm import LightFM

fetch_movielens method is the method from lightfm that can be used to fetch movie data. We can fetch the movie data with a minimum rating of 4.

In [4]:
data = fetch_movielens(min_rating = 4.0)

The ‘data’ variable will contain the movie data that is divided into many categories test and train.

In [5]:
data

{'train': <943x1682 sparse matrix of type '<class 'numpy.int32'>'
 	with 49906 stored elements in COOrdinate format>,
 'test': <943x1682 sparse matrix of type '<class 'numpy.int32'>'
 	with 5469 stored elements in COOrdinate format>,
 'item_features': <1682x1682 sparse matrix of type '<class 'numpy.float32'>'
 	with 1682 stored elements in Compressed Sparse Row format>,
 'item_feature_labels': array(['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', ...,
        'Sliding Doors (1998)', 'You So Crazy (1994)',
        'Scream of Stone (Schrei aus Stein) (1991)'], dtype=object),
 'item_labels': array(['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', ...,
        'Sliding Doors (1998)', 'You So Crazy (1994)',
        'Scream of Stone (Schrei aus Stein) (1991)'], dtype=object)}

In [6]:
print(repr(data['train'])) # repr() Function returns a printable representation of the object passed to it.
print(repr(data['test']))

<943x1682 sparse matrix of type '<class 'numpy.int32'>'
	with 49906 stored elements in COOrdinate format>
<943x1682 sparse matrix of type '<class 'numpy.int32'>'
	with 5469 stored elements in COOrdinate format>


We can see that the amount of train data is much greater than the test data. This because typically when you separate a dataset into a training set and testing set, most of the data is used for training.

**Modelling using 'warp' loss**

In [7]:
model = LightFM(loss = 'warp')

train this model using our train data, with an epoch or iteration value of 30.

In [8]:
model.fit(data['train'], epochs=30, num_threads=2)

<lightfm.lightfm.LightFM at 0x7fc0dac24450>

build the function that process this data to recommend movies for any number of users. Our function will take the model, data and an array of user_ids.

In [9]:
def sample_recommendation(model, data, user_ids): 
    n_users, n_items = data['train'].shape #get the number of all users and movies.
    for user_id in user_ids: # iterate the user_ids.
        known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices] #For each user, we need to find the known positives or the movies they liked
        # we need to find the movies the user like. 
        # This can be done by the predict method of LightFm. 
        # The parameters of this function are user_id and the n_items variable arrange by numpy arrange
        scores = model.predict(user_id, np.arange(n_items)) 
        top_items = data['item_labels'][np.argsort(-scores)] #sort the scores based on the order of most liked to least liked
        print("User %s" % user_id) #  print the first 3 known positives and 3 predictions.
        print("Known positives:")
        for x in known_positives[:3]:
            print("%s" % x)
            print("Recommended:")
        for x in top_items[:3]:
            print("%s" % x)

In [10]:
sample_recommendation(model, data, [3, 25, 451]) #providing three random user_ids.

User 3
Known positives:
Seven (Se7en) (1995)
Recommended:
Contact (1997)
Recommended:
Starship Troopers (1997)
Recommended:
Scream (1996)
Cop Land (1997)
Jackie Brown (1997)
User 25
Known positives:
Dead Man Walking (1995)
Recommended:
Star Wars (1977)
Recommended:
Fargo (1996)
Recommended:
English Patient, The (1996)
Fargo (1996)
Contact (1997)
User 451
Known positives:
Twelve Monkeys (1995)
Recommended:
Babe (1995)
Recommended:
Mr. Holland's Opus (1995)
Recommended:
Raiders of the Lost Ark (1981)
Casablanca (1942)
Citizen Kane (1941)


In [11]:
# Extract our training and test datasets
train = data['train']
test = data['test']

In [12]:
# Evaluate it's performance
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10).mean()

train_auc = auc_score(model, train).mean()
test_auc = auc_score(model, test).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

Precision: train 0.53, test 0.08.
AUC: train 0.96, test 0.92.


**Modelling using 'bpr' loss**

In [13]:
model1 = LightFM(loss = 'bpr')

In [14]:
model1.fit(data['train'], epochs=30, num_threads=2)

<lightfm.lightfm.LightFM at 0x7fc0db671950>

In [15]:
# Evaluate it's performance
train_precision = precision_at_k(model1, train, k=10).mean()
test_precision = precision_at_k(model1, test, k=10).mean()

train_auc = auc_score(model1, train).mean()
test_auc = auc_score(model1, test).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

Precision: train 0.51, test 0.06.
AUC: train 0.92, test 0.85.


In [16]:
def sample_recommendation(model1, data, user_ids): 
    n_users, n_items = data['train'].shape #get the number of all users and movies.
    for user_id in user_ids: # iterate the user_ids.
        known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices] #For each user, we need to find the known positives or the movies they liked
        # we need to find the movies the user like. 
        # This can be done by the predict method of LightFm. 
        # The parameters of this function are user_id and the n_items variable arrange by numpy arrange
        scores = model1.predict(user_id, np.arange(n_items)) 
        top_items = data['item_labels'][np.argsort(-scores)] #sort the scores based on the order of most liked to least liked
        print("User %s" % user_id) #  print the first 3 known positives and 3 predictions.
        print("Known positives:")
        for x in known_positives[:3]:
            print("%s" % x)
            print("Recommended:")
        for x in top_items[:3]:
            print("%s" % x)

In [17]:
sample_recommendation(model1, data, [3, 25, 451]) #providing three random user_ids.

User 3
Known positives:
Seven (Se7en) (1995)
Recommended:
Contact (1997)
Recommended:
Starship Troopers (1997)
Recommended:
Contact (1997)
Chasing Amy (1997)
Jackie Brown (1997)
User 25
Known positives:
Dead Man Walking (1995)
Recommended:
Star Wars (1977)
Recommended:
Fargo (1996)
Recommended:
Titanic (1997)
Contact (1997)
L.A. Confidential (1997)
User 451
Known positives:
Twelve Monkeys (1995)
Recommended:
Babe (1995)
Recommended:
Mr. Holland's Opus (1995)
Recommended:
Raiders of the Lost Ark (1981)
Amadeus (1984)
Sting, The (1973)


**Modelling using 'logostic' loss**

In [18]:
model2 = LightFM(loss = 'logistic')

In [19]:
model2.fit(data['train'], epochs=30, num_threads=2)

<lightfm.lightfm.LightFM at 0x7fc0dac108d0>

In [20]:
# Evaluate it's performance
train_precision = precision_at_k(model2, train, k=10).mean()
test_precision = precision_at_k(model2, test, k=10).mean()

train_auc = auc_score(model2, train).mean()
test_auc = auc_score(model2, test).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

Precision: train 0.34, test 0.05.
AUC: train 0.88, test 0.87.


In [21]:
def sample_recommendation(model2, data, user_ids): 
    n_users, n_items = data['train'].shape #get the number of all users and movies.
    for user_id in user_ids: # iterate the user_ids.
        known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices] #For each user, we need to find the known positives or the movies they liked
        # we need to find the movies the user like. 
        # This can be done by the predict method of LightFm. 
        # The parameters of this function are user_id and the n_items variable arrange by numpy arrange
        scores = model2.predict(user_id, np.arange(n_items)) 
        top_items = data['item_labels'][np.argsort(-scores)] #sort the scores based on the order of most liked to least liked
        print("User %s" % user_id) #  print the first 3 known positives and 3 predictions.
        print("Known positives:")
        for x in known_positives[:3]:
            print("%s" % x)
            print("Recommended:")
        for x in top_items[:3]:
            print("%s" % x)

In [22]:
sample_recommendation(model2, data, [3, 25, 451]) #providing three random user_ids.

User 3
Known positives:
Seven (Se7en) (1995)
Recommended:
Contact (1997)
Recommended:
Starship Troopers (1997)
Recommended:
Star Wars (1977)
Fargo (1996)
Return of the Jedi (1983)
User 25
Known positives:
Dead Man Walking (1995)
Recommended:
Star Wars (1977)
Recommended:
Fargo (1996)
Recommended:
Star Wars (1977)
Fargo (1996)
Return of the Jedi (1983)
User 451
Known positives:
Twelve Monkeys (1995)
Recommended:
Babe (1995)
Recommended:
Mr. Holland's Opus (1995)
Recommended:
Star Wars (1977)
Fargo (1996)
Return of the Jedi (1983)


**Modelling using 'kOS warp' loss**

In [23]:
model3 = LightFM(loss = 'warp-kos')

In [24]:
model3.fit(data['train'], epochs=30, num_threads=2)

<lightfm.lightfm.LightFM at 0x7fc0dac2ff50>

In [25]:
# Evaluate it's performance
train_precision = precision_at_k(model3, train, k=10).mean()
test_precision = precision_at_k(model3, test, k=10).mean()

train_auc = auc_score(model3, train).mean()
test_auc = auc_score(model3, test).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

Precision: train 0.51, test 0.07.
AUC: train 0.93, test 0.89.


In [26]:
def sample_recommendation(model3, data, user_ids): 
    n_users, n_items = data['train'].shape #get the number of all users and movies.
    for user_id in user_ids: # iterate the user_ids.
        known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices] #For each user, we need to find the known positives or the movies they liked
        # we need to find the movies the user like. 
        # This can be done by the predict method of LightFm. 
        # The parameters of this function are user_id and the n_items variable arrange by numpy arrange
        scores = model3.predict(user_id, np.arange(n_items)) 
        top_items = data['item_labels'][np.argsort(-scores)] #sort the scores based on the order of most liked to least liked
        print("User %s" % user_id) #  print the first 3 known positives and 3 predictions.
        print("Known positives:")
        for x in known_positives[:3]:
            print("%s" % x)
            print("Recommended:")
        for x in top_items[:3]:
            print("%s" % x)

In [27]:
sample_recommendation(model3, data, [3, 25, 451]) #providing three random user_ids.

User 3
Known positives:
Seven (Se7en) (1995)
Recommended:
Contact (1997)
Recommended:
Starship Troopers (1997)
Recommended:
In & Out (1997)
Contact (1997)
Starship Troopers (1997)
User 25
Known positives:
Dead Man Walking (1995)
Recommended:
Star Wars (1977)
Recommended:
Fargo (1996)
Recommended:
Fargo (1996)
Godfather, The (1972)
L.A. Confidential (1997)
User 451
Known positives:
Twelve Monkeys (1995)
Recommended:
Babe (1995)
Recommended:
Mr. Holland's Opus (1995)
Recommended:
Casablanca (1942)
Amadeus (1984)
Graduate, The (1967)
