# LightFM Demo 

In [5]:
import itertools
import os
import zipfile

import numpy as np
import pandas as pd

import requests

import scipy.sparse as sp

# Download Movielense data 

In [8]:
url = "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
req = requests.get(url, stream=True)

dest_path = 'data/movielense.zip'

if not os.path.exists(dest_path):
    with open(dest_path, "wb") as fd:
        for chunk in req.iter_content():
            fd.write(chunk)

# Get raw data

In [9]:
def get_raw_data(dest_path):
    with zipfile.ZipFile(dest_path) as datafile:
        train_raw = datafile.read("ml-100k/ua.base").decode().split("\n")
        test_raw = datafile.read("ml-100k/ua.test").decode().split("\n")
    return train_raw, test_raw
train_raw, test_raw = get_raw_data(dest_path)

The data contains user_id, item_id, rating and timestamp which are tab seperated and have to be parsed. To do so we create a generator that will parse the data for us.

In [10]:
def parse(data):
    for line in data:

        if not line:
            continue

        uid, iid, rating, timestamp = [int(x) for x in line.split("\t")]

        yield uid, iid, rating, timestamp

Lets explore the data a bit...

In [11]:
data = []
for uid, iid, rating, timestamp in parse(train_raw):
    data.append([uid, iid, rating, timestamp])
for uid, iid, rating, timestamp in parse(test_raw):
    data.append([uid, iid, rating, timestamp])
df = pd.DataFrame(data, columns=['user_id', 'item_id', 'rating', 'timestamp'])

In [12]:
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [13]:
print(f'Number of ratings: {len(df)}')
print(f'Number of users: {len(df["user_id"].unique())}')
print(f'Number of movies: {len(df["item_id"].unique())}')

Number of ratings: 100000
Number of users: 943
Number of movies: 1682


# Building the user-item interaction matrix

LightFM expects a (no_users, no_items) sparse matrix (with 1s denoting positive, and -1s negative interactions), so lets build that...

We will consider movie ratings > 4.0 as positive ratings and others as negative ratings and create an interaction matrix and store it as a sparse matrix.

In [14]:
def build_interaction_matrix(rows, cols, data):
    """
    Build the training matrix (no_users, no_items),
    with ratings >= 4.0 being marked as positive and
    the rest as negative.
    """

    mat = sp.lil_matrix((rows, cols), dtype=np.int32)

    for uid, iid, rating, timestamp in data:
        if rating >= 4.0:
            mat[uid, iid] = 1.0
        else:
            mat[uid, iid] = -1.0

    return mat.tocoo()


In [15]:
rows = len(df["user_id"].unique()) + 1
cols = len(df["item_id"].unique()) + 1

print(rows, cols)

train = build_interaction_matrix(rows, cols, parse(train_raw))
test = build_interaction_matrix(rows, cols, parse(test_raw))

944 1683


In [16]:
train.shape

(944, 1683)

# Get Raw Metadata

Information about the items (movies) is a tab separated list of movie id | movie title | release date | video release date | IMDb URL | unknown | Action | Adventure | Animation | Children's | Comedy | Crime | Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western | The last 19 fields are the genres, a 1 indicates the movie is of that genre, a 0 indicates it is not; movies can be in several genres at once.

In [17]:
def get_raw_movie_meta(dest_path):
    with zipfile.ZipFile(dest_path) as datafile:
        movie_meta = datafile.read("ml-100k/u.item").decode(errors="ignore").split("\n")
    return movie_meta
movie_meta_raw = get_raw_movie_meta(dest_path)

In [18]:
movie_meta_raw[:5]

['1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0',
 '2|GoldenEye (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?GoldenEye%20(1995)|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0',
 '3|Four Rooms (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0',
 '4|Get Shorty (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995)|0|1|0|0|0|1|0|0|1|0|0|0|0|0|0|0|0|0|0',
 '5|Copycat (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Copycat%20(1995)|0|0|0|0|0|0|1|0|1|0|0|0|0|0|0|0|1|0|0']

In [19]:
def get_item_labels(item_meta_raw):
    labels = ['']
    for line in item_meta_raw:

        if not line:
            continue

        splt = line.split("|")
        item_label = splt[1]
        labels.append(item_label)
    return np.array(labels)

item_labels = get_item_labels(movie_meta_raw)

# Build Item Features
To use the item meta data as feature in LightFM we need to convert the raw data into a (no_items, no_features) sparse matrix.

In [26]:
def get_movielens_item_metadata(raw_data, use_item_ids):
    """
    Build a matrix of genre features (no_items, no_features).
    If use_item_ids is True, per-item features will also be used.
    """

    features = {}
    genre_set = set()

    for line in raw_data:

        if not line:
            continue

        splt = line.split("|")
        item_id = int(splt[0])

        genres = [
            idx for idx, val in zip(range(len(splt[5:])), splt[5:]) if int(val) > 0
        ]

        if use_item_ids:
            # Add item-specific features too
            genres.append(item_id)

        for genre_id in genres:
            genre_set.add(genre_id)

        features[item_id] = genres

    mat = sp.lil_matrix((len(features) + 1, len(genre_set)), dtype=np.int32)

    for item_id, genre_ids in features.items():
        for genre_id in genre_ids:
            mat[item_id, genre_id] = 1

    return mat
item_features = get_movielens_item_metadata(movie_meta_raw, True)

# Get Raw User Data 
The user data is tab seperated list of Demographic information about the users; this is a tab separated list of user id | age | gender | occupation | zip code

In [21]:
def get_raw_user_meta(dest_path):
    with zipfile.ZipFile(dest_path) as datafile:
        user_meta = datafile.read("ml-100k/u.user").decode(errors="ignore").split("\n")
    return user_meta
user_meta_raw = get_raw_user_meta(dest_path)

In [22]:
user_meta_raw[:5]

['1|24|M|technician|85711',
 '2|53|F|other|94043',
 '3|23|M|writer|32067',
 '4|24|M|technician|43537',
 '5|33|F|other|15213']

# Building The Model
Now that we have processed our data we are ready to build our model...

In [23]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, auc_score



## Train the model

LightFM implements two algorithms that have proven particular successful:

BPR: Bayesian Personalised Ranking pairwise loss. Maximises the prediction difference between a positive example and a randomly chosen negative example. Useful when only positive interactions are present and optimising ROC AUC is desired.

WARP: Weighted Approximate-Rank Pairwise loss. Maximises the rank of positive examples by repeatedly sampling negative examples until rank violating one is found. Useful when only positive interactions are present and optimising the top of the recommendation list (precision@k) is desired.

In [39]:
model = LightFM(no_components=30)
model.fit(train, epochs=20)

<lightfm.lightfm.LightFM at 0x157eba8b0>

In [25]:
test_precision = precision_at_k(model, test, k=20).mean()
test_auc = auc_score(model, test).mean()
print(f'AUC: {test_auc}')
print(f'Precision: {test_precision}')

AUC: 0.6307819485664368
Precision: 0.033881232142448425


In [70]:
model = LightFM(loss='warp')
model.fit(train, epochs=20)

<lightfm.lightfm.LightFM at 0x14796aa60>

In [71]:
test_precision = precision_at_k(model, test, k=20).mean()
test_auc = auc_score(model, test).mean()
print(f'AUC: {test_auc}')
print(f'Precision: {test_precision}')

AUC: 0.8916676044464111
Precision: 0.09490986168384552


In [30]:
item_features.shape

(1683, 1683)

In [37]:
model = LightFM(loss='warp', no_components=30)
model.fit(train,
          item_features=item_features,
          epochs=20)

<lightfm.lightfm.LightFM at 0x157f19e50>

In [38]:
test_precision = precision_at_k(model, test, k=20, item_features=item_features).mean()
test_auc = auc_score(model, test, item_features=item_features).mean()
print(f'AUC: {test_auc}')
print(f'Precision: {test_precision}')

AUC: 0.8883857727050781
Precision: 0.08594910055398941


In [41]:
def sample_recommendation(model, train_data, user_ids, item_labels, topk=5):
    n_users, n_items = train_data.shape
    for user_id in user_ids: 
        #movies they always like
        known_positives = item_labels[train.tocsr()[user_id].indices]
        scores = model.predict(user_id, np.arange(n_items))
        #rank in order of most liked to least
        top_items = item_labels[np.argsort(-scores)]
        #print results
        print("\nUser %s" % user_id)
        print("Most Liked:")

        for x in known_positives[:topk]:
            print("%s" % x)
        
        print("Recommend:")

        for x in top_items[:topk]:
            print("%s" % x)

In [42]:
sample_recommendation(model, train, [1], item_labels)


User 1
Most Liked:
Toy Story (1995)
GoldenEye (1995)
Four Rooms (1995)
Get Shorty (1995)
Copycat (1995)
Recommend:
Star Wars (1977)
Raiders of the Lost Ark (1981)
Godfather, The (1972)
Lawrence of Arabia (1962)
Chasing Amy (1997)
