In [4]:
!pip install lightfm -q

[?25l[K     |█                               | 10 kB 19.6 MB/s eta 0:00:01[K     |██▏                             | 20 kB 25.5 MB/s eta 0:00:01[K     |███▏                            | 30 kB 13.2 MB/s eta 0:00:01[K     |████▎                           | 40 kB 9.9 MB/s eta 0:00:01[K     |█████▎                          | 51 kB 5.5 MB/s eta 0:00:01[K     |██████▍                         | 61 kB 6.0 MB/s eta 0:00:01[K     |███████▍                        | 71 kB 5.8 MB/s eta 0:00:01[K     |████████▌                       | 81 kB 6.5 MB/s eta 0:00:01[K     |█████████▌                      | 92 kB 4.9 MB/s eta 0:00:01[K     |██████████▋                     | 102 kB 5.3 MB/s eta 0:00:01[K     |███████████▋                    | 112 kB 5.3 MB/s eta 0:00:01[K     |████████████▊                   | 122 kB 5.3 MB/s eta 0:00:01[K     |█████████████▊                  | 133 kB 5.3 MB/s eta 0:00:01[K     |██████████████▉                 | 143 kB 5.3 MB/s eta 0:00:01[K  

In [5]:
# download data
!wget http://www2.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip -q
!unzip -q ./BX-CSV-Dump.zip

In [1]:
import pandas as pd
import numpy as np
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score
from sklearn.model_selection import train_test_split

In [2]:
def get_data():
    books = pd.read_csv(
        open('./BX-Books.csv', 'r', encoding='utf-8', errors='ignore'), 
        sep=';',
        error_bad_lines=False
    )

    users = pd.read_csv(
        open('./BX-Users.csv', 'r', encoding='utf-8', errors='ignore'), 
        sep=';',
        error_bad_lines=False
    )

    ratings = pd.read_csv(
        open('./BX-Book-Ratings.csv', 'r', encoding='utf-8', errors='ignore'), 
        sep=';',
        error_bad_lines=False
    )
    
    ratings = ratings[ratings['User-ID'].isin(users['User-ID'])]
    
    ratings = ratings[ratings['ISBN'].isin(books['ISBN'])]

    return {
        'books': books,
        'ratings': ratings,
        'users': users,
    }

In [3]:
data = get_data()

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'
  if self.run_code(code, result):


# fit dataset

Lets say you have 2 isbn, 3 author, 1 year in your dataset
```
isbn_1, isbn_2
author_1, author_2, author_3
year_1
```

mapping will be created that maps each value to integer
```python
{
  'isbn_1': 0, 
  'isbn_2': 1, 
  'author_1': 2, 
  'author_2': 3,
  'author_3': 4, 
  'year_1' : 5
}
```

Given book with isbn_1, author_1, year_1 it will be encoded as ```[1, 0, 1, 0, 0, 1]```. It has 1 at position 
```
0 (mapping['isbn_1']) 
1 (mapping['author_1'])
5 (mapping['year_1'])
```

you can use `dataset.fit` to create mappings and `dataset.partial_fit` to update the mappings

## fit user_id / item_id

In [4]:
dataset = Dataset()

book_ids = data['books']['ISBN']
user_ids = data['users']['User-ID']

# internally consecutive non negative integers are used as ids
# map ISBN to internal item-id
# map User-ID to internal user-id
dataset.fit(user_ids, book_ids)

In [5]:
# check shape
num_users, num_items = dataset.interactions_shape()
print("num users", num_users, "num_items", num_items)

num users 278858 num_items 271360


## fit user/item features


In [6]:
dataset.fit_partial(
    item_features=data['books']['Book-Author']
)

In [7]:
dataset.fit_partial(
     item_features=data['books']['Year-Of-Publication']
)

## internal dictionaires

In [8]:
def peek_dict(d, n=2):
    return list(d.items())[:n]

In [9]:
print("dictionary that maps user_id to internal_user_id")
print(peek_dict(dataset._user_id_mapping))
print()

print("dictionary that maps item_id to internal_user_id")
print(peek_dict(dataset._item_id_mapping))
print()

print("dictionary that maps user_id / user_feature to its position in user feature vector")
print(peek_dict(dataset._user_feature_mapping, 10))
print()

print("dictionary that maps item_id / item_feature to its position in item feature vector")
print(peek_dict(dataset._item_feature_mapping, 10))
print()

dictionary that maps user_id to internal_user_id
[(1, 0), (2, 1)]

dictionary that maps item_id to internal_user_id
[('0195153448', 0), ('0002005018', 1)]

dictionary that maps user_id / user_feature to its position in user feature vector
[(1, 0), (2, 1), (3, 2), (4, 3), (5, 4), (6, 5), (7, 6), (8, 7), (9, 8), (10, 9)]

dictionary that maps item_id / item_feature to its position in item feature vector
[('0195153448', 0), ('0002005018', 1), ('0060973129', 2), ('0374157065', 3), ('0393045218', 4), ('0399135782', 5), ('0425176428', 6), ('0671870432', 7), ('0679425608', 8), ('074322678X', 9)]



In [10]:
dataset.interactions_shape()

(278858, 271360)

In [11]:
dataset.user_features_shape()

(278858, 278858)

In [12]:
dataset.item_features_shape()

(271360, 373584)

In [13]:
expected_dim = (data['books']['ISBN'].nunique() + 
                data['books']['Book-Author'].nunique() + 
                data['books']['Year-Of-Publication'].nunique())

actual_dim = dataset.item_features_shape()[1] + 1

assert  actual_dim == expected_dim, "Invalid Item Vector Shape"

# build

## how to use build

In [14]:
user = item_gen = ((row['User-ID'], []) for _, row in data['users'][:2].iterrows())

user_features = dataset.build_user_features(user)

user_features

<278858x278858 sparse matrix of type '<class 'numpy.float32'>'
	with 278858 stored elements in Compressed Sparse Row format>

In [15]:
item_gen = ((row['ISBN'], [row['Book-Author'], row['Year-Of-Publication']]) for _, row in data['books'][:2].iterrows())

item_features = dataset.build_item_features(item_gen)

item_features

<271360x373584 sparse matrix of type '<class 'numpy.float32'>'
	with 271364 stored elements in Compressed Sparse Row format>

In [16]:
interactions_gen = ((row['User-ID'], row['ISBN']) for _, row in data['ratings'][:2].iterrows())

interactions, weights = dataset.build_interactions(interactions_gen)

interactions

<278858x271360 sparse matrix of type '<class 'numpy.int32'>'
	with 2 stored elements in COOrdinate format>

In [17]:
# we are not considering ratings so its implicit data
weights

<278858x271360 sparse matrix of type '<class 'numpy.float32'>'
	with 2 stored elements in COOrdinate format>

# train model

In [18]:
from lightfm.cross_validation import random_train_test_split

In [19]:
df_merged = pd.merge(data['ratings'], data['books'], on='ISBN')
df_merged = df_merged[['User-ID', 'ISBN', 'Book-Author', 'Year-Of-Publication']]

In [21]:
# df_train, df_test = train_test_split(df_merged, test_size=0.1, random_state=42)

In [20]:
def get_item_features(dataset, df):
  d = df.apply(lambda r: (r['ISBN'], [r['Book-Author'], r['Year-Of-Publication']]), axis=1).values.tolist()
  return dataset.build_item_features(d)

def get_interactions(dataset, df):
  d = df.apply(lambda r: (r['User-ID'], r['ISBN']), axis=1).values.tolist()
  return dataset.build_interactions(d)

# df_merged
# split such that we have atelast one entries for each user item

In [21]:
# train_items = get_item_features(dataset, df_train)
item_features = get_item_features(dataset, df_merged)
interactions, _ = get_interactions(dataset, df_merged)

In [22]:
train, test = random_train_test_split(interactions, test_percentage=0.2, random_state=42)

In [23]:
model = LightFM(loss='bpr')
model.fit(train, item_features=item_features)

<lightfm.lightfm.LightFM at 0x7f876c3b71d0>

# evaluate model

In [28]:
train_auc = auc_score(model, train, 
                            item_features=item_features,
                            num_threads=4).mean()

train_auc

0.7241838

In [29]:
train_precision = precision_at_k(model, train, 
                                item_features=item_features,
                                k=10, num_threads=4).mean()

train_precision

0.0037101302

In [30]:
print(f"Precision: {train_precision:.2f}, AUC: {train_auc:0.2f}")

Precision: 0.00, AUC: 0.72


In [25]:
test_precision = precision_at_k(model, test, 
                                item_features=item_features,
                                k=10, num_threads=4).mean()

test_precision

0.0017697456

In [24]:
test_auc = auc_score(model, test, 
                            item_features=item_features,
                            num_threads=4).mean()
test_auc

0.6885617

In [27]:
print(f"Precision: {test_precision:.2f}, AUC: {test_auc:0.2f}")

Precision: 0.00, AUC: 0.69
