This notebook will illustrate the process of training a recommender system using matrxi factorization under WARP loss with python's LightFM library. Data set used in this notebook can be found on http://www2.informatik.uni-freiburg.de/~cziegler/BX/

# Load Data

In [85]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from sklearn.preprocessing import LabelBinarizer

In [3]:
users=pd.read_csv('BX-Users.csv',sep=';',encoding = "ISO-8859-1",header=0)

In [4]:
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [5]:
books=pd.read_csv('BX-Books.csv',sep=';',encoding='ISO-8859-1',header=0,error_bad_lines=False)

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'
  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
books.shape

(271360, 8)

In [7]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [8]:
ratings=pd.read_csv('BX-Book-Ratings.csv',sep=';',encoding='ISO-8859-1',header=0,error_bad_lines=False)

In [9]:
ratings.shape

(1149780, 3)

In [10]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [11]:
ratings['Book-Rating'].value_counts()

0     716109
8     103736
10     78610
7      76457
9      67541
5      50974
6      36924
4       8904
3       5996
2       2759
1       1770
Name: Book-Rating, dtype: int64

In [14]:
ratings_p=ratings[ratings['Book-Rating']>5]
ratings_p.reset_index()

Unnamed: 0,index,User-ID,ISBN,Book-Rating
0,4,276729,0521795028,6
1,6,276736,3257224281,8
2,7,276737,0600570967,6
3,8,276744,038550120X,7
4,9,276745,342310538,10
5,16,276747,0060517794,9
6,19,276747,0671537458,9
7,20,276747,0679776818,8
8,21,276747,0943066433,7
9,23,276747,1885408226,7


# Build interaction matrix

In [226]:
# creating mappings
uid_to_index={}
index_to_uid={}

for index,ID in enumerate(ratings_p['User-ID'].unique().tolist()):
    uid_to_index[ID]=index
    index_to_uid[index]=ID

In [227]:
bid_to_index={}
index_to_bid={}

for index,ID in enumerate(ratings_p['ISBN'].unique().tolist()):
    bid_to_index[ID]=index
    index_to_bid[index]=ID

In [228]:
len(index_to_bid.keys())

159323

In [229]:
def map_ids(row, mapper):
    return mapper[row]

In [230]:
I=ratings_p['User-ID'].apply(map_ids,args=[uid_to_index]).as_matrix()
J=ratings_p['ISBN'].apply(map_ids,args=[bid_to_index]).as_matrix()

In [231]:
V=np.ones(I.shape[0])

In [240]:
likes=sparse.coo_matrix((V,(I,J)),dtype=np.float64)

In [241]:
likes

<69775x159323 sparse matrix of type '<class 'numpy.float64'>'
	with 363268 stored elements in COOrdinate format>

# Build User Faetures matrix

In [23]:
joined_user=(users.merge(ratings_p,on='User-ID'))

In [54]:
def location_trimmer(x):
    loca=x.split(',')
    if loca[2]=='':
        alias='NaN'
    else:
        alias=loca[2].strip(' ')
    return alias

In [49]:
def age_converter(x):
    if pd.isnull(x):
        age=0
    else:
        age=x
    return age    

In [55]:
joined_user['state']=joined_user['Location'].apply(location_trimmer)

In [56]:
joined_user['age']=joined_user['Age'].apply(age_converter)

In [68]:
user_info=joined_user[['User-ID','state','age','Book-Rating']]

In [81]:
user_info.head()

Unnamed: 0,User-ID,state,age,Book-Rating
0,8,canada,0.0,6
1,8,canada,0.0,6
2,8,canada,0.0,7
3,9,usa,0.0,6
4,10,spain,26.0,6


In [82]:
user_info=user_info.groupby('User-ID',as_index=False).agg({'state':lambda x:x.iloc[0],
                                                'age': lambda x:max(x)})

In [86]:
def age_to_string(x):
    if x==0.0:
        String='No'
    elif 0<x<=20:
        String='Young'
    elif x>20 and x<=40:
        String='Adult'
    elif x>40:
        String='Old'
    return String

In [88]:
user_info['Age']=user_info['age'].apply(age_to_string)

In [93]:
user_info.drop('age',axis=1,inplace=True)

In [104]:
user_info.head()

Unnamed: 0,User-ID,state,Age
0,8,canada,No
1,9,usa,No
2,10,spain,Adult
3,12,usa,No
4,14,usa,No


In [107]:
encoder=LabelBinarizer()
state_matrix=encoder.fit_transform(user_info['state'])
age_matrix=encoder.fit_transform(user_info['Age'])

In [113]:
state_matrix.shape

(69775, 375)

In [114]:
age_matrix.shape

(69775, 4)

In [116]:
user_matrix=np.concatenate([state_matrix,age_matrix],axis=1)

In [118]:
sparse_user=sparse.coo_matrix(user_matrix)

In [119]:
sparse_user

<69775x379 sparse matrix of type '<class 'numpy.int64'>'
	with 139550 stored elements in COOrdinate format>

# Model Building

In [199]:
def train_test_split(ratings, split_count, fraction=None):
    """
    Split recommendation data into train and test sets
    
    Params
    ------
    ratings : scipy.sparse matrix
        Interactions between users and items.
    split_count : int
        Number of user-item-interactions per user to move
        from training to test set.
    fractions : float
        Fraction of users to split off some of their
        interactions into test set. If None, then all 
        users are considered.
    """
    # Note: likely not the fastest way to do things below.
    train = ratings.copy().tocoo()
    test = sparse.lil_matrix(train.shape)
    
    if fraction:
        try:
            user_index = np.random.choice(
                np.where(np.bincount(train.row) >= split_count * 2)[0], 
                replace=False,
                size=np.int32(np.floor(fraction * train.shape[0]))
            ).tolist()
        except:
            print(('Not enough users with > {} '
                  'interactions for fraction of {}')\
                  .format(2*k, fraction))
            raise
    else:
        user_index = range(train.shape[0])
        
    train = train.tolil()

    for user in user_index:
        test_ratings = np.random.choice(ratings.getrow(user).indices, 
                                        size=split_count, 
                                        replace=False)
        train[user, test_ratings] = 0.
        # These are just 1.0 right now
        test[user, test_ratings] = ratings[user, test_ratings]
   
    
    # Test and training are truly disjoint
    assert(train.multiply(test).nnz == 0)
    return train.tocsr(), test.tocsr(), user_index

In [242]:
likes = likes.tocsr()

In [249]:
train, test, user_index = train_test_split(likes, 2, fraction=0.2)

In [250]:
test

<69775x159323 sparse matrix of type '<class 'numpy.float64'>'
	with 27910 stored elements in Compressed Sparse Row format>

In [251]:

# Import LightFM
from lightfm import LightFM
import lightfm.evaluation

In [351]:
NUM_THREADS = 2
NUM_COMPONENTS = 40
NUM_EPOCHS = 60
ITEM_ALPHA = 1e-6

In [352]:
# Create LightFM model
model = LightFM(loss='warp',
                item_alpha=ITEM_ALPHA,
                no_components=NUM_COMPONENTS)

In [353]:
# Fit the model
from lightfm.evaluation import auc_score

model=model.fit(train,user_features=sparse_user,epochs=NUM_EPOCHS,num_threads=NUM_THREADS)

In [354]:
train_auc=auc_score(model,
                   train,
                   user_features=sparse_user,
                   num_threads=NUM_THREADS).mean()

In [355]:
print('LightFM training AUC score: {}'.format(train_auc))

LightFM training AUC score: 0.9417732357978821


In [356]:
test_auc=auc_score(model,
                   test,
                   user_features=sparse_user,
                   num_threads=NUM_THREADS).mean()

In [357]:
print('LightFM test AUC score: {}'.format(test_auc))

LightFM test AUC score: 0.6315793395042419


In [358]:


## Recommend for user with user id 2
scores=model.predict(2,np.arange(159323))

In [359]:
sorted_index=np.argsort(-scores)

In [360]:
recommends=[]
for i in range(5):
    ISBN=index_to_bid[sorted_index[i]]
    ISBN=books[books['ISBN']==ISBN]
    title=ISBN['Book-Title']
    recommends.append(title)

In [361]:
print(recommends)# Print out top 5 recommended titles

[748    The Da Vinci Code
Name: Book-Title, dtype: object, 408    The Lovely Bones: A Novel
Name: Book-Title, dtype: object, 1105    Divine Secrets of the Ya-Ya Sisterhood: A Novel
Name: Book-Title, dtype: object, 356    The Secret Life of Bees
Name: Book-Title, dtype: object, 4430    House of Sand and Fog
Name: Book-Title, dtype: object]
