In [27]:
# Installation

# !pip install pandas
# !pip install numpy
# !pip install scipy
# !pip install implicit

In [1]:
# Imports
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import implicit
import os
import random  

In [2]:
# Environment and global variables
pd.set_option('display.max_columns',10)
os.environ['MKL_NUM_THREADS'] = '1'
os.environ['OPENBLAS_NUM_THREADS'] = '1'

In [3]:
# Utility functions

# map the user and item names to contiguous integers and also return the maps
def maptrans(trans):
    uniqueusers = np.sort(trans['user'].unique())
    uniqueitems = np.sort(trans['item'].unique())
    umap = dict(zip(uniqueusers,[i for i in range(len(uniqueusers))])) # this maps username -> index
    imap = dict(zip(uniqueitems,[i for i in range(len(uniqueitems))])) # this maps itemname -> index
    trans['user'] = trans.apply(lambda row: umap[row['user']], axis = 1) 
    trans['item'] = trans.apply(lambda row: imap[row['item']], axis = 1) 
    return (trans,umap,imap)

#return list of similar items, use the item-properties matrix (Q) to do nearest neighbour using cosine similarity
def findsimilaritems(item, item_vecs, n_similar=10):
    #Calculate the item vector norms (the vector lengths)
    item_norms = np.sqrt((item_vecs * item_vecs).sum(axis=1))
    #Calculate the (cosine) similarity score: do dot product of selected content with every other content
    #Note: cosine sim = A.B/(norm(A)*norm(B)), since B (item 450) is the same for every item A, we can ignore its norm in this calc
    simscores = item_vecs.dot(item_vecs[item]) / item_norms
    #Get the top 10 contents (do a sort)
    top_idx = np.argpartition(simscores, -n_similar)[-n_similar:]
    #Create a descending list of content-score tuples of most similar articles with this article.
    similar = sorted(zip(top_idx, simscores[top_idx]/item_norms[item]), key=lambda x: -x[1])
    return (similar)

#return the top 10 recommendations chosen based on the person / content vectors 
#for contents never interacted with for any given person.
def recommend(user, sparse_user_item, userprefs, itemprops, num_items=10):

    # create a template vector, where unrated items = 1, rated items =0
    existing_ratings = sparse_user_item[user,:].toarray() # Get existing ratings for target
    existing_ratings = existing_ratings.reshape(-1) + 1  # Add 1 to everything, so items with no rating = 1
    existing_ratings[existing_ratings > 1] = 0  # make items already rated = 0

    # Get dot product of the target user preferences and all item properties ~ P[user]*transpose(Q)
    predrats = userprefs[user,:].dot(itemprops.T)
    
    # Items already rated have their predictions multiplied by zero (ie eliminated)
    predrats = predrats * existing_ratings 

    # Sort into descending order of predicted rating and select the topN item indexes
    itemids = np.argsort(predrats)[::-1][:num_items]
    
    # Start empty list to store items and scores
    recs = []
    for item in itemids: recs.append((item, predrats[item]))
    return recs

def implicit_testusers(testset, userprefs, itemprops, debug=False):
    errs = list([])
    #tic = time.perf_counter()
    for (indx,(uname,iname,rating)) in testset.iterrows():
        if (debug): print('.', end = '')
        err = abs(userprefs[uname,:].dot(itemprops[iname,:]) - rating)
        errs.append(err)
    #print(f"\ntime {time.perf_counter() - tic:0.4f} seconds")  
    return(errs)

def ahead(arr,r=7,c=7):
    with np.printoptions(threshold=np.inf):
        print(arr[0:r,0:c])

def sparsity(arr):
    return np.isnan(arr).sum()/np.prod(arr.shape)
   #1.0 - ( count_nonzero(arr) / float(arr.size) )

In [4]:
path = '/home/mobasshir/recommendation_engine_lab/NUS-Artificial-Intelligence-Training/recommender/Datasets'
os.chdir(path)

In [5]:
trans = pd.read_csv('BookCrossings/BX-Book-Ratings.csv', sep=';', error_bad_lines=False, encoding="latin-1")
print(trans.head())
trans.columns = ['user','isbn','rating']

   User-ID        ISBN  Book-Rating
0   276725  034545104X            0
1   276726  0155061224            5
2   276727  0446520802            0
3   276729  052165615X            3
4   276729  0521795028            6


In [6]:
trans.rating.value_counts()

0     716109
8     103736
10     78610
7      76457
9      67541
5      50974
6      36924
4       8904
3       5996
2       2759
1       1770
Name: rating, dtype: int64

In [7]:
# trans['rating'] = trans['rating'].apply(lambda x: 5 if x == 0 else x)

In [9]:
trans['item'] = trans.groupby('isbn').grouper.group_info[0]
print(trans['item'])

0           57190
1           29751
2          107394
3          127255
4          127289
            ...  
1149775    256878
1149776    164863
1149777    123713
1149778    145627
1149779    124714
Name: item, Length: 1149780, dtype: int64


In [13]:
trans = trans.drop_duplicates()
trans = trans.groupby(['user', 'item', 'isbn']).sum().reset_index()
print(trans)

           user    item        isbn  rating
0             2   32070  0195153448       0
1             7   56365   034542252       0
2             8     231  0002005018       5
3             8   10495  0060973129       0
4             8   72060  0374157065       0
...         ...     ...         ...     ...
1149774  278854   95073  0425163393       7
1149775  278854  123345  0515087122       0
1149776  278854  135474  0553275739       6
1149777  278854  139261  0553578596       0
1149778  278854  139330  0553579606       8

[1149779 rows x 4 columns]


In [14]:
trans,umap,imap = maptrans(trans)
print(trans)

           user    item        isbn  rating
0             0   32070  0195153448       0
1             1   56365   034542252       0
2             2     231  0002005018       5
3             2   10495  0060973129       0
4             2   72060  0374157065       0
...         ...     ...         ...     ...
1149774  105282   95073  0425163393       7
1149775  105282  123345  0515087122       0
1149776  105282  135474  0553275739       6
1149777  105282  139261  0553578596       0
1149778  105282  139330  0553579606       8

[1149779 rows x 4 columns]


In [15]:
sparse_item_user = sparse.csr_matrix((trans['rating'].astype(float), (trans['item'],trans['user'])))
sparse_user_item = sparse.csr_matrix((trans['rating'].astype(float), (trans['user'],trans['item'])))

In [16]:
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)

In [17]:
alpha = 15
data = (sparse_item_user * alpha).astype('double')
model.fit(data)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [18]:
item_id = 231
similar = model.similar_items(item_id)
for item, score in similar: 
    print(score,'\t',trans.isbn.loc[trans.item == item].iloc[0], "\n")

1.0000001 	 0002005018 

0.886799 	 1552041778 

0.886799 	 074322678X 

0.8867989 	 1567407781 

0.8867846 	 1575663937 

0.88676167 	 0887841740 

0.82949364 	 0385259875 

0.82615316 	 087113733X 

0.8188976 	 0140098798 

0.8173198 	 0771086873 



In [19]:
item_id = 32070
similar = model.similar_items(item_id)
for item, score in similar: 
    print(score,'\t',trans.isbn.loc[trans.item == item].iloc[0], "\n")

1.0000001 	 0195153448 

0.93911016 	 0762705140 

0.92823815 	 0722176082 

0.92634135 	 0930764196 

0.919283 	 0394537009 

0.91659886 	 0142004006 

0.9153051 	 0879677856 

0.91185397 	 067123109X 

0.90892893 	 0446936510 

0.90599835 	 0449144844 



In [22]:
user_id = 8
recommendations = model.recommend(user_id, sparse_user_item, filter_already_liked_items=True)
for item, score in recommendations: 
    print(f'{score:0.5f}','\t', trans.isbn.loc[trans.item == item].iloc[0], "\n")

0.24382 	 0446608955 

0.24227 	 0446606812 

0.22244 	 0061009059 

0.21570 	 0446672211 

0.20958 	 0671041789 

0.20503 	 0440222656 

0.19523 	 0060502258 

0.19079 	 0439064864 

0.18740 	 0439136350 

0.18476 	 0345384466 



In [24]:
user_id = 26
recommendations = model.recommend(user_id, sparse_user_item, filter_already_liked_items=True)
for item, score in recommendations: 
    print(f'{score:0.5f}','\t', trans.isbn.loc[trans.item == item].iloc[0], "\n")

0.12164 	 0440224764 

0.11972 	 0671727796 

0.11807 	 0440211727 

0.11605 	 0345391802 

0.11503 	 0440220602 

0.11214 	 0440213525 

0.11200 	 0743418174 

0.10991 	 0440234743 

0.10834 	 0312291639 

0.10591 	 044021145X 



In [25]:
item = 26
recommendations = model.recommend(item, sparse_item_user, filter_already_liked_items=True)
for user, score in recommendations: 
    print(f'{score:0.5f}','\t', user, "\n")

0.12164 	 101237 

0.11972 	 159855 

0.11807 	 100580 

0.11605 	 55765 

0.11503 	 101013 

0.11214 	 100665 

0.11200 	 182111 

0.10991 	 101505 

0.10834 	 42390 

0.10591 	 100566 

