In [1]:
import numpy as np
import lightfm

from lightfm.datasets import fetch_movielens

movielens = fetch_movielens()



In [2]:
for key, value in movielens.items():
    print(key, type(value), value.shape)

train <class 'scipy.sparse.coo.coo_matrix'> (943, 1682)
test <class 'scipy.sparse.coo.coo_matrix'> (943, 1682)
item_features <class 'scipy.sparse.csr.csr_matrix'> (1682, 1682)
item_feature_labels <class 'numpy.ndarray'> (1682,)
item_labels <class 'numpy.ndarray'> (1682,)


In [3]:
train = movielens['train']
test = movielens['test']

In [4]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

model = LightFM()
model.fit(train, epochs=10)

train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10, train_interactions=train).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))

Precision: train 0.43, test 0.12.


In [5]:
model = LightFM(learning_rate=0.05, loss='warp')

model.fit_partial(train, epochs=10)

train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10, train_interactions=train).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))

Precision: train 0.60, test 0.22.


In [6]:
def sample_recommendation(model, data, user_ids):
    n_users, n_items = data['train'].shape
    for user_id in user_ids:
        known_positives = data['item_labels'][data['train'].tocsr()                                    
                          [user_id].indices]
        
        scores = model.predict(user_id, np.arange(n_items))

        top_items = data['item_labels'][np.argsort(-scores)]

        print("User %s" % user_id)
        print("     Known positives:")
        
        for x in known_positives[:3]:
            print("        %s" % x)
        
        print("     Recommended:")
        
        for x in top_items[:3]:
            print("        %s" % x)

In [7]:
sample_recommendation(model, movielens, [10, 25, 451])


User 10
     Known positives:
        Babe (1995)
        Dead Man Walking (1995)
        Seven (Se7en) (1995)
     Recommended:
        Raiders of the Lost Ark (1981)
        Star Wars (1977)
        Return of the Jedi (1983)
User 25
     Known positives:
        Toy Story (1995)
        Twelve Monkeys (1995)
        Dead Man Walking (1995)
     Recommended:
        Scream (1996)
        Fargo (1996)
        Toy Story (1995)
User 451
     Known positives:
        Twelve Monkeys (1995)
        Babe (1995)
        Postino, Il (1994)
     Recommended:
        Raiders of the Lost Ark (1981)
        Casablanca (1942)
        Graduate, The (1967)


In [8]:
movielens['item_labels']

array(['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', ...,
       'Sliding Doors (1998)', 'You So Crazy (1994)',
       'Scream of Stone (Schrei aus Stein) (1991)'], dtype=object)

In [9]:
movielens['train'].shape

(943, 1682)

In [10]:
movielens['item_labels'].shape

(1682,)

In [11]:
import pandas as pd

In [12]:
from surprise import Dataset

In [13]:
data = Dataset.load_builtin('ml-100k')

In [15]:
# test set is made of 25% of the ratings.
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.25)

In [16]:
from surprise import SVD
algo = SVD(n_factors=20, n_epochs=20)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x11a19fb50>

In [17]:
test_pred = algo.test(testset)

In [18]:
from surprise import accuracy
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.9387


0.9386888791695305

In [20]:
data_pd = pd.DataFrame(data.__dict__['raw_ratings'], columns=['user_id','item_id','rating','timestamp'])

In [21]:
data_pd.user_id = data_pd.user_id.astype(int)
data_pd.timestamp = data_pd.timestamp.astype(int)
data_pd.item_id = data_pd.item_id.astype(int)

In [22]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

In [23]:
data_pd.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [24]:
neigh = NearestNeighbors(n_neighbors=20, n_jobs=-1, metric='euclidean') 
neigh.fit(data_pd)

NearestNeighbors(metric='euclidean', n_jobs=-1, n_neighbors=20)

In [25]:
tfidf_transformer = TfidfTransformer()

In [26]:
def recommend_for_user(user_id):
    current_user_id = user_id
    user_movies = data_pd[data_pd.user_id == current_user_id].item_id
    
    last_user_movie = max(data_pd[(data_pd.timestamp == max(data_pd.timestamp[(data_pd.rating == max(data_pd.rating))]))].item_id)
    movie_rating = data_pd.rating[last_user_movie]
    
    predict = algo.predict(uid=current_user_id,iid=movie_rating).est

    res = neigh.kneighbors(data_pd, return_distance=True)
    
    movies_to_score = data_pd.item_id.iloc[res[1][0]]

    scores = []
    titles = []

    for movie in movies_to_score:
        if movie in user_movies:
            continue

        scores.append(algo.predict(uid=current_user_id, iid=movie).est)
        titles.append(movie)
        
    
    best_indexes = np.argsort(scores)[-3:]
    for i in reversed(best_indexes):
        print(titles[i], scores[i])
    

In [27]:
rec = recommend_for_user(12)

70 3.5311733333333333
382 3.5311733333333333
269 3.5311733333333333
