In [1]:

pip install surprise

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Users\Maha\anaconda3\envs\venv\python.exe -m pip install --upgrade pip' command.


In [2]:
from tqdm import tqdm
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 

from surprise import NormalPredictor, SVD, KNNBasic, NMF
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import cross_validate, KFold

## 1. Introduction
Recommender systems goal is to push *relevant* items to a given user. Understanding and modelling the user's preferences is required to reach this goal. In this project you will learn how to model the user's preferences with the [Surprise library](http://surpriselib.com/) to build different recommender systems. The first one will be a pure *collaborative filtering* approach, and the second one will rely on item attributes in a *content-based* way.

## 2. Loading Data


In [3]:
RATINGS_DATA_FILE = './data/ratings.csv'
BOOKS_DATA_FILE = './data/books.csv'

In [4]:
# load the raw csv into a data_frame
df_ratings = pd.read_csv(RATINGS_DATA_FILE)


# movies dataframe
df_books = pd.read_csv(BOOKS_DATA_FILE)

In [5]:
df_ratings.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [6]:
# check we have 25M users' ratings
df_ratings.user_id.count()

981756

In [7]:
def get_subset(df, number):
    """
        just get a subset of a large dataset for debug purpose
    """
    rids = np.arange(df.shape[0])
    np.random.shuffle(rids)
    df_subset = df.iloc[rids[:number], :].copy()
    return df_subset
df_ratings_100k = get_subset(df_ratings, 100000)
df_books_1000 = get_subset(df_books, 1000)

In [8]:
# Surprise reader
reader = Reader(rating_scale=(0, 5))

# Finally load all ratings
ratings = Dataset.load_from_df(df_ratings_100k, reader)

In [9]:
df_ratings_100k.head(5)

Unnamed: 0,book_id,user_id,rating
490137,4913,30255,3
31123,312,11287,4
117643,1177,31242,4
831729,8405,170,3
101895,1019,52937,4


## 3. Collaborative Filtering
We can test first any of the [Surprise algorithms](https://surprise.readthedocs.io/en/stable/prediction_algorithms_package.html).

In [10]:
# define a cross-validation iterator
kf = KFold(n_splits=3)

algos = [SVD(), NMF(), KNNBasic()]    

In [11]:
def get_rmse(algo, testset):
        predictions = algo.test(testset)
        accuracy.rmse(predictions, verbose=True)
        
for trainset, testset in tqdm(kf.split(ratings)): 
    """
        get an evaluation with cross-validation for different algorithms
    """  
    for algo in algos:
        algo.fit(trainset)
        get_rmse(algo, testset)

0it [00:00, ?it/s]

RMSE: 0.9255
RMSE: 1.1084
Computing the msd similarity matrix...
Done computing similarity matrix.


1it [00:37, 37.23s/it]

RMSE: 0.9875
RMSE: 0.9309
RMSE: 1.1065
Computing the msd similarity matrix...
Done computing similarity matrix.


2it [01:16, 38.69s/it]

RMSE: 0.9926
RMSE: 0.9282
RMSE: 1.1074
Computing the msd similarity matrix...
Done computing similarity matrix.


3it [01:55, 38.46s/it]

RMSE: 0.9915





## 4. Content-based Filtering
Here we will rely directly on items attributes. First we have to describe a user profile with an attributes vector. Then we will use these vectors to generate recommendations.

In [12]:
# computing similarities requires too much ressources on the whole dataset, so we take the subset with 100 items
df_books_1000 = df_books_1000.reset_index(drop=True)
df_books_1000.head(5)
df_books_1000 = df_books_1000.dropna()
df_books_1000 = df_books_1000.reset_index(drop=True)

In [13]:
# we compute a TFIDF on the titles of the movies
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(df_books_1000['original_title'])

In [14]:
# we get cosine similarities: this takes a lot of time on the real dataset
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [15]:
# we generate in 'results' the most similar movies for each movie: we put a pair (score, movie_id)
results = {}
for idx, row in df_books_1000.iterrows():
    similar_indices = cosine_similarities[idx].argsort()[:-100:-1] 
    similar_items = [(cosine_similarities[idx][i], df_books_1000['book_id'].iloc[[i]].tolist()[0]) for i in similar_indices] 
    results[idx] = similar_items[1:]

In [16]:
# transform a 'movieId' into its corresponding movie title
def item(id):  
    return df_books_1000.loc[df_books_1000['book_id'] == id]['original_title'].tolist()[0].split(' - ')[0] 

In [17]:
# transform a 'movieId' into the index id
def get_idx(id):
    return df_books_1000[df_books_1000['book_id'] == id].index.tolist()[0]

In [18]:
# Finally we put everything together here:
def recommend(item_id, num):
    print("Recommending " + str(num) + " products similar to " + item(item_id) + "...")   
    print("-------")    
    recs = results[get_idx(item_id)][:num]   
    for rec in recs: 
        print("\tRecommended: " + item(rec[1]) + " (score:" +      str(rec[0]) + ")")

In [19]:
df_ratings_100k.head()

Unnamed: 0,book_id,user_id,rating
490137,4913,30255,3
31123,312,11287,4
117643,1177,31242,4
831729,8405,170,3
101895,1019,52937,4


In [20]:
df_books_1000.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1618,95617,95617,883913,36,842377506,9780842000000.0,"Francine Rivers, Richard Ferrone",1993.0,A Voice in the Wind,...,55556,56920,2647,680,903,3382,10680,41275,https://images.gr-assets.com/books/1459567327m...,https://images.gr-assets.com/books/1459567327s...
1,1635,766955,766955,434352,39,689832133,9780690000000.0,"Doreen Cronin, Betsy Lewin",1999.0,"Click, Clack, Moo: Cows That Type",...,67171,67998,2220,1028,2671,10968,18473,34858,https://images.gr-assets.com/books/1423222939m...,https://images.gr-assets.com/books/1423222939s...
2,7480,11107244,11107244,16029496,40,670022950,9780670000000.0,Steven Pinker,2010.0,The Better Angels of Our Nature. Why Violence ...,...,11437,12958,1288,302,514,1774,4134,6234,https://images.gr-assets.com/books/1311281857m...,https://images.gr-assets.com/books/1311281857s...
3,3121,9462812,9462812,10280032,32,62004018,9780062000000.0,Amy Plum,2011.0,Die for Me,...,45756,48246,4319,1373,3235,9396,14472,19770,https://images.gr-assets.com/books/1358427893m...,https://images.gr-assets.com/books/1358427893s...
4,243,10614,10614,3230869,197,450417395,9780450000000.0,Stephen King,1987.0,Misery,...,334647,352203,6416,4302,12725,64835,128990,141351,https://images.gr-assets.com/books/1270545451m...,https://images.gr-assets.com/books/1270545451s...


Suppose a user wants the 10 most 'similar' (from a CBF point of view) movies from the movie 'Alley Cats Strike':

In [21]:
recommend(item_id=6545536, num=10)

IndexError: list index out of range