This recommender-system tutorial follows [Krish Naik's YouTube video](https://www.youtube.com/watch?v=kccT0FVK6OY&list=PLZoTAELRMXVN7QGpcuN-Vg35Hgjp3htvi&index=3).

Dataset used: MovieLens (latest) - https://grouplens.org/datasets/movielens/latest/

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read only the columns this analysis uses from each MovieLens CSV.
movie_columns = ["movieId", "title"]
rating_columns = ["userId", "movieId", "rating"]
movies = pd.read_csv("movies.csv", usecols=movie_columns)
ratings = pd.read_csv("ratings.csv", usecols=rating_columns)

In [3]:
# Preview the first rows of the movies table (movieId, title).
movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [4]:
# Preview the first rows of the ratings table (userId, movieId, rating).
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [5]:
# Join every rating to its movie title via the shared movieId key
# (default inner join), giving one row per (movie, user) rating.
df = movies.merge(ratings, on="movieId")
df.head()

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


In [6]:
# Discard rows without a title, then count how many ratings each title received.
combine_movie_rating = df.dropna(axis=0, subset=["title"])
ratings_per_title = combine_movie_rating.groupby("title")["rating"].count()

# Name the counts before turning the index back into a column, so the result
# has exactly the two columns: title, totalratingcount.
movie_rating_count = ratings_per_title.rename("totalratingcount").reset_index()

movie_rating_count.head()

Unnamed: 0,title,totalratingcount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [7]:
# Attach each title's total rating count to every individual rating row.
# The left join keeps all rows of combine_movie_rating.
rating_total_count = pd.merge(
    combine_movie_rating, movie_rating_count, how="left", on="title"
)
rating_total_count.head()

Unnamed: 0,movieId,title,userId,rating,totalratingcount
0,1,Toy Story (1995),1,4.0,215
1,1,Toy Story (1995),5,4.0,215
2,1,Toy Story (1995),7,4.5,215
3,1,Toy Story (1995),15,2.5,215
4,1,Toy Story (1995),17,4.5,215


In [8]:
# Display floats with three decimals, then summarise the ratings-per-title counts.
pd.set_option("display.float_format", "{:.3f}".format)
movie_rating_count["totalratingcount"].describe()

count   9719.000
mean      10.375
std       22.406
min        1.000
25%        1.000
50%        3.000
75%        9.000
max      329.000
Name: totalratingcount, dtype: float64

In [9]:
# Keep only ratings for movies that have at least this many ratings in total.
popularity_treshold = 50
is_popular = rating_total_count["totalratingcount"] >= popularity_treshold
rating_popular_movie = rating_total_count[is_popular]
rating_popular_movie.head()

Unnamed: 0,movieId,title,userId,rating,totalratingcount
0,1,Toy Story (1995),1,4.0,215
1,1,Toy Story (1995),5,4.0,215
2,1,Toy Story (1995),7,4.5,215
3,1,Toy Story (1995),15,2.5,215
4,1,Toy Story (1995),17,4.5,215


In [10]:
# (rows, columns) of the ratings that survived the popularity filter.
rating_popular_movie.shape

(41362, 5)

In [11]:
# Build a title x user table of ratings. Repeated (title, userId) pairs are
# averaged by pivot_table's default aggregation; movies a user has not rated
# come out as NaN and are replaced with 0.
ratings_by_title_and_user = pd.pivot_table(
    rating_popular_movie, values="rating", index="title", columns="userId"
)
movie_features = ratings_by_title_and_user.fillna(0)
movie_features.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0


In [12]:
from scipy.sparse import csr_matrix

# Convert the (mostly zero) pivot table into a compressed sparse row matrix.
dense_ratings = movie_features.values
movie_features_matrix = csr_matrix(dense_ratings)

In [13]:
from sklearn.neighbors import NearestNeighbors

# Brute-force nearest-neighbour search using cosine distance between the
# per-movie rating vectors. fit() returns the estimator itself, so chaining
# it leaves model_knn bound to the fitted model.
model_knn = NearestNeighbors(metric="cosine", algorithm="brute").fit(
    movie_features_matrix
)
model_knn

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [14]:
# (number of movie titles, number of users) in the feature table.
movie_features.shape

(450, 606)

In [15]:
# Draw the query movie from a seeded generator so the same title is picked on
# every Restart & Run All; change the seed to explore a different movie.
rng = np.random.default_rng(42)
query_index = int(rng.integers(movie_features.shape[0]))  # random row position
print(query_index)

# Ask for 6 neighbours: the query movie is normally returned as its own nearest
# neighbour (cosine distance ~0), which leaves 5 actual recommendations.
# iloc pulls the full rating vector of the chosen row; reshape(1, -1) turns it
# into the single-row 2-D array that kneighbors expects, without changing values.
distances, indices = model_knn.kneighbors(
    movie_features.iloc[query_index, :].values.reshape(1, -1), n_neighbors=6
)

6


In [16]:
# kneighbors returns arrays of shape (1, n_neighbors); flatten them once
# instead of on every loop iteration.
neighbor_distances = distances.flatten()
neighbor_indices = indices.flatten()

print("Reccomendations for {0}:\n".format(movie_features.index[query_index]))

# Exclude the query movie by its row position rather than assuming it is the
# first neighbour: when another movie has an identical rating vector the
# ordering of the tie is not guaranteed, and blindly skipping position 0 could
# list the query movie as its own recommendation.
recommendations = [
    (neighbor_index, neighbor_distance)
    for neighbor_index, neighbor_distance in zip(neighbor_indices, neighbor_distances)
    if neighbor_index != query_index
][: len(neighbor_indices) - 1]  # one slot was reserved for the query itself

for rank, (neighbor_index, neighbor_distance) in enumerate(recommendations, start=1):
    print(
        "{0}: {1}, with distance of {2}".format(
            rank, movie_features.index[neighbor_index], neighbor_distance
        )
    )

Reccomendations for A.I. Artificial Intelligence (2001):

1: I, Robot (2004), with distance of 0.4431083711221785
2: Signs (2002), with distance of 0.4462507077641533
3: Unbreakable (2000), with distance of 0.4588232441629708
4: Minority Report (2002), with distance of 0.48112603135952814
5: Matrix Reloaded, The (2003), with distance of 0.48878076049338115
