**Setup Data**

In [1]:
import pandas as pd
import numpy as np

In [2]:
from google.colab import files 
  
  
uploaded = files.upload()


Saving ml-latest-small.zip to ml-latest-small.zip


In [3]:
!unzip ml-latest-small.zip

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [4]:
# data from MovieLens|GroupLens (http://files.grouplens.org/datasets/movielens/ml-latest-small.zip)

movie_data = pd.read_csv('ml-latest-small/movies.csv',error_bad_lines=False, header=0, usecols=[0,1,2] ,index_col=0, names=['movieId', 'Title','Genre'])
movie_data.head()

Unnamed: 0_level_0,Title,Genre
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [5]:
movie_data.describe()

Unnamed: 0,Title,Genre
count,9742,9742
unique,9737,951
top,Eros (2004),Drama
freq,2,1053


In [6]:
movie_ratings = pd.read_csv('ml-latest-small/ratings.csv', error_bad_lines=False, usecols=[0,1,2],header=0, names=['userId','movieId', 'rating'] )
movie_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [7]:
movie_ratings.describe()

Unnamed: 0,userId,movieId,rating
count,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557
std,182.618491,35530.987199,1.042529
min,1.0,1.0,0.5
25%,177.0,1199.0,3.0
50%,325.0,2991.0,3.5
75%,477.0,8122.0,4.0
max,610.0,193609.0,5.0


In [8]:
def movieTitle(movieId):
    title = movie_data.at[movieId, 'Title']
    return title
movieTitle(1)

'Toy Story (1995)'

In [9]:
def movieGenre(movieId):  
    genre = movie_data.at[movieId, 'Genre']
    return  genre
movieGenre(1)

'Adventure|Animation|Children|Comedy|Fantasy'

In [10]:
# Data Preprocessing for huge dataset (However here Not Required)
# to select only those movies whose id is present in movie_data
# movie_ratings = movie_ratings[movie_ratings['movieId'].isin(movie_data.index)]


In [11]:
def favMovie(userId, N):
    userRatings = movie_ratings[movie_ratings.userId==userId]
    sortedRatings = pd.DataFrame.sort_values(userRatings,['rating'] ,ascending=[0])[:N]
    sortedRatings['Title'] = sortedRatings['movieId'].apply(movieTitle)
    sortedRatings['Genre'] = sortedRatings['movieId'].apply(movieGenre)
    return sortedRatings
favMovie(1, 10)


Unnamed: 0,userId,movieId,rating,Title,Genre
231,1,5060,5.0,M*A*S*H (a.k.a. MASH) (1970),Comedy|Drama|War
185,1,2872,5.0,Excalibur (1981),Adventure|Fantasy
89,1,1291,5.0,Indiana Jones and the Last Crusade (1989),Action|Adventure
90,1,1298,5.0,Pink Floyd: The Wall (1982),Drama|Musical
190,1,2948,5.0,From Russia with Love (1963),Action|Adventure|Thriller
189,1,2947,5.0,Goldfinger (1964),Action|Adventure|Thriller
188,1,2944,5.0,"Dirty Dozen, The (1967)",Action|Drama|War
186,1,2899,5.0,Gulliver's Travels (1939),Adventure|Animation|Children
184,1,2858,5.0,American Beauty (1999),Drama|Romance
179,1,2700,5.0,"South Park: Bigger, Longer and Uncut (1999)",Animation|Comedy|Musical


**Setup Rating Matrix**

In [12]:
movie_ratings.shape, movie_data.shape

((100836, 3), (9742, 2))

In [13]:
userPerMovieID = movie_ratings.movieId.value_counts()
userPerMovieID.head()

356     329
318     317
296     307
593     279
2571    278
Name: movieId, dtype: int64

In [14]:
userPerMovieID.shape

(9724,)

In [15]:
## Data Preprocessing to obtain less sparse matrix for huge dataset(However here Not Required)

    ## Take only those movies which are seen by more than 10 users
#movie_ratings = movie_ratings[movie_ratings.index.isin(userPerMovieID[userPerMovieID > 10].index)]
#movie_ratings.shape

In [16]:
userMovieRatingMatrix = pd.pivot_table(movie_ratings, index=['userId'],columns=['movieId'] ,values='rating')
userMovieRatingMatrix.head(10)

movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,34,36,38,39,40,41,42,43,...,185135,185435,185473,185585,186587,187031,187541,187593,187595,187717,188189,188301,188675,188751,188797,188833,189043,189111,189333,189381,189547,189713,190183,190207,190209,190213,190215,190219,190221,191005,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,4.0,,4.0,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.5,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,3.0,,,,,,,,,,,2.0,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,4.0,,,,,,,,,,,,,,,,,,,,4.0,,,,,,,,,,,,4.0,4.0,,3.0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,,4.0,5.0,3.0,5.0,4.0,4.0,3.0,,3.0,4.0,,3.0,,4.0,4.0,4.0,,2.0,,2.0,5.0,,4.0,3.0,4.0,3.0,,,,3.0,4.0,4.0,5.0,,,,4.0,,4.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,4.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,,4.0,,,,,,,,2.0,4.0,,,,,,,,,,4.0,,,,,,,,,,,3.0,5.0,,,3.0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


**Find K Nearest Neighbours**

In [18]:
user1 = 100
user2 = 200

user1_ratings = userMovieRatingMatrix.transpose()[user1]
user1_ratings.head()


movieId
1    NaN
2    NaN
3    3.5
4    NaN
5    NaN
Name: 100, dtype: float64

In [19]:
user2_ratings = userMovieRatingMatrix.transpose()[user2]
user2_ratings.head()

movieId
1    3.5
2    NaN
3    NaN
4    NaN
5    4.0
Name: 200, dtype: float64

In [20]:
from scipy.spatial.distance import hamming
# hamming() returns a value which shows the pecentage of disagreement

hamming(user1_ratings, user1_ratings)

0.9847799259563965

In [21]:
# Wrapping it up in a function
def distance(user1, user2):
    try:
        user1_ratings = userMovieRatingMatrix.transpose()[user1]
        user2_ratings = userMovieRatingMatrix.transpose()[user2]
        distance = hamming(user1_ratings, user2_ratings)
    except:
        distance = np.nan
    return distance
distance(100,200)

0.9987659399424106

In [22]:

user = 1
allusers = pd.DataFrame(userMovieRatingMatrix.index)
# Removing the activee user
allusers = allusers[allusers.userId != user]
allusers.head()

Unnamed: 0,userId
1,2
2,3
3,4
4,5
5,6


In [23]:
allusers['distance'] = allusers['userId'].apply(lambda x: distance(user, x))
allusers.head()

Unnamed: 0,userId,distance
1,2,1.0
2,3,0.999897
3,4,0.998149
4,5,0.999486
5,6,0.99928


In [24]:
K = 10
KNearestUsers = allusers.sort_values(['distance'], ascending=True)['userId'][:K]
KNearestUsers

413    414
379    380
44      45
596    597
447    448
554    555
468    469
451    452
265    266
201    202
Name: userId, dtype: int64

In [25]:
# Wrapping it up in a function
def nearestNeighbours(user, K=10):
    allusers = pd.DataFrame(userMovieRatingMatrix.index)
    allusers = allusers[allusers.userId != user]
    allusers['distance'] = allusers['userId'].apply(lambda x: distance(user, x))
    KNearestUsers = allusers.sort_values(['distance'], ascending=True)['userId'][:K]
    return KNearestUsers

KNearestNeighbours = nearestNeighbours(1,5)
KNearestNeighbours

413    414
379    380
44      45
596    597
447    448
Name: userId, dtype: int64

**Find Top N Recommendations**

In [26]:
# Nearest Neighbours ratings

NNratings = userMovieRatingMatrix[userMovieRatingMatrix.index.isin(KNearestNeighbours)]
NNratings

movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,34,36,38,39,40,41,42,43,...,185135,185435,185473,185585,186587,187031,187541,187593,187595,187717,188189,188301,188675,188751,188797,188833,189043,189111,189333,189381,189547,189713,190183,190207,190209,190213,190215,190219,190221,191005,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
45,4.0,,,,3.0,4.0,3.0,,,,3.0,,,,,,,,4.5,,4.0,,,,,,,,,,,4.5,,,,3.0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
380,5.0,5.0,,,,5.0,,,,5.0,,4.0,,,,3.0,,4.0,5.0,,2.0,,,,,,,,,,,5.0,4.0,,,,,,,,...,,,,,,,,3.0,4.0,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,
414,4.0,3.0,4.0,,2.0,3.0,3.0,3.0,,3.0,5.0,,,,2.0,3.0,4.0,3.0,,,4.0,3.0,2.0,3.0,3.0,,2.0,,,,3.0,5.0,5.0,3.0,,4.0,,,2.0,,...,,,,,,,,,3.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
448,5.0,3.0,3.0,,3.0,,,,,4.0,,2.0,,,,5.0,,,2.0,3.0,2.0,,,,,,,,,,,2.0,,,3.0,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
597,4.0,,,,,3.0,1.0,,,3.0,3.0,,,,,,3.0,,,,5.0,,,,,,,,,,,,4.0,,,4.0,,,3.0,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [27]:
#Getting the average rating of each movie seen by Nearest Neighbours of active user
avgRating = NNratings.apply(np.nanmean).dropna()
avgRating.head()

# warning where the columns of NNratings are completely empty(nan)

  results[i] = self.f(v)


movieId
1    4.400000
2    3.666667
3    3.500000
5    2.666667
6    3.750000
dtype: float64

In [28]:
moviesAlreadySeen = userMovieRatingMatrix.transpose()[user].dropna().index
moviesAlreadySeen

Int64Index([   1,    3,    6,   47,   50,   70,  101,  110,  151,  157,
            ...
            3671, 3702, 3703, 3729, 3740, 3744, 3793, 3809, 4006, 5060],
           dtype='int64', name='movieId', length=232)

In [29]:
# Removing the movies which are already seen by user
avgRating = avgRating[~avgRating.index.isin(moviesAlreadySeen)]

In [30]:
N=3
topNMovieId = avgRating.sort_values(ascending=False).index[:N]
topNMovieId

Int64Index([99813, 7099, 5617], dtype='int64', name='movieId')

In [31]:
pd.Series(topNMovieId).apply(movieTitle)

0       Batman: The Dark Knight Returns, Part 2 (2013)
1    Nausicaä of the Valley of the Wind (Kaze no ta...
2                                     Secretary (2002)
Name: movieId, dtype: object

In [32]:
pd.Series(topNMovieId).apply(movieGenre)

0                            Action|Animation
1    Adventure|Animation|Drama|Fantasy|Sci-Fi
2                        Comedy|Drama|Romance
Name: movieId, dtype: object

In [33]:
# Wrapping it up in a function
def topN(user,N=3):
    KnearestUsers = nearestNeighbours(user)
    NNRatings = userMovieRatingMatrix[userMovieRatingMatrix.index.isin(KnearestUsers)]
    avgRating = NNRatings.apply(np.nanmean).dropna()
    moviesAlreadySeen = userMovieRatingMatrix.transpose()[user].dropna().index
    avgRating = avgRating[~avgRating.index.isin(moviesAlreadySeen)]
    topNMovieId = avgRating.sort_values(ascending=False).index[:N]
    return pd.DataFrame({'Movie':pd.Series(topNMovieId).apply(movieTitle), 'Genre':pd.Series(topNMovieId).apply(movieGenre)})

In [34]:
favMovie(3,5)

Unnamed: 0,userId,movieId,rating,Title,Genre
289,3,5181,5.0,Hangar 18 (1980),Action|Sci-Fi|Thriller
298,3,70946,5.0,Troll 2 (1990),Fantasy|Horror
296,3,7991,5.0,Death Race 2000 (1975),Action|Sci-Fi
266,3,849,5.0,Escape from L.A. (1996),Action|Adventure|Sci-Fi|Thriller
294,3,6835,5.0,Alien Contamination (1980),Action|Horror|Sci-Fi


In [35]:
# To remove the RunTimeWarning error 
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning) 

topN(3,5)

Unnamed: 0,Movie,Genre
0,Reform School Girls (1986),Action|Drama
1,Watership Down (1978),Adventure|Animation|Children|Drama|Fantasy
2,"Omega Man, The (1971)",Action|Drama|Sci-Fi|Thriller
3,Yojimbo (1961),Action|Adventure
4,"Mystery, Alaska (1999)",Comedy|Drama


**We can see that the genre of recommended movies are nearly same as that of favourite movies of active user**