<a href="https://colab.research.google.com/github/YasmineJiang/Recommendation_System/blob/main/Recommendation_Systems_using_Nearest_Neighbors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [2]:
# Read only the columns the notebook uses, with explicit compact dtypes.
# The dtype maps double as the column selection passed to `usecols`.
movie_dtypes = {'movieId': 'int32', 'title': 'string'}
rating_dtypes = {'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'}

movies_df = pd.read_csv('movies.csv', usecols=list(movie_dtypes), dtype=movie_dtypes)
ratings_df = pd.read_csv('ratings.csv', usecols=list(rating_dtypes), dtype=rating_dtypes)

In [3]:
# Preview the first five movies (movieId and title).
movies_df.head(5)

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [4]:
# Preview the first five ratings (userId, movieId, rating).
ratings_df.head(5)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [5]:
# Attach each rating's movie title by joining on movieId.
# An inner join keeps only ratings whose movieId appears in movies_df.
merged_df = ratings_df.merge(movies_df, how='inner', on='movieId')
merged_df.head(5)

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [6]:
# Keep only ratings that have a title, then count how many ratings each title received.
has_title = merged_df['title'].notna()
combine_movie_rating = merged_df[has_title]

# One row per title with its number of ratings in `totalRatingCount`.
ratings_per_title = combine_movie_rating.groupby('title')['rating'].count()
movie_ratingCount = ratings_per_title.rename('totalRatingCount').reset_index()
movie_ratingCount.head(5)

Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [7]:
# Attach each title's total rating count to every individual rating row.
# Join from `combine_movie_rating` (the frame the counts were computed from) rather than
# the unfiltered `merged_df`, so a row with a missing title can never come back with a
# NaN count and silently turn `totalRatingCount` into a float column.
# `validate` makes pandas raise if `movie_ratingCount` ever holds a title more than once,
# which would otherwise duplicate rating rows.
Rating_with_TotalRatingCount = combine_movie_rating.merge(
    movie_ratingCount,
    on='title',
    how='left',
    validate='many_to_one',
)
Rating_with_TotalRatingCount.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [8]:
# Show floats with three decimals (a pandas-wide display option), then summarise
# how the per-title rating counts are distributed across all rating rows.
pd.set_option('display.float_format', lambda value: f'{value:.3f}')
Rating_with_TotalRatingCount['totalRatingCount'].describe()

count   100836.000
mean        58.759
std         61.965
min          1.000
25%         13.000
50%         39.000
75%         84.000
max        329.000
Name: totalRatingCount, dtype: float64

In [9]:
# Minimum number of ratings a title needs to be kept.
# NOTE(review): 59 is presumably the mean totalRatingCount (~58.8) rounded up - confirm.
popularity_threshold = 59

# Keep only the rating rows that belong to sufficiently popular titles.
is_popular = Rating_with_TotalRatingCount['totalRatingCount'] >= popularity_threshold
rating_popular_movie = Rating_with_TotalRatingCount.loc[is_popular]
rating_popular_movie.head(5)

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [10]:
# Build the title x user rating table: one row per title, one column per userId.
# pivot_table averages duplicate (title, userId) pairs by default; pairs with no
# rating are filled with 0.
rating_table = pd.pivot_table(
    rating_popular_movie,
    values='rating',
    index='title',
    columns='userId',
)
movie_feature_df = rating_table.fillna(0)
movie_feature_df.head(5)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0
"40-Year-Old Virgin, The (2005)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5
"Abyss, The (1989)",4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0
Ace Ventura: Pet Detective (1994),0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,2.0,0.0,0.0,0.0,3.5,0.0,3.0


In [11]:
# Store the title x user rating table in CSR form; unrated entries were filled with 0
# in movie_feature_df, so they do not need to be stored explicitly.
movie_feature_df_matrix = csr_matrix(movie_feature_df.values)

# Item-based nearest-neighbour model: each row (title) is a vector of user ratings,
# compared by cosine distance with an exhaustive (brute-force) search.
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(movie_feature_df_matrix)

In [12]:
# Pick the query movie with a seeded generator so that Restart & Run All always
# selects the same title and the recommendations below are reproducible.
rng = np.random.default_rng(42)
query_index = int(rng.integers(movie_feature_df.shape[0]))
print(query_index)

# Ask for 6 neighbours: the query movie is normally its own closest match, which
# leaves 5 other titles to recommend.
query_vector = movie_feature_df.iloc[query_index, :].values.reshape(1, -1)
distances, indices = model_knn.kneighbors(query_vector, n_neighbors=6)

101


In [13]:
# Print the query title followed by its nearest neighbours, closest first.
print('Recommendations for {0}:\n'.format(movie_feature_df.index[query_index]))

# Exclude the query movie by its row index instead of assuming it is the first hit:
# when another title has an identical rating vector (distance 0) the query can be
# returned in a later position and would otherwise be recommended to itself.
neighbours = [
    (row, distance)
    for row, distance in zip(indices.flatten(), distances.flatten())
    if row != query_index
]

# One of the requested neighbours is reserved for the query itself, so show at most
# (number requested - 1) recommendations, numbered from 1.
max_recommendations = max(indices.size - 1, 0)
for rank, (row, distance) in enumerate(neighbours[:max_recommendations], start=1):
    print('{0}: {1}, with distance of {2}:'.format(rank, movie_feature_df.index[row], distance))

Recommendations for Django Unchained (2012):

1: Inglourious Basterds (2009), with distance of 0.3470427393913269:
2: Interstellar (2014), with distance of 0.4127156138420105:
3: Inception (2010), with distance of 0.4186580777168274:
4: Dark Knight Rises, The (2012), with distance of 0.4194601774215698:
5: Hangover, The (2009), with distance of 0.44817864894866943:
