In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('./data/archive/rating.csv')

In [3]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [4]:
df.shape

(20000263, 4)

In [6]:
len(df['userId'].value_counts())

138493

In [7]:
len(df['movieId'].value_counts())

26744

In [8]:
df = df.drop(['timestamp'], axis=1)

In [12]:
df.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [17]:
user_cnt = pd.DataFrame(df.groupby('userId').size(), columns=['cnt'])

In [18]:
user_cnt.head()

Unnamed: 0_level_0,cnt
userId,Unnamed: 1_level_1
1,175
2,61
3,187
4,28
5,66


In [42]:
movie_cnt = pd.DataFrame(df.groupby('movieId').size(), columns=['cnt'])

In [72]:
movie_threshold = 300
user_threshold = 200

In [73]:
popular_movies = list(set(movie_cnt.query('cnt >= @movie_threshold').index))

In [74]:
active_users = list(set(user_cnt.query('cnt >= @user_threshold').index))

In [78]:
df_drop_movies = df[df.movieId.isin(popular_movies)]

In [79]:
df_clean = df_drop_movies[df_drop_movies['userId'].isin(active_users)]

In [80]:
df_clean.shape

(11788644, 3)

In [81]:
df_clean.head()

Unnamed: 0,userId,movieId,rating
541,7,3,3.0
542,7,7,3.0
543,7,11,4.0
544,7,15,2.0
545,7,16,3.0


In [162]:
movie_user_mat = df_clean.pivot(index='movieId', columns='userId', values='rating').fillna(0)

In [163]:
movie_user_mat.shape

(5726, 26826)

In [164]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [165]:
movie_user_mat_sparse = csr_matrix(movie_user_mat.values)

In [166]:
model_knn = NearestNeighbors(metric='cosine', n_neighbors=25, n_jobs=-1)

In [167]:
model_knn.fit(movie_user_mat_sparse)

In [194]:
movie_to_idx = {}
idx_to_movie = {}

In [200]:
df_movies.loc[]

title    Specialist, The (1994)
Name: 315, dtype: object

In [195]:
for i in df_movies.index:
    idx_to_movie[i] = df_movies.loc[i, 'title'][:-7]

In [197]:
for i in movie_to_idx:
    if i == 'Shawshank Redemption':
        print(i, movie_to_idx[i])

In [196]:
movie_to_idx = {v: k for  k, v in idx_to_movie.items()}

In [216]:
from thefuzz import fuzz

In [217]:
def matching_the_movie(movie):
    match = []
    for title, idx in movie_to_idx.items():
        ratio = fuzz.ratio(title.lower(), movie.lower())
        if ratio >= 60:
            match.append((title, idx, ratio))

    match = sorted(match, key=lambda x: x[2])[::-1]

    if not match:
        return -1
    return match[0][1]


In [225]:
def make_recommentations(data, movie):
    model_knn.fit(data)
    idx = matching_the_movie(movie)

    dist, ind = model_knn.kneighbors(data[idx], n_neighbors=15)

    recommends = sorted(list(zip(ind.squeeze().tolist(), dist.squeeze().tolist())), key = lambda x: x[1])[:0:-1]

    for i, (index, dist) in enumerate(recommends):
        print(f'{i+1}: {idx_to_movie[index]}, distance = {dist}')


In [227]:
make_recommentations(movie_user_mat_sparse, 'Jurassic Park')

1: Kiss of Death, distance = 0.8458416829447526
2: Davy Crockett, King of the Wild Frontier, distance = 0.8454003857935992
3: E.T. the Extra-Terrestrial, distance = 0.8432470558048535
4: Just Cause, distance = 0.8396756800230524
5: Kiss of the Dragon, distance = 0.8373982176486725
6: Kid in King Arthur's Court, A, distance = 0.8352106900814616
7: King Creole, distance = 0.8302100797016261
8: Dingo, distance = 0.827171478166308
9: Perez Family, The, distance = 0.8190524687396645
10: Dirty Work, distance = 0.8109718296226611
11: Lady Vanishes, The, distance = 0.7963506231852329
12: Ref, The, distance = 0.7529243792999439
13: Small Faces, distance = 0.7429853226797964
14: Pink Floyd: The Wall, distance = 0.7298398678710138


In [206]:
df_movies = pd.read_csv('./data/archive/movie.csv')

In [208]:
df_movies = df_movies.drop(['genres'], axis=1)

In [209]:
df_movies.set_index('movieId', inplace=True)

In [187]:
df_movies.head()

Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
1,Toy Story (1995)
2,Jumanji (1995)
3,Grumpier Old Men (1995)
4,Waiting to Exhale (1995)
5,Father of the Bride Part II (1995)
