In [1]:
import pandas as pd
import numpy as np
import json
import requests
import ast
from scipy import sparse
import re
from difflib import SequenceMatcher
from scipy.sparse.linalg import svds
import pickle

#### Data Import and Processing ####

In [2]:
df_movies = pd.read_csv('ml-latest/movies.csv')

In [3]:
df_ratings = pd.read_csv('ml-latest/ratings.csv')

In [4]:
df_ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [5]:
# Start user Id at 0
df_ratings['userId'] = df_ratings['userId']-1

In [6]:
# Make the ratings dataframe smaller, by removing the timestamp column, and changing the dtypes to 
# unsigned ints (thereby decreasing the df from ~650 mb to ~250 mb)

df_ratings.drop('timestamp', axis=1, inplace=True)
df_ratings['rating'] = df_ratings['rating'].astype(np.float16)
df_ratings['userId'] = df_ratings['userId'].astype(np.uint32)
df_ratings['movieId'] = df_ratings['movieId'].astype(np.uint32)

In [7]:
df_ratings. info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27753444 entries, 0 to 27753443
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   uint32 
 1   movieId  uint32 
 2   rating   float16
dtypes: float16(1), uint32(2)
memory usage: 264.7 MB


In [8]:
# Need to split the title and the year into two sections.
# Keep genres, it can be returned when the recommendations are shown
# The title should be in lowercase, so that the matching process is easy
df_movies 

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
58093,193876,The Great Glinka (1946),(no genres listed)
58094,193878,Les tribulations d'une caissière (2011),Comedy
58095,193880,Her Name Was Mumu (2016),Drama
58096,193882,Flora (2017),Adventure|Drama|Horror|Sci-Fi


In [9]:
class SPLIT_TITLE_AND_YEAR():
    def __init__(self, title):
        self.title = title
        pattern = re.compile(r'(.*?)\((\d{4})\)') # movie_name (year)
        self.mo = re.match(pattern, self.title)
        
    def return_title(self):
        if self.mo == None:
            return self.title
        else:
            return self.mo.group(1)
    
    def return_year(self):
        if self.mo == None:
            return None
        else:
            return self.mo.group(2)
        

In [10]:
# Do not change the order
df_movies['year'] = df_movies['title'].map(lambda x: SPLIT_TITLE_AND_YEAR(x).return_year())
df_movies['title'] = df_movies['title'].map(lambda x: SPLIT_TITLE_AND_YEAR(x).return_title())

In [11]:
df_movies['title'] = df_movies['title'].map(lambda x: x.lower().strip())

In [12]:
df_movies.reset_index(inplace=True)

In [13]:
# Create new movie ids, that are the index of the df_movies dataset. This way, the number of columns
# in the eventual user-rating matrix will drop to the number of movies in the df_movies dataset
df_movies.rename(columns={'index':'new_movie_id'}, inplace=True)

In [14]:
# Merge the two datasets together (the movie and its info, as well as the user and ratings)
final_df = pd.merge(df_ratings, df_movies, how='inner', on='movieId')

In [15]:
final_df.isna().sum()

userId             0
movieId            0
rating             0
new_movie_id       0
title              0
genres             0
year            6863
dtype: int64

In [16]:
final_df.shape

(27753444, 7)

#### Making the user-movie rating matrix for SVD ####

In [17]:
# User id goes from 0 to ~28000
# Movie id starts from 0, to ~58000, however, there are only 53000 movies rated
number_of_users = final_df['userId'].nunique()
number_of_movies = final_df['new_movie_id'].max() + 1

In [18]:
number_of_users, number_of_movies

(283228, 58098)

In [19]:
new_movie_ids = final_df['new_movie_id']
user_ids = final_df['userId']
ratings = final_df['rating']

In [20]:
# Make a sparse matrix based on the ratings in the final df
user_movie_ratings_matrix = sparse.csc_matrix((ratings, (new_movie_ids, user_ids)), \
                               shape=(number_of_movies, number_of_users), dtype=np.float32)

In [21]:
movie_mat, S, user_mat = svds(user_movie_ratings_matrix)

In [35]:
df_movies = df_movies.drop('movieId', axis=1).rename(columns={'new_movie_id': 'ID'})

# Save the movie matrix and the df_movies dataset
with open ('movie_matrix_backup.pickle', 'wb') as f:
    pickle.dump(movie_mat, f)
    
df_movies.to_pickle('df_movies.pickle')

In [23]:
def top_cosine_similarity(data, movie_id, top_n=10):
    movie_row = data[movie_id, :]
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))
    similarity = np.dot(movie_row, data.T) / (magnitude[movie_id] * magnitude)
    sort_indexes = np.argsort(-similarity)
    return sort_indexes[1:top_n]

In [24]:
df_movies[df_movies['title'] == 'pulp fiction']['new_movie_id'].values[0]

293

In [31]:
df_movies.iloc[293]

new_movie_id                            293
movieId                                 296
title                          pulp fiction
genres          Comedy|Crime|Drama|Thriller
year                                   1994
Name: 293, dtype: object

In [25]:
x = top_cosine_similarity(movie_mat, 293)
x

array([   49,   587, 54080, 47457,   315,    46,    31,   290,   523])

In [26]:
df_movies[df_movies['new_movie_id'].isin(x)]

Unnamed: 0,new_movie_id,movieId,title,genres,year
31,31,32,twelve monkeys (a.k.a. 12 monkeys),Mystery|Sci-Fi|Thriller,1995
46,46,47,seven (a.k.a. se7en),Mystery|Thriller,1995
49,49,50,"usual suspects, the",Crime|Mystery|Thriller,1995
290,290,293,léon: the professional (a.k.a. the professiona...,Action|Crime|Drama|Thriller,1994
315,315,318,"shawshank redemption, the",Crime|Drama,1994
523,523,527,schindler's list,Drama|War,1993
587,587,593,"silence of the lambs, the",Crime|Horror|Thriller,1991
47457,47457,169950,disappearance,Drama,2017
54080,54080,184403,suckers,Comedy,1999


## Word Similarity ##

In [26]:
def return_similar_titles(search_item, list_of_items, top_n=10, threshold=None):
    '''
    Returns the top n most similar items from the list of items
    list of items must be a pandas series
    threshold - the similarity ratio threshold (calculated using python's inbuilt sequence matcher)
    '''
    ratios = list_of_items.map(lambda x: SequenceMatcher(a=search_item, b=x).ratio())
    ratios.sort_values(ascending=False, inplace=True)
    if threshold:
        indexes = ratios[ratios>threshold].index
        indexes = indexes[:top_n]
    else:
        indexes = ratios[:top_n].index
    return list_of_items.iloc[indexes]
    

In [27]:
return_similar_titles('pulp fic', df_movies['title'], threshold=0.60)

12636             pulp 
293       pulp fiction 
52839    plump fiction 
16858             paul 
9239              pups 
31606             pump 
Name: title, dtype: object

In [28]:
df_movies.iloc[return_similar_titles('pulp fic', df_movies['title'], threshold=0.60).index]

Unnamed: 0,new_movie_id,movieId,title,genres,year
12636,12636,59114,pulp,Comedy|Thriller,1972
293,293,296,pulp fiction,Comedy|Crime|Drama|Thriller,1994
52839,52839,181745,plump fiction,Comedy|Crime,1998
16858,16858,84772,paul,Adventure|Comedy|Sci-Fi,2011
9239,9239,27197,pups,Crime|Drama|Thriller,1999
31606,31606,133291,pump,Documentary,2014
