In [58]:
#Importing libraries
import sys
import random
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA, TruncatedSVD

my_seed = 1337
random.seed(my_seed)
np.random.seed(my_seed)

In [59]:
#Read Database
data = pd.read_csv('rating_final.csv')

In [60]:
data.head()

Unnamed: 0,userID,placeID,rating,food_rating,service_rating
0,U1077,135085,2,2,2
1,U1077,135038,2,2,1
2,U1077,132825,2,2,2
3,U1077,135060,1,2,2
4,U1068,135104,1,1,2


In [61]:
ratings = data[['rating','food_rating','service_rating']].copy()

In [62]:
ratings.head()

Unnamed: 0,rating,food_rating,service_rating
0,2,2,2
1,2,2,1
2,2,2,2
3,1,2,2
4,1,1,2


In [63]:
ratings['rating'] = ratings.sum(axis=1)

In [64]:
ratings.head()

Unnamed: 0,rating,food_rating,service_rating
0,6,2,2
1,5,2,1
2,6,2,2
3,5,2,2
4,4,1,2


In [65]:
ratings=ratings.drop(ratings.columns[[1,2]], 1)

In [66]:
ratings.head()

Unnamed: 0,rating
0,6
1,5
2,6
3,5
4,4


In [67]:
ratings=ratings.div(9)

In [68]:
ratings.head()

Unnamed: 0,rating
0,0.666667
1,0.555556
2,0.666667
3,0.555556
4,0.444444


In [69]:
ratings=ratings.mul(3)

In [70]:
ratings.head()

Unnamed: 0,rating
0,2.0
1,1.666667
2,2.0
3,1.666667
4,1.333333


In [71]:
data = data.drop(data.columns[[3,4]], 1)

In [72]:
data.head()

Unnamed: 0,userID,placeID,rating
0,U1077,135085,2
1,U1077,135038,2
2,U1077,132825,2
3,U1077,135060,1
4,U1068,135104,1


In [73]:
frames = [data,ratings]

In [74]:
data.rating = ratings.rating.copy()

In [75]:
data.head()

Unnamed: 0,userID,placeID,rating
0,U1077,135085,2.0
1,U1077,135038,1.666667
2,U1077,132825,2.0
3,U1077,135060,1.666667
4,U1068,135104,1.333333


In [76]:
# Spliting data!

from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import Reader, Dataset
import numpy as np
from typing import *
from IPython.display import display, HTML, Markdown

In [77]:
reader = Reader(rating_scale=(1,3))

In [78]:
dataset = Dataset.load_from_df(data,reader)

In [79]:
trainset, testset = train_test_split(dataset, test_size=0.25)

In [80]:
trainset

<surprise.trainset.Trainset at 0x250e0fbc588>

In [81]:
model = SVD(n_factors=100)


In [82]:
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x250e1022608>

In [83]:
model.qi.shape


(130, 100)

In [94]:
from scipy.spatial.distance import cosine


def get_vector_by_movie_title(movie_title: str, trained_model: SVD) -> np.array:
    """Returns the latent features of a movie in the form of a numpy array"""
    movie_row_idx = trained_model.trainset._raw2inner_id_items[movie_title]
    return trained_model.qi[movie_row_idx]


def cosine_distance(vector_a: np.array, vector_b: np.array) -> float:
    """Returns a float indicating the similarity between two vectors"""
    return cosine(vector_a, vector_b)

In [95]:
item_to_row_idx: Dict[Any, int] = model.trainset._raw2inner_id_items

display(item_to_row_idx)

{132925: 0,
 135047: 1,
 135041: 2,
 132834: 3,
 132921: 4,
 135063: 5,
 135075: 6,
 135080: 7,
 135044: 8,
 135106: 9,
 135042: 10,
 132951: 11,
 135060: 12,
 134983: 13,
 135108: 14,
 132733: 15,
 135028: 16,
 135085: 17,
 135082: 18,
 135039: 19,
 135032: 20,
 132830: 21,
 135025: 22,
 132955: 23,
 135045: 24,
 132862: 25,
 134992: 26,
 132825: 27,
 132872: 28,
 135058: 29,
 135027: 30,
 135072: 31,
 132723: 32,
 132773: 33,
 135104: 34,
 135054: 35,
 132958: 36,
 134999: 37,
 132870: 38,
 135086: 39,
 135043: 40,
 132768: 41,
 132875: 42,
 132846: 43,
 132630: 44,
 132766: 45,
 132856: 46,
 135062: 47,
 135051: 48,
 135046: 49,
 135016: 50,
 132754: 51,
 135013: 52,
 135065: 53,
 135064: 54,
 135052: 55,
 132954: 56,
 132608: 57,
 135071: 58,
 135059: 59,
 135079: 60,
 135076: 61,
 135049: 62,
 135069: 63,
 135088: 64,
 135038: 65,
 134996: 66,
 132663: 67,
 132922: 68,
 132755: 69,
 132572: 70,
 135026: 71,
 132584: 72,
 132561: 73,
 134986: 74,
 132594: 75,
 132740: 76,
 135030: 

In [96]:
a_user = "U1077"
a_product = "135060"
model.predict(a_user, a_product)


Prediction(uid='U1077', iid='135060', r_ui=None, est=1.2753553798521906, details={'was_impossible': False})

In [97]:
def get_top_similarities(movie_title: str, model: SVD) -> pd.DataFrame:
    """Returns the top 5 most similar movies to a specified movie
    
    This function iterates over every possible movie in MovieLens and calculates
    distance between `movie_title` vector and that movie's vector.
    """
    
    # Get the first movie vector
    movie_vector: np.array = get_vector_by_movie_title(movie_title, model)
    similarity_table = []
    
    # Iterate over every possible movie and calculate similarity
    for other_movie_title in model.trainset._raw2inner_id_items.keys():
        other_movie_vector = get_vector_by_movie_title(other_movie_title, model)
        
        # Get the second movie vector, and calculate distance
        similarity_score = cosine_distance(other_movie_vector, movie_vector)
        similarity_table.append((similarity_score, other_movie_title))
    
    # sort movies by ascending similarity
    return display(sorted(similarity_table))

In [100]:
get_top_similarities(135060, model)

[(0.0, 135060),
 (0.8044910286113957, 132834),
 (0.8177596948919518, 134976),
 (0.838351234708343, 132755),
 (0.8387313883295311, 132660),
 (0.845800368174341, 135109),
 (0.8500798289821629, 132870),
 (0.8518814260167586, 132885),
 (0.8664145668673461, 135021),
 (0.8727142425230296, 132667),
 (0.8758501817780345, 132583),
 (0.8811037281556295, 132654),
 (0.8840210546536923, 135011),
 (0.8840826584246629, 135013),
 (0.8867413145633973, 135057),
 (0.8881429367124906, 135028),
 (0.8881535816483963, 135039),
 (0.8937594008717191, 132740),
 (0.8953206245618573, 132921),
 (0.9051334077791663, 132715),
 (0.9083429166472277, 135016),
 (0.9155502226308431, 132856),
 (0.9201389893405143, 135001),
 (0.9209067396663944, 135082),
 (0.923051753056026, 132872),
 (0.9238519930207022, 135030),
 (0.924339997832078, 135018),
 (0.9254267989323037, 135058),
 (0.9278503044660449, 132851),
 (0.9287030604074378, 135055),
 (0.9294942566311101, 132875),
 (0.9350967010784798, 132825),
 (0.9361479803311601, 13266

In [101]:
get_top_similarities(132825, model)

[(0.0, 132825),
 (0.7163116032542982, 135016),
 (0.7432624748525993, 135048),
 (0.7445641791854728, 132866),
 (0.7906729816245338, 135043),
 (0.7916825390615803, 132872),
 (0.7955124374291032, 132609),
 (0.8219363000683576, 132630),
 (0.8223536057152223, 135069),
 (0.8290695740926013, 132667),
 (0.8471390869642705, 135046),
 (0.8512581236008621, 135019),
 (0.8513467581356862, 135042),
 (0.8536978082128449, 135027),
 (0.855916722691708, 132564),
 (0.8628533953011674, 135055),
 (0.8640022722959616, 135033),
 (0.8654189700415891, 135034),
 (0.8708175116075615, 132884),
 (0.8782102467840458, 135045),
 (0.8785266696218925, 135057),
 (0.8889764371886973, 135076),
 (0.8931613958894729, 135047),
 (0.8997974069438519, 132861),
 (0.9010478589209665, 132668),
 (0.9050775055780339, 135088),
 (0.9078393529761244, 132885),
 (0.9116739265344224, 135049),
 (0.9132360949914616, 134992),
 (0.9161057307866541, 132875),
 (0.916872179453601, 134975),
 (0.9176115839211965, 132560),
 (0.9188171290374326, 132