In [1]:
from pathlib import Path

import pandas as pd

# Root of the extracted MovieLens 20M archive. Every input file is resolved
# relative to this single constant, so running the notebook on another machine
# only requires editing this one line.
DATA_DIR = Path("C:/Users/Bharath/Downloads/ml-20m/ml-20m")

# df: one row per (userId, movieId) rating; movies: one row per movieId.
df = pd.read_csv(DATA_DIR / "ratings.csv")
movies = pd.read_csv(DATA_DIR / "movies.csv")

# Attach title/genres to every rating; the inner join drops ratings whose
# movieId has no entry in movies.csv.
data = df.merge(movies, on='movieId', how='inner')

In [4]:
# Preview the ratings joined with each movie's title and genres.
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,2,3.5,1112486027,Jumanji (1995),Adventure|Children|Fantasy
1,5,2,3.0,851527569,Jumanji (1995),Adventure|Children|Fantasy
2,13,2,3.0,849082742,Jumanji (1995),Adventure|Children|Fantasy
3,29,2,3.0,835562174,Jumanji (1995),Adventure|Children|Fantasy
4,34,2,3.0,846509384,Jumanji (1995),Adventure|Children|Fantasy


In [5]:
# Preview the raw ratings frame as loaded from ratings.csv.
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [6]:
# Keep every column except 'timestamp'; errors='ignore' keeps the cell safe to
# re-run once the column is already gone.
df = df.drop(columns='timestamp', errors='ignore')

In [7]:
from surprise import SVD
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split

In [8]:
# MovieLens 20M ratings are on a 0.5-5.0 half-star scale, so the lower bound
# must be 0.5; with (1, 5) Surprise would clip predictions to the wrong range.
reader = Reader(rating_scale=(0.5, 5))
# Create the dataset from the dataframe and the reader. load_from_df expects
# exactly three columns in the order user, item, rating, so select them
# explicitly rather than relying on an earlier cell having dropped the rest.
X = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

In [13]:
# Hold out 25% of the ratings for testing; the fixed random_state makes the
# split (and everything derived from it) reproducible across kernel restarts.
trainset, testset = train_test_split(X, test_size=.25, random_state=42)

In [14]:
# train a new SVD with 100 latent features
%timeit
model = SVD(n_factors=100)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x18ea792d780>

In [15]:
# Item-factor matrix: one row per movie in the trainset, one column per latent factor (n_factors=100).
model.qi.shape

(25561, 100)

In [17]:
# User-factor matrix: one row per user in the trainset, one column per latent factor (n_factors=100).
model.pu.shape

(138493, 100)

In [57]:
# Map each raw movieId to its row in model.qi (Surprise's "inner" item id).
# Built from the public Trainset API (all_items / to_raw_iid) instead of the
# private _raw2inner_id_items attribute, which may change between releases.
item_to_row_idx = {
    model.trainset.to_raw_iid(inner_iid): inner_iid
    for inner_iid in model.trainset.all_items()
}

In [58]:
# Turn the {movieId: row index} mapping into a two-column frame; resetting the
# index of the unnamed Series yields the columns 'index' and 0.
result = pd.Series(item_to_row_idx).reset_index()

In [59]:
# Name the two columns by position: the former index holds the raw movieId and
# the values hold that movie's row position in model.qi.
result.columns = ['movieId','model_qi_index']

In [60]:
# Attach title/genres to each trained item; the inner join keeps only movieIds
# that are present in both frames.
result = pd.merge(result, movies, on='movieId', how='inner')

In [61]:
# Keep only the title -> model.qi row lookup columns.
result = result.loc[:, ['title', 'model_qi_index']]

In [71]:
def get_qi_index(movie):
    """Return the ``model_qi_index`` of a movie, looked up by exact title.

    The returned value is used by the notebook to index ``model.qi``.

    Parameters
    ----------
    movie : str
        Title exactly as it appears in the ``title`` column of ``result``,
        e.g. ``'Toy Story (1995)'``.

    Returns
    -------
    The ``model_qi_index`` value of the first row of ``result`` whose title
    equals ``movie``.

    Raises
    ------
    IndexError
        If no row of ``result`` has that title (for example the movie was not
        part of the training split). The exception type matches what the
        unguarded ``.values[0]`` lookup raised before; only the message is new.
    """
    matches = result.loc[result['title'] == movie, 'model_qi_index'].values
    if len(matches) == 0:
        raise IndexError(
            "no model_qi_index found for title {!r}".format(movie)
        )
    # Titles are not guaranteed unique; the first match wins.
    return matches[0]

In [112]:
# List every catalogue entry whose title contains 'Dark Knight', to find the
# exact title string to use in later lookups.
movies.loc[movies['title'].str.contains('Dark Knight')]

Unnamed: 0,movieId,title,genres
12525,58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX
18312,91529,"Dark Knight Rises, The (2012)",Action|Adventure|Crime|IMAX
19876,98124,"Batman: The Dark Knight Returns, Part 1 (2012)",Action|Animation|Sci-Fi
20307,99813,"Batman: The Dark Knight Returns, Part 2 (2013)",Action|Animation
21255,103454,Batman Unmasked: The Psychology of the Dark Kn...,Documentary
27078,130219,The Dark Knight (2011),Action|Crime|Drama|Thriller


In [72]:
# Latent-factor vector the model learned for 'Toy Story (1995)'.
model.qi[get_qi_index('Toy Story (1995)')]

array([ 0.00682176, -0.02742974, -0.13675823,  0.00816807, -0.33277309,
        0.03182118,  0.09083358,  0.05865437,  0.03113894, -0.16493524,
       -0.23838983, -0.04231692, -0.08681526, -0.12783375, -0.10292278,
       -0.09057432, -0.04994185, -0.07499672, -0.06888434, -0.06789744,
       -0.12925843,  0.39951614,  0.05932971, -0.01516195,  0.24967539,
       -0.27205902,  0.06053507,  0.15551011,  0.30996803,  0.03402403,
        0.18701346, -0.18956579, -0.20142125, -0.11643695,  0.25940385,
       -0.11968252, -0.12116614,  0.0231169 , -0.08431985, -0.12658584,
       -0.09191728, -0.27884193,  0.27505785, -0.12392975, -0.15058196,
        0.21057446, -0.12654753,  0.0700443 , -0.06247823, -0.05484703,
       -0.11932657, -0.14725768,  0.04008789,  0.28153724, -0.21574657,
       -0.04239177,  0.06184737, -0.02086092,  0.10713537,  0.19600807,
       -0.06545519, -0.05501959, -0.33478811,  0.1573239 , -0.36911088,
        0.05108005,  0.05010795,  0.03304164,  0.04825734,  0.14

# Once we have the user and item latent features, we can do the following

1. Predict a rating: given a user id and a movie id, the model returns the estimated rating.
2. Compute item similarity: measure the similarity between two movies' latent vectors.
3. Find similar movies: rank all movies by cosine distance to a chosen movie and keep the top N (smallest distances).

In [73]:
# Estimated rating of raw user id 1 for raw movie id 2.
model.predict(uid=1, iid=2)

Prediction(uid=1, iid=2, r_ui=None, est=3.862294246090189, details={'was_impossible': False})

In [82]:
# Look up the latent vectors of three reference movies in a single pass.
starwars_vector, return_of_jedi_vector, aladdin_vector = (
    model.qi[get_qi_index(title)]
    for title in (
        'Star Wars: Episode IV - A New Hope (1977)',
        'Star Wars: Episode VI - Return of the Jedi (1983)',
        'Aladdin (1992)',
    )
)

In [89]:
from scipy import spatial
# Cosine distance (1 - cosine similarity) between the two Star Wars item
# vectors; values near 0 mean the vectors point in almost the same direction.
spatial.distance.cosine(return_of_jedi_vector, starwars_vector)

0.079432871323868

In [113]:
# Rank every movie the model was trained on by cosine distance to a query movie.
main_vec = model.qi[get_qi_index('Dark Knight, The (2008)')]

# `result` already lists exactly the titles that have a trained item vector, so
# no per-title lookup (and no blanket `except:` to skip untrained titles) is
# needed. Keep the first row per title, mirroring get_qi_index's
# first-match behaviour.
candidates = result.drop_duplicates(subset='title')

# One vectorised call computes all distances at once, instead of scanning
# `result` once per title inside a Python loop.
distances = spatial.distance.cdist(
    model.qi[candidates['model_qi_index'].values],
    main_vec.reshape(1, -1),
    metric='cosine',
).ravel()

sim = dict(zip(candidates['title'], distances))
rating = pd.Series(sim).to_frame()
rating.columns = ['score']
# Ten closest titles (smallest cosine distance). The query movie itself comes
# first with a distance of ~0.
rating['score'].sort_values().head(10)

Dark Knight, The (2008)          0.000000
Batman Begins (2005)             0.241819
Dark Knight Rises, The (2012)    0.305556
Inception (2010)                 0.378386
Iron Man (2008)                  0.401103
Departed, The (2006)             0.411194
Casino Royale (2006)             0.417304
Matrix, The (1999)               0.466873
Star Trek (2009)                 0.473091
Django Unchained (2012)          0.491722
Name: score, dtype: float64

In [124]:
from surprise import dump
# Save the fitted algorithm only (predictions=None) to data.pkl; verbose=1
# prints a confirmation with the file name.
# NOTE(review): Surprise's dump module is documented as a pickle wrapper, so
# only load dump files from trusted sources.
dump.dump("C:/Users/Bharath/Downloads/ml-20m/ml-20m/data.pkl", predictions=None, algo=model, verbose=1)

The dump has been saved as file C:/Users/Bharath/Downloads/ml-20m/ml-20m/data.pkl
