# Movie Recommender System

In [1]:

import pandas as pd 
    
def simple_dataset():
    movies = pd.read_csv('./simple-dataset/movies.csv', index_col=0 )
    ratings = pd.read_csv('./simple-dataset/ratings.csv')
    return movies, ratings

In [2]:
movies_extra, ratings = simple_dataset()
print(movies_extra.shape)
print(ratings.shape) 

(62423, 2)
(25000095, 4)


In [3]:
movies_extra.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [5]:
users = list(set(ratings['userId'].values))
numUsers = len(users)
moviesnames = list(set(ratings['movieId'].values))
numMovies = len(moviesnames)
print(max(moviesnames))

209171


In [19]:
set(ratings['rating'].values)

{np.float64(0.5),
 np.float64(1.0),
 np.float64(1.5),
 np.float64(2.0),
 np.float64(2.5),
 np.float64(3.0),
 np.float64(3.5),
 np.float64(4.0),
 np.float64(4.5),
 np.float64(5.0)}

In [30]:
min(users)

np.int64(1)

In [6]:
numMovies

59047

In [7]:
numUsers

162541

In [6]:
def cosineSimilarity (A:list[int], B:list[int]) -> float:
    assert len(A) == len(B), "The list lengths are not equal"
    length = len(A)
    similarity = sum([A[idx] * B[idx] for idx in range(length)])
    normalization_A = sum([A[idx]**2 for idx in range(length)])**0.5
    normalization_B = sum([B[idx]**2 for idx in range(length)])**0.5
    return similarity / normalization_A / normalization_B

In [7]:
import numpy as np
def cosSimNP (A: np.array, B: np.array) -> np.float64:
    assert A.shape[0] == B.shape[0], "The list lengths are not equal"
    norm_a = np.linalg.norm(A)
    norm_b = np.linalg.norm(B)
    return np.dot(A,B) / (norm_a*norm_b)

In [10]:
cosineSimilarity([1, 1], [2, 1])

0.9486832980505137

In [26]:
a = np.array([1, 1])
b = np.array([2, 2])
type(cosSimNP(a, b))

numpy.float64

In [15]:
# Method 1
movieIds = {element: index for index, element in enumerate(moviesnames)}
movieNames = {index : element for index, element in enumerate(moviesnames)}

# method 2
# key_value_pairs = zip(keys, values)
# my_dict = dict(key_value_pairs)

In [9]:
import time 
start = time.time()
# create the User-Movie ratings matrix
ratingsMatrx = np.zeros((numUsers, numMovies), dtype=np.int8)
for row in ratings.itertuples():
    movieId = movieIds[row.movieId]
    userId = int(row.userId) - 1
    ratingsMatrx[userId, movieId] = int(2 * row.rating)
mid = time.time()
print(mid - start)



36.64936137199402


In [10]:
from scipy.sparse import csr_matrix

# Create User Similarity Matrix Matrix
similarityMatrix = csr_matrix((numUsers, numUsers), dtype = np.int8).toarray()
accuracy = 100
for user1 in range(numUsers):
    if user1 % 10000 == 0: print(f'Working on User: {user1}')
    for user2 in range(user1, numUsers):
        if user1 == user2:
            similarityMatrix[user1, user2] = 0
            continue;
        user1_opinion = ratingsMatrx[user1, :]
        user2_opinion = ratingsMatrx[user2, :]
        similarity = int(accuracy * cosSimNP(user1_opinion, user2_opinion) )
        similarityMatrix[user1, user2] = similarity
        similarityMatrix[user2, user1] = similarity
    break;
print('Done!')

Working on User: 0
Done!


In [27]:
testuser = 53
n = 5
most_similar = np.argmax(similarityMatrix)
ind = np.argpartition(similarityMatrix[testuser, :], -n)[-n:].tolist()
movieID = 0
testUserList = []
similarUserList = []
for movieID in range(numMovies):
    for userID in ind:
        if ratingsMatrx[testuser, movieID] == 0 and ratingsMatrx[userID, movieID] >= 4:
            similarUserList.append(movies_extra.loc[movieNames[movieID], 'title'])
    if ratingsMatrx[testuser, movieID] >= 4:
        testUserList.append(movies_extra.loc[movieNames[movieID], 'title'])

In [30]:
print(similarUserList)

['Toy Story (1995)', 'Toy Story (1995)', 'Toy Story (1995)', 'Jumanji (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)', 'Heat (1995)', 'Heat (1995)', 'GoldenEye (1995)', 'GoldenEye (1995)', 'Cutthroat Island (1995)', 'Ace Ventura: When Nature Calls (1995)', 'Babe (1995)', 'To Die For (1995)', 'Pocahontas (1995)', 'Usual Suspects, The (1995)', 'Bed of Roses (1996)', "Things to Do in Denver When You're Dead (1995)", 'Vampire in Brooklyn (1995)', 'Broken Arrow (1996)', 'Broken Arrow (1996)', 'Happy Gilmore (1996)', 'Braveheart (1995)', 'Braveheart (1995)', 'Taxi Driver (1976)', 'Rumble in the Bronx (Hont faan kui) (1995)', 'Boomerang (1992)', 'Birdcage, The (1996)', 'Bad Boys (1995)', 'Apollo 13 (1995)', 'Batman Forever (1995)', 'Congo (1995)', 'Crumb (1994)', 'Desperado (1995)', 'Die Hard: With a Vengeance (1995)', 'Die Hard: With a Vengeance (1995)', 'First Knight (1995)', 'Hackers (1995)', 'Johnny Mnemonic (1995)', 'Judge Dredd (1995)', 'Living in Oblivion (1995)', 'Net, The (1995)

In [None]:
# for each user
# history of videos watched with ratings
# so look users in his similarty column orered by similarity.  any videos he hasnt watched that were rated highly? add them to a list.  
# look up movie info based on ID, check if the groupings work. 
# pick a user, list randomly highly rated history, check which new movies would be recommended.  

In [None]:
np.save('/tmp/123', np.array([[1, 2, 3], [4, 5, 6]]))
np.load('/tmp/123.npy')
array([[1, 2, 3],
       [4, 5, 6]])