<a href="https://colab.research.google.com/github/psyduck1203/ML-ProjectYard/blob/main/Movie%20Recommendation/Model/Movie_Recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# to open files
import pandas as pd

# for numerical operations
import numpy as np

# scikit-learn, used to compute pairwise distances
from sklearn.metrics.pairwise import pairwise_distances

In [2]:
# u.data is read as tab-separated text without a header row, so the column names are supplied here.
header = ['user_id', 'item_id', 'rating', 'timestamp']
movielens_data = pd.read_table(
    '/content/sample_data/ml-100k/u.data',
    names=header,
)
movielens_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
movielens_data.shape

(100000, 4)

In [4]:
# Number of distinct user ids and distinct movie ids present in the ratings table.
n_users = movielens_data['user_id'].nunique()
n_movies = movielens_data['item_id'].nunique()
n_users, n_movies

(943, 1682)

In [5]:
# Build the dense user x movie rating matrix: rows are users, columns are movies,
# and a cell holds the rating (0 where the user has not rated the movie).
# pandas' pivot_table could build the same 2D matrix; it is done manually here to keep it simple.
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.pivot_table.html

# The ids in the data are 1-based; shift them to 0-based row / column positions.
user_rows = movielens_data['user_id'].to_numpy() - 1
movie_cols = movielens_data['item_id'].to_numpy() - 1

# n_users / n_movies come from nunique(), which only equals the largest id when the ids
# are contiguous (1..n). Fail loudly instead of raising an opaque IndexError or, for an
# id of 0, silently writing to the last row / column through negative indexing.
assert ((user_rows >= 0) & (user_rows < n_users)).all(), "user_id values must lie in 1..n_users"
assert ((movie_cols >= 0) & (movie_cols < n_movies)).all(), "item_id values must lie in 1..n_movies"

train_data_matrix = np.zeros((n_users, n_movies))

# One vectorized assignment replaces the Python-level loop over every rating row.
# Each (user, movie) pair is expected to appear at most once in the ratings.
train_data_matrix[user_rows, movie_cols] = movielens_data['rating'].to_numpy()

train_data_matrix.shape

(943, 1682)

In [6]:
train_data_matrix

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [7]:
# Cosine distance between every pair of rows of the rating matrix, i.e. user vs. user.
user_distances = pairwise_distances(train_data_matrix, metric="cosine")

# Transposing the matrix makes each movie a row, so the same call compares movie vs. movie.
train_data_matrix_transpose = np.transpose(train_data_matrix)
movie_distances = pairwise_distances(train_data_matrix_transpose, metric="cosine")

user_distances.shape, movie_distances.shape

((943, 943), (1682, 1682))

In [8]:
user_distances

array([[0.        , 0.83306902, 0.95254046, ..., 0.85138306, 0.82049212,
        0.60182526],
       [0.83306902, 0.        , 0.88940868, ..., 0.83851522, 0.82773219,
        0.89420212],
       [0.95254046, 0.88940868, 0.        , ..., 0.89875744, 0.86658385,
        0.97344413],
       ...,
       [0.85138306, 0.83851522, 0.89875744, ..., 0.        , 0.8983582 ,
        0.90488042],
       [0.82049212, 0.82773219, 0.86658385, ..., 0.8983582 , 0.        ,
        0.81753534],
       [0.60182526, 0.89420212, 0.97344413, ..., 0.90488042, 0.81753534,
        0.        ]])

In [9]:
movie_distances

array([[0.        , 0.59761782, 0.66975521, ..., 1.        , 0.95281693,
        0.95281693],
       [0.59761782, 0.        , 0.72693082, ..., 1.        , 0.92170064,
        0.92170064],
       [0.66975521, 0.72693082, 0.        , ..., 1.        , 1.        ,
        0.90312495],
       ...,
       [1.        , 1.        , 1.        , ..., 0.        , 1.        ,
        1.        ],
       [0.95281693, 0.92170064, 1.        , ..., 1.        , 0.        ,
        1.        ],
       [0.95281693, 0.92170064, 0.90312495, ..., 1.        , 1.        ,
        0.        ]])

In [10]:
# Turn each distance into a similarity score: similarity = 1 - distance.
user_similarity, movie_similarity = 1 - user_distances, 1 - movie_distances

In [11]:
user_similarity

array([[1.        , 0.16693098, 0.04745954, ..., 0.14861694, 0.17950788,
        0.39817474],
       [0.16693098, 1.        , 0.11059132, ..., 0.16148478, 0.17226781,
        0.10579788],
       [0.04745954, 0.11059132, 1.        , ..., 0.10124256, 0.13341615,
        0.02655587],
       ...,
       [0.14861694, 0.16148478, 0.10124256, ..., 1.        , 0.1016418 ,
        0.09511958],
       [0.17950788, 0.17226781, 0.13341615, ..., 0.1016418 , 1.        ,
        0.18246466],
       [0.39817474, 0.10579788, 0.02655587, ..., 0.09511958, 0.18246466,
        1.        ]])

In [12]:
movie_similarity

array([[1.        , 0.40238218, 0.33024479, ..., 0.        , 0.04718307,
        0.04718307],
       [0.40238218, 1.        , 0.27306918, ..., 0.        , 0.07829936,
        0.07829936],
       [0.33024479, 0.27306918, 1.        , ..., 0.        , 0.        ,
        0.09687505],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.04718307, 0.07829936, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.04718307, 0.07829936, 0.09687505, ..., 0.        , 0.        ,
        1.        ]])

In [13]:
# Build lookup tables between 0-based movie indices and movie titles.
# Each u.item record is split on '|': the first field is the movie id (stored minus one
# so that it lines up with the matrix columns) and the second field is the title.
idx_to_movie = {}

with open('/content/sample_data/ml-100k/u.item', 'r', encoding="ISO-8859-1") as f:
    # Iterate over the file lazily instead of loading every line up front with readlines().
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines rather than crashing on int('')
        info = line.split('|', 2)  # only the first two fields are used
        idx_to_movie[int(info[0])-1] = info[1]

# Reverse mapping (title -> index). If several indices share a title, the later entry wins.
movie_to_idx = {v: k for k, v in idx_to_movie.items()}

In [14]:
idx_to_movie[0], idx_to_movie[1], idx_to_movie[2], idx_to_movie[3]

('Toy Story (1995)',
 'GoldenEye (1995)',
 'Four Rooms (1995)',
 'Get Shorty (1995)')

In [15]:
movie_to_idx['Toy Story (1995)'], movie_to_idx['GoldenEye (1995)'], movie_to_idx['Four Rooms (1995)'], movie_to_idx['Get Shorty (1995)']

(0, 1, 2, 3)

In [16]:
def top_k_movies(similarity, mapper, movie_idx, k=6):
    """Return the titles with the highest similarity to one movie.

    The row of ``similarity`` belonging to ``movie_idx`` is sorted by value and
    the ``k + 1`` best-scoring positions are translated to titles via ``mapper``.

    Args:
        similarity: 2D NumPy array of similarity scores, indexed [movie, movie].
        mapper: mapping from a position in the row to a movie title.
        movie_idx: row index of the movie to find neighbours for.
        k: number of neighbours wanted; ``k + 1`` titles are returned.

    Returns:
        A list of ``k + 1`` titles (fewer if the row is shorter), highest
        similarity first. The queried movie is presumably the first entry,
        since it should be most similar to itself; callers drop it.
    """
    scores = similarity[movie_idx, :]
    # argsort is ascending, so reverse it to put the largest scores first.
    best_first = np.argsort(scores)[::-1]
    winners = best_first[:k + 1]
    return [mapper[position] for position in winners]

# Let's find movies similar to "Batman Forever"
We can recommend these movies to users who like "Batman Forever".

In [17]:
# Look up the matrix index of the chosen title (raises KeyError if the title is not in movie_to_idx).
favorite_movie_name = 'Batman Forever (1995)'
movie_index = movie_to_idx[favorite_movie_name]
movie_index

28

In [18]:
how_much_movie_to_show = 7

# top_k_movies returns k + 1 titles; the first one is skipped because it is expected
# to be the queried movie itself.
movies = top_k_movies(
    similarity=movie_similarity,
    mapper=idx_to_movie,
    movie_idx=movie_index,
    k=how_much_movie_to_show,
)
movies[1:how_much_movie_to_show + 1]

['Batman (1989)',
 'Batman Returns (1992)',
 'Cliffhanger (1993)',
 'Demolition Man (1993)',
 'Stargate (1994)',
 'Net, The (1995)',
 'Waterworld (1995)']

"Batman Forever (1995)" is similar to "Batman (1989)" and "Batman Returns (1992)".
"Cliffhanger (1993)" is also an action movie.
Next, let's find movies similar to "Star Wars (1977)".

In [19]:
# Look up the matrix index of the chosen title (raises KeyError if the title is not in movie_to_idx).
favorite_movie_name = 'Star Wars (1977)'
movie_index = movie_to_idx[favorite_movie_name]
movie_index

49

In [20]:
how_much_movie_to_show = 7

# top_k_movies returns k + 1 titles; the first one is skipped because it is expected
# to be the queried movie itself.
movies = top_k_movies(
    similarity=movie_similarity,
    mapper=idx_to_movie,
    movie_idx=movie_index,
    k=how_much_movie_to_show,
)
movies[1:how_much_movie_to_show + 1]

['Return of the Jedi (1983)',
 'Raiders of the Lost Ark (1981)',
 'Empire Strikes Back, The (1980)',
 'Toy Story (1995)',
 'Godfather, The (1972)',
 'Independence Day (ID4) (1996)',
 'Indiana Jones and the Last Crusade (1989)']

"Return of the Jedi (1983)" & "Empire Strikes Back, The (1980)" are also "Star Wars" movies.