<a href="https://colab.research.google.com/github/bansarithummar/Recommendation-system/blob/main/Collaborative%20Filtering/Recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
Dataset: http://files.grouplens.org/datasets/movielens/

In [36]:
# Colab-only step: ask the user to upload the MovieLens archive into the
# runtime's working directory (the recorded output shows
# ml-latest-small.zip being saved).
from google.colab import files
# Keep the call's return value around in `uploaded`.
uploaded = files.upload()

Saving ml-latest-small.zip to ml-latest-small.zip


In [37]:
import zipfile
import os

# Unpack the uploaded MovieLens archive into ./data, then list what the
# extraction produced at the top level of that directory.
zip_file = 'ml-latest-small.zip'

with zipfile.ZipFile(zip_file, mode='r') as archive:
    archive.extractall(path='data')

extracted_files = os.listdir('data')
print(extracted_files)

['ml-latest-small']


In [38]:
import pandas as pd
# Load the two MovieLens tables extracted into data/ml-latest-small:
# movies.csv (movieId, title, genres per the preview shown further down) ...
movies_df = pd.read_csv('data/ml-latest-small/movies.csv')
# ... and ratings.csv (userId, movieId, rating, timestamp).
ratings_df = pd.read_csv('data/ml-latest-small/ratings.csv')

In [39]:
# Report (rows, columns) for both tables in a single print call.
movies_shape = movies_df.shape
ratings_shape = ratings_df.shape
print(
    'The dimensions of movies dataframe are:', movies_shape,
    '\nThe dimensions of ratings dataframe are:', ratings_shape,
)


The dimensions of movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [40]:
# Preview the first rows of movies_df (head() shows five rows by default).
# The table has one row per movie with movieId, title and genres columns.
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [41]:

# Preview the first rows of ratings_df (head() shows five rows by default).
# Each row is one rating event: userId, movieId, rating and timestamp.
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [42]:
# Lookup table: raw MovieLens movieId -> movie title.
titles_by_movie_id = movies_df.set_index('movieId')['title']
movie_names = titles_by_movie_id.to_dict()

# Count the distinct users / movies that actually appear in the ratings.
n_users = ratings_df['userId'].unique().size
n_items = ratings_df['movieId'].unique().size

# Size of the dense user x movie matrix versus the ratings we really have.
matrix_cells = n_users * n_items
n_ratings = len(ratings_df)
fill_percent = n_ratings / matrix_cells * 100

print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", matrix_cells, 'elements.')
print('----------')
print("Number of ratings:", n_ratings)
print("Therefore: ", fill_percent, '% of the matrix is filled.')
for remark in (
    "We have an incredibly sparse matrix to work with here.",
    "And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2",
    "You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.",
    "One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data",
):
    print(remark)


Number of unique users: 610
Number of unique movies: 9724
The full rating matrix will have: 5931640 elements.
----------
Number of ratings: 100836
Therefore:  1.6999683055613624 % of the matrix is filled.
We have an incredibly sparse matrix to work with here.
And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2
You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.
One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data


In [47]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm
from torch import nn
from torch.utils.data import DataLoader, Dataset

class MatrixFactorization(torch.nn.Module):
    """Matrix-factorization rating model built from two embedding tables.

    Every user and every item gets an ``n_factors``-dimensional latent
    vector; the predicted rating for a (user, item) pair is the dot product
    of the two vectors.

    Args:
        n_users: Number of rows in the user embedding table.
        n_items: Number of rows in the item embedding table.
        n_factors: Length of each latent vector.
    """

    def __init__(self, n_users, n_items, n_factors=8):
        super().__init__()
        # sparse=True makes the embeddings emit sparse gradients, so they
        # must be trained with an optimizer that accepts them
        # (e.g. torch.optim.SparseAdam).
        self.user_factors = nn.Embedding(n_users, n_factors, sparse=True)
        self.item_factors = nn.Embedding(n_items, n_factors, sparse=True)
        # Small positive initial values for both tables.
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data):
        """Score a batch of (user index, item index) pairs.

        Args:
            data: Integer tensor indexed as ``data[:, 0]`` (user indices)
                and ``data[:, 1]`` (item indices).

        Returns:
            1-D tensor with one dot-product score per row of ``data``.
        """
        users, items = data[:, 0], data[:, 1]
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Score user/item indices given as two separate arguments.

        The previous implementation forwarded ``(user, item)`` straight to
        ``forward``, which only accepts a single stacked tensor, so every
        call raised ``TypeError``. The indices are now packed into the
        two-column layout ``forward`` expects.

        Args:
            user: User index or indices (int, sequence, or tensor).
            item: Item index or indices (int, sequence, or tensor). A single
                index on one side is broadcast against the other side.

        Returns:
            1-D tensor of scores, one per (user, item) pair.
        """
        # Build the indices on the same device as the embedding weights.
        device = self.user_factors.weight.device
        users = torch.as_tensor(user, dtype=torch.long, device=device).reshape(-1)
        items = torch.as_tensor(item, dtype=torch.long, device=device).reshape(-1)
        users, items = torch.broadcast_tensors(users, items)
        return self.forward(torch.stack((users, items), dim=1))

In [52]:
# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to machine learning readiness

class Loader(Dataset):
    """Torch dataset over a ratings table with contiguous user/movie indices.

    Raw ``userId`` / ``movieId`` values are remapped to 0-based indices (in
    order of first appearance) so they can be used directly as embedding
    row indices.

    Args:
        ratings_df: DataFrame with at least ``userId``, ``movieId`` and
            ``rating`` columns. The frame itself is not modified.

    Attributes set on the instance:
        ratings: Copy of ``ratings_df`` with both id columns re-indexed.
        userid2idx / movieid2idx: raw id -> contiguous index.
        idx2userid / idx2movieid: contiguous index -> raw id.
        x: ``torch.long`` tensor with one ``[user index, movie index]`` row
            per rating.
        y: ``torch.float32`` tensor holding the ratings.
    """

    def __init__(self, ratings_df):
        self.ratings = ratings_df.copy()

        # Distinct raw ids, in order of first appearance.
        users = ratings_df['userId'].unique()
        movies = ratings_df['movieId'].unique()

        # Raw id -> contiguous index, plus the inverse lookups.
        self.userid2idx = {raw: idx for idx, raw in enumerate(users)}
        self.movieid2idx = {raw: idx for idx, raw in enumerate(movies)}

        self.idx2userid = {idx: raw for raw, idx in self.userid2idx.items()}
        self.idx2movieid = {idx: raw for raw, idx in self.movieid2idx.items()}

        self.ratings['movieId'] = ratings_df['movieId'].map(self.movieid2idx)
        self.ratings['userId'] = ratings_df['userId'].map(self.userid2idx)

        # Select the feature columns explicitly rather than dropping
        # 'rating'/'timestamp': this pins the [user, movie] column order the
        # model indexes into, regardless of how the input frame is laid out
        # or whether it carries a timestamp column at all.
        features = self.ratings[['userId', 'movieId']].values
        targets = self.ratings['rating'].values
        self.x = torch.tensor(features, dtype=torch.long)
        self.y = torch.tensor(targets, dtype=torch.float32)

    def __getitem__(self, index):
        """Return the ``(x, y)`` pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of ratings in the dataset."""
        return len(self.ratings)

# Create DataLoader
# Wrap the full ratings table in the Loader dataset defined above ...
train_set = Loader(ratings_df)
# ... and iterate over it in reshuffled mini-batches of 128 samples.
train_loader = DataLoader(train_set, batch_size=128, shuffle=True)


In [53]:
# Training configuration.
num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

# Build the model and show its freshly initialised parameters.
model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
# GPU enable if you have a GPU...
if cuda:
    model = model.cuda()

# MSE loss
loss_fn = torch.nn.MSELoss()

# The embeddings are created with sparse=True and therefore produce sparse
# gradients. torch.optim.Adam rejects sparse gradients at step() time, so
# SparseAdam is the optimizer that matches this model.
optimizer = torch.optim.SparseAdam(model.parameters(), lr=1e-3)

# Train data
train_set = Loader(ratings_df)
print("Data loader created successfully.")
# Iterating the dataset directly yields a single (x, y) sample, not a batch.
print("First batch of data:", next(iter(train_set)))


Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(610, 8, sparse=True)
  (item_factors): Embedding(9724, 8, sparse=True)
)
user_factors.weight tensor([[0.0346, 0.0065, 0.0089,  ..., 0.0465, 0.0142, 0.0087],
        [0.0410, 0.0060, 0.0460,  ..., 0.0035, 0.0185, 0.0285],
        [0.0458, 0.0010, 0.0218,  ..., 0.0468, 0.0140, 0.0410],
        ...,
        [0.0303, 0.0207, 0.0009,  ..., 0.0209, 0.0015, 0.0015],
        [0.0433, 0.0459, 0.0154,  ..., 0.0462, 0.0390, 0.0487],
        [0.0394, 0.0054, 0.0435,  ..., 0.0140, 0.0482, 0.0195]])
item_factors.weight tensor([[0.0029, 0.0202, 0.0161,  ..., 0.0035, 0.0255, 0.0113],
        [0.0422, 0.0440, 0.0073,  ..., 0.0285, 0.0488, 0.0444],
        [0.0290, 0.0227, 0.0014,  ..., 0.0492, 0.0451, 0.0087],
        ...,
        [0.0132, 0.0403, 0.0025,  ..., 0.0174, 0.0238, 0.0058],
        [0.0437, 0.0270, 0.0221,  ..., 0.0243, 0.0051, 0.0449],
        [0.0041, 0.0175, 0.0061,  ..., 0.0316, 0.0292, 0.0399]])
Data loader create

In [56]:
from tqdm import tqdm
# SparseAdam is required because the embedding tables emit sparse gradients.
optimizer = torch.optim.SparseAdam(model.parameters(), lr=1e-3)

# Query CUDA availability once instead of once per mini-batch.
use_gpu = torch.cuda.is_available()

for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
        if use_gpu:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        outputs = model(x)
        # The model already returns a 1-D tensor of per-sample scores. The
        # former .squeeze() would collapse a size-1 batch to a 0-d tensor
        # and no longer match the target's shape.
        loss = loss_fn(outputs, y.float())
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    # Mean of the per-batch losses for this epoch.
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

  1%|          | 1/128 [00:03<06:21,  3.00s/it]

iter #0 Loss: 12.013407802823837


  2%|▏         | 2/128 [00:05<05:27,  2.60s/it]

iter #1 Loss: 6.937140610919991


  2%|▏         | 3/128 [00:07<05:09,  2.47s/it]

iter #2 Loss: 4.117978862094395


  3%|▎         | 4/128 [00:09<05:00,  2.42s/it]

iter #3 Loss: 3.0239035534072043


  4%|▍         | 5/128 [00:12<05:01,  2.45s/it]

iter #4 Loss: 2.438212767772868


  5%|▍         | 6/128 [00:15<05:08,  2.53s/it]

iter #5 Loss: 2.074804322973726


  5%|▌         | 7/128 [00:17<04:58,  2.47s/it]

iter #6 Loss: 1.8275751577416046


  6%|▋         | 8/128 [00:19<04:51,  2.43s/it]

iter #7 Loss: 1.6481460886558301


  7%|▋         | 9/128 [00:22<04:44,  2.39s/it]

iter #8 Loss: 1.5120296923793513


  8%|▊         | 10/128 [00:24<04:47,  2.43s/it]

iter #9 Loss: 1.4043713685522226


  9%|▊         | 11/128 [00:27<04:53,  2.51s/it]

iter #10 Loss: 1.31746595886153


  9%|▉         | 12/128 [00:29<04:44,  2.46s/it]

iter #11 Loss: 1.2458343333399235


 10%|█         | 13/128 [00:32<04:38,  2.42s/it]

iter #12 Loss: 1.1854677042985327


 11%|█         | 14/128 [00:34<04:32,  2.39s/it]

iter #13 Loss: 1.134084389748307


 12%|█▏        | 15/128 [00:36<04:36,  2.44s/it]

iter #14 Loss: 1.0895045355794393


 12%|█▎        | 16/128 [00:39<04:41,  2.51s/it]

iter #15 Loss: 1.050983430392246


 13%|█▎        | 17/128 [00:41<04:32,  2.46s/it]

iter #16 Loss: 1.0168931846161784


 14%|█▍        | 18/128 [00:44<04:26,  2.42s/it]

iter #17 Loss: 0.9867813745277182


 15%|█▍        | 19/128 [00:46<04:21,  2.40s/it]

iter #18 Loss: 0.9598246315106522


 16%|█▌        | 20/128 [00:49<04:27,  2.48s/it]

iter #19 Loss: 0.9356375884283618


 16%|█▋        | 21/128 [00:51<04:27,  2.50s/it]

iter #20 Loss: 0.9139664705634722


 17%|█▋        | 22/128 [00:54<04:19,  2.45s/it]

iter #21 Loss: 0.8943789658812702


 18%|█▊        | 23/128 [00:56<04:13,  2.41s/it]

iter #22 Loss: 0.876368756826759


 19%|█▉        | 24/128 [00:58<04:08,  2.39s/it]

iter #23 Loss: 0.8600607998480047


 20%|█▉        | 25/128 [01:01<04:15,  2.48s/it]

iter #24 Loss: 0.8451820463698527


 20%|██        | 26/128 [01:04<04:14,  2.49s/it]

iter #25 Loss: 0.831390830503805


 21%|██        | 27/128 [01:06<04:06,  2.44s/it]

iter #26 Loss: 0.8188235203021674


 22%|██▏       | 28/128 [01:08<04:01,  2.41s/it]

iter #27 Loss: 0.80722302743021


 23%|██▎       | 29/128 [01:11<03:56,  2.39s/it]

iter #28 Loss: 0.796494450741613


 23%|██▎       | 30/128 [01:13<04:03,  2.49s/it]

iter #29 Loss: 0.7863836980940122


 24%|██▍       | 31/128 [01:16<04:00,  2.48s/it]

iter #30 Loss: 0.7769602206606551


 25%|██▌       | 32/128 [01:18<03:53,  2.44s/it]

iter #31 Loss: 0.7684741705882973


 26%|██▌       | 33/128 [01:20<03:48,  2.40s/it]

iter #32 Loss: 0.7605765632112619


 27%|██▋       | 34/128 [01:23<03:43,  2.38s/it]

iter #33 Loss: 0.7531905244979157


 27%|██▋       | 35/128 [01:25<03:51,  2.49s/it]

iter #34 Loss: 0.7461116558631059


 28%|██▊       | 36/128 [01:28<03:47,  2.48s/it]

iter #35 Loss: 0.739645742787625


 29%|██▉       | 37/128 [01:30<03:42,  2.45s/it]

iter #36 Loss: 0.7334171548334475


 30%|██▉       | 38/128 [01:33<03:37,  2.41s/it]

iter #37 Loss: 0.7278237928669464


 30%|███       | 39/128 [01:35<03:32,  2.39s/it]

iter #38 Loss: 0.7223094229392594


 31%|███▏      | 40/128 [01:38<03:41,  2.52s/it]

iter #39 Loss: 0.7173503741697611


 32%|███▏      | 41/128 [01:40<03:35,  2.48s/it]

iter #40 Loss: 0.7125772742677461


 33%|███▎      | 42/128 [01:42<03:28,  2.43s/it]

iter #41 Loss: 0.7080395741738038


 34%|███▎      | 43/128 [01:45<03:23,  2.40s/it]

iter #42 Loss: 0.703832924139076


 34%|███▍      | 44/128 [01:47<03:19,  2.38s/it]

iter #43 Loss: 0.6998898563575624


 35%|███▌      | 45/128 [01:50<03:30,  2.53s/it]

iter #44 Loss: 0.6959695691233359


 36%|███▌      | 46/128 [01:52<03:23,  2.48s/it]

iter #45 Loss: 0.6925934162841836


 37%|███▋      | 47/128 [01:55<03:17,  2.44s/it]

iter #46 Loss: 0.6891031077670567


 38%|███▊      | 48/128 [01:57<03:13,  2.42s/it]

iter #47 Loss: 0.6859629742687728


 38%|███▊      | 49/128 [01:59<03:10,  2.41s/it]

iter #48 Loss: 0.6827936620654793


 39%|███▉      | 50/128 [02:02<03:20,  2.57s/it]

iter #49 Loss: 0.6797982325711226


 40%|███▉      | 51/128 [02:05<03:13,  2.51s/it]

iter #50 Loss: 0.6770840114370216


 41%|████      | 52/128 [02:07<03:06,  2.45s/it]

iter #51 Loss: 0.6743392442461803


 41%|████▏     | 53/128 [02:09<03:00,  2.41s/it]

iter #52 Loss: 0.6718671681917259


 42%|████▏     | 54/128 [02:12<02:56,  2.38s/it]

iter #53 Loss: 0.6693142479673255


 43%|████▎     | 55/128 [02:15<03:04,  2.53s/it]

iter #54 Loss: 0.6668466782282452


 44%|████▍     | 56/128 [02:17<02:57,  2.47s/it]

iter #55 Loss: 0.6645744327966332


 45%|████▍     | 57/128 [02:19<02:52,  2.43s/it]

iter #56 Loss: 0.6623300760984421


 45%|████▌     | 58/128 [02:22<02:47,  2.39s/it]

iter #57 Loss: 0.6601251482131517


 46%|████▌     | 59/128 [02:24<02:43,  2.37s/it]

iter #58 Loss: 0.6578626782458445


 47%|████▋     | 60/128 [02:27<02:51,  2.53s/it]

iter #59 Loss: 0.6557526634202391


 48%|████▊     | 61/128 [02:29<02:46,  2.48s/it]

iter #60 Loss: 0.6536135972680779


 48%|████▊     | 62/128 [02:32<02:41,  2.44s/it]

iter #61 Loss: 0.6516331761016458


 49%|████▉     | 63/128 [02:34<02:36,  2.41s/it]

iter #62 Loss: 0.649530267881863


 50%|█████     | 64/128 [02:36<02:32,  2.39s/it]

iter #63 Loss: 0.6475746973138775


 51%|█████     | 65/128 [02:39<02:40,  2.54s/it]

iter #64 Loss: 0.6455119634188976


 52%|█████▏    | 66/128 [02:41<02:33,  2.48s/it]

iter #65 Loss: 0.6433492448502386


 52%|█████▏    | 67/128 [02:44<02:28,  2.43s/it]

iter #66 Loss: 0.6412039069733039


 53%|█████▎    | 68/128 [02:46<02:24,  2.40s/it]

iter #67 Loss: 0.6392081947949937


 54%|█████▍    | 69/128 [02:48<02:20,  2.38s/it]

iter #68 Loss: 0.636957828806439


 55%|█████▍    | 70/128 [02:51<02:27,  2.54s/it]

iter #69 Loss: 0.6346915028422012


 55%|█████▌    | 71/128 [02:54<02:21,  2.47s/it]

iter #70 Loss: 0.6325503871374324


 56%|█████▋    | 72/128 [02:56<02:16,  2.43s/it]

iter #71 Loss: 0.6301191592503925


 57%|█████▋    | 73/128 [02:58<02:12,  2.40s/it]

iter #72 Loss: 0.6277461307983713


 58%|█████▊    | 74/128 [03:01<02:09,  2.41s/it]

iter #73 Loss: 0.6253742353262635


 59%|█████▊    | 75/128 [03:04<02:14,  2.54s/it]

iter #74 Loss: 0.6227534658790845


 59%|█████▉    | 76/128 [03:06<02:08,  2.47s/it]

iter #75 Loss: 0.6201176371066098


 60%|██████    | 77/128 [03:08<02:03,  2.43s/it]

iter #76 Loss: 0.6177420856308211


 61%|██████    | 78/128 [03:11<02:00,  2.40s/it]

iter #77 Loss: 0.6148154238229475


 62%|██████▏   | 79/128 [03:13<01:57,  2.41s/it]

iter #78 Loss: 0.6122469637675334


 62%|██████▎   | 80/128 [03:16<02:01,  2.53s/it]

iter #79 Loss: 0.6092455098925508


 63%|██████▎   | 81/128 [03:18<01:55,  2.46s/it]

iter #80 Loss: 0.6064340423811511


 64%|██████▍   | 82/128 [03:20<01:51,  2.43s/it]

iter #81 Loss: 0.6034776761674033


 65%|██████▍   | 83/128 [03:23<01:47,  2.39s/it]

iter #82 Loss: 0.6006175596928839


 66%|██████▌   | 84/128 [03:25<01:45,  2.41s/it]

iter #83 Loss: 0.5975244408331547


 66%|██████▋   | 85/128 [03:28<01:48,  2.52s/it]

iter #84 Loss: 0.5944890384822328


 67%|██████▋   | 86/128 [03:30<01:44,  2.48s/it]

iter #85 Loss: 0.5912861768439942


 68%|██████▊   | 87/128 [03:33<01:39,  2.43s/it]

iter #86 Loss: 0.5882992210424491


 69%|██████▉   | 88/128 [03:35<01:35,  2.40s/it]

iter #87 Loss: 0.5852201680468424


 70%|██████▉   | 89/128 [03:37<01:34,  2.42s/it]

iter #88 Loss: 0.5819338180767704


 70%|███████   | 90/128 [03:40<01:35,  2.52s/it]

iter #89 Loss: 0.5788902943660765


 71%|███████   | 91/128 [03:43<01:31,  2.46s/it]

iter #90 Loss: 0.5757020167287836


 72%|███████▏  | 92/128 [03:45<01:26,  2.42s/it]

iter #91 Loss: 0.5727627454372832


 73%|███████▎  | 93/128 [03:47<01:23,  2.39s/it]

iter #92 Loss: 0.5694792934783219


 73%|███████▎  | 94/128 [03:50<01:22,  2.43s/it]

iter #93 Loss: 0.5663874460204603


 74%|███████▍  | 95/128 [03:52<01:23,  2.52s/it]

iter #94 Loss: 0.5632478480562946


 75%|███████▌  | 96/128 [03:55<01:18,  2.46s/it]

iter #95 Loss: 0.5602719074275893


 76%|███████▌  | 97/128 [03:57<01:15,  2.42s/it]

iter #96 Loss: 0.5572462396951496


 77%|███████▋  | 98/128 [03:59<01:11,  2.40s/it]

iter #97 Loss: 0.5542851937982032


 77%|███████▋  | 99/128 [04:02<01:11,  2.46s/it]

iter #98 Loss: 0.5514366738687312


 78%|███████▊  | 100/128 [04:05<01:10,  2.52s/it]

iter #99 Loss: 0.5484678764936283


 79%|███████▉  | 101/128 [04:07<01:06,  2.47s/it]

iter #100 Loss: 0.5454435618925215


 80%|███████▉  | 102/128 [04:09<01:02,  2.42s/it]

iter #101 Loss: 0.5428162125615299


 80%|████████  | 103/128 [04:12<00:59,  2.39s/it]

iter #102 Loss: 0.5399431259877185


 81%|████████▏ | 104/128 [04:14<00:58,  2.45s/it]

iter #103 Loss: 0.5373241455497475


 82%|████████▏ | 105/128 [04:17<00:57,  2.51s/it]

iter #104 Loss: 0.5344289049203625


 83%|████████▎ | 106/128 [04:19<00:54,  2.46s/it]

iter #105 Loss: 0.5317565343782381


 84%|████████▎ | 107/128 [04:22<00:50,  2.42s/it]

iter #106 Loss: 0.5292026542770076


 84%|████████▍ | 108/128 [04:24<00:47,  2.39s/it]

iter #107 Loss: 0.5264572288254796


 85%|████████▌ | 109/128 [04:26<00:46,  2.46s/it]

iter #108 Loss: 0.5240803644590571


 86%|████████▌ | 110/128 [04:29<00:44,  2.50s/it]

iter #109 Loss: 0.5214041070965341


 87%|████████▋ | 111/128 [04:31<00:41,  2.46s/it]

iter #110 Loss: 0.5188548804690996


 88%|████████▊ | 112/128 [04:34<00:38,  2.42s/it]

iter #111 Loss: 0.5163421222022947


 88%|████████▊ | 113/128 [04:36<00:35,  2.39s/it]

iter #112 Loss: 0.5139255434804156


 89%|████████▉ | 114/128 [04:39<00:34,  2.47s/it]

iter #113 Loss: 0.511554921528107


 90%|████████▉ | 115/128 [04:41<00:32,  2.50s/it]

iter #114 Loss: 0.5092259636385187


 91%|█████████ | 116/128 [04:44<00:29,  2.45s/it]

iter #115 Loss: 0.506745492179079


 91%|█████████▏| 117/128 [04:46<00:26,  2.42s/it]

iter #116 Loss: 0.5045518283235845


 92%|█████████▏| 118/128 [04:48<00:24,  2.40s/it]

iter #117 Loss: 0.5022161793587777


 93%|█████████▎| 119/128 [04:51<00:22,  2.49s/it]

iter #118 Loss: 0.5000121482283936


 94%|█████████▍| 120/128 [04:54<00:19,  2.50s/it]

iter #119 Loss: 0.4978583040954498


 95%|█████████▍| 121/128 [04:56<00:17,  2.45s/it]

iter #120 Loss: 0.4957091340861345


 95%|█████████▌| 122/128 [04:58<00:14,  2.41s/it]

iter #121 Loss: 0.49351815368772156


 96%|█████████▌| 123/128 [05:01<00:11,  2.40s/it]

iter #122 Loss: 0.49151099862786113


 97%|█████████▋| 124/128 [05:03<00:10,  2.53s/it]

iter #123 Loss: 0.489335351032654


 98%|█████████▊| 125/128 [05:06<00:07,  2.49s/it]

iter #124 Loss: 0.4873876522110803


 98%|█████████▊| 126/128 [05:08<00:04,  2.44s/it]

iter #125 Loss: 0.4853124328449293


 99%|█████████▉| 127/128 [05:11<00:02,  2.41s/it]

iter #126 Loss: 0.4833949909446203


100%|██████████| 128/128 [05:13<00:00,  2.45s/it]

iter #127 Loss: 0.4813890685965567





In [57]:
# Print every trainable parameter and keep handles on the weights:
# `uw` receives the first trainable parameter's data, `iw` the data of any
# later one (so it ends up holding the last).
uw, iw = 0, 0
c = 0
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)
    if c:
        iw = param.data
    else:
        uw = param.data
        c += 1

user_factors.weight tensor([[ 1.5710,  1.4464,  1.5014,  ...,  2.1994,  1.4891,  1.3978],
        [ 1.3227,  1.3260,  1.2967,  ...,  1.1947,  1.3056,  1.1760],
        [ 0.8365,  0.7280,  0.7439,  ...,  0.9421,  0.6307,  0.9601],
        ...,
        [ 0.9910, -0.5364, -0.5619,  ...,  2.5521,  0.2769,  1.8829],
        [ 1.1453,  1.3251,  1.2447,  ...,  1.0617,  1.0863,  1.2091],
        [ 1.5815,  1.8342, -0.8168,  ...,  0.3223,  3.4216,  1.7629]],
       device='cuda:0')
item_factors.weight tensor([[0.3770, 0.3963, 0.3489,  ..., 0.5031, 0.5135, 0.3426],
        [0.0861, 0.2576, 0.4545,  ..., 0.5006, 0.3859, 0.4798],
        [0.5174, 0.2763, 0.2826,  ..., 0.3982, 0.4691, 0.2149],
        ...,
        [0.2708, 0.3025, 0.2564,  ..., 0.2723, 0.2884, 0.2678],
        [0.3392, 0.3278, 0.2994,  ..., 0.3063, 0.3187, 0.3467],
        [0.2971, 0.3158, 0.2826,  ..., 0.3123, 0.3392, 0.3391]],
       device='cuda:0')


In [58]:
# Pull the learned item (movie) factor matrix off the model as a NumPy
# array: drop autograd tracking, move to host memory, then convert.
trained_movie_embeddings = model.item_factors.weight.detach().cpu().numpy()

In [60]:
trained_movie_embeddings.shape[0]  # unique movie factor weights (one row per movie)

9724

In [61]:
from sklearn.cluster import KMeans
# Group the movies into 10 clusters using only their learned factor
# vectors; random_state is fixed so the assignment is repeatable.
movie_clusterer = KMeans(n_clusters=10, random_state=0)
kmeans = movie_clusterer.fit(trained_movie_embeddings)



In [62]:

# It can be seen here that the movies that are in the same cluster tend to
# have similar genres. Also note that the algorithm is unfamiliar with the
# movie name and only obtained the relationships by looking at the numbers
# representing how users have responded to the movie selections.

# Count the ratings per movie once up front. The previous version filtered
# the whole ratings table again for every single movie and read the result
# with a positional `[0]` on a label-indexed Series, which pandas deprecates.
rating_counts = ratings_df['movieId'].value_counts()

for cluster in range(kmeans.n_clusters):
    print("Cluster #{}".format(cluster))
    movs = []
    # Row positions in the embedding matrix that were assigned to this cluster.
    for movidx in np.where(kmeans.labels_ == cluster)[0]:
        # Translate the contiguous embedding index back to the raw movieId.
        movid = train_set.idx2movieid[movidx]
        rat_count = rating_counts.get(movid, 0)
        movs.append((movie_names[movid], rat_count))
    # Show the ten most-rated movies of the cluster.
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print("\t", mov[0])

Cluster #0
	 Ace Ventura: When Nature Calls (1995)
	 Nutty Professor, The (1996)
	 Honey, I Shrunk the Kids (1989)
	 Blair Witch Project, The (1999)
	 Coneheads (1993)
	 Judge Dredd (1995)
	 Hot Shots! Part Deux (1993)
	 Beverly Hills Cop III (1994)
	 Cable Guy, The (1996)
	 Arachnophobia (1990)
Cluster #1
	 Shawshank Redemption, The (1994)
	 Pulp Fiction (1994)
	 Silence of the Lambs, The (1991)
	 Matrix, The (1999)
	 Star Wars: Episode IV - A New Hope (1977)
	 Braveheart (1995)
	 Terminator 2: Judgment Day (1991)
	 Schindler's List (1993)
	 Fight Club (1999)
	 Star Wars: Episode V - The Empire Strikes Back (1980)
Cluster #2
	 Forrest Gump (1994)
	 Toy Story (1995)
	 Apollo 13 (1995)
	 Aladdin (1992)
	 Shrek (2001)
	 Dances with Wolves (1990)
	 Truman Show, The (1998)
	 Minority Report (2002)
	 Ocean's Eleven (2001)
	 Catch Me If You Can (2002)
Cluster #3
	 Wild Wild West (1999)
	 Batman & Robin (1997)
	 Godzilla (1998)
	 I Know What You Did Last Summer (1997)
	 Anaconda (1997)
	 Rich