In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [None]:
# Fetch the MovieLens 20M archive; -nc (no-clobber) skips the download when ml-20m.zip already exists.
!wget -nc http://files.grouplens.org/datasets/movielens/ml-20m.zip


--2025-06-26 21:39:07--  http://files.grouplens.org/datasets/movielens/ml-20m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 198702078 (189M) [application/zip]
Saving to: ‘ml-20m.zip’


2025-06-26 21:39:10 (55.7 MB/s) - ‘ml-20m.zip’ saved [198702078/198702078]



In [None]:
# Extract the archive into ./ml-20m/; -n never overwrites files that are already present.
!unzip -n ml-20m.zip

Archive:  ml-20m.zip
   creating: ml-20m/
  inflating: ml-20m/genome-scores.csv  
  inflating: ml-20m/genome-tags.csv  
  inflating: ml-20m/links.csv        
  inflating: ml-20m/movies.csv       
  inflating: ml-20m/ratings.csv      
  inflating: ml-20m/README.txt       
  inflating: ml-20m/tags.csv         


In [None]:
# Load the raw MovieLens ratings table from the extracted archive.
ratings_path = 'ml-20m/ratings.csv'
df = pd.read_csv(ratings_path)

In [None]:
# Peek at the first rows to confirm the columns: userId, movieId, rating, timestamp.
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [None]:
# Map raw user ids onto dense integer codes (0 .. n_users-1) so they can be
# used directly as row indices into an embedding table.
df["userId"] = pd.Categorical(df["userId"])
df["new_user_id"] = df["userId"].cat.codes

In [None]:
# Same re-encoding for movies: raw movieId values have gaps, the category
# codes are dense (0 .. n_movies-1).
df["movieId"] = pd.Categorical(df["movieId"])
df["new_movie_id"] = df["movieId"].cat.codes

In [None]:
# Inspect a longer slice to compare the new code columns with the raw ids across more than one user.
df.head(1000)

Unnamed: 0,userId,movieId,rating,timestamp,new_user_id,new_movie_id
0,1,2,3.5,1112486027,0,1
1,1,29,3.5,1112484676,0,28
2,1,32,3.5,1112484819,0,31
3,1,47,3.5,1112484727,0,46
4,1,50,3.5,1112484580,0,49
...,...,...,...,...,...,...
995,11,441,1.5,1230853962,10,437
996,11,442,4.5,1230788002,10,438
997,11,480,5.0,1230788713,10,476
998,11,500,4.5,1230858949,10,496


In [None]:
# Pull the model inputs (encoded ids) out of the frame as NumPy arrays.
user_ids = df['new_user_id'].to_numpy()
movie_ids = df['new_movie_id'].to_numpy()
# Regression targets: every rating shifted down by a constant 2.5.
# NOTE(review): presumably meant to centre the targets; the dataset mean is not computed here.
ratings = df['rating'].to_numpy() - 2.5

In [None]:
# Summarise column dtypes and memory use after adding the encoded id columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 6 columns):
 #   Column        Dtype   
---  ------        -----   
 0   userId        category
 1   movieId       category
 2   rating        float64 
 3   timestamp     int64   
 4   new_user_id   int32   
 5   new_movie_id  int16   
dtypes: category(2), float64(1), int16(1), int32(1), int64(1)
memory usage: 540.4 MB


In [None]:
class Recommender(nn.Module):
  """Feed-forward rating predictor over learned user and movie embeddings.

  Each user index and each movie index is looked up in its own embedding
  table, the two vectors are concatenated, and a one-hidden-layer ReLU
  network maps the pair to a single score.

  Args:
    n_users: number of rows in the user embedding table.
    n_items: number of rows in the movie embedding table.
    embed_dim: width of every embedding vector.
    n_hidden: width of the hidden layer (default 1024).
  """

  def __init__(self, n_users, n_items, embed_dim, n_hidden=1024):
    super().__init__()
    self.N, self.M, self.D = n_users, n_items, embed_dim

    # Lookup tables: one embed_dim-wide vector per user and per movie.
    self.u_emb = nn.Embedding(n_users, embed_dim)
    self.m_emb = nn.Embedding(n_items, embed_dim)

    # MLP head; its input is the user vector followed by the movie vector.
    self.fc1 = nn.Linear(embed_dim * 2, n_hidden)
    self.fc2 = nn.Linear(n_hidden, 1)

  def forward(self, u, m):
    """Score a batch of (user, movie) index pairs.

    Args:
      u: 1-D integer tensor of user indices.
      m: 1-D integer tensor of movie indices, same length as `u`.

    Returns:
      Tensor of shape (len(u), 1) with one predicted score per pair.
    """
    pair = torch.cat([self.u_emb(u), self.m_emb(m)], dim=1)
    hidden = F.relu(self.fc1(pair))
    return self.fc2(hidden)


In [None]:
# Use the first GPU only when CUDA is actually usable; otherwise stay on the CPU.
# torch.cuda.is_available has to be *called*: the bare function object is always
# truthy, which would select "cuda:0" even on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# An embedding table needs one row per index, so size it from the largest code.
# The codes are dense (0 .. n-1), so max + 1 equals the number of distinct ids,
# and this avoids building a Python set from 20M array elements.
n_users = int(user_ids.max()) + 1
n_movies = int(movie_ids.max()) + 1
model = Recommender(n_users, n_movies, 10)
model.to(device)

Recommender(
  (u_emb): Embedding(138493, 10)
  (m_emb): Embedding(26744, 10)
  (fc1): Linear(in_features=20, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=1, bias=True)
)

In [None]:
# Mean-squared-error objective for rating regression, minimised with Adam
# (library-default hyperparameters) over every parameter of the model.
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.MSELoss()


In [None]:
# Permute the three arrays in unison so every (user, movie, rating) triple stays aligned.
# A fixed random_state keeps the positional train/test split identical across runs.
user_ids, movie_ids, ratings = shuffle(user_ids, movie_ids, ratings, random_state=42)

In [None]:
# Embedding lookups require integer (int64) indices.
user_ids_tensor = torch.from_numpy(user_ids).long()
movie_ids_tensor = torch.from_numpy(movie_ids).long()
# Ratings are real-valued regression targets in half-star steps
# (e.g. 4.0 - 2.5 = 1.5). Casting them with .long() truncates every fractional
# value toward zero and corrupts the targets, so keep them as float32.
ratings_tensor = torch.from_numpy(ratings).float()

In [None]:
# Positional 80/20 split: the first Ntrain samples train the model,
# the remainder is held out for evaluation.
Ntrain = int(0.8 * len(ratings))
_columns = (user_ids_tensor, movie_ids_tensor, ratings_tensor)
train_dataset = torch.utils.data.TensorDataset(*[col[:Ntrain] for col in _columns])
test_dataset = torch.utils.data.TensorDataset(*[col[Ntrain:] for col in _columns])

In [None]:
batch_size = 512
# Training batches are reshuffled every epoch; evaluation order stays fixed.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
def batch_gd(model, criterion, optimizer, train_iter, test_iter, epochs, device=None):
  """Train `model` with mini-batch gradient descent and record per-epoch losses.

  Args:
    model: torch module called as `model(users, movies)`.
    criterion: loss called as `criterion(outputs, targets)`.
    optimizer: optimizer that updates the parameters of `model`.
    train_iter: iterable of `(users, movies, targets)` training batches.
    test_iter: iterable of `(users, movies, targets)` evaluation batches.
    epochs: number of full passes over `train_iter`.
    device: device every batch is moved to. Defaults to the device of the
      model's first parameter (CPU when the model has no parameters).

  Returns:
    `(train_losses, test_losses)`: two NumPy arrays of length `epochs`, each
    entry being the mean of the per-batch losses of that epoch.
  """
  if device is None:
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

  train_losses = np.zeros(epochs)
  test_losses = np.zeros(epochs)

  for it in range(epochs):
    t0 = datetime.now()

    model.train()
    train_loss = []
    # Iterate over the loader that was passed in, not a notebook-level global.
    for users, movies, targets in train_iter:
      # Targets are reshaped into a float column of shape (batch, 1).
      targets = targets.view(-1, 1).float()
      users, movies, targets = users.to(device), movies.to(device), targets.to(device)

      optimizer.zero_grad()
      outputs = model(users, movies)
      loss = criterion(outputs, targets)
      loss.backward()
      optimizer.step()

      train_loss.append(loss.item())

    # Mean of batch losses recorded while the weights were still changing, so
    # this is not the loss of the end-of-epoch model over the whole train set.
    train_loss = np.mean(train_loss)

    # Evaluation needs no gradients; disabling autograd saves memory and time.
    model.eval()
    test_loss = []
    with torch.no_grad():
      for users, movies, targets in test_iter:
        users, movies, targets = users.to(device), movies.to(device), targets.to(device)
        targets = targets.view(-1, 1).float()
        outputs = model(users, movies)
        test_loss.append(criterion(outputs, targets).item())
    test_loss = np.mean(test_loss)

    # Save losses
    train_losses[it] = train_loss
    test_losses[it] = test_loss

    dt = datetime.now() - t0
    print(f'Epoch {it+1}/{epochs}, Train Loss: {train_loss:.4f}, \
      Test Loss: {test_loss:.4f}, Duration: {dt}')

  return train_losses, test_losses

In [None]:
# Run the full training loop for 25 epochs, keeping the per-epoch loss curves.
train_losses, test_losses = batch_gd(
    model,
    criterion,
    optimizer,
    train_iter=train_loader,
    test_iter=test_loader,
    epochs=25,
)

Epoch 1/25, Train Loss: 0.6010,       Test Loss: 0.5578, Duration: 0:04:54.737671
Epoch 2/25, Train Loss: 0.5491,       Test Loss: 0.5492, Duration: 0:05:09.195166
Epoch 3/25, Train Loss: 0.5422,       Test Loss: 0.5449, Duration: 0:04:54.061429
Epoch 4/25, Train Loss: 0.5354,       Test Loss: 0.5380, Duration: 0:04:53.030106
Epoch 5/25, Train Loss: 0.5265,       Test Loss: 0.5309, Duration: 0:04:52.317402
Epoch 6/25, Train Loss: 0.5180,       Test Loss: 0.5254, Duration: 0:04:55.058160
Epoch 7/25, Train Loss: 0.5119,       Test Loss: 0.5219, Duration: 0:04:53.664287
Epoch 8/25, Train Loss: 0.5076,       Test Loss: 0.5193, Duration: 0:04:53.095391
Epoch 9/25, Train Loss: 0.5043,       Test Loss: 0.5182, Duration: 0:04:54.063556


In [None]:
# Plot the per-epoch loss curves returned by training.
plt.plot(train_losses, label='Train loss')
# `label` must be passed by keyword: a second positional string is parsed by
# matplotlib as a line-format spec, and 'Test loss' is not a valid one.
plt.plot(test_losses, label='Test loss')
plt.xlabel('Epoch')
plt.ylabel('MSE loss')
plt.legend()
plt.show()