# Importing Dataset

In [1]:
! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0  1573k      0 --:--:-- --:--:-- --:--:-- 1573k


In [2]:
import zipfile
with zipfile.ZipFile('ml-latest-small.zip', 'r') as zip_ref:
    zip_ref.extractall('data')

In [3]:
# import the dataset
import pandas as pd
movies_df = pd.read_csv('data/ml-latest-small/movies.csv')
ratings_df = pd.read_csv('data/ml-latest-small/ratings.csv')

# Data Exploration

In [4]:
print(f'The dimensions of movies dataframe are: {movies_df.shape}, \nThe dimensions of ratings dataframe are: {ratings_df.shape}')

The dimensions of movies dataframe are: (9742, 3), 
The dimensions of ratings dataframe are: (100836, 4)


In [5]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
movies_df.isna()

Unnamed: 0,movieId,title,genres
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
9737,False,False,False
9738,False,False,False
9739,False,False,False
9740,False,False,False


In [7]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [8]:
ratings_df.isna()

Unnamed: 0,userId,movieId,rating,timestamp
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
100831,False,False,False,False
100832,False,False,False,False
100833,False,False,False,False
100834,False,False,False,False


In [9]:
# Data Exploration
movie_names = movies_df.set_index('movieId')['title'].to_dict()
n_users = ratings_df.userId.nunique()
n_items = ratings_df.movieId.nunique()

In [10]:
print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", n_users*n_items, 'elements.')
print('----------')
print("Number of ratings:", len(ratings_df))
print("Therefore: ", len(ratings_df) / (n_users*n_items) * 100, '% of the matrix is filled.')

Number of unique users: 610
Number of unique movies: 9724
The full rating matrix will have: 5931640 elements.
----------
Number of ratings: 100836
Therefore:  1.6999683055613624 % of the matrix is filled.


In [11]:
print(movies_df.info())
print(movies_df.describe(include='all'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None
              movieId        title genres
count     9742.000000         9742   9742
unique            NaN         9737    951
top               NaN  Emma (1996)  Drama
freq              NaN            2   1053
mean     42200.353623          NaN    NaN
std      52160.494854          NaN    NaN
min          1.000000          NaN    NaN
25%       3248.250000          NaN    NaN
50%       7300.000000          NaN    NaN
75%      76232.000000          NaN    NaN
max     193609.000000          NaN    NaN


In [12]:
print(ratings_df.info())
print(ratings_df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None
              userId        movieId         rating     timestamp
count  100836.000000  100836.000000  100836.000000  1.008360e+05
mean      326.127564   19435.295718       3.501557  1.205946e+09
std       182.618491   35530.987199       1.042529  2.162610e+08
min         1.000000       1.000000       0.500000  8.281246e+08
25%       177.000000    1199.000000       3.000000  1.019124e+09
50%       325.000000    2991.000000       3.500000  1.186087e+09
75%       477.000000    8122.000000       4.000000  1.435994e+09
max       610.000000  193609.000000       5.000000  1.537799e+09


# Data Preprocessing

In [13]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # create user embeddings
        self.user_factors = torch.nn.Embedding(n_users, n_factors) # think of this as a lookup table for the input.
        # create item embeddings
        self.item_factors = torch.nn.Embedding(n_items, n_factors) # think of this as a lookup table for the input.
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data):
        # matrix multiplication
        users, items = data[:,0], data[:,1]
        return (self.user_factors(users)*self.item_factors(items)).sum(1)

    def predict(self, user, item):
        return self.forward(user, item)

In [14]:
# Creating the dataloader
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to machine learning readiness

class Loader(Dataset):
    def __init__(self):
        self.ratings = ratings_df.copy()

        # Extract all user IDs and movie IDs
        users = ratings_df.userId.unique()
        movies = ratings_df.movieId.unique()

        #--- Producing new continuous IDs for users and movies ---
        # MOAR Preprocessing

        # Unique values : index
        self.userid2idx = {o:i for i,o in enumerate(users)}
        self.movieid2idx = {o:i for i,o in enumerate(movies)}

        # Obtained continuous ID for users and movies
        self.idx2userid = {i:o for o,i in self.userid2idx.items()}
        self.idx2movieid = {i:o for o,i in self.movieid2idx.items()}

        # return the id from the indexed values as noted in the lambda function down below.
        self.ratings.movieId = ratings_df.movieId.apply(lambda x: self.movieid2idx[x])
        self.ratings.userId = ratings_df.userId.apply(lambda x: self.userid2idx[x])


        self.x = self.ratings.drop(['rating', 'timestamp'], axis=1).values
        self.y = self.ratings['rating'].values
        self.x, self.y = torch.tensor(self.x), torch.tensor(self.y) # Transforms the data to tensors (ready for torch models.)

    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    def __len__(self):
        return len(self.ratings)

# Model Training

In [15]:
num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
# GPU enable if you have a GPU...
if cuda:
    model = model.cuda()

# MSE loss
loss_fn = torch.nn.MSELoss()

# ADAM optimizier
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Train data
train_set = Loader()
train_loader = DataLoader(train_set, 128, shuffle=True)

Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0494, 0.0335, 0.0427,  ..., 0.0053, 0.0316, 0.0045],
        [0.0196, 0.0254, 0.0278,  ..., 0.0064, 0.0274, 0.0230],
        [0.0173, 0.0455, 0.0341,  ..., 0.0384, 0.0086, 0.0027],
        ...,
        [0.0036, 0.0051, 0.0098,  ..., 0.0450, 0.0114, 0.0099],
        [0.0262, 0.0390, 0.0086,  ..., 0.0432, 0.0366, 0.0183],
        [0.0418, 0.0028, 0.0362,  ..., 0.0398, 0.0168, 0.0042]])
item_factors.weight tensor([[0.0280, 0.0107, 0.0440,  ..., 0.0390, 0.0030, 0.0377],
        [0.0373, 0.0467, 0.0441,  ..., 0.0285, 0.0039, 0.0385],
        [0.0473, 0.0442, 0.0273,  ..., 0.0246, 0.0429, 0.0495],
        ...,
        [0.0189, 0.0187, 0.0062,  ..., 0.0480, 0.0014, 0.0427],
        [0.0359, 0.0239, 0.0423,  ..., 0.0422, 0.0212, 0.0378],
        [0.0226, 0.0096, 0.0100,  ..., 0.0158, 0.0351, 0.0057]])


In [16]:
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
         if cuda:
            x, y = x.cuda(), y.cuda()
            optimizer.zero_grad()
            outputs = model(x)
            loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
            losses.append(loss.item())
            loss.backward()
            optimizer.step()
    print("Epoch #{}".format(it), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

Epoch #0 Loss: 11.061016987423
Epoch #1 Loss: 4.743520342759069
Epoch #2 Loss: 2.4778885016889136
Epoch #3 Loss: 1.7226517197444353
Epoch #4 Loss: 1.3468413860513475
Epoch #5 Loss: 1.1287615453531294
Epoch #6 Loss: 0.9917157141085203
Epoch #7 Loss: 0.9005391771720751
Epoch #8 Loss: 0.8371959415637902
Epoch #9 Loss: 0.7920903134618313
Epoch #10 Loss: 0.7593078455949193
Epoch #11 Loss: 0.734612286128671
Epoch #12 Loss: 0.7158811609000724
Epoch #13 Loss: 0.7013988335271777
Epoch #14 Loss: 0.6904861052629306
Epoch #15 Loss: 0.6816292223213288
Epoch #16 Loss: 0.6750590224480871
Epoch #17 Loss: 0.6699755157764793
Epoch #18 Loss: 0.6659839819303623
Epoch #19 Loss: 0.6627460786231278
Epoch #20 Loss: 0.660652050852473
Epoch #21 Loss: 0.658967716553187
Epoch #22 Loss: 0.65772621133152
Epoch #23 Loss: 0.656946971672138
Epoch #24 Loss: 0.6561147556994772
Epoch #25 Loss: 0.6553136777454222
Epoch #26 Loss: 0.6546112554251845
Epoch #27 Loss: 0.6536693755291441
Epoch #28 Loss: 0.6531000785189232
Epoch

In [17]:
# By training the model, we will have tuned latent factors for movies and users.
c = 0
uw = 0
iw = 0
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
        if c == 0:
          uw = param.data
          c +=1
        else:
          iw = param.data
        #print('param_data', param_data)

user_factors.weight tensor([[ 1.7437e+00,  1.7488e+00,  1.1555e+00,  ...,  8.2583e-01,
          1.0780e+00,  1.0644e+00],
        [ 1.6159e+00,  7.0294e-04,  9.8661e-01,  ...,  1.6419e-01,
          1.2086e+00,  1.2470e+00],
        [ 4.0453e-01, -1.4769e+00,  2.7120e+00,  ...,  1.1742e+00,
         -1.8746e+00,  7.8614e-01],
        ...,
        [ 2.6087e-01,  1.1377e+00,  2.3963e+00,  ...,  3.3846e-01,
         -5.6919e-01,  1.7430e+00],
        [ 1.5734e+00,  7.0043e-01,  9.3011e-01,  ...,  4.5302e-01,
          1.2231e+00,  9.3798e-01],
        [ 1.5248e+00,  4.7867e-01,  1.6314e+00,  ...,  1.6817e+00,
          9.0291e-01,  1.2074e+00]], device='cuda:0')
item_factors.weight tensor([[0.3062, 0.5244, 0.0881,  ..., 0.5831, 0.5823, 0.7263],
        [1.0825, 0.0021, 0.2272,  ..., 0.0793, 0.0657, 0.5188],
        [0.4018, 0.3235, 0.6829,  ..., 0.5246, 0.5173, 0.8388],
        ...,
        [0.3592, 0.3479, 0.3453,  ..., 0.3878, 0.3379, 0.3809],
        [0.4197, 0.4007, 0.4239,  ..., 0.4

In [18]:
trained_movie_embeddings = model.item_factors.weight.data.cpu().numpy()

In [19]:
len(trained_movie_embeddings)

9724

In [20]:
from sklearn.cluster import KMeans
# Fit the clusters based on the movie weights
kmeans = KMeans(n_clusters=10, random_state=0).fit(trained_movie_embeddings)



In [21]:
for cluster in range(10):
  print("Cluster #{}".format(cluster))
  movs = []
  for movidx in np.where(kmeans.labels_ == cluster)[0]:
    movid = train_set.idx2movieid[movidx]
    rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
    movs.append((movie_names[movid], rat_count))
  for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
    print("\t", mov[0])

Cluster #0
	 Forrest Gump (1994)
	 Shawshank Redemption, The (1994)
	 Silence of the Lambs, The (1991)
	 Matrix, The (1999)
	 Star Wars: Episode IV - A New Hope (1977)
	 Braveheart (1995)
	 Schindler's List (1993)
	 Star Wars: Episode V - The Empire Strikes Back (1980)
	 Usual Suspects, The (1995)
	 Seven (a.k.a. Se7en) (1995)
Cluster #1
	 Jurassic Park (1993)
	 Terminator 2: Judgment Day (1991)
	 Apollo 13 (1995)
	 Star Wars: Episode VI - Return of the Jedi (1983)
	 Fugitive, The (1993)
	 Batman (1989)
	 Sixth Sense, The (1999)
	 True Lies (1994)
	 Lion King, The (1994)
	 Back to the Future (1985)
Cluster #2
	 Toy Story (1995)
	 Aladdin (1992)
	 E.T. the Extra-Terrestrial (1982)
	 Toy Story 2 (1999)
	 Wizard of Oz, The (1939)
	 Jaws (1975)
	 Airplane! (1980)
	 Jerry Maguire (1996)
	 Pan's Labyrinth (Laberinto del fauno, El) (2006)
	 Chicken Run (2000)
Cluster #3
	 Dances with Wolves (1990)
	 Monsters, Inc. (2001)
	 Babe (1995)
	 Incredibles, The (2004)
	 Net, The (1995)
	 Interview wi