<a href="https://colab.research.google.com/github/Sharjeeyl/Machine_Learning_Projects/blob/main/Recommendation_system_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# U Can Directly Upload Data Set By Putting Link Like This ...!

! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0   581k      0  0:00:01  0:00:01 --:--:--  581k


In [2]:
import zipfile
with zipfile.ZipFile('ml-latest-small.zip', 'r') as zip_ref:
    zip_ref.extractall('data')

In [4]:

import pandas as pd
movies_df = pd.read_csv('data/ml-latest-small/movies.csv')
ratings_df = pd.read_csv('data/ml-latest-small/ratings.csv')

In [5]:
print('The dimensions of movies dataframe are:', movies_df.shape,'\nThe dimensions of ratings dataframe are:', ratings_df.shape)

The dimensions of movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [6]:

movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:

ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:

movie_names = movies_df.set_index('movieId')['title'].to_dict()
n_users = len(ratings_df.userId.unique())
n_items = len(ratings_df.movieId.unique())
print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", n_users*n_items, 'elements.')
print('----------')
print("Number of ratings:", len(ratings_df))
print("Therefore: ", len(ratings_df) / (n_users*n_items) * 100, '% of the matrix is filled.')
print("We have an incredibly sparse matrix to work with here.")
print("And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2")
print("You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.")
print("One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data")

Number of unique users: 610
Number of unique movies: 9724
The full rating matrix will have: 5931640 elements.
----------
Number of ratings: 100836
Therefore:  1.6999683055613624 % of the matrix is filled.
We have an incredibly sparse matrix to work with here.
And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2
You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.
One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data


In [10]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data)
        users, items = data[:,0], data[:,1]
        return (self.user_factors(users)*self.item_factors(items)).sum(1)

    def predict(self, user, item):
        return self.forward(user, item)

In [12]:

from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
class Loader(Dataset):
    def __init__(self):
        self.ratings = ratings_df.copy()
        users = ratings_df.userId.unique()
        movies = ratings_df.movieId.unique()
        self.userid2idx = {o:i for i,o in enumerate(users)}
        self.movieid2idx = {o:i for i,o in enumerate(movies)}

        self.idx2userid = {i:o for o,i in self.userid2idx.items()}
        self.idx2movieid = {i:o for o,i in self.movieid2idx.items()}
        self.ratings.movieId = ratings_df.movieId.apply(lambda x: self.movieid2idx[x])
        self.ratings.userId = ratings_df.userId.apply(lambda x: self.userid2idx[x])


        self.x = self.ratings.drop(['rating', 'timestamp'], axis=1).values
        self.y = self.ratings['rating'].values
        self.x, self.y = torch.tensor(self.x), torch.tensor(self.y)

    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    def __len__(self):
        return len(self.ratings)

In [13]:
num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
if cuda:
    model = model.cuda()

loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
train_set = Loader()
train_loader = DataLoader(train_set, 128, shuffle=True)

Is running on GPU: False
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0427, 0.0273, 0.0484,  ..., 0.0224, 0.0360, 0.0407],
        [0.0286, 0.0325, 0.0199,  ..., 0.0215, 0.0110, 0.0381],
        [0.0131, 0.0452, 0.0161,  ..., 0.0286, 0.0231, 0.0006],
        ...,
        [0.0348, 0.0411, 0.0018,  ..., 0.0308, 0.0201, 0.0209],
        [0.0015, 0.0181, 0.0184,  ..., 0.0286, 0.0481, 0.0338],
        [0.0373, 0.0246, 0.0403,  ..., 0.0025, 0.0207, 0.0026]])
item_factors.weight tensor([[0.0246, 0.0116, 0.0344,  ..., 0.0074, 0.0152, 0.0015],
        [0.0107, 0.0269, 0.0048,  ..., 0.0469, 0.0145, 0.0421],
        [0.0225, 0.0286, 0.0399,  ..., 0.0446, 0.0009, 0.0186],
        ...,
        [0.0083, 0.0029, 0.0112,  ..., 0.0082, 0.0391, 0.0487],
        [0.0076, 0.0070, 0.0393,  ..., 0.0183, 0.0084, 0.0309],
        [0.0435, 0.0404, 0.0127,  ..., 0.0426, 0.0006, 0.0337]])


In [14]:
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
         if cuda:
            x, y = x.cuda(), y.cuda()
            optimizer.zero_grad()
            outputs = model(x)
            loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
            losses.append(loss.item())
            loss.backward()
            optimizer.step()
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

ZeroDivisionError: ignored

In [15]:
c = 0
uw = 0
iw = 0
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
        if c == 0:
          uw = param.data
          c +=1
        else:
          iw = param.data
        #print('param_data', param_data)

user_factors.weight tensor([[0.0427, 0.0273, 0.0484,  ..., 0.0224, 0.0360, 0.0407],
        [0.0286, 0.0325, 0.0199,  ..., 0.0215, 0.0110, 0.0381],
        [0.0131, 0.0452, 0.0161,  ..., 0.0286, 0.0231, 0.0006],
        ...,
        [0.0348, 0.0411, 0.0018,  ..., 0.0308, 0.0201, 0.0209],
        [0.0015, 0.0181, 0.0184,  ..., 0.0286, 0.0481, 0.0338],
        [0.0373, 0.0246, 0.0403,  ..., 0.0025, 0.0207, 0.0026]])
item_factors.weight tensor([[0.0246, 0.0116, 0.0344,  ..., 0.0074, 0.0152, 0.0015],
        [0.0107, 0.0269, 0.0048,  ..., 0.0469, 0.0145, 0.0421],
        [0.0225, 0.0286, 0.0399,  ..., 0.0446, 0.0009, 0.0186],
        ...,
        [0.0083, 0.0029, 0.0112,  ..., 0.0082, 0.0391, 0.0487],
        [0.0076, 0.0070, 0.0393,  ..., 0.0183, 0.0084, 0.0309],
        [0.0435, 0.0404, 0.0127,  ..., 0.0426, 0.0006, 0.0337]])


In [16]:
trained_movie_embeddings = model.item_factors.weight.data.cpu().numpy()

In [17]:
len(trained_movie_embeddings) # unique movie factor weights

9724

In [18]:
from sklearn.cluster import KMeans
# Fit the clusters based on the movie weights
kmeans = KMeans(n_clusters=10, random_state=0).fit(trained_movie_embeddings)



In [20]:
for cluster in range(10):
  print("Cluster #{}".format(cluster))
  movs = []
  for movidx in np.where(kmeans.labels_ == cluster)[0]:
    movid = train_set.idx2movieid[movidx]
    rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
    movs.append((movie_names[movid], rat_count))
  for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
    print("\t", mov[0])

Cluster #0
	 American Beauty (1999)
	 Seven (a.k.a. Se7en) (1995)
	 Godfather, The (1972)
	 One Flew Over the Cuckoo's Nest (1975)
	 Terminator, The (1984)
	 Goodfellas (1990)
	 Jumanji (1995)
	 Shining, The (1980)
	 2001: A Space Odyssey (1968)
	 Star Trek: Generations (1994)
Cluster #1
	 Shawshank Redemption, The (1994)
	 Batman (1989)
	 Batman Forever (1995)
	 Beautiful Mind, A (2001)
	 Home Alone (1990)
	 Kill Bill: Vol. 2 (2004)
	 Apocalypse Now (1979)
	 Taxi Driver (1976)
	 Four Weddings and a Funeral (1994)
	 Firm, The (1993)
Cluster #2
	 Forrest Gump (1994)
	 Jurassic Park (1993)
	 Lord of the Rings: The Two Towers, The (2002)
	 Aladdin (1992)
	 Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
	 Speed (1994)
	 Ace Ventura: Pet Detective (1994)
	 Beauty and the Beast (1991)
	 Stargate (1994)
	 Titanic (1997)
Cluster #3
	 Pulp Fiction (1994)
	 Independence Day (a.k.a. ID4) (1996)
	 Apollo 13 (1995)
	 Saving Private Ryan (1998)
	 True Lies (1994)
	 Men in Black (a.k.a. MIB) (1997)
	 Danc