In [49]:
# from google.colab import drive
# drive.mount('/content/drive')

In [50]:
import pandas as pd

url='https://drive.google.com/file/d/1Fy2L9YRBiCT0738yIhpoJVOSXmD4Gt0I/view?usp=drive_link'
url='https://drive.google.com/uc?id=' + url.split('/')[-2]

ratings = pd.read_csv(url, sep='\t', encoding='latin-1',
                      usecols=['user_id', 'movie_id', 'user_emb_id', 'movie_emb_id', 'rating'])


url = 'https://drive.google.com/file/d/1mItKOI2bO7DRhrZqiF2WBsarc34vZqeM/view?usp=drive_link'
url='https://drive.google.com/uc?id=' + url.split('/')[-2]
movies = pd.read_csv(url, sep='\t', encoding='latin-1',
                     usecols=['movie_id', 'title', 'genres'])

In [51]:
movies.head(5)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [52]:
ratings.head(5)

Unnamed: 0,user_id,movie_id,rating,user_emb_id,movie_emb_id
0,1,1193,5,0,1192
1,1,661,3,0,660
2,1,914,3,0,913
3,1,3408,4,0,3407
4,1,2355,5,0,2354


### Content Based

**Objective**: Build a Content-Based Recommendation system that computes similarity between movies based on movie genres. It will suggest movies that are most similar to a particular movie based on its genre.

**Dataset**:
- **Movies Data**: DataFrame with columns `movie_id`, `title`, and `genres`.

**Steps**:

1. **Preprocess Data**:
   - Split the `genres` column into separate genre strings.
   - Fill any missing values in the `genres` column and convert it to string.

2. **Compute TF-IDF Matrix**:
   - Use `TfidfVectorizer` to transform the genres into a TF-IDF matrix.

3. **Calculate Cosine Similarity**:
   - Compute cosine similarity between the TF-IDF matrices of movies.

4. **Recommend Movies**:
   - Create a function that gets movie recommendations based on the cosine similarity score of movie genres.

In [53]:
movies['genres'] = movies['genres'].str.split('|')
movies['genres'] = movies['genres'].fillna("").astype('str')

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genres'])

In [55]:
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [56]:
def genre_recommendations(title, n=10):
    # print(movies.loc[movies['title'].isin([title])].index)
    idx = movies.loc[movies['title'].isin([title])].index[0]

    scores = list(enumerate(cosine_sim[idx]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)[1:(n+1)]#except itself of course
    # print(scores)
    movie_indices = [i[0] for i in scores]
    return movies['title'].iloc[movie_indices]

In [57]:
# Example
genre_recommendations('GoldenEye (1995)')

345     Clear and Present Danger (1994)
543           Surviving the Game (1994)
724                    Rock, The (1996)
788                     Daylight (1996)
825               Chain Reaction (1996)
978                 Maximum Risk (1996)
1467                    Anaconda (1997)
1513                     Con Air (1997)
1693                   Firestorm (1998)
3686          Perfect Storm, The (2000)
Name: title, dtype: object


### Collaborative Filtering

**Objective**: Implement a collaborative filtering movie recommendation system to recommend movies to a user based on similar users' spreferences.

**Dataset**:

- **Ratings Data**: DataFrame with columns `user_id`, `movie_id`, and `rating`.
- **Movies Data**: DataFrame with columns `movie_id`, `title`, and `genres`.

**Steps**:

1. **Preprocess Data**:
   - Merge `ratings` and `movies` DataFrames on `movie_id`.
   - Split `genres` column into separate rows.

2. **Create User-Genre Matrix**:
   - Create a matrix where rows are users and columns are genres.
   - Each cell represents the ratio of movies watched by the user in that genre to the total movies watched by the user.

3. **Calculate User Similarity**:
   - Use cosine similarity to calculate the similarity between users.

4. **Find Top N Similar Users**:
   - Find the top N users with the most similar preferences to the given user.

5. **Recommend Movies**:
   - Recommend movies that similar users have watched but the target user has not.
   - Return titles and genres of the top 10 recommended movies.

In [58]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


url='https://drive.google.com/file/d/1Fy2L9YRBiCT0738yIhpoJVOSXmD4Gt0I/view?usp=drive_link'
url='https://drive.google.com/uc?id=' + url.split('/')[-2]

ratings = pd.read_csv(url, sep='\t', encoding='latin-1',
                      usecols=['user_id', 'movie_id', 'user_emb_id', 'movie_emb_id', 'rating'])


url = 'https://drive.google.com/file/d/1mItKOI2bO7DRhrZqiF2WBsarc34vZqeM/view?usp=drive_link'
url='https://drive.google.com/uc?id=' + url.split('/')[-2]
movies = pd.read_csv(url, sep='\t', encoding='latin-1',
                     usecols=['movie_id', 'title', 'genres'])

In [59]:
data = pd.merge(ratings, movies, on='movie_id')

In [60]:
data.head(-5)

Unnamed: 0,user_id,movie_id,rating,user_emb_id,movie_emb_id,title,genres
0,1,1193,5,0,1192,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,1,1192,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,11,1192,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,14,1192,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,16,1192,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...,...,...
1000199,5334,3382,5,5333,3381,Song of Freedom (1936),Drama
1000200,5420,1843,3,5419,1842,Slappy and the Stinkers (1998),Children's|Comedy
1000201,5433,286,3,5432,285,Nemesis 2: Nebula (1995),Action|Sci-Fi|Thriller
1000202,5494,3530,4,5493,3529,Smoking/No Smoking (1993),Comedy


In [61]:
data = data.assign(genres=data['genres'].str.split('|')).explode('genres')

In [62]:
data.head(-5)

Unnamed: 0,user_id,movie_id,rating,user_emb_id,movie_emb_id,title,genres
0,1,1193,5,0,1192,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,1,1192,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,11,1192,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,14,1192,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,16,1192,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...,...,...
1000201,5433,286,3,5432,285,Nemesis 2: Nebula (1995),Thriller
1000202,5494,3530,4,5493,3529,Smoking/No Smoking (1993),Comedy
1000203,5556,2198,3,5555,2197,Modulations (1998),Documentary
1000204,5949,2198,5,5948,2197,Modulations (1998),Documentary


In [63]:
def create_user_genre_matrix():
    # print(data.groupby('user_id').apply(lambda x: pd.get_dummies(x['genres']).sum() / x['genres'].count()))
    user_seen = data.groupby('user_id').apply(lambda x: pd.get_dummies(x['genres']).sum() / x['genres'].count())
    user_seen = user_seen.fillna(0)
    genres = np.unique(data['genres'])
    genre2idx = {genre: i for i, genre in enumerate(genres)}
    matrix = np.zeros((len(user_seen), len(genres)))
    for uu, value in user_seen.items():
        id, genre = uu
        matrix[id-1, genre2idx[genre]] = value
    return matrix

def get_top_n_similar_users(user_id, n=5):
    matrix = create_user_genre_matrix()
    user = matrix[user_id-1]
    similarity = cosine_similarity(matrix, [user])
    similar_users = np.argsort(similarity[:,0])[-n-1:-1]
    return similar_users,similarity[similar_users]

# a = create_user_genre_matrix()
a = get_top_n_similar_users(1,5)
a

(array([4882, 5342, 4766, 1480, 4455]),
 array([[0.95302272],
        [0.95900986],
        [0.96603473],
        [0.97166256],
        [0.97180458]]))

In [64]:
def recommend_movies(user_id, n_similar_users=5, n_recommendations=10):
    similar_users,similarity = get_top_n_similar_users(user_id, n_similar_users)
    user_data = data[data['user_id'] == user_id]
    user_movies = set(user_data['title'])

    scores = {}
    for i in range(len(similar_users)):
        similar_user_data = data[data['user_id'] == similar_users[i]+1]
        similar_user_movies = set(similar_user_data['title'])
        for movie in similar_user_movies:
            if movie not in user_movies:
                if movie not in scores:
                    scores[movie] = 0
                scores[movie] += similarity[i][0]

    sorted_mov = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    ans = []
    for movie, _ in sorted_mov[:n_recommendations]:
        ans.append(movie)
    ans = pd.merge(movies, pd.DataFrame(ans, columns=['title']), on='title')
    return ans


In [65]:
# Example
user_id = 1
recommended_movies = recommend_movies(user_id, n_similar_users=5, n_recommendations=10)

In [66]:
recommended_movies

Unnamed: 0,movie_id,title,genres
0,34,Babe (1995),Children's|Comedy|Drama
1,593,"Silence of the Lambs, The (1991)",Drama|Thriller
2,912,Casablanca (1942),Drama|Romance|War
3,1032,Alice in Wonderland (1951),Animation|Children's|Musical
4,1617,L.A. Confidential (1997),Crime|Film-Noir|Mystery|Thriller
5,2080,Lady and the Tramp (1955),Animation|Children's|Comedy|Musical|Romance
6,2081,"Little Mermaid, The (1989)",Animation|Children's|Comedy|Musical|Romance
7,2087,Peter Pan (1953),Animation|Children's|Fantasy|Musical
8,2096,Sleeping Beauty (1959),Animation|Children's|Musical
9,2858,American Beauty (1999),Comedy|Drama


### SVD (Singular Value Decomposition)



**Objective**: Implement an SVD-based recommendation system to recommend movies to users by decomposing the user-item interaction matrix into latent factors.

**Dataset**:
- **Ratings Data**: DataFrame with columns `user_id`, `movie_id`, and `rating`.
- **Movies Data**: DataFrame with columns `movie_id`, `title`, and `genres`.

**Steps**:

1. **Preprocess Data**:
   - Merge `ratings` and `movies` DataFrames on `movie_id`.
   - Create a user-item matrix where rows represent users, columns represent movies, and the values are the ratings.

2. **Decompose Matrix using SVD**:
   - Apply Singular Value Decomposition (SVD) to decompose the user-item matrix into three matrices: $U$, $\Sigma$, and $V^T$.

3. **Reconstruct Matrix**:
   - Reconstruct the user-item matrix using the top $k$ singular values to reduce dimensionality.

4. **Predict Ratings**:
   - Use the reconstructed matrix to predict ratings for all user-item pairs.

5. **Recommend Movies**:
   - Recommend the top 20 movies with the highest predicted ratings for a given user that the user hasn't rated yet.

In [67]:
n_users = ratings.user_id.unique().shape[0]
n_movies = ratings.movie_id.unique().shape[0]
print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_movies))

Number of users = 6040 | Number of movies = 3706


Fill na elements.

In [68]:
Ratings = ratings.pivot(index = 'user_id', columns ='movie_id', values = 'rating').fillna(0)
Ratings.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [69]:
from scipy.sparse import linalg
U, sigma, Vt = linalg.svds(Ratings.to_numpy(), k = 70)
sigma = np.diag(sigma)

In [70]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt)

In [71]:
preds =  pd.DataFrame(all_user_predicted_ratings, columns = Ratings.columns)
preds.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,4.743602,0.167959,0.00164,-0.000159,0.025062,0.093984,-0.154082,0.053694,-0.027952,-0.29932,...,-0.032602,-0.00188,0.028051,-0.0676,-0.067637,0.282302,-0.047235,-0.006398,0.066155,-0.14911
1,0.763631,0.236065,0.30725,-0.02099,-0.007528,1.191585,-0.031608,0.082107,0.141452,1.544303,...,-0.050852,-0.015627,-0.018653,0.07985,-0.023448,-0.065683,-0.351778,-0.065214,-0.039742,-0.208568
2,2.224023,0.408733,0.210549,-0.02083,-0.050027,-0.067233,-0.08769,0.104219,0.020806,0.892847,...,0.025895,0.000487,0.030151,0.064626,0.053306,-0.081226,0.102971,0.045691,0.03791,-0.237834
3,0.301018,-0.060284,0.075356,0.060072,0.019216,0.224341,-0.067389,-0.014056,0.073041,-0.366223,...,-0.014023,0.004538,0.012864,-0.012172,-0.049043,0.112146,-0.0234,-0.048382,0.009136,-0.069091
4,0.994341,-0.125499,-0.150633,0.166182,-0.092697,1.28186,-0.208535,-0.020675,-0.179887,0.330743,...,-0.028078,0.015781,-0.04122,-0.048106,-0.067668,-0.542868,0.295089,-0.019128,0.077952,-0.140831


In [72]:
def recommend_movies(predictions, userID, movies, original_ratings, num_recommendations):

    user_row_number = userID - 1 # User ID starts at 1, not 0
    sorted_user_predictions = predictions.iloc[user_row_number].sort_values(ascending=False)

    user_data =  original_ratings[original_ratings.user_id == (userID)]

    # merfe movies data with user_data
    user_full =  user_data.merge(movies, how = 'left', left_on = 'movie_id', right_on = 'movie_id').sort_values(['rating'], ascending=False)



    print(f'User {userID} has already rated {user_full.shape[0]} movies.')
    print(f'Recommending highest {num_recommendations} predicted ratings movies not already rated.')

    recommendations = movies[~movies['movie_id'].isin(user_full['movie_id'])].\
                merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',left_on = 'movie_id',right_on = 'movie_id').\
                rename(columns = {user_row_number: 'Predictions'}).\
                sort_values('Predictions', ascending = False).\
                iloc[:num_recommendations, :-1]




    return user_full, recommendations

In [73]:
already_rated, predictions = recommend_movies(preds, 4375, movies, ratings, 20)

User 4375 has already rated 325 movies.
Recommending highest 20 predicted ratings movies not already rated.


In [74]:
predictions.head(20)

Unnamed: 0,movie_id,title,genres
470,539,Sleepless in Seattle (1993),Comedy|Romance
1165,1307,When Harry Met Sally... (1989),Comedy|Romance
921,1036,Die Hard (1988),Action|Thriller
1444,1645,"Devil's Advocate, The (1997)",Crime|Horror|Mystery|Thriller
1479,1682,"Truman Show, The (1998)",Drama
1220,1370,Die Hard 2 (1990),Action|Thriller
3073,3450,Grumpy Old Men (1993),Comedy
1428,1625,"Game, The (1997)",Mystery|Thriller
2885,3247,Sister Act (1992),Comedy|Crime
2754,3107,Backdraft (1991),Action|Drama


In [75]:
# Top 20 movies that User 4375 has rated
already_rated.head(20)

Unnamed: 0,user_id,movie_id,rating,user_emb_id,movie_emb_id,title,genres
195,4375,3250,5,4374,3249,Alive (1993),Drama
285,4375,175,5,4374,174,Kids (1995),Drama
199,4375,3257,5,4374,3256,"Bodyguard, The (1992)",Action|Drama|Romance|Thriller
65,4375,3809,5,4374,3808,What About Bob? (1991),Comedy
210,4375,2688,5,4374,2687,"General's Daughter, The (1999)",Drama|Thriller
270,4375,150,5,4374,149,Apollo 13 (1995),Drama
271,4375,2710,5,4374,2709,"Blair Witch Project, The (1999)",Horror
125,4375,349,5,4374,348,Clear and Present Danger (1994),Action|Adventure|Thriller
60,4375,288,5,4374,287,Natural Born Killers (1994),Action|Thriller
141,4375,47,5,4374,46,Seven (Se7en) (1995),Crime|Thriller


#### Evaluation

In [76]:
# ratings

In [77]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

train_data, test_data =  train_test_split(ratings, test_size=0.15)

train_ratings =  train_data.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)
R_train = train_ratings.values
# user_ratings_mean_train = # TODO
# Ratings_demeaned_train =

U_train, sigma_train, Vt_train = linalg.svds(R_train, k = 70)
sigma_train = np.diag(sigma_train)

all_user_predicted_ratings_train = np.dot(np.dot(U_train, sigma_train), Vt_train)
preds_train = pd.DataFrame(all_user_predicted_ratings_train, columns = train_ratings.columns)

def predict_rating(user_id, movie_id):
    user_row_number = user_id - 1
    movie_col_number = movie_id - 1
    if user_row_number >= len(preds_train) or movie_col_number >= len(preds_train.columns):
        return np.nan
    pred = preds_train.iloc[user_row_number, movie_col_number]

    return pred

test_data['predicted_rating'] = test_data.apply(lambda x: predict_rating(x['user_id'], x['movie_id']), axis=1)

test_data.dropna(inplace=True)

rmse = sqrt(mean_squared_error(test_data['rating'], test_data['predicted_rating']))
print(f'Root Mean Squared Error: {rmse}')


Root Mean Squared Error: 3.5007458480649065


### Nueral Network Model (Recommender Model)



**Objective**: Implement a Recommender model to recommend movies to a user based on similar users' preferences.

**Dataset**:
- **Ratings Data**: DataFrame with columns `user_id`, `movie_id`, and `rating`.
- **Movies Data**: DataFrame with columns `movie_id`, `title`, and `genres`.

**Steps**:

1. **Define Dataset and DataLoader**:
   - Create a custom PyTorch `Dataset` for ratings.
   - Create a DataLoader for batching and shuffling data.

2. **Define the Neural Network**:
   - Create a neural network with embedding layers for users and movies.

3. **Train the Model**:
   - Train the model using Mean Squared Error loss and Adam optimizer.
   - Save model checkpoints.

4. **Evaluate the Model**:
   - Calculate RMSE on the entire dataset.

5. **Predict Ratings for Unrated Movies**:
   - Predict and recommend top 10 unrated movies for a given user.

In [78]:
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn

url='https://drive.google.com/file/d/1Fy2L9YRBiCT0738yIhpoJVOSXmD4Gt0I/view?usp=drive_link'
url='https://drive.google.com/uc?id=' + url.split('/')[-2]

ratings = pd.read_csv(url, sep='\t', encoding='latin-1',
                      usecols=['user_id', 'movie_id', 'user_emb_id', 'movie_emb_id', 'rating'])


url = 'https://drive.google.com/file/d/1mItKOI2bO7DRhrZqiF2WBsarc34vZqeM/view?usp=drive_link'
url='https://drive.google.com/uc?id=' + url.split('/')[-2]
movies = pd.read_csv(url, sep='\t', encoding='latin-1',
                     usecols=['movie_id', 'title', 'genres'])
ratings

Unnamed: 0,user_id,movie_id,rating,user_emb_id,movie_emb_id
0,1,1193,5,0,1192
1,1,661,3,0,660
2,1,914,3,0,913
3,1,3408,4,0,3407
4,1,2355,5,0,2354
...,...,...,...,...,...
1000204,6040,1091,1,6039,1090
1000205,6040,1094,5,6039,1093
1000206,6040,562,5,6039,561
1000207,6040,1096,4,6039,1095


In [80]:
from tqdm import tqdm
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Define the dataset
class RatingsDataset(Dataset):
    def __init__(self, ratings):
        self.user_ids = torch.tensor(ratings['user_id'].values, dtype=torch.long).to(device)
        self.movie_ids = torch.tensor(ratings['movie_id'].values, dtype=torch.long).to(device)
        self.ratings = torch.tensor(ratings['rating'].values, dtype=torch.float32).to(device)



    def __len__(self):
        return len(self.movie_ids)

    def __getitem__(self, idx):
        return self.user_ids[idx], self.movie_ids[idx], self.ratings[idx]

# Define the neural network
class RecommenderNet(nn.Module):
    def __init__(self, num_users, num_movies, embedding_size=50):
        super(RecommenderNet, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.movie_embedding = nn.Embedding(num_movies, embedding_size)
        self.fc1 = nn.Linear(embedding_size*2, 128)
        self.ac1 = nn.ReLU()
        self.fc2 = nn.Linear(128, 128)
        self.ac2 = nn.Sigmoid()
        self.fc3 = nn.Linear(128, 1)
        self.dropout = nn.Dropout(0.2)

    def forward(self, user, movie):
        user_embedding = self.user_embedding(user)
        movie_embedding = self.movie_embedding(movie)
        x = torch.cat([user_embedding, movie_embedding], dim=1)
        x = self.fc1(x)
        x = self.ac1(x)
        x = self.fc2(x)
        x = self.ac2(x)
        x = self.dropout(x)
        x = self.fc3(x)
        return x

# Create the dataset and dataloader

ratings_dataset = RatingsDataset(ratings)
ratings_dataloader = DataLoader(ratings_dataset, batch_size=512, shuffle=True)



# Initialize the model, loss function and optimizer

num_users = ratings['user_id'].max()
num_movies = ratings['movie_id'].max()
model = RecommenderNet(num_users, num_movies, embedding_size=50).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop

epochs = 10
for epoch in tqdm(range(epochs)):
    model.train()
    total_loss = 0
    for i, (user, movie, rating) in tqdm(enumerate(ratings_dataloader)):
        optimizer.zero_grad()
        output = model(user - 1, movie - 1)
        loss = criterion(output, rating.view(-1, 1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'loss is {total_loss}')

torch.save(model.state_dict(), "nn")
# Evaluation

model.eval()
predictions = []
with torch.no_grad():
    for i, (user, movie, rating) in enumerate(ratings_dataloader):
        user, movie = user.to(device), movie.to(device)
        output = model(user - 1, movie - 1)
        predictions.extend(output.view(-1).tolist())


cpu


 10%|█         | 1/10 [00:26<03:54, 26.08s/it]

loss is 2328.2906981110573


 20%|██        | 2/10 [00:52<03:30, 26.30s/it]

loss is 1727.6977719664574


 30%|███       | 3/10 [01:18<03:02, 26.05s/it]

loss is 1661.3884144425392


 40%|████      | 4/10 [01:44<02:36, 26.04s/it]

loss is 1624.509674012661


 50%|█████     | 5/10 [02:10<02:11, 26.24s/it]

loss is 1595.8145369291306


 60%|██████    | 6/10 [02:37<01:45, 26.43s/it]

loss is 1573.9726125001907


 70%|███████   | 7/10 [03:03<01:18, 26.28s/it]

loss is 1555.3390833735466


 80%|████████  | 8/10 [03:31<00:53, 26.68s/it]

loss is 1539.1298478841782


 90%|█████████ | 9/10 [03:57<00:26, 26.40s/it]

loss is 1525.031451702118


100%|██████████| 10/10 [04:22<00:00, 26.30s/it]

loss is 1511.2896646857262





In [81]:
# Example

sample_user_id = 1
sample_user_data = ratings[ratings['user_id'] == sample_user_id]
sample_dataset = RatingsDataset(sample_user_data)
sample_dataloader = DataLoader(sample_dataset, batch_size=1, shuffle=False)

print(f'Sample predictions for user ID {sample_user_id}:')
with torch.no_grad():
    for user, movie, rating in sample_dataloader:
        output = model(user, movie).squeeze()
        print(f'Movie ID: {movie.item()}, Predicted Rating: {output.item()}, Actual Rating: {rating.item()}')

Sample predictions for user ID 1:
Movie ID: 1193, Predicted Rating: 3.5600037574768066, Actual Rating: 5.0
Movie ID: 661, Predicted Rating: 2.9800009727478027, Actual Rating: 3.0
Movie ID: 914, Predicted Rating: 3.9169363975524902, Actual Rating: 3.0
Movie ID: 3408, Predicted Rating: 3.1134514808654785, Actual Rating: 4.0
Movie ID: 2355, Predicted Rating: 3.3444581031799316, Actual Rating: 5.0
Movie ID: 1197, Predicted Rating: 4.383079528808594, Actual Rating: 3.0
Movie ID: 1287, Predicted Rating: 4.118558883666992, Actual Rating: 5.0
Movie ID: 2804, Predicted Rating: 2.9065628051757812, Actual Rating: 5.0
Movie ID: 594, Predicted Rating: 3.8687565326690674, Actual Rating: 4.0
Movie ID: 919, Predicted Rating: 3.796388626098633, Actual Rating: 4.0
Movie ID: 595, Predicted Rating: 3.6295642852783203, Actual Rating: 5.0
Movie ID: 938, Predicted Rating: 2.954005479812622, Actual Rating: 4.0
Movie ID: 2398, Predicted Rating: 2.7288904190063477, Actual Rating: 4.0
Movie ID: 2918, Predicted R

In [83]:
def predict_unrated_movies(user_id, model, ratings, movies):
    user_data = ratings[ratings['user_id'] == user_id]
    user_movies = set(user_data['movie_id'])
    unrated_movies = movies[~movies['movie_id'].isin(user_movies)]
    unrated_movie_ids = unrated_movies['movie_id'].values
    user_ids = torch.tensor([user_id]*len(unrated_movie_ids), dtype=torch.long)
    movie_ids = torch.tensor(unrated_movie_ids, dtype=torch.long)
    with torch.no_grad():
        ans = model(user_ids-1, movie_ids-1).squeeze().tolist()
    ans = pd.DataFrame({'movie_id': unrated_movie_ids, 'predicted_rating': ans})
    ans = pd.merge(movies, ans, on='movie_id')
    return ans.sort_values('predicted_rating', ascending=False)[0:10]

# Example
user_id = 1
predictions = predict_unrated_movies(user_id, model, ratings, movies)
print(predictions)

      movie_id                                              title  \
1914      2019  Seven Samurai (The Magnificent Seven) (Shichin...   
833        858                              Godfather, The (1972)   
2788      2905                                     Sanjuro (1962)   
1156      1198                     Raiders of the Lost Ark (1981)   
3313      3435                            Double Indemnity (1944)   
2202      2309         Inheritors, The (Die Siebtelbauern) (1998)   
878        904                                 Rear Window (1954)   
47          50                         Usual Suspects, The (1995)   
3186      3307                                 City Lights (1931)   
311        318                   Shawshank Redemption, The (1994)   

                    genres  predicted_rating  
1914          Action|Drama          4.556662  
833     Action|Crime|Drama          4.505462  
2788      Action|Adventure          4.499148  
1156      Action|Adventure          4.497676  
3313 

### GMM (Gaussian Mixture Model)

**Objective**: Use a Gaussian Mixture Model to analyze and cluster the click data based on the number of clicks from different locations, aiming to identify distinct patterns of user behavior across 10 countries.

**Dataset**:
- **Click Data**: DataFrame with columns `link_id`, `location`, and `number_of_clicks`.

**Steps**:

1. **Preprocess Data**:
   - Ensure the dataset contains 10 distinct countries.
   - Create a matrix where rows represent different links and columns represent the number of clicks from each country.
   - Normalize the number of clicks to account for different scales.

2. **Fit GMM**:
   - Apply a Gaussian Mixture Model (GMM) to the click data matrix to identify clusters of links with similar click patterns across different countries.

3. **Evaluate Model**:
   - Analyze the resulting model by calculating the log lokelihood, BIC and AIC metrics.


In [91]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt

file_path = 'https://drive.google.com/file/d/1mn1xdrqU6jeXhSuM4G2ooCL3LYHsiCnX/view?usp=drive_link'
file_path='https://drive.google.com/uc?id=' + file_path.split('/')[-2]
df = pd.read_csv(file_path)
# print(df.head())
# print(df.shape)

# print(df.iloc[:, 1:])
df = df.iloc[:, 1:]
df = (df - df.mean())/df.std()
print(df.columns)
print('do we have 10 country?', len(df.columns)==10)

gmm = GaussianMixture(n_components=10)
gmm.fit(df)
clusters = gmm.predict(df)
df['Cluster'] = clusters
print(df.head(10))


log_likelihood = gmm.score(df.iloc[:, :-1])
bic = gmm.bic(df.iloc[:, :-1])
aic = gmm.aic(df.iloc[:, :-1])

print(f"Log-Likelihood: {log_likelihood}")
print(f"BIC: {bic}")
print(f"AIC: {aic}")

Index(['United States', 'China', 'India', 'Japan', 'Brazil', 'Russia',
       'Indonesia', 'Germany', 'United Kingdom', 'France'],
      dtype='object')
do we have 10 country? True
   United States     China     India     Japan    Brazil    Russia  Indonesia  \
0       0.549942 -1.150906 -0.731604  0.891758  0.901695 -0.116030  -0.971418   
1      -0.392775 -1.312975 -0.578585  0.681981  0.414217  1.677038  -0.455322   
2       1.682610 -0.138875 -0.039541  0.819540 -0.844811  0.846715  -1.222849   
3       0.082101  0.667868  0.596878  0.561617 -0.132882 -1.103829   0.004532   
4       0.936878  0.531010 -0.700305  0.142064 -0.392403 -0.241295  -0.680287   
5       0.096171  1.564650  1.514991  0.468765  0.814019 -1.483200  -1.265857   
6       0.901702  0.484190 -0.700305 -1.690905  1.529456  0.979137   1.397329   
7      -1.761123  0.077217 -0.036064 -1.666832  0.533456  0.682082  -1.103750   
8      -0.463128 -1.680331  1.553246 -0.783018  1.301498 -0.538350  -1.159991   
9       0