<a href="https://colab.research.google.com/github/berkayguzel06/ML_Movie_Recommendation/blob/main/warehouse_model_gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip3 install torch scikit-learn pandas numpy



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

# Load the datasets
movies_df = pd.read_csv("movie.csv")
user_movies_df = pd.read_csv("user_movies.csv")

# --- Model 1: Content-Based Recommendation ---
# Preprocess genres
def preprocess_genres(movies_df):
    mlb = MultiLabelBinarizer()
    genre_data = movies_df['genres'].str.split('|')
    genre_matrix = mlb.fit_transform(genre_data)
    return genre_matrix, mlb

genre_matrix, genre_encoder = preprocess_genres(movies_df)

# Define Autoencoder for genre embeddings
class GenreAutoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(GenreAutoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, input_dim),
            nn.Sigmoid(),
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

# Instantiate and train the autoencoder
input_dim = genre_matrix.shape[1]
print(input_dim)
hidden_dim = 16
autoencoder = GenreAutoencoder(input_dim, hidden_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.01)

def train_autoencoder(autoencoder, data, epochs=50):
    for epoch in range(epochs):
        data_tensor = torch.FloatTensor(data)
        optimizer.zero_grad()
        _, reconstructed = autoencoder(data_tensor)
        loss = criterion(reconstructed, data_tensor)
        loss.backward()
        optimizer.step()
        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch + 1}, Loss: {loss.item():.4f}")

data = genre_matrix
train_autoencoder(autoencoder, data)

# Use the trained encoder to calculate similarities
data_tensor = torch.FloatTensor(data)
with torch.no_grad():
    embeddings, _ = autoencoder(data_tensor)
embeddings = embeddings.numpy()
similarity_matrix = cosine_similarity(embeddings)

20
Epoch 10, Loss: 0.1949
Epoch 20, Loss: 0.1060
Epoch 30, Loss: 0.0766
Epoch 40, Loss: 0.0691
Epoch 50, Loss: 0.0622


In [None]:
# Example: Recommend movies similar to movieId=1
movie_idx = 24  # Toy Story (1995)
similar_movies = similarity_matrix[movie_idx].argsort()[::-1][1:6]
print("Recommended movies for Toy Story (1995):")
# Print with movie genre
for movie_id in similar_movies:
    print(movies_df.iloc[movie_id]['title'])
    print(movies_df.iloc[movie_id]['genres'])
    print()

Recommended movies for Toy Story (1995):
Once Upon a Time... When We Were Colored (1995)
Drama|Romance

How to Make an American Quilt (1995)
Drama|Romance

Reconstruction (2003)
Drama|Romance

Plain Dirty (a.k.a. Briar Patch) (2003)
Drama|Romance

Panic in Needle Park, The (1971)
Drama|Romance



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

# Load the datasets
movies_df = pd.read_csv("movie.csv")
user_movies_df = pd.read_csv("user_movies.csv")

# --- Model 1: Content-Based Recommendation ---
# Preprocess genres
def preprocess_genres(movies_df):
    mlb = MultiLabelBinarizer()
    genre_data = movies_df['genres'].str.split('|')
    genre_matrix = mlb.fit_transform(genre_data)
    return genre_matrix, mlb

genre_matrix, genre_encoder = preprocess_genres(movies_df)

# Define Autoencoder for genre embeddings
class GenreAutoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(GenreAutoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, input_dim),
            nn.Sigmoid(),
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

# Instantiate and train the autoencoder
input_dim = genre_matrix.shape[1]
hidden_dim = 16
autoencoder = GenreAutoencoder(input_dim, hidden_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.01)

def train_autoencoder(autoencoder, data, epochs=50):
    for epoch in range(epochs):
        data_tensor = torch.FloatTensor(data)
        optimizer.zero_grad()
        _, reconstructed = autoencoder(data_tensor)
        loss = criterion(reconstructed, data_tensor)
        loss.backward()
        optimizer.step()
        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch + 1}, Loss: {loss.item():.4f}")

data = genre_matrix
train_autoencoder(autoencoder, data)

# Use the trained encoder to calculate similarities
data_tensor = torch.FloatTensor(data)
with torch.no_grad():
    embeddings, _ = autoencoder(data_tensor)
embeddings = embeddings.numpy()
similarity_matrix = cosine_similarity(embeddings)

def recommend_movies(movie_ids, top_n=5):
    recommended_movies = []
    for movie_id in movie_ids:
        try:
            movie_idx = movies_df[movies_df['movieId'] == movie_id].index[0]
            similar_movies = similarity_matrix[movie_idx].argsort()[::-1][1:top_n+1]
            for sim_movie_idx in similar_movies:
                recommended_movies.append(movies_df.iloc[sim_movie_idx])
        except IndexError:
            print(f"Movie with ID {movie_id} not found in the dataset.")
    return pd.DataFrame(recommended_movies)

Epoch 10, Loss: 0.2016
Epoch 20, Loss: 0.1129
Epoch 30, Loss: 0.0806
Epoch 40, Loss: 0.0727
Epoch 50, Loss: 0.0649
       movieId                                              title  \
2209      2294                                        Antz (1998)   
21355   103755                                       Turbo (2013)   
24156   114552                              Boxtrolls, The (2014)   
24849   117454                           The Magic Crystal (2011)   
24092   114240                                     Aladdin (1992)   
16446    83115  Polar Bear King, The (Kvitebjørn Kong Valemon)...   
12339    56915        Water Horse: Legend of the Deep, The (2007)   
12260    56171                         Golden Compass, The (2007)   
124        126                  NeverEnding Story III, The (1994)   
2078      2162  NeverEnding Story II: The Next Chapter, The (1...   
19560    96872                                   Quadrille (1938)   
67          68                 French Twist (Gazon maudit

In [None]:
# Example usage: Recommend movies similar to multiple movies
movie_ids_to_recommend = [9, 14] # Example movie IDs
recommendations = recommend_movies(movie_ids_to_recommend)
print(recommendations[['movieId', 'title', 'genres']])

       movieId                                              title  genres
8            9                                Sudden Death (1995)  Action
70          71                                   Fair Game (1995)  Action
26161   125920                               Buffalo Girls (2012)  Action
26108   125539                                He Who Dares (2014)  Action
27139   130526                             The Detective 2 (2011)  Action
14147    70988                        Norma Jean & Marilyn (1996)   Drama
14180    71147  Death of a Cyclist (Muerte de un ciclista) (1955)   Drama
14131    70912                                Goodbye Solo (2008)   Drama
14126    70880                7 Women (a.k.a. Seven Women) (1966)   Drama
14118    70846       Lorna's Silence (Le silence de Lorna) (2008)   Drama


In [None]:
# Save the model
torch.save(autoencoder.state_dict(), 'autoencoder_model.pth')

# Load the model
loaded_autoencoder = GenreAutoencoder(input_dim, hidden_dim)
loaded_autoencoder.load_state_dict(torch.load('autoencoder_model.pth'))
loaded_autoencoder.eval()  # Set the model to evaluation mode

# Use the loaded model to generate embeddings
data_tensor = torch.FloatTensor(data)
with torch.no_grad():
    embeddings, _ = loaded_autoencoder(data_tensor)
embeddings = embeddings.numpy()
loaded_similarity_matrix = cosine_similarity(embeddings)

# Example usage with the loaded model:
def recommend_movies_loaded(movie_ids, top_n=5):
    recommended_movies = []
    for movie_id in movie_ids:
        try:
            movie_idx = movies_df[movies_df['movieId'] == movie_id].index[0]
            similar_movies = loaded_similarity_matrix[movie_idx].argsort()[::-1][1:top_n+1]
            for sim_movie_idx in similar_movies:
                recommended_movies.append(movies_df.iloc[sim_movie_idx])
        except IndexError:
            print(f"Movie with ID {movie_id} not found in the dataset.")
    return pd.DataFrame(recommended_movies)

movie_ids_to_recommend = [9, 14] # Example movie IDs
recommendations = recommend_movies_loaded(movie_ids_to_recommend)
print(recommendations[['movieId', 'title', 'genres']])

  loaded_autoencoder.load_state_dict(torch.load('autoencoder_model.pth'))


       movieId                                              title  genres
8            9                                Sudden Death (1995)  Action
70          71                                   Fair Game (1995)  Action
26161   125920                               Buffalo Girls (2012)  Action
26108   125539                                He Who Dares (2014)  Action
27139   130526                             The Detective 2 (2011)  Action
14147    70988                        Norma Jean & Marilyn (1996)   Drama
14180    71147  Death of a Cyclist (Muerte de un ciclista) (1955)   Drama
14131    70912                                Goodbye Solo (2008)   Drama
14126    70880                7 Women (a.k.a. Seven Women) (1966)   Drama
14118    70846       Lorna's Silence (Le silence de Lorna) (2008)   Drama


# Model 2: Collaborative Filtering


In [None]:
# --- Model 2: Collaborative Filtering ---
# Preprocess user-movie ratings
num_users = user_movies_df['userId'].nunique()
num_movies = user_movies_df['movieId'].nunique()

user_movie_matrix = user_movies_df.pivot(index='userId', columns='movieId', values='rating').fillna(0).to_numpy()

# Define NCF model
class NCF(nn.Module):
    def __init__(self, num_users, num_movies, embedding_dim):
        super(NCF, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.movie_embedding = nn.Embedding(num_movies, embedding_dim)
        self.fc_layers = nn.Sequential(
            nn.Linear(embedding_dim * 2, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, user_ids, movie_ids):
        user_embeds = self.user_embedding(user_ids)
        movie_embeds = self.movie_embedding(movie_ids)
        x = torch.cat([user_embeds, movie_embeds], dim=-1)
        return self.fc_layers(x).squeeze()

# Instantiate and train the NCF model
embedding_dim = 16
ncf = NCF(num_users, num_movies, embedding_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(ncf.parameters(), lr=0.01)

def train_ncf(ncf, user_movie_matrix, epochs=10):
    for epoch in range(epochs):
        user_ids, movie_ids = np.where(user_movie_matrix > 0)
        ratings = user_movie_matrix[user_ids, movie_ids]

        user_ids_tensor = torch.LongTensor(user_ids)
        movie_ids_tensor = torch.LongTensor(movie_ids)
        ratings_tensor = torch.FloatTensor(ratings)

        optimizer.zero_grad()
        predictions = ncf(user_ids_tensor, movie_ids_tensor)
        loss = criterion(predictions, ratings_tensor)
        loss.backward()
        optimizer.step()

        if (epoch + 1) % 2 == 0:
            print(f"Epoch {epoch + 1}, Loss: {loss.item():.4f}")

train_ncf(ncf, user_movie_matrix)



Epoch 2, Loss: 6.9442
Epoch 4, Loss: 1.3155
Epoch 6, Loss: 3.2753
Epoch 8, Loss: 1.1209
Epoch 10, Loss: 1.7733


# First Testing NFC Model

In [None]:
# Example: Recommend movies for userId=0
user_id = 0
user_movies = user_movie_matrix[user_id]
unwatched_movie_ids = np.where(user_movies == 0)[0]

with torch.no_grad():
    user_ids_tensor = torch.LongTensor([user_id] * len(unwatched_movie_ids))
    movie_ids_tensor = torch.LongTensor(unwatched_movie_ids)
    predictions = ncf(user_ids_tensor, movie_ids_tensor).numpy()

recommended_movie_ids = unwatched_movie_ids[np.argsort(predictions)[::-1][:5]]
print("Recommended movies for User 0:")
print(movies_df.iloc[recommended_movie_ids]['title'].values)

Recommended movies for User 0:
['Talk Radio (1988)' 'Siege, The (1998)' 'Night Flier (1997)'
 'Loser (1991)' 'Kentucky Fried Movie, The (1977)']


# Test the Generated Model with Similarity Scores

In [None]:
# Example: Recommend movies for userId=0 with genres and similarity scores
user_id = 0
user_movies = user_movie_matrix[user_id]
unwatched_movie_ids = np.where(user_movies == 0)[0]

with torch.no_grad():
    user_ids_tensor = torch.LongTensor([user_id] * len(unwatched_movie_ids))
    movie_ids_tensor = torch.LongTensor(unwatched_movie_ids)
    predictions = ncf(user_ids_tensor, movie_ids_tensor).numpy()

recommended_movie_ids = unwatched_movie_ids[np.argsort(predictions)[::-1][:5]]

print("Recommended movies for User 0:")
for movie_id in recommended_movie_ids:
    movie_title = movies_df.iloc[movie_id]['title']
    movie_genres = movies_df.iloc[movie_id]['genres']
    similarity_score = predictions[np.where(unwatched_movie_ids == movie_id)][0]  # Get the prediction score
    print(f"Title: {movie_title}")
    print(f"Genres: {movie_genres}")
    print(f"Similarity Score: {similarity_score:.4f}") # Display with 4 decimal places
    print()

Recommended movies for User 0:
Title: Talk Radio (1988)
Genres: Drama
Similarity Score: 3.5492

Title: Siege, The (1998)
Genres: Action|Thriller
Similarity Score: 3.4796

Title: Night Flier (1997)
Genres: Horror
Similarity Score: 3.2344

Title: Loser (1991)
Genres: Comedy
Similarity Score: 3.2152

Title: Kentucky Fried Movie, The (1977)
Genres: Comedy
Similarity Score: 3.1825



# Download Model and Use It
Model is downloaded in .pth format to use it in local and tested

In [None]:
# Save the model
torch.save(ncf.state_dict(), 'ncf_model.pth')

# Load the model
loaded_ncf = NCF(num_users, num_movies, embedding_dim)
loaded_ncf.load_state_dict(torch.load('ncf_model.pth'))
loaded_ncf.eval()

# Example usage with the loaded model
user_id = 0
user_movies = user_movie_matrix[user_id]
unwatched_movie_ids = np.where(user_movies == 0)[0]

with torch.no_grad():
    user_ids_tensor = torch.LongTensor([user_id] * len(unwatched_movie_ids))
    movie_ids_tensor = torch.LongTensor(unwatched_movie_ids)
    predictions = loaded_ncf(user_ids_tensor, movie_ids_tensor).numpy()

recommended_movie_ids = unwatched_movie_ids[np.argsort(predictions)[::-1][:5]]

print("Recommended movies for User 0 (using loaded model):")
for movie_id in recommended_movie_ids:
    movie_title = movies_df.iloc[movie_id]['title']
    movie_genres = movies_df.iloc[movie_id]['genres']
    similarity_score = predictions[np.where(unwatched_movie_ids == movie_id)][0]
    print(f"Title: {movie_title}")
    print(f"Genres: {movie_genres}")
    print(f"Similarity Score: {similarity_score:.4f}")
    print()

Recommended movies for User 0 (using loaded model):
Title: Talk Radio (1988)
Genres: Drama
Similarity Score: 3.5492

Title: Siege, The (1998)
Genres: Action|Thriller
Similarity Score: 3.4796

Title: Night Flier (1997)
Genres: Horror
Similarity Score: 3.2344

Title: Loser (1991)
Genres: Comedy
Similarity Score: 3.2152

Title: Kentucky Fried Movie, The (1977)
Genres: Comedy
Similarity Score: 3.1825



  loaded_ncf.load_state_dict(torch.load('ncf_model.pth'))
