Movie Recommendation System using cosine similarity, feature extraction (to covert data to vectors), one-hot encoder (binary vectors for cosine similarity)

Using a movies dataset from Kaggle

In [44]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import numpy as np
df = pd.read_csv('/content/movie_dataset.csv')
df.head()


# Fill missing values
df.fillna("", inplace=True)

df['combined_text'] = (
    df['director'] + " " +
    df['genres'] + " " +
    df['overview'] + " " +
    df['production_companies']
)

  df.fillna("", inplace=True)


In [45]:
def get_movie_index_by_title(title):
    try:
        # Use the 'title' argument instead of 'original_title'
        return df[df['title'] == title].index[0]
    except IndexError:
        return None

movie_title = "Tangled"
movie_index = get_movie_index_by_title(movie_title)

if movie_index is not None:
    print(f"Movie index for '{movie_title}':", movie_index)
else:
    print(f"Movie '{movie_title}' not found in the dataset.")


Movie index for 'Tangled': 6


In [49]:
tfidf = TfidfVectorizer(stop_words="english", max_features=500)
text_features = tfidf.fit_transform(df['combined_text']).toarray()

one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
categorical_features = one_hot_encoder.fit_transform(df[['genres', 'production_companies']])

# Combine features into a single input tensor
text_weight = 3.0
categorical_weight = 1.0

features = np.hstack([
    text_features * text_weight,
    categorical_features * categorical_weight
])
features_tensor = torch.tensor(features, dtype=torch.float32)

In [51]:
class MovieDataset(Dataset):
    def __init__(self, features):
        self.features = features

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx]

dataset = MovieDataset(features_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [52]:
class MovieEmbeddingModel(nn.Module):
    def __init__(self, input_dim, embedding_dim):
        super(MovieEmbeddingModel, self).__init__()
        self.embedding = nn.Linear(input_dim, embedding_dim)

    def forward(self, x):
        return F.normalize(self.embedding(x), dim=1)  # Normalize for cosine similarity

input_dim = features_tensor.shape[1]
embedding_dim = 128

model = MovieEmbeddingModel(input_dim, embedding_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()


In [55]:
epochs = 10
for epoch in range(epochs):
    for batch in dataloader:
        optimizer.zero_grad()
        embeddings = model(batch)
        loss = criterion(embeddings, embeddings)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}")


Epoch 1/10, Loss: 0.0000
Epoch 2/10, Loss: 0.0000
Epoch 3/10, Loss: 0.0000
Epoch 4/10, Loss: 0.0000
Epoch 5/10, Loss: 0.0000
Epoch 6/10, Loss: 0.0000
Epoch 7/10, Loss: 0.0000
Epoch 8/10, Loss: 0.0000
Epoch 9/10, Loss: 0.0000
Epoch 10/10, Loss: 0.0000


In [54]:
movie_embeddings = model(features_tensor).detach()

def recommend_movies(movie_title, num_recommendations=3):
    if movie_title not in df['title'].values:
        return f"'{movie_title}' not found in the dataset."

    movie_idx = df[df['title'] == movie_title].index[0]

    target_embedding = movie_embeddings[movie_idx]

    similarities = F.cosine_similarity(target_embedding.unsqueeze(0), movie_embeddings)

    # Exlude the target movie from cosine sims.
    similar_indices = similarities.argsort(descending=True)[1:num_recommendations + 1]

    similar_movies = df.iloc[similar_indices.cpu().numpy()]
    return [
        {"title": row['title'], "link": row['homepage']} for _, row in similar_movies.iterrows()
    ]



print(recommend_movies("Tangled"))

[{'title': 'Winnie the Pooh', 'link': 'http://disney.go.com/pooh/home/'}, {'title': 'Dinosaur', 'link': ''}, {'title': 'Monsters University', 'link': ''}]
