In [1]:
import ast
import os

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the data
# The dataset directory can be overridden with the CINECRAFT_DATA_DIR environment
# variable so the notebook is not tied to a single machine. When the variable is
# unset, the original absolute location is used, so existing runs are unaffected.
data_dir = os.environ.get('CINECRAFT_DATA_DIR', '/Users/anushaanandhan/Downloads/CineCraft/dataset')
movies_metadata = pd.read_csv(os.path.join(data_dir, 'movies_metadata.csv'), low_memory=False)

# Print the first few rows to verify the data
print(movies_metadata.head())

  from .autonotebook import tqdm as notebook_tqdm


   adult                              belongs_to_collection    budget  \
0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
1  False                                                NaN  65000000   
2  False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   
3  False                                                NaN  16000000   
4  False  {'id': 96871, 'name': 'Father of the Bride Col...         0   

                                              genres  \
0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   
1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   
2  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...   
3  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...   
4                     [{'id': 35, 'name': 'Comedy'}]   

                               homepage     id    imdb_id original_language  \
0  http://toystory.disney.com/toy-story    862  tt0114709                en   
1                                   NaN   8844  tt0113497         

In [2]:
# Select relevant columns
movies = movies_metadata[['id', 'title', 'overview', 'genres', 'original_language']]

# Remove rows with missing overviews.
# The explicit .copy() makes `movies` an independent frame, so the column
# assignments below modify it directly rather than something derived from a
# slice of `movies_metadata` (avoids pandas' SettingWithCopyWarning).
movies = movies.dropna(subset=['overview']).copy()

# Convert genres from string to list of genre names.
# ast.literal_eval only parses Python literals, so text read from the CSV
# cannot execute arbitrary code the way eval() would.
movies['genres'] = movies['genres'].apply(ast.literal_eval)
movies['genres'] = movies['genres'].apply(lambda x: [genre['name'] for genre in x])

# Create a text field that combines title, overview, and genres.
# A missing title would turn the whole concatenation into NaN, so it is treated
# as an empty string here; the 'title' column itself is left unchanged.
movies['text'] = (
    movies['title'].fillna('')
    + ' '
    + movies['overview']
    + ' '
    + movies['genres'].apply(lambda x: ' '.join(x))
)

print(movies.head())

      id                        title  \
0    862                    Toy Story   
1   8844                      Jumanji   
2  15602             Grumpier Old Men   
3  31357            Waiting to Exhale   
4  11862  Father of the Bride Part II   

                                            overview  \
0  Led by Woody, Andy's toys live happily in his ...   
1  When siblings Judy and Peter discover an encha...   
2  A family wedding reignites the ancient feud be...   
3  Cheated on, mistreated and stepped on, the wom...   
4  Just when George Banks has recovered from his ...   

                         genres original_language  \
0   [Animation, Comedy, Family]                en   
1  [Adventure, Fantasy, Family]                en   
2             [Romance, Comedy]                en   
3      [Comedy, Drama, Romance]                en   
4                      [Comedy]                en   

                                                text  
0  Toy Story Led by Woody, Andy's toys liv

In [24]:
# Write the prepared movies frame to disk, leaving the index out of the file
movies.to_csv(path_or_buf='movies_llm.csv', index=False)

In [11]:
# Instantiate the pre-trained sentence-embedding model by name
model = SentenceTransformer(model_name_or_path='sentence-transformers/all-MiniLM-L6-v2')

# Embed the combined 'text' of every movie, showing a progress bar while batches run
movie_embeddings = model.encode(
    sentences=movies['text'].tolist(),
    show_progress_bar=True,
)

Batches:   1%|          | 14/1391 [00:08<14:38,  1.57it/s] 


KeyboardInterrupt: 

In [14]:
# Persist the embeddings: wrap them in a DataFrame and write it out as CSV
# (index omitted from the file)
print("Saving embeddings to CSV...")
movie_embeddings_df = pd.DataFrame(data=movie_embeddings)
movie_embeddings_df.to_csv(path_or_buf='movie_embeddings.csv', index=False)

Saving embeddings to CSV...


In [6]:
# Read previously saved embeddings back in; `movie_embeddings` is a DataFrame from here on.
# NOTE(review): this reads from '../dataset_pkl/', while the save cell writes
# 'movie_embeddings.csv' to the working directory - confirm both refer to the same file.
movie_embeddings = pd.read_csv(filepath_or_buffer='../dataset_pkl/movie_embeddings.csv')

In [7]:
# Copy the embedding values into a torch tensor; get_recommendations reads it
# back through movie_embeddings_tensor.numpy()
movie_embeddings_tensor = torch.tensor(movie_embeddings.to_numpy())

In [None]:
def get_recommendations(query, top_n=5):
    """Return the movies whose embeddings are most similar to a text query.

    Relies on notebook-level names defined in other cells: ``model`` (encodes
    the query), ``movie_embeddings_tensor`` (one embedding per row) and
    ``movies``. Rows are looked up positionally, so row ``i`` of the
    embeddings must describe row ``i`` of ``movies``.

    Args:
        query: Free-text description of what the user wants to watch.
        top_n: Maximum number of movies to return. Values <= 0 yield an
            empty result.

    Returns:
        DataFrame holding the 'title', 'overview' and 'genres' columns of the
        best-matching rows of ``movies``, most similar first.

    Raises:
        ValueError: If ``movies`` and ``movie_embeddings_tensor`` do not have
            the same number of rows.
    """
    columns = ['title', 'overview', 'genres']

    # A slice of [-0:] would select every row rather than none, and a negative
    # top_n would select an arbitrary tail; treat both as "no recommendations".
    if top_n <= 0:
        return movies.iloc[[]][columns]

    movie_matrix = movie_embeddings_tensor.numpy()

    # The positional lookup below is only meaningful when embeddings and movies
    # line up row for row; fail loudly instead of returning unrelated titles.
    if movie_matrix.shape[0] != len(movies):
        raise ValueError(
            f"Embedding rows ({movie_matrix.shape[0]}) do not match "
            f"movies rows ({len(movies)})"
        )

    # Encode the query as a single-row batch; no tensor round trip is needed
    # because cosine_similarity works on arrays.
    query_embedding = np.asarray(model.encode([query]))

    # Cosine similarity between the query and every movie embedding
    similarities = cosine_similarity(query_embedding, movie_matrix)[0]

    # Positions of the top_n highest similarities, best match first
    top_indices = similarities.argsort()[-top_n:][::-1]

    # Return a subset of relevant columns (title, overview, genres)
    return movies.iloc[top_indices][columns]


In [12]:
# Example request: run a free-text query through the recommender and print the matches
query = "I want to watch action movies"
recommendations = get_recommendations(query=query)
print("Recommendations for:", query)
print(recommendations)

Recommendations for: I want to watch action movies
                 title                                           overview  \
25412   Action Jackson  A man meets his lookalike, who's not just a ki...   
44027  Karlson Returns                       No movie overview available.   
30533   Video Violence  A husband and wife open a video store in a new...   
32019    The Challenge                                 No overview found.   
44086           Platon                       No movie overview available.   

                                           genres  
25412  [Action, Comedy, Drama, Romance, Thriller]  
44027              [Adventure, Animation, Family]  
30533                                    [Horror]  
32019                             [Action, Drama]  
44086                                     [Drama]  
