In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf 
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from prettytable import PrettyTable

In [5]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [7]:
def load_data(file_path):
    # Reading the CSV file from the provided path
    df = pd.read_csv(file_path)
    
    # Print column names to verify
    print("Columns in the dataset:", df.columns)
    
    # Clean column names by stripping leading/trailing spaces (if any)
    df.columns = df.columns.str.strip()
    
    # Selecting relevant columns (adjusted according to your dataset)
    df = df[['Movie Name', 'Description']].dropna()  # Use 'Movie Name' and 'Description'
    
    # Limiting the dataset to the top 500 rows
    df = df.head(500)
    
    # Returning the processed dataframe
    return df



In [9]:

file_path = 'Top_1000_IMDb_movies_New_version.csv'

# Loading the data using the function
df = load_data(file_path)

# Displaying the first few rows to confirm the data is loaded correctly
df.head()



Columns in the dataset: Index(['Unnamed: 0', 'Movie Name', 'Year of Release', 'Watch Time',
       'Movie Rating', 'Metascore of movie', 'Gross', 'Votes', 'Description'],
      dtype='object')


Unnamed: 0,Movie Name,Description
0,The Shawshank Redemption,"Over the course of several years, two convicts..."
1,The Godfather,"Don Vito Corleone, head of a mafia family, dec..."
2,The Dark Knight,When the menace known as the Joker wreaks havo...
3,Schindler's List,"In German-occupied Poland during World War II,..."
4,12 Angry Men,The jury in a New York City murder trial is fr...


In [11]:
# This function will take care of creating embeddings for all movie descriptions
def compute_embeddings(df):
    # We take the 'Description' column and generate embeddings for it
    embeddings = model.encode(df['Description'].tolist(), show_progress_bar=True)
    return embeddings

# Now, let's load the data and generate the embeddings for the descriptions
file_path = 'Top_1000_IMDb_movies_New_version.csv'  # This is where our data is stored
df = load_data(file_path)  # Load the cleaned data
movie_embeddings = compute_embeddings(df)  # Generate embeddings for the descriptions


Columns in the dataset: Index(['Unnamed: 0', 'Movie Name', 'Year of Release', 'Watch Time',
       'Movie Rating', 'Metascore of movie', 'Gross', 'Votes', 'Description'],
      dtype='object')


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

In [12]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def preprocess_query(query):
    # Convert query to lowercase to maintain case insensitivity
    query = query.lower()
    
    # Remove stopwords (if applicable)
    query = " ".join([word for word in query.split() if word not in ENGLISH_STOP_WORDS])
    
    return query

# Function to get top N movie recommendations based on user query
def get_movie_recommendations(query, df, movie_embeddings, top_n=5, threshold=0.3):
    # Preprocess the user's query
    query = preprocess_query(query)
    
    # Generate an embedding for the user's query
    query_embedding = model.encode([query])
    
    # Calculate cosine similarity between the query and the movie embeddings
    cosine_similarities = cosine_similarity(query_embedding, movie_embeddings)
    
    # Get indices of the top N most similar movies
    similar_indices = cosine_similarities[0].argsort()[-top_n:][::-1]
    
    # Extract similarity scores
    similarity_scores = cosine_similarities[0][similar_indices]
    
    # Get the movie names and ratings from the dataframe
    movie_names = df.iloc[similar_indices]['Movie Name'].tolist()
    movie_ratings = df.iloc[similar_indices]['Movie Rating'].tolist()  # Ensure 'Movie Rating' column exists in df
    
    # Filter recommendations based on threshold for better relevance
    filtered_movie_names = []
    filtered_similarity_scores = []
    filtered_movie_ratings = []
    
    for movie, score, rating in zip(movie_names, similarity_scores, movie_ratings):
        if score >= threshold:  # If similarity score is above threshold
            filtered_movie_names.append(movie)
            filtered_similarity_scores.append(round(score, 4))  # Round the score for readability
            filtered_movie_ratings.append(rating)
    
    table = PrettyTable()
    table.field_names = ["Movie Name", "Similarity Score", "Movie Rating"]  # Table headers
    
    # Add each filtered recommendation to the table
    for movie, score, rating in zip(filtered_movie_names, filtered_similarity_scores, filtered_movie_ratings):
        table.add_row([movie, score, rating])

    return table

In [15]:
# Load the data
file_path = 'Top_1000_IMDb_movies_New_version.csv' 
df = load_data(file_path)

# Compute embeddings for movie descriptions
movie_embeddings = compute_embeddings(df)



Columns in the dataset: Index(['Unnamed: 0', 'Movie Name', 'Year of Release', 'Watch Time',
       'Movie Rating', 'Metascore of movie', 'Gross', 'Votes', 'Description'],
      dtype='object')


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

In [17]:
# Example input from user
user_query = input("Tell me what kind of movie you're in the mood for: ")

# Generate movie embeddings for the dataset
df = pd.read_csv("Top_1000_IMDb_movies_New_version.csv")  
movie_embeddings = model.encode(df['Description'].tolist())  

# Get movie recommendations based on user input
recommendations = get_movie_recommendations(user_query, df, movie_embeddings, top_n=5)

# Output the top 5 movie recommendations in a table format
print("\nHere are the top 5 movie recommendations based on your preferences:")
print(recommendations)


Tell me what kind of movie you're in the mood for:  I love thrilling action movies set in space, with a comedic twist.



Here are the top 5 movie recommendations based on your preferences:
+--------------------+------------------+--------------+
|     Movie Name     | Similarity Score | Movie Rating |
+--------------------+------------------+--------------+
|         8½         |      0.4357      |     8.0      |
|   Amores perros    |      0.4264      |     8.1      |
|    Sunset Blvd.    |      0.417       |     8.4      |
| Sullivan's Travels |      0.3901      |     7.9      |
|    Nightcrawler    |      0.3701      |     7.8      |
+--------------------+------------------+--------------+
