AI/Machine Learning Intern Challenge: Simple Content-Based Recommendation

In [21]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [22]:
# Load the dataset, keep the Title and Plot columns, and optionally sample rows
def load_data(filepath, sample_size=0):
    """Load movie titles and plots from a CSV file.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file. It must contain 'Title' and 'Plot' columns.
    sample_size : int or None, default 0
        Number of rows to draw at random (seeded with ``random_state=42`` so
        the draw is reproducible). ``None``, values <= 0, and values >= the
        number of usable rows all keep every row.

    Returns
    -------
    pandas.DataFrame
        The 'Title' and 'Plot' columns, with rows missing either value
        dropped and the index renumbered from 0.
    """
    # Read from the caller-supplied path; the argument used to be ignored in
    # favour of a hard-coded file name.
    raw = pd.read_csv(filepath)
    movies = raw[['Title', 'Plot']].dropna()

    # Only sample when it would actually shrink the frame.
    if sample_size and 0 < sample_size < len(movies):
        movies = movies.sample(n=sample_size, random_state=42)

    # Renumber so positional and label indexing agree downstream.
    movies = movies.reset_index(drop=True)

    print("Dataset head:")
    print(movies.head())
    return movies

# Read the plots dataset; sample_size=0 means no sampling is applied.
filepath = 'wiki_movie_plots_deduped.csv'
df = load_data(filepath=filepath, sample_size=0)


Dataset head:
                              Title  \
0            Kansas Saloon Smashers   
1     Love by the Light of the Moon   
2           The Martyred Presidents   
3  Terrible Teddy, the Grizzly King   
4            Jack and the Beanstalk   

                                                Plot  
0  A bartender is working at a saloon, serving dr...  
1  The moon, painted with a smiling face hangs ov...  
2  The film, just over a minute long, is composed...  
3  Lasting just 61 seconds and consisting of two ...  
4  The earliest known adaptation of the classic f...  


In [23]:
# Building the TF-IDF vectors
def build_tfidf_vectors(text_series):
    """Fit a TF-IDF model on the given texts.

    Parameters
    ----------
    text_series : iterable of str
        The documents to vectorize (the notebook passes the 'Plot' column).

    Returns
    -------
    tuple
        ``(vectorizer, tfidf_matrix)``: the fitted ``TfidfVectorizer``
        (configured with English stop words) and the result of its
        ``fit_transform`` call on ``text_series``.
    """
    tfidf_model = TfidfVectorizer(stop_words='english')
    plot_vectors = tfidf_model.fit_transform(text_series)
    return tfidf_model, plot_vectors

# Vectorize every plot and report the dimensions of the resulting matrix
vectorizer, tfidf_matrix = build_tfidf_vectors(text_series=df['Plot'])
print("TF-IDF vectors Matrix shape:", tfidf_matrix.shape)

TF-IDF vectors Matrix shape: (34886, 133554)


In [24]:
# Computing Cosine Similarity and recommending movies
def recommend_movies(query, df, top_n=5, vectorizer=None, tfidf_matrix=None):
    """Rank movies by TF-IDF cosine similarity between their plot and a query.

    Parameters
    ----------
    query : str
        Free-text description of what the user wants to watch.
    df : pandas.DataFrame
        Movie table with 'Title' and 'Plot' columns.
    top_n : int, default 5
        Maximum number of recommendations to return. Values <= 0 yield an
        empty result.
    vectorizer, tfidf_matrix : optional
        A vectorizer and matrix as returned by
        ``build_tfidf_vectors(df['Plot'])``. When both are supplied they are
        reused, which avoids refitting TF-IDF on every query. If either is
        missing, both are built from ``df['Plot']`` during this call.

    Returns
    -------
    pandas.DataFrame
        'Title' and 'Score' columns, ordered from highest to lowest score,
        with at most ``top_n`` rows.

    Raises
    ------
    ValueError
        If the TF-IDF matrix does not have one row per row of ``df``.
    """
    # Guard first: with top_n == 0 the slice [-0:] would select *every* row,
    # and a negative top_n would slice from the wrong end.
    if top_n <= 0:
        empty = df.iloc[:0][['Title']].copy()
        empty['Score'] = np.array([], dtype=float)
        return empty

    # Fit the TF-IDF model only when the caller did not provide one.
    if vectorizer is None or tfidf_matrix is None:
        vectorizer, tfidf_matrix = build_tfidf_vectors(df['Plot'])

    # Matrix rows are mapped back to df rows by position, so they must align.
    if tfidf_matrix.shape[0] != len(df):
        raise ValueError(
            "tfidf_matrix has %d rows but df has %d rows"
            % (tfidf_matrix.shape[0], len(df))
        )

    # Project the user query into the same TF-IDF space as the plots.
    query_vec = vectorizer.transform([query])

    # One similarity score per movie.
    similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # argsort is ascending: keep the last top_n positions, then reverse so the
    # best match comes first.
    top_indices = similarities.argsort()[-top_n:][::-1]

    # Pair the selected titles with their scores.
    recommendations = df.iloc[top_indices][['Title']].copy()
    recommendations['Score'] = similarities[top_indices]

    return recommendations

# Try the recommender on an example free-text query
sample_query = "I like action movies set in space."
print("Sample Query:", sample_query)
top_movies = recommend_movies(query=sample_query, df=df, top_n=5)
print("\n Top Recommended Movies with Similarity Scores:")
print(top_movies.to_string(index=False))

Sample Query: I like action movies set in space.

 Top Recommended Movies with Similarity Scores:
                                                   Title    Score
            Bodacious Space Pirates: Abyss of Hyperspace 0.243816
                                         Always Together 0.241054
                                                  Native 0.213520
Crayon Shin-chan: The Storm Called: Operation Golden Spy 0.192285
                                        Space Master X-7 0.184191
