In [13]:
##**AI/Machine Learning Intern Challenge: Simple Content-Based Recommendation**

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.offline import init_notebook_mode
from itertools import permutations
init_notebook_mode(connected=True)
import re
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# Load the dataset
df = pd.read_csv("F:/Resume/job assement/IMBD/IMBD.csv")
df.head()

df.info()

df = df.drop_duplicates(subset=['title', 'description'])

# Fill missing values in genre with an empty string
df['genre'] = df['genre'].fillna('')

# Remove any rows with missing descriptions (as it's key for similarity)
df = df.dropna(subset=['description'])

df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9957 entries, 0 to 9956
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        9957 non-null   object 
 1   year         9430 non-null   object 
 2   certificate  6504 non-null   object 
 3   duration     7921 non-null   object 
 4   genre        9884 non-null   object 
 5   rating       8784 non-null   float64
 6   description  9957 non-null   object 
 7   stars        9957 non-null   object 
 8   votes        8784 non-null   object 
dtypes: float64(1), object(8)
memory usage: 700.2+ KB


Unnamed: 0,title,year,certificate,duration,genre,rating,description,stars,votes
0,Cobra Kai,(2018– ),TV-14,30 min,"Action, Comedy, Drama",8.5,Decades after their 1984 All Valley Karate Tou...,"['Ralph Macchio, ', 'William Zabka, ', 'Courtn...",177031
1,The Crown,(2016– ),TV-MA,58 min,"Biography, Drama, History",8.7,Follows the political rivalries and romance of...,"['Claire Foy, ', 'Olivia Colman, ', 'Imelda St...",199885
2,Better Call Saul,(2015–2022),TV-MA,46 min,"Crime, Drama",8.9,The trials and tribulations of criminal lawyer...,"['Bob Odenkirk, ', 'Rhea Seehorn, ', 'Jonathan...",501384
3,Devil in Ohio,(2022),TV-MA,356 min,"Drama, Horror, Mystery",5.9,When a psychiatrist shelters a mysterious cult...,"['Emily Deschanel, ', 'Sam Jaeger, ', 'Gerardo...",9773
4,Cyberpunk: Edgerunners,(2022– ),TV-MA,24 min,"Animation, Action, Adventure",8.6,A Street Kid trying to survive in a technology...,"['Zach Aguilar, ', 'Kenichiro Ohashi, ', 'Emi ...",15413


In [6]:
# Function to clean text: remove special chars, lowercasing, remove stopwords
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    words = text.split()  # Tokenize
    words = [word for word in words if word not in ENGLISH_STOP_WORDS]
    return ' '.join(words)

df['cleaned_text'] = (df['genre'] + " " + df['description']).apply(clean_text)

df[['title', 'cleaned_text']].head()

Unnamed: 0,title,cleaned_text
0,Cobra Kai,action comedy drama decades 1984 valley karate...
1,The Crown,biography drama history follows political riva...
2,Better Call Saul,crime drama trials tribulations criminal lawye...
3,Devil in Ohio,drama horror mystery psychiatrist shelters mys...
4,Cyberpunk: Edgerunners,animation action adventure street kid trying s...


In [7]:
# Create TF-IDF vectorizer and transform data
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['cleaned_text'])

def generate_query_combinations(queries):
    combined_queries = set()
    for i in range(1, len(queries) + 1):
        for combo in permutations(queries, i):
            combined_queries.add(" ".join(combo))
    return list(combined_queries)

In [8]:
# Function to get recommendations
def get_recommendations(queries, top_n=5):
    query_combinations = generate_query_combinations(queries)
    best_matches = []

    for query in query_combinations:
        query_cleaned = clean_text(query)
        query_vector = vectorizer.transform([query_cleaned])
        similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
        top_indices = np.argsort(similarities)[-top_n:][::-1]
        best_matches.extend([(df.iloc[i]['title'], similarities[i]) for i in top_indices])

    best_matches = sorted(set(best_matches), key=lambda x: x[1], reverse=True)[:top_n]
    return best_matches

In [15]:
# Example usage
if __name__ == "__main__":
    user_queries = ["I love science fiction movies and romance"]
    recommendations = get_recommendations(user_queries)
    for title, score in recommendations:
        print(f"{title} (Similarity: {score:.3f})")

Scare Tactics (Similarity: 0.362)
Love, Death & Robots (Similarity: 0.323)
The Day the Earth Stood Still (Similarity: 0.296)
Alien Worlds (Similarity: 0.280)
Six Windows in the Desert (Similarity: 0.260)


In [10]:
# Example usage
if __name__ == "__main__":
    user_queries = ["I love comdey romance"]
    recommendations = get_recommendations(user_queries)
    for title, score in recommendations:
        print(f"{title} (Similarity: {score:.3f})")

Being Mrs Elliot (Similarity: 0.517)
Black Knight (Similarity: 0.460)
Our Lovers (Similarity: 0.414)
The Deadly Affair (Similarity: 0.413)
To All the Boys I've Loved Before (Similarity: 0.404)


In [16]:
# Example usage
if __name__ == "__main__":
    user_queries = ["I love time travel and adventure in space"]
    recommendations = get_recommendations(user_queries)
    for title, score in recommendations:
        print(f"{title} (Similarity: {score:.3f})")



Final Space (Similarity: 0.303)
Timeless (Similarity: 0.301)
Cosmos: A Spacetime Odyssey (Similarity: 0.291)
Don Quijote de la Láctea (Similarity: 0.289)
Power Rangers (Similarity: 0.275)


In [12]:
# Salary expectation per month - 4200$