# Luma AI/Machine Learning Intern Challenge: Simple Content-Based Recommendation

# Libraries:

In [20]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# Movie Recommender Class:

The `MovieRecommender` class is designed to provide movie recommendations based on user descriptions.
It takes an IMDb-style CSV file (e.g., `imdb_top_1000.csv`) and uses TF-IDF vectorization and
cosine similarity to match user preferences with relevant movie titles.

**Key Features:**
1. **Data Preparation**: The class reads movie data from a CSV and cleans or weights key text fields
   (e.g., genres and overview) for improved recommendation quality.
2. **Genre Synonyms**: It handles multiple user expressions for the same genre
   (e.g., "scifi" → "sci-fi").
3. **Settings Detection**: It includes a simple “setting” detection mechanism
   (e.g., "medieval", "space") and boosts similarity scores if the movie
   matches those settings.
4. **Composite Scoring**: Final similarity scores are computed by blending:
   - TF-IDF similarity (40%)
   - Genre match score (30%)
   - Setting match score (30%)

**Class Usage Example**:
- Instantiate the class by providing the path to your CSV file:
  ```python
  recommender = MovieRecommender('imdb_top_1000.csv')
  ```


In [44]:
class MovieRecommender:
    """Content-based movie recommender for an IMDb-style CSV file.

    Each movie is scored against a free-text user description by blending
    three signals:

    * TF-IDF cosine similarity over genre, overview, director and stars,
    * the fraction of genres the user asked for that the movie lists,
    * how strongly the movie overview matches the settings (e.g. "space")
      mentioned by the user.

    The CSV must provide the columns ``Genre``, ``Overview``, ``Director``,
    ``Star1``, ``Star2``, ``Series_Title``, ``Released_Year`` and
    ``IMDB_Rating``, which are the columns read by this class.
    """

    # Blend weights applied to the three signals in get_recommendations.
    TFIDF_WEIGHT = 0.4
    GENRE_WEIGHT = 0.3
    SETTING_WEIGHT = 0.3

    # Key settings and their related terms.
    DEFAULT_SETTINGS = {
        'space': ['space', 'alien', 'galaxy', 'planet', 'astronaut', 'cosmic',
                  'spacecraft', 'spaceship', 'orbit', 'stellar', 'interstellar'],
        'future': ['future', 'dystopia', 'cyberpunk', 'futuristic'],
        'medieval': ['medieval', 'kingdom', 'castle', 'sword', 'knight'],
        'modern': ['modern', 'contemporary', 'today', 'present'],
        'historical': ['history', 'historical', 'period', 'century', 'ancient'],
    }

    # Canonical genre label -> expressions a user may use for it.
    DEFAULT_GENRE_SYNONYMS = {
        'action': ['action', 'actions', 'action-packed'],
        'adventure': ['adventure', 'adventures', 'adventurous'],
        'comedy': ['comedy', 'comedies', 'comic'],
        'drama': ['drama', 'dramas', 'dramatic'],
        'thriller': ['thriller', 'thrillers'],
        'horror': ['horror', 'horrors', 'horrific', 'scary'],
        'romance': ['romance', 'romantic', 'love story', 'love stories'],
        'sci-fi': ['sci-fi', 'scifi', 'science fiction', 'sf'],
        'mystery': ['mystery', 'mysteries'],
        'crime': ['crime', 'criminal'],
        'fantasy': ['fantasy', 'fantastical'],
        'animation': ['animated', 'animation', 'cartoon'],
    }

    def __init__(self, data_path):
        """Load the movie data and fit the TF-IDF model.

        Args:
            data_path: Path (or any source accepted by ``pandas.read_csv``)
                of the IMDb-style CSV file.
        """
        self.df = pd.read_csv(data_path)

        # Per-instance copies so that customising one recommender's
        # vocabulary never leaks into another instance or the class defaults.
        self.settings = {
            name: list(words) for name, words in self.DEFAULT_SETTINGS.items()
        }
        self.genre_synonyms = {
            genre: list(syns)
            for genre, syns in self.DEFAULT_GENRE_SYNONYMS.items()
        }

        self.prepare_data()
        self.vectorizer = None
        self.tfidf_matrix = None
        self.fit_vectorizer()

    # ------------------------------------------------------------------
    # Text helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _normalize(text):
        """Lower-case ``text`` and replace punctuation with spaces."""
        return re.sub(r'[^\w\s]', ' ', str(text).lower())

    @staticmethod
    def _compile_term(term):
        """Compile ``term`` into a whole-word, punctuation-insensitive regex.

        The pattern is meant to be searched in text produced by
        ``_normalize``. Hyphens/spaces inside the term match any whitespace
        run, and a trailing plural ``s``/``es`` is tolerated, so ``alien``
        matches ``aliens`` but ``present`` does not match ``represent``.
        """
        words = re.sub(r'[^\w\s]', ' ', str(term).lower()).split()
        if not words:
            # A term made only of punctuation can never match anything.
            return re.compile(r'(?!)')
        body = r'\s+'.join(re.escape(word) for word in words)
        return re.compile(r'\b' + body + r'(?:s|es)?\b')

    # ------------------------------------------------------------------
    # Model preparation
    # ------------------------------------------------------------------
    def prepare_data(self):
        """Build the ``combined_features`` text column used for TF-IDF.

        Genres are repeated three times and overviews twice so that they
        weigh more than director/star names in the vectorised text.
        Missing values are treated as empty strings.
        """
        def lowered(column):
            return self.df[column].fillna('').astype(str).str.lower()

        self.df['weighted_genre'] = lowered('Genre').apply(
            lambda text: ' '.join([text] * 3)
        )
        self.df['weighted_overview'] = lowered('Overview').apply(
            lambda text: ' '.join([text] * 2)
        )
        # Combine features with weighted components
        combined = (
            self.df['weighted_genre'] + ' ' +
            self.df['weighted_overview'] + ' ' +
            lowered('Director') + ' ' +
            lowered('Star1') + ' ' +
            lowered('Star2')
        )
        self.df['combined_features'] = combined.apply(self._normalize)

    def fit_vectorizer(self):
        """Create the TF-IDF vectorizer and fit it on ``combined_features``."""
        self.vectorizer = TfidfVectorizer(
            stop_words='english',
            max_features=5000,
            ngram_range=(1, 2)
        )
        self.tfidf_matrix = self.vectorizer.fit_transform(
            self.df['combined_features']
        )

    # ------------------------------------------------------------------
    # Genre handling
    # ------------------------------------------------------------------
    def extract_genres(self, text):
        """Return the canonical genres mentioned in ``text``.

        Matching is case-insensitive, ignores punctuation (so ``sci-fi``,
        ``sci fi`` and ``Sci-Fi!`` are equivalent) and only matches whole
        words, which avoids hits such as ``sf`` inside ``transformers``.

        Args:
            text: Free-text user description.

        Returns:
            List of unique canonical genre labels, in the order they are
            declared in ``self.genre_synonyms``.
        """
        normalized = self._normalize(text)
        return [
            genre
            for genre, synonyms in self.genre_synonyms.items()
            if any(self._compile_term(s).search(normalized) for s in synonyms)
        ]

    def get_genre_match_score(self, user_genres, movie_genres):
        """Return the fraction of requested genres that the movie lists.

        Args:
            user_genres: Iterable of canonical genre labels the user wants.
            movie_genres: Comma-separated genre string of one movie; any
                non-string value (e.g. NaN) is treated as "no genres".

        Returns:
            Float in ``[0.0, 1.0]``; ``0.0`` when either side is empty.
        """
        if not user_genres or not movie_genres or not isinstance(movie_genres, str):
            return 0.0
        wanted = {str(genre).lower().strip() for genre in user_genres}
        wanted.discard('')
        if not wanted:
            return 0.0
        listed = {part.lower().strip() for part in movie_genres.split(',')}
        return len(wanted & listed) / len(wanted)

    # ------------------------------------------------------------------
    # Setting handling
    # ------------------------------------------------------------------
    def _mentioned_settings(self, user_description):
        """Return compiled keyword patterns for each setting the user mentions."""
        normalized = self._normalize(user_description)
        mentioned = []
        for keywords in self.settings.values():
            patterns = [self._compile_term(word) for word in keywords]
            if any(pattern.search(normalized) for pattern in patterns):
                mentioned.append(patterns)
        return mentioned

    def _score_overview_settings(self, mentioned, movie_overview):
        """Score one overview against pre-computed setting keyword patterns."""
        if not mentioned:
            return 0.0
        normalized = self._normalize(movie_overview)
        total = 0.0
        for patterns in mentioned:
            hits = sum(1 for pattern in patterns if pattern.search(normalized))
            total += hits / len(patterns)
        return total / len(mentioned)

    def get_setting_match_score(self, user_description, movie_overview):
        """Score how well a movie overview matches the user's settings.

        For every setting in ``self.settings`` that the user mentions, the
        share of that setting's keywords found in the overview is computed;
        the result is the mean of those shares.

        Args:
            user_description: Free-text user description.
            movie_overview: Overview text of one movie (converted with
                ``str``, so missing values simply produce no matches).

        Returns:
            Float in ``[0.0, 1.0]``; ``0.0`` when the user mentions no
            known setting.
        """
        mentioned = self._mentioned_settings(user_description)
        return self._score_overview_settings(mentioned, movie_overview)

    # ------------------------------------------------------------------
    # Recommendation
    # ------------------------------------------------------------------
    def get_recommendations(self, user_description, n_recommendations=5):
        """Return the best-matching movies for a free-text description.

        Args:
            user_description: What the user is looking for.
            n_recommendations: Maximum number of movies to return. Values
                of zero or less yield an empty list.

        Returns:
            List of dicts with the keys ``title``, ``year``, ``genre``,
            ``rating``, ``overview`` and ``similarity_score``, ordered from
            the highest to the lowest blended score.
        """
        # A plain ``[-n:]`` slice would return *every* movie for n == 0.
        if n_recommendations <= 0:
            return []

        # Clean and vectorize user input
        cleaned_input = self._normalize(user_description)
        user_vector = self.vectorizer.transform([cleaned_input])
        tfidf_scores = cosine_similarity(user_vector, self.tfidf_matrix).flatten()

        n_movies = len(self.df)
        genre_scores = np.zeros(n_movies)
        setting_scores = np.zeros(n_movies)

        # The user's genres and settings do not depend on the movie, so they
        # are extracted once instead of once per row.
        user_genres = self.extract_genres(user_description)
        if user_genres:
            genre_scores = np.fromiter(
                (self.get_genre_match_score(user_genres, genres)
                 for genres in self.df['Genre']),
                dtype=float, count=n_movies,
            )

        mentioned = self._mentioned_settings(user_description)
        if mentioned:
            setting_scores = np.fromiter(
                (self._score_overview_settings(mentioned, overview)
                 for overview in self.df['Overview']),
                dtype=float, count=n_movies,
            )

        # Combine scores with weights
        final_scores = (
            tfidf_scores * self.TFIDF_WEIGHT +
            genre_scores * self.GENRE_WEIGHT +
            setting_scores * self.SETTING_WEIGHT
        )

        # Stable sort on the negated scores: best first, ties broken by the
        # original row order so results are deterministic.
        top_indices = np.argsort(-final_scores, kind='stable')[:n_recommendations]

        recommendations = []
        for idx in top_indices:
            movie = self.df.iloc[idx]
            recommendations.append({
                'title': movie['Series_Title'],
                'year': movie['Released_Year'],
                'genre': movie['Genre'],
                'rating': movie['IMDB_Rating'],
                'overview': movie['Overview'],
                'similarity_score': round(float(final_scores[idx]), 3)
            })

        return recommendations

# Main Function:

In [45]:
def main():
    """Interactively ask for a movie preference and print five matches.

    Loads ``imdb_top_1000.csv`` from the current working directory, reads
    one line of free text from standard input and prints the title, year,
    genre, rating, blended similarity score and a shortened overview of
    each recommendation.
    """
    # Path to the IMDb-style CSV file
    data_path = 'imdb_top_1000.csv'
    # Longest overview excerpt that is printed per movie
    max_overview_chars = 150

    recommender = MovieRecommender(data_path)

    # Ask the user to enter their preference
    user_input = input("Enter your movie preferences: ").strip()
    if not user_input:
        # Nothing to match against; every score would be zero.
        print("No preferences entered; nothing to recommend.")
        return

    recommendations = recommender.get_recommendations(user_input, n_recommendations=5)

    # Print 5 recommendations:
    print(f"\nRecommendations for: '{user_input}'\n")
    for i, rec in enumerate(recommendations, 1):
        # A missing overview comes back as NaN (a float), which cannot be sliced.
        overview = rec['overview'] if isinstance(rec['overview'], str) else ''
        if len(overview) > max_overview_chars:
            # Only mark the text as shortened when it really was.
            overview = overview[:max_overview_chars] + '...'

        print(f"{i}. {rec['title']} ({rec['year']}) - {rec['genre']}")
        print(f"   Rating: {rec['rating']}")
        print(f"   Similarity Score: {rec['similarity_score']}")
        print(f"   Overview: {overview}")
        print()

# Run the interactive demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Enter your movie preferences: I love action and science fiction movies.

Recommendations for: 'I love action and science fiction movies.'

1. Captain America: Civil War (2016) - Action, Adventure, Sci-Fi
   Rating: 7.8
   Similarity Score: 0.315
   Overview: Political involvement in the Avengers' affairs causes a rift between Captain America and Iron Man....

2. Serenity (2005) - Action, Adventure, Sci-Fi
   Rating: 7.8
   Similarity Score: 0.315
   Overview: The crew of the ship Serenity try to evade an assassin sent to recapture one of their members who is telepathic....

3. Edge of Tomorrow (2014) - Action, Adventure, Sci-Fi
   Rating: 7.9
   Similarity Score: 0.314
   Overview: A soldier fighting aliens gets to relive the same day over and over again, the day restarting every time he dies....

4. Predator (1987) - Action, Adventure, Sci-Fi
   Rating: 7.8
   Similarity Score: 0.314
   Overview: A team of commandos on a mission in a Central American jungle find themselves hunted by a