# You can get the dataset from here
https://www.kaggle.com/datasets/octopusteam/imdb-top-1000-tv-series

# The imports 

In [7]:
import pandas as pd
from langchain_google_genai import GoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from typing import List, Set
import pickle


# Data inspection and cleaning

In [3]:
# Read the dataset CSV (expected in the current working directory).
df = pd.read_csv(filepath_or_buffer='data.csv')

In [9]:
# Count the missing values in each column.
df.isna().sum()

id               0
title            0
genres           0
averageRating    0
numVotes         0
releaseYear      0
dtype: int64

In [10]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,id,title,genres,averageRating,numVotes,releaseYear
0,tt0903747,Breaking Bad,"Crime, Drama, Thriller",9.5,2222701,2008
1,tt5491994,Planet Earth II,Documentary,9.5,161347,2016
2,tt0185906,Band of Brothers,"Drama, History, War",9.4,543177,2001
3,tt0795176,Planet Earth,"Documentary, Family",9.4,222871,2006
4,tt5152226,Tree of Life,Drama,9.4,11663,2014


In [11]:
# Drop the 'id' column. errors='ignore' keeps this cell re-runnable: without it
# a second execution raises KeyError because 'id' has already been removed.
df.drop(columns=['id'], inplace=True, errors='ignore')

# Initialization of the LLM

In [None]:
def initialize_llm(api_key: str):
    """Create a GoogleGenerativeAI client for the Gemini 1.5 Flash model.

    Args:
        api_key: Google API key, forwarded to the client as ``google_api_key``.

    Returns:
        A ``GoogleGenerativeAI`` instance configured with the model name
        ``models/gemini-1.5-flash``.
    """
    llm_settings = {
        "model": "models/gemini-1.5-flash",
        "google_api_key": api_key,
    }
    return GoogleGenerativeAI(**llm_settings)

# Creating prompt templates

In [None]:
def create_prompt_templates():
    """Build the two prompt templates used by the recommender.

    Returns:
        A ``(genre_prompt, recommendation_prompt)`` tuple:

        * ``genre_prompt`` expects ``user_input`` and ``available_genres`` and
          asks for a comma-separated list of matching genre names.
        * ``recommendation_prompt`` expects ``user_input`` and
          ``filtered_content`` and asks for a conversational recommendation.
    """
    # Template that maps a free-form request onto the known genre vocabulary.
    genre_template = """
        Given the following user input: {user_input}
        
        And these available genres: {available_genres}
        
        Please suggest the most relevant genres from the available list that match the user's input.
        Return only the genre names separated by commas, without any additional explanation.
        If no genres match, return "No matching genres found".
        """

    # Template that turns the pre-filtered catalogue into a final answer.
    recommendation_template = """
        Based on the user's request: {user_input}
        
        And the following available content:
        {filtered_content}
        
        Please provide a natural, conversational response that includes:
        1. Relevant titles that match their interests
        2. Release years for each recommendation
        3. A brief explanation of why each title matches their request
        
        Focus on being helpful and informative while maintaining a natural tone.
        If no content matches, explain that no exact matches were found and suggest broadening the search.
        """

    genre_prompt = PromptTemplate(
        input_variables=["user_input", "available_genres"],
        template=genre_template,
    )
    recommendation_prompt = PromptTemplate(
        input_variables=["user_input", "filtered_content"],
        template=recommendation_template,
    )
    return genre_prompt, recommendation_prompt

# Extracting genres, getting suggested genres <br>and filtering content

In [None]:
def extract_genres(df: pd.DataFrame) -> Set[str]:
    """Extract the unique genre names from the ``genres`` column.

    Each cell is treated as a comma-separated list. Names are stripped of
    surrounding whitespace; non-string cells (e.g. missing values) and empty
    fragments (from trailing or doubled commas, or whitespace-only cells) are
    skipped so that ``''`` never ends up in the result.

    Args:
        df: DataFrame with a ``genres`` column of comma-separated strings.

    Returns:
        The set of distinct, non-empty genre names.
    """
    split_genres = df['genres'].str.split(',').explode()
    # Missing cells survive split/explode as non-string values; ignore them.
    stripped = (genre.strip() for genre in split_genres if isinstance(genre, str))
    # Drop empty fragments such as the one produced by "Drama, ".
    return {genre for genre in stripped if genre}

def get_suggested_genres(user_input: str, genres_set: Set[str], genre_chain: LLMChain) -> List[str]:
    """Ask the genre chain which known genres match the user's request.

    The chain is run with the user input and the sorted, comma-joined genre
    vocabulary. Its reply is parsed tolerantly: tokens may be separated by
    commas or newlines, matching is case-insensitive, and surrounding quotes,
    markdown markers or a trailing period are ignored. Only names present in
    ``genres_set`` are returned, spelled as they appear there.

    Args:
        user_input: Free-form text describing what the user wants.
        genres_set: The set of valid genre names.
        genre_chain: Object exposing ``run(user_input=..., available_genres=...)``
            that returns the model's reply as a string.

    Returns:
        Matching genre names in the order the model listed them, without
        duplicates. Empty when nothing matches or the model answers
        "No matching genres found".
    """
    available_genres = sorted(genres_set)
    response = genre_chain.run(user_input=user_input, available_genres=", ".join(available_genres))

    # Characters that models commonly wrap around an answer.
    decoration = " \t\r\n.\"'`*"
    if response.strip(decoration).casefold() == "no matching genres found":
        return []

    # Case-insensitive lookup back to the canonical spelling.
    by_folded_name = {genre.casefold(): genre for genre in available_genres}

    matched: List[str] = []
    # Treat newlines as separators too, since replies are not always one line.
    for raw_token in response.replace("\n", ",").split(","):
        token = raw_token.strip()
        if token in genres_set:
            # An exact match always wins, preserving the original behaviour.
            genre = token
        else:
            genre = by_folded_name.get(token.strip(decoration).casefold())
        if genre is not None and genre not in matched:
            matched.append(genre)
    return matched

def filter_content(df: pd.DataFrame, genres: List[str]) -> pd.DataFrame:
    """Keep the rows whose ``genres`` cell contains at least one wanted genre.

    Args:
        df: DataFrame with a ``genres`` column of comma-separated strings.
        genres: Genre names to look for; surrounding whitespace is ignored.

    Returns:
        The matching rows of ``df``. When ``genres`` is empty, an empty frame
        that still carries ``df``'s columns. Rows whose ``genres`` cell is not
        a string (e.g. a missing value) never match.
    """
    if not genres:
        # Empty slice rather than pd.DataFrame() so the column schema is kept.
        return df.iloc[0:0]

    # Normalise the wanted names once instead of once per row.
    wanted = {genre.strip() for genre in genres}

    def has_wanted_genre(cell) -> bool:
        # Missing values reach here as non-strings and cannot be split.
        if not isinstance(cell, str):
            return False
        return any(part.strip() in wanted for part in cell.split(','))

    # astype(bool) keeps the mask boolean even when df has no rows.
    mask = df['genres'].apply(has_wanted_genre).astype(bool)
    return df[mask]

# Formatting content for the prompt <br>and getting recommendations

In [None]:
def format_content_for_prompt(filtered_df: pd.DataFrame) -> str:
    """Render the filtered rows as a text block for the recommendation prompt.

    Args:
        filtered_df: Rows to describe; must provide the ``title``, ``genres``,
            ``releaseYear``, ``averageRating`` and ``numVotes`` columns.

    Returns:
        One multi-line entry per row, entries separated by a blank line, or
        ``"No matching content found."`` when the frame is empty.
    """
    if filtered_df.empty:
        return "No matching content found."

    entries = []
    for _, record in filtered_df.iterrows():
        title, genres = record['title'], record['genres']
        year = record['releaseYear']
        rating, votes = record['averageRating'], record['numVotes']
        entries.append(
            f"Title: {title}\nGenres: {genres}\nRelease Year: {year}\n"
            f"Rating: {rating} (from {votes} votes)\n"
        )
    return "\n".join(entries)

def get_recommendations(user_input: str, df: pd.DataFrame, genres_set: Set[str], genre_chain: LLMChain, recommendation_chain: LLMChain) -> str:
    """Produce a recommendation text for a free-form user request.

    The request is first mapped to known genres via ``genre_chain``, the
    catalogue is narrowed to those genres, and the formatted result is handed
    to ``recommendation_chain`` together with the original request.

    Args:
        user_input: Free-form text describing what the user wants.
        df: Catalogue to recommend from.
        genres_set: The set of valid genre names.
        genre_chain: Chain used to pick matching genres.
        recommendation_chain: Chain used to write the final answer.

    Returns:
        Whatever ``recommendation_chain.run`` returns for the request.
    """
    matching_rows = filter_content(
        df,
        get_suggested_genres(user_input, genres_set, genre_chain),
    )
    prompt_inputs = {
        "user_input": user_input,
        "filtered_content": format_content_for_prompt(matching_rows),
    }
    return recommendation_chain.run(**prompt_inputs)

# Running example

In [8]:
import os

# Initialize LLM and prompts.
# Prefer the GOOGLE_API_KEY environment variable so a real key never has to be
# written into the notebook; the placeholder below is only a fallback.
api_key = os.environ.get("GOOGLE_API_KEY", "YOUR_API_KEY")  # Set the env var or replace the placeholder
llm = initialize_llm(api_key)
genre_prompt, recommendation_prompt = create_prompt_templates()
genre_chain = LLMChain(llm=llm, prompt=genre_prompt)
recommendation_chain = LLMChain(llm=llm, prompt=recommendation_prompt)

# Extract the genre vocabulary from the DataFrame loaded earlier in the notebook.
genres_set = extract_genres(df)

# Example usage
example_queries = [
    "I want to watch something random"
]

for query in example_queries:
    print(f"\nQuery: {query}")
    recommendations = get_recommendations(query, df, genres_set, genre_chain, recommendation_chain)
    print("Recommendations:")
    print(recommendations)
    print("-" * 50)



Query: I want to watch something random
Recommendations:
You're looking for something random, huh?  That's a tough one!  Let's see...  Since you didn't specify any genres, I'm going to recommend a mix of things to keep it interesting. 

How about **"The Office" (2005)**? It's a classic comedy that's always sure to make you laugh. Or maybe you'd prefer **"Breaking Bad" (2008)**, a gripping crime drama that'll keep you on the edge of your seat. If you're in the mood for something a bit more lighthearted, **"Bluey" (2018)** is a hilarious animated show perfect for the whole family. 

And if you're looking for something truly out there, **"The Filthy Frank Show" (2011)** is a wild and chaotic comedy that's sure to be a unique experience. 

If none of these sound appealing, you could try broadening your search by specifying some genres you're interested in or telling me a little more about what you're looking for.  Happy watching! 

--------------------------------------------------


# Saving the genres and the processed DataFrame

In [11]:
# Persist the genre vocabulary so it can be reloaded without the CSV.
with open('genres_set.pkl', mode='wb') as genres_file:
    pickle.Pickler(genres_file).dump(genres_set)

# Persist the cleaned DataFrame alongside it.
df.to_pickle(path='movies_df.pkl')