In [42]:
# Import Library
import os
import requests
import pandas as pd
import json
from dotenv import load_dotenv
from datetime import datetime

# import tiktoken
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores import FAISS


In [2]:
# Load environment variables from .env
load_dotenv()

# Access the OpenAI API key
OPENAI_API_KEY = os.getenv("MY_OPENAI_KEY")
TMDB_API_KEY = os.getenv("TMDB_API_KEY")

In [3]:
def fetch_data(user):
    if user == 'movie':
        url = "https://api.themoviedb.org/3/movie/now_playing?language=en-US&page=1"
        info_columns = ['title', 'release_date', 'genre', 'poster_path','popularity']
    elif user == 'tv':
        url = "https://api.themoviedb.org/3/tv/airing_today?language=en-US&page=1"
        info_columns = ['name', 'first_air_date', 'genre', 'poster_path','popularity']
    else:
        raise ValueError("User must be 'movie' or 'tv'")

    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {TMDB_API_KEY}"
    }

    response = requests.get(url, headers=headers)
    
    # Check if the request was successful
    if response.status_code == 200:
        # Parse the JSON response
        data = response.json()
        
        # Extract the list of movies or TV shows from the response
        results = data.get('results', [])
        
        # Create a DataFrame
        df = pd.DataFrame(results)

        # Add a new column 'genre' based on genre_ids
        genre_mapping = {
            28: 'Action',
            12: 'Adventure',
            16: 'Animation',
            35: 'Comedy',
            80: 'Crime',
            99: 'Documentary',
            18: 'Drama',
            10751: 'Family',
            14: 'Fantasy',
            36: 'History',
            27: 'Horror',
            10402: 'Music',
            9648: 'Mystery',
            10749: 'Romance',
            878: 'Science Fiction',
            10770: 'TV Movie',
            53: 'Thriller',
            10752: 'War',
            37: 'Western',
            10759: 'Action & Adventure',
            10762: 'Kids',
            10763: 'News',
            10764: 'Reality',
            10765: 'Sci-Fi & Fantasy',
            10766: 'Soap',
            10767: 'Talk',
            10768: 'War & Politics'  
        }
        
        df['genre'] = df['genre_ids'].apply(lambda x: [genre_mapping.get(genre_id, 'Unknown') for genre_id in x])
        
        # Create combined_info column based on user type
        df['combined_info'] = df.apply(lambda row: f"{info_columns[0]}: {row[info_columns[0]]}, {info_columns[1]}: {row[info_columns[1]]},{info_columns[2]}: {row[info_columns[2]]}, {info_columns[3]}: {row[info_columns[3]]}, {info_columns[4]}: {row[info_columns[4]]}", axis=1)
        
        return df
    
    else:
        print(f"Error fetching data. Status code: {response.status_code}")
        return None

In [4]:
# Example usage:
user = 'movie'  # Replace with 'tv' for TV shows
df = fetch_data(user)

df.head()

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count,genre,combined_info
0,False,/stKGOm8UyhuLPR9sZLjs5AkmncA.jpg,"[16, 10751, 18, 12, 35]",1022789,en,Inside Out 2,Teenager Riley's mind headquarters is undergoi...,4974.9,/vpnVM9B6NMmQpWeZvzLvDESb2QY.jpg,2024-06-11,Inside Out 2,False,7.636,143,"[Animation, Family, Drama, Adventure, Comedy]","title: Inside Out 2, release_date: 2024-06-11,..."
1,False,/fqv8v6AycXKsivp1T5yKtLbGXce.jpg,"[878, 12, 28]",653346,en,Kingdom of the Planet of the Apes,Several generations in the future following Ca...,3704.838,/gKkl37BQuKTanygYQG1pyYgLVgf.jpg,2024-05-08,Kingdom of the Planet of the Apes,False,6.8,1050,"[Science Fiction, Adventure, Action]","title: Kingdom of the Planet of the Apes, rele..."
2,False,/gRApXuxWmO2forYTuTmcz5RaNUV.jpg,"[28, 80, 53, 35]",573435,en,Bad Boys: Ride or Die,"After their late former Captain is framed, Low...",2749.931,/nP6RliHjxsz4irTKsxe8FRhKZYl.jpg,2024-06-05,Bad Boys: Ride or Die,False,7.049,276,"[Action, Crime, Thriller, Comedy]","title: Bad Boys: Ride or Die, release_date: 20..."
3,False,/hliXekHv7xc2cgXnMBLlp4Eihq8.jpg,"[53, 27, 28, 9648]",1001311,fr,Sous la Seine,"In the Summer of 2024, Paris is hosting the Wo...",2971.905,/qZPLK5ktRKa3CL4sKRZtj8UlPYc.jpg,2024-06-05,Under Paris,False,5.806,465,"[Thriller, Horror, Action, Mystery]","title: Under Paris, release_date: 2024-06-05,g..."
4,False,/z121dSTR7PY9KxKuvwiIFSYW8cf.jpg,"[10752, 28, 18]",929590,en,Civil War,"In the near future, a group of war journalists...",1557.962,/sh7Rg8Er3tFcN9BpKIPOMvALgZd.jpg,2024-04-10,Civil War,False,7.019,1537,"[War, Action, Drama]","title: Civil War, release_date: 2024-04-10,gen..."


In [5]:
# Example usage:
user = 'tv'  # Replace with 'tv' for TV shows
df = fetch_data(user)

df.head()

Unnamed: 0,adult,backdrop_path,genre_ids,id,origin_country,original_language,original_name,overview,popularity,poster_path,first_air_date,name,vote_average,vote_count,genre,combined_info
0,False,/sQTyBUMfaTGDmwTofvY3jbSv2BP.jpg,"[18, 9648, 35]",236033,[CN],zh,墨雨云间,"This drama tells the story of Xue Fangfei, the...",2991.444,/2tZoc93g8DXTWbQjj328xCb9Ei7.jpg,2024-06-02,The Double,7.0,1,"[Drama, Mystery, Comedy]","name: The Double, first_air_date: 2024-06-02,g..."
1,False,/s92ZvqE4gWOrZq60AKvhgZZxDY4.jpg,[10764],240909,[CO],es,La Casa de los Famosos Colombia,,2509.482,/lCn94cscNlFToXIbIWRSGBqb20Q.jpg,2024-02-11,La Casa de los Famosos Colombia,6.28,25,[Reality],"name: La Casa de los Famosos Colombia, first_a..."
2,False,/d1aFl8AQ3dFMsGbeF5wk6BOUwq8.jpg,[],209374,[US],es,Top Chef VIP,,2733.846,/cw6M4c2MpLSzqzmrrqpSJlEbwCF.jpg,2022-08-09,Top Chef VIP,4.5,6,[],"name: Top Chef VIP, first_air_date: 2022-08-09..."
3,False,/butPVWgcbtAjL9Z7jU7Xj1KA8KD.jpg,"[10767, 35]",22980,[US],en,Watch What Happens Live with Andy Cohen,Bravo network executive Andy Cohen discusses p...,1972.754,/onSD9UXfJwrMXWhq7UY7hGF2S1h.jpg,2009-07-16,Watch What Happens Live with Andy Cohen,4.896,53,"[Talk, Comedy]","name: Watch What Happens Live with Andy Cohen,..."
4,False,/ohJTnu93hJ0Uonl86Wn3mOSlWXN.jpg,"[10751, 35, 18]",91759,[HK],cn,愛·回家之開心速遞,"Hung Sue Gan starting from the bottom, establi...",1018.451,/lgD4j9gUGmMckZpWWRJjorWqGVT.jpg,2017-02-06,Come Home Love: Lo and Behold,5.4,36,"[Family, Comedy, Drama]","name: Come Home Love: Lo and Behold, first_air..."


In [6]:
# Load Processed Dataset and textsplitting
def loader(data):
    loader = DataFrameLoader(df, page_content_column="combined_info")
    docs  = loader.load()
    # Document splitting
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_documents(docs)
    return texts


In [7]:
# embeddings model
def model_embedding():
    # Define the path to the pre-trained model you want to use
    modelPath = "sentence-transformers/all-MiniLM-l6-v2"

    # Create a dictionary with model configuration options, specifying to use the CPU for computations
    model_kwargs = {'device':'cpu'}

    # Create a dictionary with encoding options, specifically setting 'normalize_embeddings' to False
    encode_kwargs = {'normalize_embeddings': False}

    # Initialize an instance of HuggingFaceEmbeddings with the specified parameters
    embeddings = HuggingFaceEmbeddings(
        model_name=modelPath,     # Provide the pre-trained model's path
        model_kwargs=model_kwargs, # Pass the model configuration options
        encode_kwargs=encode_kwargs # Pass the encoding options
)
    return embeddings

In [8]:
# Vector DB
vectorstore  = FAISS.from_documents(loader(df), model_embedding())

  from tqdm.autonotebook import tqdm, trange







In [45]:
# Set up the memory
memory = ConversationBufferMemory()

In [71]:
def setup_chain(openai_api_key, vectorstore):
    MOVIE_TVSHOW_QUESTION_CREATOR_TEMPLATE = """
    Given a conversation history about movies or TV shows, reformulate the user's last message to make it easier to search from a database.
    For example, if the user requests a Movie/ TV show recommendation, answer with the format 
    - Title
    - Release Date / First Aired
    - Genre
    - Popularity
    - Poster
    You shouldn't change the language of the question, just reformulate it. If it is not needed to reformulate the question or it is not a question, just output the same text.
       
    ### Conversation History ###
    {chat_history}

    Last Message: {question}
    Reformulated Question:"""

    condense_question_prompt = PromptTemplate.from_template(MOVIE_TVSHOW_QUESTION_CREATOR_TEMPLATE)
    chain = ConversationalRetrievalChain.from_llm(
    llm=ChatOpenAI(openai_api_key=openai_api_key, 
                        model_name="gpt-3.5-turbo", temperature=0),
                        retriever=vectorstore.as_retriever(),
                        condense_question_llm=ChatOpenAI(model_name="gpt-3.5-turbo"),
                        condense_question_prompt=condense_question_prompt,
                        verbose=True,
    )
    return chain

In [72]:
def run_chain(chain, query, chat_history):
    result = chain.run(question=query, chat_history=chat_history)
    return result

In [76]:
chat_history = ""
query = "looking for comedy tv show"

In [77]:
chain = setup_chain(OPENAI_API_KEY, vectorstore)

In [78]:
# Run the chain with the query and chat history
result = run_chain(chain, query=query, chat_history=chat_history)
print(result)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
This is the content of document 2.

This is the content of document 1.

This is the content of document 3.
Human: looking for comedy tv show[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m
I don't know.


In [24]:
query = "What about drama"
chain = setup_chain(OPENAI_API_KEY, vectorstore)
result = run_chain(chain, query=query, chat_history=chat_history)
print(result)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
name: The Tale of Rose, first_air_date: 2024-06-08,genre: ['Drama'], poster_path: /5Xjv2eIPSmTUHgMzzQVELZZXbPf.jpg, popularity: 779.953

name: The Double, first_air_date: 2024-06-02,genre: ['Drama', 'Mystery', 'Comedy'], poster_path: /2tZoc93g8DXTWbQjj328xCb9Ei7.jpg, popularity: 2991.444

name: Newscast, first_air_date: 2020-02-06,genre: ['News', 'Talk'], poster_path: /amqSeiSsVitgY8uVIOwu3eKSs3H.jpg, popularity: 466.008

name: Come Home Love: Lo and Behold, first_air_date: 2017-02-06,genre: ['Family', 'Comedy', 'Drama'], poster_path: /lgD4j9gUGmMckZpWWRJjorWqGVT.jpg, popularity: 1018.451
Human: What about drama[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m
He

In [69]:
result

"I don't have enough information to recommend a specific drama movie. Can you provide more details or preferences to help me suggest a suitable drama movie for you?"