In [67]:
# Import Library
import os
import requests
import pandas as pd
import json
from dotenv import load_dotenv
from datetime import datetime

# import tiktoken
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores import FAISS


In [2]:
# Load environment variables from .env into the process environment.
load_dotenv()

# Read the API credentials from the environment. os.getenv returns None when a
# variable is not set, so a missing key only surfaces later, at first use.
# The OpenAI key is looked up under the custom name MY_OPENAI_KEY.
openai_api_key = os.getenv("MY_OPENAI_KEY")
# TMDB token; fetch_data sends it as a Bearer token in the Authorization header.
TMDB_API_KEY = os.getenv("TMDB_API_KEY")

In [53]:
# TMDB genre id -> display name. Holds both the movie ids and the TV-only ids
# (10759 and up) so a single table serves both listings.
_TMDB_GENRE_NAMES = {
    28: 'Action',
    12: 'Adventure',
    16: 'Animation',
    35: 'Comedy',
    80: 'Crime',
    99: 'Documentary',
    18: 'Drama',
    10751: 'Family',
    14: 'Fantasy',
    36: 'History',
    27: 'Horror',
    10402: 'Music',
    9648: 'Mystery',
    10749: 'Romance',
    878: 'Science Fiction',
    10770: 'TV Movie',
    53: 'Thriller',
    10752: 'War',
    37: 'Western',
    10759: 'Action & Adventure',
    10762: 'Kids',
    10763: 'News',
    10764: 'Reality',
    10765: 'Sci-Fi & Fantasy',
    10766: 'Soap',
    10767: 'Talk',
    10768: 'War & Politics',
}


def _genre_names(genre_ids):
    """Map a list of TMDB genre ids to names; anything that is not a list yields []."""
    if not isinstance(genre_ids, (list, tuple)):
        return []
    return [_TMDB_GENRE_NAMES.get(genre_id, 'Unknown') for genre_id in genre_ids]


def fetch_data(user, timeout=10):
    """Fetch the first page of a TMDB listing and return it as a DataFrame.

    Args:
        user: 'movie' for the "now playing" movie listing or 'tv' for the
            "airing today" TV listing.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection cannot hang the notebook cell indefinitely.

    Returns:
        A DataFrame built from the response's ``results`` list with two extra
        columns: ``genre`` (list of genre names derived from ``genre_ids``) and
        ``combined_info`` (one "key: value, ..." string per row, used later as
        the document text). An empty ``results`` list yields an empty frame
        that still has those columns. Returns None (after printing the status
        code) when the response status is not 200.

    Raises:
        ValueError: If ``user`` is neither 'movie' nor 'tv'.
    """
    if user == 'movie':
        url = "https://api.themoviedb.org/3/movie/now_playing?language=en-US&page=1"
        info_columns = ['title', 'release_date', 'genre', 'poster_path', 'popularity']
    elif user == 'tv':
        url = "https://api.themoviedb.org/3/tv/airing_today?language=en-US&page=1"
        info_columns = ['name', 'first_air_date', 'genre', 'poster_path', 'popularity']
    else:
        raise ValueError("User must be 'movie' or 'tv'")

    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {TMDB_API_KEY}"
    }

    response = requests.get(url, headers=headers, timeout=timeout)

    # Keep the existing contract for failed requests: report and return None.
    if response.status_code != 200:
        print(f"Error fetching data. Status code: {response.status_code}")
        return None

    results = response.json().get('results', [])
    df = pd.DataFrame(results)

    # An empty listing has no columns at all; return a frame that still has the
    # columns downstream cells access instead of failing on df['genre_ids'].
    if df.empty:
        return pd.DataFrame(columns=info_columns + ['genre_ids', 'combined_info'])

    # Tolerate a payload where no row carries genre_ids.
    if 'genre_ids' not in df.columns:
        df['genre_ids'] = None
    df['genre'] = df['genre_ids'].apply(_genre_names)

    # Make sure every field used in the summary exists, even if the API omitted it.
    for column in info_columns:
        if column not in df.columns:
            df[column] = None

    # One uniformly ", "-separated "key: value" summary per row.
    df['combined_info'] = df.apply(
        lambda row: ", ".join(f"{column}: {row[column]}" for column in info_columns),
        axis=1,
    )

    return df

In [31]:
# Example usage: fetch the "now playing" movie listing and preview it.
# fetch_data returns None on a non-200 response, in which case df.head() below
# raises AttributeError.
user = 'movie'  # Replace with 'tv' for TV shows
df = fetch_data(user)

# Show the first rows, including the derived 'genre' and 'combined_info' columns.
df.head()

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count,genre,combined_info
0,False,/stKGOm8UyhuLPR9sZLjs5AkmncA.jpg,"[16, 10751, 18, 12, 35]",1022789,en,Inside Out 2,Teenager Riley's mind headquarters is undergoi...,3553.5,/vpnVM9B6NMmQpWeZvzLvDESb2QY.jpg,2024-06-11,Inside Out 2,False,7.643,136,"[Animation, Family, Drama, Adventure, Comedy]","title: Inside Out 2, release_date: 2024-06-11,..."
1,False,/fqv8v6AycXKsivp1T5yKtLbGXce.jpg,"[878, 12, 28]",653346,en,Kingdom of the Planet of the Apes,Several generations in the future following Ca...,3327.202,/gKkl37BQuKTanygYQG1pyYgLVgf.jpg,2024-05-08,Kingdom of the Planet of the Apes,False,6.82,1048,"[Science Fiction, Adventure, Action]","title: Kingdom of the Planet of the Apes, rele..."
2,False,/hliXekHv7xc2cgXnMBLlp4Eihq8.jpg,"[53, 27, 28, 9648]",1001311,fr,Sous la Seine,"In the Summer of 2024, Paris is hosting the Wo...",2692.396,/qZPLK5ktRKa3CL4sKRZtj8UlPYc.jpg,2024-06-05,Under Paris,False,5.789,455,"[Thriller, Horror, Action, Mystery]","title: Under Paris, release_date: 2024-06-05, ..."
3,False,/gRApXuxWmO2forYTuTmcz5RaNUV.jpg,"[28, 80, 53, 35]",573435,en,Bad Boys: Ride or Die,"After their late former Captain is framed, Low...",2520.868,/nP6RliHjxsz4irTKsxe8FRhKZYl.jpg,2024-06-05,Bad Boys: Ride or Die,False,7.04,273,"[Action, Crime, Thriller, Comedy]","title: Bad Boys: Ride or Die, release_date: 20..."
4,False,/z121dSTR7PY9KxKuvwiIFSYW8cf.jpg,"[10752, 28, 18]",929590,en,Civil War,"In the near future, a group of war journalists...",1346.112,/sh7Rg8Er3tFcN9BpKIPOMvALgZd.jpg,2024-04-10,Civil War,False,7.019,1528,"[War, Action, Drama]","title: Civil War, release_date: 2024-04-10, ge..."


In [54]:
# Example usage: fetch the "airing today" TV listing and preview it.
# NOTE: this rebinds the notebook-global `df`, replacing the movie listing
# fetched by the earlier example cell.
user = 'tv'  # Replace with 'movie' for movies
df = fetch_data(user)

# Show the first rows, including the derived 'genre' and 'combined_info' columns.
df.head()

Unnamed: 0,adult,backdrop_path,genre_ids,id,origin_country,original_language,original_name,overview,popularity,poster_path,first_air_date,name,vote_average,vote_count,genre,combined_info
0,False,/sQTyBUMfaTGDmwTofvY3jbSv2BP.jpg,"[18, 9648, 35]",236033,[CN],zh,墨雨云间,"This drama tells the story of Xue Fangfei, the...",2991.444,/2tZoc93g8DXTWbQjj328xCb9Ei7.jpg,2024-06-02,The Double,7.0,1,"[Drama, Mystery, Comedy]","name: The Double, first_air_date: 2024-06-02,g..."
1,False,/s92ZvqE4gWOrZq60AKvhgZZxDY4.jpg,[10764],240909,[CO],es,La Casa de los Famosos Colombia,,2509.482,/lCn94cscNlFToXIbIWRSGBqb20Q.jpg,2024-02-11,La Casa de los Famosos Colombia,6.28,25,[Reality],"name: La Casa de los Famosos Colombia, first_a..."
2,False,/d1aFl8AQ3dFMsGbeF5wk6BOUwq8.jpg,[],209374,[US],es,Top Chef VIP,,2733.846,/cw6M4c2MpLSzqzmrrqpSJlEbwCF.jpg,2022-08-09,Top Chef VIP,4.5,6,[],"name: Top Chef VIP, first_air_date: 2022-08-09..."
3,False,/butPVWgcbtAjL9Z7jU7Xj1KA8KD.jpg,"[10767, 35]",22980,[US],en,Watch What Happens Live with Andy Cohen,Bravo network executive Andy Cohen discusses p...,1972.754,/onSD9UXfJwrMXWhq7UY7hGF2S1h.jpg,2009-07-16,Watch What Happens Live with Andy Cohen,4.896,53,"[Talk, Comedy]","name: Watch What Happens Live with Andy Cohen,..."
4,False,/ohJTnu93hJ0Uonl86Wn3mOSlWXN.jpg,"[10751, 35, 18]",91759,[HK],cn,愛·回家之開心速遞,"Hung Sue Gan starting from the bottom, establi...",1018.451,/lgD4j9gUGmMckZpWWRJjorWqGVT.jpg,2017-02-06,Come Home Love: Lo and Behold,5.4,36,"[Family, Comedy, Drama]","name: Come Home Love: Lo and Behold, first_air..."


In [58]:
# Load Processed Dataset and textsplitting
def loader(data, chunk_size=1000, chunk_overlap=200):
    """Turn a DataFrame with a ``combined_info`` column into split documents.

    Args:
        data: DataFrame whose ``combined_info`` column holds the text of each
            document (as produced by ``fetch_data``).
        chunk_size: Forwarded to ``CharacterTextSplitter`` as ``chunk_size``.
        chunk_overlap: Forwarded to ``CharacterTextSplitter`` as ``chunk_overlap``.

    Returns:
        Whatever ``CharacterTextSplitter.split_documents`` returns for the
        documents loaded from ``data``.
    """
    # Load from the argument rather than the notebook-global `df`, so the result
    # depends only on what the caller passed in.
    frame_loader = DataFrameLoader(data, page_content_column="combined_info")
    docs = frame_loader.load()

    # Document splitting
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(docs)


In [60]:
# embeddings model
def model_embedding():
    """Create the HuggingFace embedding model used to vectorize documents.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-l6-v2`` model, ``device='cpu'`` as
        its model option and ``normalize_embeddings=False`` as its encode option.
    """
    return HuggingFaceEmbeddings(
        # Identifier of the pre-trained sentence-transformers model.
        model_name="sentence-transformers/all-MiniLM-l6-v2",
        # Model configuration: run computations on the CPU.
        model_kwargs=dict(device='cpu'),
        # Encoding options: leave the embeddings un-normalized.
        encode_kwargs=dict(normalize_embeddings=False),
    )

In [68]:
# Vector DB: build a FAISS index from the documents produced by loader(df),
# embedded with the model returned by model_embedding().
# `df` is the notebook-global frame, i.e. whichever listing was fetched last.
vectorstore  = FAISS.from_documents(loader(df), model_embedding())

  from tqdm.autonotebook import tqdm, trange





modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# TODO: write about alternatives.
# Prompt text that chain() turns into a PromptTemplate and passes as the
# condense-question prompt. It has two placeholders: {chat_history} for the
# conversation so far and {question} for the user's last message.
QUESTION_CREATOR_TEMPLATE = """Given a conversation history, reformulate the question to make it easier to search from a database. 
For example, if the AI says "Do you want to know the current weather in Istanbul?", and the user answer as "yes" then the AI should reformulate the question as "What is the current weather in Istanbul?".
You shouldn't change the language of the question, just reformulate it. If it is not needed to reformulate the question or it is not a question, just output the same text.
### Conversation History ###
{chat_history}

Last Message: {question}
Reformulated Question:"""

In [None]:
def chain(retriever=None):
    """Build the conversational retrieval chain over the notebook's vector store.

    Args:
        retriever: Optional retriever for the chain to search with. When
            omitted, one is created from the module-level ``vectorstore`` via
            ``as_retriever()``.

    Returns:
        The object returned by ``ConversationalRetrievalChain.from_llm``,
        configured with a gpt-3.5-turbo answer model (temperature 0), a
        gpt-3.5-turbo question-condensing model, and the prompt built from
        ``QUESTION_CREATOR_TEMPLATE``.
    """
    if retriever is None:
        # No `retriever` is defined anywhere else in the notebook; derive it
        # from the FAISS store so calling chain() with no arguments works.
        retriever = vectorstore.as_retriever()

    condense_question_prompt = PromptTemplate.from_template(QUESTION_CREATOR_TEMPLATE)

    # The key is loaded from the custom MY_OPENAI_KEY variable, so hand it to
    # the models explicitly rather than relying on their own environment lookup.
    answer_llm = ChatOpenAI(
        model_name="gpt-3.5-turbo",
        temperature=0,
        openai_api_key=openai_api_key,
    )
    condense_llm = ChatOpenAI(
        model_name="gpt-3.5-turbo",
        openai_api_key=openai_api_key,
    )

    return ConversationalRetrievalChain.from_llm(
        llm=answer_llm,
        retriever=retriever,
        condense_question_llm=condense_llm,
        condense_question_prompt=condense_question_prompt,
        verbose=True,
    )