In [None]:
import os
import re
import tempfile

import whisper
from dotenv import load_dotenv
from langchain.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from pytube import YouTube


In [None]:
# Load environment variables (python-dotenv), then look up the API keys
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

# Path of the transcription text file
data_to_read = "data/transcription.txt"


In [None]:
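# # Optional YouTube transcription step (disabled). When uncommented, it downloads the
# # video's audio and transcribes it with Whisper, but only if `data_to_read` does not exist yet.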
# YOUTUBE_VIDEO = "https://www.youtube.com/watch?v=YM81g6XoZzI"
# if not os.path.exists(data_to_read):
#     youtube = YouTube(YOUTUBE_VIDEO)
#     audio = youtube.streams.filter(only_audio=True).first()
#     whisper_model = whisper.load_model("large-v2")

#     with tempfile.TemporaryDirectory() as tmpdir:
#         file = audio.download(output_path=tmpdir)
#         transcription = whisper_model.transcribe(file, fp16=False)["text"].strip()

#         with open(data_to_read, "w") as file:
#             file.write(transcription)

In [None]:
# Read the transcription with an explicit UTF-8 encoding so non-ASCII characters
# decode the same way on every platform (open()'s default is locale-dependent).
# NOTE(review): assumes the file was written as UTF-8 - confirm.
with open(data_to_read, encoding="utf-8") as file:
    transcription = file.read()


In [None]:
# Step 2: Split the text manually into chunks of at most `chunk_size` characters.
# Sentences are detected by whitespace that follows ".", "?" or "!".
single_sentences_list = re.split(r"(?<=[.?!])\s+", transcription)

chunks = []
chunk_size = 10000
chunk = ""
for sentence in single_sentences_list:
    sentence = sentence.strip()
    if not sentence:
        # Empty pieces appear for empty input or trailing whitespace; skipping
        # them avoids emitting blank chunks.
        continue
    if not chunk:
        # First sentence of a chunk: accept it even when it is longer than
        # chunk_size on its own, instead of flushing an empty chunk first.
        chunk = sentence
    elif len(chunk) + 1 + len(sentence) <= chunk_size:
        # The "+ 1" accounts for the joining space, so the limit is applied
        # the same way to every chunk.
        chunk = f"{chunk} {sentence}"
    else:
        chunks.append(chunk)
        chunk = sentence
if chunk:
    chunks.append(chunk)


In [None]:
# Step 3: Create the embedding model for the chunks
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Create the documents with SemanticChunker (for the example): every manual
# chunk is split again by the embedding-based splitter and wrapped as a document.
text_splitter = SemanticChunker(embeddings=embeddings)
docs = text_splitter.create_documents(chunks)


In [None]:
# Preview the first ten generated documents
docs[:10]

In [None]:
# Step 4: Load the embeddings into Pinecone
index_name = "youtube-rag-index-semantic"

# Start from a clean index so that re-running this cell does not accumulate
# duplicate vectors. The delete is only issued when the index reports stored
# vectors: on serverless indexes a delete_all against a namespace that was
# never written to is rejected with a "namespace not found" error, which would
# break the very first run.
pinecone_index = PineconeVectorStore.get_pinecone_index(index_name)
if pinecone_index.describe_index_stats().total_vector_count > 0:
    pinecone_index.delete(delete_all=True)

# Embed the documents and store them in the index. No API key is passed here;
# the client is expected to pick it up from the PINECONE_API_KEY environment
# variable - TODO confirm for the installed langchain_pinecone version.
pinecone = PineconeVectorStore.from_documents(
    docs,
    embedding=embeddings,
    index_name=index_name,
)


In [None]:
# Sanity check: ask the vector store directly for the three closest documents
# (k=3) instead of fetching the default number of results and slicing.
pinecone.similarity_search("Como se llama el entrevistado?", k=3)


In [None]:
# Step 5: Configure the chat model and the LangChain prompt for the query.
# The model id must be one the OpenAI API knows: "gpt-4-mini" does not exist
# and makes every request fail, "gpt-4o-mini" is the valid identifier.
model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model="gpt-4o-mini")
template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

def format_docs(documents):
    """Join the text of the retrieved documents into one context string.

    Without this step the prompt is filled with the ``repr`` of a list of
    ``Document`` objects rather than with their plain text.

    Args:
        documents: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The page contents separated by blank lines.
    """
    return "\n\n".join(document.page_content for document in documents)


# Build the query chain: the input question is sent to the retriever (10 most
# similar documents, flattened to text) and is also passed through unchanged
# as the "question" prompt variable.
retriever = pinecone.as_retriever(search_type="similarity", search_kwargs={"k": 10})
chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | model
    # | parser  # Uncomment if you want to use an output parser
)

# Run the query. The chain takes the question string directly; the retriever
# supplies the context. Alternative question to try:
# chain.invoke("Cómo se llama el entrevistado?")
chain.invoke("De qué trabaja el entrevistado?")
