# RAG

## 0. Environment setup

In [None]:
# Switch to the project root (two levels above the starting working directory) so that
# the `src.` package import and the relative output paths used below resolve.
%cd ../..

In [None]:
import langchain
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores.faiss import FAISS
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain import hub

import faiss
import os
import json
import pandas as pd

from tqdm.notebook import tqdm
from typing import TypedDict

import seaborn as sns
import matplotlib.pyplot as plt

from src.unite_talking_points.utils.config.config_loader import ConfigLoader

In [None]:
# Load the project configuration.
# NOTE(review): `current_directory_is_root=True` presumably tells the loader to resolve
# paths from the current working directory — confirm against ConfigLoader.
config = ConfigLoader().load_config(current_directory_is_root=True)

In [None]:
# Display the top-level keys of the loaded configuration.
config.keys()

In [None]:
# Export the OpenAI API key from the config as an environment variable; the
# langchain_openai clients created below are not given a key explicitly.
os.environ["OPENAI_API_KEY"] = config['External-services']['openai_api_key']

## 1. Data loading

In [None]:
def load_json_data(data_dir):
    """
    Load JSON data from the specified directory into a pandas DataFrame.

    Every ``*.json`` file directly inside ``data_dir`` becomes one row. Files are
    read in sorted file-name order so that row positions are reproducible between
    runs and machines (``os.listdir`` returns entries in arbitrary order), and
    they are decoded as UTF-8 regardless of the platform's default encoding.

    Args:
    - data_dir (str): Path to the directory containing JSON files.

    Returns:
    - df (pd.DataFrame): One row per JSON file with the columns ``file_name``,
      ``label``, ``document_name``, ``meeting_name``, ``meeting_date``,
      ``content`` and ``prompt``. The columns are present even when the
      directory contains no JSON files.

    Raises:
    - KeyError: If a JSON file does not contain one of the expected keys.
    """
    # Keys copied verbatim from each JSON document into the row.
    fields = ['label', 'document_name', 'meeting_name', 'meeting_date', 'content', 'prompt']

    records = []
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith('.json'):
            continue
        with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as file:
            data = json.load(file)
        records.append({'file_name': filename, **{field: data[field] for field in fields}})

    # Passing `columns` keeps the schema stable for an empty `records` list.
    return pd.DataFrame(records, columns=['file_name'] + fields)

In [None]:
# Load every raw JSON document from the configured raw-data directory into one DataFrame.
df = load_json_data(config['Directories']['raw_data_path'])

In [None]:
# Display the loaded documents.
df

## 2. Data preprocessing

In [None]:
# Splitter shared by the cells below: chunks of at most 512 characters (measured
# with `len`) and a configured overlap of 100 characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)
# `create_documents` returns the Document objects that are indexed below.
texts = text_splitter.create_documents(df['content'])

# Embed the documents and index them in a FAISS vector store.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
vectorstore = FAISS.from_documents(texts, embedding=embeddings)
retriever = vectorstore.as_retriever()
# Generic RAG prompt pulled from the LangChain hub; the "Examples" cells use it.
prompt = hub.pull("rlm/rag-prompt")


def format_docs(docs):
    """Join the ``page_content`` of every document, separated by one blank line."""
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

## 3. Model

In [None]:
# Chat model used by the module-level chains in this notebook (the single-query demo
# and the "Examples" section); the experiment helpers build their own model instance.
llm = ChatOpenAI(model_name="gpt-3.5-turbo")

In [None]:
# Imported here rather than in the import cell at the top of the notebook.
from langchain.prompts import PromptTemplate

# Task-specific prompt: `{context}` receives the formatted retrieved chunks and
# `{question}` receives the text the chain is invoked with.
custom_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
        You are an assistant specialized in creating Talking Points for United Nations. 
        Use the given context to create a the Talking Point.
        The output should contain bullet points on each Talking Point.
        If the context does not contain enough information, indicate that clearly.
        
        Question: {question}
        Context: {context}
        Answer:
    """
)

In [None]:
# Show the prompt stored with the first document; the next cell uses it as the query.
df.iloc[0].prompt

In [None]:
# Build the chain one stage at a time: retrieve and format the context while passing
# the question through unchanged, fill the custom prompt, call the chat model, and
# finally reduce the model output to a plain string.
rag_chain = {"context": retriever | format_docs, "question": RunnablePassthrough()} | custom_prompt
rag_chain = rag_chain | llm
rag_chain = rag_chain | StrOutputParser()

# Single demo query using the prompt stored with the first document.
answer = rag_chain.invoke(df.iloc[0].prompt)
print(answer)

## 4. Experiments

In [None]:
def setup_langchain(contents, llm_model="gpt-4o-mini"):
    """
    Build a retrieval-augmented generation chain over the given texts.

    The texts are chunked, embedded with "text-embedding-3-large", indexed in a
    FAISS vector store, and wired into a chain that retrieves context for a
    question, fills the Talking Points prompt, calls the chat model and returns
    the answer as a string.

    Relies on `format_docs` and `PromptTemplate` being defined/imported in
    earlier notebook cells.

    Args:
    - contents: Iterable of raw document texts to index.
    - llm_model (str): Name of the OpenAI chat model used for generation.

    Returns:
    - chain: Runnable; ``chain.invoke(question)`` returns the generated text.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )
    # `create_documents` returns the Document objects that get indexed; no
    # further splitting pass is applied to them.
    texts = text_splitter.create_documents(contents)

    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    vectorstore = FAISS.from_documents(texts, embedding=embeddings)
    retriever = vectorstore.as_retriever()

    custom_prompt = PromptTemplate(
        input_variables=["context", "question"],
        template="""
            You are an assistant specialized in creating Talking Points for United Nations. 
            Use the given context to create a the Talking Point.
            The output should contain bullet points on each Talking Point.
            If the context does not contain enough information, indicate that clearly.
            
            Question: {question}
            Context: {context}
            Answer:
        """
    )

    llm = ChatOpenAI(model_name=llm_model)

    # The question is passed through unchanged while the retriever supplies the context.
    chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | custom_prompt
        | llm
        | StrOutputParser()
    )

    return chain
    

### 4.1 Models

In [None]:
def conduct_experiment(df, model="gpt-4o-mini", n=3):
    """
    Generate `n` talking points per document with a leave-one-out index.

    For each row of `df`, a fresh chain is built from the `content` of all
    OTHER rows, so the document being queried is never available to the
    retriever. The row's `prompt` is then sent to the chain `n` times and every
    generation is kept as its own record (results are not averaged).

    Rows are addressed by position throughout, so the function does not depend
    on `df` having a default integer index.

    Args:
    - df (pd.DataFrame): Must provide the columns `file_name`, `content` and
      `prompt`. NOTE(review): with a single row the leave-one-out corpus is
      empty — confirm how `setup_langchain` behaves in that case.
    - model (str): Chat model name forwarded to `setup_langchain`.
    - n (int): Number of generations per document.

    Returns:
    - results (list[dict]): One dict per generation with the keys `file_name`,
      `model`, `experiment_number` (1-based), `content`, `prompt` and
      `generated_doc`.
    """
    results = []

    for i in tqdm(range(len(df))):
        row = df.iloc[i]

        # Leave-one-out: index every document except the one at position i.
        other_positions = [pos for pos in range(len(df)) if pos != i]
        contents = df.iloc[other_positions].content

        # The index (including its embeddings) is rebuilt for each held-out document.
        chain = setup_langchain(contents, model)

        prompt = row['prompt']
        content = row['content']
        file_name = row['file_name']

        for experiment_num in tqdm(range(n), leave=False):
            generated_talking_point = chain.invoke(prompt)

            # TODO(review): scoring hook left disabled in the original; `tppi` is not
            # defined in this notebook:
            # tppi_result = tppi.calculate_tppi(content, generated_talking_point)

            results.append({
                'file_name': file_name,
                'model': model,
                'experiment_number': experiment_num + 1,
                'content': content,
                'prompt': prompt,
                'generated_doc': generated_talking_point,
            })

    return results

In [None]:
# Run experiments: one list of result records per model.
models = ['gpt-3.5-turbo', 'gpt-4o-mini', 'gpt-4o']
all_results = []

for model in models:
    # Pass `n` by keyword: the later redefinition of `conduct_experiment` takes
    # `temperature` as its third positional parameter, so a positional 3 would be
    # interpreted as a temperature if that definition is the active one.
    all_results.append(conduct_experiment(df, model, n=3))

In [None]:
# Merge the per-model result lists into one flat list of records, then tabulate it.
flattened_data = []
for sublist in all_results:
    flattened_data.extend(sublist)
all_results = pd.DataFrame(flattened_data)

In [None]:
# Persist the model-comparison results; the path is relative to the working directory.
all_results.to_csv('notebooks/RAG/model_exp.csv')

### 4.2 Temperatures

In [None]:
def setup_langchain(contents, llm_model="gpt-4o-mini", temperature=0.5):
    """
    Build a retrieval-augmented generation chain over the given texts.

    The texts are chunked, embedded with "text-embedding-3-large", indexed in a
    FAISS vector store, and wired into a chain that retrieves context for a
    question, fills the Talking Points prompt, calls the chat model and returns
    the answer as a string.

    Relies on `format_docs` and `PromptTemplate` being defined/imported in
    earlier notebook cells.

    Args:
    - contents: Iterable of raw document texts to index.
    - llm_model (str): Name of the OpenAI chat model used for generation.
    - temperature (float): Sampling temperature passed to the chat model.

    Returns:
    - chain: Runnable; ``chain.invoke(question)`` returns the generated text.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )
    # `create_documents` returns the Document objects that get indexed; no
    # further splitting pass is applied to them.
    texts = text_splitter.create_documents(contents)

    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    vectorstore = FAISS.from_documents(texts, embedding=embeddings)
    retriever = vectorstore.as_retriever()

    custom_prompt = PromptTemplate(
        input_variables=["context", "question"],
        template="""
            You are an assistant specialized in creating Talking Points for United Nations. 
            Use the given context to create a the Talking Point.
            The output should contain bullet points on each Talking Point.
            If the context does not contain enough information, indicate that clearly.
            
            Question: {question}
            Context: {context}
            Answer:
        """
    )

    llm = ChatOpenAI(model_name=llm_model, temperature=temperature)

    # The question is passed through unchanged while the retriever supplies the context.
    chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | custom_prompt
        | llm
        | StrOutputParser()
    )

    return chain

In [None]:
def conduct_experiment(df, model="gpt-4o-mini", temperature=0.5, n=3):
    """
    Generate `n` talking points per document with a leave-one-out index.

    For each row of `df`, a fresh chain is built from the `content` of all
    OTHER rows, so the document being queried is never available to the
    retriever. The row's `prompt` is then sent to the chain `n` times and every
    generation is kept as its own record (results are not averaged).

    Rows are addressed by position throughout, so the function does not depend
    on `df` having a default integer index.

    Args:
    - df (pd.DataFrame): Must provide the columns `file_name`, `content` and
      `prompt`. NOTE(review): with a single row the leave-one-out corpus is
      empty — confirm how `setup_langchain` behaves in that case.
    - model (str): Chat model name forwarded to `setup_langchain`.
    - temperature (float): Sampling temperature forwarded to `setup_langchain`.
    - n (int): Number of generations per document.

    Returns:
    - results (list[dict]): One dict per generation with the keys `file_name`,
      `model`, `temperature`, `experiment_number` (1-based), `content`,
      `prompt` and `generated_doc`.
    """
    results = []

    for i in tqdm(range(len(df))):
        row = df.iloc[i]

        # Leave-one-out: index every document except the one at position i.
        other_positions = [pos for pos in range(len(df)) if pos != i]
        contents = df.iloc[other_positions].content

        # The index (including its embeddings) is rebuilt for each held-out document.
        chain = setup_langchain(contents, model, temperature)

        prompt = row['prompt']
        content = row['content']
        file_name = row['file_name']

        for experiment_num in tqdm(range(n), leave=False):
            generated_talking_point = chain.invoke(prompt)

            # TODO(review): scoring hook left disabled in the original; `tppi` is not
            # defined in this notebook:
            # tppi_result = tppi.calculate_tppi(content, generated_talking_point)

            results.append({
                'file_name': file_name,
                'model': model,
                'temperature': temperature,
                'experiment_number': experiment_num + 1,
                'content': content,
                'prompt': prompt,
                'generated_doc': generated_talking_point,
            })

    return results

In [None]:
# Run experiments: sweep the sampling temperature with the model fixed to gpt-4o,
# three generations per document; one list of result records per temperature.
temperatures = [0, 0.5, 1, 1.5, 2]
all_results = []

for temperature in temperatures:
    all_results.append(conduct_experiment(df, model='gpt-4o', temperature=temperature, n=3))

In [None]:
# Merge the per-temperature result lists into one flat list of records, then tabulate it.
flattened_data = []
for sublist in all_results:
    flattened_data.extend(sublist)
all_results = pd.DataFrame(flattened_data)

In [None]:
# Persist the temperature-sweep results; the path is relative to the working directory.
all_results.to_csv('notebooks/RAG/temperature_exp.csv')

## 5. Examples:

### 5.1 Doc 17

In [None]:
# Hold out the row labelled 17 so its content is not indexed. It is queried below via
# `df.iloc[17]` (positional); label and position coincide for a default integer index.
df_aux = df.drop(17)

In [None]:
# Chunk the remaining documents with the notebook-level `text_splitter`.
texts = text_splitter.create_documents(df_aux['content'])
splits = text_splitter.split_documents(texts)

# NOTE(review): no `model` is passed here, so the library default embedding model is
# used, whereas the rest of the notebook requests "text-embedding-3-large" — confirm
# this difference is intended.
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(splits, embedding=embeddings)
retriever = vectorstore.as_retriever()

In [None]:
# Build the chain one stage at a time, using the notebook-level `prompt` and `llm`:
# retrieve and format the context, fill the prompt, call the model, return a string.
rag_chain = {"context": retriever | format_docs, "question": RunnablePassthrough()} | prompt
rag_chain = rag_chain | llm
rag_chain = rag_chain | StrOutputParser()

# Query with the prompt of the held-out document (position 17).
rag_chain.invoke(df.iloc[17].prompt)

### 5.2 Doc 28

In [None]:
# Hold out the row labelled 28 so its content is not indexed. It is queried below via
# `df.iloc[28]` (positional); label and position coincide for a default integer index.
df_aux = df.drop(28)

In [None]:
# Chunk the remaining documents with the notebook-level `text_splitter`.
texts = text_splitter.create_documents(df_aux['content'])
splits = text_splitter.split_documents(texts)

# NOTE(review): no `model` is passed here, so the library default embedding model is
# used, whereas the rest of the notebook requests "text-embedding-3-large" — confirm
# this difference is intended.
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(splits, embedding=embeddings)
retriever = vectorstore.as_retriever()

In [None]:
# Build the chain one stage at a time, using the notebook-level `prompt` and `llm`:
# retrieve and format the context, fill the prompt, call the model, return a string.
rag_chain = {"context": retriever | format_docs, "question": RunnablePassthrough()} | prompt
rag_chain = rag_chain | llm
rag_chain = rag_chain | StrOutputParser()

# Query with the prompt of the held-out document (position 28).
rag_chain.invoke(df.iloc[28].prompt)

### 5.3 Doc 25

In [None]:
# Hold out the row labelled 25 so its content is not indexed. It is queried below via
# `df.iloc[25]` (positional); label and position coincide for a default integer index.
df_aux = df.drop(25)

In [None]:
# Chunk the remaining documents with the notebook-level `text_splitter`.
texts = text_splitter.create_documents(df_aux['content'])
splits = text_splitter.split_documents(texts)

# NOTE(review): no `model` is passed here, so the library default embedding model is
# used, whereas the rest of the notebook requests "text-embedding-3-large" — confirm
# this difference is intended.
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(splits, embedding=embeddings)
retriever = vectorstore.as_retriever()

In [None]:
# Build the chain one stage at a time, using the notebook-level `prompt` and `llm`:
# retrieve and format the context, fill the prompt, call the model, return a string.
rag_chain = {"context": retriever | format_docs, "question": RunnablePassthrough()} | prompt
rag_chain = rag_chain | llm
rag_chain = rag_chain | StrOutputParser()

# Query with the prompt of the held-out document (position 25).
rag_chain.invoke(df.iloc[25].prompt)

In [None]:
# Show the prompt that was used as the query for the document at position 25.
df.iloc[25].prompt