# RAG

## 0. Environment set up

In [1]:
# Switch to home project directory
%cd ../..

/Users/alejandro.medrano/Projects/UNITE_TALKING_POINTS


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
import langchain
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores.faiss import FAISS
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain import hub

import faiss
import os
import json
import pandas as pd

from tqdm.notebook import tqdm
from typing import TypedDict

import seaborn as sns
import matplotlib.pyplot as plt

from src.unite_talking_points.utils.config.config_loader import ConfigLoader

In [3]:
config = ConfigLoader().load_config(current_directory_is_root=True)

In [4]:
config.keys()

KeysView(<configparser.ConfigParser object at 0x1084a44f0>)

In [5]:
os.environ["OPENAI_API_KEY"] = config['External-services']['openai_api_key']

## 1. Data loading

In [7]:
def load_json_data(data_dir):
    """
    Load JSON data from the specified directory into a pandas DataFrame.

    Args:
    - data_dir (str): Path to the directory containing JSON files.

    Returns:
    - df (pd.DataFrame): DataFrame containing the loaded JSON data.
    """
    # Initialize empty lists to store data
    file_names = []
    labels = []
    document_names = []
    meeting_names = []
    meeting_dates = []
    contents = []
    prompts = []

    # Iterate over each JSON file in the directory
    for filename in os.listdir(data_dir):
        if filename.endswith('.json'):
            with open(os.path.join(data_dir, filename), 'r') as file:
                data = json.load(file)
                # Extract data from each JSON file and append to lists
                file_names.append(filename)
                labels.append(data['label'])
                document_names.append(data['document_name'])
                meeting_names.append(data['meeting_name'])
                meeting_dates.append(data['meeting_date'])
                contents.append(data['content'])
                prompts.append(data['prompt'])

    # Create a DataFrame from the lists
    df = pd.DataFrame({
        'file_name': file_names,
        'label': labels,
        'document_name': document_names,
        'meeting_name': meeting_names,
        'meeting_date': meeting_dates,
        'content': contents,
        'prompt': prompts
    })

    return df

In [8]:
df = load_json_data(config['Directories']['raw_data_path'])

In [9]:
df

Unnamed: 0,file_name,label,document_name,meeting_name,meeting_date,content,prompt
0,PSGD 2023-03-06 Talking Points for USG DOS on ...,ai,Talking Points for USG DOS on Generative AI,Senior Manamement Group,09-03-2023,Generative Artificial Intelligence is a form o...,Generate talking points addressing the recent ...
1,BN DOS USG on Implementation of Digital Transf...,peacekeeping,Briefing Note for USG DOS,Implementation of the Peacekeeping Digital Tra...,01-06-2022,DOS is a partner in the digital transformation...,Create talking points highlighting the partner...
2,2022-12-08 DRAFT Remarks for 7th Pannel ICT Af...,digital_transformation,Remarks for ASG OICT,7th Panel: Information and Communications Tech...,08-12-2022,Acknowledgment of the crucial moment for ICT i...,Create a draft for a keynote address by a UN o...
3,2022-10-14 764 CXOTalk ASG OICT FINAL digital ...,digital_transformation,CITO Interview CXOTalk Show,"CXOTALK EPISODE #764, DIGITAL TECHNOLOGIES & ...",14-10-2022,Role of Technology at the UN: Technology plays...,Generate talking points on how technology inte...
4,2022-08-24 TECH ENVOY TPs - CITO cybersecurity...,cybersecurity,Talking Points for ASG OICT,Meeting with the incoming Special Envoy for Te...,24-09-2022,"Trust in the integrity, reliability and securi...",Generate talking points about the importance o...
5,2022-08-24 TECH ENVOY TPs - CITO ai.json,ai,Talking Points for ASG OICT,Meeting with the incoming Special Envoy for Te...,24-08-2022,Welcome!\nI would very much like to see contin...,Please draft a message welcoming collaboration...
6,2022-03-02 Talking Points for USG DMSPC Our Co...,data_strategy,Talking Points for USG DMSPC - Our Common Agenda,,02-03-2022,OICT continues to support business entities in...,Generate talking points on the ongoing efforts...
7,2023-09-15 Opening Remarks UNGS ScienceSummit ...,data_strategy,Opening Remarks for ASG OICT,UNGA Science Summit,12-09-2023,Emphasis on the importance of science as a cen...,Generate a set of strategic talking points for...
8,SG remarks to the Security Council on Artifici...,ai,SG remarks to the Security Council on Artifici...,SG remarks to the Security Council on Artifici...,01-09-2023,"AI has reached unprecedented speed and reach, ...",Generate a comprehensive overview of the curre...
9,2022-10-14 764 CXOTalk ASG OICT FINAL ict stra...,ict_strategy,CITO Interview CXOTalk Show,"CXOTALK EPISODE #764, DIGITAL TECHNOLOGIES & T...",14-10-2022,Role and Functions of the ICT Office: The Chie...,Generate a detailed overview of the Informatio...


## 2. Data preprocessing 

In [10]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)
texts = text_splitter.create_documents(df['content'])
splits = text_splitter.split_documents(texts)

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
vectorstore = FAISS.from_documents(texts, embedding=embeddings)
retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

  prompt = loads(json.dumps(prompt_object.manifest))


## 3. Model

In [11]:
llm = ChatOpenAI(model_name="gpt-3.5-turbo")

In [12]:
from langchain.prompts import PromptTemplate

custom_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
        You are an assistant specialized in creating Talking Points for United Nations. 
        Use the given context to create a the Talking Point.
        The output should contain bullet points on each Talking Point.
        If the context does not contain enough information, indicate that clearly.
        
        Question: {question}
        Context: {context}
        Answer:
    """
)

In [13]:
df.iloc[0].prompt

'Generate talking points addressing the recent breakthroughs in Generative Artificial Intelligence, highlighting its capabilities in producing human-like text, images, and videos. Discuss its applications in various sectors such as content creation, translation, and user interface enhancement. Emphasize the involvement of organizations like OpenAI and Microsoft, detailing their investments and integration of Generative AI into products like Microsoft Teams and search engines. Delve into the potential benefits of Generative AI in streamlining tasks for organizations like the UN, while acknowledging the risks associated with the technology, such as the generation of biased or misleading content, and its susceptibility to cyber-security threats and malicious manipulation. Consider the current need for human oversight in utilizing Generative AI and speculate on future trends as the technology continues to evolve.'

In [14]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_prompt
    | llm
    | StrOutputParser()
)

answer = rag_chain.invoke(df.iloc[0].prompt)
print(answer)

- Generative Artificial Intelligence (AI) has made significant breakthroughs in creating human-like text, images, and videos, showcasing its capabilities in various sectors.
- Organizations like OpenAI and Microsoft have heavily invested in Generative AI, leading to its integration into products like Microsoft Teams and search engines, offering features such as automatic meeting summarization.
- Generative AI has the potential to streamline tasks for organizations like the UN, with applications including automated translation and content creation.
- However, there are risks associated with Generative AI, such as the generation of biased or misleading content, and susceptibility to cyber-security threats and manipulation by malicious users.
- Human oversight is currently crucial in utilizing Generative AI, but as the technology evolves, there may be a shift towards more automated processes.
- The UN can benefit from the advancements in Generative AI to enhance its efficiency in tasks li

## 4. Experiments

In [15]:
def setup_langchain(contents, llm_model="gpt-4o-mini"):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )
    texts = text_splitter.create_documents(contents)
    splits = text_splitter.split_documents(texts)
    
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    vectorstore = FAISS.from_documents(texts, embedding=embeddings)
    retriever = vectorstore.as_retriever()

    custom_prompt = PromptTemplate(
        input_variables=["context", "question"],
        template="""
            You are an assistant specialized in creating Talking Points for United Nations. 
            Use the given context to create a the Talking Point.
            The output should contain bullet points on each Talking Point.
            If the context does not contain enough information, indicate that clearly.
            
            Question: {question}
            Context: {context}
            Answer:
        """
    )

    llm = ChatOpenAI(model_name=llm_model)

    chain = rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | custom_prompt
        | llm
        | StrOutputParser()
    )

    return chain
    

### 4.1 Models

In [19]:
# Function to conduct n experiments without averaging results
def conduct_experiment(df, model="gpt-4o-mini", n=3):
    results = []

    # Initialize the model and the chain
    for i in tqdm(range(len(df))):
        # Remove the document from the embedding
        mask = df.index.isin([1])
        contents = df[~mask].content
        
        chain = setup_langchain(contents, model)
        
        prompt = df.iloc[i].prompt
        content = df['content'][i]
        file_name = df['file_name'][i]

        # Conduct n experiments
        for experiment_num in tqdm(range(n), leave=False):
            # Generate talking point
            generated_talking_point = chain.invoke(prompt)

            # Store result
            # tppi_result = tppi.calculate_tppi(content, generated_talking_point)
            
            # Add metadata fields to each experiment result
            result = {}
            result['file_name'] = file_name
            result['model'] = model
            result['experiment_number'] = experiment_num + 1
            result['content'] = content
            result['prompt'] = prompt
            result['generated_doc'] = generated_talking_point

            results.append(result)

    return results

In [20]:
# Run experiments
models = ['gpt-3.5-turbo', 'gpt-4o-mini', 'gpt-4o']
all_results = []

for model in models:
    all_results.append(conduct_experiment(df, model, 3))

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [21]:
flattened_data = [item for sublist in all_results for item in sublist]
all_results = pd.DataFrame(flattened_data)

In [23]:
all_results.to_csv('notebooks/RAG/model_exp.csv')

### 4.2 Temperatures

In [31]:
def setup_langchain(contents, llm_model="gpt-4o-mini", temperature=0.5):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )
    texts = text_splitter.create_documents(contents)
    splits = text_splitter.split_documents(texts)
    
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    vectorstore = FAISS.from_documents(texts, embedding=embeddings)
    retriever = vectorstore.as_retriever()

    custom_prompt = PromptTemplate(
        input_variables=["context", "question"],
        template="""
            You are an assistant specialized in creating Talking Points for United Nations. 
            Use the given context to create a the Talking Point.
            The output should contain bullet points on each Talking Point.
            If the context does not contain enough information, indicate that clearly.
            
            Question: {question}
            Context: {context}
            Answer:
        """
    )

    llm = ChatOpenAI(model_name=llm_model, temperature=temperature)

    chain = rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | custom_prompt
        | llm
        | StrOutputParser()
    )

    return chain

In [32]:
# Function to conduct n experiments without averaging results
def conduct_experiment(df, model="gpt-4o-mini", temperature=0.5, n=3):
    results = []

    # Initialize the model and the chain
    for i in tqdm(range(len(df))):
        # Remove the document from the embedding
        mask = df.index.isin([1])
        contents = df[~mask].content
        
        chain = setup_langchain(contents, model, temperature)
        
        prompt = df.iloc[i].prompt
        content = df['content'][i]
        file_name = df['file_name'][i]

        # Conduct n experiments
        for experiment_num in tqdm(range(n), leave=False):
            # Generate talking point
            generated_talking_point = chain.invoke(prompt)

            # Store result
            # tppi_result = tppi.calculate_tppi(content, generated_talking_point)
            
            # Add metadata fields to each experiment result
            result = {}
            result['file_name'] = file_name
            result['model'] = model
            result['temperature'] = temperature
            result['experiment_number'] = experiment_num + 1
            result['content'] = content
            result['prompt'] = prompt
            result['generated_doc'] = generated_talking_point

            results.append(result)

    return results

In [34]:
# Run experiments
temperatures = [0, 0.5, 1, 1.5, 2]
all_results = []

for temperature in temperatures:
    all_results.append(conduct_experiment(df, model='gpt-4o', temperature=temperature, n=3))

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [35]:
flattened_data = [item for sublist in all_results for item in sublist]
all_results = pd.DataFrame(flattened_data)

In [36]:
all_results.to_csv('notebooks/RAG/temperature_exp.csv')

## 5. Examples:

### 5.1 Doc 17

In [20]:
df_aux = df.drop(17)

In [21]:
texts = text_splitter.create_documents(df_aux['content'])
splits = text_splitter.split_documents(texts)

embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(splits, embedding=embeddings)
retriever = vectorstore.as_retriever()

In [26]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke(df.iloc[17].prompt)

"The presentation will focus on the UN Technology Partnership for Digital Transformation, highlighting the collaboration between the private sector, governments, and the United Nations to enhance global digital transformation efforts, despite challenges in organizational languages and ethical concerns. The successful cooperation during the pandemic with the WHO will be referenced, aligning with the UN Secretary-General's 'Common Agenda' and the new 'Quintet of Change' initiative. Emphasis will be placed on integrating UN values in digital ecosystems, updating the UN's approach to leverage digital advancements, and ensuring ethical technology use upholding human rights, gender equality, and sustainable development, with anticipated remarks on next steps for the 'Quintet of Change' from a representative of the Secretary-General's office."

### 5.2 Doc 28

In [28]:
df_aux = df.drop(28)

In [29]:
texts = text_splitter.create_documents(df_aux['content'])
splits = text_splitter.split_documents(texts)

embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(splits, embedding=embeddings)
retriever = vectorstore.as_retriever()

In [30]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke(df.iloc[28].prompt)

"The keynote address should focus on the integral role of science in achieving the United Nations Sustainable Development Goals (SDGs) and the importance of innovation for future generations, emphasizing data inclusion and global access to scientific data through open access policies. It should explore science's pivotal role in addressing global challenges like climate change, pandemics, inequality, and technological advancements, and advocate for supportive policies and regulatory environments to enhance global science collaboration. Additionally, the address should discuss the ongoing data revolution, the impact of AI advancements on industries, potential benefits and risks associated with AI, and highlight the United Nations' initiatives in AI governance, particularly the Global Digital Compact planned for the 2024 Summit of the Future."

### 5.3 Doc 25

In [34]:
df_aux = df.drop(25)

In [35]:
texts = text_splitter.create_documents(df_aux['content'])
splits = text_splitter.split_documents(texts)

embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(splits, embedding=embeddings)
retriever = vectorstore.as_retriever()

In [36]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke(df.iloc[25].prompt)

"The symposium focuses on accelerating the digital transformation of peacekeeping operations to align with the 'UN 2.0' vision. Key themes include the role of new technologies in enhancing peacekeeping effectiveness, potential risks of Generative AI, and the importance of partnerships and technological advancements. Emphasis is placed on eco-responsibility, telemedicine, and data-driven operations to improve peacekeeping outcomes."

In [38]:
df.iloc[25].prompt

"Generate a briefing note outlining the objectives and key themes for an upcoming symposium focused on the digital transformation of peacekeeping operations. The symposium seeks to align with the UN Secretary-General’s vision for a 'UN 2.0,' emphasizing the integration of new and emerging technologies to enhance peacekeeping mandates. Discuss the role of the Office of Information and Communications Technology in adapting to these changes, the potential impacts of Generative AI, and how these technologies can improve peacekeeping effectiveness. Highlight existing digital initiatives and propose new partnerships and technological advancements. Include talking points on the importance of eco-responsibility, telemedicine, and data-driven operations."