In [None]:
pip install langchain sentence-transformers tqdm qdrant-client pandas numpy

In [None]:
pip install langchain_community

In [None]:
pip install -U langchain-openai

In [None]:
!pip install --upgrade openai

In [None]:
# Importando bibliotecas necessárias
import os
from langchain_openai import OpenAI
from langchain.chains import StuffDocumentsChain
from langchain.prompts import PromptTemplate
from langchain.chains.llm import LLMChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import qdrant_client
from qdrant_client.models import PointStruct, VectorParams, Distance
import pandas as pd
import numpy as np

  from tqdm.autonotebook import tqdm, trange


# ==========================================
# 1. Carregamento de Dados
# ==========================================

In [None]:
# Upload files from the local machine (uses the Colab-specific `files` helper)
# NOTE(review): `uploaded` is not referenced again in the visible cells; the
# next cell reads a hardcoded path instead — confirm the two stay in sync.
from google.colab import files
uploaded = files.upload()

In [None]:
# Load the posts spreadsheet.
# NOTE(review): the path is hardcoded to the Colab upload directory; adjust it
# when running anywhere else.
file_path = '/content/cpp_t1_cleaned_cpp_posts_t2.csv'

# The CSV was saved together with its positional index, which otherwise comes
# back as a meaningless "Unnamed: 0" data column. Read it as the index instead.
df = pd.read_csv(file_path, index_col=0)

# Fail early if the columns used to build the embedding texts are missing
missing_columns = {'Title', 'Body'} - set(df.columns)
assert not missing_columns, f"Colunas ausentes no CSV: {sorted(missing_columns)}"

# Show the first 5 rows
df.head()

Unnamed: 0,PostId,Body,Title
0,25,I'm having issues getting the C sockets API to...,How to use the C socket API in C++ on z/OS
1,264,\nWhat's the optimal level of concurrency that...,BerkeleyDB Concurrency
2,330,I am working on a collection of classes used f...,Should I use nested classes in this case?
3,419,Part of my everyday work is maintaining and ex...,Unload a COM control when working in VB6 IDE
4,601,"I'm looking for a performant, reasonably robus...",Robust Random Number Generation


In [None]:
# Report how many rows were read from the CSV
num_linhas = df.shape[0]
print(f"Número de linhas carregadas: {num_linhas}")

Número de linhas carregadas: 28071




---

Remover depois...

In [None]:
# Normalise the frame so that every cell is a string: missing values are
# replaced by '' first, then all columns are cast to str.
df = df.fillna('').astype(str)

# Build the text embedded for each post: its title, a space, then its body
texts = [f"{title} {body}" for title, body in zip(df['Title'], df['Body'])]



---



# ==========================================
# 2. Inicialização de Ferramentas e Modelos
# ==========================================

In [None]:
# Initialise the LLM with the OpenAI API key.
# The key is read from the environment instead of being written into the
# notebook, so it is never saved or shared together with the file.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    # Fail here with a clear message rather than later, on the first API call
    raise RuntimeError(
        "Defina a variável de ambiente OPENAI_API_KEY antes de inicializar o LLM."
    )

# temperature=0 is kept from the original configuration
llm = OpenAI(temperature=0, openai_api_key=openai_api_key)

In [None]:
# Prompt template; its only placeholder is {context}.
# NOTE(review): the template has no slot for the user's question, so whatever
# is passed as `context` is all the LLM receives — confirm this is intended.
qa_prompt = PromptTemplate.from_template("Responda a pergunta usando as informações fornecidas: {context}")

# Build the LLMChain that binds the prompt to the LLM
llm_chain = LLMChain(llm=llm, prompt=qa_prompt)

# Create the StuffDocumentsChain that combines documents; they are fed to the
# prompt under the "context" variable.
# NOTE(review): combine_docs_chain is not referenced by any other visible cell.
combine_docs_chain = StuffDocumentsChain(
    llm_chain=llm_chain,
    document_variable_name="context"
)

  llm_chain = LLMChain(llm=llm, prompt=qa_prompt)
  combine_docs_chain = StuffDocumentsChain(


In [None]:
# Load the Sentence Transformers embedding model; the same `model` object
# encodes both the corpus (batch_encode) and the queries (retrieve).
# Alternative model left here for reference: 'multi-qa-mpnet-base-dot-v1'
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


# ==========================================
# 3. Funções de Embeddings e Pré-processamento
# ==========================================

In [None]:
# Encode texts into embeddings, one batch at a time
def batch_encode(texts, batch_size=64):
    """Encode ``texts`` with the notebook-level ``model`` in fixed-size batches.

    Args:
        texts: Sequence of strings to embed (anything supporting ``len`` and slicing).
        batch_size: Number of texts passed to ``model.encode`` per call.

    Returns:
        A 2-D numpy array with one row per input text. For empty input an
        array of shape ``(0, dim)`` is returned, where ``dim`` is the width
        reported by the model, so ``result.shape[1]`` stays usable.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # np.vstack cannot stack an empty list, so handle "nothing to encode" explicitly
    if len(texts) == 0:
        dim = model.get_sentence_embedding_dimension() or 0
        return np.empty((0, dim))

    # tqdm reports progress per batch; the model's own progress bar is disabled
    embeddings = [
        model.encode(texts[start:start + batch_size], show_progress_bar=False)
        for start in tqdm(range(0, len(texts), batch_size))
    ]
    return np.vstack(embeddings)

# ==========================================
# 4. Funções de Busca e Geração de Resposta
# ==========================================

In [None]:
# Search the vector store and return the most relevant documents
def retrieve(query, k=15):
    """Return up to ``k`` documents from Qdrant that best match ``query``.

    The query is embedded with the notebook-level ``model`` and searched in
    the collection named by ``collection_name`` through ``client``. Each hit's
    ``payload["text"]`` is wrapped in a ``Document``.
    """
    # model.encode receives a one-element list; take its single embedding
    query_vector = model.encode([query])[0].tolist()

    hits = client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=k,
        search_params={"hnsw_ef": 100},  # HNSW search parameter
    )

    # Wrap the stored text of every hit, preserving the order of the results
    documents = []
    for hit in hits:
        documents.append(Document(page_content=hit.payload["text"]))
    return documents

In [None]:
# Split the context into smaller pieces (for the LLM)
def split_context(context, max_len=500, verbose=True):
    """Split ``context`` into chunks of whole sentences of at most ``max_len`` characters.

    Sentences are delimited by ``'. '``. Consecutive sentences are packed into
    the same chunk while the chunk stays within ``max_len``. A single sentence
    longer than ``max_len`` is kept whole as its own chunk, since sentences are
    never cut.

    Args:
        context: Text to split.
        max_len: Maximum chunk length, in characters.
        verbose: When True (the default, matching the previous behaviour),
            print the first 100 characters of every chunk for inspection.

    Returns:
        List of non-empty, whitespace-stripped chunks; ``[]`` for empty or
        whitespace-only input.
    """
    separator = '. '
    pieces = context.split(separator)
    last_index = len(pieces) - 1

    chunks = []
    current = ''
    for index, piece in enumerate(pieces):
        # Re-attach the separator only where split() actually removed one, so
        # the final sentence does not gain a period it never had.
        sentence = piece + separator if index < last_index else piece
        if not sentence.strip():
            continue

        # Only flush a non-empty chunk: an over-long first sentence must not
        # produce an empty chunk ahead of it. The trailing space is dropped by
        # strip() below, so it is not counted against max_len.
        if current and len(current) + len(sentence.rstrip()) > max_len:
            chunks.append(current.strip())
            current = sentence
        else:
            current += sentence

    if current.strip():
        chunks.append(current.strip())

    # Print the chunks for verification (only a prefix of each one)
    if verbose:
        for i, c in enumerate(chunks):
            print(f"Chunk {i}: {c[:100]}...")

    return chunks

In [None]:
# Main entry point: answer a question from the retrieved posts
def generate_answer(query, k=15):
    """Answer ``query`` using the ``k`` most relevant documents from Qdrant.

    The retrieved documents are concatenated, split into chunks by
    ``split_context``, and each distinct non-empty chunk is sent to
    ``llm_chain`` together with the question. The distinct answers are joined
    with spaces.

    Args:
        query: The user's question.
        k: Number of documents to retrieve.

    Returns:
        The combined answer, or a fixed message when no document or no answer
        is found. Any exception is caught and returned as an error message
        string (best-effort behaviour kept from the original).
    """
    try:
        # Retrieve the most relevant documents from Qdrant
        docs = retrieve(query, k=k)

        if not docs:
            return "Nenhum documento relevante encontrado."

        context = " ".join(doc.page_content for doc in docs)

        # Overlapping posts yield repeated chunks; dict.fromkeys drops them
        # while keeping first-seen order, and blank chunks are skipped so they
        # do not cost an LLM call.
        context_chunks = [
            chunk for chunk in dict.fromkeys(split_context(context)) if chunk.strip()
        ]

        answers = []
        for chunk in context_chunks:
            # The prompt template only has a {context} slot, so the question
            # itself is carried inside the context; otherwise the LLM would
            # never see what is being asked.
            prompt_context = f"Pergunta: {query}\n\nInformações: {chunk}"
            response = llm_chain.run(context=prompt_context).strip()
            # Keep each distinct, non-empty answer once
            if response and response not in answers:
                answers.append(response)

        final_answer = ' '.join(answers).strip()

        return final_answer if final_answer else "Nenhuma resposta clara foi encontrada."

    except Exception as e:
        return f"Ocorreu um erro: {str(e)}"

# ==========================================
# 5. Integração com Qdrant
# ==========================================

In [None]:
# Generate the embeddings for all prepared texts, in batches
embeddings = batch_encode(texts)

In [None]:
# Initialise the Qdrant client using the ":memory:" location
# (change the connection as needed)
client = qdrant_client.QdrantClient(":memory:")

In [None]:
# Name of the Qdrant collection that holds the post embeddings
collection_name = "relevant_posts"


In [None]:
# Start from an empty collection: drop any existing one, then create it again.
# recreate_collection is deprecated in qdrant-client; checking with
# collection_exists and calling create_collection is its replacement.
if client.collection_exists(collection_name=collection_name):
    client.delete_collection(collection_name=collection_name)

# The vector size follows the embedding width; similarity is cosine distance
client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=embeddings.shape[1], distance=Distance.COSINE),
)


  client.recreate_collection(


True

In [None]:
# Add the documents to Qdrant: one point per text, with the row position as
# the id and the original text stored in the payload under "text".
points = [
    PointStruct(
        id=idx,
        vector=embeddings[idx].tolist(),
        payload={"text": texts[idx]},
    )
    for idx in range(len(embeddings))
]
client.upsert(collection_name=collection_name, points=points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

# ==========================================
# 6. Testando o Sistema de Perguntas e Respostas
# ==========================================

In [None]:
# Query test: benefits and challenges of migrating from C++11 to C++14
query = "What are the main benefits and challenges of migrating from C++11 to C++14?"
response = generate_answer(query)

# One print call, one item per line, framed by separator rules
print(
    "------------------------------------------------------------------------",
    f"Pergunta: {query}",
    f"Resposta: {response}",
    "------------------------------------------------------------------------",
    sep="\n",
)

Chunk 0: Should you migrate a project to C++11? I have been trying to get our team to migrate a large C++ pro...
Chunk 1: He also says that this will require changing runtimes on all of our platforms which may change certa...
Chunk 2: Would his concern be just that the STL won't be exactly the same implementation?

I don't see what t...
Chunk 3: So my reasons are somewhat selfish.

My team lead is pushing back because he doesn't see the busines...
Chunk 4: Which would mean we would need to retest on all of the servers that we have deployed to.

The first ...
Chunk 5: Is there something I am missing? Are there any other good arguments for migrating that I should cite...
Chunk 6: What's the benefits of migrating to std? migrating from boost to std We have an "old" C++ project he...
Chunk 7: It's not my intention to start a flame war here, please, if you have strong opinions about not likin...
------------------------------------------------------------------------
Pergunta: What are the 

In [None]:
# Query test: considerations when upgrading a project from C++14 to C++20
query = "What should I consider when upgrading a project from C++14 to C++20?"
response = generate_answer(query)

# One print call, one item per line, framed by separator rules
print(
    "------------------------------------------------------------------------",
    f"Pergunta: {query}",
    f"Resposta: {response}",
    "------------------------------------------------------------------------",
    sep="\n",
)

Chunk 0: ...
Chunk 1: How do I upgrade to C++17? I am using Atom as my IDE, my current __cplusplus = 201402 which is C++14...
Chunk 2: Is that accurate?
 Can a C++14/17 project use binary libraries compiled using C++11 standard or does...
Chunk 3: I want to do it mostly because I want to start using C++11 and the IDE is much nicer. So my reasons ...
Chunk 4: Which would mean we would need to retest on all of the servers that we have deployed to.

The first ...
Chunk 5: Is there something I am missing? Are there any other good arguments for migrating that I should cite...
Chunk 6: So my reasons are somewhat selfish.

My team lead is pushing back because he doesn't see the busines...
Chunk 7: Which would mean we would need to retest on all of the servers that we have deployed to.

The first ...
Chunk 8: Is there something I am missing? Are there any other good arguments for migrating that I should cite...
------------------------------------------------------------------------
Pergunta: 

In [None]:
# Query test: C++14 features that improve readability and maintainability over C++11
query = "What specific new features introduced in C++14 help improve code readability and maintainability compared to C++11?"
response = generate_answer(query)

# One print call, one item per line, framed by separator rules
print(
    "------------------------------------------------------------------------",
    f"Pergunta: {query}",
    f"Resposta: {response}",
    "------------------------------------------------------------------------",
    sep="\n",
)

Chunk 0: <a C++11 (and C++14) introduces additional language constructs and improvements that target generic ...
Chunk 1: These include features such as;

R-value references
Reference collapsing
Perfect forwarding
Move sem...
Chunk 2: Some examples:
constexpr
decltype
thread_local
auto // New usage
noexcept
nullptr
static_assert
alig...
Chunk 3: Is that accurate?
 What changes introduced in C++14 can potentially break a program written in C++11...
Chunk 4: C++1y) Standard in a state close to being final, programmers must ask themselves about backwards com...
Chunk 5: I was having a look at <a href=\"http://gcc.gnu.org/projects/cxx1y.html\ Which C++ idioms are deprec...
Chunk 6: So the question that remains is:
What old ways of coding are definitely inferior to C++11 styles, an...
Chunk 7: Drafts and papers are currently available from the committee website. All sorts of new features are ...
Chunk 8: Until 0x, long template lists have involved Boost Preprocessor usually, and are very un