In [1]:
import pandas as pd
import numpy as np
import faiss, random

from langchain_openai import OpenAIEmbeddings
random_seed = 42

# openai api key
openai_api_key = input("Enter the OpenAI API key: ")
model_embeddings = "text-embedding-3-small"

In [2]:
# load the data

questions_fragments = pd.read_csv('001_questions_per_section/cleaned_test_dataset_with_clusters.csv')
questions_fragments['origin'] = 'documentation_fragment'

questions_documentation = pd.read_csv('002_questions_per_document/largefragments_cleaned_test_dataset_with_clusters.csv')
questions_documentation['origin'] = 'documentation'

questions_forum = pd.read_csv('003_questions_from_forum/forum_test_dataset_with_clusters.csv')
questions_forum['origin'] = 'forum'

In [3]:
n_fragments = 20
n_documents = 80
n_forum = 150

out_frags =questions_fragments[questions_fragments['cluster'] != 'tech documentation'].sample(n_fragments, random_state=random_seed)
out_docs = questions_documentation[questions_documentation['cluster'] != 'tech documentation'].sample(n_documents, random_state=random_seed)
out_forum = questions_forum.sample(n_forum, random_state=random_seed)

out_brut = pd.concat([out_frags, out_docs, out_forum])#[['question', 'answer', 'origin']]
out_brut = out_brut.reset_index(drop=True)
out_brut

Unnamed: 0,fragment,question,answer,fragment_text,cluster,origin
0,576,Who can issue an attestation in a decentralize...,An attestation issuer can be a government agen...,---\ntitle: Glossary\nlang: en-US\n--- \n- **...,dev,documentation_fragment
1,18,How does one progress through the ambassador r...,"In Optimism Governance, the progression throug...",`wannabe-ambassador` ‚Üí **`ambassador-in-traini...,marketing / promotion / ambassadors / events / PR,documentation_fragment
2,9,How can I contribute to Optimism if I speak mo...,You can help translate the documentation by fo...,"At Optimism, we know that we can never truly d...",marketing / promotion / ambassadors / events / PR,documentation_fragment
3,515,What is the ultimate goal of the Foundation an...,The ultimate goal is to achieve sustained dece...,While the Collective‚Äôs Working Constitution is...,governance,documentation_fragment
4,29,How long do I need to offer support before bec...,You need to offer support for a total of five ...,`wannabe-NERD` ‚Üí **`NERD-in-training`** ‚Üí `sup...,dev,documentation_fragment
...,...,...,...,...,...,...
245,134,What are some community concerns regarding the...,Community members have expressed concerns abou...,Board: Retro Funding üî¥\nThread: RetroPGF 3: C...,governance,forum
246,209,What is the process for proposing a Grants Cou...,Prospective Council Leads may propose a Grants...,Board: Elected Representatives üíº\nThread: Gra...,governance,forum
247,240,Why is the proposal suggesting the removal of ...,The proposal aims to eliminate delegate respon...,Board: Technical Proposals üìÉ\nThread: [FINAL]...,governance,forum
248,90,What was jackanorak's reasoning for voting for...,"Jackanorak voted for Tarot, Interest Protocol,...",Board: Delegates üèõ\nThread: Jack anorak - del...,other,forum


In [4]:
questions = out_brut['question'].tolist()

# project into the embedding space
embeddings = OpenAIEmbeddings(model=model_embeddings, openai_api_key=openai_api_key)
questions_emb = embeddings.embed_documents(questions)
questions_emb = np.array(questions_emb)

In [5]:
# given a treeshold (in terms of faiss distance in the emb space), we will remove the questions that are too similar
def rm_too_similar_questions(questions, questions_emb, tresh, criterion_mantain = lambda x: 0):
    # faiss index
    index = faiss.IndexFlatL2(questions_emb.shape[1])
    index.add(questions_emb)

    # get the 100 nearest neighbors for each question
    dist, ind = index.search(questions_emb, 100)
    dist, ind

    indexes_to_remove = []
    # for each question
    for n in range(len(ind)):
        # closest neighbors indexes
        i = ind[n]
        # closest neighbors distances
        d = dist[n]
        # if the question is not removed yet
        if not i[0] in indexes_to_remove:
            # the too close questions
            small = i[d < tresh]

            if len(small) > 1:
                maintained_idx = criterion_mantain(small)
                print(f"mantained: {questions[small[maintained_idx]]}")
                small = list(small)
                small.pop(maintained_idx) 
                for s in small:
                    print(f"removed: {questions[s]}")
                print("----")
                indexes_to_remove.extend((small))
        
    return indexes_to_remove

indexes_to_remove = rm_too_similar_questions(questions, questions_emb, tresh = 0.4, criterion_mantain = lambda x: -1)

out_clean = out_brut.drop(indexes_to_remove, axis='index')
out_clean

mantained: How many projects were awarded funding in the first round of Retro Funding?
removed: What types of projects have been rewarded in past rounds of Retro Funding?
----
mantained: How does the NumbaNERDs program help with grant spending in Optimism governance?
removed: How does the NumbaNERDs program contribute to the Optimism governance?
----
mantained: How can I get funding for a project I want to build on Optimism?
removed: How can I get support for my project on Optimism?
removed: How can I get developer support for my project on Optimism?
----
mantained: How does the NumbaNERDs program contribute to the Optimism governance?
removed: How does the NumbaNERDs program help with grant spending in Optimism governance?
----
mantained: What role does the Optimism Foundation play in the governance process?
removed: How does the Optimism Foundation ensure it remains accountable to the community?
----
mantained: What types of projects have been rewarded in past rounds of Retro Funding

Unnamed: 0,fragment,question,answer,fragment_text,cluster,origin
0,576,Who can issue an attestation in a decentralize...,An attestation issuer can be a government agen...,---\ntitle: Glossary\nlang: en-US\n--- \n- **...,dev,documentation_fragment
1,18,How does one progress through the ambassador r...,"In Optimism Governance, the progression throug...",`wannabe-ambassador` ‚Üí **`ambassador-in-traini...,marketing / promotion / ambassadors / events / PR,documentation_fragment
2,9,How can I contribute to Optimism if I speak mo...,You can help translate the documentation by fo...,"At Optimism, we know that we can never truly d...",marketing / promotion / ambassadors / events / PR,documentation_fragment
3,515,What is the ultimate goal of the Foundation an...,The ultimate goal is to achieve sustained dece...,While the Collective‚Äôs Working Constitution is...,governance,documentation_fragment
4,29,How long do I need to offer support before bec...,You need to offer support for a total of five ...,`wannabe-NERD` ‚Üí **`NERD-in-training`** ‚Üí `sup...,dev,documentation_fragment
...,...,...,...,...,...,...
245,134,What are some community concerns regarding the...,Community members have expressed concerns abou...,Board: Retro Funding üî¥\nThread: RetroPGF 3: C...,governance,forum
246,209,What is the process for proposing a Grants Cou...,Prospective Council Leads may propose a Grants...,Board: Elected Representatives üíº\nThread: Gra...,governance,forum
247,240,Why is the proposal suggesting the removal of ...,The proposal aims to eliminate delegate respon...,Board: Technical Proposals üìÉ\nThread: [FINAL]...,governance,forum
248,90,What was jackanorak's reasoning for voting for...,"Jackanorak voted for Tarot, Interest Protocol,...",Board: Delegates üèõ\nThread: Jack anorak - del...,other,forum


In [6]:
out_clean[['question', 'answer', 'origin']].sort_values('question').to_csv('questions_test_dataset.csv', index=False)

In [7]:
out_clean[['question', 'answer', 'origin', 'fragment_text']].sort_values('question').to_csv('questions_test_dataset2.csv', index=False)