# Cleaning the repeated and repetitive questions from the dataset

In [14]:
import pandas as pd
import numpy as np
import faiss, random

from langchain_openai import OpenAIEmbeddings

test_brut = pd.read_csv("first_test_dataset_with_fragments.csv")

# openai api key
openai_api_key = input("Enter the OpenAI API key: ")
model_embeddings = "text-embedding-3-small"

In [15]:
# for the lines that have repeated questions, we will let only the first one
test_clean_dupl = test_brut.drop_duplicates(subset=["question"])
test_clean_dupl

Unnamed: 0,fragment,question,answer,fragment_text
0,0,How can I get developer support for my project...,To get developer support for your project on O...,---\ntitle: How do I get project support (mark...
1,0,How can I promote my project once it's deploye...,Once your project is deployed on the Superchai...,---\ntitle: How do I get project support (mark...
2,0,What should I do if I want to apply for a Gove...,"If you want to apply for a Governance Grant, f...",---\ntitle: How do I get project support (mark...
3,1,How can I get involved in the Optimism Collect...,You can get involved in the Optimism Collectiv...,---\ntitle: Contribute to Optimism\nlang: en-U...
4,1,What is RetroPGF and why is it important for O...,RetroPGF is a core part of Optimism’s vision a...,---\ntitle: Contribute to Optimism\nlang: en-U...
...,...,...,...,...
291,102,What is the difference between push oracles an...,Push oracles are updated continuously and alwa...,---\ntitle: Oracles\nlang: en-US\n--- \n[Orac...
292,102,What is the role of the Gas Price Oracle on OP...,The Gas Price Oracle on OP Mainnet provides in...,---\ntitle: Oracles\nlang: en-US\n--- \n[Orac...
293,103,Which API providers offer free access to Optim...,Several API providers offer free access to Opt...,---\ntitle: Node & API Providers\nlang: en-US\...
294,103,Are there any API providers that support both ...,"Yes, several API providers support both OP Mai...",---\ntitle: Node & API Providers\nlang: en-US\...


In [16]:
# questions df
questions = test_clean_dupl["question"].tolist()

In [17]:
# project into the embedding space
embeddings = OpenAIEmbeddings(model=model_embeddings, openai_api_key=openai_api_key)
questions_emb = embeddings.embed_documents(questions)
questions_emb = np.array(questions_emb)

In [18]:
# given a treeshold (in terms of faiss distance in the emb space), we will remove the questions that are too similar
def rm_too_similar_questions(questions, questions_emb, tresh):
    # faiss index
    index = faiss.IndexFlatL2(questions_emb.shape[1])
    index.add(questions_emb)

    # get the 100 nearest neighbors for each question
    dist, ind = index.search(questions_emb, 100)
    dist, ind

    indexes_to_remove = []
    # for each question
    for n in range(len(ind)):
        # closest neighbors indexes
        i = ind[n]
        # closest neighbors distances
        d = dist[n]
        # if the question is not removed yet
        if not i[0] in indexes_to_remove:
            # the too close questions
            small = i[d < tresh]

            if len(small) > 1:
                for s in small:
                    print(questions[s])
                print("----")
                indexes_to_remove.extend(small[1:])
        
    # questions without indexes_to_remove
    questions_clean = [questions[i] for i in range(len(questions)) if i not in indexes_to_remove]
    # questions embeddings without indexes_to_remove
    questions_emb_clean = np.array([questions_emb[i] for i in range(len(questions_emb)) if i not in indexes_to_remove])

    return questions_clean, questions_emb_clean

# remove the questions that are too similar
cleaned_questions, cleaned_questions_emb = rm_too_similar_questions(questions, questions_emb, tresh = 0.4)

# select questions in the cleaned_questions list
test_clean = test_clean_dupl[test_clean_dupl["question"].isin(cleaned_questions)]
test_clean

What should I do if I want to apply for a Governance Grant?
How can I apply for grant funding from the Governance Fund?
----
How can I get involved in the Optimism Collective?
What is the Optimism Collective?
How can I stay updated on current events and opportunities within the Optimism Collective?
Who are the members of the Optimism Collective?
What is the main goal of the Optimism Collective?
----
How can I start the process of becoming an Optimism Ambassador?
What are the main responsibilities of an Optimism Ambassador?
----
How does the Optimism Collective plan to handle identity and reputation?
What approach should developers take when building within the identity and reputation space in the Optimism Collective?
----
How can I get involved in running local events for the Optimism community?
What should I do if I want to run an event for the Optimism community now?
----
Where can I find more information about the NumbaNERDs program?
How can I participate in the NumbaNERDs program?


Unnamed: 0,fragment,question,answer,fragment_text
0,0,How can I get developer support for my project...,To get developer support for your project on O...,---\ntitle: How do I get project support (mark...
1,0,How can I promote my project once it's deploye...,Once your project is deployed on the Superchai...,---\ntitle: How do I get project support (mark...
2,0,What should I do if I want to apply for a Gove...,"If you want to apply for a Governance Grant, f...",---\ntitle: How do I get project support (mark...
3,1,How can I get involved in the Optimism Collect...,You can get involved in the Optimism Collectiv...,---\ntitle: Contribute to Optimism\nlang: en-U...
4,1,What is RetroPGF and why is it important for O...,RetroPGF is a core part of Optimism’s vision a...,---\ntitle: Contribute to Optimism\nlang: en-U...
...,...,...,...,...
291,102,What is the difference between push oracles an...,Push oracles are updated continuously and alwa...,---\ntitle: Oracles\nlang: en-US\n--- \n[Orac...
292,102,What is the role of the Gas Price Oracle on OP...,The Gas Price Oracle on OP Mainnet provides in...,---\ntitle: Oracles\nlang: en-US\n--- \n[Orac...
293,103,Which API providers offer free access to Optim...,Several API providers offer free access to Opt...,---\ntitle: Node & API Providers\nlang: en-US\...
294,103,Are there any API providers that support both ...,"Yes, several API providers support both OP Mai...",---\ntitle: Node & API Providers\nlang: en-US\...


# Clustering questions using OpenAI Chat

In [19]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
model_chat = "gpt-3.5-turbo-0125"

# select the model
llm = ChatOpenAI(
    model = model_chat,
    temperature = 0,
    max_tokens = None,
    timeout = None,
    max_retries = 2,
    api_key = openai_api_key
)

# create the template for interactions
def answer_template():
    return f""" I'll give you a question about Optimism Documentation. You don't need to answer it, just categorize it (just return the number, one character) into one of the following categories:

1. project support;
2. governance;
3. dev;
4. tech documentation;
5. general documentation;
6. marketing / promotion / ambassadors / events / PR;
7. other.

Question: {{question}}
"""

prompt = ChatPromptTemplate.from_template(answer_template())

chain = prompt | llm

In [20]:
clusters = {}
for q in cleaned_questions:
    cat = chain.invoke({"question": q})
    clusters[q] = cat.content
    print(f"Question: {q}")
    print(f"Category: {cat.content}")

# add a column with the cluster to df
test_clean['cluster'] = [clusters[q] for q in test_clean['question']]
test_clean

Question: How can I get developer support for my project on Optimism?
Category: 1
Question: How can I promote my project once it's deployed on the Superchain?
Category: 6
Question: What should I do if I want to apply for a Governance Grant?
Category: 2
Question: How can I get involved in the Optimism Collective?
Category: 6
Question: What is RetroPGF and why is it important for Optimism?
Category: 4
Question: How can I start the process of becoming an Optimism Ambassador?
Category: 6
Question: What actions can lead to losing your Ambassador status?
Category: 6
Question: How can I become a support NERD in the Optimism community?
Category: 3
Question: What are the main responsibilities of a support NERD?
Category: 1
Question: What actions can lead to losing my NERD status?
Category: 3
Question: How does the Optimism Collective plan to handle identity and reputation?
Category: 2
Question: What are attestations and when are they useful?
Category: 4
Question: How can I get involved in runni

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_clean['cluster'] = [clusters[q] for q in test_clean['question']]


Unnamed: 0,fragment,question,answer,fragment_text,cluster
0,0,How can I get developer support for my project...,To get developer support for your project on O...,---\ntitle: How do I get project support (mark...,1
1,0,How can I promote my project once it's deploye...,Once your project is deployed on the Superchai...,---\ntitle: How do I get project support (mark...,6
2,0,What should I do if I want to apply for a Gove...,"If you want to apply for a Governance Grant, f...",---\ntitle: How do I get project support (mark...,2
3,1,How can I get involved in the Optimism Collect...,You can get involved in the Optimism Collectiv...,---\ntitle: Contribute to Optimism\nlang: en-U...,6
4,1,What is RetroPGF and why is it important for O...,RetroPGF is a core part of Optimism’s vision a...,---\ntitle: Contribute to Optimism\nlang: en-U...,4
...,...,...,...,...,...
291,102,What is the difference between push oracles an...,Push oracles are updated continuously and alwa...,---\ntitle: Oracles\nlang: en-US\n--- \n[Orac...,4
292,102,What is the role of the Gas Price Oracle on OP...,The Gas Price Oracle on OP Mainnet provides in...,---\ntitle: Oracles\nlang: en-US\n--- \n[Orac...,4
293,103,Which API providers offer free access to Optim...,Several API providers offer free access to Opt...,---\ntitle: Node & API Providers\nlang: en-US\...,4
294,103,Are there any API providers that support both ...,"Yes, several API providers support both OP Mai...",---\ntitle: Node & API Providers\nlang: en-US\...,4


In [21]:
test_clean['cluster'].value_counts()

cluster
4    156
2     42
3     21
6     14
1      5
5      5
7      1
Name: count, dtype: int64

In [25]:
"""
1. project support;
2. governance;
3. dev;
4. tech documentation;
5. general documentation;
6. marketing / promotion / ambassadors / events / PR;
7. other.
"""
labels_dict = {
    "1": "project support",
    "2": "governance",
    "3": "dev",
    "4": "tech documentation",
    "5": "general documentation",
    "6": "marketing / promotion / ambassadors / events / PR",
    "7": "other"
}

test_clean['cluster'] = test_clean['cluster'].map(labels_dict)
test_clean

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_clean['cluster'] = test_clean['cluster'].map(labels_dict)


Unnamed: 0,fragment,question,answer,fragment_text,cluster
0,0,How can I get developer support for my project...,To get developer support for your project on O...,---\ntitle: How do I get project support (mark...,project support
1,0,How can I promote my project once it's deploye...,Once your project is deployed on the Superchai...,---\ntitle: How do I get project support (mark...,marketing / promotion / ambassadors / events / PR
2,0,What should I do if I want to apply for a Gove...,"If you want to apply for a Governance Grant, f...",---\ntitle: How do I get project support (mark...,governance
3,1,How can I get involved in the Optimism Collect...,You can get involved in the Optimism Collectiv...,---\ntitle: Contribute to Optimism\nlang: en-U...,marketing / promotion / ambassadors / events / PR
4,1,What is RetroPGF and why is it important for O...,RetroPGF is a core part of Optimism’s vision a...,---\ntitle: Contribute to Optimism\nlang: en-U...,tech documentation
...,...,...,...,...,...
291,102,What is the difference between push oracles an...,Push oracles are updated continuously and alwa...,---\ntitle: Oracles\nlang: en-US\n--- \n[Orac...,tech documentation
292,102,What is the role of the Gas Price Oracle on OP...,The Gas Price Oracle on OP Mainnet provides in...,---\ntitle: Oracles\nlang: en-US\n--- \n[Orac...,tech documentation
293,103,Which API providers offer free access to Optim...,Several API providers offer free access to Opt...,---\ntitle: Node & API Providers\nlang: en-US\...,tech documentation
294,103,Are there any API providers that support both ...,"Yes, several API providers support both OP Mai...",---\ntitle: Node & API Providers\nlang: en-US\...,tech documentation


# Remove questions that are too similar to the ones we had

In [40]:
# load csv
old_questions = pd.read_csv("../001_questions_per_section/cleaned_test_dataset_with_clusters.csv").question.to_list()

new_questions = test_clean.question.to_list()

# all
all_questions = old_questions + new_questions

In [41]:
# embeddings
all_questions_emb = embeddings.embed_documents(all_questions)
all_questions_emb = np.array(all_questions_emb)

# remove the new questions that are too similar to the old ones
cleaned_all_questions, cleaned_all_questions_emb = rm_too_similar_questions(all_questions, all_questions_emb, tresh = 0.2)

cleaned_all_questions

How can I get support for my project on Optimism?
How can I get developer support for my project on Optimism?
----
How can I get involved in the Optimism Collective?
How can I get involved in the Optimism Collective?
----
What is the Optimism Collective's approach to building public goods?
How does the Optimism Collective plan to fund public goods?
----
What is RetroPGF and why is it important to Optimism?
What is RetroPGF and why is it important for Optimism?
----
How can I contribute directly to the Optimism codebase?
How can I start contributing to the Optimism codebase?
----
How can I become a support NERD for Optimism?
How can I become a support NERD in the Optimism community?
----
Why are schemas important in Optimism Governance?
What's the purpose of schemas in Optimism Governance?
----
Where can I find more information about the NumbaNERDs program?
Where can I find more information about the NumbaNERDs program?
----
How can I participate in the TechNERDs program?
How can I get 

['How can I get support for my project on Optimism?',
 'What should I do before deploying my project on OP Mainnet?',
 'Where can I find tutorials for developing on Optimism?',
 'Is there a Discord channel for developer support for Optimism?',
 'How can I get my project listed on the Superchain apps page?',
 "Is there a way to promote my app launch through Superchain's marketing channels?",
 'How does Optimism support projects that align with its values?',
 'How does Optimism decide which tweets to retweet?',
 'Can I submit my tweet to Optimism for retweeting?',
 'What other interactions does Optimism have with tweets besides retweeting?',
 'How can I participate in OP Radio?',
 'What is the format of OP Radio?',
 'How can I apply for a grant from Optimism?',
 'What should I do to deploy my project on Optimism?',
 'How can I learn about the origins of Optimism?',
 'What are some upcoming changes to the Optimism protocol?',
 'Is there a way for developers to deploy their own app-specifi

In [42]:
len(all_questions)

1514

In [43]:
len(cleaned_all_questions)

1422

In [44]:
# questions that are in cleaned_all_questions and in new_questions
cleaned_new_questions = [q for q in new_questions if q in cleaned_all_questions]
len(cleaned_new_questions)

185

In [47]:
final_test_clean = test_clean[test_clean['question'].isin(cleaned_new_questions)]
final_test_clean

Unnamed: 0,fragment,question,answer,fragment_text,cluster
1,0,How can I promote my project once it's deploye...,Once your project is deployed on the Superchai...,---\ntitle: How do I get project support (mark...,marketing / promotion / ambassadors / events / PR
2,0,What should I do if I want to apply for a Gove...,"If you want to apply for a Governance Grant, f...",---\ntitle: How do I get project support (mark...,governance
3,1,How can I get involved in the Optimism Collect...,You can get involved in the Optimism Collectiv...,---\ntitle: Contribute to Optimism\nlang: en-U...,marketing / promotion / ambassadors / events / PR
6,2,How can I start the process of becoming an Opt...,To start the process of becoming an Optimism A...,---\ntitle: Ambassador Requirements\nlang: en-...,marketing / promotion / ambassadors / events / PR
8,2,What actions can lead to losing your Ambassado...,You can lose your Ambassador status due to ina...,---\ntitle: Ambassador Requirements\nlang: en-...,marketing / promotion / ambassadors / events / PR
...,...,...,...,...,...
290,102,How do oracles help blockchain applications ac...,"Oracles provide offchain data onchain, allowin...",---\ntitle: Oracles\nlang: en-US\n--- \n[Orac...,tech documentation
292,102,What is the role of the Gas Price Oracle on OP...,The Gas Price Oracle on OP Mainnet provides in...,---\ntitle: Oracles\nlang: en-US\n--- \n[Orac...,tech documentation
293,103,Which API providers offer free access to Optim...,Several API providers offer free access to Opt...,---\ntitle: Node & API Providers\nlang: en-US\...,tech documentation
294,103,Are there any API providers that support both ...,"Yes, several API providers support both OP Mai...",---\ntitle: Node & API Providers\nlang: en-US\...,tech documentation


In [48]:
# save csv
final_test_clean.to_csv("largefragments_cleaned_test_dataset_with_clusters.csv", index=False)