# Cleaning the repeated and repetitive questions from the dataset

In [2]:
import pandas as pd
import numpy as np
import faiss, random

from langchain_openai import OpenAIEmbeddings

test_brut = pd.read_csv("first_test_dataset_with_fragments.csv")

# openai api key
openai_api_key = input("Enter the OpenAI API key: ")
model_embeddings = "text-embedding-3-small"

In [3]:
# for the lines that have repeated questions, we will let only the first one
test_clean_dupl = test_brut.drop_duplicates(subset=["question"])
test_clean_dupl

Unnamed: 0,fragment,question,answer,fragment_text
0,0,How can I get support for my project on Optimism?,"To get support for your project on Optimism, y...",---\ntitle: How do I get project support (mark...
1,0,Where can I find the Optimism community to dis...,You can find the Optimism community on their D...,---\ntitle: How do I get project support (mark...
2,0,What should I do before deploying my project o...,"Before deploying your project on OP Mainnet, y...",---\ntitle: How do I get project support (mark...
3,1,Where can I find tutorials for developing on O...,You can find tutorials for developing on Optim...,Steps to take if you would like developer supp...
4,1,How can I get immediate developer support for ...,"For immediate developer support, you can first...",Steps to take if you would like developer supp...
...,...,...,...,...
1867,758,Is there a free tier available for using OP Ma...,"Yes, there is a generous free tier available t...",[Pocket](https://www.portal.pokt.network/) off...
1869,760,Can I access OP Mainnet nodes for free?,"Yes, QuickNode offers access to hosted OP Main...",[QuickNode](https://www.quicknode.com/) offers...
1870,760,Are there any premium options available for OP...,"Yes, QuickNode provides an option to upgrade t...",[QuickNode](https://www.quicknode.com/) offers...
1871,760,Do I need to manage the infrastructure if I us...,"No, QuickNode manages the complex infrastructu...",[QuickNode](https://www.quicknode.com/) offers...


In [4]:
# questions df
questions = test_clean_dupl["question"].tolist()

In [5]:
# project into the embedding space
embeddings = OpenAIEmbeddings(model=model_embeddings, openai_api_key=openai_api_key)
questions_emb = embeddings.embed_documents(questions)
questions_emb = np.array(questions_emb)

In [12]:
# given a treeshold (in terms of faiss distance in the emb space), we will remove the questions that are too similar
def rm_too_similar_questions(questions, questions_emb, tresh):
    # faiss index
    index = faiss.IndexFlatL2(questions_emb.shape[1])
    index.add(questions_emb)

    # get the 100 nearest neighbors for each question
    dist, ind = index.search(questions_emb, 100)
    dist, ind

    indexes_to_remove = []
    # for each question
    for n in range(len(ind)):
        # closest neighbors indexes
        i = ind[n]
        # closest neighbors distances
        d = dist[n]
        # if the question is not removed yet
        if not i[0] in indexes_to_remove:
            # the too close questions
            small = i[d < tresh]

            if len(small) > 1:
                for s in small:
                    print(questions[s])
                print("----")
                indexes_to_remove.extend(small[1:])
        
    # questions without indexes_to_remove
    questions_clean = [questions[i] for i in range(len(questions)) if i not in indexes_to_remove]
    # questions embeddings without indexes_to_remove
    questions_emb_clean = np.array([questions_emb[i] for i in range(len(questions_emb)) if i not in indexes_to_remove])

    return questions_clean, questions_emb_clean

# remove the questions that are too similar
cleaned_questions, cleaned_questions_emb = rm_too_similar_questions(questions, questions_emb, tresh = 0.4)

# select questions in the cleaned_questions list
test_clean = test_clean_dupl[test_clean_dupl["question"].isin(cleaned_questions)]
test_clean

How can I get support for my project on Optimism?
Where can I find the Optimism community to discuss my project?
How can I get support for creating Optimism content?
How can I get in touch with the Optimism team for support or to contribute?
How can I gain traction for my project within the Optimism community?
How can I get immediate developer support for Optimism?
How can I contribute to the Optimism project?
----
What should I do before deploying my project on OP Mainnet?
How can I get started with deploying a contract on OP Mainnet?
How can I start developing on OP Mainnet?
----
Is there a Discord channel for developer support for Optimism?
How can I join the Optimism community on Discord?
----
How can I get my project listed on the Superchain apps page?
How can my app be featured on the Superchain apps page?
----
How can I apply for a grant from Optimism?
How can I get a grant from Optimism?
----
How can I get involved in the Optimism Collective?
How can I contribute technically to

Unnamed: 0,fragment,question,answer,fragment_text
0,0,How can I get support for my project on Optimism?,"To get support for your project on Optimism, y...",---\ntitle: How do I get project support (mark...
2,0,What should I do before deploying my project o...,"Before deploying your project on OP Mainnet, y...",---\ntitle: How do I get project support (mark...
3,1,Where can I find tutorials for developing on O...,You can find tutorials for developing on Optim...,Steps to take if you would like developer supp...
5,1,Is there a Discord channel for developer suppo...,"Yes, there is a Discord channel for developer ...",Steps to take if you would like developer supp...
6,2,How can I get my project listed on the Superch...,To get your project listed on the Superchain a...,When your project is deployed on the Superchai...
...,...,...,...,...
1866,758,How can developers access OP Mainnet nodes thr...,Developers can access OP Mainnet nodes through...,[Pocket](https://www.portal.pokt.network/) off...
1867,758,Is there a free tier available for using OP Ma...,"Yes, there is a generous free tier available t...",[Pocket](https://www.portal.pokt.network/) off...
1870,760,Are there any premium options available for OP...,"Yes, QuickNode provides an option to upgrade t...",[QuickNode](https://www.quicknode.com/) offers...
1871,760,Do I need to manage the infrastructure if I us...,"No, QuickNode manages the complex infrastructu...",[QuickNode](https://www.quicknode.com/) offers...


# Clustering questions using hdbscan

In [13]:
import hdbscan

# cluster the questions
clusterer = hdbscan.HDBSCAN(min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0.5)
clusters = clusterer.fit(cleaned_questions_emb)

cs_q = clusters.labels_

# count the number of elements in each cluster
for c in np.unique(cs_q):
    print(f"cluster {c} has {len(cs_q[cs_q == c])} elements")

cluster -1 has 457 elements
cluster 0 has 4 elements
cluster 1 has 2 elements
cluster 2 has 2 elements
cluster 3 has 2 elements
cluster 4 has 2 elements
cluster 5 has 2 elements
cluster 6 has 2 elements
cluster 7 has 4 elements
cluster 8 has 4 elements
cluster 9 has 3 elements
cluster 10 has 2 elements
cluster 11 has 3 elements
cluster 12 has 2 elements
cluster 13 has 3 elements
cluster 14 has 3 elements
cluster 15 has 2 elements
cluster 16 has 2 elements
cluster 17 has 2 elements
cluster 18 has 2 elements
cluster 19 has 2 elements
cluster 20 has 2 elements
cluster 21 has 2 elements
cluster 22 has 2 elements
cluster 23 has 2 elements
cluster 24 has 4 elements
cluster 25 has 3 elements
cluster 26 has 5 elements
cluster 27 has 5 elements
cluster 28 has 2 elements
cluster 29 has 2 elements
cluster 30 has 2 elements
cluster 31 has 2 elements
cluster 32 has 3 elements
cluster 33 has 2 elements
cluster 34 has 2 elements
cluster 35 has 2 elements
cluster 36 has 3 elements
cluster 37 has 2 ele

# Clustering fragments using hdbscan

In [14]:
# get fragments
frags_uniques = test_clean['fragment_text'].unique()

# embeddings of the fragments
fragments_emb = embeddings.embed_documents(frags_uniques)
fragments_emb = np.array(fragments_emb)

In [15]:
# cluster the fragments
clusterer = hdbscan.HDBSCAN(min_cluster_size=10, min_samples=1, cluster_selection_epsilon=10)
clusters_fragments = clusterer.fit(fragments_emb)

cs_fr = clusters_fragments.labels_

# count the number of elements in each cluster
for c in np.unique(cs_fr):
    print(f"cluster {c} has {len(cs_fr[cs_fr == c])} elements")

cluster -1 has 201 elements
cluster 0 has 381 elements
cluster 1 has 14 elements


# Clustering questions using OpenAI Chat

In [27]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
model_chat = "gpt-3.5-turbo-0125"

# select the model
llm = ChatOpenAI(
    model = model_chat,
    temperature = 0,
    max_tokens = None,
    timeout = None,
    max_retries = 2,
    api_key = openai_api_key
)

# create the template for interactions
def answer_template():
    return f""" I'll give you a question about Optimism Documentation. You don't need to answer it, just categorize it (just return the number, one character) into one of the following categories:

1. general information / community engagement;
2. governance;
3. dev / project tech support / tech documentation;
4. marketing / promotion / ambassadors / events / PR;
5. other.

Question: {{question}}
"""

prompt = ChatPromptTemplate.from_template(answer_template())

chain = prompt | llm

In [28]:
clusters = {}
for q in cleaned_questions:
    cat = chain.invoke({"question": q})
    clusters[q] = cat.content
    print(f"Question: {q}")
    print(f"Category: {cat.content}")

clusters

Question: How can I get support for my project on Optimism?
Category: 3
Question: What should I do before deploying my project on OP Mainnet?
Category: 3
Question: Where can I find tutorials for developing on Optimism?
Category: 3
Question: Is there a Discord channel for developer support for Optimism?
Category: 3
Question: How can I get my project listed on the Superchain apps page?
Category: 3
Question: Is there a way to promote my app launch through Superchain's marketing channels?
Category: 4
Question: How does Optimism support projects that align with its values?
Category: 2
Question: How does Optimism decide which tweets to retweet?
Category: 4
Question: Can I submit my tweet to Optimism for retweeting?
Category: 4
Question: What other interactions does Optimism have with tweets besides retweeting?
Category: 3
Question: How can I participate in OP Radio?
Category: 4
Question: What is the format of OP Radio?
Category: 3
Question: How can I apply for a grant from Optimism?
Category

{'How can I get support for my project on Optimism?': '3',
 'What should I do before deploying my project on OP Mainnet?': '3',
 'Where can I find tutorials for developing on Optimism?': '3',
 'Is there a Discord channel for developer support for Optimism?': '3',
 'How can I get my project listed on the Superchain apps page?': '3',
 "Is there a way to promote my app launch through Superchain's marketing channels?": '4',
 'How does Optimism support projects that align with its values?': '2',
 'How does Optimism decide which tweets to retweet?': '4',
 'Can I submit my tweet to Optimism for retweeting?': '4',
 'What other interactions does Optimism have with tweets besides retweeting?': '3',
 'How can I participate in OP Radio?': '4',
 'What is the format of OP Radio?': '3',
 'How can I apply for a grant from Optimism?': '2',
 'What should I do to deploy my project on Optimism?': '3',
 'How can I learn about the origins of Optimism?': '1',
 'What are some upcoming changes to the Optimism 

In [29]:
# add a column with the cluster to df
test_clean['cluster'] = [clusters[q] for q in test_clean['question']]
test_clean

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_clean['cluster'] = [clusters[q] for q in test_clean['question']]


Unnamed: 0,fragment,question,answer,fragment_text,cluster
0,0,How can I get support for my project on Optimism?,"To get support for your project on Optimism, y...",---\ntitle: How do I get project support (mark...,3
2,0,What should I do before deploying my project o...,"Before deploying your project on OP Mainnet, y...",---\ntitle: How do I get project support (mark...,3
3,1,Where can I find tutorials for developing on O...,You can find tutorials for developing on Optim...,Steps to take if you would like developer supp...,3
5,1,Is there a Discord channel for developer suppo...,"Yes, there is a Discord channel for developer ...",Steps to take if you would like developer supp...,3
6,2,How can I get my project listed on the Superch...,To get your project listed on the Superchain a...,When your project is deployed on the Superchai...,3
...,...,...,...,...,...
1866,758,How can developers access OP Mainnet nodes thr...,Developers can access OP Mainnet nodes through...,[Pocket](https://www.portal.pokt.network/) off...,3
1867,758,Is there a free tier available for using OP Ma...,"Yes, there is a generous free tier available t...",[Pocket](https://www.portal.pokt.network/) off...,3
1870,760,Are there any premium options available for OP...,"Yes, QuickNode provides an option to upgrade t...",[QuickNode](https://www.quicknode.com/) offers...,3
1871,760,Do I need to manage the infrastructure if I us...,"No, QuickNode manages the complex infrastructu...",[QuickNode](https://www.quicknode.com/) offers...,3


In [30]:
test_clean['cluster'].value_counts()

cluster
3    1038
2     172
4      34
5      18
1       8
Name: count, dtype: int64

In [36]:
# select some random questions from cluster 3
cluster_3 = test_clean[test_clean['cluster'] == '3']
random_10 = cluster_3.sample(20).question.tolist()
random_10

['Where can I find the addresses of the messenger contracts on Optimism?',
 'Are demo spots limited?',
 'Can the SystemDictator revert an upgrade?',
 'Who should I contact if I have questions about the schema or need to share additional data?',
 'How do I install Go for the Optimism Governance tutorial?',
 'Why might some dapps need OP Mainnet-specific features?',
 'Why was Bedrock built to be modular and upgradeable?',
 'Where can I find a list of OP Mainnet Block Explorers?',
 'Do I need to have made my contribution between specific rounds to be considered for RetroPGF 3?',
 'How do I set the gas limit for a transaction on OP Mainnet?',
 'How can I bring web2 user behavior and identity data onchain?',
 'What kind of pricing model does BlockPI Network offer?',
 'Where can I find information on integrating OP Mainnet with my wallet?',
 'How can I ensure that the data I generate is verifiable by others?',
 'How many addresses qualify for a 10% bonus?',
 'How is the state stored and modi

## Second Implementation

In [42]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
model_chat = "gpt-3.5-turbo-0125"

# select the model
llm = ChatOpenAI(
    model = model_chat,
    temperature = 0,
    max_tokens = None,
    timeout = None,
    max_retries = 2,
    api_key = openai_api_key
)

# create the template for interactions
def answer_template():
    return f""" I'll give you a question about Optimism Documentation. You don't need to answer it, just categorize it (just return the number, one character) into one of the following categories:

1. project support;
2. governance;
3. dev;
4. tech documentation;
5. general documentation;
6. marketing / promotion / ambassadors / events / PR;
7. other.

Question: {{question}}
"""

prompt = ChatPromptTemplate.from_template(answer_template())

chain = prompt | llm

In [45]:
clusters = {}
for q in cleaned_questions:
    cat = chain.invoke({"question": q})
    clusters[q] = cat.content
    print(f"Question: {q}")
    print(f"Category: {cat.content}")

# add a column with the cluster to df
test_clean['cluster'] = [clusters[q] for q in test_clean['question']]
test_clean

Question: How can I get support for my project on Optimism?
Category: 1
Question: What should I do before deploying my project on OP Mainnet?
Category: 3
Question: Where can I find tutorials for developing on Optimism?
Category: 4
Question: Is there a Discord channel for developer support for Optimism?
Category: 3
Question: How can I get my project listed on the Superchain apps page?
Category: 6
Question: Is there a way to promote my app launch through Superchain's marketing channels?
Category: 6
Question: How does Optimism support projects that align with its values?
Category: 1


KeyboardInterrupt: 

In [44]:
test_clean['cluster'].value_counts()

cluster
4                        899
2                        153
3                         94
6                         67
5                         29
1                         15
7                         12
4. tech documentation      1
Name: count, dtype: int64