In [10]:
import pandas as pd
import json

# Load CSV file (questions and URLs)
qa_data = pd.read_csv('clearfeed_qa_pairs.csv')

# Load JSON file (documentation content)
with open('Clearfeed_kb.json', 'r') as file:
    docs_data = json.load(file)

# Convert JSON into a DataFrame for easier processing
docs_df = pd.DataFrame([{'url': url, 'title': content['title'], 'text': content['text']}
                        for url, content in docs_data.items()])


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# Clean text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove excessive spaces
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.lower()

# Apply preprocessing to documentation
docs_df['clean_text'] = docs_df['text'].apply(clean_text)


In [12]:
# Vectorize documentation text
vectorizer = TfidfVectorizer()
doc_vectors = vectorizer.fit_transform(docs_df['clean_text'])

# Function to get top 5 similar documents for a query

def search_top_k(query, k=5):
    query_vector = vectorizer.transform([clean_text(query)])  # Transform query into vector
    similarities = cosine_similarity(query_vector, doc_vectors).flatten()  # Compute cosine similarities
    top_k_indices = similarities.argsort()[-k:][::-1]  # Get indices of top K matches

    # Retrieve top K URLs and details
    top_k_results = docs_df.iloc[top_k_indices][['url', 'title', 'text']]
    return top_k_results



In [17]:
# Example usage
query = "How can I integrate Slack with Clearfeed?"
top_results = search_top_k(query)

# Display the top 5 URLs with their titles
print("Top 5 Results:")
for i, row in top_results.iterrows():
    print(f"{i + 1}. {row['title']} - {row['url']}")


Top 5 Results:
87. For existing ClearFeed<>Slack users - https://docs.clearfeed.ai/clearfeed-help-center/getting-started/using-clearfeed-with-microsoft-teams/integrate-clearfeed-with-ms-teams/for-existing-clearfeed-less-than-greater-than-slack-users
10. Integration With Slack - https://docs.clearfeed.ai/clearfeed-help-center/getting-started/integration-with-slack
4. Slack <> Ticketing Integration - https://docs.clearfeed.ai/clearfeed-help-center/getting-started/for-customer-support/integrate-slack-and-external-ticketing-system
19. Integrate Clearfeed With Ms Teams - https://docs.clearfeed.ai/clearfeed-help-center/getting-started/using-clearfeed-with-microsoft-teams/integrate-clearfeed-with-ms-teams
61. Slack <> Ticketing Integration - https://docs.clearfeed.ai/clearfeed-help-center/getting-started/for-internal-support/integrate-slack-and-internal-ticketing-systems


In [13]:
# Evaluate search by checking if the target URL appears in the top 5 results
def evaluate_system():
    correct_predictions = 0
    for i, row in qa_data.iterrows():
        query = row['question']
        target_url = row['url']
        results = search_top_k(query)
        if target_url in results['url'].values:
            correct_predictions += 1

    accuracy = correct_predictions / len(qa_data)
    print(f"Search System Accuracy: {accuracy:.2%}")

# Run evaluation
evaluate_system()


Search System Accuracy: 90.70%


In [15]:
from transformers import pipeline

# Load pre-trained model for text generation
qa_model = pipeline('question-answering', model='distilbert-base-cased-distilled-squad')

# Generate an answer for a question based on the retrieved content
def generate_answer(question, top_results):
    context = ' '.join(top_results['text'].values)
    answer = qa_model(question=question, context=context)
    return answer['answer']

# Example usage
query = " Clearfeed?"
top_results = search_top_k(query)
generated_answer = generate_answer(query, top_results)
print("Generated Answer:", generated_answer)


Generated Answer: test drive


In [16]:
def calculate_mrr():
    mrr = 0
    for i, row in qa_data.iterrows():
        query = row['question']
        target_url = row['url']
        results = search_top_k(query)
        for rank, url in enumerate(results['url']):
            if url == target_url:
                mrr += 1 / (rank + 1)
                break
    mrr /= len(qa_data)
    print(f"Mean Reciprocal Rank (MRR): {mrr:.4f}")

calculate_mrr()


Mean Reciprocal Rank (MRR): 0.7442
