In [1]:
import pandas as pd
import anthropic
from tqdm.auto import tqdm
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
import json

In [2]:
with open('./data/documents.json', 'r') as f_in:
    documents = json.load(f_in)

In [3]:
documents[0]

{'id': '3ba4d080-97a6-4954-829e-121e008c43e9',
 'page_content': 'In the `hfla-site` Slack channel, send an introductory message with your GitHub handle/username asking to be added to the Hack for LA website GitHub repository (this repository).  \n**NOTE:** Once you have accepted the GitHub invite (comes via email or in your GitHub notifications), **please do the following**:  \n1. Make your own Hack for LA GitHub organization membership public by following this [guide](https://help.github.com/en/articles/publicizing-or-hiding-organization-membership#changing-the-visibility-of-your-organization-membership).\n2. Set up two-factor authentication on your account by following this [guide](https://docs.github.com/en/github/authenticating-to-github/configuring-two-factor-authentication).  \n***',
 'header_1': '**How to Contribute to Hack for LA**',
 'header_2': '**Part 1: Setting up the development environment**',
 'header_3': '**1.1 Dev setup (1): Join the repository team**'}

## Indexing Stage

In [4]:
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')



In [5]:
query = 'Where do I find github issues to work on?'

In [6]:
v = model.encode(query)

In [7]:
len(v)

384

In [8]:
index_settings = {
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0
  },
  "mappings": {
    "dynamic": True,
    "properties": {
      "id": { "type": "keyword" },
      "page_content": { "type": "text" },
      "header_1": { "type": "text" },
      "header_2": { "type": "text" },
      "header_3": { "type": "text" },
      "header_4": { "type": "text"},
      "header_5": { "type": "text"},
      "page_content_vector": {
        "type": "dense_vector",
        "dims": 384,
        "index": True,
        "similarity": "cosine"
      },
     "metadata_vector": {
        "type": "dense_vector",
        "dims": 384,
        "index": True,
        "similarity": "cosine"
      },
     "combined_vector": {
        "type": "dense_vector",
        "dims": 384,
        "index": True,
        "similarity": "cosine"
      }
    }
  }
}

index_name = "contributing_h4la"

In [9]:
es_client = Elasticsearch('http://localhost:9200')

In [10]:
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'contributing_h4la'})

In [11]:
for doc in tqdm(documents):
    # extract content from doc
    content = doc.get('page_content')
    headers = ' '.join([doc.get(f'header_{i}', '') for i in range(1, 6)])

    # combine headers and content for full text encoding
    combined_text = headers + ' ' + content

    # encode content and headers
    doc['page_content_vector'] = model.encode(content)
    doc['metadata_vector'] = model.encode(headers)
    doc['combined_vector'] = model.encode(combined_text)

  0%|          | 0/60 [00:00<?, ?it/s]

In [12]:
for doc in tqdm(documents):
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(e)

  0%|          | 0/60 [00:00<?, ?it/s]

## Retrieval Evaluation Stage

In [14]:
from typing import Dict
# from langchain.embeddings import SentenceTransformerEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings

In [15]:
from langchain_elasticsearch import ElasticsearchRetriever

In [16]:
es_url = 'http://localhost:9200'

In [20]:
# embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/multi-qa-MiniLM-L6-cos-v1")
embeddings = HuggingFaceEmbeddings(model_name="multi-qa-MiniLM-L6-cos-v1")

In [21]:
# will return keyword search dictionary and vector search dictionary

def hybrid_query(search_query: str) -> Dict:
    vector = embeddings.embed_query(search_query)  # same embeddings as for indexing
    
    return {
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": search_query,
                        "fields": ["page_content", "header_1", "header_2", "header_3", "header_4", "header_5"],
                        "type": "best_fields",
                        "boost": 0.5,
                    }
                },
            }
        },
        "knn": {
            "field": "combined_vector",
            "query_vector": vector,
            "k": 5,
            "num_candidates": 10000,
            "boost": 0.5,
        },
        "size": 5, # output the size
    }

In [22]:
hybrid_retriever = ElasticsearchRetriever.from_es_params(
    index_name=index_name,
    body_func=hybrid_query,
    content_field='page_content',
    url=es_url,
)

In [23]:
# produces the results => list of the answers
hybrid_results = hybrid_retriever.invoke(query)

In [24]:
# print(hybrid_results[0].metadata['_source'])

for result in hybrid_results:
    print(result.metadata['_source']['header_1'],
          result.metadata['_source']['header_2'],
          result.metadata['_source']['header_3'],
          result.metadata['_source'].get('header_4', ''),
          result.metadata['_source'].get('header_5', ''),
          result.metadata['_score']
         )

**How to Contribute to Hack for LA** **Part 3: Pull Requests** **3.1 How to make a pull request** **3.1.b Complete pull request on Hack for LA `website` repo** **vi. After pull request is submitted/merged** 9.00024
**How to Contribute to Hack for LA** **Part 2: How the Website team works with GitHub issues** **2.3 Where can I find GitHub issues to work on?**   7.2144165
**How to Contribute to Hack for LA** **Part 2: How the Website team works with GitHub issues** **2.3 Where can I find GitHub issues to work on?** **2.3.a Available issues for new members**  7.202378
**How to Contribute to Hack for LA** **Part 2: How the Website team works with GitHub issues** **2.3 Where can I find GitHub issues to work on?** **2.3.d What if you see bugs/errors that are not connected to an issue?**  7.194034
**How to Contribute to Hack for LA** **Part 2: How the Website team works with GitHub issues** **2.3 Where can I find GitHub issues to work on?** **2.3.b Available issues for returning members**  7.

In [188]:
print(hybrid_results[0].page_content)

**NOTE**: After completing your assignment and committing all of the changes, you must leave your current branch and return to the `gh-pages` branch.  
Run the following command to return to the `gh-pages` branch:  
```bash
git checkout `gh-pages`
```
Once your pull request is merged you can delete your branch with the following command:  
```bash
git branch -d update-give-link-2093
```  
Now you are all set to work on a new PR. Start over at [**2.3 Where can I find GitHub issues to work on?**](#23-where-can-i-find-github-issues-to-work-on) and repeat completing parts 2 and 3.


## Hybrid Search

In [26]:
df_ground_truth = pd.read_csv('./data/ground-truth-retrieval.csv')

In [27]:
ground_truth = df_ground_truth.to_dict(orient='records')

In [28]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [29]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [191]:
def elastic_search_hybrid(field, query):
    def hybrid_query(search_query: str) -> Dict:
        vector = embeddings.embed_query(search_query)
        
        return {
            "query": {
                "bool": {
                    "must": {
                        "multi_match": {
                            "query": search_query,
                            "fields": ["page_content", "header_1", "header_2", "header_3", "header_4", "header_5"],
                            "type": "best_fields",
                            "boost": 0.3,
                        }
                    },
                }
            },
            "knn": {
                "field": field,
                "query_vector": vector,
                "k": 5,
                "num_candidates": 1000,
                "boost": 2,
            },
            "size": 5,
            "_source": ["page_content", "header_1", "header_2", "header_3", "header_4", "header_5", "id"],
        }
    
    
    hybrid_retriever = ElasticsearchRetriever.from_es_params(
        index_name=index_name,
        body_func=hybrid_query,
        content_field='page_content',
        url=es_url,
    )

    hybrid_results = hybrid_retriever.invoke(query)
    
    result_docs = []
    
    for hit in hybrid_results:
        result_docs.append(hit.metadata['_source'])

    return result_docs

In [31]:
question = ground_truth[0]['question']

In [32]:
print(question)

How do I request access to the Hack for LA website GitHub repository?


In [None]:
# hybrid_results[0].metadata['_source']

# LOOK INTO THIS
# Document(page_content=content, metadata=hit)

In [33]:
elastic_search_hybrid("combined_vector", question)

[{'id': '3ba4d080-97a6-4954-829e-121e008c43e9',
  'header_1': '**How to Contribute to Hack for LA**',
  'header_2': '**Part 1: Setting up the development environment**',
  'header_3': '**1.1 Dev setup (1): Join the repository team**'},
 {'id': '7da5e900-3183-4d40-b0af-021562c21707',
  'header_1': '**How to Contribute to Hack for LA**',
  'header_2': '**Part 1: Setting up the development environment**',
  'header_3': '**1.4 Dev setup (4): Clone (Create) a copy on your computer**',
  'header_4': '**1.4.c What if you accidentally cloned using the repository URL from the HackForLA Github (instead of the fork on your Github)?**',
  'header_5': '**i. Resolve remote (1): reset `origin` remote url**'},
 {'id': '729f9112-7dc4-4ad3-91c0-498578f7df13',
  'header_1': '**How to Contribute to Hack for LA**',
  'header_2': '**Part 1: Setting up the development environment**',
  'header_3': '**1.4 Dev setup (4): Clone (Create) a copy on your computer**',
  'header_4': '**1.4.c What if you accidentally

In [216]:
def question_text_hybrid(q):
    question = q['question']

    return elastic_search_hybrid('combined_vector', question)

In [35]:
def evaluate(ground_truth, search_function):
    relevance_total = []
    
    # relevance is matching the id from the questions to the id in the documents
    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [217]:
evaluate(ground_truth, question_text_hybrid)

  0%|          | 0/242 [00:00<?, ?it/s]

{'hit_rate': 0.8925619834710744, 'mrr': 0.7216942148760329}

## Hybrid Search with RRF / ElasticSearch

In [37]:
def compute_rrf(rank, k=60):
    """ Own implementation of the relevance score """
    
    return 1 / (k + rank)

In [222]:
def elastic_search_hybrid_rrf(field, query, vector, k=60):
    # vector = embeddings.embed_query(query)
    
    # KNN query
    knn_query = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": 10,
            "num_candidates": 1000,
            "boost": 0.5,
        },
        "size": 10,
        "_source": ["page_content^3", "header_1^2", "header_2", "header_3", "header_4", "header_5"],
    }
    
    # Keyword query
    keyword_query = {
        "query": {
            "multi_match": {
                "query": query,
                "fields": ["page_content^3", "header_1^2", "header_2", "header_3", "header_4", "header_5"],
                "type": "best_fields",
                "boost": 0.5,
            }
        },
        "size": 10,
        # "_source": ["page_content", "header_1", "header_2", "header_3", "header_4", "header_5", "id"],
    }
    
    # Perform searches
    knn_results = es_client.search(index=index_name, body=knn_query)['hits']['hits']
    keyword_results = es_client.search(index=index_name, body=keyword_query)['hits']['hits']
    
    # Apply RRF
    rrf_scores = {}
    
    for rank, hit in enumerate(knn_results):
        doc_id = hit['_id']
        rrf_scores[doc_id] = compute_rrf(rank + 1, k)
    
    for rank, hit in enumerate(keyword_results):
        doc_id = hit['_id']
        
        if doc_id in rrf_scores:
            rrf_scores[doc_id] += compute_rrf(rank + 1, k)
        else:
            rrf_scores[doc_id] = compute_rrf(rank + 1, k)
    
    # Sort and get top results
    reranked_docs = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)
    
    final_results = []
    
    for doc_id, score in reranked_docs[:5]:
        doc = es_client.get(index=index_name, id=doc_id)
        
        final_results.append(doc['_source'])

    #print("RESULT: ", final_results[0])
    return final_results

In [226]:
def question_text_hybrid_rrf(q):
    question = q['question']

    v_q = model.encode(question)
    # v_q = embeddings.embed_query(question)

    return elastic_search_hybrid_rrf('page_content_vector', question, v_q)

In [227]:
evaluate(ground_truth, question_text_hybrid_rrf)

  0%|          | 0/242 [00:00<?, ?it/s]

{'hit_rate': 0.8966942148760331, 'mrr': 0.719421487603306}

## RAG Flow

In [228]:
client = anthropic.Anthropic()

In [243]:
def search_hyrid_rrf(q):
    v_q = model.encode(q)

    return elastic_search_hybrid_rrf('page_content_vector', q, v_q)

In [None]:
def elastic_search(query):
    search_query = {
        "size": 3,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["page_content", "header_1", "header_2", "header_3", "header_4", "header_5"],
                        "type": "best_fields",
                        "boost": 1
                    }
                },
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    result_docs = []

    for hit in response['hits']['hits']:
        doc = hit['_source']
        doc['_score'] = hit['_score']
        
        result_docs.append(doc)

    return result_docs

In [229]:
prompt_template = """
You're an assistant to an open source software engineering project on github. Answer the QUESTION based on the CONTEXT from our
contributor FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

entry_template = """
page_content: {page_content}
""".strip()

In [230]:
def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    
    return prompt

In [245]:
def llm(prompt):
    response = client.messages.create(
        model="claude-3-5-sonnet-20240620",
        max_tokens=1024,
        messages=[
            {"role": "user", "content": prompt}
        ]
    )

    print('token usage: ', response.usage) # get tokens info
    print('==============================')
    return response.content[0].text

In [241]:
def rag(query):
    search_results = search_hyrid_rrf(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)

    return answer

In [244]:
answer = rag(query)
print(answer)

token usage:  Usage(input_tokens=1023, output_tokens=68)
The best way to find GitHub issues to work on is by checking the GitHub Project Board, specifically the 'Prioritized Backlog' column. This column contains all available issues that have been approved and prioritized. The issues are filtered so that the top issue has the highest priority and should be worked on next.
