<a href="https://colab.research.google.com/github/Ghat0tkach/github-triage-bot/blob/main/Github_Triage_bot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import torch
import numpy as np
import pandas as pd
import json
import faiss
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer


In [None]:
!git clone https://github.com/Ghat0tkach/jlug-lenscape-event-frontend.git
!pip install transformers torch tqdm
!pip install groq
!pip install faiss-cpu sentence-transformers


In [None]:
# Load pre-trained model and tokenizer
# Both objects are module-level globals: chunk_code() reads `tokenizer`, and
# get_embeddings() reads `tokenizer` and `model`.
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")

In [3]:
def chunk_code(code, max_length=510):
    """Split text into consecutive pieces of at most ``max_length`` tokens.

    The text is tokenized with the module-level ``tokenizer``; every window of
    ``max_length`` tokens is then converted back into a string.

    Args:
        code: Source text to split.
        max_length: Maximum number of tokens per chunk. The default of 510
            presumably leaves room for the special tokens added when a chunk
            is later encoded with ``max_length=512`` - TODO confirm.

    Returns:
        The chunk strings in their original order; an empty list when the
        text produces no tokens.
    """
    all_tokens = tokenizer.tokenize(code)
    return [
        tokenizer.convert_tokens_to_string(all_tokens[start:start + max_length])
        for start in range(0, len(all_tokens), max_length)
    ]

def get_embeddings(code_chunk):
    """Embed text with the module-level ``tokenizer`` and ``model``.

    Args:
        code_chunk: Text to embed; it is truncated to 512 tokens.

    Returns:
        A NumPy array holding, for every row of the batch, the hidden state
        at sequence position 0 (the first two axes are batch and hidden size).
    """
    encoded = tokenizer(
        code_chunk,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding=True,
    )
    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    first_position = hidden_states[:, 0, :]
    return first_position.numpy()

def process_file(file_path):
    """Read one file, split it into chunks and embed every chunk.

    Args:
        file_path: Path of the text file to process.

    Returns:
        A pair ``(embeddings, metadata)`` of equal-length lists. Each entry of
        ``embeddings`` is the float32 array returned by ``get_embeddings`` for
        one chunk; the matching ``metadata`` entry is a dict with the keys
        ``file_path``, ``chunk_index`` and ``chunk_content``.
    """
    # Decode explicitly as UTF-8: without an encoding argument open() falls
    # back to the platform's locale encoding, so the same file could produce
    # different text (and different chunks) on different machines.
    # Undecodable bytes are still dropped, as before.
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        content = file.read()

    embeddings = []
    metadata = []

    for i, chunk in enumerate(chunk_code(content)):
        embeddings.append(get_embeddings(chunk).astype(np.float32))
        metadata.append({
            'file_path': file_path,
            'chunk_index': i,
            'chunk_content': chunk
        })

    return embeddings, metadata



In [None]:
ls

In [None]:

def process_repository(
    repo_path,
    extensions=('.py', '.js', '.java', '.cpp', '.c', '.html', '.css', '.jsx', '.tsx', '.ts'),
):
    """Embed every matching source file found below ``repo_path``.

    Args:
        repo_path: Root directory to walk.
        extensions: File-name suffixes to include. A single string is also
            accepted. Defaults to the list this notebook has always used.

    Returns:
        A pair ``(all_embeddings, all_metadata)``: the concatenation, in
        traversal order, of what ``process_file`` returns for each file.
    """
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(extensions)

    # First collect the files, so that one progress bar covers the whole run
    # instead of a new bar being opened for every directory visited.
    source_files = []
    for root, dirs, files in os.walk(repo_path):
        # Editing `dirs` in place steers os.walk: skip git's internal store
        # and visit the remaining directories in a reproducible order.
        dirs[:] = sorted(d for d in dirs if d != '.git')
        source_files.extend(
            os.path.join(root, name)
            for name in sorted(files)
            if name.endswith(suffixes)
        )

    all_embeddings = []
    all_metadata = []
    for file_path in tqdm(source_files, desc="Processing files"):
        # tqdm.write prints without breaking the progress bar rendering.
        tqdm.write(f"File  {os.path.basename(file_path)}")
        embeddings, metadata = process_file(file_path)
        all_embeddings.extend(embeddings)
        all_metadata.extend(metadata)

    return all_embeddings, all_metadata

# Process the repository
repo_path = 'jlug-lenscape-event-frontend'  # Adjust this to the cloned repo's path
embeddings, metadata = process_repository(repo_path)

print(f"Total embeddings generated: {len(embeddings)}")
# os.walk yields nothing for a missing directory, so a failed clone or a wrong
# path ends up here with empty lists; fail with a message that says so instead
# of an IndexError on embeddings[0].
if not embeddings:
    raise RuntimeError(
        f"No source files were embedded from '{repo_path}'. "
        "Check that the repository was cloned and that repo_path is correct."
    )
print(f"Sample embedding shape: {embeddings[0].shape}")
print(f"Sample metadata: {metadata[0]}")

In [None]:

# Save embeddings
# Every entry of `embeddings` is the 2-D array get_embeddings() produced for a
# single chunk; e[0] takes its first row, so the stacked array has one row
# per chunk.
embeddings_array = np.array([e[0] for e in embeddings])
np.save('embeddings.npy', embeddings_array)

# Save metadata
# Written in the same order as the rows above: entry i describes row i.
with open('metadata.json', 'w') as f:
    json.dump(metadata, f)

print("Embeddings saved to 'embeddings.npy'")
print("Metadata saved to 'metadata.json'")

In [None]:
# Load embeddings
loaded_embeddings = np.load('embeddings.npy')

# Load metadata
with open('metadata.json', 'r') as f:
    loaded_metadata = json.load(f)

# Row i of the matrix must be described by entry i of the metadata list;
# check it up front so a stale file pair fails with a readable message.
assert len(loaded_embeddings) == len(loaded_metadata), (
    f"embeddings.npy has {len(loaded_embeddings)} rows but metadata.json has "
    f"{len(loaded_metadata)} entries"
)

# Create a DataFrame
df = pd.DataFrame(loaded_embeddings)

# Add metadata columns
df['file_path'] = [item['file_path'] for item in loaded_metadata]
df['chunk_index'] = [item['chunk_index'] for item in loaded_metadata]

# Rename embedding columns
# The embedding columns still carry integer labels at this point; the two
# metadata columns are strings and are left untouched.
df.columns = [f'dim_{i}' if isinstance(i, int) else i for i in df.columns]

# Display the first few rows
print(df.head())

# Display info about the DataFrame
# df.info() prints its report itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
df.info()

# If you want to see all columns, you can use:
# pd.set_option('display.max_columns', None)
# print(df)

# Save as CSV if needed
df.to_csv('embeddings_with_metadata.csv', index=False)
print("Saved embeddings with metadata to 'embeddings_with_metadata.csv'")

In [None]:
# FAISS index setup and similarity search function
def setup_faiss_index(embeddings):
    """Build a flat L2 FAISS index over the given embeddings.

    Args:
        embeddings: Either a 2-D array with one row per vector, or a sequence
            of per-chunk arrays such as the list ``process_repository``
            returns (each of shape ``(1, dim)``). A single 1-D vector is
            treated as one row.

    Returns:
        A ``faiss.IndexFlatL2`` containing all vectors in input order.

    Raises:
        ValueError: If ``embeddings`` is an empty sequence, because the
            vector dimension cannot be determined.
    """
    # FAISS expects a C-contiguous float32 matrix of shape (n, dim).
    matrix = np.asarray(embeddings, dtype=np.float32)
    if matrix.ndim == 1:
        if matrix.size == 0:
            raise ValueError("cannot build a FAISS index from an empty embedding set")
        matrix = matrix[np.newaxis, :]
    elif matrix.ndim > 2:
        # A list of (1, dim) arrays stacks to (n, 1, dim); flatten each entry.
        matrix = matrix.reshape(matrix.shape[0], -1)
    matrix = np.ascontiguousarray(matrix)

    index = faiss.IndexFlatL2(matrix.shape[1])
    index.add(matrix)
    return index

# `embeddings` is the list returned by process_repository() in an earlier
# cell; setup_faiss_index() converts it into the matrix FAISS needs.
index = setup_faiss_index(embeddings)

# Function to search similar code snippets using FAISS
def find_similar_code_faiss(query_text, index, embeddings, metadata, top_k=3):
    """Return the metadata of the indexed chunks closest to ``query_text``.

    Args:
        query_text: Text to embed with ``get_embeddings`` and search for.
        index: FAISS index whose row ``i`` corresponds to ``metadata[i]``.
        embeddings: Unused; kept so existing callers keep working.
        metadata: Sequence of per-chunk metadata entries.
        top_k: Maximum number of hits to return.

    Returns:
        A pair ``(snippets, distances)``: the matching metadata entries, best
        first, and a NumPy array with the distance FAISS reported for each.
        Both are empty when the index is empty or ``top_k`` is not positive.
    """
    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with label -1, which would silently select metadata[-1].
    k = min(top_k, index.ntotal)
    if k <= 0:
        return [], np.empty(0, dtype=np.float32)

    query_embedding = np.ascontiguousarray(get_embeddings(query_text), dtype=np.float32)
    distances, indices = index.search(query_embedding.reshape(1, -1), k)

    # Defensive: drop any padded (-1) hit together with its distance.
    valid = indices[0] >= 0
    relevant_snippets = [metadata[i] for i in indices[0][valid]]
    return relevant_snippets, distances[0][valid]

# Sample query to test similarity search
# Runs one ad-hoc query against `index` and prints, for every hit, the source
# file, the distance reported by the IndexFlatL2 index (smaller means closer)
# and the chunk text.
query_text = "How to implement binary search"
relevant_snippets, distances = find_similar_code_faiss(query_text, index, embeddings, metadata)
for snippet, distance in zip(relevant_snippets, distances):
    print(f"File: {snippet['file_path']}, Distance: {distance}, Content: {snippet['chunk_content']}")


In [None]:
import numpy as np
import json
from scipy.spatial.distance import cosine
from groq import Groq
from transformers import AutoTokenizer, AutoModel
import torch

# Load pre-trained model and tokenizer
# Rebinds the globals `tokenizer` and `model` to the same
# "microsoft/codebert-base" checkpoint an earlier cell loads - presumably so
# this cell can be run on its own against the saved files.
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")

def encode_query(query):
    """Embed ``query`` with the module-level ``tokenizer`` and ``model``.

    Args:
        query: Text to embed; it is truncated to 512 tokens.

    Returns:
        A 1-D NumPy vector: the hidden state at sequence position 0 of the
        first row of the batch.
    """
    encoded = tokenizer(
        query,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding=True,
    )
    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    per_row_vectors = hidden_states[:, 0, :].numpy()
    return per_row_vectors[0]

# Load embeddings and metadata
# Rebinds the globals `embeddings` and `metadata` to the copies stored in
# 'embeddings.npy' and 'metadata.json' by an earlier cell.
embeddings = np.load('embeddings.npy')
with open('metadata.json', 'r') as f:
    metadata = json.load(f)

def find_relevant_snippets(query, embeddings, metadata, top_k=3):
    """Rank the stored chunks by cosine similarity to ``query``.

    Args:
        query: Question text; embedded with ``encode_query``.
        embeddings: Array-like with one embedding per row, aligned with
            ``metadata``.
        metadata: Sequence of per-chunk metadata entries.
        top_k: Maximum number of snippets to return.

    Returns:
        A pair ``(snippets, similarities)`` of equal-length lists, most
        similar first. Both are empty when there are no embeddings or
        ``top_k`` is not positive.
    """
    matrix = np.asarray(embeddings, dtype=np.float64)
    # top_k == 0 used to return *everything*, because seq[-0:] is the whole
    # sequence; treat non-positive values as "nothing requested".
    if top_k <= 0 or matrix.size == 0:
        return [], []
    matrix = matrix.reshape(matrix.shape[0], -1)

    query_embedding = np.asarray(encode_query(query), dtype=np.float64).ravel()

    # One matrix product instead of a Python-level loop over every row.
    dots = matrix @ query_embedding
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query_embedding)
    # A zero-length vector has no defined direction; score it 0.0 rather than
    # NaN, which argsort would place last and therefore rank as the best match.
    similarities = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    top_indices = np.argsort(similarities)[::-1][:top_k]
    return (
        [metadata[i] for i in top_indices],
        [float(similarities[i]) for i in top_indices],
    )

# Function to query Groq API
# The key is read from the environment so that it is never stored in the
# notebook (or in its version history). Set GROQ_API_KEY before running.
import os

_groq_api_key = os.environ.get("GROQ_API_KEY")
if not _groq_api_key:
    raise RuntimeError("Set the GROQ_API_KEY environment variable before running this cell.")
client = Groq(api_key=_groq_api_key)

def query_groq(question, context, similarities,
               model="mixtral-8x7b-32768", temperature=0.5, max_tokens=1024):
    """Ask the Groq chat API to answer ``question`` using ``context``.

    Args:
        question: The user's question.
        context: Text block with the retrieved snippets; it is inserted into
            the prompt verbatim.
        similarities: Unused; kept so existing callers keep working. The
            caller in this notebook already writes the scores into ``context``.
        model: Groq model identifier.
            NOTE(review): confirm the default id is still served by Groq.
        temperature: Sampling temperature passed to the API.
        max_tokens: Upper bound on the length of the generated answer.

    Returns:
        The text of the first completion choice.
    """
    prompt = f"""You are an AI assistant specialized in answering questions about code.
    Given the following code snippets, their relevance scores, and a question, provide a detailed answer.
    Use the relevance scores to weight the importance of each snippet in your answer.

    Code snippets and their relevance scores:
    {context}

    Question: {question}

    Answer:"""

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model=model,
        temperature=temperature,
        max_tokens=max_tokens,
    )

    return chat_completion.choices[0].message.content

# Main question-answering loop
while True:
    # Strip surrounding whitespace so that " exit " still quits.
    question = input("Ask a question about the code (or type 'exit' to quit): ").strip()
    if question.lower() == 'exit':
        break
    # A blank line would otherwise cost a model pass and an API call.
    if not question:
        continue

    relevant_snippets, similarities = find_relevant_snippets(question, embeddings, metadata)
    # One paragraph per snippet: file, score and chunk text.
    context = "\n\n".join([f"File: {snippet['file_path']}\nRelevance: {sim:.4f}\nChunk: {snippet['chunk_content']}"
                           for snippet, sim in zip(relevant_snippets, similarities)])

    answer = query_groq(question, context, similarities)
    print("\nAnswer:", answer)
    print("\n" + "="*50 + "\n")