In [1]:
import os

# Extensions (leading dot included) of the files that read_files() loads.
# Entries are grouped by language family; the order of the list is unchanged.
allowed_extensions = [
    # General-purpose and systems languages
    ".py", ".java", ".go", ".rs",
    # C family
    ".c", ".cpp", ".c++", ".h", ".hpp", ".cs",
    # Web
    ".js", ".jsx", ".ts", ".tsx", ".html", ".css", ".php",
    # Database
    ".sql",
    # Data / markup formats, currently disabled:
    # ".json", ".xml", ".yml", ".yaml", ".md",
    # Shell and scripting
    ".sh", ".bat", ".ps1",
]

def read_files(directory, extensions=None) -> dict:
    """Collect the lines of every matching source file under ``directory``.

    The tree is walked recursively with ``os.walk``. A file is read when its
    extension is one of ``extensions``; the comparison ignores case, so
    ``Main.PY`` is accepted by ``".py"``.

    Args:
        directory: Root directory to walk.
        extensions: Iterable of extensions (with the leading dot) to accept.
            When omitted, the module-level ``allowed_extensions`` is used.

    Returns:
        A dict mapping each file path (``os.path.join(root, name)``) to the
        list of that file's lines, without line terminators.

    Files that are not valid UTF-8 or cannot be opened are reported with
    ``print`` and left out of the result; they never abort the scan.
    """
    if extensions is None:
        extensions = allowed_extensions
    # Normalise once so the per-file check is a case-insensitive set lookup.
    accepted = {ext.lower() for ext in extensions}

    files_content = {}

    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            _, file_extension = os.path.splitext(file_path)
            if file_extension.lower() not in accepted:
                continue

            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    files_content[file_path] = f.read().splitlines()
            except (UnicodeDecodeError, OSError) as e:
                # OSError covers PermissionError as well as dangling symlinks,
                # files removed mid-walk and similar I/O failures.
                print(f"Could not read file {file_path}: {e}")

    return files_content

# Load every allowed source file under "repositories" once; later cells
# display and sample this dict (file path -> list of lines).
file_content = read_files("repositories")

In [13]:
# Regex search in all files
# Function to perform regex check on file content
import re

def search_files(query, files_content):
    """Find every occurrence of a regular expression in the loaded files.

    Args:
        query: Regular-expression pattern, compiled once with ``re.compile``.
        files_content: Dict mapping a file path to the list of its lines.

    Returns:
        A list with one entry per file that has at least one match, in the
        iteration order of ``files_content``. Each entry is a dict with
        ``'file_path'`` and ``'matches'``; every match records the 1-based
        ``'line_number'``, the 1-based ``'col_number'`` of the match start and
        the matched text under ``'match'``.
    """
    pattern = re.compile(query)
    hits_by_file = []

    for path, lines in files_content.items():
        # Matching is done per line, so a match never spans a line break.
        hits = [
            {
                'line_number': row,
                'col_number': found.start() + 1,
                'match': found.group(),
            }
            for row, text in enumerate(lines, start=1)
            for found in pattern.finditer(text)
        ]

        if not hits:
            continue

        hits_by_file.append({'file_path': path, 'matches': hits})

    return hits_by_file

# Function to print results in required format
def print_results(results):
    """Print the search results, one section per file.

    Args:
        results: List of dicts with a ``'file_path'`` and a ``'matches'`` list
            whose items carry ``'match'``, ``'line_number'`` and
            ``'col_number'``.

    For every file this prints its path, a "./"-prefixed path relative to the
    current working directory, one line per match and a dashed separator.
    """
    separator = "--------------------------------------------"

    for entry in results:
        path = entry['file_path']
        print(f"File: {path}")
        print("File Link: ./" + os.path.relpath(path))  # File link
        print("Matches:")
        for hit in entry['matches']:
            print(f' - "{hit["match"]}" at line {hit["line_number"]}, column {hit["col_number"]}')
        print(separator)

# Main execution function
def search_query_in_repo(directory, query):
    """Read the files under ``directory``, search them and print the outcome.

    Args:
        directory: Root directory passed to ``read_files``.
        query: Regular-expression pattern passed to ``search_files``.

    Prints the formatted matches, or "No matches found." when the search
    returns nothing.
    """
    # Files are read first, then searched.
    found = search_files(query, read_files(directory))

    if not found:
        print("No matches found.")
        return

    print_results(found)

# Example usage:
# The first argument is the base directory where your files are located;
# the second argument is the regex you want to search for.
# NOTE(review): the leading `.*` makes every match start at column 1 and
# include all text before the last "Provider" on the line (see the output
# below); drop it if only the identifier itself is wanted.
search_query_in_repo("repositories", '.*Provider')


File: repositories\Textbook-Assessment-Portal\src\App.js
File Link: ./repositories\Textbook-Assessment-Portal\src\App.js
Matches:
 - "import { ChakraProvider" at line 6, column 1
 - "import { AuthContextProvider" at line 20, column 1
 - "            <ChakraProvider" at line 36, column 1
 - "            </ChakraProvider" at line 105, column 1
--------------------------------------------
File: repositories\Textbook-Assessment-Portal\src\index.js
File Link: ./repositories\Textbook-Assessment-Portal\src\index.js
Matches:
 - "import { AuthContextProvider" at line 6, column 1
 - "     <AuthContextProvider" at line 11, column 1
 - "    </AuthContextProvider" at line 13, column 1
--------------------------------------------
File: repositories\Textbook-Assessment-Portal\src\Context\AuthContext.js
File Link: ./repositories\Textbook-Assessment-Portal\src\Context\AuthContext.js
Matches:
 - "export const AuthContextProvider" at line 16, column 1
 - "    <AuthContext.Provider" at line 32, column 1
 

In [12]:
# Inspect the loaded files (file path -> list of lines).
# NOTE(review): this displays the whole dict; showing only a few keys would
# keep the saved notebook output small.
file_content

{'repositories\\Scalable-Notification-System\\main.go': ['package main',
  '',
  'import (',
  '\t"fmt"',
  '',
  '\t"github.com/SatvikG7/Scalable-Notification-System/config"',
  '\t"github.com/SatvikG7/Scalable-Notification-System/internal/db"',
  '\t"github.com/SatvikG7/Scalable-Notification-System/internal/rabbitmq"',
  '\t"github.com/SatvikG7/Scalable-Notification-System/internal/server"',
  ')',
  '',
  'func main() {',
  '\tif err := config.ConfigENV(); err != nil {',
  '\t\tfmt.Println("Error loading environment variables")',
  '\t\treturn',
  '\t}',
  '',
  '\tif err := db.Init(); err != nil {',
  '\t\tfmt.Println("Error initializing database")',
  '\t\treturn',
  '\t}',
  '',
  '\tgo server.Init()',
  '',
  '\t// Define worker pools',
  '\tpools := []*rabbitmq.WorkerPool{',
  '\t\trabbitmq.NewWorkerPool("notifications_email_high", rabbitmq.NewRateLimiter(50), 3, 5),',
  '\t\trabbitmq.NewWorkerPool("notifications_sms_high", rabbitmq.NewRateLimiter(10), 3, 5),',
  '\t\trabbitmq.

In [2]:
# Number of files picked up under "repositories" (walks the directory again).
len(read_files("repositories"))

192

In [3]:
from langchain_ollama import ChatOllama

# Chat model (langchain_ollama's ChatOllama) that the notebook invokes once
# per file to produce a natural-language description.
# NOTE(review): temperature 0.8 presumably makes the descriptions vary between
# runs — confirm that is intended for text that is later embedded and searched.
# NOTE(review): num_predict presumably limits the response to 256 tokens —
# confirm against the ChatOllama documentation.
llm = ChatOllama(
    model = "gemma2:2b",
    temperature = 0.8,
    num_predict = 256,
)

In [4]:
# How many files to describe in this run; one LLM call is made per file.
MAX_FILES_TO_DESCRIBE = 5

# Take the first files into a separate dict instead of overwriting
# `file_content`, so the full set of loaded files stays available to the
# other cells and this cell can be re-run without side effects on it.
sample_files = dict(list(file_content.items())[:MAX_FILES_TO_DESCRIBE])

# file path -> description text returned by the model
descriptions = {}

for filename, content in sample_files.items():
    # Rebuild the file text; every line is followed by a newline.
    file = "".join(line + "\n" for line in content)

    messages = [
        ("system", "You are a code description generator. Given a file name and its content, generate a concise, descriptive summary of the code's purpose and functionality. The description should be detailed enough to allow retrieval based on natural language queries. For example, if the code is a middleware function in Node.js, generate a description like 'This code implements middleware for a Node.js application."),
        ("human", f"Please generate a description for the following code snippet: {file} \n filename: {filename}"),
    ]
    response = llm.invoke(messages)
    descriptions[filename] = response.content

In [5]:
# Inspect the generated descriptions (file path -> model response text).
descriptions

{'repositories\\Scalable-Notification-System\\main.go': 'This code defines the main execution flow of a scalable notification system application. It initializes various components such as config settings, database connection, and worker pools for different types of notifications. \n\nThe `main` function first loads environment variables from the configuration file. Then it establishes the connection to the database, and initializes the server using `go server.Init()`. The code then defines multiple worker pools for different notification channels (email, SMS, push) and their corresponding rate limits. Finally, the `rabbitmq.Scheduler()` function manages the communication between these workers and the RabbitMQ queue using the defined worker pools and connections. \n\n**Note:** The code utilizes packages from the `github.com/SatvikG7/Scalable-Notification-System` repository to manage configuration, database operations, network communication with RabbitMQ, and server initialization.',
 'r

In [6]:
import uuid

def generate_uuid(repo_name: str) -> str:
    """Return a deterministic UUID string for ``repo_name``.

    The value is a version-5 UUID computed from ``repo_name`` in the
    ``uuid.NAMESPACE_DNS`` namespace, so the same input always yields the
    same identifier.
    """
    derived = uuid.uuid5(uuid.NAMESPACE_DNS, repo_name)
    return str(derived)


In [7]:
from langchain_core.documents import Document

# Build one Document per described file.
# The document id is the same path-derived UUID that is used as the store id,
# and the file path is kept in the metadata so a retrieved description can be
# traced back to the file it describes.
docs = []
uuids = []
for filename, description in descriptions.items():
    doc_id = generate_uuid(filename)
    docs.append(Document(
        id=doc_id,
        page_content=description,
        metadata={"source": filename},
    ))
    uuids.append(doc_id)

In [11]:
"a\\asdf"

'a\\asdf'

In [8]:
from langchain_ollama.embeddings import OllamaEmbeddings

# Embedding model that is passed to the Chroma vector store as its
# embedding_function.
embeddings = OllamaEmbeddings(
    model="nomic-embed-text"
)

In [9]:
from langchain_chroma import Chroma

# Chroma collection named "descriptions" that embeds documents with
# `embeddings` and is configured with the persist directory "./vector_store".
vector_store = Chroma(
    collection_name="descriptions",
    embedding_function=embeddings,
    persist_directory="./vector_store",
)

In [10]:
# Insert the description documents, keyed by the UUIDs derived from each
# file path with generate_uuid().
vector_store.add_documents(documents=docs, ids=uuids)

['f49c4048-ebd5-5619-86f3-749b48c91505',
 '2b11aedb-6949-5573-848e-5de04a9edc00',
 '19c9a972-977c-5656-b23d-8ec9b94c0e51',
 '74a3fb79-efc8-5830-b096-2f628434e1db',
 '2175f648-081a-525d-99ec-965dc67060c9']

In [11]:
# Retriever over the description store, configured for similarity search
# with k=1 (presumably a single best-matching description per query).
retriever = vector_store.as_retriever(
  search_type="similarity", 
  search_kwargs={'k': 1}
)