In [1]:
import json
import os
from openai import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
import chromadb
from chromadb.config import Settings
import requests

from langchain_openai import OpenAIEmbeddings
import chromadb
from chromadb.config import Settings

import numpy as np 

In [33]:
# Set up API key and embedding model
openai_api_key = os.getenv("OPENAI_API_KEY")
openai_client = OpenAI(api_key=openai_api_key)
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
embedding_model =OpenAIEmbeddings(openai_api_key=openai_api_key)
file_path = "Push_Commit_summary_outout.txt"

In [3]:
def create_summary_of_events():
    # with open("push_events.json", "r") as file:
    #     data = json.load(file)
    # owner = data[2]
    # repo_name = data[3]
    api_url = f"https://api.github.com/repos/arunkenwal02/code-validator/events"
    response = requests.get(api_url)
    events = response.json()
    push_ids = [id['id'] for id in events]
    # print(push_ids)
    data = ['52949273211','52821120274']
    push_events = [e for e in events if e['type'] == 'PushEvent']

    ids = [e['id'] for e in push_events]
    try:
        idx1 = ids.index(data[0])
        idx2 = ids.index(data[1])
    except ValueError:
        return {"error": "One or both push IDs not found in recent events."}

    start = min(idx1, idx2)
    end = max(idx1, idx2)

    history_between = push_events[start:end+1]  
    grouped_push_events = []
    commits_list = []
    for event in history_between:
        push_id = event['id']
        created_at = event['created_at']
        repo = event['repo']['name']
        commits_list = []

        for commit in event["payload"]["commits"]:
            sha = commit['sha']
            author = commit['author']['name']
            message = commit['message']

            commit_detail_url = f"https://api.github.com/repos/arunkenwal02/code-validator/commits/{sha}"
            commit_detail_response = requests.get(commit_detail_url)

            if commit_detail_response.status_code != 200:
                diff = "❌ Failed to fetch diff"
            else:
                commit_detail = commit_detail_response.json()
                diffs = []
                for file in commit_detail.get('files', []):
                    patch = file.get('patch')
                    if patch:
                        diffs.append(f"File: {file['filename']}\n{patch}")
                diff = "\n\n".join(diffs) if diffs else "No diff available"

            commits_list.append({
                "sha": sha,
                "author": author,
                "commit_message": message,
                "code_diff": diff
            })

        grouped_push_events.append({
            "push_id": push_id,
            "repo": repo,
            "created_at": created_at,
            "commits": commits_list
        })
    return grouped_push_events

In [4]:
grouped_push_events = create_summary_of_events()
grouped_push_events

[{'push_id': '52949273211',
  'repo': 'arunkenwal02/code-validator',
  'created_at': '2025-08-05T05:36:54Z',
  'commits': [{'sha': 'c536e703bf0e78761d6374044ef7d9c2bb482131',
    'author': 'Vipin',
    'commit_message': 'Updated requirements.txt',
    'code_diff': 'File: requirements.txt\n@@ -1,119 +1,234 @@\n+GitPython==3.1.45\n+Jinja2==3.1.6\n+Markdown==3.8.2\n+MarkupSafe==3.0.2\n+PyMuPDF==1.26.3\n+PyPika==0.48.9\n+PyYAML==6.0.2\n+RapidFuzz==3.13.0\n+SQLAlchemy==2.0.40\n+acres==0.5.0\n+aiofiles==24.1.0\n+aiohappyeyeballs==2.6.1\n+aiohttp==3.11.18\n+aiosignal==1.3.2\n altair==5.5.0\n+annotated-types==0.7.0\n anyio==4.9.0\n-appnope==0.1.4\n-argon2-cffi==25.1.0\n-argon2-cffi-bindings==21.2.0\n-arrow==1.3.0\n-asttokens==3.0.0\n-async-lru==2.0.5\n attrs==25.3.0\n-babel==2.17.0\n+backoff==2.2.1\n+bcrypt==4.3.0\n beautifulsoup4==4.13.4\n-bleach==6.2.0\n blinker==1.9.0\n-cachetools==6.1.0\n-certifi==2025.7.14\n+build==1.2.2.post1\n+cachetools==5.5.2\n+certifi==2025.4.26\n cffi==1.17.1\n+cfgv

In [5]:
def extract_commit_info(json_str, tag_id):
    push_info =json_str
    commit_structured = []
    for commit in push_info['commits']:
        # Simple file name extraction for single-file diffs
        file_line = commit['code_diff'].split('\n')[0]
        file_name = file_line.replace('File: ', '').strip()
        commit_structured.append({
            "push_id": push_info["push_id"],
            "repo": push_info["repo"],
            "created_at": push_info["created_at"],
            "commit_sha": commit["sha"],
            "author": commit["author"],
            "commit_message": commit["commit_message"],
            "file_name": file_name,
            "code_diff": commit["code_diff"],
            "tag_id": tag_id
        })
    return commit_structured

all_records = []
for js in grouped_push_events:
    records = extract_commit_info(js, tag_id="push_metadata")
    all_records.extend(records)
    

In [6]:
# all_records[0] 

In [20]:
# Set up ChromaDB client and collection
# chroma_client = chromadb.Client(Settings(persist_directory="./chroma_openai1"))
# collection = chroma_client.get_or_create_collection(name="git_diff_chunks")


chroma_client = chromadb.PersistentClient(path="./chroma_storage")
collection = chroma_client.get_or_create_collection(name="git_diff_chunks")


In [21]:
collections = chroma_client.list_collections()
for c in collections:
    print(c.name)

# chroma_client.delete_collection(name="git_diff_chunks")

# collection = chroma_client.get_or_create_collection(name="git_diff_chunks")

git_diff_chunks


In [26]:
def store_push_and_commit_into_chroma_db(all_records,collection, embedding_model):

    documents = [rec["code_diff"] for rec in all_records]
    metadatas = [
        {
            "push_id": rec["push_id"],
            "repo": rec["repo"],
            "created_at": rec["created_at"],
            "commit_sha": rec["commit_sha"],
            "author": rec["author"],
            "commit_message": rec["commit_message"],
            "file_name": rec["file_name"],
            "tag_id": rec["tag_id"]
        }
        for rec in all_records
    ]
    ids = [f'{rec["push_id"]}_{rec["commit_sha"]}' for rec in all_records]

    dummy_vec = np.random.rand(1536).tolist()  # This creates a random 1536-d vector

    # collection.add(
    #     documents=["dummy doc"],            # or your real text
    #     embeddings=[dummy_vec],             # your real 1536-d vector
    #     ids=["dummy_id"]
    # )

    for rec in all_records:
    
        code_diff_embedding = embedding_model.embed_query(rec["code_diff"])
        commit_msg_embedding = embedding_model.embed_query(rec["commit_message"])

        # Prepare metadata (excluding the actual text fields)
        metadata = {k: v for k, v in rec.items() if k not in ["code_diff", "commit_message"]}

        # Store code_diff embedding
        
        print(code_diff_embedding)
        collection.add(
            documents=[rec["code_diff"]],
            embeddings=[code_diff_embedding],
            ids=[f'{rec["push_id"]}_{rec["commit_sha"]}_code'],
            metadatas=[{**metadata, "embedding_type": "code_diff"}]
        )

        # Store commit_message embedding
        print("---------------------------------------------------------------")
        print(commit_msg_embedding)
        collection.add(
            documents=[rec["commit_message"]],
            embeddings=[code_diff_embedding],
            ids=[f'{rec["push_id"]}_{rec["commit_sha"]}_msg'],
            metadatas=[{**metadata, "embedding_type": "commit_message"}]
        )

    
    # chroma_client.persist() 

    return collection

In [27]:
collection = store_push_and_commit_into_chroma_db(all_records = all_records,collection = collection, embedding_model = embedding_model)

[0.009179099462926388, 0.00037041306495666504, 0.009941188618540764, -0.012295502237975597, -0.01581336371600628, 0.012955525889992714, -0.013622354716062546, -0.002585661830380559, -0.01593584194779396, -0.028496714308857918, 0.03753292188048363, -0.01743280328810215, -0.003286512102931738, 0.008682379499077797, 0.0019851757679134607, 0.0006255770567804575, 0.01130206324160099, -0.0029922230169177055, 0.018671199679374695, -0.007831833325326443, 0.032878730446100235, -0.01231591496616602, -0.03432125598192215, -0.030592460185289383, -0.014112269505858421, 0.02065807580947876, 0.005389063619077206, -0.028877759352326393, -0.001297934097237885, -0.005889184772968292, 0.047195132821798325, -0.016384929418563843, -0.025897443294525146, -0.013676789589226246, -0.008627944625914097, -0.001388942589983344, 0.003973753657191992, -0.012452002614736557, 0.020508380606770515, 0.01732393354177475, -0.0043582008220255375, 0.012023326940834522, 0.00834216084331274, 0.03766901046037674, -0.033069252

In [None]:
# # Generate asd store embeddings
# for rec in all_records:
    
#     code_diff_embedding = embedding_model.embed_query(rec["code_diff"])
#     commit_msg_embedding = embedding_model.embed_query(rec["commit_message"])

#     # Prepare metadata (excluding the actual text fields)
#     metadata = {k: v for k, v in rec.items() if k not in ["code_diff", "commit_message"]}

#     # Store code_diff embedding
    
#     print(code_diff_embedding)
#     collection.add(
#         documents=[rec["code_diff"]],
#         embeddings=[code_diff_embedding],
#         ids=[f'{rec["push_id"]}_{rec["commit_sha"]}_code'],
#         metadatas=[{**metadata, "embedding_type": "code_diff"}]
#     )

#     # Store commit_message embedding
#     print("---------------------------------------------------------------")
#     print(commit_msg_embedding)
#     collection.add(
#         documents=[rec["commit_message"]],
#         embeddings=[commit_msg_embedding],
#         ids=[f'{rec["push_id"]}_{rec["commit_sha"]}_msg'],
#         metadatas=[{**metadata, "embedding_type": "commit_message"}]
#     )

In [28]:
data = collection.get()

In [29]:
data['metadatas']

[{'push_id': '52949273211',
  'author': 'Vipin',
  'tag_id': 'push_metadata',
  'file_name': 'requirements.txt',
  'repo': 'arunkenwal02/code-validator',
  'commit_sha': 'c536e703bf0e78761d6374044ef7d9c2bb482131',
  'created_at': '2025-08-05T05:36:54Z',
  'embedding_type': 'code_diff'},
 {'tag_id': 'push_metadata',
  'file_name': 'requirements.txt',
  'commit_sha': 'c536e703bf0e78761d6374044ef7d9c2bb482131',
  'author': 'Vipin',
  'embedding_type': 'commit_message',
  'push_id': '52949273211',
  'created_at': '2025-08-05T05:36:54Z',
  'repo': 'arunkenwal02/code-validator'},
 {'embedding_type': 'code_diff',
  'tag_id': 'push_metadata',
  'repo': 'arunkenwal02/code-validator',
  'author': 'Vipin',
  'push_id': '52949184387',
  'created_at': '2025-08-05T05:33:50Z',
  'file_name': 'No diff available',
  'commit_sha': '5ef4b5b48eca8cad8d86e8d88904fcb2384e8dbb'},
 {'push_id': '52949184387',
  'created_at': '2025-08-05T05:33:50Z',
  'embedding_type': 'commit_message',
  'repo': 'arunkenwal02/

In [30]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage

llm = ChatOpenAI(model="gpt-4o", openai_api_key=openai_api_key)


def code_dif_summary(llm,collection ):

    results = collection.get(include=["documents", "metadatas"])

    # Extract only code diffs, using metadata
    code_diffs = [
        doc
        for doc, meta in zip(results["documents"], results["metadatas"])
        if meta.get("embedding_type") == "code_diff"
    ]

    code_diff_text = "\n\n".join(code_diffs)
    prompt = (
        "Summarize the following list of code diffs. Focus on major changes, added/removed functions, "
        "refactored code, or structural modifications. Use concise and clear language."
    )

    response = llm([
        SystemMessage(content=prompt),
        HumanMessage(content=code_diff_text)
    ])

    # print("Summary of code differences:\n")
    # print(response.content)

    return response.content

def comment_summary(llm,collection ):
    data = collection.get()
    summaries = []
    for meta in data['metadatas']:
        summary = (
            f"Commit by {meta.get('author', 'Unknown')} to repo '{meta.get('repo', '')}'\n"
            f"File changed: {meta.get('file_name', '')}\n"
            f"Push ID: {meta.get('push_id', '')}\n"
            f"Commit SHA: {meta.get('commit_sha', '')}\n"
            # f"Created at: {meta.get('created_at', '')}\n"
            # f"Type: {meta.get('embedding_type', '')}\n"
            f"Commit_message: {meta.get('commit_message', '')}\n"
            "------"
        )
        summaries.append(summary)

    summaries = "\n".join(summaries)

    # Prepare metadata as pretty strings (or dicts)
    meta_strings = summaries

    prompt = (
        '''
        You are an assistant. Summarize the following list of meta data commit message. 
        Summarizre the commit changes with commit sha,  file changes and keep both commit message  and commit summary eg. there are 4 commit output should be 4 pointer 
        and output structure:
        1. **Commit SHA:** c536e703bf0e78761d6374044ef7d9c2bb482131  
        **File Changed:** requirements.txt  
        **Commit Message:** Updated the requirements.txt
        **Commit Message Summary:** Updated the requirements.txt file.
        
        '''
    )

    response = llm([
        SystemMessage(content=prompt),
        HumanMessage(content="\n\n".join(meta_strings))
    ])

    # print(response.content)

    return response.content


  llm = ChatOpenAI(model="gpt-4o", openai_api_key=openai_api_key)


In [None]:
def comit_push_summary(llm,collection, file_path):
    code_dif_summ =code_dif_summary(llm,collection )
    comment_dif_summ = comment_summary(llm,collection )

    # Combine them into one text string
    combined_text = f"This is Push code summary differenc \n {code_dif_summ}\n\n Thsis is Commit summary b/w these pushes \n {comment_dif_summ}"


    print("Diff of Push Code summary:\n",code_dif_summ)
    print("+++++++++++++++++++++++++++++++++++++++++++++++++")
    print("Commit summary \n", comment_dif_summ )


    # Save to .txt file

    with open(file_path, "w", encoding="utf-8") as f:
        f.write(combined_text)

    print(f"Summary saved to {file_path}")


In [35]:


comit_push_summary(llm,collection, file_path)

  response = llm([


Diff of Push Code summary:
 ### Major Changes:

#### `requirements.txt`:
- **Additions**: Added numerous new dependencies, such as `GitPython`, `Jinja2`, `Markdown`, `PyMuPDF`, `SQLAlchemy`, `FastAPI`, and several `langchain`-related packages. 
- **Removals**: Removed various dependencies including `appnope`, `argon2-cffi`, `arrow`, `bleach`, and many `jupyter`-related packages.
- **Updates**: Some packages were updated to newer versions, such as `numpy` and `pandas`.

#### `white paper.txt`:
- **Removal**: Entire content related to feature implementation and validation metrics was removed.

#### `main.py`:
- **New Functions**:
  - `refine_extracted_elements_with_context`: Refines extracted elements based on query context.
  - `extract_from_pdf`: Extracts text from a PDF document.
  - `create_chunks`, `create_embeddings`, `store_in_chromaDB`: Functions for chunking text, creating embeddings, and storing in a vector database.
  - `summarize`: Summarizes findings from whitepaper and code