## Check Docs Validation 

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from langchain.prompts import ChatPromptTemplate
from IPython.display import Markdown, display
from dotenv import load_dotenv
load_dotenv()
import os
import nbformat
import streamlit as st
import tempfile

In [None]:
'''
1. Load White Paper  (PDF)  
2. Vectors Embeddings - Text and tables; later include images
3. Chroma db
4. Retrieval - (Accuracy)
5. Generation - 
    Validation-
        Data sources - List all sources and metadata
        Features - detect any change in features
        Changes in Transformation steps
        Model Details 
        Hyperparameter
        List of Validation Metrics and respective scores
        Compare Validation scores of white paper with model's validation scores
        Brief comparison of scores

        List and Track of critical metrics - these should not be lower than mentioned (in white paper)

6. Respective scores for tracked metrics (confidence on generation)
7. If required, update the prompt and go back to step 4, then repeat steps 4 and 5 (reason: so that the generation has
   higher confidence)
8. Output should be in a structured format (this will be the input for the summary block, along with 2 additional inputs)
'''

In [None]:
# Credential is read from the environment (load_dotenv() was called with the imports).
openai_api_key = os.getenv("OPENAI_API_KEY")

# Shared chat model used by the extraction / comparison helpers in this notebook.
llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0,
    openai_api_key=openai_api_key,
)




In [None]:


def read_notebook(file_path):
    """Render a .ipynb notebook as a single text blob of labelled cells.

    Markdown cells are prefixed with a "Markdown Cell" heading and code cells
    are wrapped in a fenced python block; cells of any other type are skipped.
    The rendered cells are joined with blank lines.
    """
    notebook = nbformat.read(file_path, as_version=4)
    rendered = []
    for cell in notebook.cells:
        kind = cell.cell_type
        if kind == 'markdown':
            rendered.append("## Markdown Cell:\n" + cell.source)
            continue
        if kind == 'code':
            rendered.append("## Code Cell:\n```python\n" + cell.source + "\n```")
    return "\n\n".join(rendered)


def read_file(file_path):
    """Return the entire content of ``file_path`` decoded as UTF-8 text."""
    with open(file_path, mode="r", encoding="utf-8") as source:
        text = source.read()
    return text


def extract_functionalities_from_code(notebook_content):
    """Ask the shared LLM to summarise the modelling details found in code.

    Args:
        notebook_content: Text of the notebook or script to analyse; it is
            embedded verbatim in the prompt.

    Returns:
        The model's reply text with surrounding whitespace stripped.
    """
    prompt = f"""
    You are an expert Python code reviewer. Here is a Jupyter notebook:
    {notebook_content}

    The following is a Jupyter notebook content (code and markdown). 
    Please extract the following:
    Analyze the notebook and answer:

    1. List of features used in the model.
    2. Name/type of ML model used, only name of model
    3. Accuracy metrics (e.g., accuracy, F1, precision, recall, AUC, etc.), only metrics name. 
    4. What is the purpose of this notebook?
    5. What are the main operations and their results?
    6. Are there any errors or anomalies in outputs?
    7. What conclusions can be drawn from the outputs?

    """

    messages = [
        SystemMessage(content="You are a helpful assistant."),
        HumanMessage(content=prompt),
    ]
    reply = llm.invoke(messages)
    return reply.content.strip()


def extract_functionalities_from_whitepaper(whitepaper_text):
    """Ask the shared LLM to list the functionalities a whitepaper mentions.

    Args:
        whitepaper_text: Full text of the whitepaper / requirement document;
            it is embedded verbatim in the prompt.

    Returns:
        The model's reply text with surrounding whitespace stripped.
    """
    prompt = [
        SystemMessage(content="You are a product analyst."),
        HumanMessage(content=f"""
        Here is the whitepaper or product requirement document:

        {whitepaper_text}

        List all functionalities or features the whitepaper mentions. Use bullet points.
        """)
            ]
    # Use .invoke() like extract_functionalities_from_code does, rather than
    # calling the model object directly.
    return llm.invoke(prompt).content.strip()


def compare_functionalities(whitepaper_funcs, code_funcs):
    """Ask the shared LLM to compare whitepaper functionalities against code.

    Args:
        whitepaper_funcs: Text describing what the whitepaper documents.
        code_funcs: Text describing what was extracted from the code.

    Returns:
        The model's comparison report with surrounding whitespace stripped.
    """
    prompt = [
        SystemMessage(content="You are a software QA expert."),
        HumanMessage(content=f"""
        Whitepaper Functionalities:
        {whitepaper_funcs}

        Code Functionalities:
        {code_funcs}
        Extract validation metrics from code funcs eg, precision, recall and other validation are in output cell.
        Compare the two lists and identify which functionalities from the whitepape, if functionality is implemented in code but not available in white paper, print: white paper is not updated please update the document. and show details of each section 
        listmissing sections like feature and if model varies according to white paper and same for validation metrics.
        compare validation scores : Compare scores of code function with white paper.
        Also compare critical validation metrics: make sure critical metrics of code should be grater then white paper critical metrics
        if thereis no changhe in metrics of docuemt and code_funcs: keep output 'white paper is updated please proceed to next steps. no other information is required'  
        
        """)
            ]
    # Use .invoke() like extract_functionalities_from_code does, rather than
    # calling the model object directly. The prompt text is left untouched so
    # the model's behaviour does not shift.
    return llm.invoke(prompt).content.strip()


def read_notebook_with_outputs(file_path):
    """Render a .ipynb notebook as text, including the outputs of code cells.

    Markdown cells and code cells are labelled as in ``read_notebook``; for a
    code cell, its stream, execute_result and error outputs are appended in an
    "Output" section. Other output types and other cell types are ignored.
    """
    def _describe(output):
        # Text for one output, or None when the output type is not handled.
        kind = output.output_type
        if kind == "stream":
            return f"Output (stream):\n{output.text}"
        if kind == "execute_result":
            plain = output.get("data", {}).get("text/plain", "")
            return f"Output (execute_result):\n{plain}"
        if kind == "error":
            return "Error:\n" + "\n".join(output.get("traceback", []))
        return None

    notebook = nbformat.read(file_path, as_version=4)
    rendered = []

    for cell in notebook.cells:
        if cell.cell_type == 'markdown':
            rendered.append(f"## Markdown Cell:\n{cell.source}")
        elif cell.cell_type == 'code':
            block = f"## Code Cell:\n```python\n{cell.source}\n```"
            described = [_describe(item) for item in cell.get("outputs", [])]
            joined = "\n".join(piece for piece in described if piece is not None)
            if joined:
                block += f"\n\n### Output:\n```\n{joined}\n```"
            rendered.append(block)

    return "\n\n".join(rendered)

def read_notebook(path):
    """Open ``path`` as UTF-8 and return the notebook parsed by nbformat (v4).

    NOTE(review): this reuses the name of the earlier ``read_notebook`` in this
    file, which returns rendered text instead; whichever is defined last wins.
    """
    with open(path, "r", encoding="utf-8") as handle:
        notebook = nbformat.read(handle, as_version=4)
    return notebook



In [None]:


def main():
    """Streamlit page: compare an uploaded whitepaper against uploaded code.

    The user uploads a whitepaper (txt/md/pdf) and a code file (py/txt/ipynb)
    and presses the button; both are summarised by the LLM helpers and the
    comparison report is rendered as markdown.
    """
    st.set_page_config(page_title="Functionality Coverage Checker", layout="wide")

    st.title("🧠 AI Feature Mapping Validator")
    st.subheader("Compare functionalities between a Whitepaper and its Codebase")

    uploaded_whitepaper = st.file_uploader("📄 Upload Whitepaper File", type=["txt", "md", "pdf"])
    uploaded_code = st.file_uploader("💻 Upload Code File", type=["py", "txt", "ipynb"])

    # Nothing to do until both files are present and the button is pressed.
    if not (uploaded_whitepaper and uploaded_code):
        return
    if not st.button("Click to Process Files"):
        return

    # Read whitepaper content. PDFs are binary, so they cannot be UTF-8
    # decoded; extract their text page by page instead.
    if uploaded_whitepaper.name.lower().endswith(".pdf"):
        import fitz  # PyMuPDF; imported here because only this branch needs it

        pdf = fitz.open(stream=uploaded_whitepaper.read(), filetype="pdf")
        whitepaper = "".join(
            f"\n\n--- Page {page_num} ---\n{page.get_text()}"
            for page_num, page in enumerate(pdf, start=1)
        )
    else:
        whitepaper = uploaded_whitepaper.read().decode("utf-8")

    # Handle .ipynb or other code files
    if uploaded_code.name.endswith(".ipynb"):
        # The notebook reader takes a path, so spill the upload to a temp file.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".ipynb", mode='wb') as tmp_file:
            tmp_file.write(uploaded_code.read())
            temp_file_path = tmp_file.name
        try:
            notebook_contents = read_notebook_with_outputs(temp_file_path)
        finally:
            # delete=False leaves the file behind, so remove it explicitly.
            os.unlink(temp_file_path)
        code_funcs = extract_functionalities_from_code(notebook_contents)
    else:
        code = uploaded_code.read().decode("utf-8")
        code_funcs = extract_functionalities_from_code(code)

    whitepaper_funcs = extract_functionalities_from_whitepaper(whitepaper)

    st.markdown("### ⚖️ Comparing Functionalities")
    missing_funcs = compare_functionalities(whitepaper_funcs, code_funcs)
    st.markdown(missing_funcs)

if __name__ == "__main__":
    main()

    



In [None]:
# import os
# import nbformat
# from openai import OpenAI
# from langchain.chat_models import ChatOpenAI
# from langchain.schema import SystemMessage, HumanMessage
# from langchain.chat_models import ChatOpenAI
# from langchain.schema import SystemMessage, HumanMessage
# from langchain_openai import OpenAIEmbeddings
# from dotenv import load_dotenv
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# import chromadb
# from chromadb.config import Settings

# import tempfile

In [None]:
# queries = [
#             "Summary/Objective of white paper ",
#             "Features mentioned",
#             "Preprocessing steps and data transformation steps",
#             "Model selected for classification",
#             "Training and resting methodology",
#             "List of Hyper parameters and respective values",
#             "What are list of validation scores and the performance scores?",
#             "Ethical considerations" ]

In [None]:
def queryFun(query, embedding_model,collection):
    """Return the documents of the 5 nearest matches for ``query``.

    The query is embedded with ``embedding_model.embed_query`` and looked up
    with ``collection.query``; the documents of the first (only) query are
    returned as a new list.
    """
    vector = embedding_model.embed_query(query)
    hits = collection.query(query_embeddings=[vector], n_results=5)
    return list(hits["documents"][0])

##  Check Embeddings 

In [None]:
import chromadb
from chromadb.config import Settings
from chromadb import PersistentClient

In [None]:
path = "./chroma_openai1"

# Step 1: Load the persistent client
chroma_client = PersistentClient(path=path)




In [None]:
collections = chroma_client.list_collections()
for col in collections:
    print(col.name)
    # print(col.metadata)


In [None]:
collection_name = "notebook_f887cf79f48bf8b101631e9ebdb3ca7220bd2c6e47a6b82041ca192aa98cf16b"
# Step 2: Access the existing collection
collection = chroma_client.get_collection(name=collection_name)
data = collection.get()

# Optional: View details
print("IDs:", data['ids'])
print("Documents:", data['documents'][:2])  # print only first 2 docs
print("Metadata:", data.get('metadatas'))  # only if metadata was stored


In [None]:
data.keys()

In [None]:
# Optional: View details
# collection.get() leaves out embeddings unless they are requested through
# ``include``, so re-fetch with them before printing; otherwise the last line
# would print None.
data = collection.get(include=["documents", "metadatas", "embeddings"])
print("IDs:", data['ids'])
print("Documents:", data['documents'][:2])  # print only first 2 docs
print("Metadata:", data.get('metadatas'))  # only if metadata was stored
print("Embeddings:", data['embeddings'])

In [None]:
# Use the client created earlier in this notebook (``chroma_client``); no
# ``client`` name exists yet at this point in the file.
collection = chroma_client.get_collection("whitepaper_02958f41437b5bbcf9490a38b0edb5d41a365101ce7979d2822648a320dfdc73")

# Query the collection just fetched rather than whatever happens to sit at
# index 6 of the list_collections() result.
results = collection.get(
    ids=["id1", "id2"],    # optional
    where={"type": "pdf"}, # optional
    include=["documents", "metadatas"]
)

In [None]:
results

In [None]:
 # Embedding stored or not 
import os
import nbformat
from openai import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chromadb
from chromadb.config import Settings

import streamlit as st
import fitz
import tempfile
import hashlib
from concurrent.futures import ThreadPoolExecutor, as_completed


load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
llm = ChatOpenAI(model="gpt-4o", temperature=0, openai_api_key=openai_api_key)
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")


In [None]:
def get_file_hash(uploaded_file):
    """Return the SHA-256 hex digest of a binary file-like object's full content.

    The stream is rewound before reading and rewound again afterwards, so the
    caller gets it back positioned at the start.
    """
    uploaded_file.seek(0)
    digest = hashlib.sha256()
    digest.update(uploaded_file.read())
    uploaded_file.seek(0)
    return digest.hexdigest()

In [None]:
def extract_from_pdf(uploaded_file):
    """Extract the text of every page of a PDF supplied as a binary stream.

    Args:
        uploaded_file: Seekable binary file-like object holding the PDF.

    Returns:
        One string in which each page's text is preceded by a
        "--- Page N ---" marker (N starting at 1).
    """
    # Rewind first, as the later redefinition of this function does: a stream
    # that has already been read would otherwise hand no bytes to the parser.
    uploaded_file.seek(0)
    doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
    return "".join(
        f"\n\n--- Page {page_num + 1} ---\n{page.get_text()}"
        for page_num, page in enumerate(doc)
    )

In [None]:
def collection_exists(collection_name):
    """Report whether ``chroma_client`` can fetch a collection with this name.

    Any exception raised by the lookup is treated as "does not exist".
    """
    try:
        chroma_client.get_collection(collection_name)
    except Exception:
        return False
    else:
        return True

In [None]:
def create_chunks(text):
    """Split ``text`` with a recursive character splitter (size 750, overlap 100)."""
    splitter_options = {
        "chunk_size": 750,      # faster, smaller chunk
        "chunk_overlap": 100,   # reduced overlap
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = splitter.split_text(text)
    return pieces

In [None]:


def get_or_create_embeddings(uploaded_file, text, _embedding_model, collection_name):
    """Return the Chroma collection for ``text``, creating it when missing.

    Args:
        uploaded_file: Not used by this function; kept so existing callers
            keep working.
        text: Document text; it is always chunked, because the chunks are
            returned either way.
        _embedding_model: Object exposing ``embed_documents(list_of_str)``.
        collection_name: Name of the collection to reuse or create.

    Returns:
        ``(collection, chunks)``.
    """
    chunks = create_chunks(text)

    if collection_exists(collection_name):
        collection = chroma_client.get_collection(collection_name)
    else:
        embeddings = _embedding_model.embed_documents(chunks)
        # Report only the size: printing the raw vectors floods the output.
        print(f"Created {len(embeddings)} embeddings for {len(chunks)} chunks")
        # NOTE(review): store_in_chromaDB is not defined in the visible part of
        # this file - confirm it is in scope before running this branch.
        collection = store_in_chromaDB(chunks, embeddings, collection_name)

    return collection, chunks



In [None]:
import hashlib
import fitz

def get_file_hash(file_obj):
    """Compute the SHA-256 hex digest of everything in ``file_obj``.

    Reads from the beginning of the stream and leaves it rewound to the
    beginning when done.
    """
    file_obj.seek(0)
    payload = file_obj.read()
    file_obj.seek(0)
    return hashlib.sha256(payload).hexdigest()

def extract_from_pdf(file_obj):
    """Return the text of all pages of the PDF held in ``file_obj``.

    The stream is rewound before reading. Each page's text is preceded by a
    "--- Page N ---" marker, with N starting at 1.
    """
    file_obj.seek(0)
    pdf = fitz.open(stream=file_obj.read(), filetype="pdf")
    sections = []
    for number, page in enumerate(pdf, start=1):
        page_text = page.get_text()
        sections.append(f"\n\n--- Page {number} ---\n{page_text}")
    return "".join(sections)

# Local PDF to index. NOTE: ``file_path`` is rebound by later cells in this notebook.
file_path = "Load Prediction Whitepaper.pdf"

with open(file_path, "rb") as f:
    # 1. Hash the file content; the digest becomes part of the collection name,
    #    so identical content maps to the same collection.
    file_hash = get_file_hash(f)
    # 2. Extract the text of every page.
    text = extract_from_pdf(f)
    # 3. Rewind so the file object handed over below starts at the beginning.
    f.seek(0)
    # 4. Reuse the collection for this content hash, or embed and store it.
    collection, chunks = get_or_create_embeddings(
        uploaded_file=f,              # file object (passed through to the helper)
        text=text,                    # extracted text that gets chunked
        _embedding_model=embedding_model,
        collection_name=f"whitepaper_{file_hash}"
    )


In [None]:
import requests

In [None]:
def create_summary_of_events(owner="arunkenwal02", repo_name="code-validator",
                             push_ids=('52949273211', '52821120274')):
    """Summarise the push events between two push IDs, with per-commit diffs.

    Args:
        owner: GitHub account that owns the repository.
        repo_name: Repository name.
        push_ids: Two push-event IDs (as strings) delimiting the range; both
            ends are included and their order does not matter.

    Returns:
        A list with one dict per push event in the range, each holding
        ``push_id``, ``repo``, ``created_at`` and ``commits`` (a list of dicts
        with ``sha``, ``author``, ``commit_message`` and ``code_diff``).
        When the events cannot be fetched or an ID is not among the returned
        events, a dict ``{"error": ...}`` is returned instead.
    """
    repo_api = f"https://api.github.com/repos/{owner}/{repo_name}"

    response = requests.get(f"{repo_api}/events")
    if response.status_code != 200:
        # An error reply is not the list of events the code below expects.
        return {"error": f"Failed to fetch events: {response.status_code} {response.text}"}
    events = response.json()
    print([event['id'] for event in events])

    push_events = [e for e in events if e['type'] == 'PushEvent']
    ids = [e['id'] for e in push_events]
    try:
        idx1 = ids.index(push_ids[0])
        idx2 = ids.index(push_ids[1])
    except ValueError:
        return {"error": "One or both push IDs not found in recent events."}

    start, end = sorted((idx1, idx2))

    grouped_push_events = []
    for event in push_events[start:end + 1]:
        commits_list = []

        # Tolerate payloads that carry no commit list.
        for commit in event["payload"].get("commits", []):
            sha = commit['sha']

            commit_detail_response = requests.get(f"{repo_api}/commits/{sha}")
            if commit_detail_response.status_code != 200:
                diff = "❌ Failed to fetch diff"
            else:
                commit_detail = commit_detail_response.json()
                # Only files that carry a patch contribute to the diff text.
                diffs = [
                    f"File: {changed['filename']}\n{changed['patch']}"
                    for changed in commit_detail.get('files', [])
                    if changed.get('patch')
                ]
                diff = "\n\n".join(diffs) if diffs else "No diff available"

            commits_list.append({
                "sha": sha,
                "author": commit['author']['name'],
                "commit_message": commit['message'],
                "code_diff": diff
            })

        grouped_push_events.append({
            "push_id": event['id'],
            "repo": event['repo']['name'],
            "created_at": event['created_at'],
            "commits": commits_list
        })
    return grouped_push_events

In [None]:
test = create_summary_of_events()
# Fields of interest per entry: push id, commit summary
test

In [None]:
test[5]

In [None]:
len(test)

## One drive file access

In [None]:
import msal
import requests
import time
import fitz
import os 
from dotenv import load_dotenv
from datetime import datetime



load_dotenv() 
Permission_ID ="6a94cb3a-9869-4b54-ae0b-f4f523df2614"
client_id = Permission_ID
authority = "https://login.microsoftonline.com/consumers"
scopes = ["Files.Read"]
source_folder = "Documents/GitHub/code-validator/"
file_name = "Load Prediction Whitepaper.pdf"
version_id = int(7)
file_path = source_folder+file_name



In [None]:
# Extract updated version 

def access_token_key(client_id, authority):
    """Acquire a Microsoft Graph access token for the ``Files.Read`` scope.

    A cached account is tried silently first; otherwise the interactive
    sign-in flow is started.

    Args:
        client_id: Application (client) ID passed to MSAL.
        authority: Authority URL passed to MSAL.

    Returns:
        The access token string.

    Raises:
        RuntimeError: If MSAL returns no result or a result without an
            ``access_token`` entry.
    """
    scopes = ["Files.Read"]
    app = msal.PublicClientApplication(client_id=client_id, authority=authority)
    result = None

    accounts = app.get_accounts()
    if accounts:
        result = app.acquire_token_silent(scopes, account=accounts[0])
    if not result:
        result = app.acquire_token_interactive(scopes=scopes)
    if not result or "access_token" not in result:
        print("MSAL Error:", result)
        # Stop here with a clear message instead of failing on the lookup below.
        raise RuntimeError(f"Could not acquire an access token: {result}")

    return result["access_token"]

def get_raw_data(client_id, authority ,file_path ):
    """Download a OneDrive file's bytes through Microsoft Graph.

    Returns the response body on HTTP 200; otherwise prints the status and
    response text and returns None.
    """
    token = access_token_key(client_id=client_id, authority=authority)
    content_url = f"https://graph.microsoft.com/v1.0/me/drive/root:/{file_path}:/content"
    auth_headers = {"Authorization": f"Bearer {token}"}
    time.sleep(2)
    response = requests.get(content_url, headers=auth_headers)
    print(f"Response code: {response.status_code}")
    if response.status_code != 200:
        print("Failed:", response.status_code, response.text)
        return None
    payload = response.content
    print("File read into memory!")
    return payload


def get_onedrive_whitepaper(file_bytes):
    """Extract the text of a PDF held in memory.

    Args:
        file_bytes: Raw PDF bytes, e.g. the value returned by ``get_raw_data``.

    Returns:
        One string in which each page's text is preceded by a
        "--- Page N ---" marker (N starting at 1).

    Raises:
        ValueError: If ``file_bytes`` is None or empty.
    """
    # get_raw_data returns None when the download fails; fail loudly here
    # instead of handing an empty payload to the PDF parser.
    if not file_bytes:
        raise ValueError("No PDF content to parse: file_bytes is empty or None.")

    doc = fitz.open(stream=file_bytes, filetype="pdf")
    text = "".join(
        f"\n\n--- Page {page_num + 1} ---\n{page.get_text()}"
        for page_num, page in enumerate(doc)
    )

    # Print only the preview the label promises, not the whole document.
    print("First 1000 chars of PDF text:", text[:1000])

    return text
 
   
def prev_version( client_id, authority, file_path, version_id):
    """Download one stored version of a OneDrive PDF and return its text.

    Args:
        client_id: Application (client) ID used to obtain the token.
        authority: Authority URL used to obtain the token.
        file_path: Path of the file relative to the drive root.
        version_id: 1-based position of the wanted version in the list
            returned by the versions endpoint.

    Returns:
        The extracted text, with a "--- Page N ---" marker before each page,
        or None when the version list or the content cannot be fetched or the
        requested position does not exist (the reason is printed).
    """
    access_token= access_token_key(client_id=client_id, authority=authority)

    versions_url = f"https://graph.microsoft.com/v1.0/me/drive/root:/{file_path}:/versions"
    headers = {"Authorization": f"Bearer {access_token}"}
    response = requests.get(versions_url, headers=headers)
    if response.status_code != 200:
        print("Failed to fetch versions:", response.status_code, response.text)
        return None

    versions = response.json()["value"]
    position = int(version_id)
    if not 1 <= position <= len(versions):
        print(f"Version {position} not available; only {len(versions)} versions exist.")
        return None

    # Pick the requested entry (previously index 1 was used no matter which
    # version had been asked for).
    selected = versions[position - 1]
    selected_id = selected['id']
    print(f"Version {position} ID: {selected_id}, Last Modified: {selected['lastModifiedDateTime']}")

    # Fetch that version's PDF bytes
    download_url = f"https://graph.microsoft.com/v1.0/me/drive/root:/{file_path}:/versions/{selected_id}/content"
    version_response = requests.get(download_url, headers=headers)
    if version_response.status_code != 200:
        print(f"Failed to download version {position}:", version_response.status_code, version_response.text)
        return None

    # Extract text from the PDF (in memory, no save)
    doc = fitz.open(stream=version_response.content, filetype="pdf")
    all_text = "".join(
        f"\n--- Page {page_num+1} ---\n{page.get_text()}"
        for page_num, page in enumerate(doc)
    )

    print("Extracted PDF text (first 1000 chars):")
    print(all_text[:1000])
    return all_text


In [None]:
def prev_version(client_id, authority, file_path, version_number):
    """
    Fetch and parse the N-th version of a OneDrive file via Microsoft Graph.

    Versions are sorted by ``lastModifiedDateTime`` (newest first), so
    version_number is 1-based: 1 = latest, 2 = previous, etc.

    Returns the extracted PDF text, with a "--- Page N ---" marker before
    each page.

    Raises:
        RuntimeError: If the version list or the version content cannot be
            fetched, or the file has no versions.
        ValueError: If version_number is outside 1..number of versions.
    """
    from datetime import timezone  # only needed for the sort key below

    access_token = access_token_key(client_id=client_id, authority=authority)
    headers = {"Authorization": f"Bearer {access_token}"}

    # 1) List versions
    versions_url = f"https://graph.microsoft.com/v1.0/me/drive/root:/{file_path}:/versions"
    response = requests.get(versions_url, headers=headers)
    if response.status_code != 200:
        raise RuntimeError(f"Failed to fetch versions: {response.status_code} {response.text}")

    versions = response.json().get("value", [])
    if not versions:
        raise RuntimeError("No versions found for this file.")

    # Sort DESC by lastModifiedDateTime so index 0 is latest, 1 is previous, etc.
    def _parse_dt(v):
        ts = v.get("lastModifiedDateTime")
        if not ts:
            # Aware minimum, so entries without a timestamp sort last and can
            # still be compared with the aware timestamps parsed below.
            return datetime.min.replace(tzinfo=timezone.utc)
        parsed = datetime.fromisoformat(ts.replace("Z", "+00:00"))
        if parsed.tzinfo is None:
            # Naive and aware datetimes cannot be ordered against each other.
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed
    versions.sort(key=_parse_dt, reverse=True)

    # 2) Show total + quick overview
    total = len(versions)
    print(f"Total versions available: {total}")
    for i, v in enumerate(versions, start=1):
        print(f"{i}. id={v.get('id')} | lastModified={v.get('lastModifiedDateTime')} | size={v.get('size', 'NA')}")

    # 3) Validate requested version and pick it
    if not (1 <= int(version_number) <= total):
        raise ValueError(f"Invalid version_number {version_number}. Only {total} versions exist.")

    selected = versions[int(version_number) - 1]   # 1-based → 0-based
    internal_id = selected["id"]
    print(f"\nSelected version #{version_number}: id={internal_id}, lastModified={selected.get('lastModifiedDateTime')}")

    # 4) Download that specific version’s content
    download_url = f"https://graph.microsoft.com/v1.0/me/drive/root:/{file_path}:/versions/{internal_id}/content"
    version_response = requests.get(download_url, headers=headers)
    if version_response.status_code != 200:
        raise RuntimeError(f"Failed to download version #{version_number}: {version_response.status_code} {version_response.text}")

    pdf_bytes = version_response.content

    # 5) Extract text from the PDF (in memory, no save)
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    all_text = "".join(
        f"\n--- Page {page_num+1} ---\n{page.get_text()}"
        for page_num, page in enumerate(doc)
    )

    # Report the size only; the old header announced a preview that was never printed.
    print(f"\nExtracted {len(all_text)} characters of PDF text.")
    return all_text



In [None]:
version_number = 1
white_paper_text = prev_version(client_id, authority, file_path, version_number)


In [None]:
white_paper_text

In [None]:
file_bytes  = get_raw_data(client_id=client_id, authority=authority,file_path = file_path)
pdf_content = get_onedrive_whitepaper(file_bytes)  


In [None]:

version_id = 1
# Pass the version positionally: the two prev_version definitions in this
# notebook name their fourth parameter differently (version_id vs
# version_number), so a keyword call fails against whichever was defined last.
prev_version(client_id, authority, file_path, version_id)

## Get updated file from github

In [None]:
import json 
import pandas as pd 
import requests
import requests
import base64


In [None]:
push_event= pd.read_json('push_events.json', )
latest_push_id = push_event[0].tolist()[0]
latest_push_id

# --- Usage ---
owner = "arunkenwal02"
repo = "code-validator"
push_id = latest_push_id
file_path = "loan-approval-prediction_v2.ipynb"

In [None]:
def get_sha_pair_from_push_id(owner, repo, push_id):
    """
    Returns (before_sha, head_sha) for the given push_id.

    The repository's events are fetched from the GitHub API and scanned for a
    PushEvent whose id equals ``str(push_id)``.

    If the events cannot be fetched or the push is not among them,
    returns (None, None).
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/events"
    resp = requests.get(url)
    if resp.status_code != 200:
        # An error reply is not the list of events the loop below expects.
        print(f"Failed to fetch events: {resp.status_code} {resp.text}")
        return None, None

    for event in resp.json():
        if event["type"] == "PushEvent" and event["id"] == str(push_id):
            before_sha = event["payload"]["before"]
            head_sha = event["payload"]["head"]
            print(f"Push ID: {push_id}\nbefore: {before_sha}\nhead: {head_sha}")
            return before_sha, head_sha
    print(f"Push ID {push_id} not found in recent events.")
    return None, None

def _download_notebook_at_ref(owner, repo, file_path, ref):
    """Fetch ``file_path`` at git ref ``ref`` via the contents API and parse it as JSON."""
    content_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{file_path}"
    file_data = requests.get(content_url, params={"ref": ref}).json()

    # The file body arrives base64-encoded under the 'content' key.
    if "content" not in file_data:
        raise Exception("Notebook not found or could not fetch content. Details: " + str(file_data))
    nb_json = base64.b64decode(file_data["content"]).decode("utf-8")
    return json.loads(nb_json)


def fetch_latest_file_for_sha(owner, repo, file_path, sha_pairs):
    """
    Return the notebook at ``file_path`` as a dict, for the first SHA pair.

    The pair (sha_old, sha_new) is compared through the GitHub API. If
    file_path was changed between them, the file is downloaded at sha_new;
    otherwise it is downloaded at the most recent commit that touched it.

    Both outcomes return (or raise), so only the first pair of ``sha_pairs``
    is ever processed; an empty ``sha_pairs`` yields None.

    Raises:
        Exception: If the file content cannot be fetched, or no commit
            touching file_path is found.
    """
    for i, (sha_old, sha_new) in enumerate(sha_pairs):
        print(f"\nProcessing pair {i+1}: {sha_old} → {sha_new}")

        # 1. Compare the two SHAs
        compare_url = f"https://api.github.com/repos/{owner}/{repo}/compare/{sha_old}...{sha_new}"
        compare_data = requests.get(compare_url).json()

        file_changed = any(
            f["filename"] == file_path for f in compare_data.get("files", [])
        )

        if file_changed:
            print(f"File {file_path} was changed in this push.")
            # Download updated file from sha_new
            return _download_notebook_at_ref(owner, repo, file_path, sha_new)

        print(f"File {file_path} was NOT changed between {sha_old} and {sha_new}.")
        # Get most recent commit where this file was updated
        commits_url = f"https://api.github.com/repos/{owner}/{repo}/commits"
        commits = requests.get(commits_url, params={"path": file_path, "per_page": 1}).json()
        if not isinstance(commits, list) or not commits:
            # Covers both an error reply and a path with no commit history.
            raise Exception(f"No commit touching {file_path} could be found. Details: " + str(commits))
        last_update_sha = commits[0]["sha"]
        print("Most recent commit where file was changed:", last_update_sha)
        # Download file at that SHA
        return _download_notebook_at_ref(owner, repo, file_path, last_update_sha)



In [None]:
sha_pair = get_sha_pair_from_push_id(owner, repo, push_id)
# get_sha_pair_from_push_id returns (None, None) when the push is not found;
# stop here rather than asking GitHub to compare "None...None".
if None in sha_pair:
    raise ValueError(f"Push ID {push_id} not found; cannot locate the notebook revision.")

sha_pair = [sha_pair]
# --- Usage example ---
notebook_contents = fetch_latest_file_for_sha(owner, repo, file_path, sha_pair)


In [None]:

# Print every code cell that has outputs, followed by those outputs.
for i, cell in enumerate(notebook_contents['cells']):
    if cell['cell_type'] == 'code' and cell.get('outputs'):
        print(f"\nCell #{i+1}:")
        print("Code:")
        print("".join(cell['source']))
        print("\nOutputs:")
        for output in cell['outputs']:
            output_type = output.get('output_type')
            # Text output: print it once (a separate stream branch used to
            # print the same text a second time).
            if 'text' in output:
                print("".join(output['text']))
            # Print execution result (display_data or execute_result)
            if output_type in ['execute_result', 'display_data']:
                data = output.get('data', {})
                # Print text/plain or html if present; join() handles values
                # stored as a list of lines as well as plain strings.
                for mime in ('text/plain', 'text/html'):
                    if mime in data:
                        print("".join(data[mime]))
            # Print errors if any
            if output_type == 'error':
                print(f"Error: {output.get('ename')} - {output.get('evalue')}")
                print("Traceback:")
                print("\n".join(output.get('traceback', [])))
        print("-" * 40)


In [None]:
# Accumulates the code and text outputs of every code cell that has outputs.
all_cells_text = ""

for i, cell in enumerate(notebook_contents['cells']):
    # Only code cells with at least one output are recorded.
    if cell['cell_type'] == 'code' and cell.get('outputs'):
        # Cell header (1-based cell number) followed by the cell's source
        all_cells_text += f"\nCell #{i+1}\n"
        all_cells_text += "Code:\n"
        all_cells_text += "".join(cell['source']).strip() + "\n"
        all_cells_text += "Output(s):\n"
        # Append the text of each supported output
        for output in cell['outputs']:
            output_text = ""
            if output.get('output_type') == 'stream':
                text = output.get('text', '')
                # The text may be stored as a list of lines; normalise to one string.
                if isinstance(text, list):
                    text = "".join(text)
                output_text += text.strip()
            elif output.get('output_type') in ['execute_result', 'display_data']:
                data = output.get('data', {})
                # Only the text/plain representation is kept.
                text = data.get('text/plain', '')
                if isinstance(text, list):
                    text = "".join(text)
                output_text += text.strip()
            # Error outputs are deliberately not recorded.
            if output_text:
                all_cells_text += output_text + "\n"
        # Separator between cells
        all_cells_text += "-" * 30 + "\n"

# Remove leading/trailing whitespace from the assembled text
all_cells_text = all_cells_text.strip()

# Show the assembled text
print(all_cells_text)


# Chroma DB - Prompt Output for white paper

In [None]:
import os, chromadb
from IPython.display import Markdown, display


DATA_DIR = os.path.abspath("./chroma_openai1")  # <-- match the folder that actually has your DB
client = chromadb.PersistentClient(path=DATA_DIR)

In [None]:
for db in client.list_collections():
    print("Name:",db.name)

In [None]:
# Delete the collection
client.delete_collection(name="whitepaper_02958f41437b5bbcf9490a38b0edb5d41a365101ce7979d2822648a320dfdc73")

In [None]:
# Delete the collection
client.delete_collection(name="whitepaper_80155473120ea4dcf824fec347b00809b601f5a71c2ed64892a6c178903ec71b")

In [None]:
# Delete the collection
client.delete_collection(name="whitepaper_803eb93c087768f8959427cf4ede1d1af37a2717b8a7d2b7952e58ea79b8a4ed")

In [None]:
client.delete_collection(name="notebook_0a2bc68bfeed8d271ecb43f5600b98dbb99b06c04ea769c3a6bc655a0363154a")

In [None]:
client.delete_collection(name="notebook_74b8a7ff23f322cce20437b9a56a9c32681a0bab37a51ce5999bad4bb0cf0431")

In [None]:
client.delete_collection(name="notebook_f887cf79f48bf8b101631e9ebdb3ca7220bd2c6e47a6b82041ca192aa98cf16b")

In [None]:
client.delete_collection(name="notebook_927f65957e11b760abafa86522bf21a1bec1308a8bc4e7df619022846279eefc")

In [None]:
for db in client.list_collections():
    print("Name:",db.name)

In [None]:
import shutil, os

DATA_DIR = os.path.abspath("./chroma_openai1")  # same path you're using
shutil.rmtree(DATA_DIR)  # Permanently deletes everything
os.makedirs(DATA_DIR, exist_ok=True)  # Recreate the directory if needed

print("ChromaDB data directory reset.")


In [None]:
import sqlite3
import os

# Path to Chroma's persistent storage
persist_dir = "./chroma_openai1"  # Change this if yours is different
db_path = os.path.join(persist_dir, "chroma.sqlite3")

if not os.path.exists(db_path):
    raise FileNotFoundError(f"Chroma SQLite DB not found at {db_path}")

# Connect to SQLite DB
conn = sqlite3.connect(db_path)
cursor = conn.cursor()


In [None]:
import os
import chromadb

# Path to your actual persistent Chroma storage
DATA_DIR = os.path.abspath("./chroma_openai1")  # adjust if needed

# Use PersistentClient to read from persistent DB
client = chromadb.PersistentClient(path=DATA_DIR)

# Get the specific collection
collection_name = "whitepaper_02958f41437b5bbcf9490a38b0edb5d41a365101ce7979d2822648a320dfdc73"
collection = client.get_collection(name=collection_name)

# Retrieve all records (you can also filter with where / where_document)
results = collection.get(
    include=["documents", "metadatas", "embeddings"],  # choose what you need
    limit=5  # remove or increase for more
)


In [None]:
# Print sample
for doc, meta,embeddings in zip(results["documents"], results["metadatas"], results['embeddings']):
    print("Document:", doc)
    print("Metadata:", meta)
    print("Embeddings",embeddings)
    print("-" * 40)

In [None]:
import os
import nbformat
from openai import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chromadb
from chromadb.config import Settings
import streamlit as st
import fitz
import tempfile
import hashlib
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

In [None]:
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
llm = ChatOpenAI(model="gpt-4o", temperature=0, openai_api_key=openai_api_key)


In [None]:
def queryFun(query, embedding_model, collection, n_results=5):
    """Return the documents most similar to *query* from a vector collection.

    Args:
        query: Text to search for.
        embedding_model: Object exposing ``embed_query(text)`` that returns the
            embedding vector for the query.
        collection: Object exposing ``query(query_embeddings=..., n_results=...)``
            and returning a mapping whose ``"documents"`` entry holds one list
            of documents per query embedding.
        n_results: Maximum number of documents to retrieve (default 5, the
            value that used to be hard-coded).

    Returns:
        list: The documents retrieved for the single query embedding.
    """
    query_embedding = embedding_model.embed_query(query)
    results = collection.query(query_embeddings=[query_embedding], n_results=n_results)
    # Only one embedding is sent, so the hits live in the first (only) sub-list.
    return list(results["documents"][0])

In [None]:
queries = [
    "Summary/Objective of white paper ",
    "Select All features or Features name. Do not include any of the following preprocessing steps, model details, data types, or any explanations.",
    "Feature names from preprocessing steps — list only the features on which preprocessing was applied. excluding any training methodology, model details, bias mitigation, evaluation",
    "Name the Model selected for classification",
    "Extract the list of hyperparameters/ performance scores/ Best hyperparameters along with their  values.",
    "List of validation metrics and respective scores only and hyperparameter scores",
    "Ethical considerations"
]

In [None]:
out = queryFun(queries[6], embedding_model, collection)

In [None]:
for o in out:
    print(o)
    print("++++++++++++++++++++++++++++++++")

In [None]:
# Refinement instructions, one string per entry.
# Every entry needs its own trailing comma: Python concatenates adjacent string
# literals implicitly, so a missing comma silently merges two entries into one.
context_list = [
    "Overview",
    "Do not include descriptions, preprocessing details, training methodology, target variable explanations, or any other text.",
    "Include only the train/test percentages and their purposes if mentioned. Do not include hyperparameter tuning, validation strategy, retraining details, evaluation metrics, or deployment strategy.",
    "Exclude summary and Laon details. Do not consider any training methodology, model details, bias mitigation, evaluation, or future work, and output in the format <Feature_Name>: <Operation_Name> preserving the exact feature names.",
    "Keep Model detail only, exclude Traning methodology, resluts, analysis, feature engineering and data handling",
    "Keep only validation/perofrmance and best hyperparameter scores. Do not include other details",
    "Keep only ethical considerations. do not include other details",
]

In [None]:
def refine_extracted_elements_with_context(similar_elements, query_context):
    """Ask the chat model to condense retrieved chunks into a short HTML report.

    Args:
        similar_elements: Iterable of text chunks (for example the list
            returned by ``queryFun``). They are joined with blank lines and
            embedded in the prompt.
        query_context: The query the chunks were retrieved for; quoted verbatim
            in the prompt.

    Returns:
        str: The content of the model reply with surrounding whitespace removed.

    Note:
        The prompt always contains the instruction "Keep only ethical
        considerations", whatever ``query_context`` is.
    """
    combined_elements = "\n\n".join(similar_elements)
    prompt = [
        SystemMessage(content="You are a product analyst."),
        HumanMessage(content=f"""
        The following are the top 5 similar elements retrieved from a vector database and  create a structured report in HTML format with the following three sections, using dangerouslySetInnerHTML={{ __html: reportMarkdown }}; html should not affect other elements : 
        {combined_elements}

        The original query context is:
        "{query_context}"
        - Keep only ethical considerations. do not include other details
        - Identify and extract only the most relevant elements or functionalities.
        - Do not recommend, only extract 
        - Avoid verbose explanations; focus on clarity and precision.
        - Provide concise, bullet-pointed outputs or insights based on retrieved data.
        """),
    ]
    # Use the Runnable entry point; calling the model object directly
    # (``llm(prompt)``) is deprecated in LangChain.
    return llm.invoke(prompt).content.strip()


In [None]:
ref_output= refine_extracted_elements_with_context(out,queries[6])

In [None]:
Markdown(ref_output)

# Chroma DB for Notebook

In [None]:
# Path to your actual persistent Chroma storage
DATA_DIR = os.path.abspath("./chroma_openai1")  # adjust if needed

# Use PersistentClient to read from persistent DB
client = chromadb.PersistentClient(path=DATA_DIR)

In [None]:
for coll in client.list_collections():
    print("Coll name:", coll.name)

In [None]:
# Get the specific collection
collection_name = "notebook_74b8a7ff23f322cce20437b9a56a9c32681a0bab37a51ce5999bad4bb0cf0431"
collection = client.get_collection(name=collection_name)

# Retrieve all records (you can also filter with where / where_document)
results = collection.get(
    include=["documents", "metadatas", "embeddings"],  # choose what you need
    limit=5  # remove or increase for more
)

In [None]:
# Print sample
for doc, meta,embeddings in zip(results["documents"], results["metadatas"], results['embeddings']):
    print("Document:", doc)
    print("Metadata:", meta)
    print("Embeddings",embeddings)
    print("-" * 40)

In [None]:
import os
import nbformat
from openai import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chromadb
from chromadb.config import Settings
import streamlit as st
import fitz
import tempfile
import hashlib
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

In [None]:
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
llm = ChatOpenAI(model="gpt-4o", temperature=0, openai_api_key=openai_api_key)


In [None]:
def queryFun(query, embedding_model, collection, n_results=5):
    """Return the documents most similar to *query* from a vector collection.

    Args:
        query: Text to search for.
        embedding_model: Object exposing ``embed_query(text)`` that returns the
            embedding vector for the query.
        collection: Object exposing ``query(query_embeddings=..., n_results=...)``
            and returning a mapping whose ``"documents"`` entry holds one list
            of documents per query embedding.
        n_results: Maximum number of documents to retrieve (default 5, the
            value that used to be hard-coded).

    Returns:
        list: The documents retrieved for the single query embedding.
    """
    query_embedding = embedding_model.embed_query(query)
    results = collection.query(query_embeddings=[query_embedding], n_results=n_results)
    # Only one embedding is sent, so the hits live in the first (only) sub-list.
    return list(results["documents"][0])

In [None]:
queries = [
    "Summary/Objective of white paper.",
    "Select All features or Features name. Do not include any of the following preprocessing steps, model details, data types, or any explanations.",
    "Feature names from preprocessing steps — list only the features on which preprocessing was applied. excluding any training methodology, model details, bias mitigation, evaluation.",
    "Get only Model name used for classification, exclude other information.",
    # "Extract the list of hyperparameters/ performance scores/ Best hyperparameters along with their  values.",
    "Extract only the list of validation metrics with their respective scores and the list of hyperparameters with their respective values, excluding all other information.",
    "Ethical considerations"
]

In [None]:
out = queryFun(queries[5], embedding_model, collection)

In [None]:
for o in out:
    print(o)
    print("++++++++++++++++++++++++++++++++")

In [None]:
# Refinement instructions, consumed positionally alongside `queries`.
# Every entry needs its own trailing comma: Python concatenates adjacent string
# literals implicitly, so a missing comma silently merges two entries and
# shortens the list, which makes zip-style pairing drop the last query.
context_list = [
    "From the provided HTML or text, extract summary overview in 2 lines only, excluding all other sections or details.",
    "Keep only features name. Do not include descriptions, preprocessing details, training methodology, target variable explanations, or any other text.",
    "Include only the train/test percentages and their purposes if mentioned. Do not include hyperparameter tuning, validation strategy, retraining details, evaluation metrics, or deployment strategy.",
    # "Exclude summary and Laon details. Do not consider any training methodology, model details, bias mitigation, evaluation, or future work, and output in the format <Feature_Name>: <Operation_Name> preserving the exact feature names.",
    "Strictly Keep Model name only. exclude other details/information.",
    "Extract only the validation/performance metrics with their scores and the best hyperparameter scores, excluding all other details.",
    "Keep only ethical considerations. do not include other details.",
]

In [None]:
def refine_extracted_elements_with_context(similar_elements, query_context,context_list):
    """Ask the chat model to condense retrieved chunks into a short HTML report.

    Args:
        similar_elements: Iterable of text chunks (for example the list
            returned by ``queryFun``). They are joined with blank lines and
            embedded in the prompt.
        query_context: The query the chunks were retrieved for; quoted verbatim
            in the prompt.
        context_list: Extra instruction interpolated as the first bullet of the
            prompt. Despite the name, callers in this file pass a single string
            (one element of the module-level ``context_list``).

    Returns:
        str: The content of the model reply with surrounding whitespace removed.
    """
    combined_elements = "\n\n".join(similar_elements)
    prompt = [
        SystemMessage(content="You are a product analyst."),
        HumanMessage(content=f"""
        The following are the top 5 similar elements retrieved from a vector database and  create a structured report in HTML format with the following three sections, using dangerouslySetInnerHTML={{ __html: reportMarkdown }}; html should not affect other elements : 
        {combined_elements}

        The original query context is:
        "{query_context}"
        - {context_list}
        - Identify and extract only the most relevant elements or functionalities.
        - Do not recommend, only extract.
        - Avoid verbose explanations; focus on clarity and precision.
        - Provide concise, bullet-pointed outputs or insights based on retrieved data.
        """),
    ]
    # Use the Runnable entry point; calling the model object directly
    # (``llm(prompt)``) is deprecated in LangChain.
    return llm.invoke(prompt).content.strip()


In [None]:
ref_output= refine_extracted_elements_with_context(out,queries[4],context_list[4] )

In [None]:
def queryFun_parallel(queries, embedding_model, collection):
    """Run ``queryFun`` for every query concurrently, preserving input order.

    Args:
        queries: Iterable of query strings.
        embedding_model: Passed through to ``queryFun``.
        collection: Passed through to ``queryFun``.

    Returns:
        list: One ``queryFun`` result per query, where ``result[i]`` belongs to
        the i-th query. Any exception raised by a worker is re-raised here.
    """
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(queryFun, query, embedding_model, collection) for query in queries]
        # Read the futures in submission order. ``as_completed`` yields them in
        # completion order, which scrambles the results relative to ``queries``
        # and breaks any later positional pairing with the query list.
        return [future.result() for future in futures]


In [None]:
# --- Parallel vector queries ---
list_pdf_docs = queryFun_parallel(queries, embedding_model, collection)

In [None]:
list_pdf_docs[1]

In [None]:
with ThreadPoolExecutor() as executor:
    list_refine_context_from_extracted_element_from_pdf = list(
        executor.map(
            refine_extracted_elements_with_context,
            list_pdf_docs, queries , context_list
        )
    )

In [None]:
Markdown(ref_output)

In [None]:
# Create summary:

# Read file from GCP

In [None]:
# pip install pymupdf requests

import fitz  # pymupdf
import requests
from io import BytesIO
from urllib.parse import quote



def read_white_paper_from_gcp(filename, base_url, version_number):
    """Download a versioned white-paper PDF and return its text page by page.

    The download URL is ``{base_url}{stem}_v{version_number}.{ext}``, where
    ``stem`` and ``ext`` come from splitting ``filename`` at its last dot and
    ``stem`` is percent-encoded.

    Args:
        filename: File name including its extension, e.g.
            ``"Load Prediction Whitepaper.pdf"``.
        base_url: URL prefix; the encoded file name is appended to it as-is.
        version_number: Version inserted into the URL as ``_v<version_number>``.

    Returns:
        str: The text of every page, each preceded by a ``--- Page N ---``
        header. Each page is also printed as it is extracted.

    Raises:
        ValueError: If ``filename`` has no ``name.ext`` shape.
        requests.HTTPError: If the server answers with an error status.
    """
    # Split on the LAST dot so names such as "paper.v2.pdf" keep their full stem.
    stem, dot, file_type = filename.rpartition(".")
    if not (dot and stem and file_type):
        raise ValueError(f"Expected a file name with an extension, got {filename!r}")
    url = f"{base_url}{quote(stem)}_v{version_number}.{file_type}"

    # Download the PDF into memory; bound the wait so a stalled connection
    # cannot hang the notebook.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Open PDF from bytes
    pdf_stream = BytesIO(response.content)
    doc = fitz.open(stream=pdf_stream, filetype="pdf")
    page_blocks = []
    try:
        for page_number, page in enumerate(doc, start=1):
            text = page.get_text("text")  # Extract as plain text
            header = f"--- Page {page_number} ---"
            page_blocks.append(f"{header}\n{text}\n\n")
            print(header)
            print(text)
    finally:
        # Release the document even if extraction fails part-way.
        doc.close()
    return "".join(page_blocks)



In [None]:
filename = "Load Prediction Whitepaper.pdf"
base_url = "https://storage.googleapis.com/whitepaper_test/"
version_number = 1

text = read_white_paper_from_gcp(filename, base_url,version_number)
text

# Validate White paper with version code 

In [None]:
# ===== Step 3: Send to GPT-4o via LangChain =====
llm = ChatOpenAI(model="gpt-4o", temperature=0, openai_api_key= os.getenv("OPENAI_API_KEY"))

system_message = (
    "You are an assistant that extracts document headings in 3 levels: "
    "Level 1 (Main Heading), Level 2 (Subheading), and Level 3 (Sub-subheading). "
    "Level 3 items can be bullet points, numbered pointers, or descriptive labels under a subheading."
)

user_message = f"""
From the following PDF text, extract a complete hierarchy of headings with up to 3 levels.

Output in valid JSON format like:
{{
  "headings": [
    {{
      "heading": "Main Heading 1",
      "subheadings": [
        {{
          "subheading": "Subheading 1.1",
          "subsubheadings": ["Sub-subheading 1.1.1", "Sub-subheading 1.1.2"]
        }},
        {{
          "subheading": "Subheading 1.2",
          "subsubheadings": []
        }}
      ]
    }}
  ]
}}

Rules:
- Ignore body text that is not a heading or bullet point under a heading.
- Preserve numbering and bullet symbols if they appear in the extracted text.
- Bullet points can be treated as sub-subheadings if they belong to a subheading.


PDF text:
{text}
"""

response = llm([
    SystemMessage(content=system_message),
    HumanMessage(content=user_message)
])

# ===== Step 4: Output result =====
print(response.content)

In [None]:
""" 
1. Read white paper name and version 
    - Extract Heading/ Sub Heading/ Sub sub Heading only (to compare with diff (v2-v1))

2. Read V1 code 
3. Read V2 Code 
    - Get code diff b/w v1 and v2 
    - Compare code diff against white paper (Only for given details in white paper, exclude other details for main summary)
    - What if critical info is added in v2 but not mentioned in the white paper?

"""

In [1]:
from typing import Optional, Tuple, List
import pandas as pd


def get_sha_pair_from_push_id(owner: str, repo: str, push_id: str) -> Tuple[Optional[str], Optional[str]]:
    """Find the before/head commit SHAs of a push in the repository event feed.

    Only the single response of ``GET /repos/{owner}/{repo}/events`` is
    searched; no further pages are requested.

    Args:
        owner: GitHub account that owns the repository.
        repo: Repository name.
        push_id: Event id of the push; compared as a string.

    Returns:
        ``(before_sha, head_sha)`` of the matching ``PushEvent``, or
        ``(None, None)`` when no event with that id is in the response.

    Raises:
        RuntimeError: If the events request does not return HTTP 200.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/events"
    # Bound the wait so a stalled connection cannot hang the caller.
    resp = requests.get(url, timeout=30)
    if resp.status_code != 200:
        raise RuntimeError(f"GitHub events fetch failed: {resp.status_code} {resp.text}")
    wanted_id = str(push_id)
    for event in resp.json():
        if event.get("type") == "PushEvent" and event.get("id") == wanted_id:
            payload = event["payload"]
            return payload["before"], payload["head"]
    return None, None


def fetch_latest_file_for_sha(owner: str, repo: str, notebook_file_path: str, sha_pairs: List[Tuple[str, str]]) -> dict:
    """Fetch a notebook from GitHub as it existed at the head of a push.

    For the first ``(sha_old, sha_new)`` pair the two commits are compared.
    If the notebook is among the changed files it is read at ``sha_new``;
    otherwise it is read at the most recent commit that touched it, looking
    backwards from ``sha_new``. The function returns (or raises) on the first
    pair, so later pairs are never consulted.

    Args:
        owner: GitHub account that owns the repository.
        repo: Repository name.
        notebook_file_path: Path of the notebook inside the repository.
        sha_pairs: ``(before_sha, head_sha)`` tuples, e.g. from
            ``get_sha_pair_from_push_id``.

    Returns:
        dict: The notebook JSON decoded into Python objects.

    Raises:
        RuntimeError: If any GitHub request fails, no commit touching the file
            is found, the response has no ``content``, or ``sha_pairs`` is
            empty.
    """
    for (sha_old, sha_new) in sha_pairs:
        compare_url = f"https://api.github.com/repos/{owner}/{repo}/compare/{sha_old}...{sha_new}"
        compare_resp = requests.get(compare_url, timeout=30)
        if compare_resp.status_code != 200:
            raise RuntimeError(f"GitHub compare failed: {compare_resp.status_code} {compare_resp.text}")
        compare_data = compare_resp.json()

        file_changed = any(f.get("filename") == notebook_file_path for f in compare_data.get("files", []))

        content_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{notebook_file_path}"
        if file_changed:
            print("File changed URL: ",content_url)
            params = {"ref": sha_new}
        else:
            commits_url = f"https://api.github.com/repos/{owner}/{repo}/commits"
            # Start the history walk at sha_new. Without "sha" GitHub lists
            # commits from the default branch head, so every push would resolve
            # to the same (newest) version of the file.
            commit_params = {"sha": sha_new, "path": notebook_file_path, "per_page": 1}
            commits_resp = requests.get(commits_url, params=commit_params, timeout=30)
            if commits_resp.status_code != 200:
                raise RuntimeError(f"GitHub commits fetch failed: {commits_resp.status_code} {commits_resp.text}")
            commits = commits_resp.json()
            if not commits:
                raise RuntimeError(
                    f"No commit touching {notebook_file_path} is reachable from {sha_new}."
                )
            last_update_sha = commits[0]["sha"]
            print("File not changed URL: ",content_url)
            params = {"ref": last_update_sha}

        file_resp = requests.get(content_url, params=params, timeout=30)
        if file_resp.status_code != 200:
            raise RuntimeError(f"GitHub content fetch failed: {file_resp.status_code} {file_resp.text}")

        file_data = file_resp.json()
        if "content" not in file_data:
            raise RuntimeError("Notebook not found or could not fetch content. Details: " + str(file_data))
        # The contents API returns the file body base64-encoded.
        nb_json = base64.b64decode(file_data["content"]).decode("utf-8")
        return json.loads(nb_json)

    raise RuntimeError("No SHA pairs yielded a notebook.")


def read_notebook_with_outputs(owner: str, repo: str, push_id: str, notebook_file_path: str) -> str:
    """Render the executed code cells of a pushed notebook as plain text.

    The push id is resolved to a SHA pair, the notebook is fetched for that
    pair, and every code cell that has outputs is written as its source
    followed by the textual part of each output.

    Args:
        owner: GitHub account that owns the repository.
        repo: Repository name.
        push_id: Event id of the push to inspect.
        notebook_file_path: Path of the notebook inside the repository.

    Returns:
        str: The concatenated cell report, stripped of surrounding whitespace.

    Raises:
        RuntimeError: If the push id cannot be resolved to two SHAs.
    """
    shas = get_sha_pair_from_push_id(owner=owner, repo=repo, push_id=push_id)
    if not (shas and shas[0] and shas[1]):
        raise RuntimeError(f"Push ID {push_id} not found in recent events.")
    notebook = fetch_latest_file_for_sha(
        owner=owner,
        repo=repo,
        notebook_file_path=notebook_file_path,
        sha_pairs=[shas],
    )

    def _output_as_text(output):
        # Only stream output and text/plain results carry text to report.
        kind = output.get('output_type')
        if kind == 'stream':
            raw = output.get('text', '')
        elif kind in ['execute_result', 'display_data']:
            raw = output.get('data', {}).get('text/plain', '')
        else:
            return ""
        if isinstance(raw, list):
            raw = "".join(raw)
        return (raw or "").strip()

    pieces = []
    for number, cell in enumerate(notebook.get('cells', []), start=1):
        # Skip markdown cells and code cells that were never executed.
        if cell.get('cell_type') != 'code' or not cell.get('outputs'):
            continue
        pieces.append(f"\nCell #{number}\n")
        pieces.append("Code:\n")
        pieces.append("".join(cell.get('source', [])).strip() + "\n")
        pieces.append("Output(s):\n")
        for output in cell['outputs']:
            rendered = _output_as_text(output)
            if rendered:
                pieces.append(rendered + "\n")
        pieces.append("-" * 30 + "\n")

    return "".join(pieces).strip()


def get_first_two_push_ids(path: str = "push_events.json") -> list:
    """Read up to two push ids, as strings, from a JSON file.

    Two layouts are understood:

    * a list: each of the first two items contributes either the value of its
      first key (when the item is a dict) or the item itself;
    * a dict: the first two elements of the value stored under its first key.

    Args:
        path: Location of the JSON file.

    Returns:
        list: At most two ids, each converted with ``str``.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the top-level JSON value is neither a list nor a dict.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"{path} not found.")

    with open(path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)

    if isinstance(payload, list):
        return [
            str(entry[list(entry)[0]]) if isinstance(entry, dict) else str(entry)
            for entry in payload[:2]
        ]

    if isinstance(payload, dict):
        leading_key = next(iter(payload))
        return [str(value) for value in payload[leading_key][:2]]

    raise ValueError("Unsupported JSON structure.")




In [2]:
import json
import os
import base64
import requests

first_two = get_first_two_push_ids("push_events.json")
print(first_two)
first_push_id = first_two[1]
second_push_id = first_two[0]

GITHUB_OWNER = "arunkenwal02"
GITHUB_REPO = "code-validator"
# NOTEBOOK_FILE_PATH = "loan-approval-prediction_v1.ipynb"
NOTEBOOK_FILE_PATH = "main.ipynb"

['52949273211', '53215273440']


In [3]:
first_push_id_sha_pair = get_sha_pair_from_push_id(owner = GITHUB_OWNER, repo =  GITHUB_REPO, push_id = first_push_id)
first_push_id_sha_pair

('ee534eb29525f1c7b7febe4941cb73f87bf17264',
 '249882444dc790db55b48469bb34c891418934bd')

In [4]:
l_sha = fetch_latest_file_for_sha(owner = GITHUB_OWNER, repo = GITHUB_REPO, notebook_file_path = NOTEBOOK_FILE_PATH, sha_pairs = [first_push_id_sha_pair])
# l_sha

File not changed URL:  https://api.github.com/repos/arunkenwal02/code-validator/contents/main.ipynb


In [5]:
second_push_id_sha_pair = get_sha_pair_from_push_id(owner = GITHUB_OWNER, repo =  GITHUB_REPO, push_id = second_push_id)
second_push_id_sha_pair

('5ef4b5b48eca8cad8d86e8d88904fcb2384e8dbb',
 'c536e703bf0e78761d6374044ef7d9c2bb482131')

In [6]:
l_sha_second_push_id = fetch_latest_file_for_sha(owner = GITHUB_OWNER, repo = GITHUB_REPO, notebook_file_path = NOTEBOOK_FILE_PATH, sha_pairs = [second_push_id_sha_pair])
# l_sha

File not changed URL:  https://api.github.com/repos/arunkenwal02/code-validator/contents/main.ipynb


In [None]:
import requests
import json
import difflib

def fetch_text(url: str) -> str:
    """Fetch *url* with a 30-second timeout and return the body as text.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of diffing an error page.
    response.raise_for_status()
    body = response.text
    return body

def normalize_for_diff(text: str, url_hint: str) -> list[str]:
    """
    Return a list of lines ready for diff.

    - If ``url_hint`` points at a ``.ipynb`` file and ``text`` parses as a
      notebook (a JSON object with ``"cells"``), emit one
      ``### CELL <n> [<type>]`` header per cell followed by the cell source and
      a blank separator line, so the diff covers sources only.
    - Otherwise, or if the notebook cannot be parsed, return the plain lines of
      ``text``.

    Only the path component of ``url_hint`` is inspected, so URLs carrying a
    query string (``.../main.ipynb?raw=1``) are still recognised as notebooks.
    """
    from urllib.parse import urlsplit  # local import keeps the block self-contained

    try:
        path = urlsplit(url_hint).path
    except ValueError:
        # Not a parseable URL: fall back to testing the raw hint.
        path = url_hint

    # Quick path for non-notebooks
    if not path.lower().endswith(".ipynb"):
        return text.splitlines()

    # Try to parse as a notebook; if parsing fails, fallback to plain text.
    try:
        nb = json.loads(text)
        if not isinstance(nb, dict) or "cells" not in nb:
            return text.splitlines()
        lines = []
        for i, cell in enumerate(nb["cells"], start=1):
            ctype = cell.get("cell_type", "unknown")
            src = "".join(cell.get("source", []))
            lines.append(f"### CELL {i} [{ctype}]")
            lines.extend(src.splitlines() if src else ["(empty source)"])
            lines.append("")  # separator
        return lines
    except (ValueError, TypeError, AttributeError):
        # Invalid JSON (ValueError) or cells/sources of an unexpected shape:
        # best effort, diff the raw text instead.
        return text.splitlines()

def diff_urls(url_old: str, url_new: str) -> str:
    """Return a unified diff between the files served at two URLs.

    Both files are downloaded, normalised with ``normalize_for_diff`` and
    compared. The diff labels the sides ``OLD_VERSION`` and ``NEW_VERSION``.

    Returns:
        str: The diff text, or ``"No differences found."`` when it is empty.
    """
    before_text = fetch_text(url_old)
    after_text = fetch_text(url_new)

    before_lines = normalize_for_diff(before_text, url_old)
    after_lines = normalize_for_diff(after_text, url_new)

    rendered = "\n".join(
        difflib.unified_diff(
            before_lines,
            after_lines,
            fromfile="OLD_VERSION",
            tofile="NEW_VERSION",
            lineterm="",
        )
    )
    return rendered or "No differences found."

if __name__ == "__main__":
    # Replace with your two RAW URLs (must be raw.githubusercontent.com or ?raw=1)
    url_v1 = "https://raw.githubusercontent.com/arunkenwal02/code-validator/249882444dc790db55b48469bb34c891418934bd/main.ipynb"
    url_v2 = "https://raw.githubusercontent.com/arunkenwal02/code-validator/c536e703bf0e78761d6374044ef7d9c2bb482131/main.ipynb"

    print(diff_urls(url_v1, url_v2))


--- OLD_VERSION
+++ NEW_VERSION
@@ -1,7 +1,4 @@
-### CELL 1 [markdown]
-## Check Docs Validation 
-
-### CELL 2 [code]
+### CELL 1 [code]
 from langchain.chat_models import ChatOpenAI
 from langchain.schema import SystemMessage, HumanMessage
 from langchain.prompts import ChatPromptTemplate
@@ -13,7 +10,7 @@
 import streamlit as st
 import tempfile
 
-### CELL 3 [code]
+### CELL 2 [code]
 '''
 1. Load White Paper  (PDF)  
 2. Vectors Embeddings - Text and tables; later include images
@@ -38,7 +35,7 @@
 8. Outout should be in structured format (This will be input for summary block with affitional 2 inputs)
 '''
 
-### CELL 4 [code]
+### CELL 3 [code]
 openai_api_key = os.getenv("OPENAI_API_KEY")
 
 # Initialize LLM
@@ -48,7 +45,7 @@
 
 
 
-### CELL 5 [code]
+### CELL 4 [code]
 
 
 def read_notebook(file_path):
@@ -166,7 +163,7 @@
         return nbformat.read(f, as_version=4)
 
 
-### CELL 6 [code]
+### CELL 5 [code]
 
 
 def main():
@@ -209,952 +206,3 @@
     
 
 
-### CELL 7 [code]
-(

In [None]:
for db in client.list_collections():
    print("Name:",db.name)