## Check Docs Validation 

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from langchain.prompts import ChatPromptTemplate
from IPython.display import Markdown, display
from dotenv import load_dotenv
load_dotenv()
import os
import nbformat
import streamlit as st
import tempfile

In [None]:
'''
1. Load White Paper  (PDF)  
2. Vectors Embeddings - Text and tables; later include images
3. Chroma db
4. Retrival - (Accuracy)
5. Generation - 
    Validation-
        Data sources - List all sources and metadata
        Features - detect any change in features
        Changes in Transformation steps
        Model Details 
        Hyperparameter
        List of Validation Metrics and resepctive scores
        Compare Validation scores of white paper with model's validation scores
        Brief of comparision of scores

        List and Track of critical metrics - these should not be lower than mentioned (in white paper)

6. Respective scores for tracked metrics (confidence on generation)
7. If required update prompt and go back to step 4 and reiterate step 4 and 5. reason (geneation have 
   heiger confidence )
8. Outout should be in structured format (This will be input for summary block with affitional 2 inputs)
'''

In [None]:
openai_api_key = os.getenv("OPENAI_API_KEY")

# Initialize LLM
llm = ChatOpenAI(model="gpt-4o",  
                 temperature=0,
                 openai_api_key= openai_api_key)




In [None]:


def read_notebook(file_path):
    """Read .ipynb notebook and extract content."""
    nb = nbformat.read(file_path, as_version=4)
    content = []
    for cell in nb.cells:
        if cell.cell_type == 'markdown':
            content.append("## Markdown Cell:\n" + cell.source)
        elif cell.cell_type == 'code':
            content.append("## Code Cell:\n```python\n" + cell.source + "\n```")
    return "\n\n".join(content)


def read_file(file_path):
    """Reads the content of a file."""
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()


def extract_functionalities_from_code(notebook_content):

    """Uses LLM to extract functionalities from Python code."""
    prompt = f"""
    You are an expert Python code reviewer. Here is a Jupyter notebook:
    {notebook_content}

    The following is a Jupyter notebook content (code and markdown). 
    Please extract the following:
    Analyze the notebook and answer:

    1. List of features used in the model.
    2. Name/type of ML model used, only name of model
    3. Accuracy metrics (e.g., accuracy, F1, precision, recall, AUC, etc.), only metrics name. 
    4. What is the purpose of this notebook?
    5. What are the main operations and their results?
    6. Are there any errors or anomalies in outputs?
    7. What conclusions can be drawn from the outputs?

    """

    response = llm.invoke([SystemMessage(content="You are a helpful assistant."), HumanMessage(content=prompt)])

    return response.content.strip()


def extract_functionalities_from_whitepaper(whitepaper_text):
    """Uses LLM to extract functionalities from whitepaper."""
    prompt = [
        SystemMessage(content="You are a product analyst."),
        HumanMessage(content=f"""
        Here is the whitepaper or product requirement document:

        {whitepaper_text}

        List all functionalities or features the whitepaper mentions. Use bullet points.
        """)
            ]
    return llm(prompt).content.strip()


def compare_functionalities(whitepaper_funcs, code_funcs):
    """Compares two sets of functionalities using the LLM."""
    prompt = [
        SystemMessage(content="You are a software QA expert."),
        HumanMessage(content=f"""
        Whitepaper Functionalities:
        {whitepaper_funcs}

        Code Functionalities:
        {code_funcs}
        Extract validation metrics from code funcs eg, precision, recall and other validation are in output cell.
        Compare the two lists and identify which functionalities from the whitepape, if functionality is implemented in code but not available in white paper, print: white paper is not updated please update the document. and show details of each section 
        listmissing sections like feature and if model varies according to white paper and same for validation metrics.
        compare validation scores : Compare scores of code function with white paper.
        Also compare critical validation metrics: make sure critical metrics of code should be grater then white paper critical metrics
        if thereis no changhe in metrics of docuemt and code_funcs: keep output 'white paper is updated please proceed to next steps. no other information is required'  
        
        """)
            ]
    return llm(prompt).content.strip()


def read_notebook_with_outputs(file_path):
    """Read .ipynb notebook and include both code and output."""
    nb = nbformat.read(file_path, as_version=4)
    cells_content = []

    for cell in nb.cells:
        if cell.cell_type == 'markdown':
            cells_content.append(f"## Markdown Cell:\n{cell.source}")
        elif cell.cell_type == 'code':
            code = f"## Code Cell:\n```python\n{cell.source}\n```"
            outputs = []

            for output in cell.get("outputs", []):
                if output.output_type == "stream":
                    outputs.append(f"Output (stream):\n{output.text}")
                elif output.output_type == "execute_result":
                    # Display the result of the cell (e.g., print(2+2))
                    result = output.get("data", {}).get("text/plain", "")
                    outputs.append(f"Output (execute_result):\n{result}")
                elif output.output_type == "error":
                    outputs.append("Error:\n" + "\n".join(output.get("traceback", [])))

            full_output = "\n".join(outputs)
            if full_output:
                code += f"\n\n### Output:\n```\n{full_output}\n```"
            cells_content.append(code)

    return "\n\n".join(cells_content)

def read_notebook(path):
    with open(path, "r", encoding="utf-8") as f:
        return nbformat.read(f, as_version=4)



In [None]:


def main():
    st.set_page_config(page_title="Functionality Coverage Checker", layout="wide")
    
    st.title("🧠 AI Feature Mapping Validator")
    st.subheader("Compare functionalities between a Whitepaper and its Codebase")

    uploaded_whitepaper = st.file_uploader("📄 Upload Whitepaper File", type=["txt", "md", "pdf"])
    uploaded_code = st.file_uploader("💻 Upload Code File", type=["py", "txt", "ipynb"])

    if uploaded_whitepaper and uploaded_code:
        if st.button("Click to Process Files"):
            # Read whitepaper content
            whitepaper = uploaded_whitepaper.read().decode("utf-8")

            # Handle .ipynb or other code files
            if uploaded_code.name.endswith(".ipynb"):
                # Write the raw content to a temp file
                with tempfile.NamedTemporaryFile(delete=False, suffix=".ipynb", mode='wb') as tmp_file:
                    tmp_file.write(uploaded_code.read())
                    temp_file_path = tmp_file.name

                # notebook_contents = read_notebook(temp_file_path)
                notebook_contents = read_notebook_with_outputs(temp_file_path)
                code_funcs = extract_functionalities_from_code(notebook_contents)
            else:
                code = uploaded_code.read().decode("utf-8")
                code_funcs = extract_functionalities_from_code(code)

            whitepaper_funcs = extract_functionalities_from_whitepaper(whitepaper)

            st.markdown("### ⚖️ Comparing Functionalities")
            missing_funcs = compare_functionalities(whitepaper_funcs, code_funcs)
            st.markdown(missing_funcs)

if __name__ == "__main__":
    main()

    



In [None]:
# import os
# import nbformat
# from openai import OpenAI
# from langchain.chat_models import ChatOpenAI
# from langchain.schema import SystemMessage, HumanMessage
# from langchain.chat_models import ChatOpenAI
# from langchain.schema import SystemMessage, HumanMessage
# from langchain_openai import OpenAIEmbeddings
# from dotenv import load_dotenv
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# import chromadb
# from chromadb.config import Settings

# import tempfile

In [None]:
# queries = [
#             "Summary/Objective of white paper ",
#             "Features mentioned",
#             "Preprocessing steps and data transformation steps",
#             "Model selected for classification",
#             "Training and resting methodology",
#             "List of Hyper parameters and respective values",
#             "What are list of validation scores and the performance scores?",
#             "Ethical considerations" ]

In [None]:
def queryFun(query, embedding_model,collection):
    query_embedding = embedding_model.embed_query(query)
    l_docs = []
    results = collection.query(query_embeddings=[query_embedding], n_results=5)
    for doc in results["documents"][0]:
        l_docs.append(doc)
        # print("🔎 Match:", l_docs.append(doc))
    return l_docs

##  Check Embeddings 

In [None]:
import chromadb
from chromadb.config import Settings
from chromadb import PersistentClient

In [None]:
path = "./chroma_openai1"

# Step 1: Load the persistent client
chroma_client = PersistentClient(path=path)




In [None]:
collections = chroma_client.list_collections()
for col in collections:
    print(col.name)
    # print(col.metadata)


In [None]:
collection_name = "notebook_f887cf79f48bf8b101631e9ebdb3ca7220bd2c6e47a6b82041ca192aa98cf16b"
# Step 2: Access the existing collection
collection = chroma_client.get_collection(name=collection_name)
data = collection.get()

# Optional: View details
print("IDs:", data['ids'])
print("Documents:", data['documents'][:2])  # print only first 2 docs
print("Metadata:", data.get('metadatas'))  # only if metadata was stored


In [None]:
data.keys()

In [None]:
# Optional: View details
print("IDs:", data['ids'])
print("Documents:", data['documents'][:2])  # print only first 2 docs
print("Metadata:", data.get('metadatas'))  # only if metadata was stored
print("Embeddings:", data['embeddings'])

In [None]:
collection = client.get_collection("whitepaper_02958f41437b5bbcf9490a38b0edb5d41a365101ce7979d2822648a320dfdc73")

results = collections[6].get(
    ids=["id1", "id2"],    # optional
    where={"type": "pdf"}, # optional
    include=["documents", "metadatas"]
)

In [None]:
results

In [None]:
 # Embedding stored or not 
import os
import nbformat
from openai import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chromadb
from chromadb.config import Settings

import streamlit as st
import fitz
import tempfile
import hashlib
from concurrent.futures import ThreadPoolExecutor, as_completed


load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
llm = ChatOpenAI(model="gpt-4o", temperature=0, openai_api_key=openai_api_key)
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")


In [None]:
def get_file_hash(uploaded_file):
    uploaded_file.seek(0)
    hash_val = hashlib.sha256(uploaded_file.read()).hexdigest()
    uploaded_file.seek(0)
    return hash_val

In [None]:
def extract_from_pdf(uploaded_file):
    doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
    extracted_text = ""
    for page_num, page in enumerate(doc):
        text = page.get_text()
        extracted_text += f"\n\n--- Page {page_num + 1} ---\n{text}"
    return extracted_text

In [None]:
def collection_exists(collection_name):
    try:
        chroma_client.get_collection(collection_name)
        return True
    except Exception:
        return False

In [None]:
def create_chunks(text):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=750,     # faster, smaller chunk
        chunk_overlap=100   # reduced overlap
    )
    return text_splitter.split_text(text)

In [None]:


def get_or_create_embeddings(uploaded_file, text, _embedding_model, collection_name):
    chunks = create_chunks(text)

    if collection_exists(collection_name):
        collection = chroma_client.get_collection(collection_name)
    else:
        embeddings = _embedding_model.embed_documents(chunks)
        print(embeddings)
        collection = store_in_chromaDB(chunks, embeddings, collection_name)

    return collection, chunks



In [None]:
import hashlib
import fitz

def get_file_hash(file_obj):
    file_obj.seek(0)
    hash_val = hashlib.sha256(file_obj.read()).hexdigest()
    file_obj.seek(0)
    return hash_val

def extract_from_pdf(file_obj):
    file_obj.seek(0)
    doc = fitz.open(stream=file_obj.read(), filetype="pdf")
    extracted_text = ""
    for page_num, page in enumerate(doc):
        text = page.get_text()
        extracted_text += f"\n\n--- Page {page_num + 1} ---\n{text}"
    return extracted_text

file_path = "Load Prediction Whitepaper.pdf"

with open(file_path, "rb") as f:
    # 1. Compute hash
    file_hash = get_file_hash(f)
    # 2. Extract text
    text = extract_from_pdf(f)
    # 3. If needed, reset pointer for further use (not always needed)
    f.seek(0)
    # 4. Pass to embedding function (if needed)
    collection, chunks = get_or_create_embeddings(
        uploaded_file=f,              # If function needs file object
        text=text,                    # If function needs text
        _embedding_model=embedding_model,
        collection_name=f"whitepaper_{file_hash}"
    )


In [None]:
import requests

In [None]:
def create_summary_of_events():
    # with open("push_events.json", "r") as file:
    #     data = json.load(file)
    # owner = data[2]
    # repo_name = data[3]
    api_url = f"https://api.github.com/repos/arunkenwal02/code-validator/events"
    response = requests.get(api_url)
    events = response.json()
    push_ids = [id['id'] for id in events]
    print(push_ids)
    data = ['52949273211','52821120274']
    push_events = [e for e in events if e['type'] == 'PushEvent']

    ids = [e['id'] for e in push_events]
    try:
        idx1 = ids.index(data[0])
        idx2 = ids.index(data[1])
    except ValueError:
        return {"error": "One or both push IDs not found in recent events."}

    start = min(idx1, idx2)
    end = max(idx1, idx2)

    history_between = push_events[start:end+1]  
    grouped_push_events = []
    commits_list = []
    for event in history_between:
        push_id = event['id']
        created_at = event['created_at']
        repo = event['repo']['name']
        commits_list = []

        for commit in event["payload"]["commits"]:
            sha = commit['sha']
            author = commit['author']['name']
            message = commit['message']

            commit_detail_url = f"https://api.github.com/repos/arunkenwal02/code-validator/commits/{sha}"
            commit_detail_response = requests.get(commit_detail_url)

            if commit_detail_response.status_code != 200:
                diff = "❌ Failed to fetch diff"
            else:
                commit_detail = commit_detail_response.json()
                diffs = []
                for file in commit_detail.get('files', []):
                    patch = file.get('patch')
                    if patch:
                        diffs.append(f"File: {file['filename']}\n{patch}")
                diff = "\n\n".join(diffs) if diffs else "No diff available"

            commits_list.append({
                "sha": sha,
                "author": author,
                "commit_message": message,
                "code_diff": diff
            })

        grouped_push_events.append({
            "push_id": push_id,
            "repo": repo,
            "created_at": created_at,
            "commits": commits_list
        })
    return grouped_push_events

In [None]:
test = create_summary_of_events()
test
push id , commit summary, 

In [None]:
test[5]

In [None]:
len(test)

## One drive file access

In [8]:
import msal
import requests
import time
import fitz
import os 
from dotenv import load_dotenv
from datetime import datetime



load_dotenv() 
Permission_ID ="6a94cb3a-9869-4b54-ae0b-f4f523df2614"
client_id = Permission_ID
authority = "https://login.microsoftonline.com/consumers"
scopes = ["Files.Read"]
source_folder = "Documents/GitHub/code-validator/"
file_name = "Load Prediction Whitepaper.pdf"
version_id = int(7)
file_path = source_folder+file_name



In [None]:
# Extract updated version 

def access_token_key(client_id, authority):
    scopes = ["Files.Read"]
    app = msal.PublicClientApplication(client_id=client_id, authority=authority)
    result = None

    accounts = app.get_accounts()
    if accounts:
        result = app.acquire_token_silent(scopes, account=accounts[0])
    if not result:
        result = app.acquire_token_interactive(scopes=scopes)
    if not result or "access_token" not in result:
        print("MSAL Error:", result)
    access_token = result["access_token"]

    return access_token

def get_raw_data(client_id, authority ,file_path ):
    access_token= access_token_key(client_id=client_id, authority=authority)
    url = f"https://graph.microsoft.com/v1.0/me/drive/root:/{file_path}:/content"
    headers = {"Authorization": f"Bearer {access_token}"}
    time.sleep(2)
    response = requests.get(url, headers=headers)
    print(f"Response code: {response.status_code}")
    if response.status_code == 200:
        file_bytes = response.content
        print("File read into memory!")
        return file_bytes
    else:
        print("Failed:", response.status_code, response.text)
        return None


def get_onedrive_whitepaper(file_bytes):
    
    # file_bytes is from above
    doc = fitz.open(stream=file_bytes, filetype="pdf")
    text = ""
    for page_num, page in enumerate(doc):
        text += f"\n\n--- Page {page_num + 1} ---\n{page.get_text()}"

    print("First 1000 chars of PDF text:", text)
    
    return text
 
   
def prev_version( client_id, authority, file_path, version_id):
    access_token= access_token_key(client_id=client_id, authority=authority)

    versions_url = f"https://graph.microsoft.com/v1.0/me/drive/root:/{file_path}:/versions"
    headers = {"Authorization": f"Bearer {access_token}"}
    response = requests.get(versions_url, headers=headers)

    if response.status_code == 200:
        versions = response.json()["value"]
        if len(versions) >= int(version_id):
            # 3. Get the 2nd version (index 1)
            version_id = versions[1]['id']
            print(f"2nd Version ID: {version_id}, Last Modified: {versions[1]['lastModifiedDateTime']}")
            
            # 4. Fetch 2nd version's PDF bytes
            download_url = f"https://graph.microsoft.com/v1.0/me/drive/root:/{file_path}:/versions/{version_id}/content"
            version_response = requests.get(download_url, headers=headers)
            if version_response.status_code == 200:
                pdf_bytes = version_response.content  # This is your PDF in memory
                
                # 5. Extract text from the PDF (in memory, no save)
                doc = fitz.open(stream=pdf_bytes, filetype="pdf")
                all_text = ""
                for page_num, page in enumerate(doc):
                    all_text += f"\n--- Page {page_num+1} ---\n{page.get_text()}"
                
                print("Extracted PDF text (first 1000 chars):")
                print(all_text[:1000])
                return all_text
                # You can use `all_text` as needed (search, LLM input, etc)
            else:
                print("Failed to download 2nd version:", version_response.status_code, version_response.text)
        else:
            print("Less than 2 versions available!")
    else:
        print("Failed to fetch versions:", response.status_code, response.text)


In [6]:
file_bytes  = get_raw_data(client_id=client_id, authority=authority,file_path = file_path)
pdf_content = get_onedrive_whitepaper(file_bytes)  


Response code: 200
File read into memory!
First 1000 chars of PDF text: 

--- Page 1 ---
Machine Learning-Based Loan Approval Prediction System
for Financial Institutions
1. Executive Summary
The financial services industry faces a critical challenge in automating and de-risking
the loan approval process. Traditional methods, often relying on manual review and
static rule-based systems, are prone to human error, inconsistency, and significant
processing delays. These limitations result in missed opportunities, elevated credit risk,
and suboptimal customer experiences. To address these issues, we have developed a
robust machine learning-based Loan Approval Classification System. This model
leverages a comprehensive set of applicant data to predict the likelihood of loan
repayment, classifying applications as either approved or rejected.
Our system is designed to provide a high-level overview of an applicant's
creditworthiness, offering a data-driven, objective, and transparent decision-

In [7]:

version_id = 1
prev_version(client_id=client_id, authority=authority,  file_path= file_path, version_id = version_id)

2nd Version ID: 6.0, Last Modified: 2025-08-05T08:44:27Z
Extracted PDF text (first 1000 chars):

--- Page 1 ---
Machine Learning-Based Loan Approval Prediction System
for Financial Institutions
1. Executive Summary
The financial services industry faces a critical challenge in automating and de-risking
the loan approval process. Traditional methods, often relying on manual review and
static rule-based systems, are prone to human error, inconsistency, and significant
processing delays. These limitations result in missed opportunities, elevated credit risk,
and suboptimal customer experiences. To address these issues, we have developed a
robust machine learning-based Loan Approval Classification System. This model
leverages a comprehensive set of applicant data to predict the likelihood of loan
repayment, classifying applications as either approved or rejected.
Our system is designed to provide a high-level overview of an applicant's
creditworthiness, offering a data-driven, objective, an

"\n--- Page 1 ---\nMachine Learning-Based Loan Approval Prediction System\nfor Financial Institutions\n1. Executive Summary\nThe financial services industry faces a critical challenge in automating and de-risking\nthe loan approval process. Traditional methods, often relying on manual review and\nstatic rule-based systems, are prone to human error, inconsistency, and significant\nprocessing delays. These limitations result in missed opportunities, elevated credit risk,\nand suboptimal customer experiences. To address these issues, we have developed a\nrobust machine learning-based Loan Approval Classification System. This model\nleverages a comprehensive set of applicant data to predict the likelihood of loan\nrepayment, classifying applications as either approved or rejected.\nOur system is designed to provide a high-level overview of an applicant's\ncreditworthiness, offering a data-driven, objective, and transparent decision-making tool.\nBy integrating advanced machine learning tec

## Get updated file from github

In [8]:
import json 
import pandas as pd 
import requests
import requests
import base64


In [9]:
push_event= pd.read_json('push_events.json', )
latest_push_id = push_event[0].tolist()[0]
latest_push_id

# --- Usage ---
owner = "arunkenwal02"
repo = "code-validator"
push_id = latest_push_id
file_path = "loan-approval-prediction_v2.ipynb"

In [None]:
def get_sha_pair_from_push_id(owner, repo, push_id):
    """
    Returns (before_sha, head_sha) for the given push_id.
    If not found, returns (None, None).
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/events"
    resp = requests.get(url)
    events = resp.json()
    for event in events:
        if event["type"] == "PushEvent" and event["id"] == str(push_id):
            before_sha = event["payload"]["before"]
            head_sha = event["payload"]["head"]
            print(f"Push ID: {push_id}\nbefore: {before_sha}\nhead: {head_sha}")
            return before_sha, head_sha
    print(f"Push ID {push_id} not found in recent events.")
    return None, None

def fetch_latest_file_for_sha(owner, repo, file_path, sha_pairs):
    """
    For each (sha_old, sha_new) in sha_pairs, check if file_path was updated.
    If yes, download file from sha_new. Else, download most recently updated version.
    """
    for i, (sha_old, sha_new) in enumerate(sha_pairs):
        print(f"\nProcessing pair {i+1}: {sha_old} → {sha_new}")

        # 1. Compare the two SHAs
        compare_url = f"https://api.github.com/repos/{owner}/{repo}/compare/{sha_old}...{sha_new}"
        compare_resp = requests.get(compare_url)
        compare_data = compare_resp.json()

        file_changed = False
        for f in compare_data.get("files", []):
            if f["filename"] == file_path:
                file_changed = True
                print(f"File {file_path} was changed in this push.")
                break

        if file_changed:
            # Download updated file from sha_new
            content_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{file_path}"
            params = {"ref": sha_new}
            file_resp = requests.get(content_url, params=params)
            file_data = file_resp.json()
            
        # Check for 'content' key (base64-encoded)
            if "content" in file_data:
                nb_json = base64.b64decode(file_data["content"]).decode("utf-8")
                notebook_dict = json.loads(nb_json)
                return notebook_dict
            else:
                raise Exception("Notebook not found or could not fetch content. Details: " + str(file_data))

        else:
            print(f"File {file_path} was NOT changed between {sha_old} and {sha_new}.")
            # Get most recent commit where this file was updated
            commits_url = f"https://api.github.com/repos/{owner}/{repo}/commits"
            params = {"path": file_path, "per_page": 1}
            commits_resp = requests.get(commits_url, params=params)
            last_update_sha = commits_resp.json()[0]["sha"]
            print("Most recent commit where file was changed:", last_update_sha)
            # Download file at that SHA
            content_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{file_path}"
            params = {"ref": last_update_sha}
            file_resp = requests.get(content_url, params=params)
            file_data = file_resp.json()
            
            # Check for 'content' key (base64-encoded)
            if "content" in file_data:
                nb_json = base64.b64decode(file_data["content"]).decode("utf-8")
                notebook_dict = json.loads(nb_json)
                return notebook_dict
            else:
                raise Exception("Notebook not found or could not fetch content. Details: " + str(file_data))



In [11]:
sha_pair = get_sha_pair_from_push_id(owner, repo, push_id)

sha_pair = [sha_pair]
fetch_latest_file_for_sha 
# --- Usage example ---
notebook_contents = fetch_latest_file_for_sha(owner, repo, file_path, sha_pair)


Push ID: 52764657352
before: a92475d2557a2d1dd1e17c0a2f57ff950d60b6ae
head: 057c3593f00d2b21d20d4b572095038807df5de1

Processing pair 1: a92475d2557a2d1dd1e17c0a2f57ff950d60b6ae → 057c3593f00d2b21d20d4b572095038807df5de1
File loan-approval-prediction_v2.ipynb was NOT changed between a92475d2557a2d1dd1e17c0a2f57ff950d60b6ae and 057c3593f00d2b21d20d4b572095038807df5de1.
Most recent commit where file was changed: 899191ec55d9a7d44bcdaed22e41395f946059df


In [12]:

for i, cell in enumerate(notebook_contents['cells']):
    if cell['cell_type'] == 'code' and cell.get('outputs'):
        print(f"\nCell #{i+1}:")
        print("Code:")
        print("".join(cell['source']))
        print("\nOutputs:")
        for output in cell['outputs']:
            # Print text output (if any)
            if 'text' in output:
                print("".join(output['text']))
            # Print stream output
            if output.get('output_type') == 'stream':
                print("".join(output.get('text', '')))
            # Print execution result (display_data or execute_result)
            if output.get('output_type') in ['execute_result', 'display_data']:
                data = output.get('data', {})
                # Print text/plain or html if present
                if 'text/plain' in data:
                    print(data['text/plain'])
                if 'text/html' in data:
                    print(data['text/html'])
            # Print errors if any
            if output.get('output_type') == 'error':
                print(f"Error: {output.get('ename')} - {output.get('evalue')}")
                print("Traceback:")
                print("\n".join(output.get('traceback', [])))
        print("-" * 40)



Cell #11:
Code:
# Data dimension
print('Data dimension: {} rows and {} columns'.format(len(df_train), len(df_train.columns)))
df_train.head()

Outputs:
Data dimension: 491 rows and 13 columns

Data dimension: 491 rows and 13 columns

['    Loan_ID  Gender Married Dependents     Education Self_Employed  \\\n', '0  LP002305  Female      No          0      Graduate            No   \n', '1  LP001715    Male     Yes         3+  Not Graduate           Yes   \n', '2  LP002086  Female     Yes          0      Graduate            No   \n', '3  LP001136    Male     Yes          0  Not Graduate           Yes   \n', '4  LP002529    Male     Yes          2      Graduate            No   \n', '\n', '   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \\\n', '0             4547                0.0       115.0             360.0   \n', '1             5703                0.0       130.0             360.0   \n', '2             4333             2451.0       110.0             360.0   \n', '3

In [15]:
all_cells_text = ""

for i, cell in enumerate(notebook_contents['cells']):
    if cell['cell_type'] == 'code' and cell.get('outputs'):
        # Add cell number and code
        all_cells_text += f"\nCell #{i+1}\n"
        all_cells_text += "Code:\n"
        all_cells_text += "".join(cell['source']).strip() + "\n"
        all_cells_text += "Output(s):\n"
        # Add outputs
        for output in cell['outputs']:
            output_text = ""
            if output.get('output_type') == 'stream':
                text = output.get('text', '')
                if isinstance(text, list):
                    text = "".join(text)
                output_text += text.strip()
            elif output.get('output_type') in ['execute_result', 'display_data']:
                data = output.get('data', {})
                text = data.get('text/plain', '')
                if isinstance(text, list):
                    text = "".join(text)
                output_text += text.strip()
            # Skipping errors
            if output_text:
                all_cells_text += output_text + "\n"
        all_cells_text += "-" * 30 + "\n"

# Optional: remove leading/trailing whitespace
all_cells_text = all_cells_text.strip()

# Print or use as needed
print(all_cells_text)


Cell #11
Code:
# Data dimension
print('Data dimension: {} rows and {} columns'.format(len(df_train), len(df_train.columns)))
df_train.head()
Output(s):
Data dimension: 491 rows and 13 columns
Loan_ID  Gender Married Dependents     Education Self_Employed  \
0  LP002305  Female      No          0      Graduate            No   
1  LP001715    Male     Yes         3+  Not Graduate           Yes   
2  LP002086  Female     Yes          0      Graduate            No   
3  LP001136    Male     Yes          0  Not Graduate           Yes   
4  LP002529    Male     Yes          2      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             4547                0.0       115.0             360.0   
1             5703                0.0       130.0             360.0   
2             4333             2451.0       110.0             360.0   
3             4695                0.0        96.0               NaN   
4             6700             1750.