# file name formate

In [1]:
"""
reading the csv file which contains
crs_id,crs_title,file_id,file_name,file_url 
filling the gaps and formatting the fields to make it readable by code
"""

import pandas as pd
data_df=pd.read_csv('../urls.csv')
data_df['crs_title'] = data_df['crs_title'].apply(lambda x: x.replace(' ', '_') if isinstance(x, str) else x)
data_df['final_name'] = data_df['crs_id'].astype(str) + '_' + data_df['crs_title'].astype(str) 

In [2]:
"""
creating list of first folder names under the material
"""

first_folder_name=data_df['final_name'].to_list()

In [3]:
"""
Getting the list of names of the pdf files that downloaded and saved for the extraction of images from them and save it to the common folder
"""

import os

directory_path_1 = '../downloading_data/downloads'
files = [f for f in os.listdir(directory_path_1) if os.path.isfile(os.path.join(directory_path_1, f))]
print(files)


['233_I_Polynomials_2546_Basics_for_Level_1_Mathematics.pdf', '233_I_Polynomials_2547_Multiplication_of_Polynomials.pdf']


In [4]:
"""
code for creating folders inside the material folder with the names which stored in first_folder_name variable
"""
material_dir = 'material' 
os.makedirs(material_dir, exist_ok=True)

for file, first_folder in zip(files, first_folder_name):
    save_folder = os.path.join(material_dir, first_folder)

    counter = 1
    original_save_folder = save_folder
    while os.path.exists(save_folder):
        save_folder = f"{original_save_folder}_{counter}"
        counter += 1
    
    os.makedirs(save_folder, exist_ok=True)
    print(f"Created folder: {save_folder}")


Created folder: material/233_I_Polynomials
Created folder: material/233_I_Polynomials_1


In [5]:
"""
Storing the new generated folder in a list
"""

directory_path_2 = '../extraction_data/material'
folders = [f for f in os.listdir(directory_path_2) if os.path.isdir(os.path.join(directory_path_2, f))]
print(folders)

['233_I_Polynomials_1', '233_I_Polynomials']


In [6]:
"""
Creating list of sub folders name inside the first_folder_name
"""

import pandas as pd
data_df=pd.read_csv('../urls.csv')
data_df['final_name'] = data_df['file_id'].astype(str)
new_folders_list=data_df['final_name'].to_list()[::-1]
print(new_folders_list)

['2547', '2546']


In [7]:
"""
Creating the subfolder and storing their paths as a list
"""

current_folder_paths=[]

for folder,new_folder in zip(folders,new_folders_list):
    folder1_path = os.path.join(directory_path_2, folder,new_folder)
    os.makedirs(folder1_path, exist_ok=True)

    common_folder_path = os.path.join(folder1_path, 'common')
    os.makedirs(common_folder_path, exist_ok=True)

    generated_folder_path = os.path.join(folder1_path, 'generated')
    os.makedirs(generated_folder_path, exist_ok=True)
    
    print(f"Created folder: {folder1_path}")
    current_folder_paths.append(folder1_path)

Created folder: ../extraction_data/material/233_I_Polynomials_1/2547
Created folder: ../extraction_data/material/233_I_Polynomials/2546


In [8]:
"""
Base64 code which take pages one by one and create it in a base64 code for make it readable by gpt model
"""

import base64
import io

import fitz
from PIL import Image
import os

# Function to convert a PDF page to base64
def pdf_page_to_base64(pdf_path: str, page_number: int):
    pdf_document = fitz.open(pdf_path)
    page = pdf_document.load_page(page_number - 1)  # input is one-indexed
    pix = page.get_pixmap()
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    buffer = io.BytesIO()
    img.save(buffer, format="PNG")

    return base64.b64encode(buffer.getvalue()).decode("utf-8")

In [9]:
"""
Saving each pages inside the folder by editing their names i.e p1.png p2.png and so on
"""

# Function to save PDF pages as images
def save_pdf_pages_as_images(pdf_path: str, output_folder: str):
    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Open the PDF document
    pdf_document = fitz.open(pdf_path)
    
    # Loop through all pages and save as images
    for page_number in range(1, pdf_document.page_count + 1):
        # Convert page to base64 and then to an image file
        base64_image = pdf_page_to_base64(pdf_path, page_number)
        
        # Decode the base64 image and save it
        image_data = base64.b64decode(base64_image)
        image_filename = os.path.join(output_folder, f"p{page_number}.png")
        
        with open(image_filename, 'wb') as img_file:
            img_file.write(image_data)
        
        print(f"Saved page {page_number} as {image_filename}")


In [10]:
"""
Saving the images inside the common folder 
"""
for common_folder,file in zip(current_folder_paths,files):
    file_path = directory_path_1+'/'+file
    output_folder = common_folder+'/common/'

    save_pdf_pages_as_images(file_path, output_folder)

Saved page 1 as ../extraction_data/material/233_I_Polynomials_1/2547/common/p1.png
Saved page 2 as ../extraction_data/material/233_I_Polynomials_1/2547/common/p2.png
Saved page 3 as ../extraction_data/material/233_I_Polynomials_1/2547/common/p3.png
Saved page 4 as ../extraction_data/material/233_I_Polynomials_1/2547/common/p4.png
Saved page 5 as ../extraction_data/material/233_I_Polynomials_1/2547/common/p5.png
Saved page 6 as ../extraction_data/material/233_I_Polynomials_1/2547/common/p6.png
Saved page 7 as ../extraction_data/material/233_I_Polynomials_1/2547/common/p7.png
Saved page 8 as ../extraction_data/material/233_I_Polynomials_1/2547/common/p8.png
Saved page 9 as ../extraction_data/material/233_I_Polynomials_1/2547/common/p9.png
Saved page 10 as ../extraction_data/material/233_I_Polynomials_1/2547/common/p10.png
Saved page 11 as ../extraction_data/material/233_I_Polynomials_1/2547/common/p11.png
Saved page 12 as ../extraction_data/material/233_I_Polynomials_1/2547/common/p12.pn

In [11]:
"""
Function for genrating the base64 of the image
"""

import base64
import io

import fitz
from PIL import Image
import os


def pdf_page_to_base64(pdf_path: str, page_number: int):
    pdf_document = fitz.open(pdf_path)
    page = pdf_document.load_page(page_number - 1)  # input is one-indexed
    pix = page.get_pixmap()
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    buffer = io.BytesIO()
    img.save(buffer, format="PNG")

    return base64.b64encode(buffer.getvalue()).decode("utf-8")

In [12]:
"""
Function which returns the base64 code of the provided image path
"""

from IPython.display import Image as IPImage

def get_image_path(img_path):
    question_image_path = img_path
    question_base64_image = pdf_page_to_base64(question_image_path, 0)
    return question_base64_image

In [None]:
"""
LLM model for generating the html response for the provided image
"""

import os  
import base64
from openai import AzureOpenAI  

endpoint = "https://ai-anuragsingh65692019195ai682501652060.openai.azure.com/" 
deployment = "gpt-4o"
subscription_key ="ApKKNDsSgzuAwzeg0YmI97hia2uygeg8bQ7CheJt7jiP80BMxkRqJQQJ99ALACHYHv6XJ3w3AAAAACOGHz2s"

# Initialize Azure OpenAI Service client with key-based authentication    
def llm_bot(question_base64_image):
    client = AzureOpenAI(  
        azure_endpoint=endpoint,  
        api_key=subscription_key,  
        api_version="2024-05-01-preview",
    )
        
    #Prepare the chat prompt 
    chat_prompt = [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": """ I have an image of a scanned A4 paper containing questions. Your task is to generate HTML code to replicate after understanding it and make sure the questions should be unique as well as similar to them dont go beyond the topic make sure the level should be same and the response should be in only dictionary and make sure no (backslash n) \n use only div. 
                        Your response must be a valid Python dictionary, with no extra text, explanations, or formatting. 
                        The response should exactly match the structure:
                        Respond ONLY with a clean dictionary. No markdown (```), no "python", no "json", no explanations.  
                        Your response should strictly follow this structure: 
                        {"title": "<p class=\"title\">Simple arithmetic questions</p>","questions": "<div class=\"question\"><div><span class=\"question-number\">(1)</span> (2a + b)(3x + 2y) =</div><div><span class=\"question-number\">(2)</span> (a - b)(c + d) =</div><div><span class=\"question-number\">(3)</span> (2a - b)(3x + 2y) =</div></div>"}
                        *your response should be clean it wont have any these things ```,python,\n,```python
                        *make sure same number of things to create
                        **ðŸš« DO NOT include:**  
                        - Markdown (` ``` `), "python", "json", or extra text.  
                        - Any `\n`, `\t`, or escape sequences.  
                        - Additional explanations or formatting.  
                        *Include mathematical expressions in LaTeX format (using MathJax for rendering) it must be :
                    """
                }
            ]
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{question_base64_image}"}
                }
            ]
        }
    ] 
        
    # Include speech result if speech is enabled  
    messages = chat_prompt  
        
    # Generate the completion  
    completion = client.chat.completions.create(  
        model=deployment,
        messages=messages,
        max_tokens=2000,  
        temperature=0.7,  
        top_p=0.95,  
        frequency_penalty=0,  
        presence_penalty=0,
        stop=None,  
        stream=False
    )
    return completion
    

In [None]:
import json
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from PIL import Image


# Format the HTML content with data from the dictionary
def generate_image_by_llm_and_save(response_dict,destination,image_name):
    html_content = f"""
    <!DOCTYPE html>  
    <html lang="en">  
    <head>  
    <meta charset="UTF-8">  
    <meta name="viewport" content="width=device-width, initial-scale=1.0">  
    <title>Replica of A4 Paper</title>  
    <script type="text/javascript" async
      src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-MML-AM_CHTML">
    </script>

    <style>  
        * {{  
        margin: 0;  
        padding: 0;  
        box-sizing: border-box;  
        }}  
        body {{  
        display: flex;  
        justify-content: center;  
        align-items: center;  
        background-color: #f4f4f4;  
        font-family: 'Times New Roman', Times, serif;  
        }}  
        .a4-page {{  
        width: 210mm;
            height: 297mm;
            background: white;
            padding: 10mm;
            box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
            border: 1px solid #ddd;
            display: flex;
            flex-direction: column; 
        }}  
        .title {{  
        font-size: 24px;  
        font-weight: bold;  
        text-align: left;  
        }}  
        .question {{  
            font-size: 18px;
            padding-top: 10px;
            line-height: 1.8;
            display: flex;
            flex-direction: column;
            justify-content: space-between;
            height: 100%;
        }}  
        .question span {{  
        font-weight: bold;  
        }}  
        .question-number {{  
        margin-right: 10px;  
        }}  
    </style>  
    </head>  
    <body>  
    <div class="a4-page">  
        {response_dict['title']}
        {response_dict['questions']}
    </div>  
    </body>  
    </html>
    """

    # Save HTML to a file
    html_file = "/Users/anuragsingh/Documents/GitHub/LLM/book_arrangement/extraction_data/a4_page.html"
    with open(html_file, "w") as file:
        file.write(html_content)

    # Set up Selenium WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run in headless mode
    options.add_argument("--window-size=800,1200")  # Ensure the window fits the A4 page
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    # Open the HTML file
    driver.get(f"file:///{html_file}")

    # Wait for the page to load
    time.sleep(5)

    # Adjust zoom level to fit A4 dimensions
    driver.execute_script("document.body.style.zoom='0.94'")  # Adjust zoom if needed
    time.sleep(1)

    # Take a full-page screenshot
    screenshot_path = destination+image_name+'.png'
    print(screenshot_path)
    driver.save_screenshot(screenshot_path)

    # Close the browser
    driver.quit()



In [None]:
import glob
import json

usage_records=[]
current_folder_paths_images=[]
for folder_images in current_folder_paths:
    image_paths = folder_images + '/common/*' 
    generated_image_path = folder_images + '/generated/' 

    for item in glob.glob(image_paths):
        filename_with_extension = os.path.basename(item)  # Get the filename with extension
        filename, _ = os.path.splitext(filename_with_extension)  # Remove the extension 

        base64_of_images=get_image_path(item)
        completion = llm_bot(base64_of_images)
        response=completion.to_dict()['choices'][0]['message']['content']
        response_usage=completion.to_dict()['usage']
        usage_records.append({
            "filename": filename,
            "completion_tokens": response_usage.get('completion_tokens', 0),
            "prompt_tokens": response_usage.get('prompt_tokens', 0),
            "total_tokens": response_usage.get('total_tokens', 0),
        })
        new_df_usage=pd.DataFrame(usage_records)
        new_df_usage.describe().to_csv('usage.csv',index=True)

        # Convert the string to a dictionary
        print(response)
        response_dict = json.loads(response)
        generate_image_by_llm_and_save(response_dict,generated_image_path,filename)



{"title": "<p class=\"title\">Exponential Expressions</p>","questions": "<div class=\"question\"><div><span class=\"question-number\">(1)</span> \\((-3)^2 = \\)</div><div><span class=\"question-number\">(2)</span> \\((-4)^3 = \\)</div><div><span class=\"question-number\">(3)</span> \\((-5)^4 = \\)</div><div><span class=\"question-number\">(4)</span> \\(\\left(-\\frac{1}{3}\\right)^2 = \\)</div><div><span class=\"question-number\">(5)</span> \\(\\left(-\\frac{2}{5}\\right)^3 = \\)</div><div><span class=\"question-number\">(6)</span> \\(-(-3)^3 = \\)</div><div><span class=\"question-number\">(7)</span> \\(-(-4)^4 = \\)</div><div><span class=\"question-number\">(8)</span> \\(-\\left(-\\frac{1}{2}\\right)^3 = \\)</div><div><span class=\"question-number\">(9)</span> \\(-6^2 = \\)</div><div><span class=\"question-number\">(10)</span> \\(-7^3 = \\)</div><div><span class=\"question-number\">(11)</span> \\(-\\left(-\\frac{2}{3}\\right)^4 = \\)</div><div><span class=\"question-number\">(12)</spa

In [None]:
      jm=pd.DataFrame(usage_records)

In [19]:
new_df_usage

Unnamed: 0,filename,completion_tokens,prompt_tokens,total_tokens
0,p2,355,665,1020
1,p1,423,665,1088
2,p2,341,665,1006
3,p1,214,665,879


In [30]:
new_df_usage.describe().to_csv('usage.csv',index=True)

In [29]:
read_usage=pd.read_csv('usage.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'usage.csv'

In [28]:
prompt_token_cost = 0.000216  # Cost per prompt token (example)
completion_token_cost = 0.00086551  # Cost per completion token (example)

# Calculate the total spend
total_prompt_tokens = read_usage['prompt_tokens'].sum()
total_completion_tokens = read_usage['completion_tokens'].sum()

total_cost = (total_prompt_tokens * prompt_token_cost) + (total_completion_tokens * completion_token_cost)

print(f"Total Tokens Used: {total_prompt_tokens + total_completion_tokens}")
print(f"Total Spend: Rs {total_cost:.4f}")

Total Tokens Used: 6084.6947054967595
Total Spend: Rs 2.6722
