<a href="https://colab.research.google.com/github/aswinaus/ML/blob/main/Document_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install azure-ai-formrecognizer openai

In [None]:
# Step 1: Parse document using Azure Document Intelligence
from azure.ai.formrecognizer.aio import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from openai import AsyncOpenAI
import asyncio

# Read the API keys from Colab userdata instead of hard-coding them in the notebook
from google.colab import userdata
DOCUMENTINTEL_KEY = userdata.get('DOCUMENTINTEL_KEY')

OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

import nest_asyncio
nest_asyncio.apply()


import os
import json

# Folder whose files will be classified.
directory_path = "/content/drive/MyDrive/ML/Training"

# Everything in the folder, then only the regular files (sub-directories are dropped).
all_entries = os.listdir(directory_path)
document_files = list(
    filter(
        lambda entry: os.path.isfile(os.path.join(directory_path, entry)),
        all_entries,
    )
)

print(document_files)

# Azure Document Intelligence setup
endpoint = "https://documentsclassifier.cognitiveservices.azure.com/"
key = DOCUMENTINTEL_KEY

async def process_document(file_name, document_analysis_client, openai_client):
    """Extract a document's text with Azure Document Intelligence and classify it with GPT.

    Args:
        file_name: Name of a file located in ``directory_path``.
        document_analysis_client: Async ``DocumentAnalysisClient`` used to
            analyze the file with the ``prebuilt-document`` model.
        openai_client: ``AsyncOpenAI`` client used for the chat completion.

    Returns:
        A ``(file_name, outcome)`` tuple. ``outcome`` is the model's reply
        text, ``"Skipped: .odt file"`` for .odt inputs, or
        ``"Error: <message>"`` when extraction or classification raises.
    """
    # Decide on skipping first so no path work is done for ignored files.
    if file_name.lower().endswith('.odt'):
        print(f"Skipping .odt file: {file_name}")
        return file_name, "Skipped: .odt file"

    file_path = os.path.join(directory_path, file_name)

    try:
        # No lock here: a lock constructed inside the call is never shared
        # between tasks, so it cannot serialize anything. Each task reads its
        # own file handle.
        with open(file_path, "rb") as f:
            poller = await document_analysis_client.begin_analyze_document("prebuilt-document", f)
            result = await poller.result()

        # Extract content
        content = result.content

        # Step 2: Send content to GPT for classification
        # Original prompt template - NOT MODIFIED
        prompt = f"""
        Classify the following document text into an appropriate category.
        Also return the number of pages.
        include the precide page number which you think contains the gist of the document.
        If the above exists in more than one page have it displayed as comma separated like 1,2
        Number of Pages should include the over all count of the document
        Return JSON with fields: File Name, Category, Confidence, Description, Number of Pages, Gist Page Number.

        Document:
        {content}
        """

        response = await openai_client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a document classifier. Classify the following document content."},
                {"role": "user", "content": prompt}
            ]
        )
        classification = response.choices[0].message.content
        return file_name, classification

    except Exception as e:
        # Report the failure as this file's result so one bad document does
        # not abort the whole batch when the calls are gathered.
        return file_name, f"Error: {e}"

async def main():
    """Classify every file in ``document_files`` concurrently and print the results as JSON.

    Creates one Azure Document Intelligence client and one OpenAI client,
    runs ``process_document`` for every file name, and prints a mapping of
    file name to outcome. Both clients are closed even if gathering fails.
    """
    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(DOCUMENTINTEL_KEY)
    )
    openai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)

    try:
        results = await asyncio.gather(
            *(
                process_document(file_name, document_analysis_client, openai_client)
                for file_name in document_files
            )
        )
    finally:
        # Close the clients (important for async clients), including when
        # the gather above is cancelled or raises.
        await document_analysis_client.close()
        await openai_client.close()

    # Each result is a (file_name, outcome) pair.
    classification_results = dict(results)

    # Display results in JSON format
    print("Classification Results (JSON):")
    print(json.dumps(classification_results, indent=4))

# Run the async main function
if __name__ == "__main__":
    asyncio.run(main())

# Task
Loop through all documents in "/content/drive/MyDrive/ML/Training", extract content using appropriate libraries for different file types (.csv, .xlsx, .docx, .pptx) and Azure Document Intelligence for others, send the extracted content to GPT for classification, and display the results in a beautified JSON format. Skip .odt files.

## Install necessary libraries

### Subtask:
Install libraries required for reading different file formats (e.g., pandas for CSV/Excel, python-docx for Word, python-pptx for PowerPoint, and potentially a library for handling PDFs if not already covered by Azure).


**Reasoning**:
The subtask requires installing several libraries to handle different file formats. I will use pip to install `pandas`, `python-docx`, and `python-pptx`.



In [None]:
%pip install pandas python-docx python-pptx

## Create modular functions

### Subtask:
Create modular functions for processing different file types (.csv, .xlsx, .docx, .pptx) using appropriate libraries and a function for other file types using Azure Document Intelligence.


**Reasoning**:
Implement the modular functions for processing different file types and adapt the existing Azure Document Intelligence function as instructed.



In [None]:
import pandas as pd
from docx import Document
from pptx import Presentation
from azure.ai.formrecognizer.aio import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from openai import AsyncOpenAI
import asyncio
import os

# Read the API keys from Colab userdata instead of hard-coding them in the notebook
from google.colab import userdata
DOCUMENTINTEL_KEY = userdata.get('DOCUMENTINTEL_KEY')
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

import nest_asyncio
nest_asyncio.apply()

directory_path = "/content/drive/MyDrive/ML/Training"

def process_excel(file_path):
    """Return an Excel file's contents rendered as text.

    The file is loaded with ``pd.read_excel`` using its defaults and the
    resulting frame is rendered with ``to_string``. If anything raises, the
    returned string is an ``"Error processing Excel file: ..."`` message
    instead of the contents.
    """
    try:
        return pd.read_excel(file_path).to_string()
    except Exception as exc:
        return f"Error processing Excel file: {exc}"

def process_csv(file_path):
    """Return a CSV file's contents rendered as text.

    The file is loaded with ``pd.read_csv`` using its defaults and the
    resulting frame is rendered with ``to_string``. If anything raises, the
    returned string is an ``"Error processing CSV file: ..."`` message
    instead of the contents.
    """
    try:
        return pd.read_csv(file_path).to_string()
    except Exception as exc:
        return f"Error processing CSV file: {exc}"

def process_word(file_path):
    """Return the paragraph text of a Word document, one paragraph per line.

    Only ``document.paragraphs`` is read; each paragraph's text is followed
    by a newline. If anything raises, the returned string is an
    ``"Error processing Word file: ..."`` message instead of the contents.
    """
    try:
        doc = Document(file_path)
        return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)
    except Exception as exc:
        return f"Error processing Word file: {exc}"

def process_powerpoint(file_path):
    """Return the text of every text-bearing shape in a PowerPoint file.

    Slides are visited in order and, within each slide, every shape that has
    a ``text`` attribute contributes its text followed by a newline. If
    anything raises, the returned string is an
    ``"Error processing PowerPoint file: ..."`` message instead.
    """
    try:
        deck = Presentation(file_path)
        pieces = [
            shape.text + "\n"
            for slide in deck.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        ]
        return "".join(pieces)
    except Exception as exc:
        return f"Error processing PowerPoint file: {exc}"

async def process_document_intelligence(file_path, document_analysis_client, openai_client):
    """Extract a file's text with Azure Document Intelligence and classify it with GPT.

    Args:
        file_path: Path of the file to analyze.
        document_analysis_client: Async ``DocumentAnalysisClient`` used to
            analyze the file with the ``prebuilt-document`` model.
        openai_client: ``AsyncOpenAI`` client used for the chat completion.

    Returns:
        A ``(base_name, outcome)`` tuple where ``base_name`` is
        ``os.path.basename(file_path)`` and ``outcome`` is either the model's
        reply text or ``"Error: <message>"`` when any step raises.
    """
    base_name = os.path.basename(file_path)
    try:
        # No lock here: a lock constructed inside the call is never shared
        # between tasks, so it cannot serialize anything.
        with open(file_path, "rb") as f:
            poller = await document_analysis_client.begin_analyze_document("prebuilt-document", f)
            result = await poller.result()
        content = result.content

        # Step 2: Send content to GPT for classification
        prompt = f"""
        Classify the following document text into an appropriate category.
        Also return the number of pages.
        include the precide page number which you think contains the gist of the document.
        If the above exists in more than one page have it displayed as comma separated like 1,2
        Number of Pages should include the over all count of the document
        Return JSON with fields: File Name, Category, Confidence, Description, Number of Pages, Gist Page Number.

        Document:
        {content}
        """

        response = await openai_client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a document classifier. Classify the following document content."},
                {"role": "user", "content": prompt}
            ]
        )
        classification = response.choices[0].message.content
        return base_name, classification

    except Exception as e:
        # Report the failure as this file's result so one bad document does
        # not abort the whole batch when the calls are gathered.
        return base_name, f"Error: {e}"

## Update main processing logic

### Subtask:
Modify the main loop to identify the file type based on its extension and call the appropriate processing function for each file.


**Reasoning**:
Modify the main loop to iterate through files, determine their type, and call the appropriate processing function, including handling skipped .odt files.



In [None]:
async def _classify_extracted_text(file_name, content, openai_client):
    """Ask GPT to classify text that was already extracted from a local file.

    Args:
        file_name: Name used to key the result.
        content: Text produced by one of the local ``process_*`` readers.
        openai_client: ``AsyncOpenAI`` client used for the chat completion.

    Returns:
        A ``(file_name, outcome)`` tuple where ``outcome`` is the model's
        reply text or ``"Error: <message>"`` if the request raises. This is
        the same shape ``process_document_intelligence`` returns, so both
        kinds of work can be gathered together.
    """
    prompt = f"""
            Classify the following document text into an appropriate category.
            Also return the number of pages.
            include the precide page number which you think contains the gist of the document.
            If the above exists in more than one page have it displayed as comma separated like 1,2
            Number of Pages should include the over all count of the document
            Return JSON with fields: File Name, Category, Confidence, Description, Number of Pages, Gist Page Number.

            Document:
            {content}
            """
    try:
        response = await openai_client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a document classifier. Classify the following document content."},
                {"role": "user", "content": prompt}
            ]
        )
        return file_name, response.choices[0].message.content
    except Exception as e:
        # Keep one failed request from aborting the whole batch.
        return file_name, f"Error: {e}"


async def main():
    """Classify every file in ``document_files`` and print the results as JSON.

    Files are routed by extension: ``.odt`` is skipped; ``.csv``, ``.xls``,
    ``.xlsx``, ``.docx`` and ``.pptx`` are read locally and their text is sent
    to GPT; everything else goes through ``process_document_intelligence``.
    All classification calls run concurrently and each one returns its own
    ``(file_name, outcome)`` pair, so results never depend on list positions.

    Relies on ``endpoint``, ``document_files`` and ``json`` being defined by
    an earlier cell.
    """
    # Built inside the function so the reader names are resolved at call time.
    local_readers = {
        '.csv': process_csv,
        '.xls': process_excel,
        '.xlsx': process_excel,
        '.docx': process_word,
        '.pptx': process_powerpoint,
    }

    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(DOCUMENTINTEL_KEY)
    )
    openai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)

    # Pre-seed the keys so the printed JSON follows the directory listing order.
    classification_results = dict.fromkeys(document_files)
    pending = []

    try:
        for file_name in document_files:
            file_path = os.path.join(directory_path, file_name)
            file_extension = os.path.splitext(file_name)[1].lower()

            if file_extension == '.odt':
                print(f"Skipping .odt file: {file_name}")
                classification_results[file_name] = "Skipped: .odt file"
                continue

            reader = local_readers.get(file_extension)
            if reader is not None:
                # NOTE(review): the readers return an "Error processing ..."
                # string on failure, and that string is sent for
                # classification like any other content.
                content = reader(file_path)
                pending.append(_classify_extracted_text(file_name, content, openai_client))
            else:
                # Use Azure Document Intelligence for other types (including PDFs and images)
                pending.append(
                    process_document_intelligence(file_path, document_analysis_client, openai_client)
                )

        # Await every coroutine exactly once; each carries its own file name.
        for file_name, outcome in await asyncio.gather(*pending):
            classification_results[file_name] = outcome
    finally:
        # Close the clients even when something above raises.
        await document_analysis_client.close()
        await openai_client.close()

    # Display results in JSON format
    print("Classification Results (JSON):")
    print(json.dumps(classification_results, indent=4))

# Run the async main function
if __name__ == "__main__":
    asyncio.run(main())