In [None]:
!pip install PyPDF2 requests
!pip install PyPDF2 pdf2image pytesseract requests
!apt-get install -y poppler-utils tesseract-ocr

In [None]:
import PyPDF2
from pdf2image import convert_from_path
import pytesseract
import requests
import re
import os

# Function to extract text from PDF (supports both text-based and image-based)
def extract_text_from_pdf(pdf_file):
    """Return the text of a PDF, falling back to OCR when no text layer is found.

    The embedded text layer is read with PyPDF2 first. If that yields nothing
    but whitespace (or fails), every page is rendered to an image with
    pdf2image and passed through Tesseract.

    Args:
        pdf_file: Path to the PDF file.

    Returns:
        The extracted text, one newline appended after each page. May be an
        empty string if neither method produces any text.

    Raises:
        Errors raised by ``convert_from_path`` or ``pytesseract`` during the
        OCR fallback are not caught here and propagate to the caller.
    """
    page_texts = []

    # First try to extract the embedded text layer using PyPDF2.
    try:
        with open(pdf_file, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                extracted = page.extract_text()
                if extracted:
                    page_texts.append(extracted + "\n")
    except Exception as exc:
        # Best effort: keep whatever pages were read before the failure and
        # report the problem instead of hiding it, then let the OCR fallback
        # below decide whether more work is needed.
        print(f"Text-layer extraction failed for {pdf_file}: {exc}")

    text = "".join(page_texts)

    # If no usable text was found, OCR each rendered page with Tesseract.
    if not text.strip():
        images = convert_from_path(pdf_file)
        text += "".join(
            pytesseract.image_to_string(image) + "\n" for image in images
        )

    return text

# Function to analyze sentiment using ICL, CoT, and simulated RAG with ChatGPT API
def analyze_sentiment_with_cot_and_rag(text, api_key, timeout=60):
    """Ask the OpenAI chat completions API for a sentiment analysis of *text*.

    The prompt combines few-shot examples (ICL), a fixed block of "retrieved"
    phrases (simulated RAG), and a chain-of-thought instruction asking for a
    score between -1 and 1.

    Args:
        text: The passage to analyze; it is inserted verbatim into the prompt.
        api_key: OpenAI API key sent as a Bearer token.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The content of the first returned choice as a string, or ``None`` if
        the request fails, the status code is not 200, or the response body
        does not have the expected shape.
    """
    headers = {
        'Authorization': f'Bearer {api_key}',
        'Content-Type': 'application/json',
    }

    # Few-shot examples (ICL)
    icl_examples = """
    Example 1: "The company performed exceptionally well this quarter." -> Sentiment Score: 0.9 (very positive)
    Example 2: "The audit revealed significant risks and operational inefficiencies." -> Sentiment Score: -0.7 (negative)
    Example 3: "The company met its compliance requirements, but there are still areas for improvement." -> Sentiment Score: 0.2 (neutral)
    Now, analyze the sentiment of the following sentence.
    """

    # CoT prompt
    cot_prompt = f"""
    Based on the given examples and retrieved information, break down the sentiment analysis into reasoning steps.
    Then provide a sentiment score between -1 (very negative) to 1 (very positive).
    \nSentence: "{text}"
    """

    # Simulated RAG info
    rag_information = """
    Retrieved external knowledge:
    Positive phrases in audit reports include: "effective controls," "positive cash flow," "in compliance."
    Negative phrases include: "significant risks," "material weakness," "operational inefficiencies."
    Neutral phrases include: "met compliance requirements," "satisfactory performance," "adequate financial control."
    """

    data = {
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "user", "content": f"{icl_examples}\n{rag_information}\n{cot_prompt}"}
        ],
        # NOTE(review): 150 tokens may cut the reasoning off before the final
        # score line is produced -- confirm this limit is intentional.
        "max_tokens": 150
    }

    # Without a timeout a stalled connection would block forever; network
    # failures are reported and mapped to None like every other failure.
    try:
        response = requests.post(
            'https://api.openai.com/v1/chat/completions',
            headers=headers,
            json=data,
            timeout=timeout,
        )
    except requests.RequestException as exc:
        print(f"API request failed: {exc}")
        return None

    if response.status_code != 200:
        print(f"API Error: {response.status_code}")
        return None

    try:
        return response.json()['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError, ValueError):
        # ValueError: body is not valid JSON; the others: unexpected shape.
        return None

# Main function
def main(pdf_file_path="/content/dec17.pdf", api_key=None):
    """Score the sentiment of a PDF chunk by chunk and print the average.

    The PDF text is split into fixed-size character chunks, each chunk is sent
    to ``analyze_sentiment_with_cot_and_rag``, every reply is printed, and the
    number following "Sentiment Score: " in each reply is collected. The mean
    of the collected scores is printed at the end.

    Args:
        pdf_file_path: Path of the PDF to analyze.
        api_key: OpenAI API key. When ``None``, the ``OPENAI_API_KEY``
            environment variable is used, falling back to a blank placeholder.
    """
    if api_key is None:
        # Prefer the environment over a key pasted into the source.
        api_key = os.environ.get("OPENAI_API_KEY", " ")

    pdf_text = extract_text_from_pdf(pdf_file_path)

    # Split into manageable chunks
    chunk_size = 2000
    text_chunks = [pdf_text[i:i + chunk_size] for i in range(0, len(pdf_text), chunk_size)]

    # Match a well-formed number only, so trailing punctuation such as the
    # full stop in "Sentiment Score: 0.2." cannot reach float() and raise.
    score_pattern = re.compile(r"Sentiment Score: (-?\d*\.?\d+)")

    sentiment_scores = []
    for chunk in text_chunks:
        if not chunk.strip():
            continue  # nothing to analyze; avoid a pointless API call
        result = analyze_sentiment_with_cot_and_rag(chunk, api_key)
        if not result:
            continue
        print(result)
        match = score_pattern.search(result)
        if match:
            sentiment_scores.append(float(match.group(1)))

    if sentiment_scores:
        overall_score = sum(sentiment_scores) / len(sentiment_scores)
        print(f"\nOverall Sentiment Score of PDF: {round(overall_score, 3)}")
    else:
        print("\nNo sentiment scores could be extracted from the PDF.")

# Run the analysis only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
poppler-utils is already the newest version (22.02.0-2ubuntu0.7).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.
Reasoning Steps:
1. The audit report is from Deloitte Haskins & Sells, Chartered Accountants, indicating a professional audit firm.
2. The report is addressed to the Board of Directors of Central Depository Services (India) Limited.
3. The auditors mention that they have audited the financial results for the quarter and nine months ended December 31, 2017.
4. The audit was conducted in accordance with the Standards on Auditing issued by the Institute of Chartered Accountants of India.
5. The auditors state their responsibility to express an opinion on the financial statement based on their audit.
6. The audit report does not contain specific positive or negative phrases from the retrieved external kno