<a href="https://colab.research.google.com/github/bhimabasheer/Function-and-triggers/blob/main/Untitled9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# Import necessary libraries
import re
import spacy
from transformers import pipeline, AutoTokenizer
import pandas as pd
from PyPDF2 import PdfReader
from google.colab import files

# Upload the file
uploaded = files.upload()
pdf_path = list(uploaded.keys())[0]  # Automatically gets the uploaded file path

# Load the spaCy language model
nlp = spacy.load("en_core_web_sm")

# Initialize transformer-based pipeline for summarization
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

# Define function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() + "\n"
    return text

# Define function to split transcript into sections
def split_sections(text):
    sections = re.split(r"(Moderator:|Question:|Answer:|Response:)", text)
    sections = [section.strip() for section in sections if section.strip()]
    return sections

# Define function to classify sections
def classify_sections(sections):
    key_sections = {"Growth Prospects": [], "Key Changes": [], "Triggers": [], "Earnings Impact": []}
    growth_keywords = ["growth", "expansion", "market", "opportunities", "strategy"]
    change_keywords = ["acquisition", "restructure", "merger", "addition"]
    trigger_keywords = ["new customer", "order", "contract", "cross-sell"]
    earnings_keywords = ["earnings", "profit", "loss", "financial impact", "revenue"]

    for section in sections:
        for keyword in growth_keywords:
            if keyword in section.lower():
                key_sections["Growth Prospects"].append(section)
                break
        for keyword in change_keywords:
            if keyword in section.lower():
                key_sections["Key Changes"].append(section)
                break
        for keyword in trigger_keywords:
            if keyword in section.lower():
                key_sections["Triggers"].append(section)
                break
        for keyword in earnings_keywords:
            if keyword in section.lower():
                key_sections["Earnings Impact"].append(section)
                break

    return key_sections

# Define function to summarize text in manageable chunks
def summarize_text_in_chunks(text, max_tokens=512):
    sentences = text.split(". ")
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(tokenizer.encode(sentence))
        if current_length + sentence_length <= max_tokens:
            current_chunk.append(sentence)
            current_length += sentence_length
        else:
            chunks.append(". ".join(current_chunk) + ".")
            current_chunk = [sentence]
            current_length = sentence_length
    if current_chunk:
        chunks.append(". ".join(current_chunk) + ".")

    summarized_text = ""
    for chunk in chunks:
        summary = summarizer(chunk, max_length=100, min_length=30, do_sample=False)
        summarized_text += summary[0]['summary_text'] + " "

    return summarized_text.strip()

# Summarize sections by processing them in chunks
def summarize_sections(key_sections):
    summarized_data = {}
    for category, sections in key_sections.items():
        summarized_text = " ".join(sections)
        summarized_data[category] = summarize_text_in_chunks(summarized_text)
    return summarized_data

# Run the pipeline on the uploaded document
def extract_investor_insights(pdf_path):
    text = extract_text_from_pdf(pdf_path)
    sections = split_sections(text)
    key_sections = classify_sections(sections)
    summarized_data = summarize_sections(key_sections)
    return summarized_data

insights = extract_investor_insights(pdf_path)

# Convert results to DataFrame for easy viewing or exporting
insights_df = pd.DataFrame(list(insights.items()), columns=["Category", "Summary"])
print(insights_df)


Saving SJS Transcript Call.pdf to SJS Transcript Call (7).pdf
           Category                                            Summary
0  Growth Prospects  SJS Enterprises 1Q FY24 Earnings Conference Ca...
1       Key Changes  SJS Enterprises 1Q FY24 Earnings Conference Ca...
2          Triggers  SJS Enterprises 1Q FY24 Earnings Conference Ca...
3   Earnings Impact  Transcripts of Analysts/Investor Meet/ Earning...


In [3]:
pip install PyPDF2


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
