<a href="https://colab.research.google.com/github/arunkumargopidinne/Enhancement_Restaurant_app/blob/main/PDF_Reader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install PyPDF2 transformers sentence_transformers nltk

import PyPDF2
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import nltk
from nltk.corpus import stopwords
import re

# Fetch the NLTK resources used below: the stop-word corpus (read in
# refine_information) and the punkt tokenizer data (requested here for
# nltk.sent_tokenize in extract_key_information).
for _nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file as one string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages. Pages that yield no text contribute nothing.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # A page without an extractable text layer can produce a falsy result;
        # treat it as empty so one such page cannot abort the whole document.
        # Joining once also avoids repeated string concatenation on long PDFs.
        return "".join(page.extract_text() or "" for page in reader.pages)

def summarize_text(text, max_length=500):
    """Summarize the input text using the default transformers summarization pipeline.

    Args:
        text: Text to summarize.
        max_length: Upper bound on the summary length passed to the model.

    Returns:
        The generated summary string, or "" when ``text`` is empty or only
        whitespace (the model is not loaded in that case).
    """
    if not text or not text.strip():
        return ""

    summarizer = pipeline("summarization")
    summary = summarizer(
        text,
        max_length=max_length,
        # Avoid very short summaries, but never request a minimum that is
        # larger than the caller's maximum.
        min_length=min(100, max_length),
        do_sample=False,
        # Inputs longer than the model's input limit would otherwise fail;
        # truncate them instead.
        truncation=True,
    )
    return summary[0]['summary_text']


def extract_key_information(text):
    """Group sentences of ``text`` by the keyword categories they mention.

    The text is split into sentences with ``nltk.sent_tokenize``; a sentence is
    added to a category when any of that category's keywords occurs in it,
    case-insensitively, at the start of a word. A sentence may appear in
    several categories.

    Args:
        text: Full document text.

    Returns:
        A dict with the keys "growth_prospects", "key_changes", "key_triggers"
        and "material_effects", each mapping to the list of matching original
        sentences in document order. All lists are empty for empty input.
    """
    # Define keywords and patterns (more can be added)
    growth_keywords = ["growth", "expansion", "increase", "rise", "double", "triple", "upswing", "progress", "development", "potential", "future"]
    change_keywords = ["change", "new", "updated", "revised", "restructuring", "merger", "acquisition", "strategy shift", "innovation"]
    trigger_keywords = ["trigger", "catalyst", "driver", "opportunity", "risk", "challenge", "milestone", "event"]
    material_effect_keywords = ["impact", "affect", "influence", "result", "consequence", "material", "significant", "substantial", "earnings", "profit", "revenue", "performance", "outlook", "forecast"]

    keywords_by_category = {
        "growth_prospects": growth_keywords,
        "key_changes": change_keywords,
        "key_triggers": trigger_keywords,
        "material_effects": material_effect_keywords,
    }
    key_info = {category: [] for category in keywords_by_category}
    if not text:
        return key_info

    # Anchor every keyword at a word start. Plain substring tests matched
    # keywords buried inside unrelated words ("rise" in "enterprises", "new"
    # in "renewal", "event" in "prevent"). Leaving the end open still accepts
    # inflections such as "increased" or "risks".
    patterns = {
        category: re.compile(
            r"\b(?:" + "|".join(re.escape(keyword) for keyword in keywords) + r")"
        )
        for category, keywords in keywords_by_category.items()
    }

    # Sentence Segmentation
    for sentence in nltk.sent_tokenize(text):
        lowered = sentence.lower()  # lowercase once, reuse for every category
        for category, pattern in patterns.items():
            if pattern.search(lowered):
                key_info[category].append(sentence)

    return key_info

def refine_information(key_info):
    """Normalize every extracted sentence for display/analysis.

    Each sentence has its punctuation removed, is lowercased, and has English
    stop words dropped. No sentence is removed: every input sentence yields
    exactly one output string (possibly empty), in the same order.

    Args:
        key_info: Mapping of category name to a list of sentences.

    Returns:
        A new dict with the same categories, each mapping to the list of
        cleaned sentences.
    """
    if not any(key_info.values()):
        # Nothing to clean, so skip loading the stop-word corpus.
        return {category: [] for category in key_info}

    # Loop invariants: load the stop-word list and compile the pattern once
    # rather than once per sentence.
    stop_words = frozenset(stopwords.words('english'))
    punctuation = re.compile(r'[^\w\s]')

    def clean(sentence):
        # Basic cleaning (can be improved)
        words = punctuation.sub('', sentence).lower().split()
        return " ".join(word for word in words if word not in stop_words)

    return {
        category: [clean(sentence) for sentence in sentences]
        for category, sentences in key_info.items()
    }


# Lazily created sentiment pipeline, shared by all analyze_sentiment calls so
# the model is loaded once instead of once per call.
_SENTIMENT_ANALYZER = None


def analyze_sentiment(sentences, analyzer=None):
    """Analyze the sentiment of each sentence.

    Args:
        sentences: Iterable of sentence strings.
        analyzer: Optional callable taking a sentence and returning a sequence
            whose first item is the result to store. Defaults to a shared
            ``pipeline("sentiment-analysis")`` created on first use.

    Returns:
        A dict mapping each sentence to its result. Duplicate sentences share
        one entry. When the analyzer fails for a sentence, the error is printed
        and ``{"label": "Neutral", "score": 0.5}`` is stored instead.
    """
    global _SENTIMENT_ANALYZER

    sentiment_results = {}
    if not sentences:
        # Nothing to score, so do not load the model.
        return sentiment_results

    if analyzer is None:
        if _SENTIMENT_ANALYZER is None:
            _SENTIMENT_ANALYZER = pipeline("sentiment-analysis")
        analyzer = _SENTIMENT_ANALYZER

    for sentence in sentences:
        try:
            result = analyzer(sentence)
            sentiment_results[sentence] = result[0]  # Store sentiment label and score
        except Exception as e:
            # Deliberate best effort: one failing sentence must not abort the batch.
            print(f"Sentiment analysis error: {e}")
            sentiment_results[sentence] = {"label": "Neutral", "score": 0.5}  # Default to neutral

    return sentiment_results



def main(pdf_path):
    """Run the extraction pipeline on a single PDF.

    Steps: read the PDF text, pick out keyword-bearing sentences per category,
    clean those sentences, then score the sentiment of each cleaned sentence.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A tuple ``(refined_info, sentiment_info)``: the cleaned sentences per
        category, and per category a mapping of cleaned sentence to sentiment.
    """
    document_text = extract_text_from_pdf(pdf_path)
    # Optional: summarize_text(document_text) could be applied before extracting key info.
    refined_info = refine_information(extract_key_information(document_text))

    # NOTE(review): sentiment is scored on the cleaned (stop-word-stripped)
    # sentences, not the original wording - confirm this is intended.
    sentiment_info = {
        category: analyze_sentiment(cleaned_sentences)
        for category, cleaned_sentences in refined_info.items()
    }

    return refined_info, sentiment_info


# Example usage: point this at the PDF to analyze.
pdf_file_path = "/content/SJS Transcript Call.pdf" # Example path
refined_info, sentiment_info = main(pdf_file_path)

# Report the cleaned sentences, grouped by category.
print("Refined Key Information:")
for category in refined_info:
    sentences = refined_info[category]
    print(f"\n{category.upper()}:")
    for sentence in sentences:
        print(f"- {sentence}")

# Report the sentiment label and score of every cleaned sentence.
print("\nSentiment Analysis:")
for category in sentiment_info:
    sentiments = sentiment_info[category]
    print(f"\n{category.upper()} Sentiment:")
    for sentence, sentiment in sentiments.items():
        print(f"- {sentence}: {sentiment['label']} ({sentiment['score']})")



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu
No model was 

Refined Key Information:

GROWTH_PROSPECTS:
- enterprises limited _______________________ thabraz hushain w company secretary compliance officer membership
- a51119 encl sjs enterprises limited q1 fy202 4 earnings conference call july 27 202 3 analyst mr ronak mehta jm financial management mr ka
- joseph managing director cofounder mr sanjay thapar ceo executive director mr mahendra naredi chief financial officer ms dev anshi dhruv head investor relations sjs enterprises july 27 2023 moderator ladies gentlemen good day welcome sjs enterprises 1q fy24 earnings conference call hosted jm financial
- ronak mehta jm financial institutional securities welcome 1q fy2 4 earnings call sjs enterprises
- sanjay take industry view business performance also give strategic outloo k future growth company end
- sjs enterprises july 27 2023 sanjay mah endra take presentation later im happy inform completed transformative acquisit ion walter pack india within set timeline
- successful acquisition exot e