In [None]:
import os
import PyPDF2
import re
import reimport
import PyPDF2
import pandas as pd
import nltk

# Helper function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """
    Extract the text of every page of a PDF file.

    Args:
        pdf_file: Path to the PDF file to read.

    Returns:
        The text of all pages that yielded any text, in page order, each
        followed by a newline. An empty string if no page yielded text.

    Raises:
        OSError: If the file cannot be opened.
    """
    page_texts = []
    with open(pdf_file, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            page_text = page.extract_text()
            # Skip pages with no extractable text (empty result, or None
            # should the library return it) instead of failing on concat.
            if page_text:
                # The trailing newline keeps the last word of one page from
                # fusing with the first word of the next page.
                page_texts.append(page_text + "\n")
    return ''.join(page_texts)

# Helper function to clean extracted text
def clean_text(text):
    """
    Normalize text to lowercase ASCII letters separated by single spaces.

    Every character that is not a-z or whitespace (after lowercasing) is
    replaced by a space, whitespace runs are collapsed to one space, and
    leading/trailing whitespace is removed.
    """
    substitutions = (
        (r'[^a-z\s]', ' '),  # drop digits, punctuation, non-ASCII letters
        (r'\s+', ' '),       # collapse whitespace runs
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Helper function to count word frequency
def count_word_frequency(text, word):
    """
    Count case-insensitive, whole-token occurrences of a word in a text.

    The text is split on whitespace, so punctuation attached to a token
    (e.g. "ira,") prevents a match; pass text that has already been
    stripped of punctuation if that matters.

    Args:
        text: Text to search.
        word: Word to count.

    Returns:
        Number of whitespace-separated tokens equal to `word`, ignoring case.
    """
    target = word.lower()
    # Lowercase the text as well as the word so that mixed-case input is not
    # silently under-counted.
    return text.lower().split().count(target)

# PDF reports to scan (paths are relative to the working directory)
pdf_files = [
    r'2024.Sep.pdf',
    r'2022.Sep.pdf',
    r'2023.Sep.pdf',
]

# Maps each successfully processed PDF path to its 'ira' token count
frequency_results = dict()

# Walk the reports in order; missing files and per-file failures are reported
# and do not stop the remaining files from being processed
for pdf_file in pdf_files:
    if os.path.exists(pdf_file):
        try:
            raw_text = extract_text_from_pdf(pdf_file)
            cleaned_text = clean_text(raw_text)
            frequency = count_word_frequency(cleaned_text, 'ira')
            frequency_results[pdf_file] = frequency
            print(f"In file {pdf_file}, 'IRA' appears: {frequency}")
        except Exception as exc:
            print(f"Error processing file {pdf_file}: {exc}")
    else:
        print(f"File does not exist: {pdf_file}")

In [None]:
# Now, extract sentences containing both "IRA" and "risk"
# Download the tokenizer models used for sentence splitting. Recent NLTK
# releases load the 'punkt_tab' resource in sent_tokenize, while older ones
# load the pickled 'punkt' model, so fetch both to work on either.
nltk.download('punkt')
nltk.download('punkt_tab')

def extract_text_from_pdf(file_path):
    """
    Read a PDF and return the concatenated text of its pages.

    Pages for which no text is extracted are skipped; every other page's
    text is followed by a newline.
    """
    chunks = []
    with open(file_path, 'rb') as handle:
        for page in PyPDF2.PdfReader(handle).pages:
            content = page.extract_text()
            if not content:
                continue
            chunks.append(content + "\n")
    return "".join(chunks)

def preprocess_text(text):
    """
    Flatten text onto a single line: each newline becomes one space.
    """
    lines = text.split('\n')
    return ' '.join(lines)

def split_into_sentences(text):
    """
    Break text into sentences with NLTK's sent_tokenize and return the
    tokenizer's result unchanged.
    """
    sentences = nltk.sent_tokenize(text)
    return sentences

def filter_ira_risk_sentences(sentences):
    """
    Keep the sentences that mention both "IRA" and "risk" (case-insensitive).

    "IRA" must appear as a whole word, so words that merely contain those
    letters ("expiration", "viral", "desirable") do not qualify. "risk" is
    matched as a substring so that "risks", "risky" and "de-risk" still count.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        List of the qualifying sentences, in their original order.
    """
    matches = []
    for s in sentences:
        lowered = s.lower()
        # \b...\b restricts the match to the standalone word "ira".
        if 'risk' in lowered and re.search(r'\bira\b', lowered):
            matches.append(s)
    return matches

def extract_ira_phrases(sentence, window=2):
    """
    Extract the phrases surrounding each mention of "IRA" in a sentence.

    The sentence is split on whitespace. For every token that contains
    "IRA" as a whole word (case-insensitive; attached punctuation such as
    "IRA,", "(IRA)" or "IRA's" is tolerated), the phrase is that token plus
    up to `window` tokens on each side, clipped at the sentence boundaries.

    Args:
        sentence: Sentence to scan.
        window: Number of tokens to include before and after each mention.

    Returns:
        List of phrases, one per mention, in order of appearance. Empty if
        the sentence has no standalone "IRA".

    Example:
        Sentence: "The new IRA policy may affect the market"
        Window=2 extracts: "The new IRA policy may"
    """
    words = sentence.split()
    phrases = []
    for i, word in enumerate(words):
        # Word-boundary search rather than equality, so punctuation glued to
        # the token does not hide the mention, while words that merely
        # contain the letters ("viral", "expiration") are still rejected.
        if re.search(r'\bira\b', word.lower()):
            start = max(0, i - window)
            end = min(len(words), i + window + 1)
            phrases.append(' '.join(words[start:end]))
    return phrases

# PDF reports to scan (adjust the paths if the files live elsewhere)
pdf_files = [
    r'2024.Sep.pdf',
    r'2022.Sep.pdf',
    r'2023.Sep.pdf',
]

# One record per matching sentence: the source PDF, the sentence itself, and
# the IRA-centred phrases found inside it
data = []

for pdf_file in pdf_files:
    if os.path.exists(pdf_file):
        try:
            # PDF -> flat text -> sentences -> sentences with IRA and risk
            raw_text = extract_text_from_pdf(pdf_file)
            preprocessed_text = preprocess_text(raw_text)
            sentences = split_into_sentences(preprocessed_text)
            filtered_sentences = filter_ira_risk_sentences(sentences)

            for sentence in filtered_sentences:
                ira_phrases = extract_ira_phrases(sentence, window=2)
                record = {
                    "PDF File": pdf_file,
                    "IRA & Risk Sentence": sentence.strip(),
                    "IRA Phrases": ', '.join(ira_phrases),
                }
                data.append(record)
        except Exception as exc:
            print(f"Error processing file {pdf_file}: {exc}")
    else:
        print(f"File does not exist: {pdf_file}")

# Show the collected records as a table, or report that nothing matched
if not data:
    print("No IRA-related sentences containing 'risk' were extracted.")
else:
    df = pd.DataFrame(data)
    # Lift the column-width cap so full sentences and phrases are printed
    pd.set_option('display.max_colwidth', None)
    print("Extracted IRA-related sentences containing 'risk' and their IRA phrases:")
    print(df)