In [1]:
%pip install selenium

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install webdriver_manager

Note: you may need to restart the kernel to use updated packages.


In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementClickInterceptedException, TimeoutException, NoSuchElementException
import time
from selenium.webdriver.common.by import By

In [4]:
class ExtractAndDownloadPdf:
    """Drive Chrome through Selenium to download the report PDFs linked from a page.

    Typical use: construct, call ``initialize_webdriver()``, optionally
    ``close_cookie_dialog()``, then ``download_shareholder_report()``, and finally
    ``driver.quit()``. The caller owns the driver's lifetime; no method in this
    class quits the browser.

    :param default_directory_path: Directory Chrome should save downloads into.
    :param url: Page that contains the report links.
    """

    # XPath of the anchors whose text identifies a shareholder report.
    REPORT_LINK_XPATH = "//a[contains(text(),'Report to shareholders')]"
    # Seconds to wait for each link to become visible before skipping it.
    VISIBILITY_TIMEOUT = 25
    # Seconds to pause after each click so Chrome has time to start the download.
    CLICK_PAUSE = 3
    # Seconds to wait for in-progress downloads to finish after the last click.
    DOWNLOAD_TIMEOUT = 120

    def __init__(self, default_directory_path, url):
        self.default_directory_path = default_directory_path
        self.url = url

    def initialize_options(self):
        """Build Chrome options that save PDFs to disk instead of opening them."""
        self.options = Options()
        self.options.add_experimental_option('prefs', {
            'download.default_directory': self.default_directory_path,  # Path to save the files
            'download.prompt_for_download': False,  # Automatically download without asking
            # NOTE(review): the original comment claimed this replaces existing
            # files; confirm against the Chrome preference documentation.
            'download.directory_upgrade': True,
            'plugins.always_open_pdf_externally': True,  # Disable the built-in PDF viewer
        })

    def initialize_webdriver(self):
        """Start Chrome with the download options and open ``self.url``."""
        self.initialize_options()
        # Set up the Selenium WebDriver with ChromeDriverManager
        self.driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.options)

        self.driver.get(self.url)  # Open the quarterly reports page

        self.driver.maximize_window()  # Maximize the window
        # The implicit wait governs element lookups (find_element/find_elements);
        # it is not a page-load wait.
        self.driver.implicitly_wait(10)

    def close_cookie_dialog(self):
        """Dismiss the OneTrust cookie banner when it is present in the page."""
        try:
            # find_element raises NoSuchElementException when the banner is absent,
            # so reaching the second lookup already means the banner exists.
            self.driver.find_element(By.XPATH, "//div[@id='onetrust-banner-sdk']")
            self.driver.find_element(By.XPATH, "//div[@id='onetrust-close-btn-container']/button").click()
        except NoSuchElementException:
            print("No cookie banner found or already closed.")

    # Backward-compatible alias for the original (misspelled) method name.
    close_cookie_diailog = close_cookie_dialog

    def _wait_for_downloads(self, timeout=None, poll_interval=0.5):
        """Wait until the download directory holds no ``*.crdownload`` files.

        Chrome keeps a ``.crdownload`` file while a download is in progress.

        :param timeout: Maximum seconds to wait (defaults to ``DOWNLOAD_TIMEOUT``).
        :param poll_interval: Seconds between directory checks.
        :return: True when no partial downloads remain, False on timeout.
        """
        from pathlib import Path

        if timeout is None:
            timeout = self.DOWNLOAD_TIMEOUT
        download_dir = Path(self.default_directory_path)
        deadline = time.monotonic() + timeout
        while True:
            if not any(download_dir.glob('*.crdownload')):
                return True
            if time.monotonic() >= deadline:
                return False
            time.sleep(poll_interval)

    def download_shareholder_report(self):
        """Click every 'Report to shareholders' link so Chrome downloads the PDFs.

        A link that does not become visible within ``VISIBILITY_TIMEOUT`` seconds
        is reported and skipped; the remaining links are still processed. The
        driver is left running so the caller decides when to quit it.

        :return: Number of links that were clicked.
        """
        # find_elements returns an empty list (it never raises
        # NoSuchElementException), so the "nothing found" case is checked here.
        pdf_links = self.driver.find_elements(By.XPATH, self.REPORT_LINK_XPATH)
        if not pdf_links:
            print("NoSuchElementException: No 'Report to shareholders' link found.")
            return 0

        clicked = 0
        for link in pdf_links:
            try:
                self.driver.execute_script("arguments[0].scrollIntoView();", link)

                # Explicitly wait for the element to be visible
                WebDriverWait(self.driver, self.VISIBILITY_TIMEOUT).until(EC.visibility_of(link))

                # Click through JavaScript so overlays cannot intercept the click
                self.driver.execute_script("arguments[0].click();", link)
            except TimeoutException:
                print("TimeoutException: Element not found or clickable within the time limit.")
                continue

            clicked += 1
            time.sleep(self.CLICK_PAUSE)  # Give Chrome a moment to start the download

        # Do not hand control back (and let the caller quit the browser) while
        # files are still being written.
        if clicked and not self._wait_for_downloads():
            print("Some downloads were still in progress when the wait timed out.")
        return clicked

# Directory Chrome saves the downloaded PDFs into
default_directory_path = 'C:\\Users\\MIS\\Lambton\\NLP\\Assignment-1'

# URL of the page to scrape
url = 'https://www.cibc.com/en/about-cibc/investor-relations/quarterly-results.html'

extractAndDownloadPdf = ExtractAndDownloadPdf(default_directory_path, url)
try:
    extractAndDownloadPdf.initialize_webdriver()
    extractAndDownloadPdf.close_cookie_diailog()
    extractAndDownloadPdf.download_shareholder_report()
finally:
    # Always release the browser, even when a step above raises. The attribute
    # check covers the case where Chrome never started, so no driver exists.
    if getattr(extractAndDownloadPdf, 'driver', None) is not None:
        extractAndDownloadPdf.driver.quit()


TimeoutException: Element not found or clickable within the time limit.


In [5]:
%pip install tabula-py




In [6]:
import tabula

class ExtractPdfContent:
    """Extract the tables of a quarterly-report PDF with tabula.

    :param pdf_name: File name or path of the PDF. The first two characters of
        the file name (e.g. ``'q3'`` for ``'q324report-en.pdf'``) are used as
        the key under which the extracted tables are stored.
    """

    def __init__(self, pdf_name):
        # PureWindowsPath splits on both '\\' and '/', so the quarter tag comes
        # from the file name whether a bare name, a Windows path or a POSIX-style
        # path is passed (slicing the raw string would yield e.g. 'C:').
        from pathlib import PureWindowsPath

        self.pdf_name = pdf_name
        self.dfs_dict = {}
        self.quater_name = PureWindowsPath(pdf_name).name[:2]

    @property
    def quarter_name(self):
        """Correctly spelled, read-only alias of ``quater_name``."""
        return self.quater_name

    def extract_table(self):
        """Read the tables from every page and store them under the quarter key.

        The result of ``tabula.read_pdf`` is stored in
        ``self.dfs_dict[self.quater_name]``; nothing is returned.
        """
        self.dfs_dict[self.quater_name] = tabula.read_pdf(
            self.pdf_name,
            pages='all',
            force_subprocess=True,
            stream=True,
        )

# Build the extractor for the Q3 2024 report and parse its tables.
extract_pdf_content = ExtractPdfContent(pdf_name='q324report-en.pdf')
extract_pdf_content.extract_table()

In [7]:
# Show the table at index 3 of the tables parsed for this quarter. Direct
# indexing raises a KeyError naming the quarter if extraction has not been run,
# instead of an opaque TypeError on None.
extract_pdf_content.dfs_dict[extract_pdf_content.quater_name][3]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,months ended,Unnamed: 5,Unnamed: 6,months ended.1
0,,,2024,,2024,2023,,2024,2023
1,Unaudited,,Jul. 31,,Apr. 30,Jul. 31 (1),,Jul. 31,Jul. 31 (1)
2,Financial results ($ millions),,,,,,,,
3,Net interest income,$,3532,$,3281,"$ 3,236",$,10062,"$ 9,628"
4,Non-interest income,,3072,,2883,2616,,8927,7857
...,...,...,...,...,...,...,...,...,...
63,Leverage ratio,,4.3 %,,4.3 %,4.2 %,,4.3 %,4.2 %
64,Liquidity coverage ratio (LCR),,126 %,,129 %,131 %,,,
65,Net stable funding ratio (NSFR),,116 %,,115 %,117 %,,116 %,117 %
66,Other information,,,,,,,,


In [8]:
%pip install pymupdf

Collecting pymupdfNote: you may need to restart the kernel to use updated packages.

  Downloading PyMuPDF-1.24.11-cp38-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.11-cp38-abi3-win_amd64.whl (16.0 MB)
   ---------------------------------------- 0.0/16.0 MB ? eta -:--:--
   ------- -------------------------------- 2.9/16.0 MB 15.2 MB/s eta 0:00:01
   ----------------- ---------------------- 7.1/16.0 MB 19.0 MB/s eta 0:00:01
   --------------------------------- ------ 13.4/16.0 MB 22.7 MB/s eta 0:00:01
   ---------------------------------------- 16.0/16.0 MB 21.4 MB/s eta 0:00:00
Installing collected packages: pymupdf
Successfully installed pymupdf-1.24.11


In [12]:
import fitz  # PyMuPDF

class PDFTextExtractor:
    """Extract the plain text of a PDF with PyMuPDF (fitz).

    Can be driven step by step (``open_pdf`` / ``extract_text`` / ``close_pdf``)
    or used as a context manager, which opens on entry and closes on exit.
    """

    def __init__(self, pdf_path):
        """
        Initialize the PDFTextExtractor class with the path to the PDF file.

        :param pdf_path: Path to the PDF file
        """
        self.pdf_path = pdf_path
        self.pdf_document = None
        self.pdf_text = ''

    def __enter__(self):
        """Open the PDF and return the extractor itself."""
        self.open_pdf()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the PDF; exceptions from the ``with`` body are not suppressed."""
        self.close_pdf()
        return False

    def open_pdf(self):
        """
        Open the PDF file using PyMuPDF (fitz).

        On failure the error is printed and ``self.pdf_document`` stays ``None``.
        """
        # Release any previously opened document so re-opening does not leak it.
        self.close_pdf()
        try:
            self.pdf_document = fitz.open(self.pdf_path)
        except Exception as e:
            print(f"Error opening PDF: {e}")

    def extract_text(self):
        """
        Extract the text of all pages of the PDF and store it in self.pdf_text.

        The stored text is rebuilt from scratch on every call, so calling this
        method more than once does not duplicate the content.
        """
        if self.pdf_document is None:
            print("PDF file is not opened. Please open the PDF file first.")
            return

        # "text" mode returns the page's plain text. Text that sits inside tables
        # is included as well, one cell per line.
        page_texts = [
            self.pdf_document.load_page(page_num).get_text("text")
            for page_num in range(len(self.pdf_document))
        ]
        self.pdf_text = ''.join(page_texts)

    def close_pdf(self):
        """
        Close the PDF file after extracting the text.

        Safe to call repeatedly: the handle is dropped after closing, so later
        calls are no-ops and ``extract_text`` reports that no PDF is open.
        """
        if self.pdf_document is not None:
            self.pdf_document.close()
            self.pdf_document = None

    def get_text_preview(self, num_characters=1000):
        """
        Return a preview of the extracted text (first 'num_characters' characters).

        :param num_characters: Number of characters to return as preview (default 1000)
        :return: Preview of the extracted text
        """
        return self.pdf_text[:num_characters]

    def write_text_to_file(self, output_path):
        """
        Write the extracted text to a UTF-8 text file.

        :param output_path: Path to the output text file.
        """
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(self.pdf_text)
        print(f"Text successfully written to {output_path}")

# Example Usage:
if __name__ == "__main__":
    # Create an instance of PDFTextExtractor with the PDF path
    extractor = PDFTextExtractor('C:\\Users\\MIS\\chat-bot\\q324report-en.pdf')

    # Open the PDF (open_pdf prints the error itself when this fails)
    extractor.open_pdf()
    try:
        # Skip extraction when opening failed, so an empty file is never
        # written over a previous good extraction.
        if extractor.pdf_document is not None:
            # Extract text from the PDF
            extractor.extract_text()

            # Get and print a preview of the extracted text (first 1000 characters)
            print(extractor.get_text_preview(1000))

            # Write the extracted text to a file.
            # NOTE(review): this path is relative to the working directory;
            # confirm it is the location later steps read the text from.
            extractor.write_text_to_file('extracted_text.txt')
    finally:
        # Close the PDF file even if extraction or writing raised
        extractor.close_pdf()


 
 
 
Report to Shareholders for the Third Quarter, 2024 
www.cibc.com
August 29, 2024 
Report of the President and Chief Executive Officer 
Overview of results 
CIBC today announced its financial results for the third quarter ended July 31, 2024. 
Third quarter highlights 
 
 
Q3/24 
  
Q3/23 (1) 
  
Q2/24 
  
YoY 
Variance 
 
  
QoQ 
Variance 
 
Revenue 
 
$6,604 million   
$5,852 million   
$6,164 million   
+13% 
  
+7% 
 
Reported Net Income 
 
$1,795 million   
$1,432 million   
$1,749 million   
+25% 
  
+3% 
 
Adjusted Net Income (2) 
 
$1,895 million   
$1,475 million   
$1,718 million   
+28% 
  
+10% 
 
Adjusted pre-provision, pre-tax earnings (2) 
 
$2,939 million   
$2,602 million   
$2,690 million   
+13% 
  
+9% 
 
Reported Diluted Earnings Per Share (EPS) 
 
$1.82 
  
$1.47 
  
$1.79 
  
+24% 
  
+2% 
 
Adjusted Diluted EPS (2) 
 
$1.93 
  
$1.52 
  
$1.75 
  
+27% 
  
+10% 
 
Reported Return on Common Shareholders’ Equity (ROE) (3)  
13.2% 
  
11.6% 
  
13.7% 
  
 
  


In [14]:
import re

class PDFTextCleaner:
    """Strip page furniture, footnote markers and figures from extracted PDF text."""

    # Any run of spaces, tabs or newlines.
    _WHITESPACE_RE = re.compile(r'\s+')
    # Running page footer, e.g. "CIBC THIRD QUARTER 2024 5" (any quarter).
    _PAGE_FOOTER_RE = re.compile(r'CIBC (?:FIRST|SECOND|THIRD|FOURTH) QUARTER \d{4} \d+')
    # A single sentence of the form "For additional information ... statement(s)."
    # [^.] keeps the match inside one sentence. Without that bound, on text that
    # has been collapsed to one line, the match would run on to the next
    # "statement" anywhere later in the document.
    _DISCLAIMER_RE = re.compile(r'For additional information[^.]*?\bstatements?\.', re.IGNORECASE)
    # Footnote references such as (1), (2).
    _FOOTNOTE_REF_RE = re.compile(r'\(\d+\)')
    # Dollar amounts with an optional magnitude word, or percentages. Only known
    # magnitude words are consumed, so the ordinary word that follows an amount
    # (as in "$1.82 for the quarter") is left in place.
    _FIGURE_RE = re.compile(
        r'\$(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?'
        r'(?:\s*(?:thousand|million|billion|trillion)\b)?'
        r'|[+-]?\d+(?:\.\d+)?\s?%',
        re.IGNORECASE,
    )

    def __init__(self, text):
        """
        Initialize the PDFTextCleaner with the extracted text.

        :param text: Raw text extracted from the PDF.
        """
        self.text = text

    def remove_non_content(self):
        """
        Remove non-content elements such as page footers, disclaimers,
        footnote references and figures.

        The cleaned result replaces ``self.text``.

        :return: The cleaned text.
        """
        # Collapse whitespace first so the patterns below can match phrases that
        # were split across lines in the PDF.
        cleaned = self._WHITESPACE_RE.sub(' ', self.text)

        # Apply the removals in a fixed order: footers, disclaimers, footnote
        # references, then dollar amounts and percentages.
        for pattern in (
            self._PAGE_FOOTER_RE,
            self._DISCLAIMER_RE,
            self._FOOTNOTE_REF_RE,
            self._FIGURE_RE,
        ):
            cleaned = pattern.sub('', cleaned)

        # The removals leave doubled spaces behind; normalise once more.
        self.text = self._WHITESPACE_RE.sub(' ', cleaned).strip()
        return self.text

    def get_cleaned_text(self):
        """
        Return the cleaned text.
        """
        return self.remove_non_content()

# Read back the raw text that the extraction step saved to disk
with open('C:\\Users\\MIS\\chat-bot\\extracted_text.txt', mode='r', encoding='utf-8') as raw_file:
    extracted_text = raw_file.read()

# Run the cleaning pass over the raw text
cleaner = PDFTextCleaner(text=extracted_text)
cleaned_text = cleaner.get_cleaned_text()

# Save the cleaned result next to the raw text file
with open('C:\\Users\\MIS\\chat-bot\\cleaned_text.txt', mode='w', encoding='utf-8') as cleaned_file:
    cleaned_file.write(cleaned_text)

print("Cleaned text successfully written to 'cleaned_text.txt'.")

Cleaned text successfully written to 'cleaned_text.txt'.
