# Extract text

**Import PDFMiner**

In [1]:
from pdfminer.layout import LAParams
import pandas as pd

In [2]:
from pdfminer.high_level import extract_text
def extract_text_from_pdf(pdf_path):
    laparams = LAParams()
    text = extract_text(pdf_path,laparams)
    return text.split("\f")

In [3]:
extract_text_from_pdf('Q3-2024-Press-Release-English.pdf')

['Canadian Tire Corporation Reports Third Quarter 2024 Results; Announces Annual \n\nDividend Increase for 15th Consecutive Year and Share Repurchase Intention \n\nToronto, November 7, 2024 – Canadian Tire Corporation, Limited (TSX:CTC, TSX: CTC.A) (CTC \nor the Company) today released its third quarter results for the period ended September 28, 2024.  \n\n•  Consolidated  comparable  sales1  trend  improved  compared  to  Q2  2024;  consolidated \n\ncomparable sales were down 1.5% compared to Q3 2023.  \n\n•  Diluted and Normalized Earnings Per Share1 (EPS) were $3.59, compared to $(1.19) in Q3 \n\n2023, and up 21.3% from $2.96 on a normalized basis. \n\n•  Annualized  dividend  increased  from  $7.00  to  $7.10  per  share,  alongside  an  intention  to \n\nrepurchase up to $200.0 million of Class A Non-Voting Shares in 2025.  \n\n“We delivered strong retail profitability for the third consecutive quarter and sales trends improved,” \n\nsaid  Greg  Hicks,  President  and  CEO,  Canad

## Extract text across pages

In [4]:
import re
def normalize_text(text):
    return re.sub(r'\s+', ' ', text.strip()).lower()

def extract_text_between_words_across_pages(text_pages, start_word, end_word):
    # Normalize words
    start_word = normalize_text(start_word)
    end_word = normalize_text(end_word)
    extracted_texts = []
    combined_text = "\n".join([normalize_text(page) for page in text_pages])
    pattern = re.compile(re.escape(start_word) + '(.*?)' + re.escape(end_word), re.S)

    for match in pattern.finditer(combined_text):
        start_page, end_page = None, None
        start_index, end_index = match.start(), match.end()

        cumulative_length = 0
        for i, page in enumerate(text_pages):
            page_length = len(page)
            if start_page is None and start_index < cumulative_length + page_length:
                start_page = i + 1
            if end_page is None and end_index <= cumulative_length + page_length:
                end_page = i + 1
            cumulative_length += page_length + len("\n")
            if start_page is not None and end_page is not None:
                break

        extracted_texts.append({
            'start_page': start_page,
            'end_page': end_page,
            'start_word': start_word,
            'end_word': end_word,
            'extracted_text': match.group(1).strip()
        })
    return extracted_texts


Extract text by sections

In [5]:
start_end_pairs = [
    ('THIRD-QUARTER HIGHLIGHTS', 'products are set to roll out during 2025.'),
    ('CONSOLIDATED OVERVIEW','Company in the quarter.'),
    ('RETAIL SEGMENT OVERVIEW','impacted the Retail segment in the quarter.'),
    ('FINANCIAL SERVICES OVERVIEW','Services segment in the quarter.'),
    ('CT REIT OVERVIEW','November 5, 2024.'),
    ('CAPITAL ALLOCATION','million to $575 million.'),
    ('QUARTERLY DIVIDEND','for tax purposes.'),
    ('SHARE REPURCHASES','to regulatory approvals.'),
    ('1) NON-GAAP FINANCIAL MEASURES','Owned brand penetration'),
    ('FORWARD-LOOKING STATEMENTS', 'by applicable securities laws.'),
    ('CONFERENCE CALL','at this website for 12 months.'),
    ('ABOUT CANADIAN TIRE CORPORATION','visit Corp.CanadianTire.ca.'),
    ('FOR MORE INFORMATION','karen.keyes@cantire.com')
    # Add more pairs as needed
]

import os
def process_files_in_folder(file_name, start_end_pairs):
    all_extracted_texts = []
    if file_name.endswith('.pdf'):
      text_pages = extract_text_from_pdf(file_name)
      for start_word, end_word in start_end_pairs:
        extracted_texts = extract_text_between_words_across_pages(text_pages, start_word, end_word)
        all_extracted_texts.extend(extracted_texts)

    return all_extracted_texts

all_extracted_texts = process_files_in_folder('Q3-2024-Press-Release-English.pdf', start_end_pairs)

In [6]:
all_extracted_texts

[{'start_page': 1,
  'end_page': 3,
  'start_word': 'third-quarter highlights',
  'end_word': 'products are set to roll out during 2025.',
  'extracted_text': '• consolidated comparable sales were down 1.5%; sportchek grew for the first quarter since q2 2023, which partially offset declines at canadian tire retail (ctr) and mark’s. o ctr comparable sales1 were down 2.2%, compared to q3 2023. customers continued to prioritize essential categories including automotive, which continued to perform well against a strong quarter in q3 2023, led by growth in automotive service. o sportchek comparable sales1 were up 2.9%, marking two consecutive quarters in which sportchek outperformed industry trends. targeted promotional events and\nimproved customer experience continued to be a focus and contributed to growth in athletic footwear and hockey categories. o mark’s comparable sales1 were down 2.3%, led by industrial wear declines, which were partially offset by growth in men’s shorts and t-shir

# Text summarizer
Summarize text by section
Lets Create a text summarizer using Cosine Distance

In [19]:
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
import re

def read_article(article):
    sentences = []
    for sentence in article.split('. '):
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split())
    return sentences

def sentence_similarity(sent1, sent2, stop_words=None):
    if stop_words is None:
        stop_words = []

    sent1 = [w.lower() for w in sent1 if w not in stop_words]
    sent2 = [w.lower() for w in sent2 if w not in stop_words]

    all_words = list(set(sent1 + sent2))

    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)

    # Build the vector for the first sentence
    for w in sent1:
        if w in stop_words:
            continue
        vector1[all_words.index(w)] += 1

    # Build the vector for the second sentence
    for w in sent2:
        if w in stop_words:
            continue
        vector2[all_words.index(w)] += 1

    return 1 - cosine_distance(vector1, vector2)

def build_similarity_matrix(sentences, stop_words):
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))

    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2:  # ignore if both are the same sentence
                continue
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

    return similarity_matrix

def generate_summary(article, top_n=5):
    stop_words = stopwords.words('english')
    summarize_text = []

    # Step 1 - Read text and split it
    sentences = read_article(article)

    # Step 2 - Generate Similarity Matrix across sentences
    sentence_similarity_matrix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences in similarity matrix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort the rank and pick top sentences
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    print("Indexes of top ranked_sentence order are ", ranked_sentence)

    for i in range(top_n):
        summarize_text.append(" ".join(ranked_sentence[i][1]))

    # Step 5 - Output the summarize text
    print("Summarized Text: \n", ". ".join(summarize_text))
    return summarize_text


In [8]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Andrea
[nltk_data]     FS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Testing summarizer in sections

In [9]:
for section in all_extracted_texts:
    print(f"Summary for section starting with '{section['start_word']}' and ending with '{section['end_word']}':")
    generate_summary(section['extracted_text'], top_n=2)
    print("\n")

Summary for section starting with 'third-quarter highlights' and ending with 'products are set to roll out during 2025.':
Indexes of top ranked_sentence order are  [(0.07653128487177972, ['•', 'consolidated', 'comparable', 'sales', 'were', 'down', '1.5%;', 'sportchek', 'grew', 'for', 'the', 'first', 'quarter', 'since', 'q2', '2023,', 'which', 'partially', 'offset', 'declines', 'at', 'canadian', 'tire', 'retail', '(ctr)', 'and', 'mark’s']), (0.07537800341474762, ['•', 'improved', 'retail', 'profitability', 'led', 'to', 'higher', 'consolidated', 'income', 'before', 'income', 'taxes', '(ibt)', 'at', '$299.3', 'million,', 'an', 'increase', 'of', '$230.0', 'million', 'and', '$33.0', 'million', 'on', 'a', 'normalized', 'basis1', 'compared', 'to', 'the', 'prior', 'year']), (0.06414638177243122, ['•', 'in-store', 'net', 'promoter', 'score', '(nps)', 'was', 'up', 'across', 'the', 'company’s', 'banners,', 'including', 'ctr;', 'store', 'investments', 'and', 'a', 'focus', 'on', 'strong', 'in-stock

## Saving and structuring summarizations

In [23]:
import json

processed_data = {
    'document_sections': []
}

for section in all_extracted_texts:
    processed_data['document_sections'].append({
        'start_word': section['start_word'],
        'end_word': section['end_word'],
        'summary': generate_summary(section['extracted_text'], top_n=1),
        'full_text': section['extracted_text']
    })



Indexes of top ranked_sentence order are  [(0.07653128487177972, ['•', 'consolidated', 'comparable', 'sales', 'were', 'down', '1.5%;', 'sportchek', 'grew', 'for', 'the', 'first', 'quarter', 'since', 'q2', '2023,', 'which', 'partially', 'offset', 'declines', 'at', 'canadian', 'tire', 'retail', '(ctr)', 'and', 'mark’s']), (0.07537800341474762, ['•', 'improved', 'retail', 'profitability', 'led', 'to', 'higher', 'consolidated', 'income', 'before', 'income', 'taxes', '(ibt)', 'at', '$299.3', 'million,', 'an', 'increase', 'of', '$230.0', 'million', 'and', '$33.0', 'million', 'on', 'a', 'normalized', 'basis1', 'compared', 'to', 'the', 'prior', 'year']), (0.06414638177243122, ['•', 'in-store', 'net', 'promoter', 'score', '(nps)', 'was', 'up', 'across', 'the', 'company’s', 'banners,', 'including', 'ctr;', 'store', 'investments', 'and', 'a', 'focus', 'on', 'strong', 'in-stock', 'availability', 'of', 'key', 'brands', 'continued', 'to', 'drive', 'improvements', 'in', 'positive', 'customer', 'senti

In [24]:
processed_data

{'document_sections': [{'start_word': 'third-quarter highlights',
   'end_word': 'products are set to roll out during 2025.',
   'summary': ['• consolidated comparable sales were down 1.5%; sportchek grew for the first quarter since q2 2023, which partially offset declines at canadian tire retail (ctr) and mark’s'],
   'full_text': '• consolidated comparable sales were down 1.5%; sportchek grew for the first quarter since q2 2023, which partially offset declines at canadian tire retail (ctr) and mark’s. o ctr comparable sales1 were down 2.2%, compared to q3 2023. customers continued to prioritize essential categories including automotive, which continued to perform well against a strong quarter in q3 2023, led by growth in automotive service. o sportchek comparable sales1 were up 2.9%, marking two consecutive quarters in which sportchek outperformed industry trends. targeted promotional events and\nimproved customer experience continued to be a focus and contributed to growth in athlet

In [25]:
# Save to a JSON file for chatbot usage
with open('processed_document.json', 'w') as file:
    json.dump(processed_data, file, indent=4)

# Scrapping tables

Tables on pages: 6,7,8,9 and 10 (3 tables on page 7)

In [11]:
import locale

In [12]:
# Function to set locale safely
def set_locale(locale_name):
    try:
        locale.setlocale(locale.LC_ALL, locale_name)
    except locale.Error:
        print(f"Locale {locale_name} not supported on this system. Using default locale.")
        locale.setlocale(locale.LC_ALL, '')

# Set the locale (adjust as necessary for your system)
set_locale('en_US.UTF-8')  # Common locale setting for Unix-like systems
locale._override_localeconv = {'n_sign_posn':1}

In [13]:
# Format as currency
def format_currency(value):
    if pd.isna(value):
        return ''
    return locale.currency(value, grouping=True).split('.')[0]

import tabula
# Extract tables from the PDF

# Path to the PDF file
pdf_path = 'Q3-2024-Press-Release-English.pdf'
table1 = tabula.read_pdf(pdf_path, pages=6, multiple_tables=True)


Failed to import jpype dependencies. Fallback to subprocess.
No module named 'jpype'


In [14]:
table1[0]

Unnamed: 0,Net income,$,220.7 $,(27.8) $,540.2 $,141.9
0,Net income attributable to shareholders,,200.6,(66.4),476.2,40.8
1,Add normalizing items:,,,,,
2,DC fire expense (recovery),$,— $,(96.4) $,— $,8.4
3,GST/HST-related charge1,,—,—,—,24.7
4,Change in fair value of redeemable financial i...,,—,328.0,—,328.0
5,Normalized Net income,$,220.7 $,203.8 $,540.2 $,503.0
6,Normalized Net income attributable to sharehol...,$,200.6 $,165.2 $,476.2 $,396.9
7,Normalized Diluted EPS,$,3.59 $,2.96 $,8.54 $,7.0


In [15]:
# Check if a table was found
if table1:
    for i, table_df in enumerate(table1):

        existing_headers = table_df.columns
        table_df.columns = existing_headers

        table_df = table_df.drop(0)  # Drop the first row which is now the header
        table_df = table_df.reset_index(drop=True)  # Reset the index

         # Clean and process the DataFrame as needed
        numeric_columns = ['Q3 2024','Q3 2023','Q3 2024','Q3 2023']

        for col in numeric_columns:
          table_df[col] = table_df[col].str.replace(',', '')
          table_df[col] = table_df[col].str.replace('(', '-')
          table_df[col] = table_df[col].str.replace(')', '')
          table_df[col] = pd.to_numeric(table_df[col], errors='coerce')
          table_df[col] = table_df[col].apply(format_currency)
else:
    print("No tables found.")

KeyError: 'Q3 2024'

In [None]:
table_df.head(30)

Unnamed: 0,"(Canadian $ in millions, except as noted)",Q2-2024,Q1-2024,Q2-2023,YTD-2024,YTD-2023
0,Net interest income,"$4,515","$4,721","$4,814","$9,236","$8,835"
1,Non-interest revenue,"$3,459","$2,951","$2,975","$6,410","$4,053"
2,Revenue,"$7,974","$7,672","$7,789","$15,646","$12,888"
3,Provision for credit losses,-$705,-$627,"-$1,023","-$1,332","-$1,240"
4,Non-interest expense,"-$4,844","-$5,389","-$5,501","-$10,233","-$9,883"
5,Income before income taxes,"$2,425","$1,656","$1,265","$4,081","$1,765"
6,Provision for income taxes,-$559,-$364,-$236,-$923,-$603
7,Net income,"$1,866","$1,292","$1,029","$3,158","$1,162"
8,Diluted EPS ($),$2,$1,$1,$4,$1
9,Adjusting Items Impacting Revenue (Pre-tax),,,,,


In [None]:
#another way : Tabula - First Step

import tabula

# Path to the PDF file
pdf_path = 'Q3-2024-Press-Release-English.pdf'
table = tabula.read_pdf(pdf_path, pages=6, multiple_tables=False)
# Check if a table was found
if table:
    for i, table_df in enumerate(table):

        existing_headers = table_df.columns
        new_headers = table_df.iloc[0]

        combined_headers = []

        for existing, new in zip(existing_headers, new_headers):
          combined_headers.append(f'{existing}_{new}')  # Combine existing and new headers

        table_df.columns = new_headers

        #table_df = table_df.drop(0)  # Drop the first row which is now the header
        table_df = table_df.reset_index(drop=True)  # Reset the index

else:
    print("No tables found.")

In [None]:
table_df.head(30)

Unnamed: 0,Net income attributable to shareholders,NaN,200.6,(66.4),476.2,40.8
0,Add normalizing items:,,,,,
1,DC fire expense (recovery),$,— $,(96.4) $,— $,8.4
2,GST/HST-related charge1,,—,—,—,24.7
3,Change in fair value of redeemable financial i...,,—,328.0,—,328.0
4,Normalized Net income,$,220.7 $,203.8 $,540.2 $,503.0
5,Normalized Net income attributable to sharehol...,$,200.6 $,165.2 $,476.2 $,396.9
6,Normalized Diluted EPS,$,3.59 $,2.96 $,8.54 $,7.0


Improving Tabula Results

In [None]:
import tabula
import pandas as pd

# Path to the PDF file
pdf_path = 'Q3-2024-Press-Release-English.pdf'

# Format as currency
def format_currency(value):
    if pd.isna(value):
        return ''
    return locale.currency(value, grouping=True).split('.')[0]

tables = tabula.read_pdf(pdf_path, pages=6, multiple_tables=False)

if tables:
    for i, table_df in enumerate(tables):

        # Assume the first row contains headers
        new_headers = table_df.iloc[0]
        table_df.columns = new_headers

        # Drop the first row which is now the header
        table_df = table_df.drop(0).reset_index(drop=True)

        if 'Management Markets' in table_df.columns:
            # Split the merged column into two separate columns
            split_cols = table_df['Management Markets'].str.split(expand=True)

            # Rename the new columns appropriately
            split_cols.columns = ['Management', 'Markets']

            # Drop the old merged column and concatenate the new columns
            table_df = table_df.drop(columns=['Management Markets'])
            table_df = pd.concat([table_df, split_cols], axis=1)

        # Clean and process the DataFrame as needed
        numeric_columns = ['Canadian P&C', 'U.S. P&C', 'Total P&C', 'Management', 'Markets', 'Services', 'Total Bank', '(US$ in millions)']

        for col in numeric_columns:
          table_df[col] = table_df[col].str.replace(',', '')
          table_df[col] = table_df[col].str.replace('(', '-')
          table_df[col] = table_df[col].str.replace(')', '')
          table_df[col] = pd.to_numeric(table_df[col], errors='coerce')
          table_df[col] = table_df[col].apply(format_currency)
else:
    print("No tables found.")

In [None]:
actual_columns = ['(Canadian $ in millions, except as noted)','Canadian P&C', 'U.S. P&C', 'Total P&C', 'BMO Wealth Management', 'BMO Capital Markets', 'Corporate Services', 'Total Bank', 'U.S. Segment (US$ in millions)']

table_df.rename(columns={'Management': 'BMO Wealth Management', 'Markets': 'BMO Capital Markets', 'Services' : 'Corporate Services', '(US$ in millions)': 'U.S. Segment (US$ in millions)'}, inplace=True)
table_df = table_df[actual_columns]

In [None]:
table_df.head(15)

Unnamed: 0,"(Canadian $ in millions, except as noted)",Canadian P&C,U.S. P&C,Total P&C,BMO Wealth Management,BMO Capital Markets,Corporate Services,Total Bank,U.S. Segment (US$ in millions)
0,Q2-2024,,,,,,,,
1,Reported net income (loss),$872,$543,"$1,415",$320,$459,-$328,"$1,866",$559
2,Acquisition and integration costs,$2,,$2,,$2,$22,$26,$17
3,Amortization of acquisition-related intangible...,$3,$69,$72,$2,$5,,$79,$54
4,Legal provision (including related interest ex...,,,,,,,,
5,and legal fees),,,,,,$12,$12,$9
6,Impact of FDIC special assessment,,,,,,$50,$50,$37
7,Adjusted net income (loss) (2),$877,$612,"$1,489",$322,$466,-$244,"$2,033",$676
8,Q1-2024,,,,,,,,
9,Reported net income (loss),$921,$560,"$1,481",$240,$393,-$822,"$1,292",$184


In [None]:
for col in table_df.columns:
    # Combine rows by indices for specific rows
    combined_row = table_df.iloc[4, table_df.columns.get_loc(col)] + ' ' + table_df.iloc[5, table_df.columns.get_loc(col)]
    combined_row = combined_row.replace('- -', '-')

    # Assign the combined row to the DataFrame
    table_df.iloc[4, table_df.columns.get_loc(col)] = combined_row

# Drop the redundant row
table_df = table_df.drop(index=5).reset_index(drop=True)

for col in table_df.columns:
    # Combine rows by indices for specific rows
    combined_row = table_df.iloc[11, table_df.columns.get_loc(col)] + ' ' + table_df.iloc[12, table_df.columns.get_loc(col)]
    combined_row = combined_row.replace('- -', '-')

    # Assign the combined row to the DataFrame
    table_df.iloc[11, table_df.columns.get_loc(col)] = combined_row

# Drop the redundant row
table_df = table_df.drop(index=12).reset_index(drop=True)

In [None]:
table_df.head(15)

Unnamed: 0,"(Canadian $ in millions, except as noted)",Canadian P&C,U.S. P&C,Total P&C,BMO Wealth Management,BMO Capital Markets,Corporate Services,Total Bank,U.S. Segment (US$ in millions)
0,Q2-2024,,,,,,,,
1,Reported net income (loss),$872,$543,"$1,415",$320,$459,-$328,"$1,866",$559
2,Acquisition and integration costs,$2,,$2,,$2,$22,$26,$17
3,Amortization of acquisition-related intangible...,$3,$69,$72,$2,$5,,$79,$54
4,Legal provision (including related interest ex...,,,,,,$12,$12,$9
5,Impact of FDIC special assessment,,,,,,$50,$50,$37
6,Adjusted net income (loss) (2),$877,$612,"$1,489",$322,$466,-$244,"$2,033",$676
7,Q1-2024,,,,,,,,
8,Reported net income (loss),$921,$560,"$1,481",$240,$393,-$822,"$1,292",$184
9,Acquisition and integration costs,$1,,$1,,$10,$46,$57,$39
