In [70]:
# pip install tabula-py

In [71]:
import pandas as pd
import tabula
import numpy as np
from tabula import read_pdf

# Common Extraction and Preprocessing Logic

## Extract tables

In [72]:
def extract_table_from_pdf(pdf_path, page_number, area=None, multiple_tables=False, stream=True):
    """Read the table(s) on one PDF page and promote the first row to the header.

    Parameters
    ----------
    pdf_path : path of the PDF, forwarded to tabula's ``read_pdf``.
    page_number : forwarded to ``read_pdf`` as ``pages``.
    area : optional area, forwarded unchanged to ``read_pdf``.
    multiple_tables : when False the first non-empty table is returned;
        when True every non-empty table is stacked into one DataFrame.
    stream : forwarded to ``read_pdf``.

    Returns
    -------
    pandas.DataFrame
        The table with its first row used as column labels, or an empty
        DataFrame (after printing a notice) when no usable table was found.
    """
    tables = read_pdf(
        pdf_path,
        pages=page_number,
        area=area,
        multiple_tables=multiple_tables,
        stream=stream
    )

    # Normalise to a list: truth-testing a bare DataFrame would raise ValueError.
    if isinstance(tables, pd.DataFrame):
        tables = [tables]

    promoted_tables = []
    for table in tables or []:
        # A table without rows has no header row to promote (iloc[0] would raise).
        if table.empty:
            continue

        promoted = table.iloc[1:].reset_index(drop=True)
        promoted.columns = table.iloc[0]

        if not multiple_tables:
            return promoted
        promoted_tables.append(promoted)

    if not promoted_tables:
        print(f"No tables found on page {page_number}.")
        return pd.DataFrame()

    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(promoted_tables, ignore_index=True)

## Page 1 

In [73]:
def raname_page1_columns(table_df, column_renames=None, unnamed_column_start=1):
    """Give NaN-labelled columns positional names, then apply an optional rename map.

    A column whose label is NaN is renamed to its position as a string, counting
    from ``unnamed_column_start``. This relabelling is written back onto
    ``table_df`` itself. When ``column_renames`` is given, a renamed copy is
    returned; otherwise ``table_df`` is returned.

    Raises
    ------
    ValueError
        If ``table_df`` is empty.
    """
    if table_df.empty:
        raise ValueError("The input DataFrame is empty.")

    # Walk the labels with a running position so unnamed columns get "1", "2", ...
    relabelled = []
    position = unnamed_column_start
    for label in table_df.columns:
        relabelled.append(str(position) if pd.isna(label) else label)
        position += 1
    table_df.columns = relabelled

    if not column_renames:
        return table_df
    return table_df.rename(columns=column_renames)

In [74]:
import numpy as np

def process_and_unpivot_table_p1(table):
    """Unpivot the page-1 table into long ``text`` / ``Value`` rows.

    The first data row is dropped, list-valued cells are joined into strings,
    and every column except the first is melted. ``text`` is
    ``"<first-column value> - <original column name>"``; ``Value`` is the cell
    with ``$``, ``%``, commas and the word "million" removed, converted to a
    number (anything unparseable becomes NaN).

    Parameters
    ----------
    table : pandas.DataFrame
        Wide table whose first column holds the row labels. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``['text', 'Value']``, or an empty DataFrame if melting fails.
    """
    # Build a new frame first so the caller's column index is left untouched.
    table = table.iloc[1:].reset_index(drop=True)
    table.columns = table.columns.astype(str)

    def _flatten(cell):
        # Lists become a single comma-separated string.
        return ', '.join(map(str, cell)) if isinstance(cell, list) else cell

    # DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1);
    # fall back to applymap only on older versions that lack DataFrame.map.
    elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
    table = elementwise(table, _flatten)

    # The first column identifies each row.
    id_col = table.columns[0]

    try:
        unpivoted_table = pd.melt(
            table,
            id_vars=[id_col],
            var_name="Column Name",
            value_name="Value"
        )
    except ValueError as e:
        # Best-effort: report the problem and hand back an empty frame.
        print(f"Error while melting: {e}")
        print(f"Columns: {table.columns}")
        return pd.DataFrame()

    unpivoted_table['text'] = unpivoted_table[id_col] + " - " + unpivoted_table["Column Name"]

    cleaned_values = (
        unpivoted_table['Value']
        .astype(str)
        .str.replace(r'[\$%,]', '', regex=True)
        .str.replace('million', '', regex=False)
        .str.strip()
    )

    # errors='coerce' maps the 'nan' strings produced by astype(str), and any other
    # non-numeric text, to NaN, so no separate replace step (or np.NaN, which
    # NumPy 2.0 removed) is needed.
    unpivoted_table['Value'] = pd.to_numeric(cleaned_values, errors='coerce')

    return unpivoted_table[['text', 'Value']]

## Page 3 to 7

In [75]:
def split_merged_columns(table, column_index, new_column_name=None):
    """Append a copy of the column at ``column_index`` as a new column.

    The new column is named ``new_column_name`` or, when that is empty,
    ``"New <original header>"``. ``table`` is modified in place and returned.
    """
    source_header = table.columns[column_index]
    target_name = new_column_name if new_column_name else f"New {source_header}"

    # Positional lookup, so the copy comes from the column at column_index.
    table[target_name] = table.iloc[:, column_index]
    return table

In [76]:
def format_columns(df, fourth_column_index=3, last_column_index=-1):
    """Clean two columns and keep one space-separated token from each cell.

    Both columns are cast to text and stripped of ``$``, commas and surrounding
    whitespace (non-breaking spaces become ordinary spaces). A cell that still
    contains a space then keeps only its first token (column at
    ``fourth_column_index``) or its last token (column at ``last_column_index``).
    ``df`` is modified in place and returned.
    """
    def _strip_currency(cell):
        if isinstance(cell, str):
            return cell.replace('$', '').replace(',', '').replace('\xa0', ' ').strip()
        return cell

    def _token_picker(token_position):
        def _pick(cell):
            if isinstance(cell, str) and ' ' in cell:
                return cell.split(' ')[token_position]
            return cell
        return _pick

    # The fourth column is handled before the last one, keeping first / last token.
    for column_index, token_position in ((fourth_column_index, 0), (last_column_index, -1)):
        column_name = df.columns[column_index]
        cleaned = df[column_name].astype(str).apply(_strip_currency)
        df[column_name] = cleaned.apply(_token_picker(token_position))

    return df

In [77]:
def process_table_headers_and_convert(table_df, num_header_rows=4, handle_merged_columns=False):
    """Merge multi-row headers into single column names and convert data to numbers.

    The first ``num_header_rows`` rows are joined column-wise with spaces and
    prefixed with the existing column label (unless that label is ``'nan'``).
    Columns whose merged name is empty or starts with ``'Unnamed'`` are dropped,
    the header rows are removed, and every column after the first is stripped of
    ``,``, ``$`` and ``)``, has ``(`` turned into ``-``, and is converted with
    ``pd.to_numeric(errors='coerce')``.

    With ``handle_merged_columns=True`` the two merged segment headers are first
    renamed to their individual segment names and ``format_columns`` is applied.

    Returns
    -------
    pandas.DataFrame
        New frame; the first column is left as-is, the others are numeric.
    """
    # Existing labels as text; a NaN label turns into the literal 'nan'.
    existing_labels = table_df.columns.astype(str).fillna('')

    # Header rows with blanks instead of NaN, joined top-to-bottom per column.
    header_block = table_df.iloc[:num_header_rows].fillna('')
    stacked_labels = header_block.apply(lambda x: ' '.join(x.astype(str)).strip(), axis=0)

    merged_labels = []
    for existing, stacked in zip(existing_labels, stacked_labels):
        if existing != 'nan':
            merged_labels.append(f"{existing} {stacked}".strip())
        else:
            merged_labels.append(stacked)

    def _is_named(label):
        trimmed = label.strip()
        return bool(trimmed) and not trimmed.startswith('Unnamed')

    # Keep only the columns that ended up with a real name.
    keep_mask = [_is_named(label) for label in merged_labels]
    table_df = table_df.loc[:, keep_mask]
    table_df.columns = [label for label, keep in zip(merged_labels, keep_mask) if keep]

    # The header rows are now encoded in the column names.
    table_df = table_df.drop(range(num_header_rows)).reset_index(drop=True)

    if handle_merged_columns:
        table_df = table_df.rename(
            columns={
                'U.S. Capital Commercial Markets Banking and Direct and Wealth Financial Management Services':
                'U.S. Commercial Banking and Wealth Management',
                'New U.S. Capital Commercial Markets Banking and Direct and Wealth Financial Management Services':
                'Capital Markets and Direct Financial Services'
            }
        )
        table_df = format_columns(table_df)

    # Strip formatting characters and convert every column after the first one.
    for column in table_df.columns[1:]:
        as_text = (
            table_df[column].astype(str)
            .str.replace(',', '', regex=False)
            .str.replace('(', '-', regex=False)
            .str.replace(')', '', regex=False)
            .str.replace('$', '', regex=False)
        )
        table_df[column] = pd.to_numeric(as_text, errors='coerce')

    return table_df

In [78]:
def melt_table_with_descriptive_text(table_df):
    """Unpivot a wide table into ``text`` / ``value`` rows.

    The first column is used as the row label; every other column is melted.
    ``text`` is ``"<column header> - <row label>"``.

    Raises
    ------
    ValueError
        If ``table_df`` is empty.
    """
    if table_df.empty:
        raise ValueError("The input DataFrame is empty.")

    label_column = table_df.columns[0]
    long_df = table_df.melt(
        id_vars=[label_column],
        var_name='header',
        value_name='value',
    )

    # Header first, then the row label.
    long_df['text'] = long_df['header'] + " - " + long_df[label_column]

    return long_df[['text', 'value']]

In [79]:
def merge_matching_rows(df, phrases, primary_phrase, third_row_value_index=2):
    """Collapse the rows whose ``text`` is one of ``phrases`` into a single row.

    The merged text is the first phrase followed by the remaining matched texts
    with the ``"<primary_phrase> - "`` prefix removed, joined by spaces. The
    merged value is taken from the matched row at ``third_row_value_index``.

    Parameters
    ----------
    df : pandas.DataFrame with ``text`` and ``value`` columns.
    phrases : list of exact ``text`` values to merge; the first entry keeps its prefix.
    primary_phrase : prefix removed from every matched text except ``phrases[0]``.
    third_row_value_index : position, among the matched rows, of the row whose
        value is kept. If it is out of range for the rows actually matched, the
        last non-NaN matched value is used instead of raising IndexError.

    Returns
    -------
    pandas.DataFrame
        Non-matching rows followed by the merged row, with a fresh index. When
        no row matches, the input rows are returned unchanged (fresh index) and
        no blank row is appended.
    """
    is_match = df['text'].isin(phrases)

    # Nothing to merge: do not append an empty-text / NaN row.
    if not is_match.any():
        return df.reset_index(drop=True)

    matching_rows = df[is_match].copy()

    first_phrase = phrases[0]
    prefix = f"{primary_phrase} - "
    matching_rows['text'] = matching_rows['text'].apply(
        lambda x: x if x == first_phrase else x.replace(prefix, "")
    )

    combined_text = ' '.join(matching_rows['text'].unique())

    values = matching_rows['value']
    if not values.notna().any():
        combined_value = np.nan
    elif -len(values) <= third_row_value_index < len(values):
        combined_value = values.iloc[third_row_value_index]
    else:
        # Fewer rows matched than the requested position (e.g. two-line labels
        # with the default index 2): fall back to the last value present.
        combined_value = values.dropna().iloc[-1]

    combined_row = pd.DataFrame({'text': [combined_text], 'value': [combined_value]})

    return pd.concat([df[~is_match], combined_row], ignore_index=True)

In [80]:
# Constant phrase lists for the three-line "recovery to income tax" row label.
# Every phrase has the form "<segment> - <label line>", i.e. the 'text' format
# produced by melt_table_with_descriptive_text and matched by merge_matching_rows.
primary_phrase_list = [
    "Canadian Personal and Business Banking",
    "Capital Markets and Direct Financial Services",
    "Canadian Commercial Banking and Wealth Management",
    "U.S. Commercial Banking and Wealth Management",
    "Corporate and Other",
    "CIBC Total",
    "Commercial Banking and Wealth Management (US$ millions)",
]

_recovery_label_lines = [
    "Recovery to income tax that will be eliminated with the substantive",
    "enactment of a Federal proposal to deny the dividends received",
    "deduction for banks (2)",
]

# One list of phrases per segment, in the same order as primary_phrase_list.
phrases_list = [
    [f"{segment} - {label_line}" for label_line in _recovery_label_lines]
    for segment in primary_phrase_list
]

(
    canadian_personal_phrases,
    capital_markets_phrases,
    canadian_commercial_phrases,
    us_commercial_phrases,
    corporate_other_phrases,
    cibc_total_phrases,
    commercial_banking_phrases,
) = phrases_list

In [81]:
# Constant phrase lists for the two-line "commodity tax charge" row label.
# Every phrase has the form "<segment> - <label line>", i.e. the 'text' format
# produced by melt_table_with_descriptive_text and matched by merge_matching_rows.
q3_primary_phrase_list = [
    "Canadian Personal and Business Banking",
    "Capital Markets and Direct Financial Services",
    "Canadian Commercial Banking and Wealth Management",
    "U.S. Commercial Banking and Wealth Management",
    "Corporate and Other",
    "CIBC Total",
    "Commercial Banking and Wealth Management (US$ millions)",
]

_commodity_tax_label_lines = [
    "Commodity tax charge related to the retroactive impact of the 2023",
    "Canadian Federal budget",
]

# One list of phrases per segment, in the same order as q3_primary_phrase_list.
q3_phrases_list = [
    [f"{segment} - {label_line}" for label_line in _commodity_tax_label_lines]
    for segment in q3_primary_phrase_list
]

# The per-segment names are rebound here to the two-line variants.
(
    canadian_personal_phrases,
    capital_markets_phrases,
    canadian_commercial_phrases,
    us_commercial_phrases,
    corporate_other_phrases,
    cibc_total_phrases,
    commercial_banking_phrases,
) = q3_phrases_list

## Last Page

In [82]:
def process_table_headers_and_dates_for_last_page(table_df, num_header_rows=4):
    """Merge multi-row headers, pull a ``Date`` out of the first column, convert numbers.

    Steps:
      1. The first ``num_header_rows`` rows are joined column-wise and prefixed
         with the existing label (unless it is ``'nan'``); columns whose merged
         name is empty or starts with ``'Unnamed'`` are dropped.
      2. A four-digit year (carried down to following rows) and a
         ``"<Month> <day>"`` fragment are extracted from the first column and
         combined into a new ``Date`` column (NaN where no month/day is found).
      3. Both fragments are removed from the first column.
      4. Every other column except ``Date`` is stripped of ``,``, ``$``, ``)``,
         has ``(`` turned into ``-``, and is converted with
         ``pd.to_numeric(errors='coerce')``.

    Parameters
    ----------
    table_df : pandas.DataFrame
        Raw table. It is not modified, so the calling cell can be re-run.
    num_header_rows : int
        Number of leading rows that belong to the header.

    Returns
    -------
    pandas.DataFrame
        New frame with merged column names and a trailing ``Date`` column.
    """
    # Step 1: merge multi-row headers.
    original_headers = table_df.columns.astype(str)
    header_rows = table_df.iloc[:num_header_rows].fillna('')
    stacked_headers = header_rows.apply(lambda x: ' '.join(x.astype(str)).strip(), axis=0)
    combined_headers = [
        f"{original} {combined}".strip() if original != 'nan' else combined
        for original, combined in zip(original_headers, stacked_headers)
    ]
    keep_mask = [
        bool(header.strip()) and not header.strip().startswith('Unnamed')
        for header in combined_headers
    ]

    # Positional slice + copy gives an independent frame: the caller's columns are
    # not overwritten and columns can be added below without chained-assignment
    # warnings. A boolean mask (not labels) keeps repeated headers from being
    # duplicated.
    table_df = table_df.iloc[num_header_rows:, keep_mask].reset_index(drop=True).copy()
    table_df.columns = [header for header, keep in zip(combined_headers, keep_mask) if keep]

    # Step 2: extract year and month/day from the first column.
    first_column = table_df.iloc[:, 0]
    # Series.ffill() replaces the deprecated fillna(method='ffill').
    year = first_column.str.extract(r'(\d{4})', expand=False).ffill()
    month_day = first_column.str.extract(r'([A-Za-z]+\.? \d{1,2})', expand=False)
    table_df['Date'] = (year + ' ' + month_day).str.strip()

    # Step 3: remove the extracted fragments from the first column.
    table_df.iloc[:, 0] = (
        first_column
        .str.replace(r'(\d{4})', '', regex=True)
        .str.replace(r'([A-Za-z]+\.? \d{1,2})', '', regex=True)
        .str.strip()
    )

    # Step 4: clean and convert the numeric columns.
    for col in table_df.columns[1:]:
        if col == 'Date':
            continue
        as_text = (
            table_df[col].astype(str)
            .str.replace(',', '', regex=False)
            .str.replace('(', '-', regex=False)
            .str.replace(')', '', regex=False)
            .str.replace('$', '', regex=False)
        )
        table_df[col] = pd.to_numeric(as_text, errors='coerce')

    return table_df


In [83]:
def process_and_melt_table_with_date(table_df, date_column='Date', date_replacements=None):
    """Unpivot a table that carries a date column into ``text`` / ``value`` rows.

    The first column is the row label. Every column other than the label and
    ``date_column`` is melted, and ``text`` becomes
    ``"<column header> - <row label> - <date>"``.

    Parameters
    ----------
    table_df : pandas.DataFrame
    date_column : str
        Name of the date column (used throughout, not only for validation).
    date_replacements : dict, optional
        Mapping passed to ``Series.replace`` on the date values.

    Returns
    -------
    pandas.DataFrame
        Columns ``['text', 'value']``.

    Raises
    ------
    ValueError
        If ``table_df`` is empty or ``date_column`` is missing.
    """
    if table_df.empty:
        raise ValueError("The input DataFrame is empty.")

    if date_column not in table_df.columns:
        raise ValueError(f"The specified date column '{date_column}' does not exist in the DataFrame.")

    label_column = table_df.columns[0]

    # Step 1: melt everything except the label and date columns.
    melted_df = pd.melt(
        table_df,
        id_vars=[label_column, date_column],
        var_name='header',
        value_name='value'
    )

    # Step 2: descriptive text from header and row label.
    text = melted_df['header'] + " - " + melted_df[label_column]

    # Step 3: optional date replacements.
    dates = melted_df[date_column]
    if date_replacements:
        dates = dates.replace(date_replacements)

    # Step 4: fill missing dates from neighbours. A missing date takes the value
    # of the row directly below when that row has one, otherwise the nearest
    # value above; this vectorised form needs no row loop and also copes with a
    # single-row frame.
    dates = dates.bfill(limit=1).ffill()

    # Step 5: append the date to the text and return only text / value.
    return pd.DataFrame({'text': text + " - " + dates, 'value': melted_df['value']})

# 2024

## Q3 2024

In [84]:
# Q3 2024 source PDF (bare file name, resolved against the working directory).
pdf_path = 'q324newsrelease-en.pdf'

# One raw table per page. The optional list is forwarded to extract_table_from_pdf
# as `area`; pages without one are read without an area restriction.
q324_processed_table_page_1 = extract_table_from_pdf(pdf_path, 1, [155, 20, 290, 576])
q324_processed_table_page_3 = extract_table_from_pdf(pdf_path, 3, [210, 20, 610, 800]) # Merged column; merged later with phrases_list_p3_p6
q324_processed_table_page_4 = extract_table_from_pdf(pdf_path, 4) # Merged later with phrases_list / primary_phrase_list
q324_processed_table_page_5 = extract_table_from_pdf(pdf_path, 5) # Merged later with q3_phrases_list / q3_primary_phrase_list
q324_processed_table_page_6 = extract_table_from_pdf(pdf_path, 6) # Merged column; merged later with phrases_list_p3_p6
q324_processed_table_page_7 = extract_table_from_pdf(pdf_path, 7) # Merged later with q3_phrases_list / q3_primary_phrase_list
q324_processed_table_page_8 = extract_table_from_pdf(pdf_path, 8, [70, 20, 420, 800])

### Q3 2024 - Page 1

In [85]:
# Define the column renames.
# Keys '1', '5' and '6' are the positional names raname_page1_columns gives to
# columns whose header is NaN (numbering starts at unnamed_column_start=1).
column_renames = {
    'Q3/24': 'Q3 2024',
    'Q3/23 (1)': 'Q3 2023',
    'Q2/24': 'Q2 2024',
    '1': 'text',
    '5': 'YoY Variance',
    '6': 'QoQ Variance',
}

# Rename columns
q324_processed_table_page_1 = raname_page1_columns(
    table_df=q324_processed_table_page_1,
    column_renames=column_renames,
    unnamed_column_start=1
)

# Unpivot into long text / Value rows
q324_processed_df_page_1 = process_and_unpivot_table_p1(q324_processed_table_page_1)

### Q3 2024 - Page 3 to 7

In [86]:
# Constant phrase lists for the two-line "adjustments related to enactment" row
# label used for pages 3 and 6. Every phrase has the form
# "<segment> - <label line>", i.e. the 'text' format matched by merge_matching_rows.
primary_phrase_list_p3_p6 = [
    "Canadian Personal and Business Banking",
    "Capital Markets and Direct Financial Services",
    "Canadian Commercial Banking and Wealth Management",
    "U.S. Commercial Banking and Wealth Management",
    "Corporate and Other",
    "CIBC Total",
    "Commercial Banking and Wealth Management (US$ millions)",
]

_enactment_adjustment_label_lines = [
    "Adjustments related to enactment of a Federal tax measure in June",
    "2024 that denies the dividends received deduction for banks (2)",
]

# One list of phrases per segment, in the same order as primary_phrase_list_p3_p6.
phrases_list_p3_p6 = [
    [f"{segment} - {label_line}" for label_line in _enactment_adjustment_label_lines]
    for segment in primary_phrase_list_p3_p6
]

# The per-segment names are rebound here to the page 3 / page 6 variants.
(
    canadian_personal_phrases,
    capital_markets_phrases,
    canadian_commercial_phrases,
    us_commercial_phrases,
    corporate_other_phrases,
    cibc_total_phrases,
    commercial_banking_phrases,
) = phrases_list_p3_p6

In [87]:
# Pages 3 and 6 have a merged column: duplicate column 3 so the copy can be
# renamed and formatted as the second segment in process_table_headers_and_convert.
q324_processed_table_page_3 = split_merged_columns(q324_processed_table_page_3, column_index=3)
q324_processed_table_page_6 = split_merged_columns(q324_processed_table_page_6, column_index=3)

# Merge the multi-row headers and convert the data columns to numbers.
q324_processed_df_page_3 = process_table_headers_and_convert(q324_processed_table_page_3, num_header_rows=4, handle_merged_columns=True)
q324_processed_df_page_4 = process_table_headers_and_convert(q324_processed_table_page_4, num_header_rows=4)
q324_processed_df_page_5 = process_table_headers_and_convert(q324_processed_table_page_5, num_header_rows=4)
q324_processed_df_page_6 = process_table_headers_and_convert(q324_processed_table_page_6, num_header_rows=4, handle_merged_columns=True)
q324_processed_df_page_7 = process_table_headers_and_convert(q324_processed_table_page_7, num_header_rows=4)

# Unpivot to long text / value rows.
q324_processed_df_page_3 = melt_table_with_descriptive_text(q324_processed_df_page_3)
q324_processed_df_page_4 = melt_table_with_descriptive_text(q324_processed_df_page_4)
q324_processed_df_page_5 = melt_table_with_descriptive_text(q324_processed_df_page_5)
q324_processed_df_page_6 = melt_table_with_descriptive_text(q324_processed_df_page_6)
q324_processed_df_page_7 = melt_table_with_descriptive_text(q324_processed_df_page_7)

# Merge the rows of each multi-line label into one row. The value is taken from
# the last line of the label: the phrase lists used here have either three or two
# lines, so the position is derived from the list length instead of relying on
# the default index 2, which is out of range for the two-line lists.
for phrase, primary_phrase in zip(phrases_list, primary_phrase_list):
    q324_processed_df_page_4 = merge_matching_rows(
        q324_processed_df_page_4, phrase, primary_phrase=primary_phrase,
        third_row_value_index=len(phrase) - 1,
    )

for phrase, primary_phrase in zip(phrases_list_p3_p6, primary_phrase_list_p3_p6):
    q324_processed_df_page_3 = merge_matching_rows(
        q324_processed_df_page_3, phrase, primary_phrase=primary_phrase,
        third_row_value_index=len(phrase) - 1,
    )
    q324_processed_df_page_6 = merge_matching_rows(
        q324_processed_df_page_6, phrase, primary_phrase=primary_phrase,
        third_row_value_index=len(phrase) - 1,
    )

for phrase, primary_phrase in zip(q3_phrases_list, q3_primary_phrase_list):
    q324_processed_df_page_5 = merge_matching_rows(
        q324_processed_df_page_5, phrase, primary_phrase=primary_phrase,
        third_row_value_index=len(phrase) - 1,
    )
    q324_processed_df_page_7 = merge_matching_rows(
        q324_processed_df_page_7, phrase, primary_phrase=primary_phrase,
        third_row_value_index=len(phrase) - 1,
    )

# Tag every row with the reporting period of its page.
# NOTE(review): the suffix is appended with no separator, so the text ends like
# "...RevenueQ3 2024 three months end" - confirm this is what consumers expect.
q324_processed_df_page_3['text'] = q324_processed_df_page_3['text'].astype(str) + 'Q3 2024 three months end'
q324_processed_df_page_4['text'] = q324_processed_df_page_4['text'].astype(str) + 'Q2 2024 three months end'
q324_processed_df_page_5['text'] = q324_processed_df_page_5['text'].astype(str) + 'Q3 2023 three months end'
q324_processed_df_page_6['text'] = q324_processed_df_page_6['text'].astype(str) + 'Q3 2024 nine months end'
q324_processed_df_page_7['text'] = q324_processed_df_page_7['text'].astype(str) + 'Q3 2023 nine months end'

### Q3 2024 - Page 8

In [88]:
# Merge the header rows of the page-8 table, derive its 'Date' column and convert numbers.
q324_processed_df_page_8 = process_table_headers_and_dates_for_last_page(q324_processed_table_page_8, num_header_rows=4)

# Map the extracted "<year> <Mon.> <day>" dates to quarter labels.
date_replacements = {
    'nan': np.nan,        # Replace the string 'nan' with a real missing value
    '2024 Jul. 31': 'Q3 2024',
    '2024 Apr. 30': 'Q2 2024',
    '2023 Jul. 31': 'Q3 2023',
}

# Unpivot to text / value rows; the (replaced) date is appended to each text.
q324_processed_df_page_8 = process_and_melt_table_with_date(
    table_df=q324_processed_df_page_8,
    date_column='Date',
    date_replacements=date_replacements
)

## Q2 2024

### Table Extraction

In [89]:
# Q2 2024 source PDF (bare file name, resolved against the working directory).
pdf_path = 'q224newsrelease-en.pdf'
# Forwarded to extract_table_from_pdf as `area` for page 3 only.
page_3_area = [300, 20, 610, 800]

# One raw table per page; only page 3 is read with an area restriction.
q224_processed_table_page_1 = extract_table_from_pdf(pdf_path, 1)
q224_processed_table_page_3 = extract_table_from_pdf(pdf_path, 3, page_3_area)
q224_processed_table_page_4 = extract_table_from_pdf(pdf_path, 4)
q224_processed_table_page_5 = extract_table_from_pdf(pdf_path, 5)
q224_processed_table_page_6 = extract_table_from_pdf(pdf_path, 6)
q224_processed_table_page_7 = extract_table_from_pdf(pdf_path, 7)
q224_processed_table_page_8 = extract_table_from_pdf(pdf_path, 8)

### Q2 2024 - Page 1

In [90]:
# Define the column renames.
# Keys '1', '5' and '6' are the positional names raname_page1_columns gives to
# columns whose header is NaN (numbering starts at unnamed_column_start=1).
column_renames = {
    'Q2/24': 'Q2 2024',
    'Q2/23 (1)': 'Q2 2023',
    'Q1/24': 'Q1 2024',
    '1': 'text',
    '5': 'YoY Variance',
    '6': 'QoQ Variance',
}

# Rename columns
q224_processed_table_page_1 = raname_page1_columns(
    table_df=q224_processed_table_page_1,
    column_renames=column_renames,
    unnamed_column_start=1
)

# Unpivot into long text / Value rows
q224_processed_df_page_1 = process_and_unpivot_table_p1(q224_processed_table_page_1)

### Q2 2024 - Page 3 to 7

In [91]:
# Pages 3 and 6 are treated as having a merged column: duplicate column 3 so the
# copy can be renamed and formatted in process_table_headers_and_convert.
q224_processed_table_page_3 = split_merged_columns(q224_processed_table_page_3, column_index=3)
q224_processed_table_page_6 = split_merged_columns(q224_processed_table_page_6, column_index=3)

# Merge the multi-row headers and convert the data columns to numbers.
q224_processed_df_page_3 = process_table_headers_and_convert(q224_processed_table_page_3, num_header_rows=4, handle_merged_columns=True)
q224_processed_df_page_4 = process_table_headers_and_convert(q224_processed_table_page_4, num_header_rows=4)
q224_processed_df_page_5 = process_table_headers_and_convert(q224_processed_table_page_5, num_header_rows=4)
q224_processed_df_page_6 = process_table_headers_and_convert(q224_processed_table_page_6, num_header_rows=4, handle_merged_columns=True)
q224_processed_df_page_7 = process_table_headers_and_convert(q224_processed_table_page_7, num_header_rows=4)

# Unpivot to long text / value rows.
q224_processed_df_page_3 = melt_table_with_descriptive_text(q224_processed_df_page_3)
q224_processed_df_page_4 = melt_table_with_descriptive_text(q224_processed_df_page_4)
q224_processed_df_page_5 = melt_table_with_descriptive_text(q224_processed_df_page_5)
q224_processed_df_page_6 = melt_table_with_descriptive_text(q224_processed_df_page_6)
q224_processed_df_page_7 = melt_table_with_descriptive_text(q224_processed_df_page_7)

# Merge the three-line label rows on pages 3, 4 and 6 (pages 5 and 7 are not merged here).
for phrase, primary_phrase in zip(phrases_list, primary_phrase_list):
    q224_processed_df_page_3 = merge_matching_rows(q224_processed_df_page_3, phrase, primary_phrase=primary_phrase)
    q224_processed_df_page_4 = merge_matching_rows(q224_processed_df_page_4, phrase, primary_phrase=primary_phrase)
    q224_processed_df_page_6 = merge_matching_rows(q224_processed_df_page_6, phrase, primary_phrase=primary_phrase)


# Tag every row with the reporting period of its page (appended without a separator).
q224_processed_df_page_3['text'] = q224_processed_df_page_3['text'].astype(str) + 'Q2 2024 three months end'
q224_processed_df_page_4['text'] = q224_processed_df_page_4['text'].astype(str) + 'Q1 2024 three months end'
q224_processed_df_page_5['text'] = q224_processed_df_page_5['text'].astype(str) + 'Q2 2023 three months end'
q224_processed_df_page_6['text'] = q224_processed_df_page_6['text'].astype(str) + 'Q2 2024 six months end'
q224_processed_df_page_7['text'] = q224_processed_df_page_7['text'].astype(str) + 'Q2 2023 six months end'

### Q2 2024 - Page 8

In [92]:
# Merge the header rows of the page-8 table, derive its 'Date' column and convert numbers.
q224_processed_df_page_8 = process_table_headers_and_dates_for_last_page(q224_processed_table_page_8, num_header_rows=4)

# Map the extracted "<year> <Mon.> <day>" dates to quarter labels.
date_replacements = {
    'nan': np.nan,        # Replace the string 'nan' with a real missing value
    '2024 Apr. 30': 'Q2 2024',
    '2024 Jan. 31': 'Q1 2024',
    '2023 Apr. 30': 'Q2 2023'
}

# Unpivot to text / value rows; the (replaced) date is appended to each text.
q224_processed_df_page_8 = process_and_melt_table_with_date(
    table_df=q224_processed_df_page_8,
    date_column='Date',
    date_replacements=date_replacements
)

## Q1 2024

In [93]:
# Q1 2024 source PDF (bare file name, resolved against the working directory).
pdf_path = 'q124newsrelease-en.pdf' 

# Each list is forwarded to extract_table_from_pdf as `area`.
# Page 4 is read twice with different areas, giving two separate tables.
q124_processed_table_page_1 = extract_table_from_pdf(pdf_path, 1, [145, 30, 310, 576])
q124_processed_table_page_3 = extract_table_from_pdf(pdf_path, 3, [210, 20, 610, 800])
q124_processed_table_page_4 = extract_table_from_pdf(pdf_path, 4, [60, 30, 370, 600])
q124_processed_table_page_4_2 = extract_table_from_pdf(pdf_path, 4, [400, 30, 740, 600])
q124_processed_table_page_5 = extract_table_from_pdf(pdf_path, 5, [70, 30, 290, 600])

### Q1 2024 - Page 1

In [94]:
# Define the column renames.
# Keys '1', '5' and '6' are the positional names raname_page1_columns gives to
# columns whose header is NaN (numbering starts at unnamed_column_start=1).
column_renames = {
    'Q1/24': 'Q1 2024',
    # The key previously ended in a stray tab character, so it could not match a
    # 'Q1/23 (1)' header and that column was left unrenamed.
    'Q1/23 (1)': 'Q1 2023',
    'Q4/23 (1)': 'Q4 2023',
    '1': 'text',
    '5': 'YoY Variance',
    '6': 'QoQ Variance',
}

# Rename columns
q124_processed_table_page_1 = raname_page1_columns(
    table_df=q124_processed_table_page_1,
    column_renames=column_renames,
    unnamed_column_start=1
)

# Unpivot into long text / Value rows
q124_processed_df_page_1 = process_and_unpivot_table_p1(q124_processed_table_page_1)

### Q1 2024 - Page 3 - 4

In [95]:
# NOTE(review): q124_processed_df_page_3 is first assigned in the next cell (In [96]),
# so on a fresh kernel (Restart & Run All) this display raises NameError; the saved
# output below presumably comes from an earlier, out-of-order run.
q124_processed_df_page_3

Unnamed: 0,text,value
0,Canadian Personal and Business Banking - Opera...,
1,Canadian Personal and Business Banking - Total...,2497.0
2,Canadian Personal and Business Banking - Provi...,329.0
3,Canadian Personal and Business Banking - Non-i...,1280.0
4,Canadian Personal and Business Banking - Incom...,888.0
...,...,...
261,Capital Markets and Direct Financial Services ...,
262,Capital Markets and Direct Financial Services ...,575.0
263,Capital Markets and Direct Financial Services ...,
264,Capital Markets and Direct Financial Services ...,


In [96]:
# Page 3 has a merged column: duplicate column 3 so the copy can be renamed and
# formatted as the second segment in process_table_headers_and_convert.
q124_processed_table_page_3 = split_merged_columns(q124_processed_table_page_3, column_index=3)

# Merge the multi-row headers and convert the data columns to numbers.
q124_processed_df_page_3 = process_table_headers_and_convert(q124_processed_table_page_3, num_header_rows=4, handle_merged_columns=True)
q124_processed_df_page_4 = process_table_headers_and_convert(q124_processed_table_page_4, num_header_rows=4)
# The second page-4 result is built from the second page-4 table; it was previously
# built from q124_processed_table_page_5, leaving q124_processed_table_page_4_2 unused.
q124_processed_df_page_4_2 = process_table_headers_and_convert(q124_processed_table_page_4_2, num_header_rows=4)

# Unpivot to long text / value rows.
q124_processed_df_page_3 = melt_table_with_descriptive_text(q124_processed_df_page_3)
q124_processed_df_page_4 = melt_table_with_descriptive_text(q124_processed_df_page_4)
q124_processed_df_page_4_2 = melt_table_with_descriptive_text(q124_processed_df_page_4_2)

# Tag every row with the reporting period of its table (appended without a separator).
q124_processed_df_page_3['text'] = q124_processed_df_page_3['text'].astype(str) + 'Q1 2024 three months end'
q124_processed_df_page_4['text'] = q124_processed_df_page_4['text'].astype(str) + 'Q4 2023 three months end'
q124_processed_df_page_4_2['text'] = q124_processed_df_page_4_2['text'].astype(str) + 'Q1 2023 three months end'

### Q1 2024 - Page 5

In [97]:
# Parse the four header rows of the last-page table.
q124_processed_df_page_5 = process_table_headers_and_dates_for_last_page(
    q124_processed_table_page_5,
    num_header_rows=4,
)

# Date labels as they appear in the 'Date' column -> quarter label.
# The literal string 'nan' is turned back into a real missing value.
date_replacements = dict([
    ('nan', np.nan),
    ('2024 Jan. 31', 'Q1 2024'),
    ('2023 Oct. 31', 'Q4 2023'),
    ('2023 Jan. 31', 'Q1 2023'),
])

q124_processed_df_page_5 = process_and_melt_table_with_date(
    table_df=q124_processed_df_page_5,
    date_column='Date',
    date_replacements=date_replacements,
)

# 2023

## Q3 2023

### Table Extraction

In [98]:
# Source document for this section: the Q3 2023 news release.
pdf_path = 'q323newsrelease-en.pdf'

# Pages 1 and 3 are cropped with an explicit `area`; pages 4-8 are read without one.
q323_processed_table_page_1 = extract_table_from_pdf(pdf_path, page_number=1, area=[155, 20, 290, 576])
q323_processed_table_page_3 = extract_table_from_pdf(pdf_path, page_number=3, area=[210, 20, 610, 800])
q323_processed_table_page_4 = extract_table_from_pdf(pdf_path, page_number=4)
q323_processed_table_page_5 = extract_table_from_pdf(pdf_path, page_number=5)
q323_processed_table_page_6 = extract_table_from_pdf(pdf_path, page_number=6)
q323_processed_table_page_7 = extract_table_from_pdf(pdf_path, page_number=7)
q323_processed_table_page_8 = extract_table_from_pdf(pdf_path, page_number=8)

### Q3 2023 - Page 1

In [99]:
# Final labels for the page-1 headers; the numeric keys are the positional
# names raname_page1_columns assigns to columns whose header is NaN.
column_renames = dict([
    ('Q3/23', 'Q3 2023'),
    ('Q3/22', 'Q3 2022'),
    ('Q2/23', 'Q2 2023'),
    ('1', 'text'),
    ('5', 'YoY Variance'),
    ('6', 'QoQ Variance'),
])

# Rename the columns, then unpivot the renamed table.
q323_processed_table_page_1 = raname_page1_columns(
    q323_processed_table_page_1,
    column_renames=column_renames,
    unnamed_column_start=1,
)
q323_processed_df_page_1 = process_and_unpivot_table_p1(q323_processed_table_page_1)

### Q3 2023 - Page 3 to 7

In [100]:
# Constant phrase lists for Q3 2023, page 5.
# Segment names, in the order they are paired with their phrase lists below.
primary_phrase_list_p5 = [
    "Canadian Personal and Business Banking",
    "Capital Markets and Direct Financial Services",
    "Canadian Commercial Banking and Wealth Management",
    "U.S. Commercial Banking and Wealth Management",
    "Corporate and Other",
    "CIBC Total",
    "Commercial Banking and Wealth Management (US$ millions)",
]

# For every segment, the two text fragments that are handed together to
# merge_matching_rows.
phrases_list_p5 = [
    [
        segment + " - Acquisition and integration-related costs as well as purchase",
        segment + " - accounting adjustments (5)",
    ]
    for segment in primary_phrase_list_p5
]

# Per-segment names for the same list objects.
(
    canadian_personal_phrases,
    capital_markets_phrases,
    canadian_commercial_phrases,
    us_commercial_phrases,
    corporate_other_phrases,
    cibc_total_phrases,
    commercial_banking_phrases,
) = phrases_list_p5

In [101]:
# Parse the four header rows of every raw table (pages 3-7), in page order.
(
    q323_processed_df_page_3,
    q323_processed_df_page_4,
    q323_processed_df_page_5,
    q323_processed_df_page_6,
    q323_processed_df_page_7,
) = [
    process_table_headers_and_convert(raw_table, num_header_rows=4)
    for raw_table in (
        q323_processed_table_page_3,
        q323_processed_table_page_4,
        q323_processed_table_page_5,
        q323_processed_table_page_6,
        q323_processed_table_page_7,
    )
]

# Unpivot each parsed table into descriptive text rows, again in page order.
(
    q323_processed_df_page_3,
    q323_processed_df_page_4,
    q323_processed_df_page_5,
    q323_processed_df_page_6,
    q323_processed_df_page_7,
) = [
    melt_table_with_descriptive_text(wide_table)
    for wide_table in (
        q323_processed_df_page_3,
        q323_processed_df_page_4,
        q323_processed_df_page_5,
        q323_processed_df_page_6,
        q323_processed_df_page_7,
    )
]

# Merge the listed text fragments per segment: page 3 uses the q3_* lists,
# page 5 uses the *_p5 lists.
for phrase, primary_phrase in zip(q3_phrases_list, q3_primary_phrase_list):
    q323_processed_df_page_3 = merge_matching_rows(q323_processed_df_page_3, phrase, primary_phrase=primary_phrase)

for phrase, primary_phrase in zip(phrases_list_p5, primary_phrase_list_p5):
    q323_processed_df_page_5 = merge_matching_rows(q323_processed_df_page_5, phrase, primary_phrase=primary_phrase)

# Append the reporting-period label to the text of every row.
for _frame, _period_label in (
    (q323_processed_df_page_3, 'Q3 2023 three months end'),
    (q323_processed_df_page_4, 'Q2 2023 three months end'),
    (q323_processed_df_page_5, 'Q3 2022 three months end'),
    (q323_processed_df_page_6, 'Q3 2023 nine months end'),
    (q323_processed_df_page_7, 'Q3 2022 nine months end'),
):
    _frame['text'] = _frame['text'].astype(str) + _period_label

### Q3 2023 - Page 8

In [102]:
# Parse the four header rows of the last-page table.
q323_processed_df_page_8 = process_table_headers_and_dates_for_last_page(
    q323_processed_table_page_8,
    num_header_rows=4,
)

# Date labels as they appear in the 'Date' column -> quarter label.
# The literal string 'nan' is turned back into a real missing value.
date_replacements = dict([
    ('nan', np.nan),
    ('2023 Jul. 31', 'Q3 2023'),
    ('2023 Apr. 30', 'Q2 2023'),
    ('2022 Jul. 31', 'Q3 2022'),
])

q323_processed_df_page_8 = process_and_melt_table_with_date(
    table_df=q323_processed_df_page_8,
    date_column='Date',
    date_replacements=date_replacements,
)

## Q2 2023

In [103]:
# Source document for this section: the Q2 2023 news release.
pdf_path = 'q223newsrelease-en.pdf'

# Pages 1, 3 and 8 are cropped with an explicit `area`; pages 4-7 are read without one.
q223_processed_table_page_1 = extract_table_from_pdf(pdf_path, page_number=1, area=[170, 20, 290, 576])
q223_processed_table_page_3 = extract_table_from_pdf(pdf_path, page_number=3, area=[210, 20, 610, 800])
q223_processed_table_page_4 = extract_table_from_pdf(pdf_path, page_number=4)
q223_processed_table_page_5 = extract_table_from_pdf(pdf_path, page_number=5)
q223_processed_table_page_6 = extract_table_from_pdf(pdf_path, page_number=6)
q223_processed_table_page_7 = extract_table_from_pdf(pdf_path, page_number=7)
q223_processed_table_page_8 = extract_table_from_pdf(pdf_path, page_number=8, area=[70, 20, 420, 800])

### Q2 2023 - Page 1

In [104]:
# Final labels for the page-1 headers; the numeric keys are the positional
# names raname_page1_columns assigns to columns whose header is NaN.
column_renames = dict([
    ('Q2/23', 'Q2 2023'),
    ('Q2/22', 'Q2 2022'),
    ('Q1/23', 'Q1 2023'),
    ('1', 'text'),
    ('5', 'YoY Variance'),
    ('6', 'QoQ Variance'),
])

# Rename the columns, then unpivot the renamed table.
q223_processed_table_page_1 = raname_page1_columns(
    q223_processed_table_page_1,
    column_renames=column_renames,
    unnamed_column_start=1,
)
q223_processed_df_page_1 = process_and_unpivot_table_p1(q223_processed_table_page_1)

### Q2 2023 - Page 3 to 7

In [105]:
# Constant phrase lists for Q2 2023 (used for pages 5 and 7).
# Segment names, in the order they are paired with their phrase lists below.
primary_phrase_list_q223 = [
    "Canadian Personal and Business Banking",
    "Capital Markets and Direct Financial Services",
    "Canadian Commercial Banking and Wealth Management",
    "U.S. Commercial Banking and Wealth Management",
    "Corporate and Other",
    "CIBC Total",
    "Commercial Banking and Wealth Management (US$ millions)",
]

# For every segment, the three text fragments that are handed together to
# merge_matching_rows. All segments share the same fragments.
# Fix: the first two segments previously ended with "deduction for banks (2)"
# while the other five ended with "performing loans (6)"; the fragment continues
# "...provision for credit losses for", so every segment now uses
# "performing loans (6)".
phrases_list_q223 = [
    [
        segment + " - Acquisition and integration-related costs as well as purchase",
        segment + " - accounting adjustments and provision for credit losses for",
        segment + " - performing loans (6)",
    ]
    for segment in primary_phrase_list_q223
]

# Per-segment names for the same list objects.
(
    canadian_personal_phrases,
    capital_markets_phrases,
    canadian_commercial_phrases,
    us_commercial_phrases,
    corporate_other_phrases,
    cibc_total_phrases,
    commercial_banking_phrases,
) = phrases_list_q223

In [106]:
# Parse the four header rows of every raw table (pages 3-7), in page order.
(
    q223_processed_df_page_3,
    q223_processed_df_page_4,
    q223_processed_df_page_5,
    q223_processed_df_page_6,
    q223_processed_df_page_7,
) = [
    process_table_headers_and_convert(raw_table, num_header_rows=4)
    for raw_table in (
        q223_processed_table_page_3,
        q223_processed_table_page_4,
        q223_processed_table_page_5,
        q223_processed_table_page_6,
        q223_processed_table_page_7,
    )
]

# Unpivot each parsed table into descriptive text rows, again in page order.
(
    q223_processed_df_page_3,
    q223_processed_df_page_4,
    q223_processed_df_page_5,
    q223_processed_df_page_6,
    q223_processed_df_page_7,
) = [
    melt_table_with_descriptive_text(wide_table)
    for wide_table in (
        q223_processed_df_page_3,
        q223_processed_df_page_4,
        q223_processed_df_page_5,
        q223_processed_df_page_6,
        q223_processed_df_page_7,
    )
]

# Merge the listed text fragments per segment, first on page 5 and then on page 7.
for phrase, primary_phrase in zip(phrases_list_q223, primary_phrase_list_q223):
    q223_processed_df_page_5 = merge_matching_rows(q223_processed_df_page_5, phrase, primary_phrase=primary_phrase)

for phrase, primary_phrase in zip(phrases_list_q223, primary_phrase_list_q223):
    q223_processed_df_page_7 = merge_matching_rows(q223_processed_df_page_7, phrase, primary_phrase=primary_phrase)

# Append the reporting-period label to the text of every row.
for _frame, _period_label in (
    (q223_processed_df_page_3, 'Q2 2023 three months end'),
    (q223_processed_df_page_4, 'Q1 2023 three months end'),
    (q223_processed_df_page_5, 'Q2 2022 three months end'),
    (q223_processed_df_page_6, 'Q2 2023 six months end'),
    (q223_processed_df_page_7, 'Q2 2022 six months end'),
):
    _frame['text'] = _frame['text'].astype(str) + _period_label

### Q2 2023 - Page 8

In [107]:
# Parse the four header rows of the last-page table.
q223_processed_df_page_8 = process_table_headers_and_dates_for_last_page(q223_processed_table_page_8, num_header_rows=4)

# Date labels as they appear in the 'Date' column -> quarter label.
date_replacements = {
    'nan': np.nan,        # Replace 'nan' with np.nan
    '2023 Apr. 30': 'Q2 2023',
    '2023 Jan. 31': 'Q1 2023',
    # Fix: April has 30 days; the previous key '2022 Apr. 31' is an impossible
    # date, so it could not match a real date label.
    '2022 Apr. 30': 'Q2 2022',
}

q223_processed_df_page_8 = process_and_melt_table_with_date(
    table_df=q223_processed_df_page_8,
    date_column='Date',
    date_replacements=date_replacements
)

## Q1 2023

In [108]:
# Source document for this section: the Q1 2023 news release.
pdf_path = 'q123newsrelease-en.pdf'

# Page 4 is read without an `area`; page 5 is read twice with two different
# areas, giving two separate tables.
q123_processed_table_page_1 = extract_table_from_pdf(pdf_path, page_number=1, area=[160, 20, 290, 576])
q123_processed_table_page_3 = extract_table_from_pdf(pdf_path, page_number=3, area=[210, 20, 610, 800])
q123_processed_table_page_4 = extract_table_from_pdf(pdf_path, page_number=4)
q123_processed_table_page_5 = extract_table_from_pdf(pdf_path, page_number=5, area=[60, 30, 390, 600])
q123_processed_table_page_5_2 = extract_table_from_pdf(pdf_path, page_number=5, area=[420, 30, 650, 600])

### Q1 2023 - Page 1

In [109]:
# Final labels for the page-1 headers; the numeric keys are the positional
# names raname_page1_columns assigns to columns whose header is NaN.
column_renames = dict([
    ('Q1/23', 'Q1 2023'),
    ('Q1/22', 'Q1 2022'),
    ('Q4/22', 'Q4 2022'),
    ('1', 'text'),
    ('5', 'YoY Variance'),
    ('7', 'QoQ Variance'),
])

# Rename the columns and discard the column named '6', which is not mapped above.
q123_processed_table_page_1 = raname_page1_columns(
    q123_processed_table_page_1,
    column_renames=column_renames,
    unnamed_column_start=1,
).drop(columns='6')

# Unpivot the cleaned table.
q123_processed_df_page_1 = process_and_unpivot_table_p1(q123_processed_table_page_1)

### Q1 2023 - Page 3 to 5 (table 1)

In [110]:
# Constant phrase lists for Q1 2023, page 4.
# Segment names, in the order they are paired with their phrase lists below.
primary_phrase_list_p4 = [
    "Canadian Personal and Business Banking",
    "Capital Markets and Direct Financial Services",
    "Canadian Commercial Banking and Wealth Management",
    "U.S. Commercial Banking and Wealth Management",
    "Corporate and Other",
    "CIBC Total",
    "Commercial Banking and Wealth Management (US$ millions)",
]

# For every segment, the two text fragments that are handed together to
# merge_matching_rows.
phrases_list_p4 = [
    [
        segment + " - Acquisition and integration-related costs as well as purchase",
        segment + " - accounting adjustments (6)",
    ]
    for segment in primary_phrase_list_p4
]

# Per-segment names for the same list objects.
(
    canadian_personal_phrases,
    capital_markets_phrases,
    canadian_commercial_phrases,
    us_commercial_phrases,
    corporate_other_phrases,
    cibc_total_phrases,
    commercial_banking_phrases,
) = phrases_list_p4

In [111]:
# Parse the four header rows of each raw table (page 3, page 4, first table of page 5).
q123_processed_df_page_3 = process_table_headers_and_convert(q123_processed_table_page_3, num_header_rows=4)
q123_processed_df_page_4 = process_table_headers_and_convert(q123_processed_table_page_4, num_header_rows=4)
q123_processed_df_page_5 = process_table_headers_and_convert(q123_processed_table_page_5, num_header_rows=4)

# Unpivot each table into descriptive text rows.
q123_processed_df_page_3 = melt_table_with_descriptive_text(q123_processed_df_page_3)
q123_processed_df_page_4 = melt_table_with_descriptive_text(q123_processed_df_page_4)
q123_processed_df_page_5 = melt_table_with_descriptive_text(q123_processed_df_page_5)

# Merge the listed text fragments per segment on page 4.
for phrase, primary_phrase in zip(phrases_list_p4, primary_phrase_list_p4):
    q123_processed_df_page_4 = merge_matching_rows(q123_processed_df_page_4, phrase, primary_phrase=primary_phrase)

# Tag every row with the reporting period its table covers.
# Fix: the page-4 and page-5 labels were previously appended to the Q1 2024
# frames (q124_...), which modified those frames a second time and left the
# Q1 2023 frames without a period label.
q123_processed_df_page_3['text'] = q123_processed_df_page_3['text'].astype(str) + 'Q1 2023 three months end'
q123_processed_df_page_4['text'] = q123_processed_df_page_4['text'].astype(str) + 'Q4 2022 three months end'
q123_processed_df_page_5['text'] = q123_processed_df_page_5['text'].astype(str) + 'Q1 2022 three months end'

### Q1 2023 - Page 5 (table 2)

In [112]:
# Parse the four header rows of the second page-5 table.
q123_processed_df_page_5_2 = process_table_headers_and_dates_for_last_page(q123_processed_table_page_5_2, num_header_rows=4)

# Date labels as they appear in the 'Date' column -> quarter label.
# Fixes:
#   * '2023 Jan. 30' -> '2023 Jan. 31', the label used for Q1 2023 in the other
#     quarters' mappings in this notebook.
#   * '2023 Oct. 31': 'Q4 2023' -> '2022 Oct. 31': 'Q4 2022'; the quarter before
#     Q1 2023 is Q4 2022, matching the 'Q4/22' header and the 'Q4 2022' label
#     used elsewhere in this section.
date_replacements = {
    'nan': np.nan,        # Replace 'nan' with np.nan
    '2023 Jan. 31': 'Q1 2023',
    '2022 Oct. 31': 'Q4 2022',
    '2022 Jan. 31': 'Q1 2022',
}

q123_processed_df_page_5_2 = process_and_melt_table_with_date(
    table_df=q123_processed_df_page_5_2,
    date_column='Date',
    date_replacements=date_replacements
)

# Final Table names

In [None]:
# Train data tables
# Inventory of the processed frames produced above. These are bare name
# references: running the cell only checks that each name exists and displays
# the last one.
# NOTE(review): q224_processed_df_page_2 is created outside this section, if at
# all — confirm it exists.
q224_processed_df_page_1
q224_processed_df_page_2
q224_processed_df_page_3
q224_processed_df_page_4
q224_processed_df_page_5
q224_processed_df_page_6
q224_processed_df_page_7
q224_processed_df_page_8
q124_processed_df_page_1
q124_processed_df_page_3
q124_processed_df_page_4
q124_processed_df_page_4_2
q124_processed_df_page_5
q323_processed_df_page_1
q323_processed_df_page_3
q323_processed_df_page_4
q323_processed_df_page_5
q323_processed_df_page_6
q323_processed_df_page_7
q323_processed_df_page_8
q223_processed_df_page_1
q223_processed_df_page_3
q223_processed_df_page_4
q223_processed_df_page_5
q223_processed_df_page_6
q223_processed_df_page_7
q223_processed_df_page_8
# Fix: q123_processed_df_page_2 was listed here, but the Q1 2023 section only
# builds pages 1, 3, 4, 5 and 5_2, so that reference would raise NameError.
q123_processed_df_page_1
q123_processed_df_page_3
q123_processed_df_page_4
q123_processed_df_page_5
q123_processed_df_page_5_2


# Test Data
q324_processed_df_page_1
q324_processed_df_page_3
q324_processed_df_page_4
q324_processed_df_page_5
q324_processed_df_page_6
q324_processed_df_page_7
q324_processed_df_page_8