## Best Yet


Need to handle horizontal separations between tables though

In [None]:
import pandas as pd

def extract_tables_from_excel(file_path):
    """
    Extracts tables from each sheet in an Excel file by identifying top-left and bottom-right coordinates
    and constructing DataFrames for each table.

    Parameters:
    - file_path (str): Path to the Excel file.

    Returns:
    - dict: A dictionary where keys are sheet names, and values are lists of tables (as DataFrames).
    """
    def extract_individual_table_bounds(sheet_data):
        """Identify individual tables and their top-left and bottom-right coordinates."""
        # Identify rows and columns with non-empty cells
        non_empty_rows = sheet_data.notnull().any(axis=1)
        
        # Locate start and end rows for each table
        table_ranges = []
        current_table_start = None

        for idx, has_data in enumerate(non_empty_rows):
            if has_data and current_table_start is None:
                current_table_start = idx  # Start of a new table
            elif not has_data and current_table_start is not None:
                # End of the current table
                table_ranges.append((current_table_start, idx - 1))
                current_table_start = None
        # Handle the last table if the sheet ends with data
        if current_table_start is not None:
            table_ranges.append((current_table_start, len(non_empty_rows) - 1))
        
        # Get the column range for all tables
        for start_row, end_row in table_ranges:
            non_empty_subset = sheet_data.iloc[start_row:end_row + 1]
            cols = non_empty_subset.notnull().any(axis=0)
            if cols.any():  # Only proceed if columns have data
                start_col, end_col = cols[cols].index[0], cols[cols].index[-1]
                yield (start_row, start_col), (end_row, end_col)
    
    # Open the Excel file
    spreadsheet = pd.ExcelFile(file_path)
    sheet_tables = {}

    # Process each sheet
    for sheet_name in spreadsheet.sheet_names:
        sheet_data = spreadsheet.parse(sheet_name, header=None)  # Read sheet without headers
        table_coords = list(extract_individual_table_bounds(sheet_data))  # Get table bounds

        # Extract tables based on coordinates
        tables = []
        for (start_row, start_col), (end_row, end_col) in table_coords:
            table = sheet_data.iloc[start_row:end_row + 1, start_col:end_col + 1]
            tables.append(table.reset_index(drop=True))  # Reset index for cleaner output
        
        sheet_tables[sheet_name] = tables
    
    return sheet_tables

# Example Usage
file_path = '/Users/ajay/Documents/Atomic/inventory_analysis/Data/Company 1 - Inventory Planning.xlsx'
# file_path = '/Users/ajay/Documents/Atomic/inventory_analysis/Data/Company 2 - Supply Management.xlsx'
# file_path = '/Users/ajay/Documents/Atomic/inventory_analysis/Data/Company 3 - Inventory Dashboard _V2.xlsx'
tables_by_sheet = extract_tables_from_excel(file_path)

In [None]:
tables_by_sheet.keys()

In [None]:
tot = 0
for k,v in tables_by_sheet.items():
    tot += len(v)
tot

In [None]:
for sheet_name, tables in tables_by_sheet.items():
    
    for i, table in enumerate(tables):
        print(f"Sheet: {sheet_name}")
        print(f"Table {i+1}:")
        display(table.head(5))  # Only display the first 5 rows


In [None]:
import openpyxl

def extract_tables_from_excel_openpyxl(file_path):
    """
    Extracts tables from each sheet in an Excel file using OpenPyXL by identifying top-left 
    and bottom-right coordinates and constructing tables.

    Parameters:
    - file_path (str): Path to the Excel file.

    Returns:
    - dict: A dictionary where keys are sheet names, and values are lists of tables (as lists of lists).
    """
    def extract_table_bounds(sheet):
        """Identify individual tables based on non-empty rows and columns."""
        # Track table ranges
        table_ranges = []
        current_table_start = None
        non_empty_rows = []

        # Identify non-empty rows
        for row_idx, row in enumerate(sheet.iter_rows(values_only=True), start=1):
            if any(cell is not None for cell in row):  # Check if row is not empty
                non_empty_rows.append(row_idx)
                if current_table_start is None:
                    current_table_start = row_idx
            elif current_table_start is not None:
                # End of a table
                table_ranges.append((current_table_start, row_idx - 1))
                current_table_start = None

        # Handle the last table if the sheet ends with data
        if current_table_start is not None:
            table_ranges.append((current_table_start, non_empty_rows[-1]))

        # Identify column ranges for each table
        for start_row, end_row in table_ranges:
            start_col = None
            end_col = None
            for row in sheet.iter_rows(min_row=start_row, max_row=end_row, values_only=True):
                # Get column indices of non-empty cells
                non_empty_cols = [idx for idx, cell in enumerate(row, start=1) if cell is not None]
                if non_empty_cols:
                    if start_col is None or min(non_empty_cols) < start_col:
                        start_col = min(non_empty_cols)
                    if end_col is None or max(non_empty_cols) > end_col:
                        end_col = max(non_empty_cols)
            yield (start_row, start_col, end_row, end_col)

    def extract_table_data(sheet, start_row, start_col, end_row, end_col):
        """Extract table data given the start and end coordinates."""
        table_data = []
        for row in sheet.iter_rows(min_row=start_row, max_row=end_row, 
                                   min_col=start_col, max_col=end_col, values_only=True):
            table_data.append(list(row))
        return table_data

    # Load the workbook
    workbook = openpyxl.load_workbook(file_path, data_only=True, read_only=True)
    tables_by_sheet = {}

    # Process each sheet
    for sheet_name in workbook.sheetnames:
        sheet = workbook[sheet_name]
        tables = []

        # Extract all table ranges
        for start_row, start_col, end_row, end_col in extract_table_bounds(sheet):
            # Extract table data
            table_data = extract_table_data(sheet, start_row, start_col, end_row, end_col)
            tables.append(table_data)

        tables_by_sheet[sheet_name] = tables

    return tables_by_sheet

# Example Usage
file_path = '/Users/ajay/Documents/Atomic/inventory_analysis/Data/Company 1 - Inventory Planning.xlsx'
tables_by_sheet = extract_tables_from_excel_openpyxl(file_path)

In [None]:
import pandas as pd 
for sheet_name, tables in tables_by_sheet.items():
    
    for i, table in enumerate(tables):
        print(f"Sheet: {sheet_name}")
        print(f"  Table {i+1}:")
        display(pd.DataFrame(table[:5]))
        

In [None]:
import openpyxl
import pandas as pd
import numpy as np  # For NaN values

def extract_tables_with_column_names_and_dependencies(file_path):
    """
    Extracts tables with both computed values and metadata, including formulas, column names, and dependencies.
    Columns with all empty rows are named as `NaN`.
    """
    def extract_table_bounds(sheet):
        """Identify individual tables based on non-empty rows and columns."""
        table_ranges = []
        current_table_start = None
        non_empty_rows = []

        # Identify non-empty rows
        for row_idx, row in enumerate(sheet.iter_rows(values_only=True), start=1):
            if any(cell is not None for cell in row):  # Check if row is not empty
                non_empty_rows.append(row_idx)
                if current_table_start is None:
                    current_table_start = row_idx
            elif current_table_start is not None:
                # End of a table
                table_ranges.append((current_table_start, row_idx - 1))
                current_table_start = None

        # Handle the last table if the sheet ends with data
        if current_table_start is not None:
            table_ranges.append((current_table_start, non_empty_rows[-1]))

        # Identify column ranges for each table
        for start_row, end_row in table_ranges:
            start_col = None
            end_col = None
            for row in sheet.iter_rows(min_row=start_row, max_row=end_row, values_only=True):
                non_empty_cols = [idx for idx, cell in enumerate(row, start=1) if cell is not None]
                if non_empty_cols:
                    if start_col is None or min(non_empty_cols) < start_col:
                        start_col = min(non_empty_cols)
                    if end_col is None or max(non_empty_cols) > end_col:
                        end_col = max(non_empty_cols)
            yield (start_row, start_col, end_row, end_col)

    def get_column_name(sheet, start_row, start_col, end_row, col_idx):
        """
        Determine the column name based on the first non-None value in the column.
        If all rows are empty, return NaN.
        """
        for row_idx in range(start_row, end_row + 1):
            cell_value = sheet.cell(row=row_idx, column=col_idx).value
            if isinstance(cell_value, str):
                return cell_value  # Prioritize string values as column names
            elif cell_value is not None:
                return str(cell_value)  # Fallback to numeric or other non-None values
        return np.nan  # Fallback to NaN if no valid values are found

    def extract_table_data(sheet, start_row, start_col, end_row, end_col, values_only, workbook):
        """Extract table data and column metadata."""
        table_data = []
        column_metadata = []

        for col_idx in range(start_col, end_col + 1):
            column_formula = None
            dependency_sheets = set()

            if not values_only:  # Fetch formulas only when values_only=False
                for row_idx in range(start_row, end_row + 1):
                    cell = sheet.cell(row=row_idx, column=col_idx)
                    if cell.data_type == 'f' and not column_formula:
                        # First formula found in the column
                        column_formula = cell.value

                        # Extract valid dependencies
                        for token in column_formula.split("!"):
                            for sheetname in workbook.sheetnames:
                                if sheetname in token.strip("="):
                                    dependency_sheets.add(sheetname) #TODO: Fix later


            # Get the column name
            column_name = get_column_name(sheet, start_row, start_col, end_row, col_idx)

            column_metadata.append({
                "ColumnName": column_name,
                "Formula": column_formula,
                "Dependencies": list(dependency_sheets),
            })

        for row in sheet.iter_rows(min_row=start_row, max_row=end_row, 
                                   min_col=start_col, max_col=end_col, values_only=values_only):
            table_data.append(list(row))

        return table_data, column_metadata

    # Load the workbook twice: once for values and once for formulas
    workbook_values = openpyxl.load_workbook(file_path, data_only=True, read_only=False)
    workbook_formulas = openpyxl.load_workbook(file_path, data_only=False, read_only=False)

    # Restrict to visible sheets only
    visible_sheet_names = [sheet_name for sheet_name in workbook_values.sheetnames if not workbook_values[sheet_name].sheet_state == 'hidden']

    tables_by_sheet = {}

    # Process each sheet
    for sheet_name in visible_sheet_names:
        sheet_values = workbook_values[sheet_name]
        sheet_formulas = workbook_formulas[sheet_name]
        tables = []

        # Extract all table ranges
        for start_row, start_col, end_row, end_col in extract_table_bounds(sheet_values):
            # Extract table data (values)
            table_data_values, _ = extract_table_data(sheet_values, start_row, start_col, end_row, end_col, values_only=True, workbook=workbook_values)

            # Extract formulas and metadata
            _, column_metadata = extract_table_data(sheet_formulas, start_row, start_col, end_row, end_col, values_only=False, workbook=workbook_formulas)

            tables.append({
                "TableData": table_data_values,
                "Metadata": column_metadata,
            })

        tables_by_sheet[sheet_name] = tables

    return tables_by_sheet

file_path = '/Users/ajay/Documents/Atomic/inventory_analysis/Data/Company 1 - Inventory Planning.xlsx'

tables_with_metadata = extract_tables_with_column_names_and_dependencies(file_path)




## BEst 

In [None]:
import openpyxl
import pandas as pd
import numpy as np  # For NaN values

def extract_tables_with_column_names_and_dependencies(file_path):
    """
    Extracts tables with both computed values and metadata, including formulas, column names, and dependencies.
    Columns with all empty rows are named as `NaN`.
    """
    def extract_table_bounds(sheet):
        """Identify individual tables based on non-empty rows and columns."""
        table_ranges = []
        current_table_start = None
        non_empty_rows = []

        # Identify non-empty rows
        for row_idx, row in enumerate(sheet.iter_rows(values_only=True), start=1):
            if any(cell is not None for cell in row):  # Check if row is not empty
                non_empty_rows.append(row_idx)
                if current_table_start is None:
                    current_table_start = row_idx
            elif current_table_start is not None:
                # End of a table
                table_ranges.append((current_table_start, row_idx - 1))
                current_table_start = None

        # Handle the last table if the sheet ends with data
        if current_table_start is not None:
            table_ranges.append((current_table_start, non_empty_rows[-1]))

        # Identify column ranges for each table
        for start_row, end_row in table_ranges:
            start_col = None
            end_col = None
            for row in sheet.iter_rows(min_row=start_row, max_row=end_row, values_only=True):
                non_empty_cols = [idx for idx, cell in enumerate(row, start=1) if cell is not None]
                if non_empty_cols:
                    if start_col is None or min(non_empty_cols) < start_col:
                        start_col = min(non_empty_cols)
                    if end_col is None or max(non_empty_cols) > end_col:
                        end_col = max(non_empty_cols)
            yield (start_row, start_col, end_row, end_col)

    def get_column_name(sheet, start_row, end_row, col_idx):
        """
        Determine the column name based on the first non-formula, non-None value in the column.
        If no valid value exists, return NaN.
        """
        for row_idx in range(start_row, end_row + 1):
            cell = sheet.cell(row=row_idx, column=col_idx)
            if cell.data_type != 'f' and cell.value is not None:  # Ignore formulas and None values
                if isinstance(cell.value, str):
                    return cell.value  # Prioritize string values as column names
                return str(cell.value)  # Fallback to numeric or other non-None values
        return np.nan  # Fallback to NaN if no valid values are found

    def extract_table_data(sheet, start_row, start_col, end_row, end_col, values_only, workbook):
        """Extract table data and column metadata."""
        table_data = []
        column_metadata = []

        for col_idx in range(start_col, end_col + 1):
            column_formula = None
            dependency_sheets = set()

            if not values_only:  # Fetch formulas only when values_only=False
                for row_idx in range(start_row, end_row + 1):
                    cell = sheet.cell(row=row_idx, column=col_idx)
                    if cell.data_type == 'f' and not column_formula:
                        # First formula found in the column
                        column_formula = str(cell.value)

                        # Extract valid dependencies
                        for token in column_formula.split("!"):
                            for sheetname in workbook.sheetnames:
                                if sheetname in token.strip("="):
                                    dependency_sheets.add(sheetname)

            # Get the column name
            column_name = get_column_name(sheet, start_row, end_row, col_idx)

            column_metadata.append({
                "ColumnName": column_name,
                "Formula": column_formula,
                "Dependencies": list(dependency_sheets),
            })

        for row in sheet.iter_rows(min_row=start_row, max_row=end_row, 
                                   min_col=start_col, max_col=end_col, values_only=values_only):
            table_data.append(list(row))

        return table_data, column_metadata

    # Load the workbook twice: once for values and once for formulas
    workbook_values = openpyxl.load_workbook(file_path, data_only=True, read_only=False)
    workbook_formulas = openpyxl.load_workbook(file_path, data_only=False, read_only=False)

    # Restrict to visible sheets only
    visible_sheet_names = [sheet_name for sheet_name in workbook_values.sheetnames if not workbook_values[sheet_name].sheet_state == 'hidden']

    tables_by_sheet = {}

    # Process each sheet
    for sheet_name in visible_sheet_names:
        sheet_values = workbook_values[sheet_name]
        sheet_formulas = workbook_formulas[sheet_name]
        tables = []

        # Extract all table ranges
        for start_row, start_col, end_row, end_col in extract_table_bounds(sheet_values):
            # Extract table data (values)
            table_data_values, _ = extract_table_data(sheet_values, start_row, start_col, end_row, end_col, values_only=True, workbook=workbook_values)

            # Extract formulas and metadata
            _, column_metadata = extract_table_data(sheet_formulas, start_row, start_col, end_row, end_col, values_only=False, workbook=workbook_formulas)

            tables.append({
                "TableData": table_data_values,
                "Metadata": column_metadata,
            })

        tables_by_sheet[sheet_name] = tables

    return tables_by_sheet

file_path = '/Users/ajay/Documents/Atomic/inventory_analysis/Data/Company 1 - Inventory Planning.xlsx'
# file_path = '/Users/ajay/Documents/Atomic/inventory_analysis/Data/Company 2 - Supply Management.xlsx'
# file_path = '/Users/ajay/Documents/Atomic/inventory_analysis/Data/Company 3 - Inventory Dashboard _V2.xlsx'
tables_with_metadata = extract_tables_with_column_names_and_dependencies(file_path)

In [None]:
# Display the extracted tables and metadata
for sheet_name, tables in tables_with_metadata.items():
    print(f"Sheet: {sheet_name}")
    for i, table_info in enumerate(tables):
        print(f"  Table {i+1}:")
        df = pd.DataFrame(table_info["TableData"])
        display(df.head(5))  # Display first 5 rows of the table
        print("    Metadata:")
        for col_meta in table_info["Metadata"]:
            print(col_meta)

Run tiktoken on this data, get max, average, total tokens present

In [None]:
import tiktoken 
def count_tokens(text, model="gpt-4o-mini"):
    # Load the tokenizer for the specified model
    encoding = tiktoken.encoding_for_model(model)
    # Tokenize the text and count the tokens
    token_count = len(encoding.encode(text))
    return token_count



In [None]:
total = 0
for sheet_name, tables in tables_with_metadata.items():
    print(f"Sheet: {sheet_name}")
    text = ""
    print(f"Number of tables {len(tables)}")
    text += f"**Sheet: {sheet_name}**\n"
    for i, table_info in enumerate(tables[:6]):
        # print(f"  Table {i+1}:")
        text += f"  **Table {i+1}:**\n"
        df = pd.DataFrame(table_info["TableData"][:6])

        text += f" {df.to_markdown(index=False)}\n"
        # print("    Metadata:\n")
        for col_meta in table_info["Metadata"]:
            text += f"""
        - **Column Name**: {col_meta["ColumnName"]}
        - **Formula**: {col_meta["Formula"]}
        - **Dependencies**: {", ".join(col_meta["Dependencies"])}
        """
        print(text)
    num_tokens = count_tokens(text)
    print(f"Number of Tokens {num_tokens}")
    total += num_tokens
print(total)

    

In [None]:
count_tokens(pd.DataFrame(tables_with_metadata['OS Receivings Report'][0]['TableData'][:6]).to_markdown(index=False))

Adding Column names to formula

In [None]:
import openpyxl
import re
from openpyxl.utils import column_index_from_string
import numpy as np  # For NaN values

def extract_tables_with_column_names_and_dependencies(file_path):
    """
    Extracts tables with both computed values and metadata, including formulas, column names, and dependencies.
    """
    def extract_table_bounds(sheet):
        """Identify individual tables based on non-empty rows and columns."""
        table_ranges = []
        current_table_start = None
        non_empty_rows = []

        # Identify non-empty rows
        for row_idx, row in enumerate(sheet.iter_rows(values_only=True), start=1):
            if any(cell is not None for cell in row):  # Check if row is not empty
                non_empty_rows.append(row_idx)
                if current_table_start is None:
                    current_table_start = row_idx
            elif current_table_start is not None:
                # End of a table
                table_ranges.append((current_table_start, row_idx - 1))
                current_table_start = None

        # Handle the last table if the sheet ends with data
        if current_table_start is not None:
            table_ranges.append((current_table_start, non_empty_rows[-1]))

        # Identify column ranges for each table
        for start_row, end_row in table_ranges:
            start_col = None
            end_col = None
            for row in sheet.iter_rows(min_row=start_row, max_row=end_row, values_only=True):
                non_empty_cols = [idx for idx, cell in enumerate(row, start=1) if cell is not None]
                if non_empty_cols:
                    if start_col is None or min(non_empty_cols) < start_col:
                        start_col = min(non_empty_cols)
                    if end_col is None or max(non_empty_cols) > end_col:
                        end_col = max(non_empty_cols)
            yield (start_row, start_col, end_row, end_col)

    def get_column_name(sheet, start_row, end_row, col_idx):
        """Determine the column name from the first non-None, non-formula cell in the column."""
        for row_idx in range(start_row, end_row + 1):
            cell = sheet.cell(row=row_idx, column=col_idx)
            if cell.data_type != 'f' and cell.value is not None:
                return str(cell.value)
        return np.nan

    def extract_table_data(sheet, start_row, start_col, end_row, end_col):
        """Extract the actual table data."""
        table_data = []
        for row in sheet.iter_rows(min_row=start_row, max_row=end_row, 
                                   min_col=start_col, max_col=end_col, values_only=True):
            table_data.append(list(row))
        return table_data

    def extract_table_metadata(sheet, start_row, start_col, end_row, end_col, workbook):
        """Extract column metadata for the table."""
        column_metadata = []

        for col_idx in range(start_col, end_col + 1):
            column_formula = None
            dependency_sheets = set()

            for row_idx in range(start_row, end_row + 1):
                cell = sheet.cell(row=row_idx, column=col_idx)
                if cell.data_type == 'f' and not column_formula:
                    column_formula = str(cell.value)
                    if "#REF!" in column_formula:
                        continue
                    for token in column_formula.split("!"):
                        for sheetname in workbook.sheetnames:
                            if sheetname in token.strip("="):
                                dependency_sheets.add(sheetname)

            column_name = get_column_name(sheet, start_row, end_row, col_idx)

            column_metadata.append({
                "ColumnName": column_name,
                "Formula": column_formula,
                "Dependencies": list(dependency_sheets),
            })

        return column_metadata

    workbook_values = openpyxl.load_workbook(file_path, data_only=True, read_only=False)
    workbook_formulas = openpyxl.load_workbook(file_path, data_only=False, read_only=False)

    visible_sheet_names = [sheet_name for sheet_name in workbook_values.sheetnames if not workbook_values[sheet_name].sheet_state == 'hidden']

    tables_by_sheet = {}
    for sheet_name in visible_sheet_names:
        sheet_values = workbook_values[sheet_name]
        sheet_formulas = workbook_formulas[sheet_name]
        tables = []

        for start_row, start_col, end_row, end_col in extract_table_bounds(sheet_values):
            # Extract table data
            table_data = extract_table_data(sheet_values, start_row, start_col, end_row, end_col)

            # Extract metadata
            column_metadata = extract_table_metadata(sheet_formulas, start_row, start_col, end_row, end_col, workbook_formulas)

            tables.append({
                "Coordinates": {"StartRow": start_row, "StartCol": start_col, "EndRow": end_row, "EndCol": end_col},
                "TableData": table_data,  # Store table data here
                "Metadata": column_metadata,
            })

        tables_by_sheet[sheet_name] = tables

    return tables_by_sheet

def create_cell_to_column_map(tables_with_metadata):
    """Create a global mapping of cells to column names."""
    cell_to_column_map = {}
    for sheet_name, tables in tables_with_metadata.items():
        for table in tables:
            coords = table["Coordinates"]
            metadata = table["Metadata"]
            for col_idx, meta in enumerate(metadata, start=coords["StartCol"]):
                for row_idx in range(coords["StartRow"], coords["EndRow"] + 1):
                    cell_to_column_map[(sheet_name, row_idx, col_idx)] = meta["ColumnName"]
    return cell_to_column_map

def enhance_formula_with_column_names(formula, cell_to_column_map, current_sheet_name):
    """Enhance formulas with column names, handling cross-sheet references."""
    cell_reference_pattern = re.compile(r"(?:(\w+)\!)?([A-Z]+)(\d+)")

    def replace_reference(match):
        sheet_name = match.group(1) or current_sheet_name
        col_letter = match.group(2)
        row_number = int(match.group(3))

        try:
            # Convert column letter to index
            col_index = column_index_from_string(col_letter)
        except ValueError:
            # If the column letter is invalid, leave it as-is
            return match.group(0)

        # Lookup in the precomputed map
        col_name = cell_to_column_map.get((sheet_name, row_number, col_index), "Unknown")

        return f"{match.group(0)}({col_name})"

    return cell_reference_pattern.sub(replace_reference, formula)

# Load and process the spreadsheet
file_path = '/Users/ajay/Documents/Atomic/inventory_analysis/Data/Company 1 - Inventory Planning.xlsx'
file_path = '/Users/ajay/Documents/Atomic/inventory_analysis/Data/Company 2 - Supply Management.xlsx'
file_path = '/Users/ajay/Documents/Atomic/inventory_analysis/Data/Company 3 - Inventory Dashboard _V2.xlsx'
tables_with_metadata = extract_tables_with_column_names_and_dependencies(file_path)
cell_to_column_map = create_cell_to_column_map(tables_with_metadata)

# Enhance formulas in all metadata
for sheet_name, tables in tables_with_metadata.items():
    for table in tables:
        for col_meta in table["Metadata"]:
            if col_meta["Formula"]:
                col_meta["EnhancedFormula"] = enhance_formula_with_column_names(
                    col_meta["Formula"], cell_to_column_map, sheet_name
                )



In [None]:
import pandas as pd

# Display the extracted tables and metadata
for sheet_name, tables in tables_with_metadata.items():
    print(f"Sheet: {sheet_name}\n{'-' * 40}")
    
    for i, table_info in enumerate(tables):
        print(f"Sheet: {sheet_name}\n{'-' * 40}")
        print(f"  Table {i + 1}:\n{'-' * 20}")
        
        # Safely extract and display table data if it exists
        table_data = table_info.get("TableData", None)
        if table_data:
            print("    Table Data (First 6 Rows):")
            df = pd.DataFrame(table_data)
            display(df.head(6))  # Show the first 6 rows as a DataFrame
        else:
            print("    Table Data: Not available")
        
        # Safely extract and display metadata line by line if it exists
        metadata = table_info.get("Metadata", None)
        if metadata:
            print("    Metadata:")
            for col_meta in metadata:
                print(f"      {col_meta}")
        else:
            print("    Metadata: Not available")
        
        print("\n")


In [None]:
import pandas as pd

total = 0
for sheet_name, tables in tables_with_metadata.items():
    print(f"Sheet: {sheet_name}")
    text = ""
    print(f"Number of tables: {len(tables)}")
    text += f"**Sheet: {sheet_name}**\n"
    
    for i, table_info in enumerate(tables[:6]):  # Limit to the first 6 tables
        text += f"  **Table {i+1}:**\n"
        
        # Safely handle TableData
        table_data = table_info.get("TableData", None)
        if table_data:
            df = pd.DataFrame(table_data[:6])  # Convert the first 6 rows to a DataFrame
            text += f"{df.to_markdown(index=False)}\n"  # Convert DataFrame to markdown
        else:
            text += "    Table Data: Not available\n"

        # Safely handle Metadata
        metadata = table_info.get("Metadata", [])
        if metadata:
            for col_meta in metadata:
                enhanced_formula = col_meta.get("EnhancedFormula", "None")
                dependencies = col_meta.get("Dependencies", [])
                text += f"""
        - **Column Name**: {col_meta.get("ColumnName", "Unknown")}
        - **Formula**: {enhanced_formula}
        - **Dependencies**: {", ".join(dependencies) if dependencies else "None"}
                """
        else:
            text += "    Metadata: Not available\n"

        # Print the text for debugging
        print(text)

    # Calculate tokens in the text
    num_tokens = count_tokens(text)
    print(f"Number of Tokens: {num_tokens}")
    total += num_tokens

# Print the total token count
print(f"Total Tokens: {total}")


Added column combine

In [187]:
import openpyxl
import re
from openpyxl.utils import column_index_from_string
import numpy as np  # For NaN values

def extract_tables_with_column_names_and_dependencies(file_path):
    """
    Extracts tables with both computed values and metadata, including formulas, column names, and dependencies.
    """
    def extract_table_bounds(sheet):
        """Identify individual tables and combine those with the same column range."""
        table_ranges = []
        current_table_start = None
        non_empty_rows = []

        # Identify non-empty rows
        for row_idx, row in enumerate(sheet.iter_rows(values_only=True), start=1):
            if any(cell is not None for cell in row):  # Check if row is not empty
                non_empty_rows.append(row_idx)
                if current_table_start is None:
                    current_table_start = row_idx
            elif current_table_start is not None:
                # End of a table
                table_ranges.append((current_table_start, row_idx - 1))
                current_table_start = None

        # Handle the last table if the sheet ends with data
        if current_table_start is not None:
            table_ranges.append((current_table_start, non_empty_rows[-1]))

        # Identify column ranges for each table and combine overlapping ones
        combined_tables = []
        for start_row, end_row in table_ranges:
            start_col = None
            end_col = None
            for row in sheet.iter_rows(min_row=start_row, max_row=end_row, values_only=True):
                non_empty_cols = [idx for idx, cell in enumerate(row, start=1) if cell is not None]
                if non_empty_cols:
                    if start_col is None or min(non_empty_cols) < start_col:
                        start_col = min(non_empty_cols)
                    if end_col is None or max(non_empty_cols) > end_col:
                        end_col = max(non_empty_cols)

            # Combine tables with the same column range
            if combined_tables and combined_tables[-1][1] + 1 >= start_row and \
                    combined_tables[-1][2] == start_col and combined_tables[-1][3] == end_col:
                # Extend the row range of the previous table
                combined_tables[-1] = (combined_tables[-1][0], end_row, start_col, end_col)
            else:
                # Add a new table range
                combined_tables.append((start_row, end_row, start_col, end_col))

        for start_row, end_row, start_col, end_col in combined_tables:
            yield (start_row, start_col, end_row, end_col)

    def get_column_name(sheet, start_row, end_row, col_idx):
        """Determine the column name from the first non-None, non-formula cell in the column."""
        for row_idx in range(start_row, end_row + 1):
            cell = sheet.cell(row=row_idx, column=col_idx)
            if cell.data_type != 'f' and cell.value is not None:
                return str(cell.value)
        return np.nan

    def extract_table_data(sheet, start_row, start_col, end_row, end_col):
        """Extract the actual table data."""
        table_data = []
        for row in sheet.iter_rows(min_row=start_row, max_row=end_row, 
                                   min_col=start_col, max_col=end_col, values_only=True):
            table_data.append(list(row))
        return table_data

    def extract_table_metadata(sheet, start_row, start_col, end_row, end_col, workbook):
        """Extract column metadata for the table."""
        column_metadata = []

        for col_idx in range(start_col, end_col + 1):
            column_formula = None
            dependency_sheets = set()

            for row_idx in range(start_row, end_row + 1):
                cell = sheet.cell(row=row_idx, column=col_idx)
                if cell.data_type == 'f' and not column_formula:
                    column_formula = str(cell.value)
                    if "#REF!" in column_formula:
                        continue
                    for token in column_formula.split("!"):
                        for sheetname in workbook.sheetnames:
                            if sheetname in token.strip("="):
                                dependency_sheets.add(sheetname)

            column_name = get_column_name(sheet, start_row, end_row, col_idx)

            column_metadata.append({
                "ColumnName": column_name,
                "Formula": column_formula,
                "Dependencies": list(dependency_sheets),
            })

        return column_metadata

    workbook_values = openpyxl.load_workbook(file_path, data_only=True, read_only=False)
    workbook_formulas = openpyxl.load_workbook(file_path, data_only=False, read_only=False)

    visible_sheet_names = [sheet_name for sheet_name in workbook_values.sheetnames if not workbook_values[sheet_name].sheet_state == 'hidden']

    tables_by_sheet = {}
    for sheet_name in visible_sheet_names:
        sheet_values = workbook_values[sheet_name]
        sheet_formulas = workbook_formulas[sheet_name]
        tables = []

        for start_row, start_col, end_row, end_col in extract_table_bounds(sheet_values):
            # Extract table data
            table_data = extract_table_data(sheet_values, start_row, start_col, end_row, end_col)

            # Extract metadata
            column_metadata = extract_table_metadata(sheet_formulas, start_row, start_col, end_row, end_col, workbook_formulas)

            tables.append({
                "Coordinates": {"StartRow": start_row, "StartCol": start_col, "EndRow": end_row, "EndCol": end_col},
                "TableData": table_data,  # Store table data here
                "Metadata": column_metadata,
            })

        tables_by_sheet[sheet_name] = tables

    return tables_by_sheet

def create_cell_to_column_map(tables_with_metadata):
    """Create a global mapping of cells to column names."""
    cell_to_column_map = {}
    for sheet_name, tables in tables_with_metadata.items():
        for table in tables:
            coords = table["Coordinates"]
            metadata = table["Metadata"]
            for col_idx, meta in enumerate(metadata, start=coords["StartCol"]):
                for row_idx in range(coords["StartRow"], coords["EndRow"] + 1):
                    cell_to_column_map[(sheet_name, row_idx, col_idx)] = meta["ColumnName"]
    return cell_to_column_map

def enhance_formula_with_column_names(formula, cell_to_column_map, current_sheet_name):
    """Enhance formulas with column names, handling cross-sheet references."""
    cell_reference_pattern = re.compile(r"(?:(\w+)\!)?([A-Z]+)(\d+)")

    def replace_reference(match):
        sheet_name = match.group(1) or current_sheet_name
        col_letter = match.group(2)
        row_number = int(match.group(3))

        try:
            # Convert column letter to index
            col_index = column_index_from_string(col_letter)
        except ValueError:
            # If the column letter is invalid, leave it as-is
            return match.group(0)

        # Lookup in the precomputed map
        col_name = cell_to_column_map.get((sheet_name, row_number, col_index), "Unknown")

        return f"{match.group(0)}({col_name})"

    return cell_reference_pattern.sub(replace_reference, formula)

# Load and process the spreadsheet
file_path = '/Users/ajay/Documents/Atomic/inventory_analysis/Data/Company 1 - Inventory Planning.xlsx'
# file_path = '/Users/ajay/Documents/Atomic/inventory_analysis/Data/Company 2 - Supply Management.xlsx'
# file_path = '/Users/ajay/Documents/Atomic/inventory_analysis/Data/Company 3 - Inventory Dashboard _V2.xlsx'
tables_with_metadata = extract_tables_with_column_names_and_dependencies(file_path)
cell_to_column_map = create_cell_to_column_map(tables_with_metadata)

# Enhance formulas in all metadata
for sheet_name, tables in tables_with_metadata.items():
    for table in tables:
        for col_meta in table["Metadata"]:
            if col_meta["Formula"]:
                col_meta["EnhancedFormula"] = enhance_formula_with_column_names(
                    col_meta["Formula"], cell_to_column_map, sheet_name
                )


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


In [None]:
import pandas as pd

# Display the extracted tables and metadata
for sheet_name, tables in tables_with_metadata.items():
    print(f"Sheet: {sheet_name}\n{'-' * 40}")
    
    for i, table_info in enumerate(tables):
        print(f"Sheet: {sheet_name}\n{'-' * 40}")
        print(f"  Table {i + 1}:\n{'-' * 20}")
        
        # Safely extract and display table data if it exists
        table_data = table_info.get("TableData", None)
        if table_data:
            print("    Table Data (First 6 Rows):")
            df = pd.DataFrame(table_data)
            display(df.head(6))  # Show the first 6 rows as a DataFrame
        else:
            print("    Table Data: Not available")
        
        # Safely extract and display metadata line by line if it exists
        metadata = table_info.get("Metadata", None)
        if metadata:
            print("    Metadata:")
            for col_meta in metadata:
                print(f"      {col_meta}")
        else:
            print("    Metadata: Not available")
        
        print("\n")


In [188]:
import pandas as pd

total = 0
texts = []
for sheet_name, tables in tables_with_metadata.items():
    print(f"Sheet: {sheet_name}")
    text = ""
    print(f"Number of tables: {len(tables)}")
    text += f"**Sheet: {sheet_name}**\n"
    
    for i, table_info in enumerate(tables[:6]):  # Limit to the first 6 tables
        text += f"  **Table {i+1}:**\n"
        
        # Safely handle TableData
        table_data = table_info.get("TableData", None)
        if table_data:
            df = pd.DataFrame(table_data[:6])  # Convert the first 6 rows to a DataFrame
            text += f"{df.to_markdown(index=False)}\n"  # Convert DataFrame to markdown
        else:
            text += "    Table Data: Not available\n"

        # Safely handle Metadata
        metadata = table_info.get("Metadata", [])
        if metadata:
            for col_meta in metadata:
                enhanced_formula = col_meta.get("EnhancedFormula", "None")
                dependencies = col_meta.get("Dependencies", [])
                text += f"""
        - **Column Name**: {col_meta.get("ColumnName", "Unknown")}
        - **Formula**: {enhanced_formula}
        - **Dependencies**: {", ".join(dependencies) if dependencies else "None"}
                """
        else:
            text += "    Metadata: Not available\n"

        # Print the text for debugging
        # print(text)
        texts.append(text)
    # Calculate tokens in the text
    num_tokens = count_tokens(text)
    print(f"Number of Tokens: {num_tokens}")
    total += num_tokens

# Print the total token count
print(f"Total Tokens: {total}")


Sheet: INVENTORY SNAPSHOT
Number of tables: 7
Number of Tokens: 3650
Sheet: Sheet24
Number of tables: 1
Number of Tokens: 424
Sheet: Current Inventory
Number of tables: 5
Number of Tokens: 2003
Sheet: INPUT -> DOT Inventory Report
Number of tables: 2
Number of Tokens: 4625
Sheet: Inventory Detail
Number of tables: 1
Number of Tokens: 1460
Sheet: True Up Template
Number of tables: 2
Number of Tokens: 3447
Sheet: Copy of Inventory Detail
Number of tables: 1
Number of Tokens: 1334
Sheet: Production by Month
Number of tables: 0
Number of Tokens: 7
Sheet: Delivery by Month
Number of tables: 10
Number of Tokens: 1592
Sheet: Inventory - Monthly NEW
Number of tables: 1
Number of Tokens: 730
Sheet: Item List
Number of tables: 1
Number of Tokens: 121
Sheet: Invoice Tracker(TK)
Number of tables: 1
Number of Tokens: 638
Total Tokens: 20031


In [194]:
print(texts[-3])

**Sheet: Inventory - Monthly NEW**
  **Table 1:**
| 0                                                       | 1    | 2          | 3                   | 4                   | 5                   | 6                   | 7                   |
|:--------------------------------------------------------|:-----|:-----------|:--------------------|:--------------------|:--------------------|:--------------------|:--------------------|
| DOES NOT INCLUDE PRODUCTION OR DELIVERY FROM RITE STUFF |      |            |                     |                     |                     | NaT                 |                     |
| Description                                             |      |            | 2024-01-01 00:00:00 | 2024-02-01 00:00:00 | 2024-03-01 00:00:00 | 2024-04-01 00:00:00 | 2024-04-30 00:00:00 |
| CP Big Mozz Sticks                                      | #N/A | Starting   | 130                 | 1559.0              | 1.0                 | NaT                 | 1224.0              |
