In [27]:
import os
import glob
import pandas as pd
import numpy as np
import re

In [28]:
def clean_text(text):
    """
    Strip every character that is not on the allow-list.

    Allowed characters are ASCII letters, digits, dot (.), percent (%),
    slash (/), comma (,) and dash (-); everything else (whitespace,
    underscores, invisible formatting marks, ...) is removed.

    Parameters
    ----------
    text : str
        The raw value to clean.

    Returns
    -------
    str
        ``text`` with all disallowed characters deleted.
    """
    # NOTE: '-' must stay first or last inside the character class,
    # otherwise the regex engine reads it as a range and reports a bad range.
    disallowed_chars = r'[^a-zA-Z0-9.%/,-]'
    return re.sub(disallowed_chars, '', text)

def is_numeric(s):
    """
    Tell whether every entry of a string Series consists solely of digits.

    Parameters
    ----------
    s : pandas.Series of str
        Values to test; each entry must match ``\\d+`` in full.

    Returns
    -------
    bool
        True when all entries match (vacuously True for an empty Series).
    """
    def _only_digits(value):
        return re.fullmatch(r'\d+', value) is not None

    return s.apply(_only_digits).all()

def is_decimal(s):
    """
    Tell whether every entry is an unsigned integer or decimal number.

    An entry qualifies when it is one or more digits, optionally followed
    by a dot and one or more digits (e.g. ``"12"`` or ``"12.50"``).

    Parameters
    ----------
    s : pandas.Series of str
        Values to test.

    Returns
    -------
    bool
        True when all entries match (vacuously True for an empty Series).
    """
    def _looks_decimal(value):
        return re.fullmatch(r'\d+(\.\d+)?', value) is not None

    return s.apply(_looks_decimal).all()

def has_special_characters(s):
    """
    Tell whether every entry contains at least one of ``/ - , . %``.

    Parameters
    ----------
    s : pandas.Series of str
        Values to test.

    Returns
    -------
    bool
        True when each entry holds one or more of the special characters
        (vacuously True for an empty Series).
    """
    specials = ['/', '-', ',', '.', '%']

    def _contains_special(value):
        for marker in specials:
            if marker in value:
                return True
        return False

    return s.apply(_contains_special).all()

def is_alphanumeric(s):
    """
    Tell whether every entry is made up only of ASCII letters and digits.

    Parameters
    ----------
    s : pandas.Series of str
        Values to test; each entry must match ``[A-Za-z0-9]+`` in full.

    Returns
    -------
    bool
        True when all entries match (vacuously True for an empty Series).
    """
    def _letters_and_digits_only(value):
        return re.fullmatch(r'[A-Za-z0-9]+', value) is not None

    return s.apply(_letters_and_digits_only).all()

def is_mostly_letters(s, threshold=0.7, min_fraction=0.7):
    """
    Check whether most entries of a string Series are mostly letters.

    An entry counts as "mostly letters" when the share of alphabetic
    characters in it is strictly greater than ``threshold``. The Series as
    a whole qualifies when the share of such entries is strictly greater
    than ``min_fraction``.

    Parameters
    ----------
    s : pandas.Series of str
        Values to inspect.
    threshold : float, default 0.7
        Per-entry cutoff on the letter ratio.
    min_fraction : float, default 0.7
        Cutoff on the fraction of entries that must pass ``threshold``.
        Previously this was a hard-coded 0.7.

    Returns
    -------
    bool
        False for an empty Series; otherwise whether enough entries pass.
    """
    # An empty Series has no meaningful fraction; report False explicitly
    # rather than relying on a NaN mean comparing as False.
    if len(s) == 0:
        return False

    def _entry_is_mostly_letters(value):
        # Empty strings have no letter ratio (0/0); they never qualify.
        if not value:
            return False
        letters = sum(ch.isalpha() for ch in value)
        return letters / len(value) > threshold

    passing = s.apply(_entry_is_mostly_letters)
    return bool(passing.mean() > min_fraction)

def fixed_length(s):
    """
    Tell whether all entries share one length once cleaned with clean_text.

    Parameters
    ----------
    s : pandas.Series of str
        Values to inspect; each is passed through ``clean_text`` first.

    Returns
    -------
    bool
        True when exactly one distinct cleaned length exists
        (False for an empty Series, which has no lengths at all).
    """
    distinct_lengths = s.apply(clean_text).apply(len).unique()
    return len(distinct_lengths) == 1

In [29]:
def load_input_file(input_directory, skiprows=2, sample_rows=10):
    """
    Load the input workbook from a directory and return it with a sample.

    The first ``*.xlsx`` file in alphabetical order is used, so the choice
    is reproducible when several workbooks are present. Excel lock files
    (names starting with ``~$``), which appear while a workbook is open,
    are ignored.

    Parameters
    ----------
    input_directory : str
        Folder to search for ``*.xlsx`` files.
    skiprows : int, default 2
        Number of leading rows to skip, passed to ``pandas.read_excel``.
    sample_rows : int, default 10
        Number of top rows returned as the mapping sample.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The full sheet and its first ``sample_rows`` rows.

    Raises
    ------
    FileNotFoundError
        If the directory holds no usable ``.xlsx`` file.
    """
    # glob returns paths in arbitrary order; sort for a deterministic pick.
    candidates = sorted(
        path
        for path in glob.glob(os.path.join(input_directory, "*.xlsx"))
        if not os.path.basename(path).startswith("~$")
    )
    if not candidates:
        raise FileNotFoundError("No XLSX file found.")

    file_path = candidates[0]
    df = pd.read_excel(file_path, skiprows=skiprows)
    df_for_mapping = df.head(sample_rows)
    return df, df_for_mapping


In [30]:
def load_seed_files(seed_directory):
    """
    Read every ``*.csv`` seed file in a directory into a DataFrame.

    Files are processed in sorted path order so the returned dict, and
    anything that iterates over it, has the same order on every run.
    All columns are read as strings, which preserves leading zeros in
    code-like values.

    Parameters
    ----------
    seed_directory : str
        Folder to search for ``*.csv`` files.

    Returns
    -------
    dict of str -> pandas.DataFrame
        Maps each file's base name (without extension) to its contents.
        Empty when no CSV files are found.
    """
    seeds = {}
    # glob returns paths in arbitrary order; sort for reproducible output.
    for file in sorted(glob.glob(os.path.join(seed_directory, "*.csv"))):
        filename = os.path.splitext(os.path.basename(file))[0]
        seeds[filename] = pd.read_csv(file, skipinitialspace=True, dtype=str)
    return seeds

In [31]:
def find_primary_keys_fixed_length(df):
    """
    List the columns that look like fixed-length primary keys.

    A column qualifies when, after dropping nulls and cleaning each value
    with ``clean_text``, all values have the same length and no value is
    repeated. Each candidate is also announced with ``print``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are examined.

    Returns
    -------
    list of (column label, length)
        One tuple per candidate column, in column order.
    """
    candidates = []
    for col in df.columns:
        non_null = df[col].dropna().astype(str)
        if len(non_null) == 0:
            # Nothing to judge in an all-null column.
            continue

        cleaned = non_null.apply(clean_text)
        lengths = cleaned.apply(len).unique()
        if len(lengths) != 1:
            continue
        # Every cleaned value must be distinct for the column to be a key.
        if len(set(cleaned)) != len(cleaned):
            continue

        candidates.append((col, lengths[0]))
        print(f"Primary key candidate: '{col}' with fixed length {lengths[0]}")
    return candidates

In [32]:
def _column_profile(cleaned):
    """Summarise the character traits of a cleaned, non-empty string Series."""
    is_fixed = fixed_length(cleaned)
    return {
        'is_numeric': is_numeric(cleaned),
        'is_decimal': is_decimal(cleaned),
        'has_special_characters': has_special_characters(cleaned),
        'is_alphanumeric': is_alphanumeric(cleaned),
        'is_mostly_letters': is_mostly_letters(cleaned),
        'is_fixed_length': is_fixed,
        # A single length only exists when the column is fixed-length.
        'length': cleaned.apply(len).unique()[0] if is_fixed else None,
    }


def map_columns_based_on_characteristics(input_sample, seed_col_name, seed_col_values, seed_col_length):
    """
    Find input columns whose value characteristics match a seed column.

    Both the seed column and each input column are reduced to a profile
    (numeric, decimal, special characters, alphanumeric, mostly letters,
    fixed length, and the length itself). An input column matches only
    when every profile entry equals the seed's. Each match is printed.

    Parameters
    ----------
    input_sample : pandas.DataFrame
        Sample rows of the input file; each column is a match candidate.
    seed_col_name : hashable
        Label of the seed column, used only in the printed message.
    seed_col_values : pandas.Series
        Values of the seed column; nulls are dropped before profiling.
    seed_col_length : int
        Not used; the length is recomputed from ``seed_col_values``.
        Kept so existing callers keep working.

    Returns
    -------
    list
        Labels of the matching input columns, in column order. Empty when
        the seed column has no non-null values or is not fixed-length.
    """
    seed_values = seed_col_values.dropna().astype(str)
    if seed_values.empty:
        # No values to profile; previously this raised IndexError.
        return []

    seed_props = _column_profile(seed_values.apply(clean_text))
    if not seed_props['is_fixed_length']:
        # A variable-length seed can never satisfy the length comparison,
        # so skip profiling the input columns.
        return []

    matches = []
    for col in input_sample.columns:
        input_values = input_sample[col].dropna().astype(str)
        if input_values.empty:
            continue

        input_props = _column_profile(input_values.apply(clean_text))
        if all(seed_props[key] == input_props[key] for key in seed_props):
            print(f"Matched Input Column '{col}' for Seed Column '{seed_col_name}' based on all checks.")
            matches.append(col)

    return matches


In [33]:
def process_seed_file_mapping(input_sample, seed_df, seed_name, fixed_length_keys):
    """
    Map each primary-key candidate of one seed file to input columns.

    Parameters
    ----------
    input_sample : pandas.DataFrame
        Sample rows of the input file to search for matching columns.
    seed_df : pandas.DataFrame
        Contents of the seed file.
    seed_name : str
        Name of the seed file, used in messages and in the result.
    fixed_length_keys : list of (column label, length)
        Primary-key candidates previously found for this seed file.

    Returns
    -------
    list of dict
        One entry per candidate with at least one match, holding the keys
        'seed_file', 'seed_column', 'fixed_length' and 'input_columns'.
        Empty when there are no candidates or nothing matched.
    """
    print(f"\nProcessing seed file '{seed_name}'...")
    if not fixed_length_keys:
        print(f"No primary keys found for seed '{seed_name}'.")
        return []

    found = []
    for pk_col, pk_length in fixed_length_keys:
        if pk_col in seed_df.columns:
            matches = map_columns_based_on_characteristics(
                input_sample, pk_col, seed_df[pk_col], pk_length
            )
            if not matches:
                continue
            entry = {
                'seed_file': seed_name,
                'seed_column': pk_col,
                'fixed_length': pk_length,
                'input_columns': matches
            }
            found.append(entry)
        else:
            print(f"Column '{pk_col}' not found in seed '{seed_name}'. Skipping.")
    return found

In [34]:
# Folders scanned for the input workbook and the seed CSV files.
input_directory = "input"
seed_directory = "seeds"

# Load the workbook and preview the sample rows used for column matching.
full_input_df, input_sample = load_input_file(input_directory)
display(input_sample)

seeds = load_seed_files(seed_directory)

# Pass 1: find fixed-length primary-key candidates in every seed file.
fixed_length_primary_keys = {}
for seed_name, seed_df in seeds.items():
    print(f"\nFinding primary keys for seed file '{seed_name}'...")
    fixed_length_primary_keys[seed_name] = find_primary_keys_fixed_length(seed_df)

# Pass 2: match each candidate against the columns of the input sample.
all_mappings = {}
for seed_name, seed_df in seeds.items():
    mappings = process_seed_file_mapping(input_sample, seed_df, seed_name, fixed_length_primary_keys.get(seed_name, []))
    all_mappings[seed_name] = mappings

print("\n=== Final Mapping Results ===")
for seed_name, mapping_list in all_mappings.items():
    for mapping in mapping_list:
        # Column labels read from Excel are not always strings (numeric or
        # date headers), and str.join rejects non-strings, so convert first.
        input_cols_str = ", ".join(map(str, mapping["input_columns"]))
        print(f"Seed File '{seed_name}': '{mapping['seed_column']}' -> {input_cols_str} (Fixed Length: {mapping['fixed_length']})")


Unnamed: 0,S.No,Ledger,Customer name,GSTIN/UIN,Place of supply,Invoice number,Tax invoice number,Invoice date,GL Date,Currency,...,Cess rate,Cess amount,Shipping bill/ Bill of export number,Shipping bill/ Bill of export date,Zero Rated Supplies,Nil Rated Supplies,Exempted (Other than Nil rated/non-GST supply),Non GST supply,Type of Export,E-commerce Operator
0,1,‭0138‬,0003 - EATON CORPORATION,,,2401382241,,2025-02-01,2025-02-01,USD,...,,,,,,Yes,,,Export LUT sales,
1,2,‭0138‬,0270 - EATON INDIA INNOVATION CENTER LLP,27AAFFE4333D1ZT,Maharashtra,2401382261,,2025-02-01,2025-02-01,INR,...,,,,,,Yes,,,Export LUT sales,
2,3,‭0138‬,0231 - EATON TECHNOLOGIES PRIVATE LIMITED,27AABCE4323Q1ZE,Maharashtra,2401382281,,2025-02-12,2025-02-13,INR,...,,,D0138CR2002A02,,,Yes,,,SEZ Supplies under LUT,
3,4,‭0138‬,0255 - EATON TECHNOLOGIES PRIVATE LIMITED,27AABCE4323Q1ZE,Maharashtra,2401382282,,2025-02-12,2025-02-13,INR,...,,,D0138CR2002A03,,,Yes,,,SEZ Supplies under LUT,
4,5,‭0138‬,0255 - EATON TECHNOLOGIES PRIVATE LIMITED,27AABCE4323Q1ZE,Maharashtra,2401382283,,2025-02-12,2025-02-13,INR,...,,,D0138CR2002A04,,,Yes,,,SEZ Supplies under LUT,
5,6,‭0138‬,0269 - EATON TECHNOLOGIES PRIVATE LIMITED,27AABCE4323Q1ZE,Maharashtra,2401382284,,2025-02-12,2025-02-13,INR,...,,,D0138CR2002A05,,,Yes,,,SEZ Supplies under LUT,
6,7,‭0138‬,1092 - EATON INDUSTRIAL SYSTEMS PRIVATE LIMITED,27AABCE4955C1ZT,Maharashtra,2401382285,,2025-02-12,2025-02-13,INR,...,,,D0138CR2002A06,,,,,,SEZ Supplies under LUT,
7,8,‭0138‬,2090 - EATON INDUSTRIAL SYSTEMS PRIVATE LIMITED,27AABCE4955C1ZT,Maharashtra,2401382286,,2025-02-12,2025-02-13,INR,...,,,D0138CR2002A09,,,,,,SEZ Supplies under LUT,
8,9,‭0138‬,3812 - EATON INDUSTRIAL PRODUCTS PVT. LTD,27AAACI7539R1ZT,Maharashtra,2401382287,,2025-02-12,2025-02-13,INR,...,,,D0138CR2002A10,,,,,,SEZ Supplies under LUT,
9,10,‭0138‬,0225 - EATON MANAGEMENT SERVICES LLP,27AAGFE3119C1ZW,Maharashtra,2401382288,,2025-02-12,2025-02-13,INR,...,,,D0138CR2002A11,,,,,,SEZ Supplies under LUT,



Finding primary keys for seed file 'master_sez'...

Finding primary keys for seed file 'master'...
Primary key candidate: 'Site' with fixed length 4

Processing seed file 'master_sez'...
No primary keys found for seed 'master_sez'.

Processing seed file 'master'...
Matched Input Column 'Ledger' for Seed Column 'Site' based on all checks.

=== Final Mapping Results ===
Seed File 'master': 'Site' -> Ledger (Fixed Length: 4)
