In [1]:
import os
import glob
import pandas as pd
import numpy as np
import re

In [None]:
def load_and_split_input(input_directory, n_chunks=10):
    files = glob.glob(os.path.join(input_directory, "*.xlsx"))
    if not files:
        raise FileNotFoundError("No input XLSX files found in the directory.")
    file_path = files[0]
    # Skip the first two rows when reading the file
    df = pd.read_excel(file_path, skiprows=2)
    # For mapping, use the top 10 rows (starting from row 3 of the original file)
    df_for_mapping = df.head(10)
    chunks = np.array_split(df_for_mapping, n_chunks)
    print(f"Loaded input file '{file_path}' with {len(df)} rows (skipped first 2 rows), and using top 10 rows for mapping split into {n_chunks} chunks.")
    return df, chunks

full_input_df, input_chunks = load_and_split_input("input", n_chunks=10)
print("Full Input DataFrame (first 10 rows for mapping):")
display(full_input_df.head(10))
# display(full_input_df.iloc[1])
# display(input_chunks[0])
# display(full_input_df['Ledger'])



Loaded input file 'input/main_input.xlsx' with 48 rows (skipped first 2 rows), and using top 10 rows for mapping split into 10 chunks.
Full Input DataFrame (first 10 rows for mapping):


  return bound(*args, **kwds)


S.No                                                                                     2
Ledger                                                                              ‭0138‬
Customer name                                     0270 - EATON INDIA INNOVATION CENTER LLP
GSTIN/UIN                                                                  27AAFFE4333D1ZT
Place of supply                                                                Maharashtra
Invoice  number                                                                 2401382261
Tax invoice number                                                                     NaN
Invoice date                                                           2025-02-01 00:00:00
GL Date                                                                2025-02-01 00:00:00
Currency                                                                               INR
Currency Rate                                                                          NaN

In [3]:
def load_seed_files(directory_path):
    seed_files = glob.glob(os.path.join(directory_path, "*.csv"))
    seeds = {}
    for file in seed_files:
        # Read all columns as strings to preserve leading zeros etc.
        df = pd.read_csv(file, skipinitialspace=True, dtype=str)
        filename = os.path.splitext(os.path.basename(file))[0]
        seeds[filename] = df
        print(f"Loaded seed file: '{filename}' with {len(df)} rows.")
    return seeds

seeds = load_seed_files("seeds")
# print(seeds)
print("Seed files loaded:", list(seeds.keys()))
for name, df in seeds.items():
    print(f"Seed File: '{name}'")
    display(df)


Loaded seed file: 'master_sez' with 8 rows.
Loaded seed file: 'master' with 10 rows.
Seed files loaded: ['master_sez', 'master']
Seed File: 'master_sez'


Unnamed: 0,Party Name,SEZ Flag,Party Number,GSTIN Number
0,0231 - EATON TECHNOLOGIES PRIVATE LIMITED,Yes,115723,27AABCE4323Q1ZE
1,0238 - EATON TECHNOLOGIES PRIVATE LIMITED,Yes,125933,27AABCE4323Q1ZE
2,0255 - EATON TECHNOLOGIES PRIVATE LIMITED,Yes,126293,27AABCE4323Q1ZE
3,0269 - EATON TECHNOLOGIES PRIVATE LIMITED,Yes,120229,27AABCE4323Q1ZE
4,0270 - EATON INDIA INNOVATION CENTER LLP,Yes,115967,27AAFFE4333D1ZT
5,5020 - EATON INDUSTRIAL PRODUCTS PRIVATE LIMITED,Yes,1127114,29AAACI7539R2ZO
6,DANFOSS TECHNOLOGIES PRIVATE LTD (9178),Yes,1468599,27AAHCD8246K1Z7
7,GENPACT INDIA PRIVATE LIMITED,Yes,1658096,27AABCE4461B3Z0


Seed File: 'master'


Unnamed: 0,Site,GSTIN
0,270,27AAFFE4333D1ZT
1,255,27AABCE4323Q1ZE
2,269,27AABCE4323Q1ZE
3,231,27AABCE4323Q1ZE
4,138,27AABCE4323Q2ZD
5,225,27AAGFE3119C1ZW
6,5020,29AAACI7539R2ZO
7,3812,27AAACI7539R1ZT
8,4435,33AAACM2555R1Z6
9,5235,07AADCC8538M1Z7


In [4]:
def find_primary_keys_fixed_length(df):
    primary_keys = []
    # Use the full DataFrame instead of a sample
    df_full = df
    for col in df_full.columns:
        s = df_full[col].dropna().astype(str)
        if s.empty:
            continue
        lengths = s.apply(len).unique()
        # Check if all non-null values have the same length and are unique
        if len(lengths) == 1 and len(s) == len(s.unique()):
            primary_keys.append((col, lengths[0]))
            print(f"Primary key candidate: '{col}' with fixed length {lengths[0]}")
    return primary_keys


# Compute fixed-length primary key candidates for all seed files
fixed_length_primary_keys = {}
for seed_name, seed_df in seeds.items():
    print(f"\nSeed File: '{seed_name}'")
    pk_candidates = find_primary_keys_fixed_length(seed_df)
    fixed_length_primary_keys[seed_name] = pk_candidates
    print("Primary Key Candidates:", pk_candidates)



Seed File: 'master_sez'
Primary key candidate: 'Party Name                                      ' with fixed length 48
Primary Key Candidates: [('Party Name                                      ', 48)]

Seed File: 'master'
Primary key candidate: 'Site' with fixed length 4
Primary Key Candidates: [('Site', 4)]


In [5]:
def map_fixed_length_columns(input_sample, seed_column, seed_length):
    """
    Iterate over all columns in the input sample and return all columns
    that have a fixed length equal to the provided seed column's fixed length.
    """
    matches = []
    for col in input_sample.columns:
        s = input_sample[col].dropna().astype(str)
        if s.empty:
            print(f"Input Column '{col}' is empty. Skipping.")
            continue
        
        unique_lengths = s.apply(lambda x: len(re.sub(r'[^a-zA-Z0-9]', '', x))).unique()
        if len(unique_lengths) == 1:
            if unique_lengths[0] == seed_length:
                print(f"Matched Input Column '{col}' (Length {unique_lengths[0]}) for Seed Column '{seed_column}'")
                matches.append(col)
            else:
                print(f"Input Column '{col}' has fixed length {unique_lengths[0]}, but does not match Seed Column '{seed_column}' (Length {seed_length})")
        else:
            print(f"Input Column '{col}' has varying lengths: {unique_lengths}. Skipping.")
    
    if not matches:
        print(f"No input columns matched for Seed Column '{seed_column}' with fixed length {seed_length}.")
    return matches


In [None]:
# Step 1: Find all primary keys (columns with unique values)
def find_primary_keys(df, sample_size=10):
    """Find all columns that could be primary keys (have unique values)"""
    if sample_size is not None and len(df) > sample_size:
        df_sample = df.head(sample_size)
    else:
        df_sample = df
    
    primary_keys = []
    for col in df_sample.columns:
        s = df_sample[col].dropna()
        if s.empty:
            continue
        if len(s) == len(s.unique()):
            primary_keys.append(col)
            print(f"Primary key candidate: '{col}'")
    return primary_keys

# Step 2: Check which primary keys have fixed length
def check_fixed_length_keys(df, primary_keys, sample_size=10):
    """Check which primary keys have fixed length"""
    if sample_size is not None and len(df) > sample_size:
        df_sample = df.head(sample_size)
    else:
        df_sample = df
        
    fixed_length_keys = []
    for col in primary_keys:
        s = df_sample[col].dropna().astype(str)
        if s.empty:
            continue
        lengths = s.apply(len).unique()
        if len(lengths) == 1:
            fixed_length_keys.append((col, lengths[0]))
            print(f"Fixed-length primary key: '{col}' with length {lengths[0]}")
    return fixed_length_keys

# Test on each seed file:
print("STEP 1: Extracting all primary keys from seed files...")
all_primary_keys = {}
for name, df in seeds.items():
    print(f"\nSeed File: '{name}'")
    pk_candidates = find_primary_keys(df)
    all_primary_keys[name] = pk_candidates
    print(f"All Primary Key Candidates: {pk_candidates}")

print("\nSTEP 2: Checking for fixed-length primary keys...")
fixed_length_primary_keys = {}
for name, df in seeds.items():
    print(f"\nSeed File: '{name}'")
    fixed_pk = check_fixed_length_keys(df, all_primary_keys[name])
    fixed_length_primary_keys[name] = fixed_pk
    print(f"Fixed-Length Primary Keys: {fixed_pk}")

In [None]:
# Step 1: Find all primary keys (columns with unique values)
def find_primary_keys(df, sample_size=10):
    """Find all columns that could be primary keys (have unique values)"""
    if sample_size is not None and len(df) > sample_size:
        df_sample = df.head(sample_size)
    else:
        df_sample = df
    
    primary_keys = []
    for col in df_sample.columns:
        s = df_sample[col].dropna()
        if s.empty:
            continue
        if len(s) == len(s.unique()):
            primary_keys.append(col)
            print(f"Primary key candidate: '{col}'")
    return primary_keys

# Step 2: Check which primary keys have fixed length
def check_fixed_length_keys(df, primary_keys, sample_size=10):
    """Check which primary keys have fixed length"""
    if sample_size is not None and len(df) > sample_size:
        df_sample = df.head(sample_size)
    else:
        df_sample = df
        
    fixed_length_keys = []
    for col in primary_keys:
        s = df_sample[col].dropna().astype(str)
        if s.empty:
            continue
        lengths = s.apply(len).unique()
        if len(lengths) == 1:
            fixed_length_keys.append((col, lengths[0]))
            print(f"Fixed-length primary key: '{col}' with length {lengths[0]}")
    return fixed_length_keys

# Test on each seed file:
print("STEP 1: Extracting all primary keys from seed files...")
all_primary_keys = {}
for name, df in seeds.items():
    print(f"\nSeed File: '{name}'")
    pk_candidates = find_primary_keys(df)
    all_primary_keys[name] = pk_candidates
    print(f"All Primary Key Candidates: {pk_candidates}")

print("\nSTEP 2: Checking for fixed-length primary keys...")
fixed_length_primary_keys = {}
for name, df in seeds.items():
    print(f"\nSeed File: '{name}'")
    fixed_pk = check_fixed_length_keys(df, all_primary_keys[name])
    fixed_length_primary_keys[name] = fixed_pk
    print(f"Fixed-Length Primary Keys: {fixed_pk}")

In [None]:
# Step 1: Find all primary keys (columns with unique values)
def find_primary_keys(df, sample_size=10):
    """Find all columns that could be primary keys (have unique values)"""
    if sample_size is not None and len(df) > sample_size:
        df_sample = df.head(sample_size)
    else:
        df_sample = df
    
    primary_keys = []
    for col in df_sample.columns:
        s = df_sample[col].dropna()
        if s.empty:
            continue
        if len(s) == len(s.unique()):
            primary_keys.append(col)
            print(f"Primary key candidate: '{col}'")
    return primary_keys

# Step 2: Check which primary keys have fixed length
def check_fixed_length_keys(df, primary_keys, sample_size=10):
    """Check which primary keys have fixed length"""
    if sample_size is not None and len(df) > sample_size:
        df_sample = df.head(sample_size)
    else:
        df_sample = df
        
    fixed_length_keys = []
    for col in primary_keys:
        s = df_sample[col].dropna().astype(str)
        if s.empty:
            continue
        lengths = s.apply(len).unique()
        if len(lengths) == 1:
            fixed_length_keys.append((col, lengths[0]))
            print(f"Fixed-length primary key: '{col}' with length {lengths[0]}")
    return fixed_length_keys

# Test on each seed file:
print("STEP 1: Extracting all primary keys from seed files...")
all_primary_keys = {}
for name, df in seeds.items():
    print(f"\nSeed File: '{name}'")
    pk_candidates = find_primary_keys(df)
    all_primary_keys[name] = pk_candidates
    print(f"All Primary Key Candidates: {pk_candidates}")

print("\nSTEP 2: Checking for fixed-length primary keys...")
fixed_length_primary_keys = {}
for name, df in seeds.items():
    print(f"\nSeed File: '{name}'")
    fixed_pk = check_fixed_length_keys(df, all_primary_keys[name])
    fixed_length_primary_keys[name] = fixed_pk
    print(f"Fixed-Length Primary Keys: {fixed_pk}")

In [None]:
# Step 1: Find all primary keys (columns with unique values)
def find_primary_keys(df, sample_size=10):
    """Find all columns that could be primary keys (have unique values)"""
    if sample_size is not None and len(df) > sample_size:
        df_sample = df.head(sample_size)
    else:
        df_sample = df
    
    primary_keys = []
    for col in df_sample.columns:
        s = df_sample[col].dropna()
        if s.empty:
            continue
        if len(s) == len(s.unique()):
            primary_keys.append(col)
            print(f"Primary key candidate: '{col}'")
    return primary_keys

# Step 2: Check which primary keys have fixed length
def check_fixed_length_keys(df, primary_keys, sample_size=10):
    """Check which primary keys have fixed length"""
    if sample_size is not None and len(df) > sample_size:
        df_sample = df.head(sample_size)
    else:
        df_sample = df
        
    fixed_length_keys = []
    for col in primary_keys:
        s = df_sample[col].dropna().astype(str)
        if s.empty:
            continue
        lengths = s.apply(len).unique()
        if len(lengths) == 1:
            fixed_length_keys.append((col, lengths[0]))
            print(f"Fixed-length primary key: '{col}' with length {lengths[0]}")
    return fixed_length_keys

# Test on each seed file:
print("STEP 1: Extracting all primary keys from seed files...")
all_primary_keys = {}
for name, df in seeds.items():
    print(f"\nSeed File: '{name}'")
    pk_candidates = find_primary_keys(df)
    all_primary_keys[name] = pk_candidates
    print(f"All Primary Key Candidates: {pk_candidates}")

print("\nSTEP 2: Checking for fixed-length primary keys...")
fixed_length_primary_keys = {}
for name, df in seeds.items():
    print(f"\nSeed File: '{name}'")
    fixed_pk = check_fixed_length_keys(df, all_primary_keys[name])
    fixed_length_primary_keys[name] = fixed_pk
    print(f"Fixed-Length Primary Keys: {fixed_pk}")

In [None]:
# Step 1: Find all primary keys (columns with unique values)
def find_primary_keys(df, sample_size=10):
    """Find all columns that could be primary keys (have unique values)"""
    if sample_size is not None and len(df) > sample_size:
        df_sample = df.head(sample_size)
    else:
        df_sample = df
    
    primary_keys = []
    for col in df_sample.columns:
        s = df_sample[col].dropna()
        if s.empty:
            continue
        if len(s) == len(s.unique()):
            primary_keys.append(col)
            print(f"Primary key candidate: '{col}'")
    return primary_keys

# Step 2: Check which primary keys have fixed length
def check_fixed_length_keys(df, primary_keys, sample_size=10):
    """Check which primary keys have fixed length"""
    if sample_size is not None and len(df) > sample_size:
        df_sample = df.head(sample_size)
    else:
        df_sample = df
        
    fixed_length_keys = []
    for col in primary_keys:
        s = df_sample[col].dropna().astype(str)
        if s.empty:
            continue
        lengths = s.apply(len).unique()
        if len(lengths) == 1:
            fixed_length_keys.append((col, lengths[0]))
            print(f"Fixed-length primary key: '{col}' with length {lengths[0]}")
    return fixed_length_keys

# Test on each seed file:
print("STEP 1: Extracting all primary keys from seed files...")
all_primary_keys = {}
for name, df in seeds.items():
    print(f"\nSeed File: '{name}'")
    pk_candidates = find_primary_keys(df)
    all_primary_keys[name] = pk_candidates
    print(f"All Primary Key Candidates: {pk_candidates}")

print("\nSTEP 2: Checking for fixed-length primary keys...")
fixed_length_primary_keys = {}
for name, df in seeds.items():
    print(f"\nSeed File: '{name}'")
    fixed_pk = check_fixed_length_keys(df, all_primary_keys[name])
    fixed_length_primary_keys[name] = fixed_pk
    print(f"Fixed-Length Primary Keys: {fixed_pk}")

In [6]:
def process_seed_file_mapping(input_sample, seed_df, seed_name, fixed_length_keys):
    """Process mapping between seed file columns and the input sample.
    
    For each fixed-length primary key candidate from the seed file,
    check the input sample (top 10 rows) for all input columns with matching fixed length.
    """
    print(f"STEP 4: Processing mapping for seed file '{seed_name}'")
    
    mappings = []
    if not fixed_length_keys:
        print(f"No fixed-length primary key candidates found in seed file '{seed_name}'.")
        return mappings
    
    for pk_col, pk_length in fixed_length_keys:
        print(f"\nProcessing seed file '{seed_name}': Candidate '{pk_col}' with fixed length {pk_length}")
        # Check the whole input sample (full_input_df.head(10)) for matching columns
        current_matches = map_fixed_length_columns(input_sample, pk_col, pk_length)
        if current_matches:
            mappings.append({
                'seed_file': seed_name,
                'seed_column': pk_col,
                'fixed_length': pk_length,
                'input_columns': current_matches
            })
            print(f"Candidate '{pk_col}' maps to input column(s): {current_matches}")
        else:
            print(f"No matching input column found for seed candidate '{pk_col}' with fixed length {pk_length}")
    return mappings


In [7]:
all_mappings = {}
for seed_name, seed_df in seeds.items():
    print(f"\n=== Processing seed file: '{seed_name}' ===")
    mapping_for_seed = process_seed_file_mapping(full_input_df.head(10), seed_df, seed_name,
                                                   fixed_length_primary_keys.get(seed_name, []))
    all_mappings[seed_name] = mapping_for_seed

print("\n=== Final Mapping Results ===")
for seed_name, mapping_list in all_mappings.items():
    for mapping in mapping_list:
        input_cols_str = ", ".join(mapping["input_columns"])
        print(f"Seed File '{seed_name}': '{mapping['seed_column']}' -> {input_cols_str} (Fixed Length: {mapping['fixed_length']})")



=== Processing seed file: 'master_sez' ===
STEP 4: Processing mapping for seed file 'master_sez'

Processing seed file 'master_sez': Candidate 'Party Name                                      ' with fixed length 48
Input Column 'S.No' has varying lengths: [1 2]. Skipping.
Input Column 'Ledger' has fixed length 4, but does not match Seed Column 'Party Name                                      ' (Length 48)
Input Column 'Customer name' has varying lengths: [20 33 35 40 30]. Skipping.
Input Column 'GSTIN/UIN' has fixed length 15, but does not match Seed Column 'Party Name                                      ' (Length 48)
Input Column 'Place of supply' has fixed length 11, but does not match Seed Column 'Party Name                                      ' (Length 48)
Input Column 'Invoice  number' has fixed length 10, but does not match Seed Column 'Party Name                                      ' (Length 48)
Input Column 'Tax invoice number' is empty. Skipping.
Input Column 'Invoice date