In [54]:
import os
import glob
import pandas as pd
import numpy as np
import re
import unicodedata

In [55]:
def clean_text(text):
    """Allow only letters, digits, and ., %, /, ,, -"""
    return re.sub(r'[^a-zA-Z0-9.%/,-]', '', text)

def normalize_text(text):
    """Remove hidden Unicode control characters like \u202d"""
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

def is_numeric(s):
    return s.apply(lambda x: re.fullmatch(r'\d+', x) is not None).all()

def is_decimal(s):
    return s.apply(lambda x: re.fullmatch(r'\d+\.\d+', x) is not None).all()

def has_special_characters(s):
    return s.apply(lambda x: any(c in x for c in ['/', '-', ',', '.', '%'])).all()

def is_alphanumeric(s):
    return s.apply(lambda x: re.fullmatch(r'[A-Za-z0-9]+', x) is not None).all()

def is_mostly_letters(s, threshold=1.0):
    letter_counts = s.apply(lambda x: sum(c.isalpha() for c in x))
    total_counts = s.apply(lambda x: len(x))
    ratios = letter_counts / total_counts
    return (ratios == threshold).mean() > 0.9

def fixed_length(s):
    cleaned = s.apply(clean_text)
    lengths = cleaned.apply(len).unique()
    return len(lengths) == 1

In [None]:
def load_input_file(input_directory):
    files = glob.glob(os.path.join(input_directory, "*.xlsx"))
    if not files:
        raise FileNotFoundError("No XLSX file found.")
    file_path = files[0]
    df = pd.read_excel(file_path, skiprows=2)
    df_for_mapping = df.head(10)
    print("\n=== Input File Loaded ===")
    display(df_for_mapping)
    return df, df_for_mapping



In [57]:
def load_seed_files(seed_directory):
    seed_files = glob.glob(os.path.join(seed_directory, "*.csv"))
    seeds = {}
    for file in seed_files:
        df = pd.read_csv(file, skipinitialspace=True, dtype=str)
        filename = os.path.splitext(os.path.basename(file))[0]
        seeds[filename] = df
    print("\n=== Seed Files Loaded ===")
    for name, df in seeds.items():
        print(f"\nSeed File: {name}")
        display(df)
    return seeds

In [58]:
def find_primary_keys(df):
    primary_keys = []
    for col in df.columns:
        s = df[col].dropna().astype(str).apply(normalize_text)
        if s.empty:
            continue
        cleaned = s.apply(clean_text)
        if len(cleaned) == len(cleaned.unique()):
            primary_keys.append(col)
            print(f"Primary key candidate: '{col}'")
    return primary_keys

In [59]:
def analyze_column_properties(s):
    cleaned = s.dropna().astype(str).apply(lambda x: normalize_text(x.strip()))
    props = {
        'is_numeric': is_numeric(cleaned),
        'is_decimal': is_decimal(cleaned),
        'has_special_characters': has_special_characters(cleaned),
        'is_alphanumeric': is_alphanumeric(cleaned),
        'is_mostly_letters': is_mostly_letters(cleaned),
        'is_fixed_length': fixed_length(cleaned),
        'length': cleaned.apply(lambda x: len(clean_text(x))).unique()
    }
    return props

In [60]:
def map_columns_based_on_properties(input_sample, seed_col_name, seed_col_values):
    seed_props = analyze_column_properties(seed_col_values)

    matches = []
    for input_col in input_sample.columns:
        input_props = analyze_column_properties(input_sample[input_col])

        checks = [
            seed_props['is_numeric'] == input_props['is_numeric'],
            seed_props['is_decimal'] == input_props['is_decimal'],
            seed_props['has_special_characters'] == input_props['has_special_characters'],
            seed_props['is_alphanumeric'] == input_props['is_alphanumeric'],
            seed_props['is_mostly_letters'] == input_props['is_mostly_letters'],
            seed_props['is_fixed_length'] == input_props['is_fixed_length'],
            set(seed_props['length']) == set(input_props['length'])
        ]

        if all(checks):
            print(f"\nMatched: Seed Column '{seed_col_name}' -> Input Column '{input_col}'")
            print(f"Checks Matched: {checks}")
            matches.append(input_col)
    return matches


In [61]:
def process_all_seeds(input_sample, seeds):
    all_mappings = {}

    for seed_name, seed_df in seeds.items():
        print(f"\n=== Processing Seed File: {seed_name} ===")
        primary_keys = find_primary_keys(seed_df)
        all_mappings[seed_name] = []

        for pk in primary_keys:
            matches = map_columns_based_on_properties(input_sample, pk, seed_df[pk])
            if matches:
                all_mappings[seed_name].append({
                    'seed_column': pk,
                    'matched_input_columns': matches
                })
    return all_mappings

In [62]:
def final_verification(all_mappings, seeds, full_input_df):
    final_verified_mappings = {}

    for seed_name, mappings in all_mappings.items():
        final_verified_mappings[seed_name] = []
        for mapping in mappings:
            seed_col_name = mapping['seed_column']
            full_seed_col = seeds[seed_name][seed_col_name].dropna().astype(str).apply(lambda x: normalize_text(x.strip()))

            for input_col_name in mapping['matched_input_columns']:
                if input_col_name not in full_input_df.columns:
                    continue
                full_input_col = full_input_df[input_col_name].dropna().astype(str).apply(lambda x: normalize_text(x.strip()))

                input_unique = set(full_input_col)
                seed_unique = set(full_seed_col)

                if not input_unique:
                    continue

                common_elements = input_unique & seed_unique
                ratio = len(common_elements) / len(input_unique)

                print(f"\nMapping Attempt: '{seed_name}:{seed_col_name}' -> '{input_col_name}'")
                print(f"Input Unique Values ({input_col_name}): {list(input_unique)}")
                print(f"Seed Unique Values ({seed_col_name}): {list(seed_unique)}")
                print(f"Common Values: {list(common_elements)}")
                print(f"Input Unique Count: {len(input_unique)}, Common Count: {len(common_elements)}, Ratio: {ratio:.2f}")

                if ratio >= 0.8:
                    print(" Mapping Accepted based on Common Elements Ratio.")
                    final_verified_mappings[seed_name].append({
                        'seed_column': seed_col_name,
                        'input_column': input_col_name,
                        'ratio': ratio
                    })
                else:
                    print(" Mapping Rejected based on Common Elements Ratio.")

    print("\n=== Final Verified Mappings ===")
    for seed_name, verified_list in final_verified_mappings.items():
        for verified in verified_list:
            print(f"Seed File '{seed_name}': Seed Column '{verified['seed_column']}' -> Input Column '{verified['input_column']}' (Ratio: {verified['ratio']:.2f})")

    return final_verified_mappings

In [None]:
input_directory = "input"
seed_directory = "seeds"

full_input_df, input_sample = load_input_file(input_directory)
seeds = load_seed_files(seed_directory)

all_mappings = process_all_seeds(input_sample, seeds)

final_verified_mappings = final_verification(all_mappings, seeds, full_input_df)


=== Input File Loaded ===


Unnamed: 0,S.No,Ledger,Customer name,GSTIN/UIN,Place of supply,Invoice number,Tax invoice number,Invoice date,GL Date,Currency,...,Cess rate,Cess amount,Shipping bill/ Bill of export number,Shipping bill/ Bill of export date,Zero Rated Supplies,Nil Rated Supplies,Exempted (Other than Nil rated/non-GST supply),Non GST supply,Type of Export,E-commerce Operator
0,1,‭0138‬,0003 - EATON CORPORATION,,,2401382241,,2025-02-01,2025-02-01,USD,...,,,,,,Yes,,,Export LUT sales,
1,2,‭0138‬,0270 - EATON INDIA INNOVATION CENTER LLP,27AAFFE4333D1ZT,Maharashtra,2401382261,,2025-02-01,2025-02-01,INR,...,,,,,,Yes,,,Export LUT sales,
2,3,‭0138‬,0231 - EATON TECHNOLOGIES PRIVATE LIMITED,27AABCE4323Q1ZE,Maharashtra,2401382281,,2025-02-12,2025-02-13,INR,...,,,D0138CR2002A02,,,Yes,,,SEZ Supplies under LUT,
3,4,‭0138‬,0255 - EATON TECHNOLOGIES PRIVATE LIMITED,27AABCE4323Q1ZE,Maharashtra,2401382282,,2025-02-12,2025-02-13,INR,...,,,D0138CR2002A03,,,Yes,,,SEZ Supplies under LUT,
4,5,‭0138‬,0255 - EATON TECHNOLOGIES PRIVATE LIMITED,27AABCE4323Q1ZE,Maharashtra,2401382283,,2025-02-12,2025-02-13,INR,...,,,D0138CR2002A04,,,Yes,,,SEZ Supplies under LUT,
5,6,‭0138‬,0269 - EATON TECHNOLOGIES PRIVATE LIMITED,27AABCE4323Q1ZE,Maharashtra,2401382284,,2025-02-12,2025-02-13,INR,...,,,D0138CR2002A05,,,Yes,,,SEZ Supplies under LUT,
6,7,‭0138‬,1092 - EATON INDUSTRIAL SYSTEMS PRIVATE LIMITED,27AABCE4955C1ZT,Maharashtra,2401382285,,2025-02-12,2025-02-13,INR,...,,,D0138CR2002A06,,,,,,SEZ Supplies under LUT,
7,8,‭0138‬,2090 - EATON INDUSTRIAL SYSTEMS PRIVATE LIMITED,27AABCE4955C1ZT,Maharashtra,2401382286,,2025-02-12,2025-02-13,INR,...,,,D0138CR2002A09,,,,,,SEZ Supplies under LUT,
8,9,‭0138‬,3812 - EATON INDUSTRIAL PRODUCTS PVT. LTD,27AAACI7539R1ZT,Maharashtra,2401382287,,2025-02-12,2025-02-13,INR,...,,,D0138CR2002A10,,,,,,SEZ Supplies under LUT,
9,10,‭0138‬,0225 - EATON MANAGEMENT SERVICES LLP,27AAGFE3119C1ZW,Maharashtra,2401382288,,2025-02-12,2025-02-13,INR,...,,,D0138CR2002A11,,,,,,SEZ Supplies under LUT,



=== Seed Files Loaded ===

Seed File: master_sez


Unnamed: 0,Party Name,SEZ Flag,Party Number,GSTIN Number
0,0231 - EATON TECHNOLOGIES PRIVATE LIMITED,Yes,115723,27AABCE4323Q1ZE
1,0238 - EATON TECHNOLOGIES PRIVATE LIMITED,Yes,125933,27AABCE4323Q1ZE
2,0255 - EATON TECHNOLOGIES PRIVATE LIMITED,Yes,126293,27AABCE4323Q1ZE
3,0269 - EATON TECHNOLOGIES PRIVATE LIMITED,Yes,120229,27AABCE4323Q1ZE
4,0270 - EATON INDIA INNOVATION CENTER LLP,Yes,115967,27AAFFE4333D1ZT
5,5020 - EATON INDUSTRIAL PRODUCTS PRIVATE LIMITED,Yes,1127114,29AAACI7539R2ZO
6,DANFOSS TECHNOLOGIES PRIVATE LTD (9178),Yes,1468599,27AAHCD8246K1Z7
7,GENPACT INDIA PRIVATE LIMITED,Yes,1658096,27AABCE4461B3Z0



Seed File: master


Unnamed: 0,Site,GSTIN
0,270,27AAFFE4333D1ZT
1,255,27AABCE4323Q1ZE
2,269,27AABCE4323Q1ZE
3,231,27AABCE4323Q1ZE
4,138,27AABCE4323Q2ZD
5,225,27AAGFE3119C1ZW
6,5020,29AAACI7539R2ZO
7,3812,27AAACI7539R1ZT
8,4435,33AAACM2555R1Z6
9,5235,07AADCC8538M1Z7



=== Processing Seed File: master_sez ===
Primary key candidate: 'Party Name                                      '
Primary key candidate: 'Party Number'

=== Processing Seed File: master ===
Primary key candidate: 'Site'

Matched: Seed Column 'Site' -> Input Column 'Ledger'
Checks Matched: [True, True, True, True, True, True, True]

Mapping Attempt: 'master:Site' -> 'Ledger'
Input Unique Values (Ledger): ['0138']
Seed Unique Values (Site): ['0225', '0138', '5235', '0255', '3812', '0270', '4435', '0269', '0231', '5020']
Common Values: ['0138']
Input Unique Count: 1, Common Count: 1, Ratio: 1.00
 Mapping Accepted based on Common Elements Ratio.

=== Final Verified Mappings ===
Seed File 'master': Seed Column 'Site' -> Input Column 'Ledger' (Ratio: 1.00)
