In [103]:
import os
import glob
import pandas as pd
import numpy as np
import re
import unicodedata

In [104]:
def clean_text(text):
    """Return ``text`` with every disallowed character removed.

    Only ASCII letters, ASCII digits and the punctuation marks
    ``.``, ``%``, ``/``, ``,`` and ``-`` are kept; everything else
    (including whitespace) is dropped.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9.%/,-]')
    return disallowed.sub('', text)

def normalize_text(text):
    """Fold ``text`` down to plain ASCII.

    The text is NFKD-decomposed and then every character that cannot be
    encoded as ASCII is discarded. This removes invisible formatting
    characters such as U+202D, and it also strips accents and any other
    non-ASCII character.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('ascii')

def is_numeric(s):
    """Return True when every string in series ``s`` consists solely of digits."""
    digits_only = re.compile(r'\d+')

    def _all_digits(value):
        return digits_only.fullmatch(value) is not None

    return s.apply(_all_digits).all()

def is_decimal(s):
    """Return True when every string in series ``s`` looks like ``<digits>.<digits>``.

    No sign, exponent or bare leading/trailing dot is accepted.
    """
    decimal_shape = re.compile(r'\d+\.\d+')

    def _looks_decimal(value):
        return decimal_shape.fullmatch(value) is not None

    return s.apply(_looks_decimal).all()

def has_special_characters(s):
    """Return True when every string in series ``s`` contains at least one of ``/ - , . %``."""
    markers = ('/', '-', ',', '.', '%')

    def _contains_marker(value):
        for marker in markers:
            if marker in value:
                return True
        return False

    return s.apply(_contains_marker).all()

def is_alphanumeric(s):
    """Return True when every string in series ``s`` is a non-empty run of ASCII letters/digits."""
    alnum_only = re.compile(r'[A-Za-z0-9]+')

    def _only_alnum(value):
        return alnum_only.fullmatch(value) is not None

    return s.apply(_only_alnum).all()

def is_mostly_letters(s, threshold=1.0):
    """Return True when more than 90% of the strings in ``s`` hit the letter ratio.

    For each string the share of alphabetic characters is computed; a string
    counts as a hit only when that share is exactly equal to ``threshold``
    (with the default of 1.0: the string is made of letters only).
    """
    alpha_per_value = s.apply(lambda value: sum(map(str.isalpha, value)))
    size_per_value = s.apply(len)
    hits = (alpha_per_value / size_per_value) == threshold
    return hits.mean() > 0.9

def fixed_length(s):
    """Return True when all strings in ``s`` share one length after ``clean_text``.

    An empty series has no length at all and therefore yields False.
    """
    distinct_lengths = {len(clean_text(value)) for value in s}
    return len(distinct_lengths) == 1

In [105]:
def load_input_file(input_directory, sample_rows=10):
    """Load the first ``.xlsx`` workbook found in ``input_directory``.

    Parameters
    ----------
    input_directory : str
        Directory searched (non-recursively) for ``*.xlsx`` files.
    sample_rows : int, default 10
        Number of leading rows returned as the preview frame.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        The full sheet and its first ``sample_rows`` rows.

    Raises
    ------
    FileNotFoundError
        If the directory contains no usable ``.xlsx`` file.
    """
    # glob returns matches in arbitrary order, so sort to make the choice of
    # "first" file reproducible. Names starting with "~$" are the lock files
    # Excel writes next to an open workbook; they match *.xlsx but are not
    # readable workbooks, so they are skipped.
    files = sorted(
        path
        for path in glob.glob(os.path.join(input_directory, "*.xlsx"))
        if not os.path.basename(path).startswith("~$")
    )
    if not files:
        raise FileNotFoundError("No XLSX file found.")
    file_path = files[0]
    df = pd.read_excel(file_path)
    df_for_mapping = df.head(sample_rows)
    print("\n=== Input File Loaded ===")
    # `display` is the rich-output helper IPython injects into notebook sessions.
    display(df_for_mapping)
    return df, df_for_mapping



In [106]:
def load_seed_files(seed_directory):
    """Read every ``.csv`` in ``seed_directory`` into a dict of DataFrames.

    Each file is read with all columns as strings and with whitespace after
    delimiters skipped. The dict key is the file name without its extension.
    Files are loaded in sorted path order so that the dict order (and hence
    any downstream processing and printed output) is reproducible; glob on
    its own returns matches in arbitrary order.

    Returns
    -------
    dict of str -> DataFrame
        Empty when the directory holds no CSV files.
    """
    seeds = {}
    for csv_path in sorted(glob.glob(os.path.join(seed_directory, "*.csv"))):
        seed_name = os.path.splitext(os.path.basename(csv_path))[0]
        seeds[seed_name] = pd.read_csv(csv_path, skipinitialspace=True, dtype=str)
    print("\n=== Seed Files Loaded ===")
    for name, seed_df in seeds.items():
        print(f"\nSeed File: {name}")
        # `display` is the rich-output helper IPython injects into notebook sessions.
        display(seed_df)
    return seeds

In [107]:
def find_primary_keys(df):
    """List the columns of ``df`` whose non-null values are all distinct.

    Values are stringified, passed through ``normalize_text`` and then
    ``clean_text`` before the uniqueness test, so two entries that differ
    only in stripped characters count as duplicates. Columns with no
    non-null values are ignored. Each accepted column is also printed.
    """
    primary_keys = []
    for col in df.columns:
        values = df[col].dropna().astype(str).apply(normalize_text)
        if values.empty:
            continue
        if not values.apply(clean_text).is_unique:
            continue
        primary_keys.append(col)
        print(f"Primary key candidate: '{col}'")
    return primary_keys

In [108]:
def analyze_column_properties(s):
    """Summarise the textual shape of a column as a dict of properties.

    Null entries are dropped; the remaining values are stringified, stripped
    of surrounding whitespace and run through ``normalize_text``. The result
    holds one entry per predicate (``is_numeric``, ``is_decimal``,
    ``has_special_characters``, ``is_alphanumeric``, ``is_mostly_letters``,
    ``is_fixed_length``) plus ``length``: the array of distinct value lengths
    measured after ``clean_text``.
    """
    def _prepare(raw):
        return normalize_text(raw.strip())

    values = s.dropna().astype(str).apply(_prepare)

    probes = (
        ('is_numeric', is_numeric),
        ('is_decimal', is_decimal),
        ('has_special_characters', has_special_characters),
        ('is_alphanumeric', is_alphanumeric),
        ('is_mostly_letters', is_mostly_letters),
        ('is_fixed_length', fixed_length),
    )
    props = {label: probe(values) for label, probe in probes}
    props['length'] = values.apply(lambda value: len(clean_text(value))).unique()
    return props

In [109]:
def map_columns_based_on_properties(input_sample, seed_col_name, seed_col_values):
    """Find input columns whose property profile equals that of a seed column.

    The profile of ``seed_col_values`` and of every column in ``input_sample``
    is obtained from ``analyze_column_properties``. An input column matches
    only when all six boolean properties are equal and the sets of observed
    value lengths are identical. Matches are printed and returned as a list
    of input column names (in column order).
    """
    flag_keys = (
        'is_numeric',
        'is_decimal',
        'has_special_characters',
        'is_alphanumeric',
        'is_mostly_letters',
        'is_fixed_length',
    )
    seed_props = analyze_column_properties(seed_col_values)

    matches = []
    for input_col in input_sample.columns:
        input_props = analyze_column_properties(input_sample[input_col])

        checks = [seed_props[key] == input_props[key] for key in flag_keys]
        checks.append(set(seed_props['length']) == set(input_props['length']))

        if not all(checks):
            continue

        print(f"\nMatched: Seed Column '{seed_col_name}' -> Input Column '{input_col}'")
        print(f"Checks Matched: {checks}")
        matches.append(input_col)
    return matches


In [110]:
def process_all_seeds(input_sample, seeds):
    """Propose input-column candidates for every primary-key column of every seed.

    For each seed frame the primary-key candidates come from
    ``find_primary_keys``; each is matched against ``input_sample`` with
    ``map_columns_based_on_properties``. The result maps seed name to a list
    of ``{'seed_column', 'matched_input_columns'}`` dicts; keys without any
    match are omitted, and a seed with no matches maps to an empty list.
    """
    all_mappings = {}

    for seed_name, seed_df in seeds.items():
        print(f"\n=== Processing Seed File: {seed_name} ===")
        seed_entries = []

        for pk in find_primary_keys(seed_df):
            matched = map_columns_based_on_properties(input_sample, pk, seed_df[pk])
            if not matched:
                continue
            seed_entries.append({
                'seed_column': pk,
                'matched_input_columns': matched
            })

        all_mappings[seed_name] = seed_entries
    return all_mappings

In [111]:
def final_verification(all_mappings, seeds, full_input_df, min_ratio=0.8):
    """Confirm candidate mappings by value overlap on the full input data.

    For every candidate (seed column -> input column) the distinct normalised
    values of both columns are compared. A mapping is accepted when the share
    of the input column's distinct values that also occur in the seed column
    is at least ``min_ratio``.

    Parameters
    ----------
    all_mappings : dict
        Seed name -> list of ``{'seed_column', 'matched_input_columns'}``.
    seeds : dict
        Seed name -> seed DataFrame.
    full_input_df : DataFrame
        The complete input data (not just the preview sample).
    min_ratio : float, default 0.8
        Minimum overlap ratio required to accept a mapping.

    Returns
    -------
    dict
        Seed name -> list of ``{'seed_column', 'input_column', 'ratio'}`` for
        the accepted mappings. Progress and a final summary are printed.
    """
    def _distinct_values(column):
        # Drop nulls, stringify, strip whitespace and ASCII-fold before comparing.
        return set(column.dropna().astype(str).apply(lambda x: normalize_text(x.strip())))

    final_verified_mappings = {}

    for seed_name, mappings in all_mappings.items():
        final_verified_mappings[seed_name] = []
        for mapping in mappings:
            seed_col_name = mapping['seed_column']
            # The seed side does not depend on the input column, so build its
            # value set once per seed column instead of once per candidate.
            seed_unique = _distinct_values(seeds[seed_name][seed_col_name])

            for input_col_name in mapping['matched_input_columns']:
                if input_col_name not in full_input_df.columns:
                    continue

                input_unique = _distinct_values(full_input_df[input_col_name])

                # An all-null input column has nothing to compare (and would
                # divide by zero below).
                if not input_unique:
                    continue

                common_elements = input_unique & seed_unique
                ratio = len(common_elements) / len(input_unique)

                print(f"\nMapping Attempt: '{seed_name}:{seed_col_name}' -> '{input_col_name}'")
                print(f"Input Unique Values ({input_col_name}): {list(input_unique)}")
                print(f"Seed Unique Values ({seed_col_name}): {list(seed_unique)}")
                print(f"Common Values: {list(common_elements)}")
                print(f"Input Unique Count: {len(input_unique)}, Common Count: {len(common_elements)}, Ratio: {ratio:.2f}")

                if ratio >= min_ratio:
                    print(" Mapping Accepted based on Common Elements Ratio.")
                    final_verified_mappings[seed_name].append({
                        'seed_column': seed_col_name,
                        'input_column': input_col_name,
                        'ratio': ratio
                    })
                else:
                    print(" Mapping Rejected based on Common Elements Ratio.")

    print("\n=== Final Verified Mappings ===")
    for seed_name, verified_list in final_verified_mappings.items():
        for verified in verified_list:
            print(f"Seed File '{seed_name}': Seed Column '{verified['seed_column']}' -> Input Column '{verified['input_column']}' (Ratio: {verified['ratio']:.2f})")

    return final_verified_mappings

In [112]:
# Directories (relative paths) scanned for the workbook to map and for the
# reference seed CSV files.
input_directory = "input"
seed_directory = "seeds"

# Load the workbook (full frame plus the short preview used for property
# matching) and all seed tables.
full_input_df, input_sample = load_input_file(input_directory)
seeds = load_seed_files(seed_directory)

# Stage 1: propose seed-column -> input-column candidates by comparing column
# properties on the preview rows only.
all_mappings = process_all_seeds(input_sample, seeds)

# Stage 2: keep only the candidates whose values overlap sufficiently when
# checked against the full input frame.
final_verified_mappings = final_verification(all_mappings, seeds, full_input_df)


=== Input File Loaded ===


Unnamed: 0,Internal ID,Date,Acctg Period,Tax Period,Vendor,Vendor Tax ID,Vendor Tax Number,Type,Document Number,Subsidiary (no hierarchy),...,Item,Account,Memo,Trxn Currency,Amount (Trxn Currency) w/ Correct GL Sign,Trxn Currency to Base Currency,Amount (Base Currency) w/ Correct GL Sign,Pivot Category,Unnamed: 20,Remarks
0,3337376,2024-09-18,Sep 2024,Sep 2024,EN-50-Pankaj R Pipada,,,Bill,6,50-Druva India,...,Contractor - India,62510 Outside Services : Contractors,The Consultant to provide the following Servic...,INR,500000.0,1.0,500000.0,Transaction Base,,
1,3337376,2024-09-18,Sep 2024,Sep 2024,EN-50-Pankaj R Pipada,,,Bill,6,50-Druva India,...,TDS on Professional Service,23052 Accrued Taxes : Output - India - TDS 94J...,,INR,-50000.0,1.0,-50000.0,Transaction Base,,
2,3337376,2024-09-18,Sep 2024,Sep 2024,EN-50-Pankaj R Pipada,,,Bill,6,50-Druva India,...,Subscription Software,64010 Software Subscription : Software Subscri...,Reimbursements for September 2024:_x000D_\nSof...,INR,3963.91,1.0,3963.91,Transaction Base,,
3,3337376,2024-09-18,Sep 2024,Sep 2024,EN-50-Pankaj R Pipada,,,Bill,6,50-Druva India,...,ZERO Group,23022 Accrued Taxes : GST Input,VAT,INR,0.0,1.0,0.0,Transaction Base,,
4,3337376,2024-09-18,Sep 2024,Sep 2024,EN-50-Pankaj R Pipada,,,Bill,6,50-Druva India,...,GST @18% Group,23026 Accrued Taxes : CGST Input,VAT,INR,45000.0,1.0,45000.0,GST,,
5,3337376,2024-09-18,Sep 2024,Sep 2024,EN-50-Pankaj R Pipada,,,Bill,6,50-Druva India,...,GST @18% Group,23024 Accrued Taxes : SGST Input,VAT,INR,45000.0,1.0,45000.0,GST,,
6,3322724,2024-09-05,Sep 2024,Sep 2024,EN-50-Samyak Restaurant LLP,,,Bill,19,50-Druva India,...,Pantry Items,61505 Employee Expenses : Employee Meals & Events,Food supply for all employee. ( This is all go...,INR,66085.71,1.0,66085.71,Transaction Base,,
7,3322724,2024-09-05,Sep 2024,Sep 2024,EN-50-Samyak Restaurant LLP,,,Bill,19,50-Druva India,...,TDS on Contractors,23056 Accrued Taxes : Output - India - TDS 94C...,,INR,-1322.0,1.0,-1322.0,Transaction Base,,
8,3322724,2024-09-05,Sep 2024,Sep 2024,EN-50-Samyak Restaurant LLP,,,Bill,19,50-Druva India,...,,23026 Accrued Taxes : CGST Input,Expense Out - CGST_IN,INR,-1652.14,1.0,-1652.14,GST,,
9,3322724,2024-09-05,Sep 2024,Sep 2024,EN-50-Samyak Restaurant LLP,,,Bill,19,50-Druva India,...,,23024 Accrued Taxes : SGST Input,Expense Out - SGST_IN,INR,-1652.14,1.0,-1652.14,GST,,



=== Seed Files Loaded ===

Seed File: druva_seed


Unnamed: 0,S_no,Name,GSTIN/UID,State/Province Display Name
0,0,EN-50-Seema Lawns ...,27AHLPJ5080D1ZD,Maharashtra
1,1,EN-50-PIEM HOTELS LTD UNIT TAJ M.G.ROAD BENGAL...,29AAACP8376M1ZN,Karnataka
2,2,EN-50-BAMBOO AND BRICKS RESORT ...,27AVQPK1409C2ZQ,Maharashtra
3,3,EN-50-VISHWAS VILLA RESORT ...,27AAZPP8426C1Z5,
4,4,EN-50-ITC-MARATHA ...,27AAACI5950L1ZA,Maharashtra
...,...,...,...,...
640,644,IC Vend 10-Druva US due from 50-Druva India ...,,
641,645,IC Vend 30-Druva UK due from 50-Druva India ...,,
642,646,IC Vend 35-Druva Germany due from 50-Druva Ind...,,
643,647,Master Vendor Request (ALL SUBS AND ALL CURREN...,,



=== Processing Seed File: druva_seed ===
Primary key candidate: 'S_no'

=== Final Verified Mappings ===
