In [160]:
import os
import glob
import pandas as pd
import numpy as np
import re
import unicodedata

In [161]:
def clean_text(text):
    return re.sub(r'[^a-zA-Z0-9.%/,-]', '', text)

def normalize_text(text):
    """Remove hidden Unicode control characters like \u202d"""
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

def is_numeric(s):
    return s.apply(lambda x: re.fullmatch(r'\d+', x) is not None).all()

def is_decimal(s):
    return s.apply(lambda x: re.fullmatch(r'\d+\.\d+', x) is not None).all()

def has_special_characters(s):
    return s.apply(lambda x: any(c in x for c in ['/', '-', ',', '.', '%'])).all()

def is_alphanumeric(s):
    return s.apply(lambda x: re.fullmatch(r'[A-Za-z0-9]+', x) is not None).all()

def is_mostly_letters(s, threshold=1.0):
    letter_counts = s.apply(lambda x: sum(c.isalpha() for c in x))
    total_counts = s.apply(lambda x: len(x))
    ratios = letter_counts / total_counts
    return (ratios == threshold).mean() >= 1.0

def fixed_length(s):
    cleaned = s.apply(clean_text)
    lengths = cleaned.apply(len).unique()
    return len(lengths) == 1

In [163]:
def load_input_file(input_directory):
    files = glob.glob(os.path.join(input_directory, "*.xlsx"))
    if not files:
        raise FileNotFoundError("No XLSX file found.")
    file_path = files[0]
    df = pd.read_excel(file_path, skiprows=2)
    df_for_mapping = df.head(10)
    print(f"Loaded input file '{file_path}' with {len(df)} rows. Using top 10 rows for mapping.")
    return df, df_for_mapping


In [164]:
def load_seed_files(seed_directory):
    seed_files = glob.glob(os.path.join(seed_directory, "*.csv"))
    seeds = {}
    for file in seed_files:
        df = pd.read_csv(file, skipinitialspace=True, dtype=str)
        filename = os.path.splitext(os.path.basename(file))[0]
        seeds[filename] = df
        print(f"Loaded seed file: '{filename}' with {len(df)} rows.")
    return seeds

In [165]:
def find_primary_keys(df):
    primary_keys = []
    for col in df.columns:
        s = df[col].dropna().astype(str).apply(normalize_text)
        if s.empty:
            continue
        if len(s) == len(s.unique()):
            primary_keys.append(col)
            print(f"Primary key candidate: '{col}' (based on uniqueness)")
    return primary_keys

In [166]:
def analyze_column_properties(s):
    cleaned_s = s.dropna().astype(str).apply(clean_text)
    properties = {
        'is_numeric': is_numeric(cleaned_s),
        'is_decimal': is_decimal(cleaned_s),
        'has_special_characters': has_special_characters(cleaned_s),
        'is_alphanumeric': is_alphanumeric(cleaned_s),
        'is_mostly_letters': is_mostly_letters(cleaned_s),
        'is_fixed_length': fixed_length(cleaned_s),
        'length': cleaned_s.apply(len).unique()[0] if fixed_length(cleaned_s) else None
    }
    return properties

In [167]:
def map_columns(input_sample, seed_col_name, seed_col_values, seed_props):
    matches = []
    for col in input_sample.columns:
        s_input = input_sample[col].dropna().astype(str)
        if s_input.empty:
            continue
        cleaned_input = s_input.apply(clean_text)

        input_props = {
            'is_numeric': is_numeric(cleaned_input),
            'is_decimal': is_decimal(cleaned_input),
            'has_special_characters': has_special_characters(cleaned_input),
            'is_alphanumeric': is_alphanumeric(cleaned_input),
            'is_mostly_letters': is_mostly_letters(cleaned_input),
            'is_fixed_length': fixed_length(cleaned_input),
            'length': cleaned_input.apply(len).unique()[0] if fixed_length(cleaned_input) else None
        }

        conditions = all(seed_props[key] == input_props.get(key) for key in seed_props)

        if conditions:
            print(f"Matched input column '{col}' for seed column '{seed_col_name}'")
            matches.append(col)
    
    if not matches:
        print(f"No match found for seed column '{seed_col_name}'")
    
    return matches

In [168]:
# Step 8: Full processing flow

input_directory = "input"
seed_directory = "seeds"

# Load input file
full_input_df, input_sample = load_input_file(input_directory)

# Load seed files
seeds = load_seed_files(seed_directory)

# Analyze and map
all_mappings = {}

for seed_name, seed_df in seeds.items():
    print(f"\n=== Processing seed file '{seed_name}' ===")
    primary_keys = find_primary_keys(seed_df)

    mappings = []
    for pk_col in primary_keys:
        if pk_col not in seed_df.columns:
            continue

        seed_col_values = seed_df[pk_col]
        seed_props = analyze_column_properties(seed_col_values)

        print(f"\nProperties for seed column '{pk_col}': {seed_props}")
        
        matched_cols = map_columns(input_sample, pk_col, seed_col_values, seed_props)

        if matched_cols:
            mappings.append({
                'seed_column': pk_col,
                'matched_input_columns': matched_cols,
                'seed_properties': seed_props
            })

    all_mappings[seed_name] = mappings


Loaded input file 'input/main_input.xlsx' with 48 rows. Using top 10 rows for mapping.
Loaded seed file: 'master_sez' with 8 rows.
Loaded seed file: 'master' with 10 rows.

=== Processing seed file 'master_sez' ===
Primary key candidate: 'Party Name                                      ' (based on uniqueness)
Primary key candidate: 'Party Number' (based on uniqueness)

Properties for seed column 'Party Name                                      ': {'is_numeric': False, 'is_decimal': False, 'has_special_characters': False, 'is_alphanumeric': False, 'is_mostly_letters': False, 'is_fixed_length': False, 'length': None}
No match found for seed column 'Party Name                                      '

Properties for seed column 'Party Number': {'is_numeric': True, 'is_decimal': False, 'has_special_characters': False, 'is_alphanumeric': True, 'is_mostly_letters': False, 'is_fixed_length': False, 'length': None}
Matched input column 'S.No' for seed column 'Party Number'

=== Processing seed f

In [169]:
print("\n=== Final Mapping Results ===")
for seed_name, mappings in all_mappings.items():
    for mapping in mappings:
        input_cols = ", ".join(mapping['matched_input_columns'])
        print(f"Seed File '{seed_name}': '{mapping['seed_column']}' -> {input_cols}")


=== Final Mapping Results ===
Seed File 'master_sez': 'Party Number' -> S.No
Seed File 'master': 'Site' -> Ledger


In [170]:
final_verified_mappings = {}

for seed_name, mappings in all_mappings.items():
    final_verified_mappings[seed_name] = []
    for mapping in mappings:
        seed_column_name = mapping['seed_column']
        full_seed_col_values = seeds[seed_name][seed_column_name].dropna().astype(str).apply(lambda x: x.strip())

        for input_col_name in mapping['matched_input_columns']:
            if input_col_name not in full_input_df.columns:
                continue
            full_input_col_values = full_input_df[input_col_name].dropna().astype(str).apply(lambda x: normalize_text(x.strip()))


            input_unique = set(full_input_col_values)
            seed_unique = set(full_seed_col_values)

            if not input_unique:
                continue

            common_elements = input_unique & seed_unique
            ratio = len(common_elements) / len(input_unique)

            print(f"\nMapping Attempt: '{seed_name}:{seed_column_name}' -> '{input_col_name}'")
            print(f"Input Unique Values ({input_col_name}): {sorted(input_unique)}")
            print(f"Seed Unique Values ({seed_column_name}): {sorted(seed_unique)}")
            print(f"Common Values: {sorted(common_elements)}")
            print(f"Input Unique Count: {len(input_unique)}, Common Count: {len(common_elements)}, Ratio: {ratio:.2f}")

            if ratio >= 0.8:
                print(" Mapping Accepted based on Common Elements Ratio.\n")
                final_verified_mappings[seed_name].append({
                    'seed_column': seed_column_name,
                    'input_column': input_col_name,
                    'ratio': ratio
                })
            else:
                print("Mapping Rejected based on Common Elements Ratio.\n")

print("\n=== Final Verified Mappings ===")
for seed_name, verified_list in final_verified_mappings.items():
    for verified in verified_list:
        print(f"Seed File '{seed_name}': Seed Column '{verified['seed_column']}' -> Input Column '{verified['input_column']}' (Ratio: {verified['ratio']:.2f})")



Mapping Attempt: 'master_sez:Party Number' -> 'S.No'
Input Unique Values (S.No): ['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '3', '30', '31', '32', '34', '35', '36', '37', '38', '39', '4', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '5', '6', '7', '8', '9']
Seed Unique Values (Party Number): ['1127114', '115723', '115967', '120229', '125933', '126293', '1468599', '1658096']
Common Values: []
Input Unique Count: 48, Common Count: 0, Ratio: 0.00
Mapping Rejected based on Common Elements Ratio.


Mapping Attempt: 'master:Site' -> 'Ledger'
Input Unique Values (Ledger): ['0138']
Seed Unique Values (Site): ['0138', '0225', '0231', '0255', '0269', '0270', '3812', '4435', '5020', '5235']
Common Values: ['0138']
Input Unique Count: 1, Common Count: 1, Ratio: 1.00
 Mapping Accepted based on Common Elements Ratio.


=== Final Verified Mappings ===
Seed File 'master': Seed Column 'Site' -> Input