In [49]:
import os
import glob
import pandas as pd
import numpy as np
import re

In [51]:
def clean_text(text):
    """
    Clean text by keeping only allowed characters: letters, digits, dot (.), percent (%), slash (/), comma (,), and dash (-).
    IMPORTANT: Place '-' at the beginning or end to avoid bad range error.
    """
    return re.sub(r'[^a-zA-Z0-9.%/,-]', '', text)

def is_numeric(s):
    return s.apply(lambda x: re.fullmatch(r'\d+', x) is not None).all()

def is_decimal(s):
    return s.apply(lambda x: re.fullmatch(r'\d+(\.\d+)?', x) is not None).all()

def has_special_characters(s):
    """Check if string contains only special characters like / , - . %"""
    return s.apply(lambda x: any(c in x for c in ['/', '-', ',', '.', '%'])).all()

def is_alphanumeric(s):
    return s.apply(lambda x: re.fullmatch(r'[A-Za-z0-9]+', x) is not None).all()

def is_mostly_letters(s, threshold=0.7):
    """Check if most entries are mostly letters."""
    letter_counts = s.apply(lambda x: sum(c.isalpha() for c in x))
    total_counts = s.apply(lambda x: len(x))
    ratios = letter_counts / total_counts
    return (ratios > threshold).mean() > 0.7

def fixed_length(s):
    cleaned = s.apply(clean_text)
    lengths = cleaned.apply(len).unique()
    return len(lengths) == 1

In [52]:
def load_input_file(input_directory):
    files = glob.glob(os.path.join(input_directory, "*.xlsx"))
    if not files:
        raise FileNotFoundError("No XLSX file found.")
    file_path = files[0]
    df = pd.read_excel(file_path, skiprows=2)
    df_for_mapping = df.head(10)
    # print(f"Loaded input file '{file_path}' with {len(df)} rows. Using top 10 rows for mapping.")
    return df, df_for_mapping


In [53]:
def load_seed_files(seed_directory):
    seed_files = glob.glob(os.path.join(seed_directory, "*.csv"))
    seeds = {}
    for file in seed_files:
        df = pd.read_csv(file, skipinitialspace=True, dtype=str)
        filename = os.path.splitext(os.path.basename(file))[0]
        seeds[filename] = df
        print(f"Loaded seed file: '{filename}' with {len(df)} rows.")
    return seeds

In [54]:

def find_primary_keys(df):
    primary_keys = []
    for col in df.columns:
        s = df[col].dropna().astype(str)
        if s.empty:
            continue
        if len(s) == len(s.unique()):
            primary_keys.append(col)
            print(f"Primary key candidate: '{col}' (based on uniqueness)")
    return primary_keys

In [55]:
def analyze_column_properties(s):
    cleaned_s = s.dropna().astype(str).apply(clean_text)
    properties = {
        'is_numeric': is_numeric(cleaned_s),
        'is_decimal': is_decimal(cleaned_s),
        'has_special_characters': has_special_characters(cleaned_s),
        'is_alphanumeric': is_alphanumeric(cleaned_s),
        'is_mostly_letters': is_mostly_letters(cleaned_s),
        'is_fixed_length': fixed_length(cleaned_s),
        'length': cleaned_s.apply(len).unique()[0] if fixed_length(cleaned_s) else None
    }
    return properties

In [56]:
def map_columns(input_sample, seed_col_name, seed_col_values, seed_props):
    matches = []
    for col in input_sample.columns:
        s_input = input_sample[col].dropna().astype(str)
        if s_input.empty:
            continue
        cleaned_input = s_input.apply(clean_text)

        input_props = {
            'is_numeric': is_numeric(cleaned_input),
            'is_decimal': is_decimal(cleaned_input),
            'has_special_characters': has_special_characters(cleaned_input),
            'is_alphanumeric': is_alphanumeric(cleaned_input),
            'is_mostly_letters': is_mostly_letters(cleaned_input),
            'is_fixed_length': fixed_length(cleaned_input),
            'length': cleaned_input.apply(len).unique()[0] if fixed_length(cleaned_input) else None
        }

        conditions = all(seed_props[key] == input_props.get(key) for key in seed_props)

        if conditions:
            print(f"Matched input column '{col}' for seed column '{seed_col_name}'")
            matches.append(col)
    
    if not matches:
        print(f"No match found for seed column '{seed_col_name}'")
    
    return matches

In [57]:
# Step 8: Full processing flow

input_directory = "input"
seed_directory = "seeds"

# Load input file
full_input_df, input_sample = load_input_file(input_directory)

# Load seed files
seeds = load_seed_files(seed_directory)

# Analyze and map
all_mappings = {}

for seed_name, seed_df in seeds.items():
    print(f"\n=== Processing seed file '{seed_name}' ===")
    primary_keys = find_primary_keys(seed_df)

    mappings = []
    for pk_col in primary_keys:
        if pk_col not in seed_df.columns:
            continue

        seed_col_values = seed_df[pk_col]
        seed_props = analyze_column_properties(seed_col_values)

        print(f"\nProperties for seed column '{pk_col}': {seed_props}")
        
        matched_cols = map_columns(input_sample, pk_col, seed_col_values, seed_props)

        if matched_cols:
            mappings.append({
                'seed_column': pk_col,
                'matched_input_columns': matched_cols,
                'seed_properties': seed_props
            })

    all_mappings[seed_name] = mappings

# Step 9: Final print

print("\n=== Final Mapping Results ===")
for seed_name, mappings in all_mappings.items():
    for mapping in mappings:
        input_cols = ", ".join(mapping['matched_input_columns'])
        print(f"Seed File '{seed_name}': '{mapping['seed_column']}' -> {input_cols}")

Loaded seed file: 'master_sez' with 8 rows.
Loaded seed file: 'master' with 10 rows.

=== Processing seed file 'master_sez' ===
Primary key candidate: 'Party Name                                      ' (based on uniqueness)
Primary key candidate: 'Party Number' (based on uniqueness)

Properties for seed column 'Party Name                                      ': {'is_numeric': False, 'is_decimal': False, 'has_special_characters': False, 'is_alphanumeric': False, 'is_mostly_letters': True, 'is_fixed_length': False, 'length': None}
No match found for seed column 'Party Name                                      '

Properties for seed column 'Party Number': {'is_numeric': True, 'is_decimal': True, 'has_special_characters': False, 'is_alphanumeric': True, 'is_mostly_letters': False, 'is_fixed_length': False, 'length': None}
Matched input column 'S.No' for seed column 'Party Number'

=== Processing seed file 'master' ===
Primary key candidate: 'Site' (based on uniqueness)

Properties for seed