## **Feature:** Text Processing

**Names:** Gia Bao Ngo

### **What it does**
Comprehensive text cleaning and standardization functionality. Handles basic text cleaning (whitespace, encoding), case standardization, special character removal, categorical value standardization with fuzzy matching, numeric extraction from mixed text, and pattern validation using regex.

In [1]:
# Load dotenv
import os
from dotenv import load_dotenv
load_dotenv()

# Read the OpenAI API key from the environment (load_dotenv() above has
# already merged any .env file into os.environ). A missing key is only
# reported here; nothing is raised at import time.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    print("OpenAI API Key not found")

# Import libraries
from pathlib import Path
import pandas as pd
import numpy as np
# Additional imports for text processing
import math
import re
import datetime
import unicodedata
from sklearn import preprocessing
from difflib import get_close_matches
import warnings
warnings.filterwarnings('ignore')

# Langchain imports
from langchain_openai import ChatOpenAI  
from langchain.schema import HumanMessage, SystemMessage

### **Helper Functions**
- `clean_text_basic(df, columns=None)` - Remove extra whitespace, standardize encoding
- `standardize_case(df, columns=None, case='title')` - Consistent case formatting
- `remove_special_chars(df, columns=None, keep_patterns=None)` - Clean special characters
- `standardize_categorical_values(df, column, mapping_dict=None, similarity_threshold=0.8)` - Map variants to standard values
- `extract_numeric_from_text(df, columns=None, create_new_columns=True)` - Extract numbers from mixed text
- `validate_text_patterns(df, column, pattern, pattern_name=None)` - Validate against regex patterns

In [2]:
def clean_text_basic(df, columns=None):
    """
    Remove extra whitespace and standardize encoding for text columns.

    Each non-null value is converted to ``str``, Unicode-normalized (NFKD),
    stripped of zero-width characters, has every run of whitespace collapsed
    to a single space, and is trimmed. Null values (None/NaN/NA) are left
    exactly as they were, and a literal "nan" string stays a string.

    Parameters:
    - df: pandas DataFrame (never modified; a cleaned copy is returned)
    - columns: list of column names to clean (None = all object columns).
      Names that are missing or not of object dtype are skipped.

    Returns:
    - DataFrame with cleaned text columns
    """
    result_df = df.copy()
    cleaned_cols = []

    if columns is None:
        columns = result_df.select_dtypes(include=['object']).columns.tolist()

    zero_width = re.compile(r'[\u200b-\u200d\ufeff]')
    whitespace = re.compile(r'\s+')

    def clean_value(value):
        # Nulls pass through untouched so None never becomes the text 'None'
        # and a genuine "nan" string is never mistaken for a missing value.
        if not isinstance(value, str):
            if pd.api.types.is_scalar(value) and pd.isna(value):
                return value
            value = str(value)
        text = unicodedata.normalize('NFKD', value)
        # Zero-width characters go first: they are not whitespace, so a
        # leading BOM would otherwise shield the spaces behind it from strip().
        text = zero_width.sub('', text)
        return whitespace.sub(' ', text).strip()

    for col in columns:
        if col not in result_df.columns:
            continue

        # Skip if column is not text-like
        if not pd.api.types.is_object_dtype(result_df[col]):
            continue

        try:
            original = result_df[col]
            cleaned = original.map(clean_value)

            # Element-wise comparison; two nulls count as "unchanged".
            changed = cleaned.ne(original) & ~(cleaned.isna() & original.isna())
            values_changed = int(changed.sum())

            if values_changed:
                result_df[col] = cleaned
                cleaned_cols.append({
                    'column': col,
                    'values_changed': values_changed,
                    'total_rows': len(result_df)
                })

        except Exception as e:
            # Best effort: one problematic column must not abort the others.
            print(f"Warning: Could not clean column '{col}': {e}")
            continue

    print("=== TEXT BASIC CLEANING RESULTS ===")
    print(f"Columns processed: {len(columns)}")
    print(f"Columns cleaned: {len(cleaned_cols)}")

    if cleaned_cols:
        print("\nCleaning details:")
        for clean in cleaned_cols:
            print(f"  {clean['column']}: Whitespace and encoding normalized "
                  f"({clean['values_changed']} values changed)")

    return result_df

In [3]:
def standardize_case(df, columns=None, case='title'):
    """
    Standardize case formatting for text columns.

    Only ``str`` values are converted; nulls and non-string objects in a
    mixed column are left untouched.

    Parameters:
    - df: pandas DataFrame (never modified; a converted copy is returned)
    - columns: list of column names to standardize (None = all object columns)
    - case: case format ('upper', 'lower', 'title', 'sentence', 'proper').
      An unknown value prints a warning and falls back to 'title'.
      'sentence' capitalizes the first character and lowercases the rest;
      'proper' is title case that keeps minor words (a, of, the, ...)
      lowercase unless they are the first or last word.

    Returns:
    - DataFrame with standardized case
    """
    result_df = df.copy()
    standardized_cols = []

    if columns is None:
        columns = result_df.select_dtypes(include=['object']).columns.tolist()

    # Words that shouldn't be capitalized unless they're first/last
    minor_words = {'a', 'an', 'and', 'as', 'at', 'but', 'by', 'for',
                   'if', 'in', 'of', 'on', 'or', 'the', 'to', 'with'}

    def proper_case(text):
        words = text.lower().split()
        if not words:
            return text
        last = len(words) - 1
        return ' '.join(
            word if (0 < i < last and word in minor_words) else word.capitalize()
            for i, word in enumerate(words)
        )

    converters = {
        'upper': str.upper,
        'lower': str.lower,
        'title': str.title,
        'sentence': str.capitalize,  # capitalize() already lowercases the tail
        'proper': proper_case,
    }

    if case not in converters:
        print(f"Warning: Invalid case '{case}'. Using 'title' instead.")
        case = 'title'
    convert = converters[case]

    for col in columns:
        if col not in result_df.columns:
            continue

        # Skip if column is not text-like
        if not pd.api.types.is_object_dtype(result_df[col]):
            continue

        try:
            original_values = result_df[col]
            converted = original_values.map(
                lambda value: convert(value) if isinstance(value, str) else value
            )

            # Series.equals() yields one bool for the whole column, so the
            # per-value change count needs an element-wise comparison
            # (two nulls count as "unchanged").
            changed = converted.ne(original_values) & ~(
                converted.isna() & original_values.isna()
            )
            changes_made = int(changed.sum())

            if changes_made:
                result_df[col] = converted
                standardized_cols.append({
                    'column': col,
                    'case_format': case,
                    'changes_made': changes_made,
                    'total_non_null': int(converted.notna().sum())
                })

        except Exception as e:
            # Best effort: one problematic column must not abort the others.
            print(f"Warning: Could not standardize case for column '{col}': {e}")
            continue

    print("=== CASE STANDARDIZATION RESULTS ===")
    print(f"Columns processed: {len(columns)}")
    print(f"Columns standardized: {len(standardized_cols)}")
    print(f"Case format applied: {case}")

    if standardized_cols:
        print("\nStandardization details:")
        for std in standardized_cols:
            change_ratio = std['changes_made'] / std['total_non_null'] if std['total_non_null'] > 0 else 0
            print(f"  {std['column']}: {std['changes_made']} values changed ({change_ratio:.1%})")

    return result_df

In [4]:
def remove_special_chars(df, columns=None, keep_patterns=None):
    """
    Remove special characters from text columns with customizable keep-lists.

    A character is kept when it matches at least one of the keep patterns and
    removed otherwise. Afterwards runs of whitespace are collapsed to a single
    space and the value is trimmed. Only ``str`` values are processed; nulls
    and non-string objects are left untouched.

    Parameters:
    - df: pandas DataFrame (never modified; a cleaned copy is returned)
    - columns: list of column names to clean (None = all object columns)
    - keep_patterns: list of regex patterns, each describing a single
      character to keep (for example a character class such as '[a-z]' or a
      literal such as '-'). None or an empty list selects the default:
      ASCII letters and digits, whitespace, and basic punctuation.

    Returns:
    - DataFrame with special characters removed. If the keep patterns do not
      form a valid regex a warning is printed and the data is returned
      unchanged.
    """
    result_df = df.copy()
    cleaned_cols = []

    if columns is None:
        columns = result_df.select_dtypes(include=['object']).columns.tolist()

    # Default patterns to keep: alphanumeric, whitespace, and basic punctuation
    if not keep_patterns:
        keep_patterns = ['[a-zA-Z0-9]', '\\s', '[.,!?;:()\\-\'"]']

    # The keep patterns are full regexes (often character classes), so they
    # cannot be pasted inside another [^...] class. Instead match any single
    # character at which none of the keep patterns matches.
    try:
        keep_union = '|'.join(f'(?:{pattern})' for pattern in keep_patterns)
        remove_regex = re.compile(f'(?!{keep_union})' + r'[\s\S]')
    except re.error as e:
        print(f"Warning: Invalid keep pattern(s) {keep_patterns}: {e}")
        return result_df

    whitespace = re.compile(r'\s+')

    def clean_value(value):
        if not isinstance(value, str):
            return value
        # Removal can leave doubled or dangling spaces behind; tidy them up.
        return whitespace.sub(' ', remove_regex.sub('', value)).strip()

    for col in columns:
        if col not in result_df.columns:
            continue

        # Skip if column is not text-like
        if not pd.api.types.is_object_dtype(result_df[col]):
            continue

        try:
            original_values = result_df[col]
            cleaned_series = original_values.map(clean_value)

            # Element-wise comparison; two nulls count as "unchanged".
            changed = cleaned_series.ne(original_values) & ~(
                cleaned_series.isna() & original_values.isna()
            )
            changes_made = int(changed.sum())
            if not changes_made:
                continue

            # Sample a few before/after pairs for the report (truncated).
            sample_changes = []
            before_after = zip(original_values[changed].head(3),
                               cleaned_series[changed].head(3))
            for before, after in before_after:
                original = str(before)[:50]
                cleaned = str(after)[:50]
                if original != cleaned:
                    sample_changes.append((original, cleaned))

            result_df[col] = cleaned_series

            cleaned_cols.append({
                'column': col,
                'changes_made': changes_made,
                'total_non_null': int(cleaned_series.notna().sum()),
                'keep_patterns': keep_patterns,
                'sample_changes': sample_changes[:2]  # Keep first 2 examples
            })

        except Exception as e:
            # Best effort: one problematic column must not abort the others.
            print(f"Warning: Could not remove special characters from column '{col}': {e}")
            continue

    print("=== SPECIAL CHARACTER REMOVAL RESULTS ===")
    print(f"Columns processed: {len(columns)}")
    print(f"Columns cleaned: {len(cleaned_cols)}")
    print(f"Patterns kept: {keep_patterns}")

    if cleaned_cols:
        print("\nCleaning details:")
        for clean in cleaned_cols:
            change_ratio = clean['changes_made'] / clean['total_non_null'] if clean['total_non_null'] > 0 else 0
            print(f"  {clean['column']}: {clean['changes_made']} values changed ({change_ratio:.1%})")

            # Show sample changes
            for orig, new in clean['sample_changes']:
                print(f"    Example: '{orig}' → '{new}'")

    return result_df

In [5]:
def standardize_categorical_values(df, column, mapping_dict=None, similarity_threshold=0.8):
    """
    Map variants of categorical values to standard values using fuzzy matching.

    Values are compared by their stripped string form. Explicit mappings from
    ``mapping_dict`` are applied first; the remaining distinct values are
    grouped with difflib.get_close_matches, and every group is collapsed onto
    its most frequent member (ties broken by shorter text). Null values and
    values without a mapping are left exactly as they were.

    Parameters:
    - df: pandas DataFrame (never modified; a copy is returned)
    - column: column name to standardize
    - mapping_dict: optional dict with exact mappings {'variant': 'standard'};
      only variants that occur in the column are used
    - similarity_threshold: threshold for fuzzy matching (0.0 to 1.0)

    Returns:
    - DataFrame with standardized categorical values
    """
    result_df = df.copy()

    if column not in result_df.columns:
        print(f"Warning: Column '{column}' not found in DataFrame")
        return result_df

    original_values = result_df[column]
    not_null = original_values.notna()
    if not not_null.any():
        print(f"Warning: No non-null values found in column '{column}'")
        return result_df

    unique_before = original_values.nunique()

    # Lookup key per row: stripped string form, or None for nulls. Using the
    # same key for matching and for replacing means padded variants are hit
    # too, and nulls never turn into the text 'nan'.
    row_keys = [str(value).strip() if present else None
                for value, present in zip(original_values, not_null)]

    # Frequency of every key, in order of first appearance.
    key_counts = {}
    for key in row_keys:
        if key is not None:
            key_counts[key] = key_counts.get(key, 0) + 1

    # Apply user-provided mappings first
    final_mapping = {}
    if mapping_dict:
        for variant, standard in mapping_dict.items():
            variant_key = str(variant).strip()
            if variant_key in key_counts:
                final_mapping[variant_key] = standard

    # Find fuzzy matches for remaining values
    unmapped_values = [key for key in key_counts if key not in final_mapping]
    groups = []

    if len(unmapped_values) > 1:
        used_values = set()

        for value in unmapped_values:
            if value in used_values:
                continue

            similar_values = get_close_matches(
                value,
                unmapped_values,
                n=len(unmapped_values),
                cutoff=similarity_threshold
            )
            if len(similar_values) <= 1:
                continue

            group_values = [val for val in similar_values if val not in used_values]
            if not group_values:
                continue

            # Most frequent wins; among equals prefer the shorter text.
            standard_value = max(group_values,
                                 key=lambda x: (key_counts.get(x, 0), -len(x)))

            for variant in group_values:
                if variant != standard_value:
                    final_mapping[variant] = standard_value
                used_values.add(variant)

            groups.append({
                'standard': standard_value,
                'variants': [v for v in group_values if v != standard_value],
                'count': len(group_values)
            })

    if not final_mapping:
        print("=== CATEGORICAL STANDARDIZATION RESULTS ===")
        print(f"Column: {column}")
        print("No mappings needed - values already standardized")
        print(f"Unique values: {unique_before}")
        return result_df

    # Apply mappings row by row; anything without a mapping keeps its
    # original object (no round trip through str).
    updated = [final_mapping.get(key, value) if key is not None else value
               for value, key in zip(original_values, row_keys)]
    changes_made = sum(
        1 for value, key in zip(original_values, row_keys)
        if key in final_mapping and final_mapping[key] != value
    )

    new_values = pd.Series(updated, index=original_values.index,
                           dtype=object, name=original_values.name)

    # Convert back to the original dtype when that is lossless; otherwise
    # keep the object column (e.g. a standard value outside the categories).
    if not pd.api.types.is_object_dtype(original_values.dtype):
        try:
            restored = new_values.astype(original_values.dtype)
            if int(restored.isna().sum()) == int(new_values.isna().sum()):
                new_values = restored
        except (TypeError, ValueError):
            pass

    result_df[column] = new_values

    # Report results
    print("=== CATEGORICAL STANDARDIZATION RESULTS ===")
    print(f"Column: {column}")
    print(f"Total mappings applied: {len(final_mapping)}")
    print(f"Values changed: {changes_made}")
    print(f"Unique values before: {unique_before}")
    print(f"Unique values after: {result_df[column].nunique()}")

    print("\nMappings applied:")
    for variant, standard in final_mapping.items():
        print(f"  '{variant}' → '{standard}' ({key_counts[variant]} occurrences)")

    # Show fuzzy groups if any
    if any(group['variants'] for group in groups):
        print("\nFuzzy matching groups:")
        for group in groups:
            if group['variants']:
                print(f"  Standard: '{group['standard']}'")
                for variant in group['variants']:
                    print(f"    Variant: '{variant}'")

    return result_df

In [6]:
def extract_numeric_from_text(df, columns=None, create_new_columns=True):
    """
    Extract numeric values from mixed text columns.

    For every non-null value the patterns are tried from most to least
    specific (scientific, percentage, currency, decimal, integer). The first
    pattern that matches wins; if it matches several times the longest match
    is used. Thousands separators and currency/percent symbols are stripped,
    and percentages are returned as fractions (45% -> 0.45). A minus sign is
    only treated as a sign when it does not directly follow a word character,
    so 'ABC-123' yields 123. Values without any number become NaN.

    A column is only written when more than 10% of its non-null values
    yielded a number.

    Parameters:
    - df: pandas DataFrame (never modified; a copy is returned)
    - columns: list of column names to process (None = all object columns)
    - create_new_columns: if True, create new columns with '_numeric' suffix;
      if False, overwrite the source column

    Returns:
    - DataFrame with extracted numeric values
    """
    result_df = df.copy()
    extracted_cols = []
    min_success_rate = 0.1  # a column must beat this to be written

    if columns is None:
        columns = result_df.select_dtypes(include=['object']).columns.tolist()

    # Building blocks (raw strings: a single backslash is the regex escape).
    sign = r'(?:(?<!\w)-)?'
    digits = r'(?:\d{1,3}(?:,\d{3})+|\d+)'

    # Ordered from most to least specific so that '12.5' is not read as the
    # integer 12 and '$1,234.56' is not read as 234.
    patterns = [
        ('scientific', re.compile(sign + r'\d+(?:\.\d+)?[eE][-+]?\d+')),
        ('percentage', re.compile(sign + r'\d+(?:\.\d+)?%')),
        ('currency', re.compile(sign + r'\$\s*-?' + digits + r'(?:\.\d+)?')),
        ('decimal', re.compile(sign + digits + r'\.\d+')),
        ('integer', re.compile(sign + digits)),
    ]
    symbols = re.compile(r'[,$%\s]')

    def parse_number(text):
        """Return (number, pattern_name), or (NaN, None) when nothing parses."""
        for pattern_name, pattern in patterns:
            matches = pattern.findall(text)
            if not matches:
                continue
            best_match = max(matches, key=len)
            try:
                number = float(symbols.sub('', best_match))
            except ValueError:
                continue  # e.g. a doubled sign; let a simpler pattern try
            if pattern_name == 'percentage':
                number /= 100
            return number, pattern_name
        return np.nan, None

    for col in columns:
        if col not in result_df.columns:
            continue

        # Skip if column is not text-like
        if not pd.api.types.is_object_dtype(result_df[col]):
            continue

        try:
            extracted_numbers = []
            pattern_matches = {}
            successful_extractions = 0
            total_attempts = 0

            for value in result_df[col]:
                # Check the real value for null-ness (not its string form) so
                # None is skipped and a literal 'nan' string is still tried.
                if pd.api.types.is_scalar(value) and pd.isna(value):
                    extracted_numbers.append(np.nan)
                    continue

                total_attempts += 1
                number_found, pattern_used = parse_number(str(value))
                if pattern_used is not None:
                    pattern_matches[pattern_used] = pattern_matches.get(pattern_used, 0) + 1
                    successful_extractions += 1
                extracted_numbers.append(number_found)

            numeric_series = pd.Series(extracted_numbers, index=result_df.index, dtype=float)

            success_rate = successful_extractions / total_attempts if total_attempts > 0 else 0

            if success_rate > min_success_rate:
                target_col = f"{col}_numeric" if create_new_columns else col
                result_df[target_col] = numeric_series

                extracted_cols.append({
                    'original_column': col,
                    'new_column': target_col,
                    'success_rate': success_rate,
                    'successful_extractions': successful_extractions,
                    'total_attempts': total_attempts,
                    'pattern_matches': pattern_matches,
                    'created_new_column': create_new_columns
                })

        except Exception as e:
            # Best effort: one problematic column must not abort the others.
            print(f"Warning: Could not extract numbers from column '{col}': {e}")
            continue

    print("=== NUMERIC EXTRACTION RESULTS ===")
    print(f"Columns processed: {len(columns)}")
    print(f"Successful extractions: {len(extracted_cols)}")

    if extracted_cols:
        print("\nExtraction details:")
        for extract in extracted_cols:
            print(f"  {extract['original_column']}:")
            print(f"    → {extract['new_column']} (success rate: {extract['success_rate']:.1%})")
            print(f"    → {extract['successful_extractions']}/{extract['total_attempts']} values extracted")

            # Show pattern usage
            if extract['pattern_matches']:
                pattern_str = ', '.join([f"{k}: {v}" for k, v in extract['pattern_matches'].items()])
                print(f"    → Patterns used: {pattern_str}")

    return result_df

In [7]:
def validate_text_patterns(df, column, pattern, pattern_name=None):
    """
    Validate text values against regex patterns and report violations.

    Every non-null value is converted to ``str`` and must match the pattern
    in full (re.fullmatch). Null values are not checked and are flagged as
    valid in the flag column.

    Parameters:
    - df: pandas DataFrame (never modified)
    - column: column name to validate
    - pattern: regex pattern to validate against
    - pattern_name: descriptive name for the pattern (optional)

    Returns:
    - Dictionary with validation results. On failure (unknown column, invalid
      regex, unexpected error): {'valid': False, 'error': ...}. Otherwise
      'valid' is True and the dict holds 'column', 'pattern', 'pattern_name',
      'total_values', 'matches', 'violations' (count), 'match_rate',
      'violation_details' (first 100), 'validation_column' and 'result_df',
      a copy of df with the boolean '<column>_pattern_valid' column added.
    """
    if column not in df.columns:
        print(f"Warning: Column '{column}' not found in DataFrame")
        return {'valid': False, 'error': 'Column not found'}

    try:
        # Compile pattern for efficiency
        compiled_pattern = re.compile(pattern)
        pattern_display = pattern_name if pattern_name else compiled_pattern.pattern[:50]

        values = df[column]
        checked = values.notna().to_numpy()
        texts = [str(value) for value in values[checked]]
        labels = values.index[checked]
        total_values = len(texts)

        matched = np.array(
            [compiled_pattern.fullmatch(text) is not None for text in texts],
            dtype=bool
        )
        violations = [
            {
                'index': label,
                'value': text[:100],  # Truncate long values
                'issue': 'Pattern mismatch'
            }
            for label, text, ok in zip(labels, texts, matched)
            if not ok
        ]

        # Calculate statistics
        match_count = int(matched.sum())
        violation_count = len(violations)
        match_rate = match_count / total_values if total_values > 0 else 0

        # Build the flag column by position rather than by index label, so a
        # non-unique index cannot flag unrelated rows. Nulls stay True.
        is_valid = np.ones(len(df), dtype=bool)
        is_valid[checked] = matched

        result_df = df.copy()
        validation_col_name = f"{column}_pattern_valid"
        result_df[validation_col_name] = is_valid

        results = {
            'valid': True,
            'column': column,
            'pattern': pattern,
            'pattern_name': pattern_name,
            'total_values': total_values,
            'matches': match_count,
            'violations': violation_count,
            'match_rate': match_rate,
            'violation_details': violations[:100],  # Limit to first 100
            'result_df': result_df,
            'validation_column': validation_col_name
        }

        if total_values == 0:
            print(f"Warning: No non-null values found in column '{column}'")
            # Same shape as the normal result so result['result_df'] always
            # works; 'total' is kept for callers of the older empty result.
            results['total'] = 0
            return results

        # Report results
        print("=== PATTERN VALIDATION RESULTS ===")
        print(f"Column: {column}")
        print(f"Pattern: {pattern_display}")
        print(f"Total values checked: {total_values}")
        print(f"Valid matches: {match_count} ({match_rate:.1%})")
        print(f"Violations found: {violation_count}")

        if violations:
            print(f"\nValidation flag column created: {validation_col_name}")
            print("Sample violations:")
            for violation in violations[:5]:  # Show first 5 violations
                print(f"  Row {violation['index']}: '{violation['value']}'")

            if len(violations) > 5:
                print(f"  ... and {len(violations) - 5} more violations")

        # Common pattern suggestions if many violations
        if violation_count > total_values * 0.3:  # More than 30% violations
            print("\nSuggestion: High violation rate detected.")
            print("Consider reviewing the pattern or cleaning the data first.")

            # Show some common characteristics of violations
            print("Sample violation values for pattern analysis:")
            for violation in violations[:10]:
                print(f"  '{violation['value']}'")

        return results

    except re.error as e:
        print(f"Error: Invalid regex pattern '{pattern}': {e}")
        return {'valid': False, 'error': f'Invalid regex: {e}'}
    except Exception as e:
        print(f"Error during pattern validation: {e}")
        return {'valid': False, 'error': str(e)}
In [8]:
# Prompt text describing the helper functions to the LLM. Declared as a raw
# string because the email regex example contains a backslash-dot, which is
# an invalid escape sequence in a normal string literal (SyntaxWarning on
# recent Python versions). The text itself is unchanged.
helper_docs = r""" Helper functions available:
- clean_text_basic(df, columns=None): Remove extra whitespace and standardize encoding for text columns. Returns DataFrame with cleaned text.
- standardize_case(df, columns=None, case='title'): Standardize case formatting ('upper', 'lower', 'title', 'sentence', 'proper'). Returns DataFrame with standardized case.
- remove_special_chars(df, columns=None, keep_patterns=None): Remove special characters with customizable keep-lists. Returns DataFrame with cleaned text.
- standardize_categorical_values(df, column, mapping_dict=None, similarity_threshold=0.8): Map variants to standard values using fuzzy matching. Returns DataFrame with standardized values.
- extract_numeric_from_text(df, columns=None, create_new_columns=True): Extract numbers from mixed text columns. Returns DataFrame with numeric columns.
- validate_text_patterns(df, column, pattern, pattern_name=None): Validate against regex patterns and create validation flags. Returns results dict and DataFrame.

Examples:
- "Clean text columns" -> df = clean_text_basic(df)
- "Standardize city names" -> df = standardize_categorical_values(df, 'city')
- "Convert to title case" -> df = standardize_case(df, case='title')
- "Remove special characters" -> df = remove_special_chars(df)
- "Extract numbers from product codes" -> df = extract_numeric_from_text(df, ['product_code'])
- "Validate email format" -> result = validate_text_patterns(df, 'email', r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$', 'email')
"""

# **MAIN FEATURE FUNCTION**

In [9]:
def _strip_code_fences(code):
    """Return *code* without a surrounding Markdown code fence, if it has one."""
    fenced = re.match(r'^```[\w+-]*[ \t]*\r?\n(.*?)\r?\n?```\s*$', code, flags=re.DOTALL)
    return fenced.group(1).strip() if fenced else code


def text_processing(df, user_query):
    """
    Main function that gets called by the main router.
    MUST take (df, user_query) and return df

    Sends the helper documentation, a short dataset summary and the user's
    request to the LLM, executes the Python code it returns against a copy of
    ``df`` and returns the resulting ``df``.

    Parameters:
    - df: pandas DataFrame to process (never modified in place)
    - user_query: natural-language description of the requested cleaning

    Returns:
    - The DataFrame bound to ``df`` after the generated code ran. If the code
      raises, or leaves ``df`` bound to something that is not a DataFrame,
      the error and the generated code are printed and an unchanged copy of
      the input is returned.
    """
    system_prompt = f"""
    You are a data cleaning agent focused on text processing and standardization.
    
    Dataset info: Shape: {df.shape}, Sample: {df.head(3).to_string()}

    Libraries available:
    - pd (pandas), np (numpy)
    - math, re, datetime, unicodedata
    - difflib.get_close_matches
    - All helper functions listed above
    
    Rules:
    - Return only executable Python code, no explanations, no markdown blocks
    - Use helper functions for text processing tasks when appropriate
    - ASSUME \"df\" IS ALREADY DEFINED
    - For cleaning operations, use helper functions that modify DataFrame (clean_text_basic, standardize_case, etc.)
    - For validation, use validate_text_patterns which returns results dict
    - ALWAYS assign the result back to df when modifying: df = clean_text_basic(df)
    - For validation results, use: result = validate_text_patterns(df, 'column', 'pattern'); df = result['result_df'] if result['valid'] else df
    - In order to generate a response/message to the user use print statements
    print("message")
    - Write a detailed print message to summarise actions taken and reasons
    
    Common query patterns:
    - "Clean text" or "Basic cleaning" -> df = clean_text_basic(df)
    - "Standardize case" or "Fix capitalization" -> df = standardize_case(df, case='title')
    - "Remove special characters" -> df = remove_special_chars(df)
    - "Standardize city names" or "Fix categorical values" -> df = standardize_categorical_values(df, 'column_name')
    - "Extract numbers" or "Get numeric values" -> df = extract_numeric_from_text(df)
    - "Validate email" or "Check pattern" -> result = validate_text_patterns(df, 'email', r'pattern'); df = result['result_df'] if result['valid'] else df
    - "Clean all text" -> df = clean_text_basic(df); df = standardize_case(df); df = remove_special_chars(df)
    """

    # Create message chain
    messages = [
        SystemMessage(content=helper_docs),
        SystemMessage(content=system_prompt),
        HumanMessage(content=f"User request: {user_query}"),
    ]

    # Call LLM with message chain
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")
    response = llm.invoke(messages)
    # Models sometimes wrap the answer in ``` fences despite the instructions;
    # exec() would fail on them with a SyntaxError.
    generated_code = _strip_code_fences(response.content.strip())

    # Taken before the try block so the fallback below is always bound.
    original_df = df.copy()

    # One namespace serves as both globals and locals. With separate dicts,
    # functions and lambdas defined by the generated code cannot see names
    # such as df or the helpers, because nested scopes only consult globals.
    # Starting from a copy of the module globals keeps math/datetime etc.
    # available while preventing the generated code from rebinding them here.
    namespace = dict(globals())
    namespace.update({
        'df': df.copy(),
        'original_df': original_df,
        'pd': pd,
        'np': np,
        're': re,
        'unicodedata': unicodedata,
        'get_close_matches': get_close_matches,
        'clean_text_basic': clean_text_basic,
        'standardize_case': standardize_case,
        'remove_special_chars': remove_special_chars,
        'standardize_categorical_values': standardize_categorical_values,
        'extract_numeric_from_text': extract_numeric_from_text,
        'validate_text_patterns': validate_text_patterns,
        'print': print
    })

    # SECURITY NOTE: this executes model-generated code with full interpreter
    # privileges, and the prompt contains user text and dataset contents.
    # Only run it in a trusted or sandboxed environment.
    try:
        exec(generated_code, namespace)
    except Exception as e:
        print(f"Error: {e}")
        print(f"Generated Code:{generated_code}")
        return original_df

    result = namespace.get('df')
    if not isinstance(result, pd.DataFrame):
        print(f"Error: generated code left 'df' as {type(result).__name__}, not a DataFrame")
        print(f"Generated Code:{generated_code}")
        return original_df
    return result

# **Testing**

In [None]:
# # Enter CSV filename from "datasets" folder
# dataset_name = "Life Expectancy Data.csv"

# # Build CSV path (to avoid import errors)
# load_dotenv()
# PROJECT_ROOT = Path(os.environ["PROJECT_ROOT"])
# path = PROJECT_ROOT / "datasets" / dataset_name

# df = pd.read_csv(path)
# test_df = df.copy()

In [None]:
# query = "clean text"
# result = text_processing(test_df, query)

In [None]:
# test_df.info()
# print("---------------------------------")
# result.info()