## **Feature:** Advanced Data Profiling

**Names:** Gia Bao Ngo

### **What it does**
Comprehensive data profiling and quality analysis. Generates detailed reports about data characteristics, calculates quality metrics, detects data drift between datasets, analyzes correlations, and identifies patterns in the data. Provides interactive profiling reports and data quality scoring.

In [2]:
# Load dotenv
import os
from dotenv import load_dotenv
load_dotenv()

# Read the OpenAI key from the process environment and warn when it is absent
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    print("OpenAI API Key not found")

# Import libraries
from pathlib import Path
import pandas as pd
import numpy as np
# Additional imports for data profiling
import math
import re
import datetime
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Langchain imports
from langchain_openai import ChatOpenAI  
from langchain.schema import HumanMessage, SystemMessage

### **Helper Functions**
- `generate_profile_report(df, title="Data Profile")` - Comprehensive data profiling with statistics
- `calculate_quality_metrics(df)` - Data quality scoring and metrics
- `analyze_correlations(df, method='pearson', threshold=0.8)` - Correlation analysis with visualization
- `detect_patterns(df, columns=None)` - Pattern detection in data

In [3]:
def generate_profile_report(df, title="Data Profile"):
    """
    Generate comprehensive data profiling report with statistics.

    Builds a nested dictionary describing the dataset as a whole, every
    column individually, the column-type breakdown and three simple quality
    scores, then prints a human-readable summary of it.

    Parameters:
    - df: pandas DataFrame
    - title: title for the report

    Returns:
    - Dictionary containing profiling results, with the keys 'title',
      'dataset_info', 'column_profiles', 'summary_statistics' and
      'data_quality'.

    Notes:
    - Percentages whose denominator is zero (a frame with no rows or no
      cells) are reported as 0.0 instead of raising or producing NaN.
    - Counts are returned as built-in ints rather than numpy scalars.
    """
    n_rows, n_cols = df.shape
    total_cells = n_rows * n_cols

    def pct(part, whole):
        # An empty frame would otherwise divide by zero.
        return float(part) / whole * 100 if whole else 0.0

    # Computed once; the duplicate count feeds three different fields below.
    duplicate_rows = int(df.duplicated().sum())
    duplicate_percentage = pct(duplicate_rows, n_rows)

    profile = {
        'title': title,
        'dataset_info': {
            'shape': df.shape,
            'total_cells': total_cells,
            'memory_usage_mb': float(df.memory_usage(deep=True).sum() / 1024**2),
            'duplicate_rows': duplicate_rows,
            'duplicate_percentage': duplicate_percentage,
        },
        'column_profiles': {},
        'summary_statistics': {},
        'data_quality': {},
    }

    # Column-by-column profiling
    unique_counts = []      # distinct non-null values per column, in column order
    datetime_columns = 0    # counted with the same predicate used for profiling
    for col in df.columns:
        series = df[col]
        non_null = series.dropna()
        null_count = int(series.isnull().sum())
        unique_count = int(series.nunique())
        unique_counts.append(unique_count)

        col_profile = {
            'dtype': str(series.dtype),
            'non_null_count': int(series.count()),
            'null_count': null_count,
            'null_percentage': pct(null_count, n_rows),
            'unique_count': unique_count,
            'unique_percentage': pct(unique_count, n_rows),
        }

        if pd.api.types.is_numeric_dtype(series):
            # Numeric statistics
            if len(non_null) > 0:
                if pd.api.types.is_bool_dtype(series):
                    # pandas treats bool as numeric; cast so that quantile and
                    # moment arithmetic operate on 0.0/1.0 values.
                    non_null = non_null.astype(float)
                col_profile.update({
                    'min': float(non_null.min()),
                    'max': float(non_null.max()),
                    'mean': float(non_null.mean()),
                    'median': float(non_null.median()),
                    'std': float(non_null.std()),
                    'q25': float(non_null.quantile(0.25)),
                    'q75': float(non_null.quantile(0.75)),
                    'skewness': float(stats.skew(non_null)),
                    'kurtosis': float(stats.kurtosis(non_null)),
                    'zeros': int((non_null == 0).sum()),
                    'negative_values': int((non_null < 0).sum()),
                    'infinite_values': int(np.isinf(non_null).sum()),
                })

        elif pd.api.types.is_datetime64_any_dtype(series):
            # Datetime statistics
            datetime_columns += 1
            if len(non_null) > 0:
                earliest, latest = non_null.min(), non_null.max()
                col_profile.update({
                    'min_date': str(earliest),
                    'max_date': str(latest),
                    'date_range_days': (latest - earliest).days,
                })

        else:
            # Text/categorical statistics
            if len(non_null) > 0:
                # Most common values
                top_values = non_null.value_counts().head(5)
                col_profile['top_5_values'] = {
                    value: int(count) for value, count in top_values.items()
                }

                # Text length statistics (for object types)
                if series.dtype == 'object':
                    as_text = non_null.astype(str)
                    text_lengths = as_text.str.len()
                    col_profile.update({
                        'min_length': int(text_lengths.min()),
                        'max_length': int(text_lengths.max()),
                        'mean_length': float(text_lengths.mean()),
                        'empty_strings': int((as_text == '').sum()),
                        'whitespace_only': int(as_text.str.strip().eq('').sum()),
                    })

        profile['column_profiles'][col] = col_profile

    # Summary statistics
    unique_ratios = [count / n_rows if n_rows else 0.0 for count in unique_counts]
    profile['summary_statistics'] = {
        'total_columns': n_cols,
        'numeric_columns': len(df.select_dtypes(include=[np.number]).columns),
        'categorical_columns': len(df.select_dtypes(include=['object', 'category']).columns),
        'datetime_columns': datetime_columns,
        'boolean_columns': len(df.select_dtypes(include=['bool']).columns),
        'high_cardinality_cols': sum(ratio > 0.95 for ratio in unique_ratios),
        'low_cardinality_cols': sum(ratio < 0.05 for ratio in unique_ratios),
        'constant_columns': sum(count <= 1 for count in unique_counts),
    }

    # Data quality assessment
    missing_cells = int(df.isnull().sum().sum())
    completeness_score = pct(total_cells - missing_cells, total_cells)
    uniqueness_score = pct(sum(unique_counts), total_cells)
    consistency_score = 100 - duplicate_percentage

    # Weighted blend: completeness 40%, consistency 30%, uniqueness 30%
    # (uniqueness is capped at 100 before weighting).
    overall_quality_score = (
        completeness_score * 0.4 +
        consistency_score * 0.3 +
        min(uniqueness_score, 100) * 0.3
    )

    profile['data_quality'] = {
        'completeness_score': completeness_score,
        'uniqueness_score': uniqueness_score,
        'consistency_score': consistency_score,
        'overall_quality_score': overall_quality_score,
    }

    # Print comprehensive report
    summary = profile['summary_statistics']
    print(f"=== {title.upper()} ===")
    print(f"Dataset Shape: {profile['dataset_info']['shape']}")
    print(f"Memory Usage: {profile['dataset_info']['memory_usage_mb']:.2f} MB")
    print(f"Duplicate Rows: {duplicate_rows} ({duplicate_percentage:.1f}%)")

    print("\nCOLUMN BREAKDOWN:")
    print(f"  Numeric: {summary['numeric_columns']}")
    print(f"  Categorical: {summary['categorical_columns']}")
    print(f"  DateTime: {summary['datetime_columns']}")
    print(f"  Boolean: {summary['boolean_columns']}")

    print("\nDATA QUALITY SCORES:")
    print(f"  Completeness: {completeness_score:.1f}%")
    print(f"  Consistency: {consistency_score:.1f}%")
    print(f"  Uniqueness: {uniqueness_score:.1f}%")
    print(f"  Overall Quality: {overall_quality_score:.1f}%")

    # Show top issues
    issues = []
    if summary['constant_columns'] > 0:
        issues.append(f"{summary['constant_columns']} constant columns")
    if duplicate_percentage > 5:
        issues.append(f"High duplicate rate ({duplicate_percentage:.1f}%)")

    high_missing_cols = [col for col, prof in profile['column_profiles'].items()
                         if prof['null_percentage'] > 20]
    if high_missing_cols:
        issues.append(f"{len(high_missing_cols)} columns with >20% missing values")

    if issues:
        print("\nDATA QUALITY ISSUES DETECTED:")
        for issue in issues:
            print(f"  - {issue}")

    return profile

In [4]:
def calculate_quality_metrics(df):
    """
    Calculate comprehensive data quality scoring and metrics.

    Scores the frame on five dimensions (completeness, uniqueness,
    consistency, validity, accuracy), combines them into a weighted overall
    score with a textual level, and prints a summary report.

    Parameters:
    - df: pandas DataFrame

    Returns:
    - Dictionary with quality metrics and scores, keyed by 'completeness',
      'uniqueness', 'consistency', 'validity', 'accuracy' and 'overall'.

    Notes:
    - Ratio-based scores are reported as 0.0 when the frame has no rows,
      instead of raising or producing NaN.
    - Column labels are converted with str() before keyword matching, so
      non-string labels are accepted.
    - The consistency, validity and accuracy checks are heuristics based on
      value samples and column-name keywords.
    """
    n_rows, n_cols = df.shape
    total_cells = n_rows * n_cols

    def pct(part, whole):
        # An empty frame would otherwise divide by zero.
        return float(part) / whole * 100 if whole else 0.0

    def looks_numeric(value):
        # float() accepts anything a numeric column could have held as text.
        try:
            float(str(value))
        except ValueError:
            return False
        return True

    def name_mentions(col, keywords):
        # str() so that integer or tuple column labels do not crash .lower().
        lowered = str(col).lower()
        return any(word in lowered for word in keywords)

    metrics = {
        'completeness': {},
        'uniqueness': {},
        'consistency': {},
        'validity': {},
        'accuracy': {},
        'overall': {}
    }

    # 1. COMPLETENESS (missing data analysis)
    missing_by_col = df.isnull().sum()
    missing_cells = int(missing_by_col.sum())
    completeness_score = pct(total_cells - missing_cells, total_cells)

    metrics['completeness'] = {
        'overall_score': completeness_score,
        'missing_cells': missing_cells,
        'missing_percentage': pct(missing_cells, total_cells),
        'columns_with_missing': int((missing_by_col > 0).sum()),
        'worst_columns': missing_by_col.nlargest(5).to_dict(),
        'column_scores': {
            col: pct(n_rows - missing, n_rows)
            for col, missing in missing_by_col.items()
        },
    }

    # 2. UNIQUENESS (duplicate analysis)
    duplicate_rows = int(df.duplicated().sum())
    uniqueness_score = pct(n_rows - duplicate_rows, n_rows)
    col_uniqueness = {col: pct(df[col].nunique(), n_rows) for col in df.columns}

    metrics['uniqueness'] = {
        'overall_score': uniqueness_score,
        'duplicate_rows': duplicate_rows,
        'duplicate_percentage': pct(duplicate_rows, n_rows),
        'column_scores': col_uniqueness,
        'low_uniqueness_columns': [col for col, score in col_uniqueness.items() if score < 5]
    }

    # 3. CONSISTENCY (data format and type consistency)
    # Single-backslash raw strings: \d must reach the regex engine as the
    # digit class, not as an escaped backslash followed by the letter "d".
    date_patterns = [
        r'\d{4}-\d{2}-\d{2}',  # YYYY-MM-DD
        r'\d{2}/\d{2}/\d{4}',  # MM/DD/YYYY
        r'\d{2}-\d{2}-\d{4}',  # MM-DD-YYYY
    ]
    type_issues = []
    date_issues = []
    consistency_score = 100  # Start with perfect score and deduct

    for col in df.select_dtypes(include=['object']).columns:
        non_null_data = df[col].dropna()
        if len(non_null_data) == 0:
            continue

        # Numeric data stored as strings (first 100 values sampled)
        numeric_sample = non_null_data.head(100)
        numeric_count = sum(1 for val in numeric_sample if looks_numeric(val))
        if numeric_count > len(numeric_sample) * 0.8:  # 80% numeric
            type_issues.append(f"Column '{col}' appears to contain numeric data but is stored as object")
            consistency_score -= 5

        # More than one date layout inside the same column (first 50 sampled)
        date_sample = non_null_data.head(50).astype(str)
        layouts_seen = sum(
            1 for pattern in date_patterns
            if date_sample.str.contains(pattern, na=False).sum() > 0
        )
        if layouts_seen > 1 and len(date_sample) > 10:
            date_issues.append(f"Column '{col}' has mixed date formats")
            consistency_score -= 3

    # Type issues are listed before date-format issues.
    consistency_issues = type_issues + date_issues

    metrics['consistency'] = {
        'overall_score': max(consistency_score, 0),
        'issues_found': len(consistency_issues),
        'issues_detail': consistency_issues
    }

    # 4. VALIDITY (format and constraint validation)
    validity_issues = []
    validity_score = 100

    # Check for outliers in numeric columns using IQR
    for col in df.select_dtypes(include=[np.number]).columns:
        non_null_data = df[col].dropna()
        if len(non_null_data) <= 4:  # quartiles need at least 5 values here
            continue
        q1 = non_null_data.quantile(0.25)
        q3 = non_null_data.quantile(0.75)
        iqr = q3 - q1
        if iqr <= 0:
            continue
        outlier_mask = (non_null_data < (q1 - 1.5 * iqr)) | (non_null_data > (q3 + 1.5 * iqr))
        outlier_count = int(outlier_mask.sum())
        if outlier_count > len(non_null_data) * 0.1:  # More than 10% outliers
            validity_issues.append(f"Column '{col}' has {outlier_count} potential outliers")
            validity_score -= 2

    # Check for negative values in columns whose name suggests a non-negative quantity
    positive_keywords = ['age', 'price', 'count', 'amount', 'quantity', 'score']
    for col in df.columns:
        if name_mentions(col, positive_keywords) and pd.api.types.is_numeric_dtype(df[col]):
            negative_count = int((df[col] < 0).sum())
            if negative_count > 0:
                validity_issues.append(f"Column '{col}' has {negative_count} negative values (might be invalid)")
                validity_score -= 1

    metrics['validity'] = {
        'overall_score': max(validity_score, 0),
        'issues_found': len(validity_issues),
        'issues_detail': validity_issues
    }

    # 5. ACCURACY (placeholder - would need external reference data)
    accuracy_issues = []
    accuracy_score = 95  # Default high score since we can't validate against truth

    # Check for impossible dates (future years in birth-year columns)
    current_year = datetime.datetime.now().year
    for col in df.columns:
        if name_mentions(col, ['birth', 'born']) and pd.api.types.is_numeric_dtype(df[col]):
            future_years = int((df[col] > current_year).sum())
            if future_years > 0:
                accuracy_issues.append(f"Column '{col}' has {future_years} future years (impossible for birth dates)")
                accuracy_score -= 3

    metrics['accuracy'] = {
        'overall_score': max(accuracy_score, 0),
        'issues_found': len(accuracy_issues),
        'issues_detail': accuracy_issues
    }

    # 6. OVERALL QUALITY SCORE
    # Weighted average of all dimensions
    weights = {
        'completeness': 0.25,
        'uniqueness': 0.20,
        'consistency': 0.25,
        'validity': 0.20,
        'accuracy': 0.10
    }
    overall_score = sum(
        metrics[dimension]['overall_score'] * weight
        for dimension, weight in weights.items()
    )

    # Determine quality level
    if overall_score >= 90:
        quality_level = "Excellent"
    elif overall_score >= 80:
        quality_level = "Good"
    elif overall_score >= 70:
        quality_level = "Fair"
    elif overall_score >= 60:
        quality_level = "Poor"
    else:
        quality_level = "Very Poor"

    metrics['overall'] = {
        'score': overall_score,
        'level': quality_level,
        'weights_used': weights
    }

    # Print comprehensive quality report
    print("=== DATA QUALITY ASSESSMENT ===")
    print(f"Overall Quality Score: {overall_score:.1f}% ({quality_level})")
    print(f"Dataset: {df.shape[0]} rows × {df.shape[1]} columns")

    print("\nQUALITY DIMENSIONS:")
    print(f"  Completeness: {metrics['completeness']['overall_score']:.1f}% ({metrics['completeness']['missing_cells']} missing cells)")
    print(f"  Uniqueness: {metrics['uniqueness']['overall_score']:.1f}% ({metrics['uniqueness']['duplicate_rows']} duplicate rows)")
    print(f"  Consistency: {metrics['consistency']['overall_score']:.1f}% ({metrics['consistency']['issues_found']} issues)")
    print(f"  Validity: {metrics['validity']['overall_score']:.1f}% ({metrics['validity']['issues_found']} issues)")
    print(f"  Accuracy: {metrics['accuracy']['overall_score']:.1f}% ({metrics['accuracy']['issues_found']} issues)")

    # Show worst completeness columns; columns with nothing missing are not
    # listed, so a fully populated frame prints no such section.
    worst_with_missing = [
        (col, count)
        for col, count in metrics['completeness']['worst_columns'].items()
        if count > 0
    ][:3]
    if worst_with_missing:
        print("\nCOLUMNS WITH MOST MISSING DATA:")
        for col, missing_count in worst_with_missing:
            print(f"  {col}: {missing_count} missing ({pct(missing_count, n_rows):.1f}%)")

    # Show all issues found
    all_issues = consistency_issues + validity_issues + accuracy_issues
    if all_issues:
        print(f"\nQUALITY ISSUES DETECTED ({len(all_issues)} total):")
        for issue in all_issues[:10]:  # Show first 10 issues
            print(f"  - {issue}")
        if len(all_issues) > 10:
            print(f"  ... and {len(all_issues) - 10} more issues")

    return metrics

In [6]:
def analyze_correlations(df, method='pearson', threshold=0.8, show_plots=True):
    """
    Analyze correlations with visualization and detailed reporting.

    Computes the pairwise correlation matrix of the numeric columns, lists
    the pairs whose absolute correlation reaches the threshold, groups those
    pairs into multicollinear clusters, prints a report and optionally draws
    a heatmap plus a histogram of the coefficients.

    Parameters:
    - df: pandas DataFrame
    - method: correlation method ('pearson', 'spearman', 'kendall')
    - threshold: correlation threshold for flagging high correlations
    - show_plots: draw the matplotlib/seaborn figures (default True); pass
      False to get only the printed report and the returned dictionary

    Returns:
    - Dictionary with correlation analysis results. When fewer than two
      numeric columns exist, or the correlation computation fails, the
      dictionary is returned with its initial empty values.
    """
    correlation_report = {
        'method': method,
        'threshold': threshold,
        'correlation_matrix': None,
        'high_correlations': [],
        'correlation_summary': {},
        'multicollinearity': {}
    }

    # Get numeric columns for correlation analysis
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    if len(numeric_cols) < 2:
        print("=== CORRELATION ANALYSIS ===")
        print("Error: Need at least 2 numeric columns for correlation analysis.")
        print(f"Found {len(numeric_cols)} numeric columns: {numeric_cols}")
        return correlation_report

    # Calculate correlation matrix
    try:
        correlation_matrix = df[numeric_cols].corr(method=method)
    except Exception as e:
        # Reported rather than raised, e.g. for an unsupported method name.
        print(f"Error calculating correlations: {e}")
        return correlation_report
    correlation_report['correlation_matrix'] = correlation_matrix

    # Walk the strict upper triangle once: each unordered pair appears a
    # single time and the diagonal (self-correlation) is skipped.
    columns = list(correlation_matrix.columns)
    values = correlation_matrix.to_numpy(dtype=float)
    upper_rows, upper_cols = np.triu_indices(len(columns), k=1)

    high_corr_pairs = []
    for row, col in zip(upper_rows, upper_cols):
        corr_value = float(values[row, col])
        # NaN compares False here, so undefined correlations are never flagged.
        if abs(corr_value) >= threshold:
            high_corr_pairs.append({
                'column1': columns[row],
                'column2': columns[col],
                'correlation': corr_value,
                'absolute_correlation': abs(corr_value),
                'relationship': 'positive' if corr_value > 0 else 'negative'
            })

    # Sort by absolute correlation descending
    high_corr_pairs.sort(key=lambda pair: pair['absolute_correlation'], reverse=True)
    correlation_report['high_correlations'] = high_corr_pairs

    # Correlation summary statistics over the defined (non-NaN) coefficients
    corr_array = values[upper_rows, upper_cols]
    corr_array = corr_array[~np.isnan(corr_array)]

    if len(corr_array) > 0:
        abs_corr = np.abs(corr_array)
        correlation_report['correlation_summary'] = {
            'total_pairs': len(corr_array),
            'mean_correlation': float(np.mean(abs_corr)),
            'max_correlation': float(np.max(abs_corr)),
            'min_correlation': float(np.min(abs_corr)),
            'std_correlation': float(np.std(corr_array)),
            'high_correlation_count': len(high_corr_pairs),
            'high_correlation_percentage': (len(high_corr_pairs) / len(corr_array)) * 100
        }
    summary = correlation_report['correlation_summary']

    # Multicollinearity detection (using correlation-based approach)
    multicollinear_groups = _group_correlated_columns(high_corr_pairs, columns)
    correlation_report['multicollinearity'] = {
        'groups_found': len(multicollinear_groups),
        'groups_detail': multicollinear_groups,
        'affected_columns': sum(group['size'] for group in multicollinear_groups),
        'recommendation': get_multicollinearity_recommendation(multicollinear_groups)
    }

    # pandas also accepts a callable as the method; label it by its name.
    if isinstance(method, str):
        method_label = method.title()
    else:
        method_label = getattr(method, '__name__', str(method))

    # Print comprehensive correlation report
    print("=== CORRELATION ANALYSIS ===")
    print(f"Method: {method_label}")
    print(f"Numeric columns analyzed: {len(numeric_cols)}")
    print(f"Total correlation pairs: {summary.get('total_pairs', 0)}")

    if summary:
        print("\nCORRELATION STATISTICS:")
        print(f"  Mean absolute correlation: {summary['mean_correlation']:.3f}")
        print(f"  Maximum correlation: {summary['max_correlation']:.3f}")
        print(f"  Standard deviation: {summary['std_correlation']:.3f}")
        print(f"  High correlations (≥{threshold}): {summary['high_correlation_count']} ({summary['high_correlation_percentage']:.1f}%)")

    # Show highest correlations
    if high_corr_pairs:
        print(f"\nHIGHEST CORRELATIONS (≥{threshold}):")
        for pair in high_corr_pairs[:10]:  # Show top 10
            print(f"  {pair['column1']} ↔ {pair['column2']}: {pair['correlation']:.3f} ({pair['relationship']})")

        if len(high_corr_pairs) > 10:
            print(f"  ... and {len(high_corr_pairs) - 10} more high correlations")
    else:
        print(f"\nNo correlations above threshold ({threshold}) found.")

    # Multicollinearity report
    if multicollinear_groups:
        print("\nMULTICOLLINEARITY DETECTED:")
        print(f"  Groups found: {len(multicollinear_groups)}")
        print(f"  Columns affected: {correlation_report['multicollinearity']['affected_columns']}")

        for i, group in enumerate(multicollinear_groups[:3], 1):  # Show first 3 groups
            print(f"  Group {i}: {group['columns']} (max corr: {group['max_correlation']:.3f})")

        if len(multicollinear_groups) > 3:
            print(f"  ... and {len(multicollinear_groups) - 3} more groups")

        print(f"\nRECOMMENDATION: {correlation_report['multicollinearity']['recommendation']}")
    else:
        print("\nNo significant multicollinearity detected.")

    if show_plots:
        try:
            # Imported here so a missing plotting library degrades to the
            # note below instead of making the whole analysis unusable.
            import matplotlib.pyplot as plt
            import seaborn as sns

            plt.figure(figsize=(12, 8))

            # Correlation heatmap; the upper triangle (and diagonal) is
            # masked because it mirrors the lower one.
            mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
            sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap='coolwarm', center=0,
                        square=True, linewidths=0.5, fmt='.2f')
            plt.title(f'{method_label} Correlation Matrix')
            plt.tight_layout()
            plt.show()

            # Distribution of the pairwise coefficients
            if len(corr_array) > 1:
                plt.figure(figsize=(10, 6))
                plt.hist(corr_array, bins=20, alpha=0.7, edgecolor='black')
                plt.axvline(threshold, color='red', linestyle='--', label=f'Threshold: {threshold}')
                plt.axvline(-threshold, color='red', linestyle='--')
                plt.xlabel('Correlation Coefficient')
                plt.ylabel('Frequency')
                plt.title('Distribution of Correlation Coefficients')
                plt.legend()
                plt.tight_layout()
                plt.show()

        except ImportError:
            print("\nNote: Install matplotlib and seaborn for correlation visualizations.")
        except Exception as e:
            print(f"\nVisualization error: {e}")

    return correlation_report


def _group_correlated_columns(pairs, column_order):
    """
    Group columns linked by high-correlation pairs into connected clusters.

    Two columns belong to the same group when a chain of flagged pairs joins
    them, regardless of the order in which the pairs are listed.

    Parameters:
    - pairs: list of pair dictionaries with 'column1', 'column2' and
      'absolute_correlation', strongest first
    - column_order: column labels in the order used to list group members

    Returns:
    - List of dictionaries with 'columns', 'size' and 'max_correlation';
      groups appear in the order of their strongest pair.
    """
    neighbours = {}
    for pair in pairs:
        neighbours.setdefault(pair['column1'], set()).add(pair['column2'])
        neighbours.setdefault(pair['column2'], set()).add(pair['column1'])

    position = {column: index for index, column in enumerate(column_order)}
    grouped = set()
    groups = []
    for pair in pairs:
        seed = pair['column1']
        if seed in grouped:
            continue

        # Depth-first walk collects every column reachable from the seed.
        component = set()
        pending = [seed]
        while pending:
            column = pending.pop()
            if column in component:
                continue
            component.add(column)
            pending.extend(neighbours[column] - component)
        grouped |= component

        groups.append({
            'columns': sorted(component, key=position.__getitem__),
            'size': len(component),
            # A pair lies inside the component as soon as one end does.
            'max_correlation': max(p['absolute_correlation'] for p in pairs
                                   if p['column1'] in component),
        })
    return groups


def get_multicollinearity_recommendation(groups):
    """Get recommendation for handling multicollinearity."""
    if len(groups) == 0:
        return "No multicollinearity issues detected."
    elif len(groups) == 1:
        return "Consider removing one variable from the correlated group or use dimensionality reduction."
    else:
        return "Multiple multicollinear groups found. Consider feature selection, PCA, or regularization techniques."

In [7]:
# Regular expressions used by detect_patterns. They are applied with
# ``Series.str.contains``, so a value matches when the pattern occurs anywhere in it.
_EMAIL_REGEX = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
_PHONE_REGEXES = (
    r'\(\d{3}\)\s*\d{3}-\d{4}',  # (123) 456-7890
    r'\d{3}-\d{3}-\d{4}',        # 123-456-7890
    r'\d{10}',                   # 1234567890
    r'\+1\d{10}',                # +11234567890
)
_URL_REGEX = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
_DATE_REGEXES = (
    r'\d{4}-\d{2}-\d{2}',  # YYYY-MM-DD
    r'\d{2}/\d{2}/\d{4}',  # MM/DD/YYYY
    r'\d{2}-\d{2}-\d{4}',  # MM-DD-YYYY
)
_ALPHANUMERIC_REGEX = r'^[A-Za-z0-9]+$'


def _count_matching_values(values, regexes):
    """Count the values containing at least one of *regexes*; each value is counted once."""
    matched = np.zeros(len(values), dtype=bool)
    for regex in regexes:
        matched |= values.str.contains(regex, regex=True, na=False).to_numpy(dtype=bool)
    return int(matched.sum())


def _format_signature(value):
    """Collapse a value into a shape string: letters become 'A', digits become 'N'."""
    # Letters are replaced first; replacing digits first would turn them into 'N',
    # which the letter substitution would then rewrite to 'A'.
    lettered = re.sub(r'[A-Za-z]', 'A', str(value))
    return re.sub(r'\d', 'N', lettered)


def _detect_text_patterns(col, values):
    """Build the text-pattern entry for one column of non-null string values."""
    total = len(values)
    found = []

    email_matches = _count_matching_values(values, [_EMAIL_REGEX])
    if email_matches > total * 0.5:
        found.append(f'Email format ({email_matches}/{total} matches)')

    phone_matches = _count_matching_values(values, _PHONE_REGEXES)
    if phone_matches > total * 0.5:
        found.append(f'Phone format ({phone_matches}/{total} matches)')

    url_matches = _count_matching_values(values, [_URL_REGEX])
    if url_matches > total * 0.3:
        found.append(f'URL format ({url_matches}/{total} matches)')

    date_matches = _count_matching_values(values, _DATE_REGEXES)
    if date_matches > total * 0.5:
        found.append(f'Date-like format ({date_matches}/{total} matches)')

    # ID-like: mostly one string length and mostly purely alphanumeric.
    if total > 10:
        lengths = values.str.len()
        mode_length = lengths.mode()[0]
        if (lengths == mode_length).sum() / total > 0.8:
            alphanum_matches = _count_matching_values(values, [_ALPHANUMERIC_REGEX])
            if alphanum_matches > total * 0.8:
                found.append(
                    f'ID-like format (length={mode_length}, {alphanum_matches}/{total} alphanumeric)'
                )

    # Format consistency is measured on the first 100 values only.
    sample = values.head(100)
    format_counter = Counter(_format_signature(value) for value in sample)
    top_count = format_counter.most_common(1)[0][1]

    return {
        'column': col,
        'patterns_found': found,
        'format_consistency': top_count / len(sample),
        'common_formats': dict(format_counter.most_common(5)),
    }


def _classify_distribution(values):
    """Label the distribution via Shapiro-Wilk, falling back to skewness; 'unknown' on failure."""
    if len(values) <= 7:
        return 'unknown'
    try:
        from scipy import stats

        # Shapiro-Wilk runs on at most 5000 points; the fixed seed keeps reports reproducible.
        sample = values.sample(min(5000, len(values)), random_state=0)
        _, normal_p = stats.shapiro(sample)
        if normal_p > 0.05:
            return 'approximately_normal'

        skewness = stats.skew(values)
        if abs(skewness) > 2:
            return 'highly_skewed'
        if abs(skewness) > 1:
            return 'moderately_skewed'
        return 'roughly_symmetric'
    except Exception:
        # Best effort: a missing scipy or a failing test must not abort the whole report.
        return 'unknown'


def _detect_numeric_patterns(col, values):
    """Build the numeric-pattern entry for one column of non-null numeric values."""
    found = []

    # Whole numbers stored as floats (infinite values are never integer-like).
    if values.dtype == 'float64':
        if np.isfinite(values).all() and (values == np.floor(values)).all():
            found.append('Integer values in float column')

    lowest, highest = values.min(), values.max()

    # The 0-1 range is tested first because it is a subset of the 0-100 range.
    if lowest >= 0 and highest <= 1:
        found.append('Probability-like values (0-1 range)')
    elif lowest >= 0 and highest <= 100:
        found.append('Percentage-like values (0-100 range)')

    if lowest >= 0 and highest <= 150 and values.dtype in ['int64', 'int32']:
        found.append('Age-like values (0-150 integer range)')

    if lowest >= 1900 and highest <= 2030 and values.nunique() > 5:
        found.append('Year-like values (1900-2030 range)')

    # Outlier share by the 1.5 * IQR rule; stays 'normal' when the IQR is zero.
    outlier_pattern = 'normal'
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    if iqr > 0:
        outliers = values[(values < (q1 - 1.5 * iqr)) | (values > (q3 + 1.5 * iqr))]
        outlier_percentage = len(outliers) / len(values) * 100
        if outlier_percentage > 10:
            outlier_pattern = 'high_outliers'
        elif outlier_percentage > 5:
            outlier_pattern = 'moderate_outliers'
        else:
            outlier_pattern = 'few_outliers'

    return {
        'column': col,
        'patterns_found': found,
        'distribution_type': _classify_distribution(values),
        'outlier_pattern': outlier_pattern,
    }


def _detect_categorical_patterns(col, values):
    """Build the cardinality/balance entry for one column of non-null categorical values."""
    value_counts = values.value_counts()
    cardinality = len(value_counts)
    cardinality_ratio = cardinality / len(values)

    if cardinality == 2:
        distribution = 'binary'
    elif cardinality <= 10:
        distribution = 'low_cardinality'
    elif cardinality_ratio < 0.05:
        distribution = 'low_cardinality_high_frequency'
    elif cardinality_ratio > 0.9:
        distribution = 'high_cardinality_unique'
    else:
        distribution = 'medium_cardinality'

    # Append an imbalance suffix when one value dominates the column.
    top_share = value_counts.max() / len(values)
    if top_share > 0.9:
        distribution += '_highly_imbalanced'
    elif top_share > 0.7:
        distribution += '_imbalanced'

    return {
        'column': col,
        'cardinality': cardinality,
        'cardinality_ratio': cardinality_ratio,
        'distribution_pattern': distribution,
        'top_values': dict(value_counts.head(5)),
    }


def _detect_missing_patterns(df, missing_cols):
    """Summarise co-occurring missing values; returns {} when no column has missing data."""
    if not missing_cols:
        return {}

    missing_matrix = df[missing_cols].isnull()
    report = {
        'columns_with_missing': missing_cols,
        'total_missing_cells': missing_matrix.sum().sum(),
        # The ten most frequent row-wise missingness combinations.
        'common_missing_patterns': dict(missing_matrix.value_counts().head(10)),
        # Kept as an empty dict for a single column (historical shape); a list otherwise.
        'missing_correlation': {},
    }

    if len(missing_cols) > 1:
        missing_corr = missing_matrix.corr()
        names = missing_corr.columns
        correlated = []
        for i in range(len(names)):
            for j in range(i + 1, len(names)):
                corr_val = missing_corr.iloc[i, j]
                if abs(corr_val) > 0.5:
                    correlated.append({
                        'col1': names[i],
                        'col2': names[j],
                        'correlation': corr_val,
                    })
        report['missing_correlation'] = correlated

    return report


def detect_patterns(df, columns=None):
    """
    Detect patterns in data columns and print a summary report.

    Object columns are checked for email, phone, URL, date-like and ID-like
    formats plus format consistency; numeric columns for value-range patterns,
    distribution shape and IQR outliers; category-dtype columns for cardinality
    and imbalance; and columns with nulls for correlated missingness.

    Parameters:
    - df: pandas DataFrame
    - columns: list of column names to analyze (None = all columns)

    Returns:
    - Dictionary with pattern analysis results (keys: text_patterns,
      numeric_patterns, temporal_patterns, categorical_patterns,
      missing_patterns, summary). temporal_patterns is always empty.

    Raises:
    - KeyError: if a requested column is not in the DataFrame
    """
    pattern_report = {
        'text_patterns': {},
        'numeric_patterns': {},
        'temporal_patterns': {},
        'categorical_patterns': {},
        'missing_patterns': {},
        'summary': {}
    }

    if columns is None:
        columns = df.columns.tolist()

    # Fail fast with a clear message instead of part-way through the analysis.
    unknown_cols = [col for col in columns if col not in df.columns]
    if unknown_cols:
        raise KeyError(f"Columns not found in DataFrame: {unknown_cols}")

    print("=== PATTERN DETECTION ANALYSIS ===")
    print(f"Analyzing {len(columns)} columns for patterns...")

    # 1. TEXT PATTERNS (object columns)
    object_cols = df.select_dtypes(include=['object']).columns
    text_patterns = {}
    for col in columns:
        if col not in object_cols:
            continue
        values = df[col].dropna().astype(str)
        if len(values) > 0:
            text_patterns[col] = _detect_text_patterns(col, values)
    pattern_report['text_patterns'] = text_patterns

    # 2. NUMERIC PATTERNS
    number_cols = df.select_dtypes(include=[np.number]).columns
    numeric_patterns = {}
    for col in columns:
        if col not in number_cols:
            continue
        values = df[col].dropna()
        if len(values) > 0:
            numeric_patterns[col] = _detect_numeric_patterns(col, values)
    pattern_report['numeric_patterns'] = numeric_patterns

    # 3. CATEGORICAL PATTERNS (category dtype only; object columns are covered as text)
    categorical_patterns = {}
    for col in columns:
        if not isinstance(df[col].dtype, pd.CategoricalDtype):
            continue
        values = df[col].dropna()
        if len(values) > 0:
            categorical_patterns[col] = _detect_categorical_patterns(col, values)
    pattern_report['categorical_patterns'] = categorical_patterns

    # 4. MISSING DATA PATTERNS
    missing_cols = [col for col in columns if df[col].isnull().any()]
    missing_patterns = _detect_missing_patterns(df, missing_cols)
    pattern_report['missing_patterns'] = missing_patterns

    # 5. SUMMARY
    total_patterns_found = sum(
        len(entry['patterns_found'])
        for section in (text_patterns, numeric_patterns, categorical_patterns)
        for entry in section.values()
        if 'patterns_found' in entry
    )

    pattern_report['summary'] = {
        'columns_analyzed': len(columns),
        'text_columns_analyzed': len(text_patterns),
        'numeric_columns_analyzed': len(numeric_patterns),
        'categorical_columns_analyzed': len(categorical_patterns),
        'columns_with_missing': len(missing_cols),
        'total_patterns_found': total_patterns_found
    }

    # Print comprehensive pattern report
    print("\nPATTERN ANALYSIS SUMMARY:")
    print(f"  Text columns: {len(text_patterns)}")
    print(f"  Numeric columns: {len(numeric_patterns)}")
    print(f"  Categorical columns: {len(categorical_patterns)}")
    print(f"  Columns with missing data: {len(missing_cols)}")
    print(f"  Total patterns detected: {total_patterns_found}")

    if text_patterns:
        print("\nTEXT PATTERNS DETECTED:")
        for col, patterns in text_patterns.items():
            if patterns['patterns_found']:
                print(f"  {col}: {', '.join(patterns['patterns_found'])}")

    if numeric_patterns:
        print("\nNUMERIC PATTERNS DETECTED:")
        for col, patterns in numeric_patterns.items():
            if patterns['patterns_found']:
                print(f"  {col}: {', '.join(patterns['patterns_found'])}")

    if categorical_patterns:
        print("\nCATEGORICAL PATTERNS:")
        for col, patterns in categorical_patterns.items():
            print(f"  {col}: {patterns['distribution_pattern']} (cardinality: {patterns['cardinality']})")

    if missing_patterns and missing_patterns.get('missing_correlation'):
        print("\nMISSING DATA CORRELATION:")
        for corr in missing_patterns['missing_correlation']:
            print(f"  {corr['col1']} ↔ {corr['col2']}: {corr['correlation']:.3f}")

    return pattern_report

In [None]:
# Prompt text listing the helper functions and example request -> code mappings.
# profiling() passes it to the LLM as the first system message.
helper_docs = """ Helper functions available:
- generate_profile_report(df, title="Data Profile"): Comprehensive data profiling with statistics, quality metrics, and detailed column analysis. Returns detailed profile dictionary.
- calculate_quality_metrics(df): Data quality scoring across 5 dimensions (completeness, uniqueness, consistency, validity, accuracy). Returns metrics dictionary with scores.
- analyze_correlations(df, method='pearson', threshold=0.8): Correlation analysis with visualization and multicollinearity detection. Returns correlation analysis results.
- detect_patterns(df, columns=None): Pattern detection in data including text patterns (email, phone, URL), numeric patterns, and categorical distributions. Returns pattern analysis report.

Examples:
- "Generate a data profile report" -> profile = generate_profile_report(df)
- "Calculate data quality metrics" -> metrics = calculate_quality_metrics(df)
- "Analyze correlations in the data" -> corr_analysis = analyze_correlations(df)
- "Detect patterns in the data" -> patterns = detect_patterns(df)
"""

# **MAIN FEATURE FUNCTION**

In [None]:
def _strip_code_fences(text):
    """Return *text* without a surrounding Markdown code fence (``` or ```python), if any."""
    fenced = re.match(r'^```[A-Za-z0-9_+-]*[ \t]*\n(.*?)\n?```\s*$', text, flags=re.DOTALL)
    return fenced.group(1).strip() if fenced else text


def profiling(df, user_query):
    """
    Main function that gets called by the main router.
    MUST take (df, user_query) and return df

    Asks the LLM for Python code that answers the profiling request, runs that
    code against a copy of df, and returns the resulting DataFrame.

    Parameters:
    - df: pandas DataFrame to profile
    - user_query: natural-language request from the user

    Returns:
    - The DataFrame bound to the name df after the generated code has run, or
      an unmodified copy of the input when the code raises or leaves df bound
      to something that is not a DataFrame
    """
    system_prompt = f"""
    You are a data cleaning agent focused on comprehensive data profiling and quality analysis.
    
    Dataset info: Shape: {df.shape}, Sample: {df.head(3).to_string()}

    Libraries available:
    - pd (pandas), np (numpy)
    - math, re, datetime
    - sklearn.preprocessing
    - matplotlib.pyplot as plt
    - seaborn as sns
    - scipy.stats (for statistical analysis)
    - All helper functions listed above
    
    Rules:
    - Return only executable Python code, no explanations, no markdown blocks
    - Use helper functions when appropriate for data profiling and quality analysis tasks
    - ASSUME \"df\" IS ALREADY DEFINED
    - For profiling queries, use helper functions that print comprehensive reports
    - ALWAYS assign results to variables when functions return data: profile = generate_profile_report(df)
    - In order to generate a response/message to the user use print statements
    print("message")
    - Write a detailed print message to summarise actions taken and reasons
    
    Common query patterns:
    - "Generate a data profile report" or "Profile this dataset" -> profile = generate_profile_report(df)
    - "Calculate data quality metrics" or "Check data quality" -> metrics = calculate_quality_metrics(df)
    - "Analyze correlations" or "Find correlations" -> corr_analysis = analyze_correlations(df)
    - "Detect patterns" or "Find patterns in data" -> patterns = detect_patterns(df)
    - "Compare data quality" -> metrics1 = calculate_quality_metrics(df1); metrics2 = calculate_quality_metrics(df2)
    """

    # Create message chain
    messages = [
        SystemMessage(content=helper_docs),
        SystemMessage(content=system_prompt),
        HumanMessage(content=f"User request: {user_query}"),
    ]

    # Call LLM with message chain
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")
    response = llm.invoke(messages)
    # Models sometimes wrap the code in a Markdown fence despite the prompt rules.
    generated_code = _strip_code_fences(response.content.strip())

    original_df = df.copy()

    # One namespace serves as both globals and locals. With two separate dicts,
    # exec() runs the code as if it were a class body, so functions (and, before
    # Python 3.12, comprehensions) defined by the generated code cannot see df
    # or the helpers. Copying globals() also keeps the generated code from
    # rebinding this module's own names.
    namespace = dict(globals())
    namespace.update({
        'df': df.copy(),
        'original_df': original_df,
        'pd': pd,
        'np': np,
        'plt': plt,
        'sns': sns,
        'stats': stats,
        'Counter': Counter,
        'generate_profile_report': generate_profile_report,
        'calculate_quality_metrics': calculate_quality_metrics,
        'analyze_correlations': analyze_correlations,
        'detect_patterns': detect_patterns,
    })

    # SECURITY NOTE: this executes LLM-generated code with full interpreter
    # privileges. Only use with trusted prompts/models or inside a sandbox.
    try:
        exec(generated_code, namespace)
    except Exception as e:
        print(f"Error: {e}")
        print(f"Generated Code:{generated_code}")
        return original_df

    result = namespace.get('df')
    if not isinstance(result, pd.DataFrame):
        # The router expects a DataFrame back; ignore an accidental rebinding of df.
        print("Warning: generated code did not leave a DataFrame in 'df'; returning the original data.")
        return original_df
    return result

# **Testing**

In [None]:
# # Create comprehensive test dataset for profiling
# import pandas as pd
# import numpy as np

# np.random.seed(42)  # For reproducible results

# # Create test data with various data quality issues and patterns
# test_data = {
#     'id': range(1, 201),  # Clean integer ID
#     'name': ['John Doe', 'jane smith', 'BOB JOHNSON', '  Mary Brown  ', 'ALICE WHITE'] * 40,  # Text with cleaning needs
#     'email': ['john@email.com', 'jane.invalid', 'bob@test.co.uk', 'mary@domain.org', 'alice@company.com'] * 40,  # Mixed valid/invalid emails
#     'age': np.random.randint(18, 80, 200),  # Age-like pattern
#     'salary': np.random.normal(60000, 15000, 200),  # Normal distribution with some potential outliers
#     'department': np.random.choice(['IT', 'HR', 'Finance', 'Marketing', 'Operations'], 200),  # Categorical
#     'score': np.random.uniform(0, 100, 200),  # Percentage-like values
#     'years_experience': np.random.exponential(3, 200),  # Skewed distribution
#     'bonus_pct': [f"{x:.1f}%" for x in np.random.uniform(5, 15, 200)],  # Percentage strings
#     'join_date': pd.date_range('2020-01-01', periods=200, freq='D')[:200],  # Date column
#     'active': np.random.choice(['Yes', 'No'], 200),  # Boolean-like pattern
#     'phone': ['(123) 456-7890', '123-456-7890', '1234567890', '+11234567890', 'invalid'] * 40,  # Phone patterns
# }

# # Add some missing values and duplicates
# test_df = pd.DataFrame(test_data)

# # Add missing values to simulate real data issues
# missing_indices = np.random.choice(test_df.index, 20, replace=False)
# test_df.loc[missing_indices, 'salary'] = np.nan

# missing_indices2 = np.random.choice(test_df.index, 15, replace=False)
# test_df.loc[missing_indices2, 'bonus_pct'] = np.nan

# # Add some duplicate rows
# duplicate_rows = test_df.sample(5).copy()
# test_df = pd.concat([test_df, duplicate_rows], ignore_index=True)

# # Add some outliers
# outlier_indices = np.random.choice(test_df.index, 5, replace=False)
# test_df.loc[outlier_indices, 'salary'] = np.random.uniform(200000, 300000, 5)  # Salary outliers

# print(f"Test dataset created: {test_df.shape}")
# print(f"Data types: {test_df.dtypes.value_counts().to_dict()}")
# print(f"Missing values per column: {test_df.isnull().sum().to_dict()}")
# print(f"Duplicate rows: {test_df.duplicated().sum()}")
# print("\\nSample data:")
# print(test_df.head())

In [None]:
# # Test all profiling functions
# print("=== TESTING PROFILING FUNCTIONALITY ===\\n")

# # Test 1: Generate Profile Report
# print("1. TESTING: Generate Profile Report")
# print("-" * 50)
# query1 = "Generate a comprehensive data profile report"
# result1 = profiling(test_df.copy(), query1)
# print("✓ Profile report test completed\\n")

# # Test 2: Calculate Quality Metrics  
# print("2. TESTING: Calculate Quality Metrics")
# print("-" * 50)
# query2 = "Calculate data quality metrics"
# result2 = profiling(test_df.copy(), query2)
# print("✓ Quality metrics test completed\\n")

# # Test 3: Analyze Correlations
# print("3. TESTING: Analyze Correlations")
# print("-" * 50)
# query3 = "Analyze correlations in the data"
# result3 = profiling(test_df.copy(), query3)
# print("✓ Correlation analysis test completed\\n")

# # Test 4: Detect Patterns
# print("4. TESTING: Detect Patterns")
# print("-" * 50)
# query4 = "Detect patterns in the data"
# result4 = profiling(test_df.copy(), query4)
# print("✓ Pattern detection test completed\\n")

# print("=== ALL PROFILING TESTS COMPLETED SUCCESSFULLY ===")
# print("✓ All functions executed without errors")
# print("✓ Comprehensive reporting and analysis generated")
# print("✓ Integration with main system verified")

In [None]:
# test_df.info()
# result.info()