## **Feature:** Export & Formatting Tools

**Names:** Gia Bao Ngo

### **What it does**
Provides comprehensive export and formatting capabilities for data cleaning results. Includes enhanced export options with proper formatting, automatic data dictionary generation, comprehensive summary reports, Excel-optimized exports with multiple sheets, and variable codebook creation for documentation.

In [1]:
# Load dotenv
import os
from dotenv import load_dotenv
load_dotenv()

# Get API Key
# Looked up in the process environment; load_dotenv() is called earlier in this cell.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    # A missing or empty key is only reported; nothing is raised and execution continues.
    print("OpenAI API Key not found")

# Import libraries
from pathlib import Path
import pandas as pd
import numpy as np
# Additional imports for export and formatting
import math
import re
import datetime
import json
from typing import Dict, List, Optional, Union
import warnings
warnings.filterwarnings('ignore')

# For Excel formatting
try:
    import openpyxl
    from openpyxl.styles import Font, Alignment, PatternFill, Border, Side
    from openpyxl.utils.dataframe import dataframe_to_rows
    EXCEL_AVAILABLE = True
except ImportError:
    EXCEL_AVAILABLE = False
    print("Warning: openpyxl not available. Excel formatting will be limited.")

# Langchain imports
from langchain_openai import ChatOpenAI  
from langchain.schema import HumanMessage, SystemMessage

### **Helper Functions**
- `export_with_formatting(df, filename, format='csv', options=None)` - Enhanced export options with custom formatting
- `create_data_dictionary(df, filename=None)` - Generate comprehensive data documentation  
- `export_summary_report(df, filename=None)` - Create detailed data summary report
- `format_for_excel(df, filename, sheets=None)` - Excel-optimized export with formatting
- `create_codebook(df, filename=None)` - Generate variable codebook for documentation

In [None]:
def export_with_formatting(df, filename, format='csv', options=None):
    """
    Write a DataFrame to disk in the requested format using per-format defaults.

    Parameters:
    - df: pandas DataFrame to write
    - filename: destination path; the suffix is replaced when it does not
      match the chosen format ('.pkl' and '.pickle' are both kept for pickle)
    - format: one of 'csv', 'json', 'parquet', 'pickle', 'html'
    - options: dict of keyword arguments forwarded to the pandas writer;
      entries here override the built-in defaults for the format

    Returns:
    - Tuple of (success flag, file path as a string). Every failure, including
      an unsupported format, is printed and reported as (False, path).
    """
    user_options = {} if options is None else options

    # Each rule: (format name, suffixes accepted unchanged, suffix applied otherwise)
    suffix_rules = (
        ('csv', ('.csv',), '.csv'),
        ('json', ('.json',), '.json'),
        ('parquet', ('.parquet',), '.parquet'),
        ('pickle', ('.pkl', '.pickle'), '.pkl'),
        ('html', ('.html',), '.html'),
    )
    target = Path(filename)
    for fmt_name, accepted, replacement in suffix_rules:
        if format == fmt_name:
            if target.suffix.lower() not in accepted:
                target = target.with_suffix(replacement)
            break

    try:
        # Baseline writer arguments per format
        format_defaults = {
            'csv': {'index': False, 'encoding': 'utf-8'},
            'json': {'orient': 'records', 'indent': 2},
            'parquet': {'index': False, 'compression': 'snappy'},
            'pickle': {'protocol': 4},
            'html': {'index': False, 'escape': False, 'table_id': 'data_table'}
        }

        # Caller-supplied options win over the baseline
        writer_kwargs = format_defaults.get(format, {})
        writer_kwargs.update(user_options)

        if format not in format_defaults:
            raise ValueError(f"Unsupported format: {format}")

        if format == 'html':
            # Apply table CSS classes unless the caller chose their own
            html_kwargs = dict(writer_kwargs)
            html_kwargs.setdefault('classes', 'table table-striped table-bordered')
            df.to_html(target, **html_kwargs)
            print(f"HTML exported to {target}")
        elif format == 'pickle':
            df.to_pickle(target, **writer_kwargs)
            print(f"Pickle exported to {target}")
        elif format == 'parquet':
            df.to_parquet(target, **writer_kwargs)
            print(f"Parquet exported to {target}")
            print(f"Compression: {writer_kwargs.get('compression', 'snappy')}")
        elif format == 'json':
            df.to_json(target, **writer_kwargs)
            print(f"JSON exported to {target}")
            print(f"Format: {writer_kwargs.get('orient', 'records')}")
        else:
            df.to_csv(target, **writer_kwargs)
            print(f"CSV exported to {target}")
            print(f"Encoding: {writer_kwargs.get('encoding', 'utf-8')}")

        # Report what was written
        size_bytes = target.stat().st_size
        print(f"File size: {size_bytes / (1024 * 1024):.2f} MB")
        print(f"Data shape: {df.shape}")
    except Exception as e:
        print(f"Export failed: {e}")
        return False, str(target)

    return True, str(target)

In [None]:
def create_data_dictionary(df, filename=None):
    """
    Build a per-column data dictionary and save it as a CSV file.

    Each row describes one column of df: dtype, null counts, cardinality,
    memory usage, type-specific statistics, quality flags and optimization
    suggestions.

    Parameters:
    - df: pandas DataFrame to document
    - filename: output CSV path (optional, defaults to 'data_dictionary.csv')

    Returns:
    - DataFrame with one row per column of df. It is returned even when
      writing the CSV fails (the failure is printed, not raised).

    Notes:
    - For a DataFrame with zero rows, Null_Percentage and Cardinality_Ratio
      are reported as 0.0 instead of dividing by zero.
    """
    if filename is None:
        filename = 'data_dictionary.csv'

    dictionary_data = []
    n_rows = len(df)

    print("=== GENERATING DATA DICTIONARY ===")
    print(f"Analyzing {len(df.columns)} columns...")

    for col in df.columns:
        series = df[col]
        null_count = series.isnull().sum()
        unique_count = series.nunique()

        # Guard the ratios: a zero-row frame would otherwise raise ZeroDivisionError
        null_percentage = (null_count / n_rows) * 100 if n_rows else 0.0
        cardinality_ratio = unique_count / n_rows if n_rows else 0.0

        col_info = {
            'Column_Name': col,
            'Data_Type': str(series.dtype),
            'Non_Null_Count': series.count(),
            'Null_Count': null_count,
            'Null_Percentage': null_percentage,
            'Unique_Values': unique_count,
            'Cardinality_Ratio': cardinality_ratio,
            'Memory_Usage_MB': series.memory_usage(deep=True) / (1024 * 1024)
        }

        non_null_data = series.dropna()
        has_values = len(non_null_data) > 0
        # str([]) == '[]', so this also covers the all-null case
        sample_values = str(non_null_data.head(3).tolist())

        # Type-specific statistics
        if pd.api.types.is_numeric_dtype(series):
            if has_values:
                col_info.update({
                    'Min_Value': non_null_data.min(),
                    'Max_Value': non_null_data.max(),
                    'Mean': non_null_data.mean(),
                    'Median': non_null_data.median(),
                    'Std_Deviation': non_null_data.std(),
                    'Sample_Values': sample_values
                })
            else:
                col_info.update({
                    'Min_Value': None, 'Max_Value': None, 'Mean': None,
                    'Median': None, 'Std_Deviation': None,
                    'Sample_Values': sample_values
                })

        elif pd.api.types.is_datetime64_any_dtype(series):
            col_info.update({
                'Min_Value': non_null_data.min() if has_values else None,
                'Max_Value': non_null_data.max() if has_values else None,
                'Sample_Values': sample_values
            })

        else:
            # Categorical/Text statistics
            if has_values:
                top_counts = non_null_data.value_counts().head(3)
                col_info.update({
                    'Most_Common_Values': str(top_counts.index.tolist()),
                    'Most_Common_Counts': str(top_counts.values.tolist()),
                    'Sample_Values': sample_values
                })

                # Text length statistics for string data
                if series.dtype == 'object':
                    try:
                        text_lengths = non_null_data.astype(str).str.len()
                    except Exception:
                        # Best effort: values that cannot be rendered as text
                        # simply leave the length columns empty for this row.
                        text_lengths = None
                    if text_lengths is not None:
                        col_info.update({
                            'Min_Text_Length': text_lengths.min(),
                            'Max_Text_Length': text_lengths.max(),
                            'Avg_Text_Length': text_lengths.mean()
                        })
            else:
                col_info.update({
                    'Most_Common_Values': '[]',
                    'Most_Common_Counts': '[]',
                    'Sample_Values': sample_values
                })

        # Data quality assessment
        quality_issues = []
        if null_percentage > 50:
            quality_issues.append('High_Null_Rate')
        if cardinality_ratio == 1.0:
            quality_issues.append('All_Unique')
        if cardinality_ratio < 0.01 and unique_count > 1:
            quality_issues.append('Low_Cardinality')

        col_info['Quality_Flags'] = ', '.join(quality_issues) if quality_issues else 'Good'

        # Suggestions for optimization/cleaning
        suggestions = []
        if pd.api.types.is_numeric_dtype(series) and series.dtype in ['int64', 'float64']:
            suggestions.append('Consider_Downcasting')
        if series.dtype == 'object' and cardinality_ratio < 0.05:
            suggestions.append('Convert_To_Category')
        if null_percentage > 5:
            suggestions.append('Handle_Missing_Values')

        col_info['Optimization_Suggestions'] = ', '.join(suggestions) if suggestions else 'None'

        dictionary_data.append(col_info)

    # Create DataFrame
    dict_df = pd.DataFrame(dictionary_data)

    # Export to file; a write failure is reported but the dictionary is still returned
    try:
        dict_df.to_csv(filename, index=False, encoding='utf-8')
        print(f"Data dictionary saved to: {filename}")
        print(f"Dictionary contains {len(dict_df)} column descriptions")

        # Summary statistics
        total_memory = dict_df['Memory_Usage_MB'].sum()
        avg_null_rate = dict_df['Null_Percentage'].mean()
        high_null_cols = (dict_df['Null_Percentage'] > 20).sum()

        print(f"Total dataset memory: {total_memory:.2f} MB")
        print(f"Average null rate: {avg_null_rate:.1f}%")
        print(f"Columns with >20% nulls: {high_null_cols}")

    except Exception as e:
        print(f"Failed to save data dictionary: {e}")

    return dict_df

In [None]:
def export_summary_report(df, filename=None):
    """
    Create an HTML summary report of a DataFrame and write it to disk.

    The report contains overview metrics (rows, columns, memory, missing
    data), the dtype distribution, missing counts per column, describe()
    output for numeric columns, a summary of the first 10 object/category
    columns, per-column details and a short list of data quality insights.

    Parameters:
    - df: pandas DataFrame to analyze
    - filename: output filename (optional, defaults to 'summary_report.html')

    Returns:
    - HTML content string. It is returned even when writing the file fails
      (the failure is printed, not raised).

    Notes:
    - For a DataFrame with no rows or no columns, the missing-data percentage
      and the cardinality ratios are treated as 0 instead of dividing by zero.
    """
    if filename is None:
        filename = 'summary_report.html'

    print("=== GENERATING SUMMARY REPORT ===")
    print(f"Analyzing dataset with shape: {df.shape}")

    # Basic dataset information
    n_rows, n_cols = df.shape
    total_cells = n_rows * n_cols
    missing_by_column = df.isnull().sum()
    total_missing = missing_by_column.sum()
    # Guard against an empty frame (zero cells)
    missing_percentage = (total_missing / total_cells) * 100 if total_cells else 0.0
    memory_usage = df.memory_usage(deep=True).sum() / (1024 * 1024)

    # Data type summary
    dtype_counts = df.dtypes.value_counts()

    # Missing data by column (only columns that have any), largest first
    missing_data = missing_by_column[missing_by_column > 0].sort_values(ascending=False)

    # Numeric columns summary
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    numeric_summary = df[numeric_cols].describe() if len(numeric_cols) > 0 else pd.DataFrame()

    # Categorical columns summary
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    categorical_summary = []

    for col in categorical_cols[:10]:  # Limit to first 10 for performance
        modes = df[col].mode()
        counts = df[col].value_counts()
        categorical_summary.append({
            'Column': col,
            'Unique_Values': df[col].nunique(),
            'Most_Common': modes.iloc[0] if not modes.empty else 'N/A',
            'Most_Common_Count': counts.iloc[0] if len(counts) > 0 else 0
        })

    categorical_df = pd.DataFrame(categorical_summary)

    # Per-column facts, computed once and reused by the details table and insights
    unique_counts = {col: df[col].nunique() for col in df.columns}
    column_details = [
        {
            'Column': col,
            'Type': str(df[col].dtype),
            'Non-Null Count': df[col].count(),
            'Unique Values': unique_counts[col],
            'Memory (MB)': df[col].memory_usage(deep=True) / (1024 * 1024)
        }
        for col in df.columns
    ]
    column_details_df = pd.DataFrame(column_details)

    # Data quality insight counts
    high_missing_count = int((missing_data > n_rows * 0.2).sum())
    low_cardinality_count = sum(
        1 for col in df.columns
        if (unique_counts[col] / n_rows if n_rows else 0.0) < 0.05
        and df[col].dtype == 'object'
    )
    high_cardinality_count = sum(
        1 for col in df.columns if unique_counts[col] > n_rows * 0.5
    )
    downcast_candidate_count = sum(
        1 for col in df.columns if df[col].dtype in ['int64', 'float64']
    )

    # Generate HTML content
    html_content = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <title>Data Summary Report</title>
        <style>
            body {{ font-family: Arial, sans-serif; margin: 20px; }}
            table {{ border-collapse: collapse; width: 100%; margin: 10px 0; }}
            th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
            th {{ background-color: #f2f2f2; }}
            .section {{ margin: 30px 0; }}
            .metric {{ background-color: #f8f9fa; padding: 15px; margin: 10px; border-radius: 5px; }}
            .metric-value {{ font-size: 24px; font-weight: bold; color: #007bff; }}
            .metric-label {{ font-size: 14px; color: #6c757d; }}
        </style>
    </head>
    <body>
        <h1>Data Summary Report</h1>
        <p>Generated on: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
        
        <div class="section">
            <h2>Dataset Overview</h2>
            <div class="metric">
                <div class="metric-value">{n_rows:,}</div>
                <div class="metric-label">Rows</div>
            </div>
            <div class="metric">
                <div class="metric-value">{n_cols:,}</div>
                <div class="metric-label">Columns</div>
            </div>
            <div class="metric">
                <div class="metric-value">{memory_usage:.2f} MB</div>
                <div class="metric-label">Memory Usage</div>
            </div>
            <div class="metric">
                <div class="metric-value">{missing_percentage:.1f}%</div>
                <div class="metric-label">Missing Data</div>
            </div>
        </div>
        
        <div class="section">
            <h2>Data Types Distribution</h2>
            {dtype_counts.to_frame('Count').to_html(classes='table')}
        </div>
    """

    # Add missing data section if there are missing values
    if len(missing_data) > 0:
        html_content += f"""
        <div class="section">
            <h2>Missing Data by Column</h2>
            {missing_data.head(20).to_frame('Missing Count').to_html(classes='table')}
        </div>
        """

    # Add numeric summary if numeric columns exist
    if not numeric_summary.empty:
        html_content += f"""
        <div class="section">
            <h2>Numeric Columns Summary</h2>
            {numeric_summary.round(2).to_html(classes='table')}
        </div>
        """

    # Add categorical summary if categorical columns exist
    if not categorical_df.empty:
        html_content += f"""
        <div class="section">
            <h2>Categorical Columns Summary (Top 10)</h2>
            {categorical_df.to_html(classes='table', index=False)}
        </div>
        """

    html_content += f"""
        <div class="section">
            <h2>Column Details</h2>
            {column_details_df.round(3).to_html(classes='table', index=False)}
        </div>
        
        <div class="section">
            <h2>Data Quality Insights</h2>
            <ul>
                <li>Columns with high missing data: {high_missing_count}</li>
                <li>Potential categorical columns (low cardinality): {low_cardinality_count}</li>
                <li>High cardinality columns: {high_cardinality_count}</li>
                <li>Memory optimization opportunities: {downcast_candidate_count}</li>
            </ul>
        </div>
    </body>
    </html>
    """

    # Save to file; a write failure is reported but the HTML is still returned
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html_content)

        print(f"Summary report saved to: {filename}")
        print(f"Report includes {len(df.columns)} column analyses")
        print(f"File size: {Path(filename).stat().st_size / 1024:.1f} KB")

    except Exception as e:
        print(f"Failed to save summary report: {e}")

    return html_content

In [None]:
def _style_excel_sheet(worksheet, sheet_name, sheet_df, header_row=2):
    """Apply title, header, border, width, number-format, filter and freeze styling to one sheet."""
    # Local import: openpyxl is optional at module level, and this helper is
    # only called when it was imported successfully.
    from openpyxl.utils import get_column_letter

    thin_side = Side(style='thin')
    thin_border = Border(left=thin_side, right=thin_side, top=thin_side, bottom=thin_side)

    # Title row (row 1; the data was written starting one row lower)
    title_cell = worksheet['A1']
    title_cell.value = f"Data Export - {sheet_name}"
    title_cell.font = Font(bold=True, size=14, color="FFFFFF")
    title_cell.fill = PatternFill(start_color="366092", end_color="366092", fill_type="solid")

    # Header row
    header_font = Font(bold=True)
    header_fill = PatternFill(start_color="D9E1F2", end_color="D9E1F2", fill_type="solid")
    header_alignment = Alignment(horizontal="center")
    for col_num in range(1, len(sheet_df.columns) + 1):
        cell = worksheet.cell(row=header_row, column=col_num)
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = header_alignment
        cell.border = thin_border

    # Auto-adjust column widths to the longest rendered value, capped at 50 characters
    for column in worksheet.columns:
        max_length = max(
            (len(str(cell.value)) for cell in column if cell.value is not None),
            default=0,
        )
        worksheet.column_dimensions[column[0].column_letter].width = min(max_length + 2, 50)

    # Decide alignment / number format once per column instead of once per cell
    right_aligned = Alignment(horizontal="right")
    centered = Alignment(horizontal="center")
    column_formats = []
    for dtype in sheet_df.dtypes:
        if pd.api.types.is_numeric_dtype(dtype):
            number_format = '0.00' if dtype in ['float32', 'float64'] else None
            column_formats.append((right_aligned, number_format))
        elif pd.api.types.is_datetime64_any_dtype(dtype):
            column_formats.append((centered, 'yyyy-mm-dd'))
        else:
            column_formats.append((None, None))

    # Format data rows (they start directly below the header row)
    first_data_row = header_row + 1
    for row_num in range(first_data_row, first_data_row + len(sheet_df)):
        for col_num, (alignment, number_format) in enumerate(column_formats, 1):
            cell = worksheet.cell(row=row_num, column=col_num)
            cell.border = thin_border
            if alignment is not None:
                cell.alignment = alignment
            if number_format is not None:
                cell.number_format = number_format

    # Add autofilter if not too many rows
    if len(sheet_df) <= 1000:
        # max_column is a 1-based integer; the range reference needs its letter
        last_column = get_column_letter(worksheet.max_column)
        worksheet.auto_filter.ref = f"A{header_row}:{last_column}{worksheet.max_row}"

    # Freeze title and header rows
    worksheet.freeze_panes = f"A{header_row + 1}"


def format_for_excel(df, filename, sheets=None):
    """
    Excel-optimized export with formatting and multiple sheets.

    Each sheet gets a title in A1, the DataFrame written from row 2 (header)
    downward and, when openpyxl styling is available, header styling, cell
    borders, auto-sized columns, numeric/date formats, an auto-filter
    (sheets of at most 1000 rows) and frozen header rows.

    Parameters:
    - df: pandas DataFrame to export (used when sheets is None)
    - filename: output Excel filename (.xlsx replaces any other suffix)
    - sheets: dict of {sheet_name: dataframe} for multiple sheets, or None
      to write df to a single sheet named 'Data'

    Returns:
    - Tuple of (success flag, file path as a string). If the formatted export
      fails, a plain single-sheet export is attempted before giving up.
    """
    # Ensure .xlsx extension
    file_path = Path(filename)
    if file_path.suffix.lower() != '.xlsx':
        file_path = file_path.with_suffix('.xlsx')

    print("=== EXPORTING TO EXCEL ===")
    print(f"Target file: {file_path}")

    if sheets is None:
        # Single sheet export
        sheets = {'Data': df}

    try:
        with pd.ExcelWriter(file_path, engine='openpyxl') as writer:
            for sheet_name, sheet_df in sheets.items():
                print(f"Writing sheet: {sheet_name} (shape: {sheet_df.shape})")

                # startrow=1 leaves spreadsheet row 1 free for the title
                sheet_df.to_excel(writer, sheet_name=sheet_name, index=False, startrow=1)

                if not EXCEL_AVAILABLE:
                    print("Advanced Excel formatting not available (openpyxl not installed)")
                    continue

                _style_excel_sheet(writer.sheets[sheet_name], sheet_name, sheet_df)

        # File statistics
        file_size_mb = file_path.stat().st_size / (1024 * 1024)
        total_rows = sum(len(sheet_df) for sheet_df in sheets.values())

        print("Excel export successful!")
        print(f"File size: {file_size_mb:.2f} MB")
        print(f"Sheets: {len(sheets)}")
        print(f"Total rows: {total_rows:,}")
        print("Features added: Headers, formatting, auto-width, borders")

        return True, str(file_path)

    except Exception as e:
        print(f"Excel export failed: {e}")
        print("Tip: Ensure openpyxl is installed for advanced Excel features")

        # Fallback to basic export (single sheet only)
        try:
            if len(sheets) == 1:
                next(iter(sheets.values())).to_excel(file_path, index=False)
                print(f"Basic Excel export successful to {file_path}")
                return True, str(file_path)
            else:
                print("Multiple sheets require openpyxl. Export failed.")
                return False, str(file_path)
        except Exception as fallback_error:
            print(f"Fallback export also failed: {fallback_error}")
            return False, str(file_path)

In [None]:
def create_codebook(df, filename=None):
    """
    Generate variable codebook for documentation with detailed metadata.
    
    Parameters:
    - df: pandas DataFrame to document
    - filename: output filename (optional, defaults to 'codebook.json')
    
    Returns:
    - Dictionary with codebook information
    """
    if filename is None:
        filename = 'codebook.json'
    
    print("=== GENERATING CODEBOOK ===")
    print(f"Creating codebook for {len(df.columns)} variables...")
    
    codebook = {
        'dataset_info': {
            'name': 'Dataset',
            'description': 'Automatically generated codebook',
            'shape': list(df.shape),
            'memory_usage_mb': df.memory_usage(deep=True).sum() / (1024 * 1024),
            'creation_date': datetime.datetime.now().isoformat(),
            'total_missing_values': int(df.isnull().sum().sum()),
            'missing_percentage': float((df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100)
        },
        'variables': {}
    }
    
    for col in df.columns:
        print(f"Processing variable: {col}")
        
        variable_info = {
            'name': col,
            'position': int(df.columns.get_loc(col)),
            'data_type': str(df[col].dtype),
            'python_type': str(type(df[col].iloc[0]).__name__) if len(df[col].dropna()) > 0 else 'unknown',
            'non_null_count': int(df[col].count()),
            'null_count': int(df[col].isnull().sum()),
            'null_percentage': float((df[col].isnull().sum() / len(df)) * 100),
            'unique_values': int(df[col].nunique()),
            'cardinality_ratio': float(df[col].nunique() / len(df)),
            'memory_usage_bytes': int(df[col].memory_usage(deep=True))
        }
        
        # Type-specific information
        if pd.api.types.is_numeric_dtype(df[col]):
            non_null_data = df[col].dropna()
            if len(non_null_data) > 0:
                variable_info.update({
                    'variable_type': 'numeric',
                    'min_value': float(non_null_data.min()) if pd.api.types.is_float_dtype(non_null_data) else int(non_null_data.min()),
                    'max_value': float(non_null_data.max()) if pd.api.types.is_float_dtype(non_null_data) else int(non_null_data.max()),
                    'mean': float(non_null_data.mean()),
                    'median': float(non_null_data.median()),
                    'std_deviation': float(non_null_data.std()),
                    'quartiles': {
                        'q25': float(non_null_data.quantile(0.25)),
                        'q50': float(non_null_data.quantile(0.50)),
                        'q75': float(non_null_data.quantile(0.75))
                    },
                    'skewness': float(non_null_data.skew()),
                    'kurtosis': float(non_null_data.kurtosis())
                })
                
                # Detect potential issues
                issues = []
                if variable_info['skewness'] > 2 or variable_info['skewness'] < -2:
                    issues.append('highly_skewed')
                if variable_info['std_deviation'] == 0:
                    issues.append('no_variation')
                if len(non_null_data) != len(df[col]):
                    issues.append('missing_values')
                
                variable_info['data_quality_issues'] = issues
            else:
                variable_info.update({
                    'variable_type': 'numeric',
                    'min_value': None,
                    'max_value': None,
                    'data_quality_issues': ['all_missing']
                })
        
        elif pd.api.types.is_datetime64_any_dtype(df[col]):
            non_null_data = df[col].dropna()
            if len(non_null_data) > 0:
                variable_info.update({
                    'variable_type': 'datetime',
                    'min_date': non_null_data.min().isoformat(),
                    'max_date': non_null_data.max().isoformat(),
                    'date_range_days': int((non_null_data.max() - non_null_data.min()).days)
                })
            else:
                variable_info.update({
                    'variable_type': 'datetime',
                    'data_quality_issues': ['all_missing']
                })
        
        elif df[col].dtype == 'bool':
            value_counts = df[col].value_counts()
            variable_info.update({
                'variable_type': 'boolean',
                'true_count': int(value_counts.get(True, 0)),
                'false_count': int(value_counts.get(False, 0)),
                'true_percentage': float((value_counts.get(True, 0) / len(df[col].dropna())) * 100) if len(df[col].dropna()) > 0 else 0
            })
        
        else:
            # Categorical/Text variable
            non_null_data = df[col].dropna()
            if len(non_null_data) > 0:
                value_counts = non_null_data.value_counts()
                
                variable_info.update({
                    'variable_type': 'categorical' if variable_info['cardinality_ratio'] < 0.5 else 'text',
                    'most_frequent_values': value_counts.head(10).index.tolist(),
                    'most_frequent_counts': value_counts.head(10).values.tolist(),
                    'least_frequent_values': value_counts.tail(5).index.tolist() if len(value_counts) > 5 else [],
                    'least_frequent_counts': value_counts.tail(5).values.tolist() if len(value_counts) > 5 else []
                })
                
                # Text length analysis for string data
                if df[col].dtype == 'object':
                    try:
                        text_lengths = non_null_data.astype(str).str.len()
                        variable_info.update({
                            'text_length_min': int(text_lengths.min()),
                            'text_length_max': int(text_lengths.max()),
                            'text_length_mean': float(text_lengths.mean()),
                            'text_length_std': float(text_lengths.std())
                        })
                    except:
                        pass
                
                # Detect patterns for categorical variables
                if variable_info['cardinality_ratio'] < 0.1:
                    # Check for potential boolean patterns
                    unique_lower = set(non_null_data.astype(str).str.lower().str.strip())
                    boolean_patterns = [
                        {'yes', 'no'}, {'true', 'false'}, {'y', 'n'}, 
                        {'1', '0'}, {'on', 'off'}, {'active', 'inactive'}
                    ]
                    
                    for pattern in boolean_patterns:
                        if unique_lower == pattern or (len(unique_lower) <= 2 and unique_lower.issubset(pattern)):
                            variable_info['suggested_conversion'] = 'boolean'
                            break
                    
                    # Check for percentage patterns
                    if any(str(val).endswith('%') for val in non_null_data.head(10)):
                        variable_info['suggested_conversion'] = 'percentage_to_float'
                    
                    # Low cardinality suggestion
                    if variable_info['cardinality_ratio'] < 0.05:
                        variable_info['suggested_conversion'] = 'category_type'
            else:
                variable_info.update({
                    'variable_type': 'unknown',
                    'data_quality_issues': ['all_missing']
                })
        
        # Add recommendations
        recommendations = []
        if variable_info.get('null_percentage', 0) > 20:
            recommendations.append('investigate_missing_pattern')
        if variable_info.get('cardinality_ratio', 0) == 1.0:
            recommendations.append('consider_removing_unique_identifier')
        if variable_info.get('cardinality_ratio', 0) < 0.05 and variable_info.get('variable_type') == 'categorical':
            recommendations.append('convert_to_category_dtype')
        if variable_info.get('data_type') in ['int64', 'float64']:
            recommendations.append('consider_numeric_downcasting')
        
        variable_info['recommendations'] = recommendations
        
        codebook['variables'][col] = variable_info
    
    # Add summary statistics
    codebook['summary'] = {
        'numeric_variables': len([v for v in codebook['variables'].values() if v.get('variable_type') == 'numeric']),
        'categorical_variables': len([v for v in codebook['variables'].values() if v.get('variable_type') == 'categorical']),
        'datetime_variables': len([v for v in codebook['variables'].values() if v.get('variable_type') == 'datetime']),
        'boolean_variables': len([v for v in codebook['variables'].values() if v.get('variable_type') == 'boolean']),
        'text_variables': len([v for v in codebook['variables'].values() if v.get('variable_type') == 'text']),
        'variables_with_missing': len([v for v in codebook['variables'].values() if v.get('null_count', 0) > 0]),
        'high_cardinality_variables': len([v for v in codebook['variables'].values() if v.get('cardinality_ratio', 0) > 0.8]),
        'potential_optimizations': len([v for v in codebook['variables'].values() if len(v.get('recommendations', [])) > 0])
    }
    
    # Save to file
    try:
        # Save JSON codebook
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(codebook, f, indent=2, ensure_ascii=False, default=str)
        
        print(f"Codebook saved to: {filename}")
        
        # Also save a human-readable version
        readable_filename = Path(filename).with_suffix('.txt')
        with open(readable_filename, 'w', encoding='utf-8') as f:
            f.write("DATA CODEBOOK\\n")
            f.write("=" * 50 + "\\n\\n")
            f.write(f"Dataset Shape: {codebook['dataset_info']['shape']}\\n")
            f.write(f"Memory Usage: {codebook['dataset_info']['memory_usage_mb']:.2f} MB\\n")
            f.write(f"Missing Data: {codebook['dataset_info']['missing_percentage']:.1f}%\\n\\n")
            
            f.write("VARIABLES:\\n")
            f.write("-" * 30 + "\\n")
            
            for var_name, var_info in codebook['variables'].items():
                f.write(f"\\n{var_name}\\n")
                f.write(f"  Type: {var_info['variable_type']} ({var_info['data_type']})\\n")
                f.write(f"  Non-null: {var_info['non_null_count']:,} ({100 - var_info['null_percentage']:.1f}%)\\n")
                f.write(f"  Unique: {var_info['unique_values']:,} ({var_info['cardinality_ratio']:.1%} cardinality)\\n")
                
                if var_info.get('recommendations'):
                    f.write(f"  Recommendations: {', '.join(var_info['recommendations'])}\\n")
        
        print(f"Human-readable codebook saved to: {readable_filename}")
        print(f"Summary: {codebook['summary']['numeric_variables']} numeric, {codebook['summary']['categorical_variables']} categorical, {codebook['summary']['datetime_variables']} datetime variables")
        
        return codebook
        
    except Exception as e:
        print(f"Failed to save codebook: {e}")
        return codebook

In [None]:
# Prompt text describing the helper functions, their return values, example
# query-to-call mappings and per-format options. export_tools() sends it to the
# LLM as the first SystemMessage.
# NOTE(review): the return values and option names listed here are stated in
# prose only - confirm they still match the helper definitions.
helper_docs = """ Helper functions available:
- export_with_formatting(df, filename, format='csv', options=None): Enhanced export with custom formatting for CSV, JSON, Parquet, Pickle, HTML formats. Returns success status and file path.
- create_data_dictionary(df, filename=None): Generate comprehensive data documentation including types, statistics, quality metrics. Default filename: 'data_dictionary.csv'. Returns DataFrame with analysis.
- export_summary_report(df, filename=None): Create detailed HTML summary report with statistics and insights. Default filename: 'summary_report.html'. Returns HTML content.
- format_for_excel(df, filename, sheets=None): Excel-optimized export with professional formatting, borders, auto-width, freeze panes. Supports multiple sheets. Returns success status and file path.
- create_codebook(df, filename=None): Generate variable codebook with detailed metadata in JSON and TXT formats. Default filename: 'codebook.json'. Returns codebook dictionary.

Examples:
- "Export to CSV" -> success, path = export_with_formatting(df, 'data.csv', format='csv')
- "Export to Excel with formatting" -> success, path = format_for_excel(df, 'data.xlsx')
- "Create data dictionary" -> dict_df = create_data_dictionary(df, 'dictionary.csv')
- "Generate summary report" -> html_content = export_summary_report(df, 'report.html')
- "Create codebook" -> codebook = create_codebook(df, 'codebook.json')
- "Export to JSON" -> success, path = export_with_formatting(df, 'data.json', format='json')
- "Export to multiple Excel sheets" -> format_for_excel(df, 'file.xlsx', {'Sheet1': df, 'Sheet2': df.head(100)})

Format options for export_with_formatting:
- CSV: encoding, index, separator
- JSON: orient, indent
- Parquet: compression, index
- HTML: classes, table_id, escape
- Pickle: protocol
"""

# **MAIN FEATURE FUNCTION**

In [None]:
def export_tools(df, user_query):
    """
    Main function that gets called by the main router.
    MUST take (df, user_query) and return df

    Builds a prompt from ``helper_docs``, the DataFrame's shape and first three
    rows, and the user's request; asks the LLM for Python code; then executes
    that code with the export helpers in scope.

    Parameters:
        df: pandas DataFrame to export or document. It is never modified; the
            generated code runs against a copy.
        user_query: Natural-language request from the user.

    Returns:
        A copy of ``df`` taken before the generated code runs. The same value
        is returned whether or not the generated code raised.

    Notes:
        Errors raised by the generated code are printed together with the code
        and then swallowed. Errors raised by the LLM call itself propagate.
    """

    # Create message chain
    messages = []
    messages.append(SystemMessage(content=helper_docs))
    messages.append(SystemMessage(content=f"""
    You are a data cleaning agent focused on export and formatting operations.
    
    Dataset info: Shape: {df.shape}, Sample: {df.head(3).to_string()}

    Libraries available:
    - pd (pandas), np (numpy)
    - Path (from pathlib), json, datetime
    - All helper functions listed above
    - openpyxl for Excel formatting (if available)
    
    Rules:
    - Return only executable Python code, no explanations, no markdown blocks
    - Use helper functions for all export and formatting operations
    - ASSUME "df" IS ALREADY DEFINED
    - For export operations, capture return values: success, path = export_with_formatting(df, filename, format)
    - For documentation operations, capture return values: result = create_data_dictionary(df)
    - In order to generate a response/message to the user use print statements
    print("message")
    - Write a detailed print message to summarise actions taken and reasons
    - DEFAULT to current directory for file outputs unless user specifies path
    - Always return the original df (export functions don't modify the DataFrame)
    
    Common query patterns:
    - "Export to [format]" -> export_with_formatting(df, filename, format=format)
    - "Export to Excel" or "Excel export" -> format_for_excel(df, filename)
    - "Create data dictionary" -> create_data_dictionary(df)
    - "Generate summary report" -> export_summary_report(df)
    - "Create codebook" -> create_codebook(df)
    - "Export with custom options" -> export_with_formatting(df, filename, format, options=dict)
    - "Multiple sheets" -> format_for_excel(df, filename, sheets=dict)
    
    File naming:
    - If user doesn't specify filename, use descriptive defaults like 'cleaned_data.csv', 'data_summary.html'
    - Add appropriate extensions based on format
    - Use current timestamp in filename if multiple exports might conflict
    """))
    messages.append(HumanMessage(content=f"User request: {user_query}"))

    # Call LLM with message chain
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")
    response = llm.invoke(messages)
    # The model sometimes wraps its answer in ``` fences despite the rules
    # above; exec() would reject those with a SyntaxError.
    generated_code = _strip_code_fences(response.content)

    # Snapshot taken outside the try block so the except path can always
    # return it.
    original_df = df.copy()

    # Run in ONE namespace. With separate globals/locals dicts, exec() treats
    # the code like a class body: lambdas, nested functions and (before
    # Python 3.12) comprehensions in the generated code cannot see names that
    # live only in the locals dict, such as `df` or the helper functions.
    # Copying globals() also keeps generated assignments out of this module.
    namespace = dict(globals())
    namespace.update({
        'df': df.copy(),
        'original_df': original_df,
        'pd': pd,
        'np': np,
        'Path': Path,
        'json': json,
        'datetime': datetime,
        'export_with_formatting': export_with_formatting,
        'create_data_dictionary': create_data_dictionary,
        'export_summary_report': export_summary_report,
        'format_for_excel': format_for_excel,
        'create_codebook': create_codebook,
        'print': print,
    })

    # SECURITY: this executes LLM-generated code with full interpreter
    # privileges (file system, network, imports). Only use with trusted
    # queries/data, or run inside a sandboxed process.
    try:
        exec(generated_code, namespace)
    except Exception as e:
        print(f"Error: {e}")
        print(f"Generated Code:{generated_code}")

    # Return original df since export operations don't modify the data
    return original_df


def _strip_code_fences(text):
    """Return *text* stripped of whitespace and of a surrounding Markdown code fence."""
    cleaned = text.strip()
    if not cleaned.startswith("```"):
        return cleaned
    # Drop the opening fence line (which may carry a language tag, e.g. ```python).
    lines = cleaned.splitlines()[1:]
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]
    return "\n".join(lines).strip()

# **Testing**

In [None]:
# # Create test data with various export opportunities
# test_data = {
#     'id': range(1, 11),  
#     'name': ['John Doe', 'Jane Smith', 'Bob Johnson', 'Mary Brown', 'Alice White',
#              'Tom Wilson', 'Lisa Davis', 'Mike Chen', 'Sarah Lee', 'David Kim'],
#     'age': [25, 35, 45, 30, 28, 32, 41, 29, 38, 26],
#     'salary': [50000.0, 75000.0, 90000.0, 60000.0, 55000.0,
#                68000.0, 82000.0, 51000.0, 77000.0, 58000.0],
#     'department': ['IT', 'HR', 'Finance', 'IT', 'Marketing',
#                    'HR', 'Finance', 'IT', 'Marketing', 'Finance'],
#     'join_date': ['2023-01-15', '2022-03-20', '2021-06-10', '2023-02-28', '2022-11-05',
#                   '2023-04-12', '2022-08-18', '2023-01-30', '2022-12-14', '2023-03-08'],
#     'active': [True, True, False, True, True, False, True, True, True, False],
#     'performance_score': [85.5, 92.3, 78.1, 88.9, 91.2, 86.7, 89.4, 83.2, 90.8, 87.6]
# }

# test_df = pd.DataFrame(test_data)
# test_df['join_date'] = pd.to_datetime(test_df['join_date'])

# print("Test DataFrame created for export tools:")
# print(f"Shape: {test_df.shape}")
# print("\nSample data:")
# print(test_df.head())
# print("\nData types:")
# print(test_df.dtypes)

In [None]:
# # Test the export_tools feature with various queries
# print("=== TESTING EXPORT TOOLS FEATURE ===\n")

# # Test 1: Basic CSV export
# print("1. Testing CSV export:")
# query1 = "Export data to CSV"
# result1 = export_tools(test_df, query1)
# print("✓ CSV export test completed\n")

# # Test 2: Data dictionary creation
# print("2. Testing data dictionary creation:")
# query2 = "Create a comprehensive data dictionary"
# result2 = export_tools(test_df, query2)
# print("✓ Data dictionary test completed\n")

# # Test 3: Summary report generation
# print("3. Testing summary report generation:")
# query3 = "Generate detailed summary report"
# result3 = export_tools(test_df, query3)
# print("✓ Summary report test completed\n")

# print("=== ALL EXPORT TOOLS TESTS COMPLETED ===")
# print("If no errors appeared, the export_tools feature is working correctly!")

In [None]:
# test_df.info()
# result1.info()