In [11]:
import pandas as pd
import glob
import os
from tqdm import tqdm

def _clean_sample(value):
    """Return one sample value as a compact string suitable for the README."""
    # Local import: json is not part of this file's import header.
    import json

    text = str(value).strip()
    # Undo backslash escaping so the sample reads naturally.
    text = text.replace('\\"', '"').replace('\\\\', '\\')
    # Drop one pair of surrounding double quotes, if present.
    if text.startswith('"') and text.endswith('"'):
        text = text[1:-1]
    # JSON-looking values are parsed and shown as their Python repr.
    if text.startswith(('{', '[')):
        try:
            text = str(json.loads(text))
        except (ValueError, RecursionError):
            # Not valid JSON (JSONDecodeError is a ValueError) or nested too
            # deeply: keep the raw text as the sample.
            pass
    return text


def _sample_values(series, sample_size, max_length=500):
    """Return up to ``sample_size`` distinct, cleaned, short values of ``series``."""
    present = series.dropna()
    # Convert to text explicitly instead of relying on the ``.str`` accessor of
    # the raw column: an object column may hold non-string values (for example
    # booleans next to blank cells), for which ``.str`` raises AttributeError.
    as_text = present.astype(str)
    short = as_text[as_text.str.len() < max_length]
    # max(..., 0) keeps a negative sample_size from slicing "all but the last".
    distinct = short.unique()[:max(sample_size, 0)]
    return [_clean_sample(value) for value in distinct]


def describe_csv_file(file_path, sample_size=5):
    """Generate a detailed Markdown description of a CSV file for a README.

    The description contains the file size, row and column counts and, for
    every column, its dtype, non-null count and unique count. Object/string
    columns additionally list sample values; all other columns list their
    range and mean when those can be computed.

    Args:
        file_path: Path of the CSV file to describe.
        sample_size: Maximum number of distinct sample values listed for each
            object/string column.

    Returns:
        The Markdown text, starting with a ``### <file name>`` heading.

    Raises:
        Errors raised by ``pandas.read_csv`` or ``os.path.getsize`` (for
        example for a missing file) propagate to the caller.
    """
    file_name = os.path.basename(file_path)

    # Read the CSV file with low_memory=False to handle mixed types
    df = pd.read_csv(file_path, on_bad_lines='skip', low_memory=False)

    # Collect fragments and join once instead of repeated string concatenation.
    parts = [
        f"\n### {file_name}\n\n",
        # Basic file info
        f"**File Size**: {os.path.getsize(file_path)/1024/1024:.2f} MB  \n",
        f"**Total Rows**: {len(df):,}  \n",
        f"**Total Columns**: {len(df.columns)}  \n\n",
        # Column details
        "#### Columns\n\n",
    ]

    for col in df.columns:
        column = df[col]
        parts.append(f"- **{col}**  \n")
        parts.append(f"  - Type: {column.dtype}  \n")
        parts.append(f"  - Non-null values: {column.count():,}  \n")
        parts.append(f"  - Unique values: {column.nunique():,}  \n")

        if column.dtype in ['object', 'string']:
            samples = _sample_values(column, sample_size)
            if samples:
                parts.append(f"  - Sample values: {', '.join(samples)}  \n")
        else:
            try:
                parts.append(
                    f"  - Range: {column.min()} to {column.max()}  \n"
                )
                parts.append(f"  - Mean: {column.mean():.2f}  \n")
            except (TypeError, ValueError):
                # Best effort: skip statistics the dtype does not support.
                pass
        parts.append("\n")

    return "".join(parts)

# Process each CSV file and build the regenerated "Data Description" section.
csv_files = sorted(glob.glob('../*.csv'))
start_marker = "## Data Description"
sections = [start_marker + "\n"]

for file in tqdm(csv_files, desc="Processing CSV files"):
    try:
        sections.append(describe_csv_file(file))
    except Exception as e:
        # Best effort: report the failing file and keep documenting the rest.
        print(f"Error processing {file}: {e}")

readme_content = "".join(sections)

# Read the current README. The encoding is explicit so that non-ASCII column
# names or sample values do not depend on the platform's default encoding.
with open('../README.md', 'r', encoding='utf-8') as f:
    existing_content = f.read()

# Replace only the Data Description section: keep everything before it and
# everything from the next level-2 heading onwards.
section_start = existing_content.find(start_marker)
if section_start == -1:
    # No existing section: append it, separated from the text by a blank line.
    leading = existing_content.rstrip("\n")
    leading = leading + "\n\n" if leading else ""
    trailing = ""
else:
    leading = existing_content[:section_start]
    # The generated section only uses "###" and "####" headings, so the next
    # line starting with "## " marks the beginning of the following section.
    next_section = existing_content.find(
        "\n## ", section_start + len(start_marker)
    )
    trailing = "" if next_section == -1 else existing_content[next_section:]

new_content = leading + readme_content + trailing

with open('../README.md', 'w', encoding='utf-8') as f:
    f.write(new_content)


Processing CSV files: 100%|██████████| 8/8 [00:10<00:00,  1.27s/it]
