In [2]:
import pandas as pd
import os
import glob
import re
from pathlib import Path
import csv

# Configuration: directory that holds base.csv, adm2.csv and the
# map_{theme}_data{num}.csv inputs. The generated ago_tbl_*.csv outputs are
# written to this same directory. Update this path as needed.
DATA_PATH = "/mnt/d/temp/gost_map_template/layers/request/shiny"

# Alternative Windows path (uncomment if using Windows directly)
# DATA_PATH = r"D:\temp\gost_map_template\layers\request\shiny"

def detect_delimiter(file_path, num_lines=5):
    """
    Detect the delimiter used in a CSV file by examining the first few lines.

    A candidate (comma, semicolon, tab, pipe) qualifies when it occurs the
    same non-zero number of times on every sampled non-blank line; among
    qualifying candidates the one with the highest count wins. If none
    qualifies, csv.Sniffer is tried on the first 1024 characters.

    Args:
        file_path: Path of the file to examine.
        num_lines: Maximum number of leading lines to sample.

    Returns:
        The detected delimiter as a string. Falls back to ',' when the file
        is empty, cannot be opened, or no delimiter can be determined.
    """
    delimiters = [',', ';', '\t', '|']

    try:
        # errors='replace' keeps detection working on files that are not valid
        # UTF-8 (e.g. latin-1). All candidate delimiters are ASCII, so the
        # substituted characters never change the counts.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
            lines = []
            for _ in range(num_lines):
                line = file.readline()
                if not line:
                    break
                # Blank lines carry no delimiter information and would
                # otherwise break the "same count on every line" check.
                if line.strip('\r\n'):
                    lines.append(line)

            if not lines:
                return ','  # Default to comma if there is nothing to sample

            # Keep only delimiters that appear consistently on every line
            delimiter_counts = {}
            for delim in delimiters:
                counts = [line.count(delim) for line in lines]
                if counts[0] > 0 and len(set(counts)) == 1:
                    delimiter_counts[delim] = counts[0]

            # Return delimiter with highest consistent count
            if delimiter_counts:
                return max(delimiter_counts, key=delimiter_counts.get)

            # Try csv.Sniffer as fallback
            file.seek(0)
            sample = file.read(1024)
            return csv.Sniffer().sniff(sample).delimiter

    except (OSError, ValueError, csv.Error) as e:
        print(f"    Warning: Could not detect delimiter: {e}")
        return ','  # Default to comma

def read_csv_auto(file_path):
    """
    Read a CSV file with automatic delimiter detection.

    The file is read as UTF-8 first. Only when that fails with a decoding
    error is it retried as latin-1; any other failure (parse error, empty
    file, ...) is reported as-is, since a different encoding would not fix it.

    Args:
        file_path: Path of the CSV file to load.

    Returns:
        A pandas DataFrame, or None if the file could not be read.
    """
    delimiter = detect_delimiter(file_path)
    delimiter_name = {',': 'comma', ';': 'semicolon', '\t': 'tab', '|': 'pipe'}.get(delimiter, delimiter)
    print(f"    Detected delimiter: {delimiter_name}")

    try:
        # Try reading with detected delimiter
        return pd.read_csv(file_path, delimiter=delimiter, encoding='utf-8')
    except UnicodeDecodeError:
        print("    Error with utf-8, trying latin-1 encoding...")
    except Exception as e:
        print(f"    Error reading file: {e}")
        return None

    try:
        return pd.read_csv(file_path, delimiter=delimiter, encoding='latin-1')
    except Exception as e:
        print(f"    Error reading file: {e}")
        return None

def inspect_file(file_path, file_name="file"):
    """
    Print a short structural summary of a CSV file and return its contents.

    The summary lists the column names, the row/column counts and, when the
    file has at least one row, the first row as a dict.

    Args:
        file_path: Path of the CSV file to inspect.
        file_name: Label used in the printed heading.

    Returns:
        The DataFrame produced by read_csv_auto, or None if reading failed.
    """
    print(f"\nInspecting {file_name}:")
    frame = read_csv_auto(file_path)
    if frame is None:
        return None

    row_count, column_count = frame.shape
    print(f"  Columns found: {list(frame.columns)}")
    print(f"  Shape: {row_count} rows × {column_count} columns")
    print("  First row sample:")
    if row_count > 0:
        print(f"    {frame.iloc[0].to_dict()}")
    return frame

def _data_file_number(file_path):
    """Return the digits that follow 'data' in the file name (as a string), or None."""
    match = re.search(r'data(\d+)', file_path.name)
    return match.group(1) if match else None


def _data_file_sort_key(file_path):
    """Order numbered files numerically (data2 before data10); unnumbered files go last."""
    num = _data_file_number(file_path)
    if num is None:
        return (1, 0)
    return (0, int(num))


def _process_map_file(file_path, num, base_df):
    """Join one map file with base.csv and reduce it to adm2code + data{num}; None if unusable."""
    print(f"\n  Processing {file_path.name}...")

    # Read the map file with auto-detection
    map_df = read_csv_auto(file_path)
    if map_df is None:
        print(f"    Error: Could not read {file_path.name}")
        return None

    print(f"    Loaded: {len(map_df)} rows")
    print(f"    Columns: {list(map_df.columns)}")

    # Check if required columns exist
    required_cols = ['muni_n_af', 'variable', 'value']
    if not all(col in map_df.columns for col in required_cols):
        print(f"    Warning: Missing required columns. Found: {list(map_df.columns)}")
        return None

    # Step 1: Join with base.csv on muni_n_af
    merged_df = pd.merge(base_df, map_df, on='muni_n_af', how='inner')
    print(f"    After joining with base: {len(merged_df)} rows")

    if len(merged_df) == 0:
        print("    Warning: No matching records found when joining with base.csv")
        return None

    # Step 2: Drop muni_n_af and variable columns, rename value column
    processed_df = merged_df[['adm2code', 'value']].copy()
    processed_df = processed_df.rename(columns={'value': f'data{num}'})

    # Repeated muni_n_af values on either side of the join multiply rows.
    # Only the first value per adm2code is kept, so say so instead of
    # discarding rows silently.
    duplicate_count = int(processed_df.duplicated(subset=['adm2code']).sum())
    if duplicate_count:
        print(f"    Warning: dropped {duplicate_count} duplicate adm2code rows (kept first value)")
    processed_df = processed_df.drop_duplicates(subset=['adm2code'], keep='first')

    print(f"    Processed: columns = {list(processed_df.columns)}, rows = {len(processed_df)}")
    return processed_df


def process_csv_files(themes=None):
    """
    Main function to process CSV files according to specifications:
    1. Join base.csv with map_{theme}_data{num}.csv files
    2. Transform columns (drop muni_n_af, variable; rename value)
    3. Join by theme with adm2.csv and save final results

    For every theme, one ago_tbl_{theme}_rapp_2020.csv file is written to
    DATA_PATH. It contains all adm2.csv rows plus one data{num} column per
    usable map file (left join on adm2code). Problems with individual files
    are printed and the file is skipped; problems with base.csv or adm2.csv
    abort the run.

    Args:
        themes: Optional iterable of theme names to process. Defaults to the
            seven standard themes.

    Returns:
        None. Progress and errors are reported on stdout.
    """

    # Define themes
    if themes is None:
        themes = ['communications', 'hydric', 'infra', 'insecurity',
                  'outflow', 'poverty', 'production']

    # Set working directory or use full paths
    base_path = Path(DATA_PATH)

    # Check if directory exists
    if not base_path.exists():
        print(f"Error: Directory not found: {DATA_PATH}")
        print("Please check the path and update DATA_PATH variable in the script.")
        return

    print(f"Working directory: {base_path}")

    # File paths
    base_file = base_path / 'base.csv'
    adm2_file = base_path / 'adm2.csv'

    # Check if required base files exist
    if not base_file.exists():
        print(f"Error: base.csv not found at {base_file}")
        return

    if not adm2_file.exists():
        print(f"Error: adm2.csv not found at {adm2_file}")
        return

    # Load base files with auto-detection
    print("\nLoading base.csv...")
    base_df = read_csv_auto(base_file)
    if base_df is None:
        print("Error: Could not read base.csv")
        return
    print(f"  Base data loaded: {len(base_df)} rows")
    print(f"  Columns: {list(base_df.columns)}")

    # Fail early with a clear message rather than a KeyError inside the merge
    missing_base = [col for col in ('adm2code', 'muni_n_af') if col not in base_df.columns]
    if missing_base:
        print(f"Error: base.csv missing columns: {missing_base}")
        return

    print("\nLoading adm2.csv...")
    adm2_df = read_csv_auto(adm2_file)
    if adm2_df is None:
        print("Error: Could not read adm2.csv")
        return
    print(f"  ADM2 data loaded: {len(adm2_df)} rows")
    print(f"  Columns: {list(adm2_df.columns)}")

    if 'adm2code' not in adm2_df.columns:
        print("Error: adm2.csv missing columns: ['adm2code']")
        return

    # Process each theme
    for theme in themes:
        print(f"\n{'='*50}")
        print(f"Processing theme: {theme}")
        print('='*50)

        # Find all CSV files for this theme
        pattern = f'map_{theme}_data*.csv'
        theme_files = list(base_path.glob(pattern))

        if not theme_files:
            print(f"  No files found for pattern: {pattern}")
            continue

        # Sort files to ensure correct order (data1, data2, ..., data10).
        # Files without a number sort last and are skipped with a warning
        # in the loop below instead of crashing the sort.
        theme_files = sorted(theme_files, key=_data_file_sort_key)

        print(f"  Found {len(theme_files)} files for {theme}:")
        for file in theme_files:
            print(f"    - {file.name}")

        # Dictionary to store processed dataframes for this theme
        theme_dfs = {}

        # Step 1 & 2: Process each file
        for file_path in theme_files:
            # Extract the number from filename
            num = _data_file_number(file_path)
            if num is None:
                print(f"  Warning: Could not extract number from {file_path.name}")
                continue

            processed_df = _process_map_file(file_path, num, base_df)
            if processed_df is not None:
                theme_dfs[int(num)] = processed_df

        if not theme_dfs:
            print(f"  No valid data files processed for {theme}")
            continue

        # Step 3: Join all processed files for this theme
        print(f"\n  Combining all {theme} files...")

        # Start with adm2.csv as the base
        final_df = adm2_df.copy()

        # Join each processed file in order
        for num in sorted(theme_dfs):
            print(f"    Joining data{num}...")

            # Perform left join to keep all adm2 records
            final_df = pd.merge(final_df, theme_dfs[num], on='adm2code', how='left')

        # Save the final result in the same directory
        output_filename = f'ago_tbl_{theme}_rapp_2020.csv'
        output_path = base_path / output_filename
        final_df.to_csv(output_path, index=False)

        print(f"\n  ✓ Saved {output_filename}")
        print(f"    Location: {output_path}")
        print(f"    Final shape: {final_df.shape[0]} rows × {final_df.shape[1]} columns")
        print(f"    Columns: {list(final_df.columns)}")

        # Display summary statistics
        data_cols = [col for col in final_df.columns if col.startswith('data')]
        if data_cols:
            print("    Data columns summary:")
            for col in data_cols:
                non_null = final_df[col].notna().sum()
                print(f"      {col}: {non_null} non-null values")

    print("\n" + "="*50)
    print("Processing complete!")
    print(f"Output files saved in: {base_path}")
    print("="*50)

def validate_input_files():
    """
    Validate that all required input files exist and have correct structure

    Checks performed in DATA_PATH:
    - base.csv exists, is readable and has columns adm2code, muni_n_af
    - adm2.csv exists, is readable and has columns iso3, adm0name, adm1name,
      adm1code, adm2name, adm2code
    - at least one map_{theme}_data*.csv file exists; for each theme with
      files, the first one (by sorted name) is readable and has columns
      muni_n_af, variable, value

    Returns:
        True when no issue was found, False otherwise. Issues are printed.
    """
    print("Validating input files...")

    base_path = Path(DATA_PATH)

    # Check if directory exists
    if not base_path.exists():
        print(f"❌ Directory not found: {DATA_PATH}")
        return False

    issues = []

    # Check base.csv and adm2.csv against their required columns
    required_by_file = {
        'base.csv': ['adm2code', 'muni_n_af'],
        'adm2.csv': ['iso3', 'adm0name', 'adm1name', 'adm1code', 'adm2name', 'adm2code'],
    }
    for file_name, required_cols in required_by_file.items():
        csv_path = base_path / file_name
        if not csv_path.exists():
            issues.append(f"{file_name} not found")
            continue

        df = read_csv_auto(csv_path)
        if df is None:
            issues.append(f"{file_name} could not be read")
            continue

        missing = [col for col in required_cols if col not in df.columns]
        if missing:
            issues.append(f"{file_name} missing columns: {missing}. Found: {list(df.columns)}")
        else:
            print(f"  ✓ {file_name} validated ({len(df)} rows)")

    # Check for map files
    themes = ['communications', 'hydric', 'infra', 'insecurity',
              'outflow', 'poverty', 'production']
    required_map_cols = ['muni_n_af', 'variable', 'value']

    map_files_summary = {}
    for theme in themes:
        # Sorted so the sampled file is the same on every run
        # (glob order is not guaranteed).
        files = sorted(base_path.glob(f'map_{theme}_data*.csv'))
        if not files:
            continue

        map_files_summary[theme] = len(files)

        # Check structure of first file
        first_file = files[0]
        df = read_csv_auto(first_file)
        if df is None:
            # Report unreadable map files the same way as base.csv/adm2.csv
            issues.append(f"{first_file.name} could not be read")
            continue

        missing = [col for col in required_map_cols if col not in df.columns]
        if missing:
            issues.append(f"{first_file.name} missing columns: {missing}. Found: {list(df.columns)}")

    if map_files_summary:
        print("  ✓ Map files found:")
        for theme, count in map_files_summary.items():
            print(f"      {theme}: {count} files")
    else:
        issues.append("No map_*_data*.csv files found")

    if issues:
        print("\nValidation issues found:")
        for issue in issues:
            print(f"  ❌ {issue}")
        return False

    print("\n✓ All validations passed!")
    return True

def inspect_all_base_files():
    """
    Print the structure of base.csv, adm2.csv and one sample map file.

    Each file found in DATA_PATH is passed to inspect_file; files that do
    not exist are skipped without a message.
    """
    data_dir = Path(DATA_PATH)
    banner = "=" * 50

    print("\n" + banner)
    print("FILE STRUCTURE INSPECTION")
    print(banner)

    # The two fixed reference tables, in the order they are reported
    for csv_name in ("base.csv", "adm2.csv"):
        candidate = data_dir / csv_name
        if candidate.exists():
            inspect_file(candidate, csv_name)

    # Any one map file is enough to show the expected layout
    map_candidates = list(data_dir.glob("map_*_data*.csv"))
    if map_candidates:
        first_map = map_candidates[0]
        inspect_file(first_map, first_map.name)

def list_files():
    """
    List all CSV files in the data directory

    Files are grouped into base files (base.csv, adm2.csv), map data files
    by theme (map_{theme}_data{num}.csv), previously generated output files
    (ago_tbl_*) and everything else. Every CSV file lands in exactly one
    group, so the groups always add up to the printed total.
    """
    base_path = Path(DATA_PATH)

    if not base_path.exists():
        print(f"Error: Directory not found: {DATA_PATH}")
        return

    print(f"\nListing CSV files in: {base_path}")
    print("="*50)

    csv_files = list(base_path.glob("*.csv"))

    if not csv_files:
        print("No CSV files found in the directory.")
        return

    # Categorize files
    base_files = []
    map_files = {}
    output_files = []
    other_files = []

    for file in sorted(csv_files):
        name = file.name
        if name in ('base.csv', 'adm2.csv'):
            base_files.append(name)
            continue

        # Only names matching the full map pattern count as map files.
        # Anything else that merely starts with "map_" and contains "data"
        # falls through to the remaining groups instead of being dropped.
        theme_match = re.match(r'map_(.+?)_data\d+\.csv', name)
        if theme_match:
            map_files.setdefault(theme_match.group(1), []).append(name)
        elif name.startswith('ago_tbl_'):
            output_files.append(name)
        else:
            other_files.append(name)

    # Display categorized files
    if base_files:
        print("Base files:")
        for f in base_files:
            print(f"  - {f}")

    if map_files:
        print("\nMap data files by theme:")
        for theme, files in sorted(map_files.items()):
            print(f"  {theme}: {len(files)} files")
            # Only spell out short lists to keep the overview compact
            if len(files) <= 3:
                for f in sorted(files):
                    print(f"    - {f}")

    if output_files:
        print("\nOutput files (previously generated):")
        for f in output_files:
            print(f"  - {f}")

    if other_files:
        print("\nOther CSV files:")
        for f in other_files:
            print(f"  - {f}")

    print(f"\nTotal: {len(csv_files)} CSV files")

if __name__ == "__main__":
    # Script entry point: show what is in the data directory, validate it,
    # then process only after explicit confirmation from the user.
    print("CSV Data Processing Script")
    print("="*50)

    # Overview of the directory contents and of the key files' structure
    list_files()
    inspect_all_base_files()

    print("\n" + "="*50)

    if not validate_input_files():
        # Validation failed: point the user at the likely causes
        print("\nPlease check the file structure above.")
        print("The script will automatically detect delimiters (comma, semicolon, tab).")
        print(f"\nCurrent data path: {DATA_PATH}")
        print("Update the DATA_PATH variable in the script if needed.")
    elif input("\nDo you want to proceed with processing? (y/n): ").lower() == 'y':
        print("\nStarting processing...")
        process_csv_files()
    else:
        print("Processing cancelled.")

CSV Data Processing Script

Listing CSV files in: /mnt/d/temp/gost_map_template/layers/request/shiny
Base files:
  - adm2.csv
  - base.csv

Map data files by theme:
  communications: 6 files
  hydric: 5 files
  infra: 10 files
  insecurity: 9 files
  outflow: 7 files
  poverty: 3 files
    - map_poverty_data1.csv
    - map_poverty_data2.csv
    - map_poverty_data3.csv
  production: 10 files
  ttvma: 1 files
    - map_ttvma_data1.csv

Total: 82 CSV files

FILE STRUCTURE INSPECTION

Inspecting base.csv:
    Detected delimiter: semicolon
  Columns found: ['adm2code', 'muni_n_af']
  Shape: 160 rows × 2 columns
  First row sample:
    {'adm2code': 'AGO019001', 'muni_n_af': 'Alto Zambeze'}

Inspecting adm2.csv:
    Detected delimiter: semicolon
  Columns found: ['iso3', 'adm0name', 'adm1name', 'adm1code', 'adm2name', 'adm2code']
  Shape: 164 rows × 6 columns
  First row sample:
    {'iso3': 'AGO', 'adm0name': 'Angola', 'adm1name': 'Uíge', 'adm1code': 'AGO017', 'adm2name': 'Alto Cauale', 'adm


Do you want to proceed with processing? (y/n):  y



Starting processing...
Working directory: /mnt/d/temp/gost_map_template/layers/request/shiny

Loading base.csv...
    Detected delimiter: semicolon
  Base data loaded: 160 rows
  Columns: ['adm2code', 'muni_n_af']

Loading adm2.csv...
    Detected delimiter: semicolon
  ADM2 data loaded: 164 rows
  Columns: ['iso3', 'adm0name', 'adm1name', 'adm1code', 'adm2name', 'adm2code']

Processing theme: communications
  Found 6 files for communications:
    - map_communications_data1.csv
    - map_communications_data2.csv
    - map_communications_data3.csv
    - map_communications_data4.csv
    - map_communications_data5.csv
    - map_communications_data6.csv

  Processing map_communications_data1.csv...
    Detected delimiter: comma
    Loaded: 160 rows
    Columns: ['muni_n_af', 'variable', 'value']
    After joining with base: 162 rows
    Processed: columns = ['adm2code', 'data1'], rows = 160

  Processing map_communications_data2.csv...
    Detected delimiter: comma
    Loaded: 160 rows
  