In [1]:
import os
import json
import shutil
import pandas as pd
import numpy as np
from datetime import datetime

print("="*80)
print("NIGERIA POST-HARVEST LOSSES ANALYSIS PROJECT SETUP")
print("="*80)
print("\nSetting up project structure...\n")

# Create directory structure
directories = [
    'data/raw',        # Original data files
    'data/processed',  # Cleaned, validated data
    'data/final',      # Analysis-ready datasets
    'scripts',         # Analysis scripts
    'visualizations',  # Output visualizations
    'docs'             # Documentation
]

for directory in directories:
    os.makedirs(directory, exist_ok=True)
    print(f"Created directory: {directory}")

# Create project configuration
config = {
    'project_name': 'Nigeria Post-Harvest Losses Analysis',
    'start_date': datetime.now().strftime('%Y-%m-%d'),
    'primary_focus': 'Post-Harvest Losses',
    'secondary_focus': 'Agricultural Finance',
    'datasets': {
        'post_harvest_losses': 'data/final/post_harvest_losses.csv',
        'value_chain': 'data/final/value_chain.csv',
        'financial_impact': 'data/final/financial_impact.csv',
        'nutrient_losses': 'data/final/nutrient_losses.csv',
        'climate_data': 'data/final/climate_data.csv'
    },
    'version': '1.0.0'
}

# Save project configuration
with open('project_config.json', 'w') as f:
    json.dump(config, f, indent=4)
print("Created project configuration: project_config.json")

# Create README file with project information
readme = """# Nigeria Post-Harvest Losses Analysis Project

## Project Overview
This project analyzes post-harvest losses in Nigerian agriculture with a focus on agricultural finance.

## Directory Structure
- `data/raw`: Original data files
- `data/processed`: Cleaned and validated data files
- `data/final`: Analysis-ready datasets
- `scripts`: Analysis and visualization scripts
- `visualizations`: Output visualizations
- `docs`: Project documentation

## Datasets
1. **Post-Harvest Losses**: Loss percentages by crop and region
2. **Value Chain**: Losses at different stages of the agricultural value chain
3. **Financial Impact**: Economic impact of post-harvest losses
4. **Nutrient Losses**: Nutritional impact of post-harvest losses
5. **Climate Data**: Climate data for contextual analysis

## Analysis Workflow
1. Data preparation using `prepare_datasets.py`
2. Basic data visualization using `create_visualizations.py`
3. Specialized analyses using specific scripts in the `scripts` directory

## Getting Started
1. Run `prepare_datasets.py` to set up the datasets
2. Run `create_visualizations.py` to generate basic visualizations
"""

with open('README.md', 'w') as f:
    f.write(readme)
print("Created project README file: README.md")

print("\nProject structure setup complete!")
print("Next step: Run prepare_datasets.py to create clean, analysis-ready datasets.")

NIGERIA POST-HARVEST LOSSES ANALYSIS PROJECT SETUP

Setting up project structure...

Created directory: data/raw
Created directory: data/processed
Created directory: data/final
Created directory: scripts
Created directory: visualizations
Created directory: docs
Created project configuration: project_config.json
Created project README file: README.md

Project structure setup complete!
Next step: Run prepare_datasets.py to create clean, analysis-ready datasets.


In [9]:
import os
import pandas as pd
import numpy as np
import json
from datetime import datetime

print("=" * 80)
print("NIGERIA POST-HARVEST LOSSES: DATA PREPARATION")
print("=" * 80)
print(f"Execution Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Create necessary directories
os.makedirs('data/original', exist_ok=True)
os.makedirs('data/cleaned', exist_ok=True)
os.makedirs('results/plots', exist_ok=True)

# Function to detect and fix transposed data
def fix_transposed_data(df, first_row_as_header=True, id_column=0):
    """
    Fix datasets where the first row contains column headers
    """
    # Check if the first row looks like headers
    first_row = df.iloc[0].tolist()
    has_header_keywords = any([
        isinstance(x, str) and any(kw in str(x).lower() for kw in 
        ['maize', 'rice', 'crop', 'region', 'state', 'value', 'loss', '%', 'percentage', 'harvest', 'nutrient'])
        for x in first_row
    ])
    
    if first_row_as_header and has_header_keywords:
        print("  - First row appears to contain headers - fixing transposed data")
        
        # Extract headers from first row
        headers = [str(x).strip() if not pd.isna(x) else f'Column_{i}' for i, x in enumerate(first_row)]
        
        # Replace empty or duplicate headers
        seen = set()
        for i, h in enumerate(headers):
            if h in seen or h == '' or h == 'nan':
                headers[i] = f'Column_{i}'
            seen.add(headers[i])
        
        # Create new DataFrame with correct headers
        fixed_df = pd.DataFrame(df.iloc[1:].values, columns=headers)
        
        # Convert string columns to numeric where possible
        for col in fixed_df.columns:
            if col != headers[id_column]:  # Skip the ID column
                fixed_df[col] = pd.to_numeric(fixed_df[col], errors='coerce')
        
        return fixed_df
    
    return df

# Function to detect and clean data issues
def clean_dataset(df, expected_structure=None):
    """
    Perform general data cleaning:
    1. Remove completely empty rows/columns
    2. Convert numeric columns
    3. Handle missing values
    """
    print("  - Cleaning dataset")
    
    # Remove completely empty rows and columns
    df = df.dropna(how='all', axis=0).dropna(how='all', axis=1)
    
    # Check for and convert non-numeric columns that should be numeric
    for col in df.columns:
        if col.lower() not in ['state', 'region', 'state/region', 'crop', 'crop_type', 'stage', 'nutrient', 'category', 'month', 'notes']:
            # Try converting to numeric
            df[col] = pd.to_numeric(df[col], errors='coerce')
    
    # If we have expectations about structure, ensure they're met
    if expected_structure:
        # Ensure all expected columns exist
        for col in expected_structure:
            if col not in df.columns:
                print(f"  - Warning: Expected column '{col}' not found")
    
    # Fill or drop NaN values based on column type
    for col in df.columns:
        nan_count = df[col].isna().sum()
        if nan_count > 0:
            print(f"  - Column '{col}' has {nan_count} missing values")
            
            if df[col].dtype in ['int64', 'float64']:
                # For numeric columns, fill with mean or 0
                if df[col].count() > 0:  # If there are some non-NaN values
                    mean_value = df[col].mean()
                    print(f"    - Filling missing values with mean: {mean_value}")
                    df[col] = df[col].fillna(mean_value)
                else:
                    print(f"    - Filling missing values with 0")
                    df[col] = df[col].fillna(0)
            else:
                # For categorical/text columns, fill with 'Unknown' or most common value
                if df[col].count() > 0:
                    most_common = df[col].value_counts().index[0]
                    print(f"    - Filling missing values with most common value: '{most_common}'")
                    df[col] = df[col].fillna(most_common)
                else:
                    print(f"    - Filling missing values with 'Unknown'")
                    df[col] = df[col].fillna('Unknown')
    
    return df

# Function to reshape wide format to long format
def reshape_to_long(df, id_vars, var_name, value_name):
    """
    Reshape data from wide to long format
    """
    print(f"  - Reshaping to long format with {id_vars} as ID columns")
    
    # Ensure id_vars are actually in the DataFrame
    valid_id_vars = [col for col in id_vars if col in df.columns]
    if not valid_id_vars:
        print(f"  - Warning: None of the specified ID columns {id_vars} found in DataFrame")
        # Use the first column as ID
        valid_id_vars = [df.columns[0]]
        print(f"  - Using {valid_id_vars} as ID column instead")
    
    # All columns except ID columns will be "melted"
    value_vars = [col for col in df.columns if col not in valid_id_vars]
    
    # Perform the melt operation
    long_df = df.melt(
        id_vars=valid_id_vars,
        value_vars=value_vars,
        var_name=var_name,
        value_name=value_name
    )
    
    # Drop rows where the value is NaN
    long_df = long_df.dropna(subset=[value_name])
    
    return long_df

# Function to process post-harvest losses data
def process_post_harvest_losses(data/raw/post_narvest_loses.csv):
    """
    Process post-harvest losses dataset
    """
    print(f"\nProcessing post-harvest losses data from: {file_path}")
    
    # Read the file with flexible parser
    try:
        df = pd.read_csv(file_path)
        print(f"  - Successfully read with comma separator")
    except:
        try:
            df = pd.read_csv(file_path, sep=';')
            print(f"  - Successfully read with semicolon separator")
        except:
            try:
                df = pd.read_csv(file_path, sep=None, engine='python')
                print(f"  - Successfully read with auto-detected separator")
            except Exception as e:
                print(f"  - Error reading file: {str(e)}")
                return None
    
    print(f"  - Original shape: {df.shape}")
    
    # Check if data is transposed (crop names in first row)
    df = fix_transposed_data(df)
    
    # Clean the dataset
    df = clean_dataset(df)
    
    # Identify key columns
    # Look for a column that might be states/regions
    region_col = None
    for col in df.columns:
        if col.lower() in ['state', 'region', 'state/region', 'location', 'area']:
            region_col = col
            break
    
    if region_col is None:
        # If no clear region column, use the first column
        region_col = df.columns[0]
        print(f"  - Using '{region_col}' as the region column")
    
    # If we have crop columns in wide format, reshape to long format
    if len(df.columns) > 3 and all(col not in ['stage', 'value chain'] for col in df.columns):
        df = reshape_to_long(
            df=df, 
            id_vars=[region_col], 
            var_name='crop_type', 
            value_name='loss_percentage'
        )
    
    print(f"  - Final shape: {df.shape}")
    
    # Save cleaned dataset
    output_file = 'data/cleaned/post_harvest_losses_cleaned.csv'
    df.to_csv(output_file, index=False)
    print(f"  - Saved cleaned dataset to: {output_file}")
    
    return df

# Function to process value chain data
def process_value_chain(data/raw/value chain.csv):
    """
    Process value chain dataset
    """
    print(f"\nProcessing value chain data from: {file_path}")
    
    # Check if file exists and has content
    if not os.path.exists(file_path) or os.path.getsize(file_path) == 0:
        print(f"  - File is empty or doesn't exist: {file_path}")
        print(f"  - Creating synthetic value chain dataset")
        
        # Create example data
        data = {
            'crop_type': ['Maize', 'Maize', 'Maize', 'Maize', 'Maize', 
                         'Rice', 'Rice', 'Rice', 'Rice', 'Rice',
                         'Sorghum', 'Sorghum', 'Sorghum', 'Sorghum', 'Sorghum'],
            'stage': ['Harvesting', 'Drying', 'Storage', 'Transportation', 'Processing',
                     'Harvesting', 'Drying', 'Storage', 'Transportation', 'Processing',
                     'Harvesting', 'Drying', 'Storage', 'Transportation', 'Processing'],
            'loss_percentage': [6.42, 4.0, 1.32, 2.37, 4.71,
                               5.12, 3.8, 2.11, 1.98, 3.45,
                               4.89, 2.76, 3.21, 1.55, 2.98]
        }
        
        df = pd.DataFrame(data)
    else:
        # Read the file with flexible parser
        try:
            df = pd.read_csv(file_path)
            print(f"  - Successfully read with comma separator")
        except:
            try:
                df = pd.read_csv(file_path, sep=';')
                print(f"  - Successfully read with semicolon separator")
            except:
                try:
                    df = pd.read_csv(file_path, sep=None, engine='python')
                    print(f"  - Successfully read with auto-detected separator")
                except Exception as e:
                    print(f"  - Error reading file: {str(e)}")
                    return None
    
    print(f"  - Original shape: {df.shape}")
    
    # Check if data is empty
    if df.empty or df.shape[0] <= 1:
        print(f"  - Dataset is empty or has only headers")
        print(f"  - Creating synthetic value chain dataset")
        
        # Create example data
        data = {
            'crop_type': ['Maize', 'Maize', 'Maize', 'Maize', 'Maize', 
                         'Rice', 'Rice', 'Rice', 'Rice', 'Rice',
                         'Sorghum', 'Sorghum', 'Sorghum', 'Sorghum', 'Sorghum'],
            'stage': ['Harvesting', 'Drying', 'Storage', 'Transportation', 'Processing',
                     'Harvesting', 'Drying', 'Storage', 'Transportation', 'Processing',
                     'Harvesting', 'Drying', 'Storage', 'Transportation', 'Processing'],
            'loss_percentage': [6.42, 4.0, 1.32, 2.37, 4.71,
                               5.12, 3.8, 2.11, 1.98, 3.45,
                               4.89, 2.76, 3.21, 1.55, 2.98]
        }
        
        df = pd.DataFrame(data)
    else:
        # Check if data is transposed (stage names in first row)
        df = fix_transposed_data(df)
        
        # Clean the dataset
        df = clean_dataset(df)
        
        # Check if we have a wide format (crops as rows, stages as columns)
        stage_keywords = ['harvest', 'dry', 'store', 'transport', 'process', 'market', 'retail']
        has_stage_columns = any(any(keyword in col.lower() for keyword in stage_keywords) 
                              for col in df.columns if isinstance(col, str))
        
        if has_stage_columns:
            print("  - Dataset appears to be in wide format (stages as columns)")
            
            # Get the crop column (usually first column)
            crop_col = df.columns[0]
            
            # Get the stage columns
            stage_cols = [col for col in df.columns if col != crop_col]
            
            # Reshape to long format
            df = df.melt(
                id_vars=[crop_col],
                value_vars=stage_cols,
                var_name='stage',
                value_name='loss_percentage'
            )
            
            # Rename crop column if needed
            if crop_col.lower() != 'crop_type':
                df = df.rename(columns={crop_col: 'crop_type'})
        
        # Ensure we have the required columns
        required_cols = ['crop_type', 'stage', 'loss_percentage']
        
        # Check if we're missing any required columns
        missing_cols = [col for col in required_cols if col not in df.columns]
        
        if missing_cols:
            print(f"  - Warning: Missing required columns: {missing_cols}")
            print(f"  - Creating synthetic value chain dataset")
            
            # Create example data
            data = {
                'crop_type': ['Maize', 'Maize', 'Maize', 'Maize', 'Maize', 
                             'Rice', 'Rice', 'Rice', 'Rice', 'Rice',
                             'Sorghum', 'Sorghum', 'Sorghum', 'Sorghum', 'Sorghum'],
                'stage': ['Harvesting', 'Drying', 'Storage', 'Transportation', 'Processing',
                         'Harvesting', 'Drying', 'Storage', 'Transportation', 'Processing',
                         'Harvesting', 'Drying', 'Storage', 'Transportation', 'Processing'],
                'loss_percentage': [6.42, 4.0, 1.32, 2.37, 4.71,
                                   5.12, 3.8, 2.11, 1.98, 3.45,
                                   4.89, 2.76, 3.21, 1.55, 2.98]
            }
            
            df = pd.DataFrame(data)
    
    # Convert loss_percentage to numeric
    df['loss_percentage'] = pd.to_numeric(df['loss_percentage'], errors='coerce')
    
    # Drop rows with missing loss_percentage
    df = df.dropna(subset=['loss_percentage'])
    
    print(f"  - Final shape: {df.shape}")
    
    # Save cleaned dataset
    output_file = 'data/cleaned/value_chain_cleaned.csv'
    df.to_csv(output_file, index=False)
    print(f"  - Saved cleaned dataset to: {output_file}")
    
    return df

# Function to process financial impact data
def process_financial_data(data/raw/financial-impact.csv):
    """
    Process financial impact dataset
    """
    print(f"\nProcessing financial impact data from: {file_path}")
    
    # Read the file with flexible parser
    try:
        df = pd.read_csv(file_path)
        print(f"  - Successfully read with comma separator")
    except:
        try:
            df = pd.read_csv(file_path, sep=';')
            print(f"  - Successfully read with semicolon separator")
        except:
            try:
                df = pd.read_csv(file_path, sep=None, engine='python')
                print(f"  - Successfully read with auto-detected separator")
            except Exception as e:
                print(f"  - Error reading file: {str(e)}")
                return None
    
    print(f"  - Original shape: {df.shape}")
    
    # Check if data is transposed (has headers in first row)
    df = fix_transposed_data(df)
    
    # Clean the dataset
    df = clean_dataset(df)
    
    # Check for required columns
    required_cols = ['crop_type', 'financial_value']
    missing_cols = [col for col in required_cols if not any(c.lower() == col.lower() for c in df.columns)]
    
    if missing_cols:
        print(f"  - Warning: Missing required columns: {missing_cols}")
        
        # Try to identify crop column
        crop_col = None
        for col in df.columns:
            if col.lower() in ['crop', 'crop_type', 'crop type', 'commodity', 'product']:
                crop_col = col
                break
        
        if crop_col is None:
            crop_col = df.columns[0]
            print(f"  - Using '{crop_col}' as the crop column")
        
        # Try to identify financial value column
        value_col = None
        for col in df.columns:
            if any(kw in col.lower() for kw in ['value', 'financial', 'loss', 'impact', 'cost', 'usd', '$', 'naira']):
                value_col = col
                break
        
        if value_col is None:
            # Look for a column with numeric values
            for col in df.columns:
                if col != crop_col and df[col].dtype in ['int64', 'float64']:
                    value_col = col
                    break
        
        if value_col is not None:
            print(f"  - Using '{value_col}' as the financial value column")
            
            # Rename columns to standard names
            df = df.rename(columns={crop_col: 'crop_type', value_col: 'financial_value'})
        else:
            print(f"  - Could not identify financial value column")
            print(f"  - Creating synthetic financial data")
            
            # Create example data
            data = {
                'crop_type': ['Maize', 'Rice', 'Sorghum', 'Millet', 'Cassava'],
                'financial_value': [1248197000, 978456000, 567123000, 345678000, 789012000],
                'region': ['National', 'National', 'National', 'National', 'National']
            }
            
            df = pd.DataFrame(data)
    
    # Convert financial_value to numeric
    df['financial_value'] = pd.to_numeric(df['financial_value'], errors='coerce')
    
    # Drop rows with missing financial_value
    df = df.dropna(subset=['financial_value'])
    
    # Ensure we have a region column
    if 'region' not in df.columns:
        df['region'] = 'National'
    
    print(f"  - Final shape: {df.shape}")
    
    # Save cleaned dataset
    output_file = 'data/cleaned/financial_impact_cleaned.csv'
    df.to_csv(output_file, index=False)
    print(f"  - Saved cleaned dataset to: {output_file}")
    
    return df

# Function to process nutrient losses data
def process_nutrient_data(file_path):
    """
    Process nutrient losses dataset
    """
    print(f"\nProcessing nutrient losses data from: {file_path}")
    
    # Read the file with flexible parser
    try:
        df = pd.read_csv(data/raw/nutrint_losses.csv)
        print(f"  - Successfully read with comma separator")
    except:
        try:
            df = pd.read_csv(file_path, sep=';')
            print(f"  - Successfully read with semicolon separator")
        except:
            try:
                df = pd.read_csv(file_path, sep=None, engine='python')
                print(f"  - Successfully read with auto-detected separator")
            except Exception as e:
                print(f"  - Error reading file: {str(e)}")
                return None
    
    print(f"  - Original shape: {df.shape}")
    
    # Check if data is transposed (crop names in first row)
    df = fix_transposed_data(df)
    
    # Clean the dataset
    df = clean_dataset(df)
    
    # Check if we have nutrient names in the first column
    nutrient_col = df.columns[0]
    if nutrient_col.lower() in ['nutrient', 'energy', 'nutrient type', 'nutrition']:
        print(f"  - Nutrient names found in column '{nutrient_col}'")
        
        # Reshape to long format
        crop_cols = [col for col in df.columns if col != nutrient_col]
        
        df = df.melt(
            id_vars=[nutrient_col],
            value_vars=crop_cols,
            var_name='crop_type',
            value_name='nutrient_loss'
        )
        
        # Rename nutrient column if needed
        if nutrient_col.lower() != 'nutrient':
            df = df.rename(columns={nutrient_col: 'nutrient'})
    else:
        print(f"  - Could not identify nutrient column")
        print(f"  - Creating synthetic nutrient data")
        
        # Create example data
        nutrients = ['Energy (kcal)', 'Protein (g)', 'Fat (g)', 'Carbohydrate (g)', 'Fiber (g)', 'Vitamin A (μg)']
        crops = ['Maize', 'Rice', 'Sorghum', 'Millet']
        
        data = []
        for nutrient in nutrients:
            for crop in crops:
                # Generate a random value based on the nutrient type
                if 'Energy' in nutrient:
                    value = np.random.uniform(2000000000, 8000000000)
                elif 'Protein' in nutrient:
                    value = np.random.uniform(100000, 500000)
                elif 'Fat' in nutrient:
                    value = np.random.uniform(50000, 200000)
                elif 'Carbohydrate' in nutrient:
                    value = np.random.uniform(500000, 1500000)
                elif 'Fiber' in nutrient:
                    value = np.random.uniform(20000, 100000)
                else:
                    value = np.random.uniform(10000, 50000)
                
                data.append({
                    'nutrient': nutrient,
                    'crop_type': crop,
                    'nutrient_loss': value
                })
        
        df = pd.DataFrame(data)
    
    # Convert nutrient_loss to numeric
    df['nutrient_loss'] = pd.to_numeric(df['nutrient_loss'], errors='coerce')
    
    # Drop rows with missing nutrient_loss
    df = df.dropna(subset=['nutrient_loss'])
    
    print(f"  - Final shape: {df.shape}")
    
    # Save cleaned dataset
    output_file = 'data/cleaned/nutrient_losses_cleaned.csv'
    df.to_csv(output_file, index=False)
    print(f"  - Saved cleaned dataset to: {output_file}")
    
    return df

# Function to process climate data
def process_climate_data(data/raw/Nigeria_climate_change.csv):
    """
    Process climate dataset
    """
    print(f"\nProcessing climate data from: {file_path}")
    
    # Read the file with flexible parser
    try:
        df = pd.read_csv(file_path)
        print(f"  - Successfully read with comma separator")
    except:
        try:
            df = pd.read_csv(file_path, sep=';')
            print(f"  - Successfully read with semicolon separator")
        except:
            try:
                df = pd.read_csv(file_path, sep=None, engine='python')
                print(f"  - Successfully read with auto-detected separator")
            except Exception as e:
                print(f"  - Error reading file: {str(e)}")
                return None
    
    print(f"  - Original shape: {df.shape}")
    
    # Clean the dataset
    df = clean_dataset(df)
    
    # Check if we have expected columns
    expected_cols = ['category', 'temperature', 'precipitation']
    expected_cols_found = [col for col in expected_cols if any(c.lower() == col.lower() or col.lower() in c.lower() for c in df.columns)]
    
    if len(expected_cols_found) < 2:
        print(f"  - Warning: Expected climate data columns not found")
        
        # Try to identify month/category column
        month_col = None
        for col in df.columns:
            if col.lower() in ['month', 'category', 'period', 'time', 'date']:
                month_col = col
                break
        
        if month_col is None:
            # If first column has month names, use it
            if df.iloc[:, 0].str.lower().str.contains('jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec').any():
                month_col = df.columns[0]
        
        if month_col:
            print(f"  - Found month/category column: '{month_col}'")
            
            # Rename to standard name
            df = df.rename(columns={month_col: 'Category'})
        else:
            print(f"  - Could not identify month/category column")
    
    # Convert numeric columns
    for col in df.columns:
        if col != 'Category':
            df[col] = pd.to_numeric(df[col], errors='coerce')
    
    print(f"  - Final shape: {df.shape}")
    
    # Save cleaned dataset
    output_file = 'data/cleaned/climate_data_cleaned.csv'
    df.to_csv(output_file, index=False)
    print(f"  - Saved cleaned dataset to: {output_file}")
    
    return df

# Function to find data files in current directory
def find_data_files():
    """
    Find and classify data files in the current directory
    """
    print("\nSearching for data files...")
    
    datasets = {
        'post_harvest_losses': None,
        'value_chain': None,
        'financial_impact': None,
        'nutrient_losses': None,
        'climate_data': None
    }
    
    # Look for files in data directories
    for root, dirs, files in os.walk('.'):
        for file in files:
            if file.endswith('.csv'):
                file_path = os.path.join(root, file)
                file_lower = file.lower()
                
                if 'harvest' in file_lower and 'loss' in file_lower:
                    datasets['post_harvest_losses'] = file_path
                    print(f"Found post-harvest losses data: {file_path}")
                
                elif 'value' in file_lower and 'chain' in file_lower:
                    datasets['value_chain'] = file_path
                    print(f"Found value chain data: {file_path}")
                
                elif ('financ' in file_lower or 'economic' in file_lower) and ('impact' in file_lower or 'loss' in file_lower):
                    datasets['financial_impact'] = file_path
                    print(f"Found financial impact data: {file_path}")
                
                elif 'nutri' in file_lower and 'loss' in file_lower:
                    datasets['nutrient_losses'] = file_path
                    print(f"Found nutrient losses data: {file_path}")
                
                elif 'climate' in file_lower or 'weather' in file_lower:
                    datasets['climate_data'] = file_path
                    print(f"Found climate data: {file_path}")
    
    # Check for missing datasets
    missing = [k for k, v in datasets.items() if v is None]
    if missing:
        print(f"Warning: Could not find data files for: {', '.join(missing)}")
        
        # Look for any CSV files
        csv_files = []
        for root, dirs, files in os.walk('.'):
            for file in files:
                if file.endswith('.csv'):
                    csv_files.append(os.path.join(root, file))
        
        if csv_files:
            print(f"Found {len(csv_files)} CSV files:")
            for i, file in enumerate(csv_files):
                print(f"  {i+1}. {file}")
            
            print("\nBased on the available CSV files, making best guesses:")
            
            # Try to assign unidentified files to missing datasets
            for dataset_type in missing:
                if csv_files:
                    datasets[dataset_type] = csv_files.pop(0)
                    print(f"  Assigned {datasets[dataset_type]} to {dataset_type}")
    
    return datasets

# Main execution
def main():
    # Create project configuration
    project_config = {
        'project_name': 'Nigeria Post-Harvest Losses Analysis',
        'primary_focus': 'Post-Harvest Losses',
        'secondary_focus': 'Agricultural Finance',
        'start_date': datetime.now().strftime('%Y-%m-%d'),
        'datasets': {}
    }
    
    # Find data files
    datasets = find_data_files()
    
    # Process each dataset type
    processed_datasets = {}
    
    # Process post-harvest losses data
    if datasets['post_harvest_losses']:
        df = process_post_harvest_losses(datasets['post_harvest_losses'])
        if df is not None:
            processed_datasets['post_harvest_losses'] = 'data/cleaned/post_harvest_losses_cleaned.csv'
            # Backup original data
            shutil.copy2(datasets['post_harvest_losses'], f'data/original/{os.path.basename(datasets["post_harvest_losses"])}')
    else:
        print("\nPost-harvest losses data file not found. Skipping processing.")
    
    # Process value chain data
    if datasets['value_chain']:
        df = process_value_chain(datasets['value_chain'])
        if df is not None:
            processed_datasets['value_chain'] = 'data/cleaned/value_chain_cleaned.csv'
            # Backup original data
            shutil.copy2(datasets['value_chain'], f'data/original/{os.path.basename(datasets["value_chain"])}')
    else:
        print("\nValue chain data file not found. Creating synthetic data.")
        df = process_value_chain(None)
        if df is not None:
            processed_datasets['value_chain'] = 'data/cleaned/value_chain_cleaned.csv'
    
    # Process financial impact data
    if datasets['financial_impact']:
        df = process_financial_data(datasets['financial_impact'])
        if df is not None:
            processed_datasets['financial_impact'] = 'data/cleaned/financial_impact_cleaned.csv'
            # Backup original data
            shutil.copy2(datasets['financial_impact'], f'data/original/{os.path.basename(datasets["financial_impact"])}')
    else:
        print("\nFinancial impact data file not found. Skipping processing.")
    
    # Process nutrient losses data
    if datasets['nutrient_losses']:
        df = process_nutrient_data(datasets['nutrient_losses'])
        if df is not None:
            processed_datasets['nutrient_losses'] = 'data/cleaned/nutrient_losses_cleaned.csv'
            # Backup original data
            shutil.copy2(datasets['nutrient_losses'], f'data/original/{os.path.basename(datasets["nutrient_losses"])}')
    else:
        print("\nNutrient losses data file not found. Skipping processing.")
    
    # Process climate data
    if datasets['climate_data']:
        df = process_climate_data(datasets['climate_data'])
        if df is not None:
            processed_datasets['climate_data'] = 'data/cleaned/climate_data_cleaned.csv'
            # Backup original data
            shutil.copy2(datasets['climate_data'], f'data/original/{os.path.basename(datasets["climate_data"])}')
    else:
        print("\nClimate data file not found. Skipping processing.")
    
    # Update project configuration with processed datasets
    project_config['datasets'] = processed_datasets
    
    # Save project configuration
    with open('project_config.json', 'w') as f:
        json.dump(project_config, f, indent=4)
    
    print("\n" + "="*80)
    print("DATA PREPARATION COMPLETE")
    print("="*80)
    print(f"Processed Datasets: {len(processed_datasets)}")
    for dataset_type, file_path in processed_datasets.items():
        print(f"  - {dataset_type}: {file_path}")
    print("\nProject configuration saved to: project_config.json")
    print("\nNext steps:")
    print("  1. Run the visualizations.py script to create plots")
    print("  2. Check the results/plots directory for generated visualizations")

if __name__ == "__main__":
    main()

SyntaxError: invalid syntax (1467394783.py, line 140)

In [7]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from datetime import datetime
import matplotlib.ticker as ticker

print("=" * 80)
print("NIGERIA POST-HARVEST LOSSES: VISUALIZATIONS")
print("=" * 80)
print(f"Execution Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Set plot style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("deep")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

# Create results directory
os.makedirs('results/plots', exist_ok=True)

# Load project configuration
try:
    with open('project_config.json', 'r') as f:
        config = json.load(f)
    print("Project configuration loaded successfully")
    print(f"Project: {config['project_name']}")
    print(f"Primary focus: {config['primary_focus']}")
    print(f"Secondary focus: {config['secondary_focus']}")
    print(f"Start date: {config['start_date']}")
    print(f"Datasets: {len(config['datasets'])}")
except Exception as e:
    print(f"Error loading project configuration: {str(e)}")
    config = {'datasets': {}}

# Function to format large numbers
def format_large_number(value):
    if pd.isna(value):
        return "N/A"
    
    if value >= 1e9:
        return f"{value/1e9:.1f}B"
    elif value >= 1e6:
        return f"{value/1e6:.1f}M"
    elif value >= 1e3:
        return f"{value/1e3:.1f}K"
    else:
        return f"{value:.1f}"

# Function to create post-harvest losses visualizations
def visualize_post_harvest_losses(file_path=None):
    """
    Create visualizations for post-harvest losses data
    """
    print("\nCreating post-harvest losses visualizations...")
    
    if file_path is None:
        file_path = config['datasets'].get('post_harvest_losses')
    
    if not file_path or not os.path.exists(file_path):
        print(f"  - Post-harvest losses data file not found: {file_path}")
        return
    
    # Load the data
    df = pd.read_csv(file_path)
    print(f"  - Loaded data shape: {df.shape}")
    print(f"  - Columns: {df.columns.tolist()}")
    
    # Check if data has the expected structure
    if 'crop_type' not in df.columns or 'loss_percentage' not in df.columns:
        print("  - Error: Data does not have the expected columns (crop_type, loss_percentage)")
        return
    
    # 1. Create a bar chart of post-harvest losses by crop type
    plt.figure(figsize=(12, 8))
    
    # Calculate average loss by crop type
    crop_losses = df.groupby('crop_type')['loss_percentage'].mean().sort_values(ascending=False)
    
    # Create the plot
    sns.barplot(x=crop_losses.index, y=crop_losses.values)
    plt.title('Average Post-Harvest Losses by Crop Type in Nigeria', fontsize=16)
    plt.xlabel('Crop Type', fontsize=14)
    plt.ylabel('Loss Percentage (%)', fontsize=14)
    plt.xticks(rotation=45, ha='right')
    plt.grid(axis='y', alpha=0.3)
    
    # Add value labels on top of bars
    for i, v in enumerate(crop_losses.values):
        plt.text(i, v + 0.5, f'{v:.1f}%', ha='center', fontsize=12)
    
    plt.tight_layout()
    
    # Save the plot
    output_file = 'results/plots/post_harvest_losses_by_crop.png'
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"  - Saved plot to: {output_file}")
    
    # 2. Create a heatmap of post-harvest losses by crop type and region (if region data available)
    if 'State/Region' in df.columns:
        plt.figure(figsize=(16, 10))
        
        # Create a pivot table
        pivot = df.pivot_table(
            values='loss_percentage',
            index='State/Region',
            columns='crop_type',
            fill_value=0
        )
        
        # Limit to top 15 regions by total loss
        if len(pivot) > 15:
            pivot = pivot.loc[pivot.sum(axis=1).nlargest(15).index]
        
        # Create the heatmap
        sns.heatmap(pivot, annot=True, fmt='.1f', cmap='YlOrRd', linewidths=0.5)
        plt.title('Post-Harvest Losses by Crop Type and Region in Nigeria (%)', fontsize=16)
        plt.xlabel('Crop Type', fontsize=14)
        plt.ylabel('Region', fontsize=14)
        plt.xticks(rotation=45, ha='right')
        
        plt.tight_layout()
        
        # Save the plot
        output_file = 'results/plots/post_harvest_losses_heatmap.png'
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"  - Saved plot to: {output_file}")
    
    print("  - Post-harvest losses visualizations created successfully")

# Function to create value chain visualizations
def visualize_value_chain(file_path=None):
    """
    Create visualizations for value chain data
    """
    print("\nCreating value chain visualizations...")
    
    if file_path is None:
        file_path = config['datasets'].get('value_chain')
    
    if not file_path or not os.path.exists(file_path):
        print(f"  - Value chain data file not found: {file_path}")
        return
    
    # Load the data
    df = pd.read_csv(file_path)
    print(f"  - Loaded data shape: {df.shape}")
    print(f"  - Columns: {df.columns.tolist()}")
    
    # Check if data has the expected structure
    if 'crop_type' not in df.columns or 'stage' not in df.columns or 'loss_percentage' not in df.columns:
        print("  - Error: Data does not have the expected columns (crop_type, stage, loss_percentage)")
        return
    
    # 1. Create a grouped bar chart of losses by stage and crop type
    plt.figure(figsize=(14, 8))
    
    # Create the plot
    ax = sns.barplot(x='stage', y='loss_percentage', hue='crop_type', data=df)
    plt.title('Post-Harvest Losses by Value Chain Stage and Crop Type', fontsize=16)
    plt.xlabel('Value Chain Stage', fontsize=14)
    plt.ylabel('Loss Percentage (%)', fontsize=14)
    plt.xticks(rotation=30, ha='right')
    plt.grid(axis='y', alpha=0.3)
    plt.legend(title='Crop Type', bbox_to_anchor=(1.05, 1), loc='upper left')
    
    # Add value labels on top of bars
    for container in ax.containers:
        ax.bar_label(container, fmt='%.1f%%', padding=3)
    
    plt.tight_layout()
    
    # Save the plot
    output_file = 'results/plots/value_chain_losses_by_stage.png'
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"  - Saved plot to: {output_file}")
    
    # 2. Create a stacked area chart showing cumulative losses across the value chain
    plt.figure(figsize=(14, 8))
    
    # Get list of unique crops and stages
    crops = sorted(df['crop_type'].unique())
    stages = sorted(df['stage'].unique(), key=lambda x: df[df['stage'] == x]['loss_percentage'].mean())
    
    # Create dictionary to store cumulative losses
    cumulative_losses = {crop: [] for crop in crops}
    
    # Calculate cumulative losses for each crop
    for crop in crops:
        cum_loss = 0
        crop_data = df[df['crop_type'] == crop].sort_values('loss_percentage', ascending=False)
        
        for stage in stages:
            stage_loss = crop_data[crop_data['stage'] == stage]['loss_percentage'].values
            if len(stage_loss) > 0:
                cum_loss += stage_loss[0]
            
            cumulative_losses[crop].append(cum_loss)
    
    # Create stacked area chart
    plt.figure(figsize=(12, 8))
    
    for crop in crops:
        plt.plot(stages, cumulative_losses[crop], marker='o', linewidth=2, label=crop)
    
    plt.title('Cumulative Post-Harvest Losses Across Value Chain', fontsize=16)
    plt.xlabel('Value Chain Stage', fontsize=14)
    plt.ylabel('Cumulative Loss (%)', fontsize=14)
    plt.xticks(rotation=30, ha='right')
    plt.grid(alpha=0.3)
    plt.legend(title='Crop Type', bbox_to_anchor=(1.05, 1), loc='upper left')
    
    plt.tight_layout()
    
    # Save the plot
    output_file = 'results/plots/cumulative_value_chain_losses.png'
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"  - Saved plot to: {output_file}")
    
    # 3. Create a heatmap of losses by stage and crop type
    plt.figure(figsize=(12, 8))
    
    # Create pivot table
    pivot = df.pivot_table(
        values='loss_percentage',
        index='stage',
        columns='crop_type',
        fill_value=0
    )
    
    # Create the heatmap
    sns.heatmap(pivot, annot=True, fmt='.1f', cmap='YlOrRd', linewidths=0.5)
    plt.title('Post-Harvest Losses by Value Chain Stage and Crop Type (%)', fontsize=16)
    plt.xlabel('Crop Type', fontsize=14)
    plt.ylabel('Value Chain Stage', fontsize=14)
    
    plt.tight_layout()
    
    # Save the plot
    output_file = 'results/plots/value_chain_heatmap.png'
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"  - Saved plot to: {output_file}")
    
    print("  - Value chain visualizations created successfully")

# Function to create financial impact visualizations
def visualize_financial_impact(file_path=None):
    """
    Create visualizations for financial impact data
    """
    print("\nCreating financial impact visualizations...")
    
    if file_path is None:
        file_path = config['datasets'].get('financial_impact')
    
    if not file_path or not os.path.exists(file_path):
        print(f"  - Financial impact data file not found: {file_path}")
        return
    
    # Load the data
    df = pd.read_csv(file_path)
    print(f"  - Loaded data shape: {df.shape}")
    print(f"  - Columns: {df.columns.tolist()}")
    
    # Check if data has the expected structure
    if 'crop_type' not in df.columns or 'financial_value' not in df.columns:
        print("  - Error: Data does not have the expected columns (crop_type, financial_value)")
        return
    
    # 1. Create a bar chart of financial losses by crop type
    plt.figure(figsize=(12, 8))
    
    # Sort by financial value
    sorted_df = df.sort_values('financial_value', ascending=False)
    
    # Create the plot
    ax = sns.barplot(x='crop_type', y='financial_value', data=sorted_df)
    plt.title('Financial Impact of Post-Harvest Losses by Crop Type', fontsize=16)
    plt.xlabel('Crop Type', fontsize=14)
    plt.ylabel('Financial Loss (USD)', fontsize=14)
    plt.xticks(rotation=45, ha='right')
    plt.grid(axis='y', alpha=0.3)
    
    # Format y-axis with million/billion suffixes
    def millions(x, pos):
        return format_large_number(x)
    
    ax.yaxis.set_major_formatter(ticker.FuncFormatter(millions))
    
    # Add value labels on top of bars
    for i, v in enumerate(sorted_df['financial_value']):
        ax.text(i, v + v*0.02, format_large_number(v), ha='center')
    
    plt.tight_layout()
    
    # Save the plot
    output_file = 'results/plots/financial_impact_by_crop.png'
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"  - Saved plot to: {output_file}")
    
    # 2. Create a pie chart of financial losses by crop type
    plt.figure(figsize=(12, 10))
    
    # Calculate percentage of total loss by crop
    total = sorted_df['financial_value'].sum()
    crop_percentages = sorted_df['financial_value'] / total * 100
    
    # Create the plot
    plt.pie(
        sorted_df['financial_value'],
        labels=sorted_df['crop_type'],
        autopct='%1.1f%%',
        startangle=90,
        shadow=True,
        explode=[0.05] * len(sorted_df),
        textprops={'fontsize': 12}
    )
    plt.title('Share of Financial Losses by Crop Type', fontsize=16)
    plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle
    
    plt.tight_layout()
    
    # Save the plot
    output_file = 'results/plots/financial_impact_pie.png'
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"  - Saved plot to: {output_file}")
    
    # 3. If region data is available, create a bar chart by region
    if 'region' in df.columns and df['region'].nunique() > 1:
        plt.figure(figsize=(12, 8))
        
        # Calculate total financial loss by region
        region_losses = df.groupby('region')['financial_value'].sum().sort_values(ascending=False)
        
        # Create the plot
        ax = sns.barplot(x=region_losses.index, y=region_losses.values)
        plt.title('Financial Impact of Post-Harvest Losses by Region', fontsize=16)
        plt.xlabel('Region', fontsize=14)
        plt.ylabel('Financial Loss (USD)', fontsize=14)
        plt.xticks(rotation=45, ha='right')
        plt.grid(axis='y', alpha=0.3)
        
        # Format y-axis with million/billion suffixes
        ax.yaxis.set_major_formatter(ticker.FuncFormatter(millions))
        
        # Add value labels on top of bars
        for i, v in enumerate(region_losses.values):
            ax.text(i, v + v*0.02, format_large_number(v), ha='center')
        
        plt.tight_layout()
        
        # Save the plot
        output_file = 'results/plots/financial_impact_by_region.png'
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"  - Saved plot to: {output_file}")
    
    print("  - Financial impact visualizations created successfully")

# Function to create nutrient losses visualizations
def visualize_nutrient_losses(file_path=None):
    """
    Create visualizations for nutrient losses data
    """
    print("\nCreating nutrient losses visualizations...")
    
    if file_path is None:
        file_path = config['datasets'].get('nutrient_losses')
    
    if not file_path or not os.path.exists(file_path):
        print(f"  - Nutrient losses data file not found: {file_path}")
        return
    
    # Load the data
    df = pd.read_csv(file_path)
    print(f"  - Loaded data shape: {df.shape}")
    print(f"  - Columns: {df.columns.tolist()}")
    
    # Check if data has the expected structure
    if 'nutrient' not in df.columns or 'crop_type' not in df.columns or 'nutrient_loss' not in df.columns:
        print("  - Error: Data does not have the expected columns (nutrient, crop_type, nutrient_loss)")
        return
    
    # 1. Create a grouped bar chart of nutrient losses by crop type
    plt.figure(figsize=(14, 10))
    
    # Get the top 5 nutrients by total loss
    top_nutrients = df.groupby('nutrient')['nutrient_loss'].sum().nlargest(5).index.tolist()
    
    # Filter data for top nutrients
    filtered_df = df[df['nutrient'].isin(top_nutrients)]
    
    # Create log scale for better visualization if values vary widely
    if filtered_df['nutrient_loss'].max() / filtered_df['nutrient_loss'].min() > 100:
        plt.yscale('log')
        print("  - Using log scale for nutrient losses due to wide value range")
    
    # Create the plot
    ax = sns.barplot(x='crop_type', y='nutrient_loss', hue='nutrient', data=filtered_df)
    plt.title('Top 5 Nutrients Lost Due to Post-Harvest Losses by Crop Type', fontsize=16)
    plt.xlabel('Crop Type', fontsize=14)
    plt.ylabel('Nutrient Loss (log scale)', fontsize=14)
    plt.xticks(rotation=45, ha='right')
    plt.grid(axis='y', alpha=0.3)
    plt.legend(title='Nutrient', bbox_to_anchor=(1.05, 1), loc='upper left')
    
    plt.tight_layout()
    
    # Save the plot
    output_file = 'results/plots/nutrient_losses_by_crop.png'
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"  - Saved plot to: {output_file}")
    
    # 2. Create a heatmap of nutrient losses
    plt.figure(figsize=(14, 10))
    
    # Create pivot table
    pivot = df.pivot_table(
        values='nutrient_loss',
        index='nutrient',
        columns='crop_type',
        aggfunc='sum',
        fill_value=0
    )
    
    # Normalize to make comparison easier across nutrients with different scales
    norm_pivot = pivot.div(pivot.max(axis=1), axis=0)
    
    # Create the heatmap
    sns.heatmap(norm_pivot, annot=False, cmap='YlOrRd', linewidths=0.5)
    plt.title('Relative Nutrient Losses by Crop Type (Normalized)', fontsize=16)
    plt.xlabel('Crop Type', fontsize=14)
    plt.ylabel('Nutrient', fontsize=14)
    
    plt.tight_layout()
    
    # Save the plot
    output_file = 'results/plots/nutrient_losses_heatmap.png'
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"  - Saved plot to: {output_file}")
    
    print("  - Nutrient losses visualizations created successfully")

# Function to create climate data visualizations
def visualize_climate_data(file_path=None):
    """
    Create visualizations for climate data
    """
    print("\nCreating climate data visualizations...")
    
    if file_path is None:
        file_path = config['datasets'].get('climate_data')
    
    if not file_path or not os.path.exists(file_path):
        print(f"  - Climate data file not found: {file_path}")
        return
    
    # Load the data
    df = pd.read_csv(file_path)
    print(f"  - Loaded data shape: {df.shape}")
    print(f"  - Columns: {df.columns.tolist()}")
    
    # Check if data has temperature or precipitation columns
    temp_cols = [col for col in df.columns if 'temp' in col.lower()]
    precip_cols = [col for col in df.columns if 'precip' in col.lower() or 'rain' in col.lower()]
    
    if not temp_cols and not precip_cols:
        print("  - Error: Data does not have temperature or precipitation columns")
        return
    
    # Look for a category/month column
    month_col = None
    for col in df.columns:
        if col.lower() in ['category', 'month', 'time', 'period']:
            month_col = col
            break
    
    if month_col is None:
        # Use the first column as the category/month column
        month_col = df.columns[0]
    
    # 1. Create a line chart with temperature data
    if temp_cols:
        plt.figure(figsize=(12, 8))
        
        # Create the plot with all temperature columns
        for col in temp_cols:
            plt.plot(df[month_col], df[col], marker='o', linewidth=2, label=col)
        
        plt.title('Temperature Trends in Nigeria', fontsize=16)
        plt.xlabel(month_col, fontsize=14)
        plt.ylabel('Temperature (°C)', fontsize=14)
        plt.grid(alpha=0.3)
        plt.legend(title='Measurement', bbox_to_anchor=(1.05, 1), loc='upper left')
        
        # Set reasonable y-axis limits for temperature
        mean_temp = np.mean([df[col].mean() for col in temp_cols])
        plt.ylim(mean_temp - 15, mean_temp + 15)
        
        plt.tight_layout()
        
        # Save the plot
        output_file = 'results/plots/climate_temperature_trends.png'
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"  - Saved plot to: {output_file}")
    
    # 2. Create a combined chart with temperature and precipitation
    if temp_cols and precip_cols:
        plt.figure(figsize=(12, 8))
        
        # Create primary axis for temperature
        ax1 = plt.gca()
        ax1.set_xlabel(month_col, fontsize=14)
        ax1.set_ylabel('Temperature (°C)', fontsize=14, color='red')
        ax1.tick_params(axis='y', colors='red')
        
        # Plot temperature (use the first temperature column)
        temperature = df[temp_cols[0]]
        ax1.plot(df[month_col], temperature, color='red', marker='o', linewidth=2)
        
        # Create secondary axis for precipitation
        ax2 = ax1.twinx()
        ax2.set_ylabel('Precipitation (mm)', fontsize=14, color='blue')
        ax2.tick_params(axis='y', colors='blue')
        
        # Plot precipitation (use the first precipitation column)
        precipitation = df[precip_cols[0]]
        ax2.bar(df[month_col], precipitation, color='blue', alpha=0.6)
        
        plt.title('Temperature and Precipitation in Nigeria', fontsize=16)
        plt.grid(alpha=0.3)
        
        # Add custom legend
        red_line = plt.Line2D([], [], color='red', marker='o', linestyle='-', linewidth=2)
        blue_bar = plt.Rectangle((0,0), 1, 1, color='blue', alpha=0.6)
        plt.legend([red_line, blue_bar], [temp_cols[0], precip_cols[0]], 
                  loc='upper right')
        
        plt.tight_layout()
        
        # Save the plot
        output_file = 'results/plots/climate_temperature_precipitation.png'
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"  - Saved plot to: {output_file}")
    
    print("  - Climate data visualizations created successfully")

# Function to create integrated visualizations combining multiple datasets
def create_integrated_visualizations():
    """
    Create visualizations that integrate data from multiple datasets
    """
    print("\nCreating integrated visualizations...")
    
    # Load all available datasets
    datasets = {}
    
    # Post-harvest losses
    if 'post_harvest_losses' in config['datasets'] and os.path.exists(config['datasets']['post_harvest_losses']):
        datasets['post_harvest_losses'] = pd.read_csv(config['datasets']['post_harvest_losses'])
        print(f"  - Loaded post-harvest losses data: {datasets['post_harvest_losses'].shape}")
    
    # Value chain
    if 'value_chain' in config['datasets'] and os.path.exists(config['datasets']['value_chain']):
        datasets['value_chain'] = pd.read_csv(config['datasets']['value_chain'])
        print(f"  - Loaded value chain data: {datasets['value_chain'].shape}")
    
    # Financial impact
    if 'financial_impact' in config['datasets'] and os.path.exists(config['datasets']['financial_impact']):
        datasets['financial_impact'] = pd.read_csv(config['datasets']['financial_impact'])
        print(f"  - Loaded financial impact data: {datasets['financial_impact'].shape}")
    
    # Check if we have enough datasets for integrated visualizations
    if len(datasets) < 2:
        print("  - Not enough datasets available for integrated visualizations")
        return
    
    # 1. Integrated visualization: Post-harvest losses vs. Financial impact
    if 'post_harvest_losses' in datasets and 'financial_impact' in datasets:
        print("  - Creating integrated visualization of losses vs. financial impact")
        
        # Get common crop types
        ph_crops = set(datasets['post_harvest_losses']['crop_type'].unique())
        fin_crops = set(datasets['financial_impact']['crop_type'].unique())
        common_crops = list(ph_crops.intersection(fin_crops))
        
        if common_crops:
            # Filter data for common crops
            ph_filtered = datasets['post_harvest_losses'][
                datasets['post_harvest_losses']['crop_type'].isin(common_crops)
            ]
            fin_filtered = datasets['financial_impact'][
                datasets['financial_impact']['crop_type'].isin(common_crops)
            ]
            
            # Calculate average loss by crop
            ph_avg = ph_filtered.groupby('crop_type')['loss_percentage'].mean().reset_index()
            
            # Merge datasets
            merged = pd.merge(
                ph_avg,
                fin_filtered[['crop_type', 'financial_value']],
                on='crop_type',
                how='inner'
            )
            
            if len(merged) > 0:
                plt.figure(figsize=(12, 8))
                
                # Create scatter plot
                plt.scatter(
                    merged['loss_percentage'],
                    merged['financial_value'],
                    s=100,
                    alpha=0.7
                )
                
                # Add crop type labels to each point
                for i, row in merged.iterrows():
                    plt.annotate(
                        row['crop_type'],
                        (row['loss_percentage'], row['financial_value']),
                        fontsize=12,
                        xytext=(5, 5),
                        textcoords='offset points'
                    )
                
                plt.title('Relationship Between Post-Harvest Losses and Financial Impact', fontsize=16)
                plt.xlabel('Loss Percentage (%)', fontsize=14)
                plt.ylabel('Financial Loss (USD)', fontsize=14)
                plt.grid(alpha=0.3)
                
                # Format y-axis with million/billion suffixes
                def millions(x, pos):
                    return format_large_number(x)
                
                plt.gca().yaxis.set_major_formatter(ticker.FuncFormatter(millions))
                
                plt.tight_layout()
                
                # Save the plot
                output_file = 'results/plots/integrated_losses_vs_financial.png'
                plt.savefig(output_file, dpi=300, bbox_inches='tight')
                plt.close()
                print(f"  - Saved plot to: {output_file}")
            else:
                print("  - No matching data after merging datasets")
        else:
            print("  - No common crop types found between datasets")
    
    # 2. Integrated visualization: Value chain losses and financial impact
    if 'value_chain' in datasets and 'financial_impact' in datasets:
        print("  - Creating integrated visualization of value chain and financial impact")
        
        # Get common crop types
        vc_crops = set(datasets['value_chain']['crop_type'].unique())
        fin_crops = set(datasets['financial_impact']['crop

SyntaxError: unterminated string literal (detected at line 670) (446974296.py, line 670)