In [1]:
import os
import re
import pandas as pd

def load_h1_data(file_path="anonymized_h1_data.csv"):
    """
    Load anonymized H1 data from a CSV file.

    The first two columns are treated positionally as the segment and size
    columns and are always returned under the canonical names "Segment" and
    "Size", regardless of how they were spelled or cased in the file. Every
    column from the third onward is coerced to a numeric dtype; values that
    cannot be parsed become NaN.

    Args:
        file_path (str): Path to the anonymized CSV file

    Returns:
        pd.DataFrame: The H1 data, or the result of generate_sample_h1_data()
        (defined outside this function) if loading or validation fails

    Note:
        No exception escapes to the caller. Any failure (missing file, fewer
        than two columns, fewer than 10 week columns, ...) is printed and
        replaced by the sample-data fallback.
    """
    try:
        # Load the CSV file
        df = pd.read_csv(file_path)
        print(f"Loaded data from {file_path} with shape: {df.shape}")

        # Clean up column names (strip whitespace)
        df.columns = [str(col).strip() for col in df.columns]

        # We need at least the two leading identifier columns
        if len(df.columns) < 2:
            raise ValueError("Not enough columns in the data")

        actual_first_cols = df.columns[:2]
        if list(actual_first_cols) != ["Segment", "Size"]:
            # Only announce the rename when the headers are genuinely
            # different; a pure case difference ("segment", "SIZE") is
            # normalized silently.
            names_differ = not (
                actual_first_cols[0].lower() == "segment"
                and actual_first_cols[1].lower() == "size"
            )
            if names_differ:
                print(f"Column names don't match expected. Found: {actual_first_cols}")

            # Assign by position rather than through a name -> name mapping:
            # a mapping collapses when both headers are equal after stripping
            # and would also rename any later column sharing one of the names.
            df.columns = ["Segment", "Size", *df.columns[2:]]

            if names_differ:
                print("Renamed first two columns to 'Segment' and 'Size'")

        # Check for week columns (they might have spaces like "Week 1" instead of "Week1")
        week_pattern = re.compile(r"week\s*\d+", re.IGNORECASE)
        week_cols = [col for col in df.columns[2:] if week_pattern.match(col)]

        if len(week_cols) < 10:  # Require at least 10 weeks of data
            print(f"Not enough week columns found. Found: {df.columns[2:10]}")
            raise ValueError("Not enough week columns identified. Need at least 10.")

        # Coerce every column after Segment/Size to a numeric dtype;
        # unparseable entries become NaN instead of raising.
        numeric_cols = df.columns[2:]
        for col in numeric_cols:
            df[col] = pd.to_numeric(df[col], errors='coerce')

        print(f"Successfully loaded and processed H1 data with shape: {df.shape}")
        return df

    except Exception as e:
        # Deliberate best-effort: report the problem and hand back sample
        # data so callers always receive a DataFrame.
        print(f"Error loading H1 data: {e}")
        # Fall back to generating sample data
        print("Falling back to sample data generation")
        return generate_sample_h1_data()