In [None]:
#  [markdown]
# # YOLO Syringe Volume Estimation Analysis
#
# This notebook analyzes the performance of a YOLO-based model for estimating liquid volume in syringes.
# It includes data loading, preprocessing, calculation of various error metrics, statistical tests, and visualizations.

#  [markdown]
# ## 1. Setup and Library Imports

#
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy import stats
import statsmodels.api as sm # Requires the statsmodels package; this import raises ImportError if it is not installed
from statsmodels.formula.api import ols
# from statsmodels.stats.multicomp import pairwise_tukeyhsd # Uncomment if needed for post-hoc tests

# Plotting style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_context("talk") # "paper", "notebook", "talk", "poster"

#  [markdown]
# ## 2. Data Loading and Initial Inspection
#
# The data will be loaded from a CSV file.
# **You should replace `csv_file_path` with the actual path to your CSV file if it's different from "overview.csv".**

#
# Function to load data
def load_data(file_path: str) -> pd.DataFrame:
    """Read the CSV at ``file_path`` into a DataFrame.

    The cell values 'nan', 'NaN', 'corrupted video' and 'Corrupted video'
    are parsed as missing (NaN) in every column. The path being read is
    printed before loading.
    """
    print(f"Loading data from: {file_path}")
    # Markers that should be read as missing values wherever they appear.
    missing_markers = ['nan', 'corrupted video', 'NaN', 'Corrupted video']
    return pd.read_csv(file_path, na_values=missing_markers)

# ----------------------------------------------------------------
# Input location: "overview.csv" is expected next to this script/notebook.
# Point `csv_file_path` at the full path if the file lives elsewhere.
csv_file_path = "overview.csv"
df = load_data(file_path=csv_file_path)
# ----------------------------------------------------------------

print("\nFirst 5 rows of the dataset:")
print(df.head())

# Coerce the measurement columns to numeric dtypes straight after loading;
# entries that cannot be parsed become NaN (errors='coerce').
print("\nConverting relevant columns to numeric types...")
for col in ['target volume', 'estimated volume', 'container weight']:
    if col not in df.columns:
        print(f"Warning: Column '{col}' not found in DataFrame for numeric conversion.")
        continue
    df[col] = pd.to_numeric(df[col], errors='coerce')
    print(f"Column '{col}' Dtype after to_numeric: {df[col].dtype}")


print("\nDataset information (after initial numeric conversion):")
df.info()

# describe() is only called when there is something of the requested dtype to describe.
print("\nSummary statistics for numeric columns (initial - after to_numeric):")
if df.select_dtypes(include=np.number).empty:
    print("No numeric columns found to describe after initial conversion attempt.")
else:
    print(df.describe(include=np.number))

print("\nSummary statistics for object columns (initial):")
if df.select_dtypes(include='object').empty:
    print("No object columns found to describe.")
else:
    print(df.describe(include='object'))


#  [markdown]
# ## 3. Data Cleaning and Preprocessing

#
# Rows without a 'person', 'target place' or 'syringe' value cannot be
# assigned to an experimental condition, so they are removed here.
# Missing 'target volume' values are left in place; the individual analysis
# steps below handle them.
df.dropna(subset=['person', 'target place', 'syringe'], inplace=True)


# Derive the numeric syringe size from the first run of digits in the
# 'syringe' label; fall back to an all-NaN column when the label is absent.
if 'syringe' not in df.columns:
    print("Warning: 'syringe' column not found for 'syringe_size_ml' extraction.")
    df['syringe_size_ml'] = np.nan
else:
    df['syringe_size_ml'] = df['syringe'].str.extract(r'(\d+)').astype(float)


# Define container base weights (empty and with lid)
# Reference container weights per target place: 'empty' is the bare container,
# 'lid' is the container with its lid on (presumably grams - TODO confirm units).
container_specs = {
    'throat': {'empty': 4.8, 'lid': 8.0},
    'arm': {'empty': 4.9, 'lid': 8.1},
    'foot': {'empty': 4.9, 'lid': 8.1}
}

# Function to determine the true container weight from target place, notes and person
def get_true_container_weight(row: pd.Series) -> float:
    """Return the reference container weight that applies to one reading.

    The weight is looked up in ``container_specs`` by the row's
    'target place'. Whether the 'lid' or the 'empty' weight is used depends
    on the row's 'notes' (and, for two specific people, on 'person'):

    * notes containing 'with lid' (case-insensitive) -> lid weight;
    * person1/person2 with notes that mention neither 'with lid' nor
      'no lid' -> lid weight;
    * everything else, including missing notes -> empty weight.

    Args:
        row: One DataFrame row. Reads 'target place', 'notes' and 'person';
            a missing 'notes' or 'person' entry is treated like a missing
            value instead of raising ``KeyError``.

    Returns:
        The container weight as a float, or NaN when 'target place' is
        missing or not a key of ``container_specs``.
    """
    place = row.get('target place')

    if pd.isna(place) or place not in container_specs:
        return np.nan

    # .get() keeps this usable on frames without a 'notes' column, matching
    # the rest of the notebook, which tolerates that column being absent.
    notes = row.get('notes')

    has_lid = False
    if pd.notna(notes):
        notes_text = str(notes).lower()
        if 'with lid' in notes_text:
            has_lid = True
        # Special handling for person1 and person2: notes that say nothing
        # about the lid are taken to mean the lid was on.
        # NOTE(review): rows from these two people with *missing* notes still
        # fall through to the empty weight - confirm that is intended.
        elif row.get('person') in ('person1', 'person2') and 'no lid' not in notes_text:
            has_lid = True

    return container_specs[place]['lid' if has_lid else 'empty']

df['true_container_weight'] = df.apply(get_true_container_weight, axis=1)

# Liquid weight = measured container weight minus the reference container
# weight. The volume column is a straight copy of that weight, i.e. a density
# of 1 g/mL is assumed. Both inputs must be present for the subtraction.
weight_inputs_present = {'container weight', 'true_container_weight'}.issubset(df.columns)
if not weight_inputs_present:
    print("Warning: 'container weight' or 'true_container_weight' missing for liquid weight calculation.")
    df['actual_liquid_weight_measured'] = np.nan
    df['actual_liquid_volume_from_weight'] = np.nan
else:
    df['actual_liquid_weight_measured'] = df['container weight'] - df['true_container_weight']
    df['actual_liquid_volume_from_weight'] = df['actual_liquid_weight_measured'] # Assuming density 1 g/mL


# Derive boolean issue flags from the free-text 'notes' column. Each flag is a
# regex search over the lower-cased notes; 'problematic_reading' is their OR.
if 'notes' not in df.columns:
    print("Warning: 'notes' column missing. Cannot create flags for problematic readings.")
    # Without notes nothing can be flagged, so every flag defaults to False.
    for flag_name in ('is_occluded', 'is_bad_reading', 'is_manual_extraction',
                      'other_issues', 'problematic_reading'):
        df[flag_name] = False
else:
    df['notes_lower'] = df['notes'].str.lower().fillna('')
    # Flag column -> pattern searched for in 'notes_lower'.
    issue_patterns = {
        'is_occluded': 'occlusion|covering syringe',
        'is_bad_reading': 'bad reading|horrible reading',
        'is_manual_extraction': 'manual extraction',
        'other_issues': 'dissaperars|initial condition not met|no syringe detected',
    }
    for flag_name, pattern in issue_patterns.items():
        df[flag_name] = df['notes_lower'].str.contains(pattern)
    df['problematic_reading'] = (
        df['is_occluded']
        | df['is_bad_reading']
        | df['is_manual_extraction']
        | df['other_issues']
    )


# Compare the intended ('target place') and detected ('estimated place')
# locations row by row and report the share of matches among rows that have
# an estimated place.
if not {'target place', 'estimated place'}.issubset(df.columns):
    print("Warning: 'target place' or 'estimated place' columns missing for place match analysis.")
else:
    df['place_match'] = df['target place'] == df['estimated place']
    print("\n--- Place Estimation Match (Target vs. Estimated) ---")
    rows_with_estimate = df[pd.notna(df['estimated place'])]
    if rows_with_estimate.empty:
        print("No non-NaN 'estimated place' data for place match analysis.")
    else:
        print(rows_with_estimate['place_match'].value_counts(normalize=True, dropna=False))


print("\nDataset after cleaning and preprocessing (info):")
df.info()
print("\nSummary statistics for numeric columns (processed):")
numeric_cols_for_summary = ['target volume', 'estimated volume', 'container weight',
                            'true_container_weight', 'actual_liquid_volume_from_weight', 'syringe_size_ml']

# Keep only the requested columns that exist and actually hold numeric data.
existing_numeric_cols = []
for candidate in numeric_cols_for_summary:
    if candidate in df.columns and pd.api.types.is_numeric_dtype(df[candidate]):
        existing_numeric_cols.append(candidate)

if not existing_numeric_cols:
    print("No existing numeric columns for final summary statistics.")
else:
    print(df[existing_numeric_cols].describe())


# Break down the rows that have no estimated volume: where they occur and
# which issue flags they carry.
if 'estimated volume' not in df.columns:
    print("Warning: 'estimated volume' column not found for NaN analysis.")
else:
    nan_estimated_df = df[df['estimated volume'].isna()]
    print("\n--- Analysis of Rows with NaN Estimated Volume (Post-Preprocessing) ---")
    if nan_estimated_df.empty:
        print("No rows with NaN estimated volume found after preprocessing.")
    else:
        print(f"Total rows with NaN estimated volume: {len(nan_estimated_df)}")
        if 'target place' in nan_estimated_df:
            print("Counts by 'target place':\n", nan_estimated_df['target place'].value_counts(dropna=False))
        if 'syringe' in nan_estimated_df:
            print("\nCounts by 'syringe':\n", nan_estimated_df['syringe'].value_counts(dropna=False))
        if 'notes_lower' in nan_estimated_df:
            # Only the most frequent notes are shown (head of the value counts).
            print("\nCounts from 'notes_lower' (indicative):\n", nan_estimated_df['notes_lower'].value_counts(dropna=False).head())

        # Number of NaN rows carrying each issue flag (sum of booleans).
        problem_flags_cols = ['is_occluded', 'is_bad_reading', 'is_manual_extraction', 'other_issues']
        existing_problem_flags = [col for col in problem_flags_cols if col in nan_estimated_df.columns]
        if existing_problem_flags:
            print("\nProblematic flags for these NaN rows:\n", nan_estimated_df[existing_problem_flags].sum())


#  [markdown]
# ## 4. Exploratory Data Analysis (EDA)

#
# One histogram (with KDE overlay) per volume column that exists and has at
# least one non-missing value; the panels share a single row.
key_volume_cols = ['target volume', 'estimated volume', 'actual_liquid_volume_from_weight']
existing_key_volume_cols = [col for col in key_volume_cols if col in df.columns and df[col].notna().any()]

n_panels = len(existing_key_volume_cols)
if n_panels == 0:
    print("No key volume columns with data available for histogram plotting.")
else:
    fig, axes = plt.subplots(1, n_panels, figsize=(7 * n_panels, 6))
    if n_panels == 1:
        axes = [axes] # a single panel comes back as a bare Axes; wrap it so it can be iterated

    for ax, col_name in zip(axes, existing_key_volume_cols):
        hist_ax = sns.histplot(df[col_name].dropna(), kde=True, ax=ax, bins=10)
        hist_ax.set_title(f'{col_name.replace("_", " ").title()} Distribution')
    plt.tight_layout()
    plt.savefig("volume_distributions_histograms.png", dpi=300, bbox_inches='tight')
    plt.show()

# Scatter plot: Target Volume vs. Actual Liquid Volume from Weight
# Drawn only when both columns exist and each has at least one non-missing
# value; otherwise a skip message is printed. The figure is saved to
# "target_vs_weight_volume_scatter.png" and then shown.
if 'target volume' in df.columns and 'actual_liquid_volume_from_weight' in df.columns and \
   df['target volume'].notna().any() and df['actual_liquid_volume_from_weight'].notna().any():
    plt.figure(figsize=(10, 8))
    # Colour by target place and vary the marker by person when those columns exist.
    hue_col = 'target place' if 'target place' in df.columns else None
    style_col = 'person' if 'person' in df.columns else None
    sns.scatterplot(x='target volume', y='actual_liquid_volume_from_weight', data=df, hue=hue_col, style=style_col, s=100)

    # The identity line spans the overall min/max across both columns, using
    # only rows where both values are present.
    temp_df_for_plot_limits = df[['target volume', 'actual_liquid_volume_from_weight']].dropna()
    if not temp_df_for_plot_limits.empty:
        max_val_data = temp_df_for_plot_limits.max().max()
        min_val_data = temp_df_for_plot_limits.min().min()
        plt.plot([min_val_data, max_val_data], [min_val_data, max_val_data],
                 'k--', lw=2, label='Identity Line')
    plt.title('Target Volume vs. Volume Calculated from Weight')
    plt.xlabel('Target Volume (mL)')
    plt.ylabel('Actual Liquid Volume from Weight (mL)')
    # The legend is placed outside the axes, to the right of the plot.
    if hue_col or style_col: plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("target_vs_weight_volume_scatter.png", dpi=300, bbox_inches='tight')
    plt.show()
else:
    print("Skipping 'Target Volume vs. Volume from Weight' scatter plot due to missing data.")

#  [markdown]
# ## 5. YOLO Volume Estimation Performance Analysis
#
# Here, we analyze the performance of the YOLO model by comparing `estimated volume` against `target volume`.
# We will first analyze all valid readings, then consider filtering out problematic ones.

#
# Build the two evaluation frames:
#   analysis_df - every row that has both a target and an estimated volume,
#                 with the derived error columns added;
#   clean_df    - the same rows minus those flagged as problematic.
# Both are empty frames when the required columns are missing or no row qualifies.
volume_cols_present = 'target volume' in df.columns and 'estimated volume' in df.columns
if volume_cols_present:
    analysis_df = df.dropna(subset=['estimated volume', 'target volume']).copy()

    if analysis_df.empty:
        clean_df = pd.DataFrame() # Empty if analysis_df is empty
    else:
        # Signed error is estimate minus target; positive means over-estimation.
        analysis_df['error'] = analysis_df['estimated volume'] - analysis_df['target volume']
        analysis_df['abs_error'] = np.abs(analysis_df['error'])
        # Percentage error is left as NaN where the target volume is zero.
        relative_error_pct = (analysis_df['error'] / analysis_df['target volume']) * 100
        analysis_df['percentage_error'] = np.where(analysis_df['target volume'] != 0, relative_error_pct, np.nan)
        analysis_df['abs_percentage_error'] = np.abs(analysis_df['percentage_error'])

        # Error expressed as a fraction of the syringe size, when that is known.
        if 'syringe_size_ml' not in analysis_df.columns:
            analysis_df['error_normalized_by_syringe_capacity'] = np.nan
            analysis_df['abs_error_normalized_by_syringe_capacity'] = np.nan
        else:
            analysis_df['error_normalized_by_syringe_capacity'] = analysis_df['error'] / analysis_df['syringe_size_ml']
            analysis_df['abs_error_normalized_by_syringe_capacity'] = np.abs(analysis_df['error_normalized_by_syringe_capacity'])

        if 'problematic_reading' not in analysis_df.columns:
            # No flag column (e.g. no 'notes'): nothing can be filtered out.
            print("Warning: 'problematic_reading' flag not available. 'clean_df' will be a copy of 'analysis_df'.")
            clean_df = analysis_df.copy()
        else:
            clean_df = analysis_df[~analysis_df['problematic_reading']].copy()
else:
    print("Critical Error: 'target volume' or 'estimated volume' columns are missing. Cannot proceed with performance analysis.")
    # Empty frames let the later, emptiness-guarded sections skip themselves.
    analysis_df = pd.DataFrame()
    clean_df = pd.DataFrame()

print(f"Total valid readings for analysis (non-NaN target & estimated): {len(analysis_df)}")
print(f"Number of 'clean' readings (no major issues noted): {len(clean_df)}")
if 'problematic_reading' in analysis_df.columns:
    flagged_rows = analysis_df[analysis_df['problematic_reading']]
    print(f"Number of 'problematic' readings: {len(flagged_rows)}")


def calculate_metrics(y_true: pd.Series, y_pred: pd.Series, dataset_name: str = "Overall") -> "dict | None":
    """Compute, print and return agreement metrics between target and estimate.

    Args:
        y_true: Reference (target) values.
        y_pred: Estimated values. The two series are aligned by index label,
            so ``y_pred`` must contain every non-missing label of ``y_true``.
        dataset_name: Label used in the printed output and stored under the
            ``"Dataset"`` key of the result.

    Returns:
        A dict with the keys "Dataset", "MAE", "MSE", "RMSE", "R2",
        "MAPE (%)", "Mean Error (Bias)", "Pearson Correlation", "Lin's CCC"
        and "Count", or ``None`` (after printing a message) when fewer than
        two paired, non-missing observations are available.

    Notes:
        * "MAPE (%)" is averaged over observations with a non-zero target
          only and is NaN when every target is zero. Percentage error is
          undefined for a zero target; substituting machine epsilon for the
          denominator would inflate the mean to the order of 1e16 %.
        * "Mean Error (Bias)" is estimate minus target.
        * Lin's CCC uses population moments (``np.std`` with its default
          ``ddof=0``) and is NaN when either series is constant.
    """
    if len(y_true) < 2 or len(y_pred) < 2 or y_true.isnull().all() or y_pred.isnull().all():
        print(f"Not enough valid data (N={len(y_true.dropna())}) for {dataset_name} metrics calculation.")
        return None

    # Keep only the labels where both the target and the estimate are present.
    y_true_clean = y_true.dropna()
    y_pred_clean = y_pred.loc[y_true_clean.index].dropna()
    y_true_clean = y_true_clean.loc[y_pred_clean.index]

    if len(y_true_clean) < 2:
        print(f"Not enough aligned valid data points (N={len(y_true_clean)}) for {dataset_name} metrics.")
        return None

    mae = mean_absolute_error(y_true_clean, y_pred_clean)
    mse = mean_squared_error(y_true_clean, y_pred_clean)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true_clean, y_pred_clean)
    mean_error = np.mean(y_pred_clean - y_true_clean)

    # MAPE over non-zero targets only (see Notes in the docstring).
    nonzero_target = y_true_clean != 0
    if nonzero_target.any():
        relative_abs_error = np.abs(
            (y_true_clean[nonzero_target] - y_pred_clean[nonzero_target]) / y_true_clean[nonzero_target]
        )
        mape = np.mean(relative_abs_error) * 100
    else:
        mape = np.nan

    try:
        pearson_corr, _ = stats.pearsonr(y_true_clean, y_pred_clean)
        sd_true = np.std(y_true_clean)
        sd_pred = np.std(y_pred_clean)
        mean_true_val = np.mean(y_true_clean)
        mean_pred_val = np.mean(y_pred_clean)

        # A constant series has no defined correlation, hence no CCC either.
        if sd_true == 0 or sd_pred == 0 or np.isnan(pearson_corr):
            ccc = np.nan
        else:
            ccc = (2 * pearson_corr * sd_true * sd_pred) / (sd_true**2 + sd_pred**2 + (mean_true_val - mean_pred_val)**2)
    except Exception as e:
        # Best effort: a failure here must not discard the error metrics
        # already computed above, so report it and fall back to NaN.
        print(f"Could not calculate correlation for {dataset_name}: {e}")
        ccc = np.nan
        pearson_corr = np.nan

    metrics = {
        "Dataset": dataset_name,
        "MAE": mae,
        "MSE": mse,
        "RMSE": rmse,
        "R2": r2,
        "MAPE (%)": mape,
        "Mean Error (Bias)": mean_error,
        "Pearson Correlation": pearson_corr,
        "Lin's CCC": ccc,
        "Count": len(y_true_clean)
    }

    print(f"\n--- Performance Metrics for {dataset_name} ---")
    for k, v in metrics.items():
        if isinstance(v, (int, float)):
            print(f"{k}: {v:.3f}")
        else:
            print(f"{k}: {v}")
    return metrics

# Collects one metrics dict per evaluated subset; later sections append to it.
all_metrics_summary = []
volume_columns = {'target volume', 'estimated volume'}

if not analysis_df.empty and volume_columns.issubset(analysis_df.columns):
    overall_metrics = calculate_metrics(
        analysis_df['target volume'], analysis_df['estimated volume'], "All Valid Data"
    )
    # calculate_metrics returns None when there was too little data.
    if overall_metrics is not None:
        all_metrics_summary.append(overall_metrics)

if not clean_df.empty and volume_columns.issubset(clean_df.columns):
    clean_metrics = calculate_metrics(
        clean_df['target volume'], clean_df['estimated volume'], "Clean Data (No Major Issues)"
    )
    if clean_metrics is not None:
        all_metrics_summary.append(clean_metrics)


# [markdown]
# ### 5.1. Visualizations: Scatter Plot and Bland-Altman Plot

#
def plot_scatter_estimated_vs_target(df_subset: pd.DataFrame, title_suffix: str, filename_suffix: str) -> None:
    """Plot estimated against target volume with identity and regression lines.

    Points are coloured by 'target place' and styled by 'syringe_size_ml'
    when those columns exist. The figure is saved as
    ``scatter_est_vs_target_{filename_suffix}.png`` and then shown.

    Args:
        df_subset: Frame with 'target volume' and 'estimated volume' columns.
        title_suffix: Text appended (in parentheses) to the plot title and
            used in printed messages.
        filename_suffix: Text inserted into the output file name.

    Nothing is plotted (a message is printed instead) when fewer than two
    rows have both volumes. The regression line is omitted, with a message,
    when all target volumes are identical, because ``stats.linregress``
    raises ``ValueError`` for a constant x.
    """
    if df_subset.empty or len(df_subset.dropna(subset=['target volume', 'estimated volume'])) < 2:
        print(f"No data or insufficient data to plot for {title_suffix}")
        return

    plt.figure(figsize=(10, 8))
    hue_col = 'target place' if 'target place' in df_subset.columns else None
    style_col = 'syringe_size_ml' if 'syringe_size_ml' in df_subset.columns else None

    sns.scatterplot(x='target volume', y='estimated volume', data=df_subset, hue=hue_col, style=style_col, s=120, alpha=0.8)

    # The guard above guarantees at least two complete rows from here on.
    temp_df = df_subset[['target volume', 'estimated volume']].dropna()
    min_val = min(temp_df['target volume'].min(), temp_df['estimated volume'].min())
    max_val = max(temp_df['target volume'].max(), temp_df['estimated volume'].max())
    plt.plot([min_val, max_val], [min_val, max_val], 'k--', lw=2, label='Identity Line (y=x)')

    # A least-squares fit needs some spread in x.
    if temp_df['target volume'].nunique() > 1:
        slope, intercept, r_value, _p_value, _std_err = stats.linregress(temp_df['target volume'], temp_df['estimated volume'])
        plt.plot(temp_df['target volume'], intercept + slope * temp_df['target volume'], 'r-', label=f'Fit: y={slope:.2f}x+{intercept:.2f}\n$R^2={r_value**2:.2f}$')
    else:
        print(f"Skipping regression fit for {title_suffix}: all target volumes are identical.")

    plt.title(f'Estimated Volume vs. Target Volume ({title_suffix})', fontsize=16)
    plt.xlabel('Target Volume (mL)', fontsize=14)
    plt.ylabel('Estimated Volume (YOLO) (mL)', fontsize=14)
    if hue_col or style_col: plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f"scatter_est_vs_target_{filename_suffix}.png", dpi=300, bbox_inches='tight')
    plt.show()

# Draw the scatter plot once for all valid readings and once for the clean subset.
if not analysis_df.empty:
    plot_scatter_estimated_vs_target(analysis_df, "All Valid Data", "all_valid")
if not clean_df.empty:
    plot_scatter_estimated_vs_target(clean_df, "Clean Data", "clean")

def plot_bland_altman(df_subset: pd.DataFrame, title_suffix: str, filename_suffix: str):
    """Draw a Bland-Altman (difference vs. mean) plot of estimate against target.

    The difference is estimated minus target volume. Horizontal lines mark
    the mean difference and mean +/- 1.96 standard deviations of the
    differences. The figure is saved as ``bland_altman_{filename_suffix}.png``
    and then shown. With fewer than two rows holding both volumes, a message
    is printed and nothing is drawn.
    """
    paired_rows = df_subset.dropna(subset=['target volume', 'estimated volume'])
    if len(paired_rows) < 2 or paired_rows.empty:
        print(f"No data or insufficient data for Bland-Altman plot for {title_suffix}")
        return

    y_true = paired_rows['target volume']
    y_pred = paired_rows['estimated volume']

    # Per-reading difference and the pairwise mean it is plotted against.
    diff = y_pred - y_true
    mean_val = (y_true + y_pred) / 2

    mean_diff = np.mean(diff)
    std_diff = np.std(diff)
    upper_limit = mean_diff + 1.96 * std_diff
    lower_limit = mean_diff - 1.96 * std_diff

    plt.figure(figsize=(12, 7))
    hue_col = 'target place' if 'target place' in paired_rows.columns else None
    style_col = 'syringe_size_ml' if 'syringe_size_ml' in paired_rows.columns else None
    sns.scatterplot(x=mean_val, y=diff, data=paired_rows, hue=hue_col, style=style_col, s=120, alpha=0.8)

    # Mean difference first, then the upper and lower limits.
    plt.axhline(mean_diff, color='black', linestyle='-', label=f'Mean Diff: {mean_diff:.2f}', lw=2)
    plt.axhline(upper_limit, color='red', linestyle='--', label=f'+1.96 SD: {upper_limit:.2f}', lw=1.5)
    plt.axhline(lower_limit, color='red', linestyle='--', label=f'-1.96 SD: {lower_limit:.2f}', lw=1.5)

    plt.title(f'Bland-Altman Plot ({title_suffix})', fontsize=16)
    plt.xlabel('Mean of Target and Estimated Volumes (mL)', fontsize=14)
    plt.ylabel('Difference (Estimated - Target Volume) (mL)', fontsize=14)
    if hue_col or style_col:
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f"bland_altman_{filename_suffix}.png", dpi=300, bbox_inches='tight')
    plt.show()

# Draw the Bland-Altman plot once for all valid readings and once for the clean subset.
if not analysis_df.empty:
    plot_bland_altman(analysis_df, "All Valid Data", "all_valid")
if not clean_df.empty:
    plot_bland_altman(clean_df, "Clean Data", "clean")


#  [markdown]
# ## 6. Analysis by Factors
#
# Analyzing performance based on different experimental factors like target place, syringe type, and person.
# We generally use `clean_df` for this factor analysis to reduce noise from known problematic readings.

#
# Pick the frame used for the per-factor breakdown and remember which one it
# was. The label is tracked explicitly because factor_analysis_df is always a
# copy, so an identity comparison against clean_df can never be True.
if not clean_df.empty:
    factor_analysis_df = clean_df.copy()
    factor_data_label = 'Clean Data'
elif not analysis_df.empty: # Fallback to analysis_df if clean_df is empty
    print("Clean_df is empty, using analysis_df (all valid data) for factor analysis.")
    factor_analysis_df = analysis_df.copy()
    factor_data_label = 'All Valid Data'
else:
    factor_analysis_df = pd.DataFrame() # Empty df
    factor_data_label = 'All Valid Data'

if factor_analysis_df.empty or 'target volume' not in factor_analysis_df.columns:
    print("Skipping factor analysis as no suitable dataframe or 'target volume' column is available.")
else:
    # Bin the target volume into ranges so it can be used as a grouping factor.
    # pd.cut needs a numeric column with at least one non-missing value.
    if pd.api.types.is_numeric_dtype(factor_analysis_df['target volume']) and factor_analysis_df['target volume'].notna().any():
        bins = [0, 1, 3, 5, 10, 15, 20, np.inf]
        labels = ['0-1mL', '1-3mL', '3-5mL', '5-10mL', '10-15mL', '15-20mL', '>20mL']
        # right=False: each bin includes its lower edge and excludes its upper edge.
        factor_analysis_df['volume_range_target'] = pd.cut(factor_analysis_df['target volume'], bins=bins, labels=labels, right=False)
    else:
        factor_analysis_df['volume_range_target'] = np.nan # Add column as NaN if cannot cut


    factors = ['target place', 'syringe_size_ml', 'person', 'volume_range_target']
    error_types_to_plot = ['error', 'abs_error', 'abs_percentage_error', 'error_normalized_by_syringe_capacity']
    existing_factors = [f for f in factors if f in factor_analysis_df.columns]


    for factor in existing_factors:
        if factor_analysis_df[factor].nunique(dropna=False) < 1:
            print(f"\nSkipping factor '{factor}' due to no unique values or all NaNs.")
            continue

        # --- Metrics per level of the factor ---
        print(f"\n--- Metrics by {factor} (using {factor_data_label}) ---")
        if factor_analysis_df[factor].notna().any():
            grouped = factor_analysis_df.groupby(factor, observed=False)

            for name, group_df in grouped:
                # At least two complete rows are needed for the metrics.
                if len(group_df.dropna(subset=['target volume', 'estimated volume'])) > 1:
                    group_metrics = calculate_metrics(group_df['target volume'], group_df['estimated volume'], f"{factor}: {name}")
                    if group_metrics: all_metrics_summary.append(group_metrics)
                else:
                    print(f"Not enough data for {factor}: {name} (count: {len(group_df)})")
        else:
            print(f"Factor '{factor}' contains all NaN values, skipping grouping.")


        # --- Box plots: one panel per error type that exists and has data ---
        if factor_analysis_df[factor].nunique(dropna=True) > 0:
            valid_error_types_to_plot = [err for err in error_types_to_plot if err in factor_analysis_df.columns and factor_analysis_df[err].notna().any()]

            if not valid_error_types_to_plot:
                print(f"No valid error types with data to plot for factor '{factor}'.")
                continue

            fig, axes = plt.subplots(1, len(valid_error_types_to_plot), figsize=(6 * len(valid_error_types_to_plot), 6))
            if len(valid_error_types_to_plot) == 1: axes = [axes] # wrap a single Axes so it can be indexed

            for i, err_type in enumerate(valid_error_types_to_plot):
                sns.boxplot(x=factor, y=err_type, data=factor_analysis_df, ax=axes[i], showfliers=True)
                axes[i].set_title(f'{err_type.replace("_", " ").title()} by {factor.replace("_", " ").title()}', fontsize=14)
                # Tilt the tick labels only when there are many levels.
                axes[i].tick_params(axis='x', rotation=45 if factor_analysis_df[factor].nunique(dropna=True) > 4 else 0, labelsize=10)
                axes[i].tick_params(axis='y', labelsize=10)
                axes[i].set_xlabel(factor.replace("_", " ").title(), fontsize=12)
                axes[i].set_ylabel(err_type.replace("_", " ").title(), fontsize=12)
            plt.suptitle(f"Error Distributions by {factor.replace('_', ' ').title()}", fontsize=18, y=1.03)
            plt.tight_layout(rect=[0, 0, 1, 0.97])
            plt.savefig(f"error_boxplot_by_{factor}.png", dpi=300, bbox_inches='tight')
            plt.show()
            # One figure is created per factor; release it once it has been saved and shown.
            plt.close(fig)

    # FacetGrid: one error histogram per target place
    if 'target place' in factor_analysis_df.columns and \
       factor_analysis_df['target place'].nunique(dropna=True) > 0 and \
       'error' in factor_analysis_df.columns and factor_analysis_df['error'].notna().any():
        g = sns.FacetGrid(factor_analysis_df.dropna(subset=['error', 'target place']), col="target place", col_wrap=3, sharex=False, sharey=False, height=5, aspect=1.2)
        g.map(sns.histplot, "error", kde=True, bins=10)
        g.set_titles("Error Distribution for {col_name}", size=14)
        g.set_axis_labels("Error (mL)", "Count", fontsize=12)
        plt.suptitle("Detailed Error Distribution by Target Place", fontsize=18, y=1.03)
        plt.tight_layout(rect=[0, 0, 1, 0.97])
        plt.savefig("error_hist_by_target_place.png", dpi=300, bbox_inches='tight')
        plt.show()

#  [markdown]
# ## 7. Impact of Noted Issues
#
# Compare performance for readings with and without specific issues noted (e.g., occlusion).
# Uses 'analysis_df' to ensure all data with issues are captured.

#
# Compare metrics for readings with and without specific noted issues, then
# list every reading flagged as problematic. Runs only when analysis_df has
# rows and both volume columns; each metrics dict that is produced is
# appended to all_metrics_summary.
if not analysis_df.empty and 'target volume' in analysis_df.columns and 'estimated volume' in analysis_df.columns:
    # Impact of Occlusion
    if 'is_occluded' in analysis_df.columns:
        occluded_df = analysis_df[analysis_df['is_occluded'] == True]
        not_occluded_df = analysis_df[analysis_df['is_occluded'] == False]

        print("\n--- Impact of Occlusion ---")
        occlusion_metrics = calculate_metrics(occluded_df['target volume'], occluded_df['estimated volume'], "Occluded Readings")
        non_occlusion_metrics = calculate_metrics(not_occluded_df['target volume'], not_occluded_df['estimated volume'], "Non-Occluded Readings (may include other issues)")
        
        # calculate_metrics returns None when a subset has too little data.
        if occlusion_metrics: all_metrics_summary.append(occlusion_metrics)
        if non_occlusion_metrics: all_metrics_summary.append(non_occlusion_metrics)
    else:
        print("Skipping occlusion impact analysis: 'is_occluded' column missing.")

    # Impact of 'Bad Readings'
    if 'is_bad_reading' in analysis_df.columns:
        bad_readings_df = analysis_df[analysis_df['is_bad_reading'] == True]
        
        print("\n--- Impact of 'Bad Readings' (as noted) ---")
        bad_reading_metrics = calculate_metrics(bad_readings_df['target volume'], bad_readings_df['estimated volume'], "Flagged Bad Readings")
        if bad_reading_metrics: all_metrics_summary.append(bad_reading_metrics)

        # The comparison baseline is clean_df when it has rows; otherwise it
        # is every reading not flagged as a bad reading.
        # NOTE(review): the clean_df metrics appended here are computed from
        # the same columns as the earlier "Clean Data (No Major Issues)"
        # entry, so the summary list gets a second row under another label.
        if not clean_df.empty: # Compare with clean_df if available
             clean_for_bad_comparison_metrics = calculate_metrics(clean_df['target volume'], clean_df['estimated volume'], "Clean Readings (for Bad Reading comparison)")
             if clean_for_bad_comparison_metrics: all_metrics_summary.append(clean_for_bad_comparison_metrics)
        else: # Fallback if clean_df is empty
            good_readings_for_comparison_df = analysis_df[(analysis_df['is_bad_reading'] == False)]
            good_reading_metrics_general = calculate_metrics(good_readings_for_comparison_df['target volume'], good_readings_for_comparison_df['estimated volume'], "Not Flagged as Bad (may include other issues)")
            if good_reading_metrics_general: all_metrics_summary.append(good_reading_metrics_general)
    else:
        print("Skipping bad reading impact analysis: 'is_bad_reading' column missing.")


    # Display problematic readings details
    print("\n--- Details of All Problematic Readings (from analysis_df) ---")
    if 'problematic_reading' in analysis_df.columns and analysis_df['problematic_reading'].any():
        # Ensure error columns are present (this adds them to analysis_df in place when missing)
        if 'error' not in analysis_df.columns:
            analysis_df['error'] = analysis_df['estimated volume'] - analysis_df['target volume']
        if 'abs_percentage_error' not in analysis_df.columns:
             # Zero targets are replaced by machine epsilon in the denominator here.
             analysis_df['abs_percentage_error'] = np.abs((analysis_df['error'] / np.where(analysis_df['target volume'] == 0, np.finfo(float).eps, analysis_df['target volume'])) * 100)

        # Only the listed columns that actually exist are printed.
        problematic_cols_to_show = ['person', 'target volume', 'estimated volume', 'target place', 'syringe', 'notes',
                                    'error', 'abs_percentage_error', 'is_occluded', 'is_bad_reading',
                                    'is_manual_extraction', 'other_issues']
        existing_problematic_cols = [col for col in problematic_cols_to_show if col in analysis_df.columns]
        problematic_details_df = analysis_df[analysis_df['problematic_reading']]

        if not problematic_details_df.empty and existing_problematic_cols:
            # to_string() prints every flagged row without truncation.
            print(problematic_details_df[existing_problematic_cols].to_string())
        else:
            print("No problematic readings found or relevant columns missing for details.")
    else:
        print("No problematic readings found based on current flags in analysis_df or 'problematic_reading' column missing.")

#  [markdown]
# ## 8. Statistical Significance (Optional but Recommended)

#
# Paired significance test on the clean data: a Shapiro-Wilk test on the
# errors decides between a paired t-test (p > 0.05) and a Wilcoxon
# signed-rank test (otherwise). Every skip path prints its reason.
if not clean_df.empty and 'error' in clean_df.columns and clean_df['error'].notna().any() and \
   'target volume' in clean_df.columns and 'estimated volume' in clean_df.columns:
    diff_clean = clean_df['error'].dropna()
    if len(diff_clean) > 7: # Shapiro needs more than a few samples
        shapiro_test_stat, shapiro_p_value = stats.shapiro(diff_clean)
        print("\n--- Normality Test for Differences (Clean Data Error) ---")
        print(f"Shapiro-Wilk Test Statistic: {shapiro_test_stat:.3f}, P-value: {shapiro_p_value:.3f}")

        test_to_use = ""
        if shapiro_p_value > 0.05:
            print("Differences appear normally distributed. Using Paired t-test.")
            test_to_use = "ttest"
        else:
            print("Differences do not appear normally distributed. Using Wilcoxon signed-rank test.")
            test_to_use = "wilcoxon"
        
        # Align target and estimate on the labels where both are present.
        target_clean = clean_df['target volume'].dropna()
        estimated_clean = clean_df['estimated volume'].loc[target_clean.index].dropna()
        target_clean = target_clean.loc[estimated_clean.index]

        if len(target_clean) > 1:
            if test_to_use == "ttest":
                # NOTE(review): the arguments are (target, estimated), so the
                # sign of the t-statistic is opposite to the 'error' column
                # (estimated - target); the p-value is unaffected.
                ttest_result = stats.ttest_rel(target_clean, estimated_clean, nan_policy='omit')
                print("\n--- Paired t-test (Target vs. Estimated on Clean Data) ---")
                print(f"T-statistic: {ttest_result.statistic:.3f}")
                print(f"P-value: {ttest_result.pvalue:.3f}")
                if ttest_result.pvalue < 0.05:
                    print("The difference between target and estimated volumes is statistically significant (p < 0.05).")
                else:
                    print("No statistically significant difference found between target and estimated volumes (p >= 0.05).")
            elif test_to_use == "wilcoxon":
                # Wilcoxon needs differences, which is `diff_clean`
                # Ensure `diff_clean` corresponds to paired observations, which it should if `error` was calculated correctly
                # NOTE(review): this check is always true here, because this
                # code only runs when len(diff_clean) > 7; the else branch
                # below cannot be reached.
                if len(diff_clean) > 0 : # Wilcoxon might need non-zero differences for some versions or cases
                    try:
                        wilcoxon_result = stats.wilcoxon(diff_clean, nan_policy='omit')
                        print("\n--- Wilcoxon Signed-Rank Test (on Clean Data Error) ---")
                        print(f"Statistic: {wilcoxon_result.statistic:.3f}")
                        print(f"P-value: {wilcoxon_result.pvalue:.3f}")
                        if wilcoxon_result.pvalue < 0.05:
                            print("The median difference between target and estimated volumes is statistically significant (p < 0.05).")
                        else:
                            print("No statistically significant median difference found (p >= 0.05).")
                    except ValueError as e:
                        print(f"Could not perform Wilcoxon test: {e}. This can happen if all differences are zero.")
                else:
                    print("Not enough non-zero differences for Wilcoxon test.")
        else:
            print("Not enough aligned clean data for paired statistical test after NaN handling.")
    else:
        print("\nNot enough clean data (or errors) for normality test or subsequent paired tests.")
else:
    print("\nSkipping paired statistical tests: Clean data, error column, or target/estimated columns are missing or empty.")


# One-way ANOVA comparing mean error across the levels of 'target place',
# using clean data only. Skipped (with a message) when the inputs are missing
# or there are fewer than two groups.
if not clean_df.empty and 'error' in clean_df.columns and clean_df['error'].notna().any() and \
   'target place' in clean_df.columns and clean_df['target place'].nunique(dropna=True) > 1:

    # The source column is named 'target place' (with a space), but a formula
    # term has to be a valid identifier. The column is therefore renamed to
    # 'target_place' on the ANOVA working copy only; clean_df is left untouched.
    model_formula = 'error ~ C(target_place)'
    try:
        anova_data = (
            clean_df[['error', 'target place']]
            .dropna()
            .rename(columns={'target place': 'target_place'})
        )
        # Re-check after dropping NaNs: need at least two groups and more than
        # two observations per group on average for the fit to be meaningful.
        if anova_data['target_place'].nunique() > 1 and len(anova_data) > anova_data['target_place'].nunique() * 2 :
            model = ols(model_formula, data=anova_data).fit()
            anova_table = sm.stats.anova_lm(model, typ=2)
            print("\n--- ANOVA for Error by Target Place (Clean Data) ---")
            print(anova_table)
        else:
            print("\nNot enough data or unique groups for ANOVA on 'error' by 'target_place'.")

    except Exception as e:
        # Deliberate best-effort: report the failure and let the notebook continue.
        print(f"\nCould not perform ANOVA for 'error' by 'target_place': {e}")
else:
    print("\nSkipping ANOVA: Clean data, 'error', or 'target_place' column issues, or not enough groups.")


#  [markdown]
# ## 9. Summary of Metrics and Conclusion

#
# Final report: tabulate every metrics record collected in `all_metrics_summary`,
# print the table, and write it to a CSV file in the working directory.
print("\n--- Summary of All Calculated Metrics ---")
if not all_metrics_summary:
    print("No metrics were calculated to summarize.")
else:
    # Preferred left-to-right column layout for the report.
    preferred_order = (
        'Dataset', 'Count', 'MAE', 'RMSE', 'MAPE (%)', 'Mean Error (Bias)',
        'R2', 'Pearson Correlation', "Lin's CCC", 'MSE',
    )
    summary_df = pd.DataFrame(all_metrics_summary)
    # Keep only the preferred columns that were actually produced, in that order.
    available_cols = []
    for column_name in preferred_order:
        if column_name in summary_df.columns:
            available_cols.append(column_name)
    summary_df = summary_df.loc[:, available_cols]
    print(summary_df.to_string())
    try:
        summary_df.to_csv("metrics_summary_yolo_analysis.csv", index=False)
        print("\nMetrics summary saved to 'metrics_summary_yolo_analysis.csv'")
    except Exception as save_error:
        # A failed save is reported, not raised, so the printed table above still stands.
        print(f"\nError saving metrics summary to CSV: {save_error}")

#  [markdown]
# ### Discussion Points:
# * **Overall Performance:** Review MAE, RMSE, R2, Bias from "All Valid Data" and "Clean Data".
# * **Impact of Factors:**
#     * `target place`, `syringe size`, `person`, `volume_range_target`: How do errors (MAE, MAPE, bias) vary? Are these variations statistically significant (ANOVA results)?
#     * `error_normalized_by_syringe_capacity`: Does this reveal different insights than raw error?
# * **Problematic Readings:**
#     * Quantify the impact of occlusions, "bad readings", etc., on metrics. Refer to "Impact of Noted Issues" section.
#     * Notes like "syringe on table disappears", "manual extraction", "person8 in folder" are critical. Discuss their implications and how they were handled (e.g., exclusion via `problematic_reading` flag).
# * **Comparison with Volume from Weight:** How does `actual_liquid_volume_from_weight` compare to `target_volume` (from EDA)? Discrepancies here indicate issues with the ground truth preparation itself.
# * **Statistical Significance:**
#     * Was there a systematic bias in the YOLO estimates (paired t-test/Wilcoxon)?
#     * Did error significantly differ across factor levels (ANOVA/Kruskal-Wallis)?
# * **Data Quality & Model Limitations:**
#     * Address `NaN` estimated volumes: what types of scenarios (notes, conditions) led to model failure?
#     * "no syringe detected," "corrupted video": these are data loss scenarios.
# * **Limitations of this Analysis:** Sample size, specific conditions, assumptions of statistical tests.
# * **Future Work:** Model retraining with more diverse/problematic data, specific occlusion handling, improvements to experimental protocol based on findings.

#  [markdown]
# --- End of Notebook ---