In [None]:
%pip install pandas scipy

Collecting pandas
  Using cached pandas-2.3.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
Collecting numpy>=1.26.0 (from pandas)
  Using cached numpy-2.3.5-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.3.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.3 MB)
Using cached numpy-2.3.5-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.6 MB)
Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4/4[0m [pandas]2m3/4[0m [pandas]
[1A[2KSuccessfully installed numpy-2.3.5 pandas-2.3.3 pytz-202

In [5]:
import pandas as pd
import numpy as np
from scipy.signal import welch, correlate
from scipy.stats import kurtosis

class LoopAnalyzer:
    def __init__(self, file_path):
        self.file_path = file_path
        self.data = pd.read_csv(file_path)
        self.results = {}

    def _identify_columns(self):
        """Automatically maps columns based on names."""
        cols = {c.lower(): c for c in self.data.columns}
        self.pv_col = next((c for c in cols if 'pv' in c), None)
        self.sp_col = next((c for c in cols if 'sp' in c), None)
        self.op_col = next((c for c in cols if 'op' in c), None)

        if not all([self.pv_col, self.sp_col, self.op_col]):
            raise ValueError("Could not auto-identify PV, SP, or OP columns.")

    def analyze(self):
        self._identify_columns()

        pv = self.data[self.pv_col].values
        sp = self.data[self.sp_col].values
        op = self.data[self.op_col].values

        # 1. Basic Error Calculation
        error = sp - pv
        mse = np.mean(error**2) # Variance / Performance Proxy

        # 2. Noise Detection (High Frequency Variation)
        # Ratio of "jumpiness" (diff) to "range" (std).
        # High ratio = signal is mostly noise, not process movement.
        pv_std = np.std(pv) + 1e-9
        noise_metric = np.std(np.diff(pv)) / pv_std
        is_noisy = noise_metric > 0.5  # Threshold: 50% of variation is high-freq noise

        # 3. Oscillation Detection (Spectral Analysis)
        # Calculate Power Spectral Density
        f, Pxx = welch(error - np.mean(error), nperseg=min(len(error), 256))
        total_power = np.sum(Pxx)
        max_power = np.max(Pxx)
        oscillation_index = max_power / (total_power + 1e-9)

        is_oscillating = oscillation_index > 0.1 # Threshold: Dominant freq has >10% of power

        # 4. Stiction Probability Estimation
        # Heuristic: Stiction causes oscillation + Non-Gaussian Error (Square/Triangular waves)
        # Stiction probability is high if: Loop is Oscillating AND Error Kurtosis is low (Square wave < 2.0)
        kurt = kurtosis(error)
        stiction_prob = 0

        if is_oscillating:
            base_prob = 60.0
            # If wave is "squarish" (low kurtosis), stiction is more likely
            shape_factor = 20.0 if kurt < 2.5 else 0
            stiction_prob = base_prob + shape_factor + (oscillation_index * 20)
        else:
            # Low probability if no oscillation, but could be 'Stick-Slip'
            stiction_prob = 5.0

        stiction_prob = min(stiction_prob, 99.9)

        # 5. Disturbance Detection
        # Significant error but NOT oscillating
        is_disturbance = (mse > 1.0) and (not is_oscillating) and (np.std(sp) < 0.1)

        # Store Results
        self.results = {
            "Loop Name": self.file_path.split('/')[-1],
            "MSE (Variance)": round(mse, 4),
            "Noise Level (Ratio)": round(noise_metric, 2),
            "Oscillation Index": round(oscillation_index, 2),
            "Stiction Probability (%)": round(stiction_prob, 1),
            "Behaviors Detected": []
        }

        # Generate Inferences
        if is_noisy: self.results["Behaviors Detected"].append("Noisy Sensor Signal")
        if is_oscillating: self.results["Behaviors Detected"].append("Significant Oscillation")
        if is_disturbance: self.results["Behaviors Detected"].append("External Disturbance")
        if stiction_prob > 50: self.results["Behaviors Detected"].append("High Valve Stiction Risk")
        if not self.results["Behaviors Detected"] and mse < 1.0:
            self.results["Behaviors Detected"].append("Stable / Well-Tuned")

        return self.results

# --- Example Usage with the provided file ---
analyzer = LoopAnalyzer('loop_1_loop.csv')
report = analyzer.analyze()

# Display Comprehensive Summary
print("-" * 40)
print(f"Loop Analysis Report: {report['Loop Name']}")
print("-" * 40)
print(f"Condition              | Value")
print("-" * 40)
print(f"Control Variance (MSE) | {report['MSE (Variance)']}")
print(f"Oscillation Index      | {report['Oscillation Index']} (Threshold > 0.1)")
print(f"Noise Ratio            | {report['Noise Level (Ratio)']} (Threshold > 0.5)")
print(f"Stiction Probability   | {report['Stiction Probability (%)']}%")
print("-" * 40)
print("AUTOMATED INFERENCES:")
for inference in report['Behaviors Detected']:
    print(f"• {inference}")
print("-" * 40)

ModuleNotFoundError: No module named 'scipy'

In [None]:
import pandas as pd
import numpy as np
import os
import glob
from scipy.signal import welch, correlate
from scipy.stats import kurtosis, skew
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_squared_error

# ==========================================
# PART 1: CONFIGURATION & UTILS
# ==========================================
CONFIG = {
    'description_file': 'Loop description.xlsx', # Update path
    'data_folder': 'data', # Changed to current directory to find CSVs
    'test_size': 0.2,
    'random_state': 42
}

def clean_binary_target(val):
    """
    Parses messy text labels (e.g., 'Yes, highly oscillatory', 'may be')
    into binary integers (0 or 1).
    """
    if pd.isna(val):
        return 0
    val = str(val).lower().strip()

    # Positive keywords indicative of the condition
    positive_keywords = [
        'yes', 'high', 'oscillat', 'noise', 'sticky', 'bad','Oscillatory','Yes, long oscillation''Oscillating SP','variance high','Yes + high variance','Yes (Intermittent)'
        'ripples', 'disturbance', 'instability','Highly Oscillatory','Yes , Long oscillation','Yes. Oscillatory','Highly intermittenly oscillatory'
    ]

    if any(keyword.lower() in val for keyword in positive_keywords):
        return 1
    return 0

# ==========================================
# PART 2: FEATURE ENGINEERING (THE ANALYZER)
# ==========================================
class LoopFeatureExtractor:
    """
    Extracts statistical and signal processing features from Loop Data.
    """

    def extract_features(self, filepath):
        try:
            df = pd.read_csv(filepath)
            # Normalize column names
            df.columns = [c.lower() for c in df.columns]

            # Identify columns dynamically
            pv_col = next((c for c in df.columns if 'pv' in c), None)
            sp_col = next((c for c in df.columns if 'sp' in c), None)
            op_col = next((c for c in df.columns if 'op' in c), None)

            if not (pv_col and sp_col and op_col):
                return None # Skip invalid files

            pv = df[pv_col].values
            sp = df[sp_col].values
            op = df[op_col].values

            # --- CALCULATIONS ---

            # 1. Error Signal
            error = sp - pv

            # 2. Basic Statistics
            mse = np.mean(error**2)
            mae = np.mean(np.abs(error))
            error_std = np.std(error)

            # 3. Noise Estimation (High Frequency)
            # Ratio of diff_std (jumps) to signal_std (range)
            pv_diff = np.diff(pv)
            noise_metric = np.std(pv_diff)
            noise_ratio = noise_metric / (np.std(pv) + 1e-9)

            # 4. Oscillation Detection (Spectral Analysis)
            # Power Spectral Density using Welch's method
            f, Pxx = welch(error - np.mean(error), nperseg=min(len(error), 256))
            total_power = np.sum(Pxx)
            max_power = np.max(Pxx)
            oscillation_strength = max_power / (total_power + 1e-9) # Spectral Peak Ratio

            # 5. Non-Linearity / Stiction Features
            # Stiction often creates square waves (low Kurtosis) or specific PV-OP shapes
            err_kurtosis = kurtosis(error)
            err_skew = skew(error)

            # PV-OP Correlation Lag
            # Normalize for correlation
            norm_pv = (pv - np.mean(pv)) / (np.std(pv) + 1e-9)
            norm_op = (op - np.mean(op)) / (np.std(op) + 1e-9)
            xcorr = correlate(norm_pv, norm_op, mode='full')
            lag_index = np.argmax(xcorr)
            center_index = len(norm_pv) - 1
            lag = abs(lag_index - center_index)

            return {
                'mse': mse,
                'mae': mae,
                'error_std': error_std,
                'noise_metric': noise_metric,
                'noise_ratio': noise_ratio,
                'oscillation_strength': oscillation_strength,
                'error_kurtosis': err_kurtosis,
                'error_skew': err_skew,
                'pv_op_lag': lag
            }

        except Exception as e:
            print(f"Error processing {filepath}: {e}")
            return None

# ==========================================
# PART 3: MAIN EXECUTION AND DATA PREP
# ==========================================

# Load Loop Descriptions (Ground Truth)
# Use pd.read_excel for .xlsx files
try:
    description_df = pd.read_excel('/content/Loop description.xlsx')
    description_df.columns = [c.lower().replace(' ', '_') for c in description_df.columns]

    # Apply cleaning function to relevant target columns
    # for col in ['loop_no', 'oscillatory', 'noisy', 'sticky_valve_probability', 'disturbance']:
    for col in ['oscillatory?']:
        if col in description_df.columns:
            description_df[col] = description_df[col].apply(clean_binary_target)

except Exception as e:
    print(f"Error loading description file: {e}")
    description_df = pd.DataFrame() # Create empty DF on error

# Extract Features from Loop Data
feature_extractor = LoopFeatureExtractor()
all_features = []

# Assuming loop files are in the same directory for this example
loop_files = glob.glob('data_folder/loop_*.csv')
loop_name =1
for f_path in loop_files:
    features = feature_extractor.extract_features(f_path)
    if features:
       # loop_name = os.path.basename(f_path).replace('.csv', '')
        features['loop_no'] = loop_name
        all_features.append(features)
        loop_name +=1

features_df = pd.DataFrame(all_features)
print(features_df.shape[0])
merged_df=[]
# Merge Features with Descriptions
if not description_df.empty and not features_df.empty:
    # Ensure 'loop_name' is the key in both DFs for merging
    # description_df might have 'loop_id' or similar, so rename if necessary
    if 'Loop No' in description_df.columns:
        description_df = description_df.rename(columns={'Loop No': 'loop_no'})

    print("Description >>> ",description_df.shape[0])
    print("Features >>> ",features_df.shape[0])

    # Perform the merge
    merged_df = pd.merge(features_df, description_df, on='loop_no', how='left')
    print("Merged DataFrame Head:")
    print(merged_df.head())
    feature_names = merged_df.columns.tolist()
    feature_names.remove('loop_no')
elif not features_df.empty:
    print("Description DataFrame is empty or could not be loaded. Only features DataFrame is available.")
else:
    print("Description or features DataFrame is empty. Cannot merge.")

    merged_df

100
Description >>>  100
Features >>>  100
Merged DataFrame Head:
        mse       mae  error_std  noise_metric  noise_ratio  \
0  0.000030  0.004235   0.005454      0.000234     0.023052   
1  0.001045  0.025571   0.032332      0.017812     0.592497   
2  0.119583  0.280936   0.345769      0.053442     0.154560   
3  0.002643  0.037581   0.051404      0.040104     0.179440   
4  0.000824  0.022811   0.028635      0.022426     0.783154   

   oscillation_strength  error_kurtosis  error_skew  pv_op_lag  loop_no  \
0              0.100876        0.939374   -0.193102          0        1   
1              0.076720        0.270480   -0.128951          4        2   
2              0.232022       -0.276325   -0.149960        206        3   
3              0.039037       60.624015    3.424605          4        4   
4              0.039180        0.025652   -0.298246       2351        5   

   oscillatory? noisy disturbance                         comments  \
0           0.0   Yes          No 

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Define feature_names from the available features, excluding non-feature columns
feature_names = [
    'mse', 'mae', 'error_std', 'noise_metric', 'noise_ratio',
    'oscillation_strength', 'error_kurtosis', 'error_skew', 'pv_op_lag'
]

# Create the targets DataFrame from merged_df
targets = merged_df[['loop_no', 'oscillatory?']].rename(columns={'oscillatory?': 'target'})

# Create DataFrame
X_df = pd.DataFrame(merged_df, columns=['loop_no'] + feature_names)
train_df = X_df.merge(targets, on='loop_no') # Removed 'Loop No' as 'loop_no' is the common key

# --- Cleaning step: Remove rows with NaN in the target variable ---
train_df.dropna(subset=['target'], inplace=True)

print(f"Training DataFrame ({len(train_df)} samples):\n")
print(train_df[['loop_no', 'oscillation_strength', 'target']].head())

# ==========================================
# 4. TRAIN MODEL (Gradient Boosting)
# ==========================================
X = train_df[feature_names]
y = train_df['target']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# GradientBoostingClassifier is the sklearn equivalent of XGBoost
model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
model.fit(X_train, y_train)

# ==========================================
# 5. RESULTS & INTERPRETATION
# ==========================================
preds_test = model.predict(X_test)
acc_test = accuracy_score(y_test, preds_test)

print("\n" + "-" * 30)
print(f"Model Test Accuracy: {acc_test*100:.2f}%")
print("-" * 30)
print("Feature Importance (Top 3):")
importances = sorted(zip(feature_names, model.feature_importances_), key=lambda x: x[1], reverse=True)
for name, val in importances[:3]:
    print(f"• {name}: {val:.4f}")
print("-" * 30)

Training DataFrame (70 samples):

   loop_no  oscillation_strength  target
0        1              0.100876     0.0
1        2              0.076720     1.0
2        3              0.232022     1.0
3        4              0.039037     1.0
4        5              0.039180     0.0

------------------------------
Model Test Accuracy: 64.29%
------------------------------
Feature Importance (Top 3):
• noise_ratio: 0.2364
• error_kurtosis: 0.1724
• error_skew: 0.1529
------------------------------


# Task
Update cell `q-dVV8kFWLqm` to apply the `clean_binary_target` function to 'noisy' and 'disturbance' columns, then train a multi-output regression model in cell `63PZFO4nVFVA` using `MultiOutputRegressor` with `GradientBoostingRegressor` to predict 'oscillatory?', 'noisy', 'disturbance', and 'sticky_valve_probability_'. Evaluate the model using accuracy for the binary targets and Mean Squared Error for the regression target.

## Update Data Cleaning for Multi-Output Targets

### Subtask:
Modify cell `q-dVV8kFWLqm` to apply the `clean_binary_target` function to 'noisy' and 'disturbance' columns, ensuring all binary classification targets are correctly processed before model training.


**Reasoning**:
The subtask requires modifying cell `q-dVV8kFWLqm` to apply the `clean_binary_target` function to 'noisy' and 'disturbance' columns. I will update the code in that cell to include these columns in the cleaning process.



In [None]:
import pandas as pd
import numpy as np
import os
import glob
from scipy.signal import welch, correlate
from scipy.stats import kurtosis, skew
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_squared_error

# ==========================================
# PART 1: CONFIGURATION & UTILS
# ==========================================
CONFIG = {
    'description_file': 'Loop description.xlsx', # Update path
    'data_folder': 'data', # Changed to current directory to find CSVs
    'test_size': 0.2,
    'random_state': 42
}

def clean_binary_target(val):
    """
    Parses messy text labels (e.g., 'Yes, highly oscillatory', 'may be')
    into binary integers (0 or 1).
    """
    if pd.isna(val):
        return 0
    val = str(val).lower().strip()

    # Positive keywords indicative of the condition
    positive_keywords = [
        'yes', 'high', 'oscillat', 'noise', 'sticky', 'bad','Oscillatory','Yes, long oscillation''Oscillating SP','variance high','Yes + high variance','Yes (Intermittent)'
        'ripples', 'disturbance', 'instability','Highly Oscillatory','Yes , Long oscillation','Yes. Oscillatory','Highly intermittenly oscillatory'
    ]

    if any(keyword.lower() in val for keyword in positive_keywords):
        return 1
    return 0

# ==========================================
# PART 2: FEATURE ENGINEERING (THE ANALYZER)
# ==========================================
class LoopFeatureExtractor:
    """
    Extracts statistical and signal processing features from Loop Data.
    """

    def extract_features(self, filepath):
        try:
            df = pd.read_csv(filepath)
            # Normalize column names
            df.columns = [c.lower() for c in df.columns]

            # Identify columns dynamically
            pv_col = next((c for c in df.columns if 'pv' in c), None)
            sp_col = next((c for c in df.columns if 'sp' in c), None)
            op_col = next((c for c in df.columns if 'op' in c), None)

            if not (pv_col and sp_col and op_col):
                return None # Skip invalid files

            pv = df[pv_col].values
            sp = df[sp_col].values
            op = df[op_col].values

            # --- CALCULATIONS ---

            # 1. Error Signal
            error = sp - pv

            # 2. Basic Statistics
            mse = np.mean(error**2)
            mae = np.mean(np.abs(error))
            error_std = np.std(error)

            # 3. Noise Estimation (High Frequency)
            # Ratio of diff_std (jumps) to signal_std (range)
            pv_diff = np.diff(pv)
            noise_metric = np.std(pv_diff)
            noise_ratio = noise_metric / (np.std(pv) + 1e-9)

            # 4. Oscillation Detection (Spectral Analysis)
            # Power Spectral Density using Welch's method
            f, Pxx = welch(error - np.mean(error), nperseg=min(len(error), 256))
            total_power = np.sum(Pxx)
            max_power = np.max(Pxx)
            oscillation_strength = max_power / (total_power + 1e-9) # Spectral Peak Ratio

            # 5. Non-Linearity / Stiction Features
            # Stiction often creates square waves (low Kurtosis) or specific PV-OP shapes
            err_kurtosis = kurtosis(error)
            err_skew = skew(error)

            # PV-OP Correlation Lag
            # Normalize for correlation
            norm_pv = (pv - np.mean(pv)) / (np.std(pv) + 1e-9)
            norm_op = (op - np.mean(op)) / (np.std(op) + 1e-9)
            xcorr = correlate(norm_pv, norm_op, mode='full')
            lag_index = np.argmax(xcorr)
            center_index = len(norm_pv) - 1
            lag = abs(lag_index - center_index)

            return {
                'mse': mse,
                'mae': mae,
                'error_std': error_std,
                'noise_metric': noise_metric,
                'noise_ratio': noise_ratio,
                'oscillation_strength': oscillation_strength,
                'error_kurtosis': err_kurtosis,
                'error_skew': err_skew,
                'pv_op_lag': lag
            }

        except Exception as e:
            print(f"Error processing {filepath}: {e}")
            return None

# ==========================================
# PART 3: MAIN EXECUTION AND DATA PREP
# ==========================================

# Load Loop Descriptions (Ground Truth)
# Use pd.read_excel for .xlsx files
try:
    description_df = pd.read_excel('/content/Loop description.xlsx')
    description_df.columns = [c.lower().replace(' ', '_') for c in description_df.columns]

    # Apply cleaning function to relevant target columns
    for col in ['oscillatory?', 'noisy', 'disturbance']:
        if col in description_df.columns:
            description_df[col] = description_df[col].apply(clean_binary_target)

except Exception as e:
    print(f"Error loading description file: {e}")
    description_df = pd.DataFrame() # Create empty DF on error

# Extract Features from Loop Data
feature_extractor = LoopFeatureExtractor()
all_features = []

# Assuming loop files are in the same directory for this example
loop_files = glob.glob('data_folder/loop_*.csv')
loop_name =1
for f_path in loop_files:
    features = feature_extractor.extract_features(f_path)
    if features:
       # loop_name = os.path.basename(f_path).replace('.csv', '')
        features['loop_no'] = loop_name
        all_features.append(features)
        loop_name +=1

features_df = pd.DataFrame(all_features)
print(features_df.shape[0])
merged_df=[]
# Merge Features with Descriptions
if not description_df.empty and not features_df.empty:
    # Ensure 'loop_name' is the key in both DFs for merging
    # description_df might have 'loop_id' or similar, so rename if necessary
    if 'Loop No' in description_df.columns:
        description_df = description_df.rename(columns={'Loop No': 'loop_no'})

    print("Description >>> ",description_df.shape[0])
    print("Features >>> ",features_df.shape[0])

    # Perform the merge
    merged_df = pd.merge(features_df, description_df, on='loop_no', how='left')
    print("Merged DataFrame Head:")
    print(merged_df.head())
    feature_names = merged_df.columns.tolist()
    feature_names.remove('loop_no')
elif not features_df.empty:
    print("Description DataFrame is empty or could not be loaded. Only features DataFrame is available.")
else:
    print("Description or features DataFrame is empty. Cannot merge.")

    merged_df

100
Description >>>  100
Features >>>  100
Merged DataFrame Head:
        mse       mae  error_std  noise_metric  noise_ratio  \
0  0.000030  0.004235   0.005454      0.000234     0.023052   
1  0.001045  0.025571   0.032332      0.017812     0.592497   
2  0.119583  0.280936   0.345769      0.053442     0.154560   
3  0.002643  0.037581   0.051404      0.040104     0.179440   
4  0.000824  0.022811   0.028635      0.022426     0.783154   

   oscillation_strength  error_kurtosis  error_skew  pv_op_lag  loop_no  \
0              0.100876        0.939374   -0.193102          0        1   
1              0.076720        0.270480   -0.128951          4        2   
2              0.232022       -0.276325   -0.149960        206        3   
3              0.039037       60.624015    3.424605          4        4   
4              0.039180        0.025652   -0.298246       2351        5   

   oscillatory?  noisy  disturbance                         comments  \
0           0.0    1.0          

**Reasoning**:
The subtask requires modifying cell `63PZFO4nVFVA` to train a multi-output regression model. This involves preparing multiple target variables, using `MultiOutputRegressor` with `GradientBoostingRegressor`, and evaluating the model with appropriate metrics for both classification and regression targets.



In [None]:
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor

# Define feature_names from the available features, excluding non-feature columns
feature_names = [
    'mse', 'mae', 'error_std', 'noise_metric', 'noise_ratio',
    'oscillation_strength', 'error_kurtosis', 'error_skew', 'pv_op_lag'
]

# Define target columns - these are now multiple
target_columns = ['oscillatory?', 'noisy', 'disturbance', 'sticky_valve_probability_']

# Create DataFrame for features and targets
X_df = pd.DataFrame(merged_df, columns=['loop_no'] + feature_names)
train_df = X_df.merge(merged_df[['loop_no'] + target_columns], on='loop_no')

# --- Cleaning step: Remove rows with NaN in the target variables ---
train_df.dropna(subset=target_columns, inplace=True)

print(f"Training DataFrame ({len(train_df)} samples) with multiple targets:")
print(train_df[['loop_no'] + target_columns].head())

# ==========================================
# 4. TRAIN MULTI-OUTPUT MODEL
# ==========================================
X = train_df[feature_names]
y = train_df[target_columns]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use GradientBoostingRegressor for multi-output regression
# MultiOutputRegressor allows a single model to predict multiple targets independently
model = MultiOutputRegressor(estimator=GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42))
model.fit(X_train, y_train)

# ==========================================
# 5. RESULTS & INTERPRETATION
# ==========================================
preds_test = model.predict(X_test)

# Convert predictions to DataFrame for easier evaluation
preds_test_df = pd.DataFrame(preds_test, columns=target_columns, index=y_test.index)

print("\n" + "-" * 30)
print("Multi-Output Model Evaluation")
print("-" * 30)

# Evaluate each target
for i, target_col in enumerate(target_columns):
    print(f"Target: {target_col}")
    actual = y_test[target_col]
    predicted = preds_test_df[target_col]

    if target_col in ['oscillatory?', 'noisy', 'disturbance']:
        # For binary classification targets, round predictions and evaluate accuracy
        predicted_binary = (predicted > 0.5).astype(int)
        acc = accuracy_score(actual, predicted_binary)
        print(f"  Accuracy: {acc*100:.2f}%")
    elif target_col == 'sticky_valve_probability_':
        # For regression target, evaluate Mean Squared Error
        mse = mean_squared_error(actual, predicted)
        print(f"  Mean Squared Error: {mse:.4f}")
    print("-" * 30)

# Feature Importance (from the first estimator in MultiOutputRegressor, usually represents average importance)
# Note: Feature importances for MultiOutputRegressor are not directly combined,
# but we can look at the importance for the first estimator as an example.
# If we wanted per-target importance, we would iterate through model.estimators_

print("Overall Feature Importance (from first estimator):")
first_estimator_importances = model.estimators_[0].feature_importances_ if hasattr(model.estimators_[0], 'feature_importances_') else []
if first_estimator_importances.size > 0:
    importances = sorted(zip(feature_names, first_estimator_importances), key=lambda x: x[1], reverse=True)
    for name, val in importances[:3]:
        print(f"• {name}: {val:.4f}")
else:
    print("Feature importances not available for the first estimator.")
print("-" * 30)

Training DataFrame (70 samples) with multiple targets:
   loop_no  oscillatory?  noisy  disturbance  sticky_valve_probability_
0        1           0.0    1.0          0.0                       1.00
1        2           1.0    0.0          0.0                      38.04
2        3           1.0    0.0          1.0                       5.00
3        4           1.0    0.0          0.0                      50.19
4        5           0.0    0.0          0.0                       6.95

------------------------------
Multi-Output Model Evaluation
------------------------------
Target: oscillatory?
  Accuracy: 71.43%
------------------------------
Target: noisy
  Accuracy: 71.43%
------------------------------
Target: disturbance
  Accuracy: 78.57%
------------------------------
Target: sticky_valve_probability_
  Mean Squared Error: 2143.9691
------------------------------
Overall Feature Importance (from first estimator):
• noise_ratio: 0.2269
• error_kurtosis: 0.1688
• error_skew: 0.1548

## Summary:

### Data Analysis Key Findings

*   The `clean_binary_target` function was successfully applied to the 'noisy' and 'disturbance' columns in the description data, converting their values into binary (0 or 1) as required.
*   A `MultiOutputRegressor` using `GradientBoostingRegressor` was trained on 70 samples to predict 'oscillatory?', 'noisy', 'disturbance', and 'sticky\_valve\_probability\_'.
*   The model achieved moderate accuracy for the binary classification targets:
    *   'oscillatory?': 71.43%
    *   'noisy': 71.43%
    *   'disturbance': 78.57%
*   For the regression target, 'sticky\_valve\_probability\_', the model resulted in a Mean Squared Error (MSE) of 2143.9691.
*   The most important features from the first estimator were identified as 'noise\_ratio' (0.2269), 'error\_kurtosis' (0.1688), and 'error\_skew' (0.1548).

### Insights or Next Steps

*   The Mean Squared Error of approximately 2144 for 'sticky\_valve\_probability\_' suggests the regression model's predictions are not very accurate, indicating a need for further investigation into the feature set or model architecture for this specific target.
*   Given the moderate accuracy for binary targets and high MSE for the regression target, consider exploring hyperparameter tuning for the `GradientBoostingRegressor`, experimenting with other regression models (e.g., Random Forest Regressor, XGBoost), or augmenting the dataset with more samples to potentially improve overall model performance.
