In [None]:
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view
import pandas as pd
from pathlib import Path
import re


def load_data(data_path: Path) -> pd.DataFrame:
    """
    Load and preprocess data from a CSV file. Remove rows with unlabeled data.

    The file is read with pandas, rows whose label is -1 are removed, every
    remaining column is converted to a numeric dtype, and rows containing NaN
    values are dropped.

    Args:
        data_path (Path): Path to the CSV data file.

    Returns:
        pd.DataFrame: Preprocessed DataFrame with unlabeled data removed.
    Raises:
        FileNotFoundError: If the specified data file does not exist.
        ValueError: If any column contains values that cannot be converted to numbers.
        ValueError: If the data is empty after removing unlabeled data and dropping NaN values.
    """
    if not data_path.exists():
        raise FileNotFoundError(f"No file in Path {data_path} is found")

    df = remove_unlabeled_data(pd.read_csv(data_path))

    try:
        for col in df.columns:
            # Convert the whole column at once instead of element by element.
            df[col] = pd.to_numeric(df[col], errors="raise")
    except (ValueError, TypeError) as exc:
        # Keep the original exception as the cause so the offending value is visible.
        raise ValueError(
            "Non-numeric values detected in numeric timeseries columns.") from exc

    df = df.dropna()

    if df.empty:
        raise ValueError(
            "Dataframe empty after removing unlabeled data and dropping NaN values")

    return df


def remove_unlabeled_data(data: pd.DataFrame) -> pd.DataFrame:
    """
    Drop every row whose 'labels' value equals -1 (the marker for unlabeled data).

    Args:
        data (pd.DataFrame): Input DataFrame; must contain a 'labels' column.

    Returns:
        pd.DataFrame: An independent copy holding only the labeled rows. The
            original row index is kept and the input frame is not modified.
    """
    is_labeled = data["labels"].ne(-1)
    return data.loc[is_labeled].copy()


def col_num(col: str) -> int:
    """
    Return the integer formed by the digits at the end of a column name.

    Used as a sort key so that columns are ordered numerically
    (I1, I2, ..., I10) rather than lexicographically (I1, I10, I2).

    Args:
        col (str): Column name ending in one or more digits, e.g. "I12".

    Returns:
        int: The trailing number of the column name.

    Raises:
        ValueError: If the column name does not end in a digit.
    """
    m = re.search(r'(\d+)$', col)
    if m is None:
        # Without this guard a missing match surfaces as an opaque AttributeError.
        raise ValueError(
            f"Column name {col!r} does not end in a number")
    return int(m.group(1))


def convert_to_np(data: pd.DataFrame) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Split a DataFrame into label, experiment-ID and feature arrays.

    Columns whose name starts with "I" are treated as current readings and
    columns whose name starts with "V" as voltage readings. Both groups are
    ordered by the number at the end of the column name.

    Args:
        data (pd.DataFrame): Frame with 'labels' and 'exp_ids' columns plus the
            "I..." and "V..." measurement columns.

    Returns:
        tuple: (labels, exp_ids, features), where features stacks current
            (index 0) and voltage (index 1) along a new last axis.
    """
    label_values = data['labels'].to_numpy()
    experiment_values = data['exp_ids'].to_numpy()

    def _numerically_ordered(prefix: str) -> list:
        # Numeric ordering avoids the lexicographic I1, I10, I2 sequence.
        # 'labels' and 'exp_ids' never match the "I"/"V" prefixes.
        matching = (name for name in data.columns if name.startswith(prefix))
        return sorted(matching, key=col_num)

    current_names = _numerically_ordered("I")
    voltage_names = _numerically_ordered("V")

    # Two (n_samples, timesteps) matrices -> one (n_samples, timesteps, 2) array.
    features = np.stack(
        (data[current_names].to_numpy(), data[voltage_names].to_numpy()),
        axis=-1,
    )

    return label_values, experiment_values, features


def create_sliding_windows_first_dim(data: np.ndarray, sequence_length: int) -> np.ndarray:
    """
    Build overlapping windows of consecutive samples along axis 0.

    Each window holds `sequence_length` consecutive samples (stride 1) whose
    timesteps are concatenated, in sample order, into one long time axis.

    Args:
        data (np.ndarray): Input array of shape (n_samples, timesteps, features)
        sequence_length (int): Number of consecutive samples per window

    Returns:
        np.ndarray: Windowed data of shape
            (n_windows, sequence_length * timesteps, features)
    """
    # sliding_window_view appends the window axis last:
    # (n_windows, timesteps, features, sequence_length)
    windows = sliding_window_view(data, sequence_length, axis=0)

    # Bring the window axis next to the window index:
    # (n_windows, sequence_length, timesteps, features)
    windows = np.moveaxis(windows, -1, 1)

    # Merge the sequence and timestep axes into a single time axis.
    merged_shape = (
        windows.shape[0],
        sequence_length * data.shape[1],
        data.shape[2],
    )
    return windows.reshape(merged_shape)


def get_welding_data(path: Path, n_samples: int | None = None, return_sequences: bool = False, sequence_length: int = 100) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Load welding data from CSV or cached numpy files.

    If numpy cache files don't exist, loads from CSV and creates cache files.
    If cache files exist, loads directly from them.

    Args:
        path (Path): Path to the CSV data file.
        n_samples (int | None): Number of samples to sample from the data. If None, all data is returned.
        return_sequences (bool): If True, return sequences of length sequence_length.
        sequence_length (int): Length of sequences to return.
    Returns:
        tuple: A tuple containing:
            - np.ndarray: Array of welding data features
            - np.ndarray: Array of labels
            - np.ndarray: Array of experiment IDs
        When return_sequences is True, labels and experiment IDs have shape
        (n_windows, sequence_length), one entry per sample in the window.
    Raises:
        ValueError: If return_sequences is True and sequence_length is smaller
            than 1 or larger than the number of available samples.
        ValueError: If n_samples is not positive or exceeds the available data.
    """
    pathpre = path.with_suffix("")
    cache_feat = pathpre.with_name(f"{pathpre.name}_features.npy")
    cache_labels = pathpre.with_name(f"{pathpre.name}_labels.npy")
    cache_exp = pathpre.with_name(f"{pathpre.name}_exp_ids.npy")

    # Check whether the cache exists.
    # NOTE: only existence is checked; the cache is not refreshed if the CSV changes.
    if cache_feat.exists() and cache_labels.exists() and cache_exp.exists():
        features = np.load(cache_feat)
        labels = np.load(cache_labels)
        exp_ids = np.load(cache_exp)
    else:
        df = load_data(path)
        labels, exp_ids, features = convert_to_np(df)

        np.save(cache_feat, features)
        np.save(cache_labels, labels)
        np.save(cache_exp, exp_ids)

    if return_sequences:
        # Fail early with a clear message instead of a cryptic numpy error
        # (or, for sequence_length == 0, silently producing empty windows).
        if sequence_length < 1:
            raise ValueError("sequence_length must be a positive integer")
        if sequence_length > len(features):
            raise ValueError(
                f"Requested sequence_length={sequence_length}, but only {len(features)} samples available")

        features = create_sliding_windows_first_dim(features, sequence_length)

        # Window labels and IDs the same way so they match (n_windows, sequence_length).
        labels = sliding_window_view(labels, window_shape=sequence_length)
        exp_ids = sliding_window_view(exp_ids, window_shape=sequence_length)

    if n_samples is not None:
        if n_samples <= 0:
            raise ValueError("n_samples must be a positive integer")
        if n_samples > len(labels):
            raise ValueError(
                f"Requested n_samples={n_samples}, but only {len(labels)} available")
        # Unseeded generator: each call draws a different subset without replacement.
        rng = np.random.default_rng()
        idx = rng.choice(len(labels), size=n_samples, replace=False)

        features = features[idx]
        labels = labels[idx]
        exp_ids = exp_ids[idx]

    return features, labels, exp_ids


In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from matplotlib import pyplot as plt
from plotly.offline import plot
import plotly.express as px
import plotly.graph_objects as go
from utils import save_plotly_figure_as_html
from ex_01_read_data import get_welding_data
from plotly.subplots import make_subplots




In [None]:
# Relative location of the welding CSV passed to get_welding_data below.
data_path = Path("data/Welding/data.csv")
# Directory under which the figures produced in this notebook are saved.
plot_path = Path("plots/ex_02")

## Exercise 2.1: Dataset Statistics Analysis

In this exercise, we'll calculate and present comprehensive descriptive statistics for the welding dataset. This statistical overview will help us understand the fundamental characteristics of our data before visualization or modeling.

1. Calculate the class distribution to understand data balance
2. Count samples per experiment to assess data volume across experimental conditions
3. Determine min/max values for voltage and current to understand the range of measurements
4. Calculate mean and standard deviation to assess central tendency and variability
5. Find median values to identify central points unaffected by outliers

In [None]:
# Load the dataset without subsampling or windowing (n_samples and
# return_sequences are left at their defaults).
data, labels, exp_ids = get_welding_data(data_path)

In [None]:
# Flatten each channel so the statistics cover every timestep of every sample.
# Channel 0 of the last axis is current, channel 1 is voltage.
all_currents = data[:, :, 0].flatten()
all_voltages = data[:, :, 1].flatten()

# Class balance and number of samples per experiment.
class_counts = pd.Series(labels).value_counts()
exp_counts = pd.Series(exp_ids).value_counts()

# Build the same set of statistics for both channels in one loop so the row
# labels stay consistent between current and voltage.
stats_summary = {"Total Samples": [len(labels)]}
for quantity, values in (("Current (I)", all_currents), ("Voltage (V)", all_voltages)):
    stats_summary[f"Min {quantity}"] = [np.min(values)]
    stats_summary[f"Max {quantity}"] = [np.max(values)]
    stats_summary[f"Mean {quantity}"] = [np.mean(values)]
    stats_summary[f"Standard Deviation {quantity}"] = [np.std(values)]
    stats_summary[f"Median {quantity}"] = [np.median(values)]

# Transpose so each statistic becomes a row with a single "Value" column.
stats_df = pd.DataFrame(stats_summary).T
stats_df.columns = ["Value"]

print("Class Distribution:\n", class_counts, "\n")
print("Samples per Experiment:\n", exp_counts, "\n")
print("Overall Measurement Statistics:\n", stats_df)


## Exercise 2.2: Current and Voltage Distribution Visualization

In this exercise, we'll create interactive boxplots to visualize and compare the distributions of voltage and current measurements in the welding data. Boxplots will help us identify central tendencies, spread, and potential outliers in our measurements.

1. Create side-by-side boxplots for voltage and current using Plotly
2. Display key statistics (median, quartiles, etc.) visually within the plot
3. Enable interactive exploration of the distributions
4. Save the visualization for future reference

In [None]:
# Work on a subset of 10,000 samples for the boxplot.
data, labels, exp_ids = get_welding_data(data_path, n_samples=10_000)

# Target for the boxplot; no file extension is given here (it is handed to
# save_plotly_figure_as_html and the print below appends ".html").
save_path =  plot_path / "voltage_current_distribution"

In [None]:


# One flat vector per channel: index 0 is current, index 1 is voltage.
current_flat = data[:, :, 0].flatten()
voltage_flat = data[:, :, 1].flatten()

# Long-format table: one row per reading, tagged with its measurement type.
plot_df = pd.concat(
    [
        pd.DataFrame({'Value': current_flat, 'Measurement': 'Current'}),
        pd.DataFrame({'Value': voltage_flat, 'Measurement': 'Voltage'}),
    ],
    ignore_index=True,
)

# Side-by-side boxes, one per measurement type, colored by type.
box_options = dict(
    x='Measurement',
    y='Value',
    color='Measurement',
    title='Distribution of Current and Voltage Measurements',
    labels={'Value': 'Measurement Value', 'Measurement': 'Measurement Type'},
)
fig_box = px.box(plot_df, **box_options)
fig_box.show()

# Persist the interactive figure after displaying it.
save_plotly_figure_as_html(fig_box, save_path)
print(f"Saved boxplot to {save_path}.html")


## Exercise 2.3: Time-Series Sequence Visualization

In this exercise, we'll implement functions to visualize the time-series patterns of welding cycles. These visualizations will help us understand the temporal relationships between voltage and current during the welding process and identify patterns associated with quality outcomes.

1. Create dual-axis plots showing voltage and current over time (10 welding cycles -> sequence_length=10) using matplotlib
2. Implement clear legends and labels for data interpretation
3. Enable saving of visualizations for reporting and comparison

In [None]:

# Request 100 randomly chosen sequences, each made of 10 welding cycles
# (return_sequences=True with sequence_length=10).
data, labels, exp_ids = get_welding_data(data_path, n_samples=100, return_sequences=True, sequence_length=10)
# File the matplotlib figure of Exercise 2.3 is written to.
save_path = plot_path / "welding_sample.png"    

In [None]:
def plot_random_sample(data: np.ndarray, labels: np.ndarray, save_path: Path):
    """
    Plot one randomly chosen sample (or sequence) with matplotlib.

    Current is drawn against the left y-axis and voltage against a twin right
    y-axis, sharing the timestep x-axis. The figure is written to `save_path`
    (parent directories are created if needed), shown, and then closed.

    Args:
        data (np.ndarray): Samples indexed along axis 0; each sample is indexed
            as [timestep, channel] with channel 0 = current, 1 = voltage.
        labels (np.ndarray): Accepted for interface compatibility; not used.
        save_path (Path): Destination file for the figure.

    Returns:
        None. Prints a message and returns early when `data` has no samples.
    """
    n_available = data.shape[0]
    if n_available == 0:
        print("No data to plot")
        return

    chosen = data[np.random.randint(0, n_available)]
    time_axis = np.arange(chosen.shape[0])
    current_color, voltage_color = 'tab:red', 'tab:blue'

    fig, current_ax = plt.subplots(figsize=(12, 6))

    # Left axis: current.
    current_ax.set_xlabel('Timesteps')
    current_ax.set_ylabel('Current (A)', color=current_color)
    current_ax.plot(time_axis, chosen[:, 0], color=current_color, label='Current')
    current_ax.tick_params(axis='y', labelcolor=current_color)
    current_ax.legend(loc='upper left')

    # Right axis: voltage, sharing the x-axis with the current plot.
    voltage_ax = current_ax.twinx()
    voltage_ax.set_ylabel('Voltage (V)', color=voltage_color)
    voltage_ax.plot(time_axis, chosen[:, 1], color=voltage_color, label='Voltage')
    voltage_ax.tick_params(axis='y', labelcolor=voltage_color)
    voltage_ax.legend(loc='upper right')

    plt.title("Welding Cycle Sample")
    fig.tight_layout()

    # Make sure the output directory exists before writing the image.
    save_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(save_path)
    print(f"Saved Matplotlib plot to {save_path}")

    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)



In [None]:
# Plot one random sequence from the data loaded for Exercise 2.3.
# Errors are reported instead of raised so the notebook keeps running.
try:
    if data.shape[0] == 0:
        print("No sequences loaded for Exercise 2.3")
    else:
        print(f"Loaded {data.shape[0]} sequences. Plotting one random example...")
        # The figure is written to the save_path defined with the data above.
        plot_random_sample(
            data,
            labels,
            save_path=save_path,
        )
except Exception as e:
    print(f"An error occurred during Exercise 2.3: {e}")


## Exercise 2.4: Interactive Time-Series Visualization with Plotly

In this exercise, we'll create enhanced interactive visualizations of welding samples using Plotly. These interactive plots will provide more advanced exploration capabilities for analyzing the time-series patterns.

1. Create interactive plots with dual y-axes for voltage and current
2. Implement time-based range sliders for detailed exploration
3. Add unified tooltips for precise data reading
4. Display quality metrics in the plot title
5. Save interactive visualizations as HTML for sharing

In [None]:
def create_plotly_plot(data: np.ndarray, labels: np.ndarray, exp_ids: np.ndarray | None = None) -> go.Figure:
    """
    Create an interactive Plotly visualization of a random welding sample.

    Current is drawn on the primary y-axis and voltage on the secondary
    y-axis. The figure has a unified hover tooltip and a range slider on the
    x-axis, and its title shows the label(s) and, when available, the
    experiment ID(s) of the chosen sample.

    Args:
        data (np.ndarray): Array containing voltage and current data; each
            sample is indexed as [timestep, channel] with channel 0 = current
            and channel 1 = voltage.
        labels (np.ndarray): Array containing class labels
        exp_ids (np.ndarray, optional): Array containing experiment IDs. Defaults to None.

    Returns:
        plotly.graph_objects.Figure: Interactive Plotly figure object. An empty
            figure is returned (with a printed warning) when `data` has no samples.
    """

    def _distinct_values_text(values) -> str:
        # A windowed sample carries one value per cycle; list each value once.
        distinct = np.unique(np.atleast_1d(values))
        return ", ".join(str(value) for value in distinct)

    if data.shape[0] == 0:
        print("Warning: No data provided to create_plotly_plot. Returning empty figure.")
        return go.Figure()

    idx = np.random.randint(0, data.shape[0])
    sample_sequence_data = data[idx]

    # Describe the chosen sample in the title; skip metadata that is missing.
    title_details = [f"Sample {idx}"]
    if labels is not None and len(labels) > idx:
        title_details.append(f"Label: {_distinct_values_text(labels[idx])}")
    if exp_ids is not None and len(exp_ids) > idx:
        title_details.append(f"Experiment: {_distinct_values_text(exp_ids[idx])}")
    title_text = "Interactive Welding Sequence (" + " | ".join(title_details) + ")"

    timesteps_x_axis = np.arange(sample_sequence_data.shape[0])
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    fig.add_trace(
        go.Scatter(x=timesteps_x_axis, y=sample_sequence_data[:, 0], name='Current (A)', line=dict(color='red')),
        secondary_y=False,
    )
    fig.add_trace(
        go.Scatter(x=timesteps_x_axis, y=sample_sequence_data[:, 1], name='Voltage (V)', line=dict(color='blue')),
        secondary_y=True,
    )
    fig.update_layout(
        title_text=title_text,
        xaxis_title="Time Points within Sequence",
        hovermode='x unified',
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
    )
    fig.update_yaxes(title_text="Current (A)", secondary_y=False, color='red')
    fig.update_yaxes(title_text="Voltage (V)", secondary_y=True, color='blue')
    # Range slider for zooming into a portion of the sequence.
    fig.update_layout(xaxis=dict(rangeslider=dict(visible=True)))
    return fig


# Build the interactive figure from the data currently loaded in the notebook,
# save it via the project helper, then display it.
fig = create_plotly_plot(data, labels, exp_ids)
save_plotly_figure_as_html(fig, plot_path / "welding_samples")
fig.show()

## Exercise 2.5: Multiple Sample Comparison

In this exercise, we'll generate and compare visualizations from multiple random welding samples. This comparison will help us identify common patterns and variations across different welding cycles.
 

1. Generate visualizations of multiple randomly selected single welding cycles using matplotlib
2. Create dual-axis plots showing voltage and current over time
3. Implement clear legends and labels for data interpretation
4. Save each visualization for comparison

In [None]:
# Reload 1,000 samples without windowing (return_sequences is left at its default).
data, labels, exp_ids = get_welding_data(data_path, n_samples=1_000)

In [None]:
# Produce five plots, each saved to its own numbered file. Every call draws its
# sample independently, so the same sample may be picked more than once.
for i in range(5):
    plot_random_sample(data, labels, save_path=plot_path / f"welding_sample_{i}.png")
    