# Extended BUMP Analysis

## Preparation

#### Import dependencies

In [None]:
from collections import Counter
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.stats import chi2_contingency

#### Configure plots look and feel

In [None]:
def set_plot_theme(base_font: int) -> None:
    """
    Apply the notebook-wide seaborn/matplotlib styling.

    :param base_font: Base font size. Tick labels and legends use it as is,
        axis labels use ``base_font + 2`` and titles use ``base_font + 4``.
    """
    title_size = base_font + 4

    font_overrides = {
        "font.size": base_font,
        "axes.titlesize": title_size,
        "axes.labelsize": base_font + 2,
        "xtick.labelsize": base_font,
        "ytick.labelsize": base_font,
        "legend.fontsize": base_font,
    }
    sns.set_theme(
        style="whitegrid",
        context="talk",
        palette="colorblind",
        rc=font_overrides,
    )

    # Figure-level title size and padding, updated once the seaborn theme
    # call above has run.
    layout_overrides = {
        "figure.titlesize": title_size,
        "axes.titlepad": 30,
        "axes.labelpad": 10,
    }
    mpl.rcParams.update(layout_overrides)


set_plot_theme(10)

#### Read dataset

In [None]:
# Relative path of the CSV holding the breaking-change (BC) data.
path = "data/bc_type_distribution_full_data.csv"
# Load it into the DataFrame that the rest of the notebook works on.
df = pd.read_csv(path)
# Preview the first rows as the cell output.
df.head()

## Analysis

#### Compute number of files

In [None]:
# The row count is reported as the file count
# (presumably one row per file -- TODO confirm against the dataset).
files = len(df)
print(f"Total files: {files}")

#### Identify breaking and non-breaking files

In [None]:
# Partition the rows on the "NONE" marker of the BC_kinds column:
# rows equal to "NONE" have no breaking change, all others do.
none_files = df[df["BC_kinds"] == "NONE"]
bc_files = df[df["BC_kinds"] != "NONE"]
none_count, bc_count = len(none_files), len(bc_files)

# Report both groups as absolute counts and as a share of all files.
print(f"Files with breaking changes: {bc_count} ({(bc_count / files) * 100:.2f}%)")
print(f"Files with no breaking changes: {none_count} ({(none_count / files) * 100:.2f}%)")

#### Compute number of BC types per file

In [None]:
# Count how many files fall into each value of BC_kinds_count.
ax = sns.countplot(
    x="BC_kinds_count",
    data=df,
    order=sorted(df["BC_kinds_count"].unique())
)

for p in ax.patches:
    p.set_alpha(0.8)
    bar_height = p.get_height()
    # The label shows the real bar value; the +1 only lifts the text above
    # the bar and must not leak into the displayed number.
    ax.annotate(f'{int(bar_height)}',
                (p.get_x() + p.get_width() / 2., bar_height + 1),
                ha='center', va='bottom')

ax.set_title("Breaking Change Types per File")
ax.set_xlabel("Breaking change type count")
ax.set_ylabel("File count")

plt.show()

In [None]:
def parse_bcs_labels(labels: str) -> list[str]:
    """
    Parses the breaking change labels from a string and returns a list
    of breaking change types (e.g., METHOD_REMOVED, METHOD_ADDED_TO_INTERFACE,
    NONE). If the input is not a string (NaN, None, a list, ...), it returns
    an empty list.

    Surrounding whitespace is stripped from every label, and empty fragments
    (e.g. produced by ";;" or a trailing ";") are dropped so that they never
    show up as a label of their own.

    :param labels: A string containing the breaking change labels separated
        by semicolons.
    :return: A list of breaking change types.
    """
    # The isinstance check alone rejects NaN and None. Calling pd.isna first
    # on a list-like value yields an array whose truth value is ambiguous,
    # which raises ValueError instead of returning an empty list.
    if not isinstance(labels, str):
        return []

    stripped = (label.strip() for label in labels.split(";"))
    return [label for label in stripped if label]

In [None]:
# Parse each BC_kinds value into a list of labels, stored in a new
# "bc_types" column.
df["bc_types"] = df["BC_kinds"].apply(parse_bcs_labels)
# Show the last rows as the cell output.
df.tail()

In [None]:
# Number of files carrying each BC type (a file with several types counts
# once per type). The parsed lists are iterated directly instead of exploding
# the column: explode() turns an empty list into NaN, which would add a
# spurious NaN row to the distribution.
multi_label_bc_type_counts = Counter(
    bc_type for bc_types in df["bc_types"] for bc_type in bc_types
)
# Number of files whose only label is the given BC type.
single_label_counts = Counter(
    bc_types[0] for bc_types in df["bc_types"] if len(bc_types) == 1
)

# Multi-label files
bc_types_distribution = pd.DataFrame.from_dict(multi_label_bc_type_counts, orient="index", columns=["multi-label-files"])
bc_types_distribution["multi-label-files-perc"] = bc_types_distribution["multi-label-files"] / files * 100
bc_types_distribution = bc_types_distribution.sort_values("multi-label-files", ascending=False)

# Single-label files
single_label_df = pd.DataFrame.from_dict(single_label_counts, orient="index", columns=["single-label-files"])
single_label_df["single-label-files-perc"] = single_label_df["single-label-files"] / files * 100

# Join both dataframes on their index (the BC type) and fill the types that
# never occur as a single label with zeros
bc_types_distribution = bc_types_distribution.join(single_label_df, how="left")
bc_types_distribution["single-label-files"] = bc_types_distribution["single-label-files"].fillna(0).astype(int)
bc_types_distribution["single-label-files-perc"] = bc_types_distribution["single-label-files-perc"].fillna(0)

# Display the resulting table as the cell output.
bc_types_distribution


In [None]:
# Data: one bar per BC type with the number of files carrying that type
ax = sns.barplot(
    x=bc_types_distribution.index, 
    y=bc_types_distribution["multi-label-files"]
)
plt.xticks(rotation=90)

# Annotations
for p in ax.patches:
    p.set_alpha(0.8)
    bar_height = p.get_height()
    # The label shows the real bar value; the +1 only lifts the text above
    # the bar and must not leak into the displayed number.
    ax.annotate(f'{int(bar_height)}',
                (p.get_x() + p.get_width() / 2., bar_height + 1),
                ha='center', va='bottom')

# Labels
ax.set_title("Distribution of Breaking Change Types")
ax.set_xlabel("Breaking change type")
ax.set_ylabel("File count")

plt.show()

In [None]:
# Data: two bars per BC type, side by side (all files carrying the type vs.
# files where it is the only label). The figure width grows with the number
# of BC types.
fig, ax = plt.subplots(figsize=(max(6, len(bc_types_distribution) * 0.6), 4))
x = np.arange(len(bc_types_distribution))
width = 0.45
ax.bar(x - width / 2, bc_types_distribution['multi-label-files'], width, label='Multi-label files')
ax.bar(x + width / 2, bc_types_distribution['single-label-files'], width, label='Single-label files')
ax.set_xticks(x)
ax.set_xticklabels(bc_types_distribution.index, rotation=90)

# Annotations
for p in ax.patches:
    p.set_alpha(0.8)
    bar_height = p.get_height()
    # The label shows the real bar value; the +1 only lifts the text above
    # the bar and must not leak into the displayed number.
    ax.annotate(f'{int(bar_height)}',
                (p.get_x() + p.get_width() / 2., bar_height + 1),
                ha='center', va='bottom', fontsize=8)

# Labels
ax.set_title("Distribution of Breaking Change Types")
ax.set_xlabel("Breaking change type")
ax.set_ylabel("File count")
ax.legend()

plt.show()

#### Remove files with NONE as BC

In [None]:
# Keep only the rows whose BC_kinds is not "NONE". This rebinds the global
# df, so every later cell works on the filtered data.
len_before = len(df)
df = df[df["BC_kinds"] != "NONE"]
len_after = len(df)
print(f"Removed {len_before - len_after} files with BC_kinds NONE. Remaining files: {len_after}")

## Train/Test Split

**Current situation**
- A file can have more than one BC type, resulting in a multi-labeled dataset.
- We should prevent label imbalance when performing the split. This is more challenging given the multi-labeled nature of the dataset.
- We need to ensure we end up with a representative coverage of all BC types available in the dataset for both the training and test datasets.
- We do not cover all possible BC types because of the limits of the dataset we rely on. This should be reported in the threats to validity.

**Alternatives to perform the split**
- Random sampling with a final check would be possible if the dataset were not as small as ours.
- Multi-label stratified split: guarantees that each split preserves the proportion of labels (aka. BC types) in the training and test datasets. We account for the multi-label nature of the dataset---a simple stratified split could fail.
- Multi-label stratified k-fold or cross-validation: prevents us from focusing on "too good" or "too bad" cases. Although ideal, it might become too computationally expensive.

#### Multi-label stratified split

In [None]:
# Turn the per-file label lists into the matrix Y that is handed to the
# splitter as the stratification target.
binarizer = MultiLabelBinarizer()
Y = binarizer.fit_transform(df["bc_types"])

# A single 80/20 split with a fixed seed so that it can be reproduced.
splitter = MultilabelStratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    random_state=42
)

# Only one split is configured, so take the first (and only) index pair.
train_idx, test_idx = next(splitter.split(df, Y))

# The indices are used positionally (iloc): df was filtered above, so its
# index labels are presumably no longer contiguous.
train_df = df.iloc[train_idx]
test_df = df.iloc[test_idx]
# Every row of df whose index label is not part of the test set; this is
# the pool the training datasets below are sampled from.
remaining_df = df.drop(test_df.index)

print(f"Size of training set: {len(train_df)} files")
print(f"Size of test set: {len(test_df)} files")

In [None]:
# Persist the held-out test set (without the DataFrame index column).
test_df.to_csv("data/test_set.csv", index=False)

#### Split the training dataset

In [None]:
def sample_training_set(base_df, size, seed=42):
    """
    Draw a random subset of rows from a DataFrame.

    :param base_df: The DataFrame the rows are drawn from.
    :param size: How many rows to draw.
    :param seed: Random state passed to the sampler, so that the same
        arguments always yield the same subset.
    :return: A DataFrame holding the ``size`` sampled rows.
    """
    subset = base_df.sample(random_state=seed, n=size)
    return subset

In [None]:
def compute_label_distribution(df):
    """
    Counts how often each combination of breaking change types occurs in
    the given DataFrame.

    The labels of a row are sorted and joined with commas, so two rows
    with the same set of labels map to the same key regardless of the
    order in which the labels are listed.

    :param df: The DataFrame containing a "bc_types" column with lists
        of breaking change types.
    :return: A Counter object mapping each comma-joined label combination
        to the number of rows that have it.
    """
    combination_counts = Counter()
    for row_labels in df["bc_types"]:
        combination_key = ",".join(sorted(row_labels))
        combination_counts[combination_key] += 1
    return combination_counts


def is_training_dataset_balanced(train_df, test_df, alpha=0.05):
    """
    Performs a chi-squared test to determine if the distribution of
    breaking change label combinations in the training set is
    statistically similar to that in the test set.

    The contingency table has one row per label combination found in either
    set and the two columns "train" and "test"; combinations missing from
    one set count as 0 there.

    :param train_df: The training DataFrame.
    :param test_df: The test DataFrame.
    :param alpha: The significance level for the test (default is 0.05).
    :return: A tuple (is_balanced, p_value, chi2_statistic, cramers_v) where
        is_balanced is True if the distributions are similar (p-value not
        below alpha), False otherwise. cramers_v is 0.0 when the table has a
        single row, since no association can be measured then.
    """
    train_dist = compute_label_distribution(train_df)
    test_dist = compute_label_distribution(test_df)

    contingency = pd.DataFrame({
        "train": train_dist,
        "test": test_dist
    }).fillna(0)

    chi2, p_value, _dof, _expected = chi2_contingency(contingency)

    # Calculate Cramér's V (effect size)
    n = contingency.to_numpy().sum()
    min_dim = min(contingency.shape) - 1
    if min_dim == 0:
        # Only one label combination overall: the denominator would be zero
        # and the original formula would yield NaN with a runtime warning.
        cramers_v = 0.0
    else:
        cramers_v = np.sqrt(chi2 / (n * min_dim))

    return p_value >= alpha, p_value, chi2, cramers_v

In [None]:
def compute_next_sample_size(previous_sample_size: int, sample_increment: int, total_train: int) -> int:
    """
    Computes the next sample size for the training dataset based on the previous sample size,
    the sample increment, and the total number of training samples available.

    A previous size of 1 is followed by exactly one increment (e.g. 1 -> 25 -> 50
    with an increment of 25); any other size grows by the increment. The result
    never exceeds ``total_train``.

    :param previous_sample_size: The size of the previous training dataset.
    :param sample_increment: The number of additional samples to add in the next round.
    :param total_train: The total number of training samples available.
    :return: The computed sample size for the next training dataset.
    """
    if previous_sample_size == 1:
        candidate = sample_increment
    else:
        candidate = previous_sample_size + sample_increment

    # Clamp on every path: a pool smaller than one increment must not produce
    # a sample size larger than the pool itself.
    return min(candidate, total_train)
    

def create_training_datasets(remaining_df=remaining_df, sample_increment=25, min_train_size=1, rounds=5,
                             seed=42, max_attempts=1000):
    """
    Creates multiple training datasets of increasing size and checks 
    if they are balanced with respect to the test set.

    For every sample size, up to ``rounds`` balanced datasets are written to
    ``data/train_<size>_round_<round>.csv``. Every draw uses its own seed
    (``seed``, ``seed + 1``, ...): with one fixed seed all rounds of a size
    would be identical, and a draw rejected as imbalanced would be repeated
    forever. Different seeds do not guarantee different samples.

    The sizes start at ``min_train_size`` and stop as soon as the size reaches
    ``len(remaining_df)``; no dataset of the full size is written.

    :param remaining_df: The DataFrame to sample from for creating training datasets.
    :param sample_increment: The number of additional samples to add in each round.
    :param min_train_size: The minimum number of samples for the training dataset.
    :param rounds: The number of training datasets to create.
    :param seed: Seed of the first draw for each sample size.
    :param max_attempts: Maximum number of draws per sample size before giving
        up on finding ``rounds`` balanced datasets.
    :raises ValueError: If ``sample_increment`` is smaller than 1.
    """
    if sample_increment < 1:
        # A non-positive increment would never reach the end of the pool.
        raise ValueError("sample_increment must be at least 1")

    total_train = len(remaining_df)
    sample_size = min_train_size

    # `<` rather than `!=`: a size that overshoots the pool ends the loop
    # instead of running forever.
    while sample_size < total_train:
        _round = 0
        attempts = 0

        while _round < rounds and attempts < max_attempts:
            draw_seed = seed + attempts
            attempts += 1

            print(f"Creating training dataset for round {_round + 1} with {sample_size} samples...")
            train_df = sample_training_set(remaining_df, sample_size, seed=draw_seed)
            is_balanced, p_value, chi2, cramers_v = is_training_dataset_balanced(train_df, test_df, alpha=0.01)
        
            if not is_balanced:
                print(f"Dataset for round {_round + 1} is imbalanced (p-value: {p_value:.4f}). Skipping this round.")
                continue

            train_df.to_csv(f"data/train_{sample_size}_round_{_round}.csv", index=False)
            _round += 1
            print(f"Chi-squared: {chi2:.4f}")
            print(f"P-value: {p_value:.4f}")
            print(f"Cramér's V: {cramers_v:.4f}")

        if _round < rounds:
            print(f"Only {_round} of {rounds} balanced datasets found for {sample_size} samples "
                  f"after {max_attempts} attempts.")

        print(f"Completed round {_round}.")
        sample_size = compute_next_sample_size(sample_size, sample_increment, total_train)
        print(f"Moving to next round with {sample_size} samples...")

In [None]:
# Generate the training datasets of increasing size with the default
# settings; the CSV files are written under data/.
create_training_datasets()

#### BC-based training dataset

In [None]:
def create_training_datasets_per_bc_type(remaining_df=remaining_df, min_samples_per_type=40):
    """
    Creates one training dataset per breaking change type.

    For every BC type that occurs anywhere in remaining_df, the files whose
    label list consists of exactly that one type are selected. If enough
    such files exist, they are written to ``data/train_<bc_type>.csv``;
    otherwise the type is skipped with a message.

    :param remaining_df: The DataFrame to sample from for creating training datasets.
    :param min_samples_per_type: Minimum number of samples required for a BC type to 
        be included.
    """
    def _is_single_label(labels, wanted):
        # True only for a list that holds the wanted type and nothing else.
        return isinstance(labels, list) and len(labels) == 1 and labels[0] == wanted

    # Collect every distinct BC type mentioned in any row.
    distinct_types = set()
    for row_labels in remaining_df["bc_types"]:
        distinct_types.update(row_labels)

    for bc_type in sorted(distinct_types):
        selector = remaining_df["bc_types"].apply(
            lambda labels, wanted=bc_type: _is_single_label(labels, wanted)
        )
        train_df = remaining_df[selector]

        if len(train_df) >= min_samples_per_type:
            print(f"Creating training dataset for round {bc_type} with {len(train_df)} samples...")
            train_df.to_csv(f"data/train_{bc_type}.csv", index=False)
        else:
            print(f"Skipping BC type '{bc_type}', only {len(train_df)} samples found")


In [None]:
# Write one single-label training dataset per sufficiently frequent BC type,
# using the default settings.
create_training_datasets_per_bc_type()

## \<EOF\>