# Commit Chronicle Dataset

This notebook investigates the [Commit Chronicle dataset](https://huggingface.co/datasets/JetBrains-Research/commit-chronicle), introduced in the paper ["From Commit Message Generation to History-Aware Commit Message Completion", ASE 2023](https://arxiv.org/abs/2308.07655). It covers loading, filtering, exploratory data analysis (EDA), and preprocessing.

In [None]:
import multiprocessing as mp
from functools import partial

import rootutils
from datasets import load_dataset, load_from_disk

In [None]:
# Resolve the project root with rootutils, starting from the current directory and using the
# `.project-root` marker file. NOTE(review): `pythonpath=True` is presumably what makes the
# later `src.*` import resolvable — confirm against the rootutils documentation.
ROOT = rootutils.setup_root(".", ".project-root", pythonpath=True)
# Directory under which this notebook caches its intermediate (filtered / processed) datasets.
OUTPUT_DIR = ROOT / "data/playground"

## Loading and Filtering

Note: Filtering logic is implemented in `CommitChroniclePreprocessor`

In [None]:
# Run this cell to continue with the rest of this notebook.

SPLIT = "validation"  # small enough for EDA; feel free to switch to the `train` split
LANGUAGES = ["Go"]

# Derive the cache directory from SPLIT so that switching splits never silently reloads a
# dataset that was cached for a different split. For the default split this resolves to the
# same "01-filtered-validation" directory as before, so existing caches stay valid.
# NOTE: LANGUAGES is not part of the cache key; delete the directory after changing it.
filtered = OUTPUT_DIR / f"01-filtered-{SPLIT}"


# `languages` is passed in explicitly (bound with `functools.partial` at the call site) instead
# of reading the module-level `LANGUAGES`: functions handed to Python multiprocessing should
# not depend on variables defined outside of them.
def filter_dataset(example, languages):
    """Return True when the example's `language` field is one of `languages`."""
    language = example["language"]
    return language in languages


# Build the filtered cache only once; later runs go straight to loading it from disk.
if not filtered.exists():
    raw_split = load_dataset("JetBrains-Research/commit-chronicle", "default", split=SPLIT)
    keep_language = partial(filter_dataset, languages=LANGUAGES)
    raw_split.filter(keep_language, num_proc=mp.cpu_count()).save_to_disk(filtered)
dataset = load_from_disk(filtered)

In [None]:
dataset.select(range(10)).to_pandas()

In [None]:
dataset.select(range(1))[0]["mods"][0].keys()

In [None]:
dataset.select(range(1))

## EDA

### Column names

These are the columns we have in our dataset and an example of each

In [None]:
# Expected columns:
# ['author','date','timezone','hash','message','mods','language','license','repo','original_message']
print(dataset.column_names)

# Print the first 10 commits, one example per print call.
subset = dataset.select(range(10))
for row in subset:
    print(row)

# `mods` holds the changes made to the different files of a commit, so its length varies;
# the entries need to be appended together accordingly.
subset[0]["mods"]

In [None]:
df = dataset.to_pandas()
# Commits whose file modifications are ALL of type ADD. The explicit length check is needed
# because `all()` over an empty `mods` is vacuously True, which would otherwise classify a
# commit without any recorded modification as "ADD only".
add_changes_df = df[
    df["mods"].apply(
        lambda mods: len(mods) > 0 and all(mod["change_type"] == "ADD" for mod in mods)
    )
]
# Number of ADD-only commits (about 3400 when this was written); they can be used as a start
# to train our first model.
print(len(add_changes_df))

In [None]:
# Commits whose file modifications are ALL of type MODIFY. The explicit length check is needed
# because `all()` over an empty `mods` is vacuously True, which would otherwise classify a
# commit without any recorded modification as "MODIFY only".
modify_changes_df = df[
    df["mods"].apply(
        lambda mods: len(mods) > 0 and all(mod["change_type"] == "MODIFY" for mod in mods)
    )
]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


def plot_change_types(df):
    """
    Plots the distribution of change types (e.g., MODIFY, ADD, DELETE).

    Every file modification of every commit is counted once, so a commit that touches
    three files contributes three entries to the distribution.

    Args:
        df: DataFrame with a `mods` column; each cell is an iterable of dicts that carry
            a `change_type` key.
    """
    change_types = (
        df["mods"]
        .apply(lambda mods: [mod["change_type"] for mod in mods])
        .explode()
        .value_counts()
    )

    plt.figure(figsize=(8, 6))
    # Passing `palette` without `hue` is deprecated in recent seaborn releases. Assigning the
    # categorical variable to `hue` and hiding the legend draws the same per-bar colors and
    # matches the `hue=..., legend=False` form used by the other bar plots in this notebook.
    sns.barplot(
        x=change_types.values,
        y=change_types.index,
        hue=change_types.index,
        palette="coolwarm",
        legend=False,
    )
    plt.title("Distribution of Change Types")
    plt.xlabel("Number of Changes")
    plt.ylabel("Change Type")
    plt.show()


plot_change_types(df)  # We have a lot of MODIFY changes, ADD and DELETE are less frequent

In [None]:
import string
from collections import Counter


def analyze_commit_messages(df):
    """
    Analyzes and plots the most common words in commit messages.

    Messages are joined, stripped of ASCII punctuation, lower-cased and split on whitespace.
    Stop words are NOT removed here. When no word is left after cleaning (e.g. an empty
    frame), a notice is printed and nothing is plotted.

    Args:
        df: DataFrame with a `message` column of strings; missing values are skipped.
    """
    # Combine all messages
    all_messages = " ".join(df["message"].dropna().tolist())

    # Remove punctuation
    all_messages = all_messages.translate(str.maketrans("", "", string.punctuation))

    # Tokenize and count
    tokens = all_messages.lower().split()
    common_words = Counter(tokens).most_common(20)

    # Unpacking `zip(*common_words)` below raises ValueError on an empty list, so bail out early.
    if not common_words:
        print("No words found in commit messages; nothing to plot.")
        return

    # Plot
    top_words, counts = zip(*common_words)
    plt.figure(figsize=(12, 6))
    sns.barplot(x=list(top_words), y=list(counts), hue=list(top_words), legend=False)
    plt.title("Top 20 Common Words in Commit Messages")
    plt.xlabel("Words")
    plt.ylabel("Frequency")
    plt.xticks(rotation=45)
    plt.show()


analyze_commit_messages(df)
analyze_commit_messages(add_changes_df)

In [None]:
import nltk
import numpy as np
import pandas as pd

nltk.download("stopwords", quiet=True)


# Built once instead of on every call: `preprocess_text` runs for each commit message.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a set, reading the corpus only on first use."""
    if "english" not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE["english"] = frozenset(nltk.corpus.stopwords.words("english"))
    return _STOP_WORDS_CACHE["english"]


def preprocess_text(text):
    """
    Lower-case `text`, strip ASCII punctuation, split on whitespace and drop English stop words.

    Args:
        text: The raw text (e.g. a commit message).

    Returns:
        The list of remaining words, in their original order.
    """
    cleaned_text = text.lower().translate(_PUNCTUATION_TABLE)
    stop_words = _english_stop_words()
    return [word for word in cleaned_text.split() if word not in stop_words]


def get_word_counts(df):
    """Count the preprocessed words over all non-null commit messages in `df`."""
    word_counts = Counter()
    for message in df["message"].dropna():
        word_counts.update(preprocess_text(message))
    return word_counts


# Fixed seed so the sampled baselines (and every plot/metric derived from them) are
# reproducible across "Restart & Run All".
SAMPLE_SEED = 42

# Compare equally sized groups: draw as many commits as there are ADD-only commits. The
# MODIFY-only sample is capped at the number of MODIFY-only commits available, because
# `DataFrame.sample` raises when asked for more rows than exist (sampling without replacement).
n_add_commits = len(add_changes_df)
word_counts_original = get_word_counts(df.sample(n_add_commits, random_state=SAMPLE_SEED))
word_counts_add = get_word_counts(add_changes_df)
word_counts_modify = get_word_counts(
    modify_changes_df.sample(
        min(n_add_commits, len(modify_changes_df)), random_state=SAMPLE_SEED
    )
)


def counter_to_df(counter, title):
    """Build a (`word`, `count`, `dataset`) frame from the 20 most common entries of `counter`.

    The `dataset` column carries `title` on every row so that several frames can be
    concatenated and told apart afterwards.
    """
    top_words = counter.most_common(20)
    return pd.DataFrame(top_words, columns=["word", "count"]).assign(dataset=title)


# One top-20 table per group, tagged with the label shown in the legend.
labelled_counters = [
    (word_counts_original, "All Commits"),
    (word_counts_add, "Only ADD Commits"),
    (word_counts_modify, "Only MODIFY Commits"),
]
df_original_words, df_add_words, df_modify_words = (
    counter_to_df(counter, label) for counter, label in labelled_counters
)

df_combined = pd.concat([df_original_words, df_add_words, df_modify_words])

plt.figure(figsize=(14, 8))
sns.barplot(data=df_combined, x="word", y="count", hue="dataset")

plt.title("Top 20 Common Words in Commit Messages")
plt.xlabel("Words")
plt.ylabel("Frequency")
plt.legend(title="Dataset")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# The most common words differ between ADD-only and MODIFY-only commits.
# This is supported by the fact that MODIFY commits look more similar to the overall set of
# commits, as MODIFY changes are the more frequent ones.

In [None]:
from scipy.stats import entropy

# Shared vocabulary: every word that occurs in either of the two word counts.
vocabulary = set(word_counts_original) | set(word_counts_add)

# A tiny additive constant keeps every probability strictly positive, so the divergence stays
# finite for words that occur in only one of the two groups.
epsilon = 1e-10
counts_original = (
    np.array([word_counts_original.get(word, 0) for word in vocabulary], dtype=np.float64)
    + epsilon
)
counts_add = (
    np.array([word_counts_add.get(word, 0) for word in vocabulary], dtype=np.float64) + epsilon
)

# Normalize the smoothed counts into probability distributions over the shared vocabulary.
prob_original = counts_original / counts_original.sum()
prob_add = counts_add / counts_add.sum()

# The divergence is computed in both directions because the two values generally differ.
kl_divergence = entropy(prob_original, prob_add)
kl_divergence_reverse = entropy(prob_add, prob_original)

print(f"KL Divergence (All Commits || Only ADD Commits): {kl_divergence:.4f}")
print(f"KL Divergence (Only ADD Commits || All Commits): {kl_divergence_reverse:.4f}")

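A KL divergence of zero means the two word distributions are identical; the higher the value, the more strongly the distributions differ. The measure is asymmetric, which is why both directions are reported.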

## Preprocessing

Columns of interest in the dataset are:
1. `mods` - Contains all file changes information - what files are changed, the type of change made (addition, modification), and the exact file changes.
2. `message` - The (processed) git commit message
3. `author` - (Optional) This will be used if we want to group commits by a certain author and use that as input. This is an advanced use case.

We are going to tokenize `mods` and `message` using two different tokenizers, since `mods` contains code, which is quite different from `message`, which is mostly natural language. So, one tokenizer for `mods`, another for `message`.

We'll start with `message`. The output from the tokenization of `message` will be called `msg_input_ids`.

Note: All the preprocessing logic explored here is implemented in `CommitChroniclePreprocessor`.

In [None]:
from transformers import AutoTokenizer

from src.data.components.tokenization import add_special_tokens

# This is the tokenizer used in the Commit Chronicle dataset (the CodeT5 base checkpoint).
# TODO: the rationale behind this choice has not been investigated yet — someone could
# investigate and report :)
# Alternatively, we may have to train our own tokenizer, as suggested by ChatGPT
# (https://chatgpt.com/share/672e3b64-6b84-8009-a6c9-adac73cf647e).
msg_tokenizer_ = AutoTokenizer.from_pretrained("Salesforce/codet5-base")

# Add `sep_token` and `pad_token`.
# `sep_token` is necessary when we are training on a history of git diffs (which is an advanced
# usage and not part of our initial experiments).
# `pad_token` is necessary for correct batch construction.
# NOTE(review): the meaning of the second argument (`None`) is defined by the project helper
# `add_special_tokens` in `src.data.components.tokenization` — confirm there.
msg_tokenizer_ = add_special_tokens(msg_tokenizer_, None)

Let's try out commit message tokenization on a single example

In [None]:
# Tokenize the first commit message as-is: no truncation, no padding, no special tokens.
first_message = dataset[0]["message"]
encoded_message = msg_tokenizer_(
    first_message,
    truncation=False,
    padding=False,
    add_special_tokens=False,
)
msg_input_ids_ = encoded_message.input_ids

print(first_message)
print(msg_input_ids_)

Next, we'll look at the tokenization of git commit changes, `mods`. But before we do that, let's examine the structure of the data.

In [None]:
dataset[0]["mods"]

We'll need to somehow combine all that information into a single string before tokenization.

In [None]:
def preprocess_mods(mods: list[dict[str, str]], line_sep: str) -> str:
    """
    Flattens all file modifications of one commit into a single string.

    Each file diff is prefixed with a header line modelled after git's patch headers
    (https://git-scm.com/docs/git-diff#_generating_patch_text_with_p); the resulting
    `header + line_sep + diff` chunks are concatenated in order, without any extra separator
    between files. Modifications whose change type is `UNKNOWN` are skipped.

    Args:
        mods: A list of files modifications made in a commit.
        line_sep: The line separator placed between a header and its diff (and between the
            two lines of a rename/copy header).

    Returns:
        A single string representation of all files modifications made in a commit.
    """
    # Header builders for the change types that need more than the bare new path.
    header_builders = {
        "ADD": lambda mod: f"new file {mod['new_path']}",
        "DELETE": lambda mod: f"deleted file {mod['old_path']}",
        "RENAME": lambda mod: f"rename from {mod['old_path']}{line_sep}rename to {mod['new_path']}",
        "COPY": lambda mod: f"copy from {mod['old_path']}{line_sep}copy to {mod['new_path']}",
    }

    chunks = []
    for mod in mods:
        change_type = mod["change_type"]
        if change_type == "UNKNOWN":
            continue
        build_header = header_builders.get(change_type)
        # Any other change type (e.g. MODIFY) is introduced by the new path alone.
        header = build_header(mod) if build_header is not None else f"{mod['new_path']}"
        chunks.append(header + line_sep + mod["diff"])

    return "".join(chunks)


# Let's test it out
print(preprocess_mods(dataset[0]["mods"], line_sep="\n"))
print(dataset[0]["message"])

Let's analyze the length of commit messages and diffs in the dataset.

In [None]:
def analyze_commit_lengths(dataset):
    """Analyze lengths of commit messages and diffs, with examples of extremes.

    Lengths are measured in characters. A diff is the string produced by `preprocess_mods`
    with a newline separator.

    Args:
        dataset: Non-empty iterable of commit mappings providing `message`, `mods` and the
            commit identifier under `hash` (the dataset's column name; `sha` is accepted as
            a fallback).

    Returns:
        A `(stats, extremes)` tuple. `stats` is a DataFrame with the average, minimum,
        maximum and median message/diff length (rounded to 2 decimals). `extremes` maps
        `longest_message`, `shortest_message`, `longest_diff` and `shortest_diff` to dicts
        with the keys `length`, `content` and `sha` (the commit identifier).

    Raises:
        ValueError: If `dataset` contains no commits.
    """
    # Process each commit to get lengths
    messages = []
    diffs = []

    for commit in dataset:
        # The dataset stores the commit identifier in the `hash` column; looking up `sha`
        # alone always yielded None. The output key stays `sha` so existing consumers of
        # the returned dicts keep working.
        commit_id = commit.get("hash", commit.get("sha"))

        # Get message length (a missing message counts as empty instead of crashing)
        message = commit.get("message", "")
        if message is None:
            message = ""
        if isinstance(message, str):
            msg_len = len(message)
        else:
            msg_len = len(" ".join(map(str, message)))
        messages.append({"length": msg_len, "content": message, "sha": commit_id})

        # Get diff length
        diff = preprocess_mods(commit.get("mods", []), line_sep="\n")
        diffs.append({"length": len(diff), "content": diff, "sha": commit_id})

    if not messages:
        # max()/min() below would fail with an unhelpful "empty sequence" message.
        raise ValueError("analyze_commit_lengths() requires at least one commit")

    # Find extremes
    max_msg = max(messages, key=lambda x: x["length"])
    min_msg = min(messages, key=lambda x: x["length"])
    max_diff = max(diffs, key=lambda x: x["length"])
    min_diff = min(diffs, key=lambda x: x["length"])

    # Calculate statistics
    msg_lengths = [m["length"] for m in messages]
    diff_lengths = [d["length"] for d in diffs]

    stats = pd.DataFrame(
        {
            "Metric": ["Average", "Minimum", "Maximum", "Median"],
            "Message Length": [
                np.mean(msg_lengths),
                np.min(msg_lengths),
                np.max(msg_lengths),
                np.median(msg_lengths),
            ],
            "Diff Length": [
                np.mean(diff_lengths),
                np.min(diff_lengths),
                np.max(diff_lengths),
                np.median(diff_lengths),
            ],
        }
    )

    # Round statistics (every column except the "Metric" labels)
    stats.iloc[:, 1:] = stats.iloc[:, 1:].round(2)

    return stats, {
        "longest_message": max_msg,
        "shortest_message": min_msg,
        "longest_diff": max_diff,
        "shortest_diff": min_diff,
    }


# Example usage
stats, examples = analyze_commit_lengths(dataset)

print("Statistics:")
print(stats)

In [None]:
from copy import deepcopy

# The diff tokenizer is simply a duplicate of the message tokenizer here, but it could be a
# completely different one.
diff_tokenizer_ = deepcopy(msg_tokenizer_)
# Diffs can be very long, so we need a limit that our model (and computer resources) can handle.
DIFF_MAX_LEN = 512

# Again, let's test it on the first commit. Two positions are kept free for the special tokens
# (BOS and EOS) that are added later, during batch data construction.
git_diff_ = preprocess_mods(dataset[0]["mods"], line_sep="\n")
encoded_diff = diff_tokenizer_(
    git_diff_,
    truncation=True,
    max_length=DIFF_MAX_LEN - 2,
    padding=False,
    add_special_tokens=False,
)
diff_input_ids_ = encoded_diff.input_ids
print(diff_input_ids_[:100], len(diff_input_ids_))

In [None]:
diff_tokenizer_.vocab_size

In [None]:
diff_tokenizer_.unk_token_id

In [None]:
def analyze_unknown_tokens(dataset, msg_tokenizer, diff_tokenizer, DIFF_MAX_LEN=512):
    """
    Analyze the percentage of unknown tokens in commit messages and diffs.

    Messages are tokenized without truncation and diffs are truncated to
    `DIFF_MAX_LEN - 2` tokens, mirroring how the two columns are tokenized in the rest
    of this notebook.

    Parameters:
    -----------
    dataset : list
        List of commit dictionaries (each providing `message` and `mods`)
    msg_tokenizer : tokenizer
        Tokenizer for commit messages
    diff_tokenizer : tokenizer
        Tokenizer for diffs
    DIFF_MAX_LEN : int, optional
        Maximum length for diff tokenization (not applied to messages)

    Returns:
    --------
    pandas.DataFrame
        A DataFrame with unknown token statistics: one row for messages and one for diffs,
        with the columns `Type`, `Total Tokens`, `Unknown Tokens` and `UNK Percentage`
    """

    def count_unknown_tokens(tokenizer, texts, max_length=None):
        """Count total and UNK tokens over `texts`; truncate only when `max_length` is given."""
        # Nothing to tokenize: report zeros instead of calling the tokenizer on an empty batch.
        if not texts:
            return {"total_tokens": 0, "unknown_tokens": 0, "unk_percentage": 0}

        # Get the UNK token ID
        unk_token_id = tokenizer.unk_token_id

        # Tokenize texts
        if max_length is None:
            encoding = tokenizer(
                texts,
                truncation=False,
                padding=False,
                add_special_tokens=False,
            )
        else:
            encoding = tokenizer(
                texts,
                truncation=True,
                max_length=max_length,
                padding=False,
                add_special_tokens=False,
            )
        tokenized_texts = encoding.input_ids

        # Calculate UNK token percentages
        total_tokens = sum(len(tokens) for tokens in tokenized_texts)
        unknown_tokens = sum(
            sum(1 for token in tokens if token == unk_token_id) for tokens in tokenized_texts
        )

        return {
            "total_tokens": total_tokens,
            "unknown_tokens": unknown_tokens,
            "unk_percentage": (unknown_tokens / total_tokens * 100) if total_tokens > 0 else 0,
        }

    # Prepare texts (a missing message is treated as an empty string)
    messages = [commit.get("message") or "" for commit in dataset]
    diffs = [preprocess_mods(commit["mods"], line_sep="\n") for commit in dataset]

    # Analyze messages and diffs. Only diffs are truncated: the length limit is a diff limit,
    # and cutting messages would skew their statistics.
    msg_unk_stats = count_unknown_tokens(msg_tokenizer, messages)
    diff_unk_stats = count_unknown_tokens(diff_tokenizer, diffs, max_length=DIFF_MAX_LEN - 2)

    # Create DataFrame
    unk_analysis = pd.DataFrame(
        {
            "Type": ["Messages", "Diffs"],
            "Total Tokens": [msg_unk_stats["total_tokens"], diff_unk_stats["total_tokens"]],
            "Unknown Tokens": [msg_unk_stats["unknown_tokens"], diff_unk_stats["unknown_tokens"]],
            "UNK Percentage": [
                round(msg_unk_stats["unk_percentage"], 2),
                round(diff_unk_stats["unk_percentage"], 2),
            ],
        }
    )

    return unk_analysis


# Example usage
unk_analysis = analyze_unknown_tokens(dataset, msg_tokenizer_, diff_tokenizer_)
print(unk_analysis)

Let's put everything together to process `mods` and `message` columns for all rows in the dataset.

In [None]:
def process_example(
    example,
    msg_tokenizer,
    diff_tokenizer,
    diff_max_len,
    diff_line_sep,
    preprocess_mods_func,
):
    """Tokenize the commit message and the flattened diff of a single dataset row.

    The message is tokenized in full. The diff — built by `preprocess_mods_func` from the
    row's `mods` — is truncated to `diff_max_len - 2` tokens. Neither tokenization adds
    special tokens or padding.

    Returns:
        A dict with `author`, `message`, `repo` and `language` copied from `example`, plus
        the token ids under `msg_input_ids` and `diff_input_ids`.
    """
    shared_kwargs = {"padding": False, "add_special_tokens": False}

    msg_input_ids = msg_tokenizer(example["message"], truncation=False, **shared_kwargs).input_ids

    # We unfortunately have to truncate the git changes. Two positions are left free for the
    # special tokens (BOS and EOS) to be added later, during batch data construction.
    token_budget = diff_max_len - 2
    git_diff = preprocess_mods_func(example["mods"], line_sep=diff_line_sep)
    diff_input_ids = diff_tokenizer(
        git_diff, truncation=True, max_length=token_budget, **shared_kwargs
    ).input_ids

    processed_row = {
        "author": example["author"],
        "message": example["message"],
        "msg_input_ids": msg_input_ids,
        "diff_input_ids": diff_input_ids,
        "repo": example["repo"],
        "language": example["language"],
    }
    return processed_row


# Derive the cache directory from SPLIT so that a dataset processed for one split is never
# silently reloaded for another. For the default split this resolves to the same
# "02-processed-validation" directory as before, so existing caches stay valid.
processed = OUTPUT_DIR / f"02-processed-{SPLIT}"
if not processed.exists():
    (
        dataset.map(
            # Everything the worker processes need is bound explicitly via `partial`.
            partial(
                process_example,
                msg_tokenizer=msg_tokenizer_,
                diff_tokenizer=diff_tokenizer_,
                diff_max_len=DIFF_MAX_LEN,
                diff_line_sep="\n",
                preprocess_mods_func=preprocess_mods,
            ),
            num_proc=mp.cpu_count(),
        )
        .select_columns(["author", "msg_input_ids", "diff_input_ids", "language", "repo"])
        .save_to_disk(processed)
    )
# NOTE: this rebinds `dataset` to the processed version, which only keeps the columns selected
# above; re-run the loading cell to get the raw commits (with `message` and `mods`) back.
dataset = load_from_disk(processed)

In [None]:
dataset.select(range(10)).to_pandas()