In [4]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import gc
import multiprocessing as mp
from tqdm import tqdm
from datasets import load_dataset
import polars as pl
import pandas as pd
import re


In [5]:
import nltk
# Make sure the 'punkt_tab' resource is available locally; presumably this is
# the tokenizer data word_tokenize relies on — TODO confirm for the NLTK
# version in use.
nltk.download('punkt_tab')

# word_tokenize is applied to CVE descriptions, commit messages and diffs in
# the Dataset.map() cells of this notebook.
from nltk import word_tokenize


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /cluster/home/andstorh/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [6]:
# Relative path to the PatchFinder baseline data directory.
# NOTE(review): no cell visible in this part of the notebook reads DATA_DIR;
# the save_to_disk() calls write to relative "tmp/..." paths instead — confirm
# whether the outputs were meant to live under DATA_DIR.
DATA_DIR = '../../../data/baselines/PatchFinder'

In [7]:
# CVE records (the 'desc' column is tokenized further down).
ds_cve = load_dataset('fals3/cvevc_cve')
# Commits dataset, loaded once per configuration: "patches" and "non_patches".
# Both expose 'diff' and 'commit_message' columns that are tokenized below.
ds_patches = load_dataset('fals3/cvevc_commits', "patches")
ds_nonpatches = load_dataset('fals3/cvevc_commits', "non_patches")

Resolving data files:   0%|          | 0/140 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/83 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/96 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/140 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/83 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/96 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/140 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/83 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/96 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/79 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/52 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/58 [00:00<?, ?it/s]

In [8]:
import re

def convert_to_unified_0(diff: str) -> str:
    """
    Strip context lines from a unified diff, approximating `git diff --unified=0`.

    Behaviour, line by line:

    * Lines outside any hunk (text before the first file, and every header line
      between a ``diff --git`` line and that file's first ``@@`` line) are
      copied through unchanged. The hunk state is reset at each ``diff --git``
      line so that per-file headers such as ``new file mode ...`` are kept for
      every file, not only the first one.
    * Lines starting with ``diff --git``, ``index``, ``---`` or ``+++`` are
      always kept.
    * Hunk headers of the form ``@@ -a[,b] +c[,d] @@ ...`` are rewritten to
      ``@@ -a,0 +c,0 @@``: the start positions are preserved, both counts are
      set to 0 and any text after the closing ``@@`` is discarded. Headers that
      do not match that pattern are kept verbatim.
    * Inside a hunk only added (``+``) and removed (``-``) lines are kept;
      context lines and any other markers are dropped.

    Note that the result is not byte-for-byte what git would print with
    ``--unified=0`` (git would split hunks and emit real line counts); it is a
    context-free approximation.

    Args:
        diff: Diff text as produced by ``git diff`` / ``git show``.

    Returns:
        The filtered diff with lines joined by ``"\\n"`` (no trailing newline);
        an empty string for empty input.
    """
    output_lines = []
    inside_hunk = False

    for line in diff.splitlines():
        if line.startswith("diff --git"):
            # New file section: leave hunk mode so this file's header lines
            # fall through to the final `else` branch and are preserved.
            inside_hunk = False
            output_lines.append(line)
        elif line.startswith(("index", "---", "+++")):
            output_lines.append(line)
        elif line.startswith("@@"):
            inside_hunk = True
            # Keep only the start positions; the counts are replaced by 0.
            match = re.match(r"@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@", line)
            if match:
                old_start, _, new_start, _ = match.groups()
                output_lines.append(f"@@ -{old_start},0 +{new_start},0 @@")
            else:
                output_lines.append(line)
        elif inside_hunk:
            # Within a hunk, anything that is not an addition/removal is context.
            if line.startswith(("+", "-")):
                output_lines.append(line)
        else:
            output_lines.append(line)

    return "\n".join(output_lines)

In [9]:
import re

def format_git_show_minimal(git_show_string):
    """
    Reduce `git show` / `git diff` text to file markers plus hunk content.

    For every file section the ``diff --git`` line is kept, the header lines
    that follow it are skipped, and everything from the section's first ``@@``
    line onwards is kept. Text that precedes the first ``diff --git`` line is
    discarded.

    Args:
        git_show_string: Diff text, possibly covering several files.

    Returns:
        The retained lines joined by newlines with surrounding whitespace
        stripped, or an empty string if no ``diff --git`` line is present.
    """
    kept = []
    seen_file = False   # True once the first "diff --git" line was encountered
    seen_hunk = False   # True once the current file section reached an "@@" line

    for text in git_show_string.splitlines():
        if text.startswith("diff --git"):
            # A new file section begins: keep its marker, wait for its first hunk.
            seen_file, seen_hunk = True, False
            kept.append(text)
            continue

        if not seen_file:
            # Still in the preamble before any file section.
            continue

        seen_hunk = seen_hunk or text.startswith("@@")
        if seen_hunk:
            kept.append(text)

    return "\n".join(kept).strip()


In [11]:
# Number of worker processes, passed as num_proc to the Dataset.map() calls.
num_cpus = 20

In [20]:
# Add a 'desc_token' column: the CVE description run through word_tokenize and
# re-joined with single spaces. Rows are processed one at a time across
# num_cpus worker processes.
ds_cve = ds_cve.map(
    lambda row: {"desc_token": ' '.join(word_tokenize(row["desc"]))},
    batched=False,
    num_proc=num_cpus,
)
ds_cve

DatasetDict({
    train: Dataset({
        features: ['cve', 'published_date', 'desc', 'commit_urls', 'commits', 'desc_token'],
        num_rows: 8963
    })
    test: Dataset({
        features: ['cve', 'published_date', 'desc', 'commit_urls', 'commits', 'desc_token'],
        num_rows: 1366
    })
    validation: Dataset({
        features: ['cve', 'published_date', 'desc', 'commit_urls', 'commits', 'desc_token'],
        num_rows: 1607
    })
})

In [21]:
# Persist the tokenized CVE dataset (all splits) under the relative path tmp/ds_cve.
ds_cve.save_to_disk("tmp/ds_cve")

Saving the dataset (0/1 shards):   0%|          | 0/8963 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1366 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1607 [00:00<?, ? examples/s]

In [13]:
# Add two tokenized columns to every patch commit:
#   diff_token - the diff passed through convert_to_unified_0 and
#                format_git_show_minimal, cut to its first 1000 lines, then
#                word-tokenized and re-joined with single spaces
#   msg_token  - the commit message, word-tokenized and re-joined the same way
ds_patches = ds_patches.map(
    lambda row: {
        "diff_token": ' '.join(
            word_tokenize(
                ''.join(
                    format_git_show_minimal(
                        convert_to_unified_0(row["diff"])
                    ).splitlines(keepends=True)[:1000]
                )
            )
        ),
        "msg_token": ' '.join(word_tokenize(row["commit_message"])),
    },
    batched=False,
    num_proc=num_cpus,
)

Map (num_proc=20):   0%|          | 0/11620 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/1453 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/1453 [00:00<?, ? examples/s]

In [14]:
# Drop the raw text columns, leaving the tokenized diff_token / msg_token
# columns, then display the resulting DatasetDict.
ds_patches = ds_patches.remove_columns(["commit_message", "diff"])
ds_patches

DatasetDict({
    train: Dataset({
        features: ['commit_id', 'repo', 'label', 'diff_token', 'msg_token'],
        num_rows: 11620
    })
    test: Dataset({
        features: ['commit_id', 'repo', 'label', 'diff_token', 'msg_token'],
        num_rows: 1453
    })
    validation: Dataset({
        features: ['commit_id', 'repo', 'label', 'diff_token', 'msg_token'],
        num_rows: 1453
    })
})

In [15]:
# Persist the tokenized patch commits (all splits) under the relative path tmp/ds_patches.
ds_patches.save_to_disk("tmp/ds_patches")

Saving the dataset (0/1 shards):   0%|          | 0/11620 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1453 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1453 [00:00<?, ? examples/s]

In [16]:
# Add two tokenized columns to every non-patch commit:
#   diff_token - the diff passed through convert_to_unified_0 and
#                format_git_show_minimal, cut to its first 1000 lines, then
#                word-tokenized and re-joined with single spaces
#   msg_token  - the commit message, word-tokenized and re-joined the same way
ds_nonpatches = ds_nonpatches.map(
    lambda row: {
        "diff_token": ' '.join(
            word_tokenize(
                "".join(
                    format_git_show_minimal(
                        convert_to_unified_0(row["diff"])
                    ).splitlines(keepends=True)[:1000]
                )
            )
        ),
        "msg_token": ' '.join(word_tokenize(row["commit_message"])),
    },
    batched=False,
    num_proc=num_cpus,
)

Map (num_proc=20):   0%|          | 0/3632163 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/2150904 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/2485831 [00:00<?, ? examples/s]

In [14]:
# Drop the raw text columns, leaving the tokenized diff_token / msg_token
# columns, then display the resulting DatasetDict.
# NOTE(review): this cell carries the same execution count (In [14]) as the
# ds_patches remove_columns cell, and the row counts in its saved output differ
# from the totals shown by the preceding map progress bars, so the displayed
# output may stem from a different run — confirm with Restart & Run All.
ds_nonpatches = ds_nonpatches.remove_columns(["commit_message", "diff"])
ds_nonpatches

DatasetDict({
    test: Dataset({
        features: ['commit_id', 'owner', 'repo', 'label', 'diff_token', 'msg_token'],
        num_rows: 2131395
    })
    validation: Dataset({
        features: ['commit_id', 'owner', 'repo', 'label', 'diff_token', 'msg_token'],
        num_rows: 2467950
    })
    train: Dataset({
        features: ['commit_id', 'owner', 'repo', 'label', 'diff_token', 'msg_token'],
        num_rows: 3624262
    })
})

In [17]:
# Persist the tokenized non-patch commits (all splits) under the relative path tmp/ds_nonpatches.
ds_nonpatches.save_to_disk("tmp/ds_nonpatches")

Saving the dataset (0/108 shards):   0%|          | 0/3632163 [00:00<?, ? examples/s]

Saving the dataset (0/66 shards):   0%|          | 0/2150904 [00:00<?, ? examples/s]

Saving the dataset (0/74 shards):   0%|          | 0/2485831 [00:00<?, ? examples/s]