In [1]:
from datasets import load_dataset

In [2]:
# Configuration: the language subset to load and the suffixes of the
# duplicate-marker columns that are checked below.
language = 'Java'
stack_v1_name = 'stackv1'
stack_v2_name = 'stackv2'

ds = load_dataset("AISE-TUDelft/the-heap", language, split="train", num_proc=16)


def _not_duplicated_in(stack_name):
    """Build a row predicate that is true only when both the
    `exact_duplicates_<stack_name>` and `near_duplicates_<stack_name>`
    values of the row are falsy."""
    exact_col = f'exact_duplicates_{stack_name}'
    near_col = f'near_duplicates_{stack_name}'
    return lambda row: not (row[exact_col] or row[near_col])


# Apply the two filters one after the other, stackv2 first and then stackv1.
for _stack_name in (stack_v2_name, stack_v1_name):
    ds = ds.filter(_not_duplicated_in(_stack_name))

Resolving data files:   0%|          | 0/64 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/70 [00:00<?, ?it/s]

In [3]:
# Keep at most the first 3,343,312 rows of the filtered dataset. The bound is
# clamped to len(ds) because `select` raises IndexError for indices past the
# end, which would happen if the filtered dataset ever has fewer rows.
new_ds = ds.select(range(min(3343312, len(ds))))

In [4]:
from detection.annotate_code import annotate_code

In [5]:
def _detect_language(row):
    """Return the new "language_detected" column value for one row, computed
    by passing the row's "content" to `annotate_code`."""
    return {"language_detected": annotate_code(row["content"])}


# Annotate every row using 16 worker processes.
new_ds = new_ds.map(_detect_language, num_proc=16)

In [6]:
# Working copy used by all later cells: at most the first 3,343,312 annotated
# rows. The bound is clamped to len(new_ds) so `select` cannot raise
# IndexError when the annotated dataset is shorter than the hard-coded count.
test_ds = new_ds.select(range(min(3343312, len(new_ds))))

In [7]:
from detection.annotate_code import get_numerical_data_from_code

In [8]:
from datasets import Features, Value, Sequence

In [9]:
# Schema for the extra lang_* columns. For each of identifiers / comments /
# strings there are three list-valued columns: lang_<kind> (strings),
# lang_max_<kind> (int64) and lang_freq_<kind> (int64).
numeric_features = Features({
    column: Sequence(Value(dtype))
    for kind in ("identifiers", "comments", "strings")
    for column, dtype in (
        (f"lang_{kind}", "string"),
        (f"lang_max_{kind}", "int64"),
        (f"lang_freq_{kind}", "int64"),
    )
})

In [10]:
# Start from a copy of the dataset's current schema and add each lang_* column
# definition to it, leaving test_ds.features itself untouched.
full_features = test_ds.features.copy()
for column_name in numeric_features.keys():
    full_features[column_name] = numeric_features[column_name]

In [11]:
def add_numerical_fields(example):
    """Compute the extra annotation fields for a single dataset row.

    Passes the row's "content" and "language_detected" values to
    `get_numerical_data_from_code` and returns that call's result unchanged.
    """
    return get_numerical_data_from_code(
        example["content"], example["language_detected"]
    )

In [12]:
# Run add_numerical_fields on every row in 16 worker processes;
# `features=full_features` declares the output schema explicitly.
test_ds = test_ds.map(
    add_numerical_fields,
    features=full_features,
    num_proc=16,
)

In [49]:
from pathlib import Path


In [50]:
import os

# Root directory under which every dataset in this notebook is saved.
# It can be overridden with the HEAP_OUTPUT_ROOT environment variable so the
# notebook is not tied to one machine's mount point; when the variable is
# unset or empty the original external-volume path is used.
# NOTE(review): the recorded output of this notebook shows
# '/Volumes/Bogdan Personal Backup', which does not match this default -
# confirm which path is intended.
external_root = Path(
    os.environ.get("HEAP_OUTPUT_ROOT") or Path("/Volumes") / "Personal Backup"
)

In [51]:
# Display the resolved output root so it can be checked before anything is saved.
external_root

PosixPath('/Volumes/Bogdan Personal Backup')

In [26]:
# Destination directory for the fully annotated dataset.
save_path = external_root.joinpath(f"heap_{language}_annotated")

In [28]:
# Write the annotated dataset to `save_path` (a directory under external_root).
test_ds.save_to_disk(save_path)

Saving the dataset (0/243 shards):   0%|          | 0/3343312 [00:00<?, ? examples/s]

In [13]:
from analysis.lang_stats import compute_language_stats, compute_single_language_stats, compute_multi_language_stats, compute_single_language_in_files, get_stats,get_file_stats

In [14]:
# Run compute_language_stats over the dataset in 16 worker processes.
# batch_size=None passes each worker's whole shard as a single batch (per the
# original author's note this yields one row per shard), and remove_columns
# drops every input column so only the function's output is kept in memory.
language_stats_ds = test_ds.map(
    compute_language_stats,
    batched=True,
    batch_size=None,
    num_proc=16,
    remove_columns=test_ds.column_names,
)
# Pull the small per-shard results into the notebook process as a list of dicts.
partials = language_stats_ds.to_list()

In [15]:
# Print the "comments" entry of the stats that get_stats builds from `partials`.
comment_stats = get_stats(partials)["comments"]
print(comment_stats)


{'languages': ['en', 'pt', 'zh', 'gl', 'de', 'fr', 'es', 'da', 'it', 'ja'], 'freq': [23602848, 798878, 789565, 277939, 215848, 209110, 200725, 171161, 145063, 132797], 'max_size': [1884220, 137960, 742173, 117202, 122307, 517352, 211735, 221410, 40593, 1881287], 'file_count': [2526777, 103371, 122138, 47503, 67221, 62436, 47787, 84464, 44448, 37857]}


In [16]:
# Print the "identifiers" entry of the stats that get_stats builds from `partials`.
identifier_stats = get_stats(partials)["identifiers"]
print(identifier_stats)

{'languages': ['en', 'es', 'it', 'fr', 'de', 'pt', 'nl', 'cs', 'sv', 'ca'], 'freq': [35246633, 175845, 159276, 152777, 92865, 84063, 51278, 36386, 34444, 33943], 'max_size': [493, 123, 188, 74, 134, 68, 83, 55, 80, 34], 'file_count': [3057990, 53426, 54901, 70130, 26180, 33051, 19350, 20391, 21379, 17211]}


In [17]:
# Print the "strings" entry of the stats that get_stats builds from `partials`.
string_stats = get_stats(partials)["strings"]
print(string_stats)

{'languages': ['en', 'zh', 'da', 'de', 'fr', 'es', 'pt', 'it', 'nl', 'la'], 'freq': [17879347, 249762, 236074, 225174, 187846, 139949, 138466, 136324, 131645, 128011], 'max_size': [606887, 5036, 606887, 70426, 65534, 29903, 47691, 36034, 606887, 65534], 'file_count': [1684818, 46202, 84491, 49549, 62485, 33791, 45700, 32977, 44203, 50207]}


In [18]:
# Print the complete stats dict (identifiers, comments and strings together).
all_stats = get_stats(partials)
print(all_stats)

{'identifiers': {'languages': ['en', 'es', 'it', 'fr', 'de', 'pt', 'nl', 'cs', 'sv', 'ca'], 'freq': [35246633, 175845, 159276, 152777, 92865, 84063, 51278, 36386, 34444, 33943], 'max_size': [493, 123, 188, 74, 134, 68, 83, 55, 80, 34], 'file_count': [3057990, 53426, 54901, 70130, 26180, 33051, 19350, 20391, 21379, 17211]}, 'comments': {'languages': ['en', 'pt', 'zh', 'gl', 'de', 'fr', 'es', 'da', 'it', 'ja'], 'freq': [23602848, 798878, 789565, 277939, 215848, 209110, 200725, 171161, 145063, 132797], 'max_size': [1884220, 137960, 742173, 117202, 122307, 517352, 211735, 221410, 40593, 1881287], 'file_count': [2526777, 103371, 122138, 47503, 67221, 62436, 47787, 84464, 44448, 37857]}, 'strings': {'languages': ['en', 'zh', 'da', 'de', 'fr', 'es', 'pt', 'it', 'nl', 'la'], 'freq': [17879347, 249762, 236074, 225174, 187846, 139949, 138466, 136324, 131645, 128011], 'max_size': [606887, 5036, 606887, 70426, 65534, 29903, 47691, 36034, 606887, 65534], 'file_count': [1684818, 46202, 84491, 49549,

In [19]:
# Same map pattern as for the overall stats, but with
# compute_single_language_stats: 16 worker processes, each worker's whole
# shard passed as one batch (batch_size=None), and all input columns dropped
# so only the function's output is kept in memory.
single_language_stats_ds = test_ds.map(
    compute_single_language_stats,
    batched=True,
    batch_size=None,
    num_proc=16,
    remove_columns=test_ds.column_names,
)
# Pull the small per-shard results into the notebook process as a list of dicts.
single_partials = single_language_stats_ds.to_list()

In [20]:
# Print the "comments" entry of the stats built from `single_partials`.
single_comment_stats = get_stats(single_partials)["comments"]
print(single_comment_stats)

{'languages': ['en', 'zh', 'gl', 'fr', 'de', 'es', 'ja', 'pt', 'it', 'ru'], 'freq': [21769559, 702291, 258982, 141094, 139693, 135518, 115763, 100257, 94376, 79754], 'max_size': [647658, 9833, 610, 5450, 28553, 6934, 92094, 5278, 7022, 29439], 'file_count': [2432676, 104145, 36333, 28398, 25640, 25675, 29442, 37032, 16595, 13767]}


In [21]:
# Print the "identifiers" entry of the stats built from `single_partials`.
single_identifier_stats = get_stats(single_partials)["identifiers"]
print(single_identifier_stats)

{'languages': ['en', 'es', 'it', 'fr', 'de', 'pt', 'nl', 'cs', 'sv', 'ca'], 'freq': [35246633, 175845, 159276, 152777, 92865, 84063, 51278, 36386, 34444, 33943], 'max_size': [493, 123, 188, 74, 134, 68, 83, 55, 80, 34], 'file_count': [3057990, 53426, 54901, 70130, 26180, 33051, 19350, 20391, 21379, 17211]}


In [22]:
# Print the "strings" entry of the stats built from `single_partials`.
single_string_stats = get_stats(single_partials)["strings"]
print(single_string_stats)

{'languages': ['en', 'zh', 'de', 'da', 'fr', 'pt', 'it', 'es', 'la', 'nl'], 'freq': [17711673, 248507, 214917, 202414, 177730, 131664, 130833, 125844, 116656, 99142], 'max_size': [65534, 5036, 2792, 635, 38349, 19330, 1085, 2951, 2229, 471], 'file_count': [1681189, 45881, 47566, 78073, 59504, 43735, 31577, 32413, 47541, 41139]}


In [23]:
# Print the complete stats dict built from `single_partials`.
single_all_stats = get_stats(single_partials)
print(single_all_stats)

{'identifiers': {'languages': ['en', 'es', 'it', 'fr', 'de', 'pt', 'nl', 'cs', 'sv', 'ca'], 'freq': [35246633, 175845, 159276, 152777, 92865, 84063, 51278, 36386, 34444, 33943], 'max_size': [493, 123, 188, 74, 134, 68, 83, 55, 80, 34], 'file_count': [3057990, 53426, 54901, 70130, 26180, 33051, 19350, 20391, 21379, 17211]}, 'comments': {'languages': ['en', 'zh', 'gl', 'fr', 'de', 'es', 'ja', 'pt', 'it', 'ru'], 'freq': [21769559, 702291, 258982, 141094, 139693, 135518, 115763, 100257, 94376, 79754], 'max_size': [647658, 9833, 610, 5450, 28553, 6934, 92094, 5278, 7022, 29439], 'file_count': [2432676, 104145, 36333, 28398, 25640, 25675, 29442, 37032, 16595, 13767]}, 'strings': {'languages': ['en', 'zh', 'de', 'da', 'fr', 'pt', 'it', 'es', 'la', 'nl'], 'freq': [17711673, 248507, 214917, 202414, 177730, 131664, 130833, 125844, 116656, 99142], 'max_size': [65534, 5036, 2792, 635, 38349, 19330, 1085, 2951, 2229, 471], 'file_count': [1681189, 45881, 47566, 78073, 59504, 43735, 31577, 32413, 475

In [24]:
# Same map pattern again, this time with compute_single_language_in_files:
# 16 worker processes, each worker's whole shard passed as one batch
# (batch_size=None), and all input columns dropped so only the function's
# output is kept in memory.
file_language_ds = test_ds.map(
    compute_single_language_in_files,
    batched=True,
    batch_size=None,
    num_proc=16,
    remove_columns=test_ds.column_names,
)
# Pull the small per-shard results into the notebook process as a list of dicts.
file_partials = file_language_ds.to_list()

In [25]:
# Print the 15 most common entries of the result of get_file_stats; the
# recorded output is a list of (language code, count) pairs.
file_language_counts = get_file_stats(file_partials)
print(file_language_counts.most_common(15))


[('en', 1907629), ('zh', 2000), ('es', 1355), ('pt', 1005), ('fr', 771), ('it', 454), ('de', 433), ('cs', 189), ('ja', 183), ('pl', 177), ('ru', 173), ('da', 170), ('ko', 150), ('nl', 143), ('tr', 114)]


In [26]:
from sampling.dataset_sampling import keep_files_with_comments, keep_single_language_dataset, is_language_present

In [28]:
# Subset selected with non_english=True (presumably files whose text is in a
# single non-English language - confirm in sampling.dataset_sampling).
ds_in_non_english = keep_single_language_dataset(
    test_ds, num_proc=16, non_english=True
)

In [29]:
# Counterpart subset selected with non_english=False (presumably the
# English-only files - confirm in sampling.dataset_sampling).
ds_in_english = keep_single_language_dataset(
    test_ds, num_proc=16, non_english=False
)

In [30]:
# Narrow the non-English subset with keep_files_with_comments (presumably to
# files that contain comments - confirm in sampling.dataset_sampling).
ds_in_non_english_with_comments = keep_files_with_comments(ds_in_non_english)

In [33]:
# Narrow the English subset with keep_files_with_comments (presumably to
# files that contain comments - confirm in sampling.dataset_sampling).
ds_in_english_with_comments = keep_files_with_comments(ds_in_english)

In [35]:
import random

In [36]:
# Draw 1000 distinct row indices from the non-English subset and save the
# selected rows. The module-level generator is seeded so the draw is repeatable.
random.seed(45)
number_of_samples_non_english = 1000

non_english_population = range(len(ds_in_non_english_with_comments))
indices_of_non_english = random.sample(
    non_english_population, k=number_of_samples_non_english
)

samples_non_english = ds_in_non_english_with_comments.select(indices_of_non_english)
save_path = external_root / f"heap_{language}_sampled_non_english"
samples_non_english.save_to_disk(save_path)


In [43]:
# Draw 2000 distinct row indices from the English subset and save the
# selected rows.
# The generator is seeded explicitly here, as in the other sampling cells, so
# the sample no longer depends on which cells ran before this one. Seed 45
# matches the non-English sampling cell.
# NOTE(review): a sample saved by an earlier, unseeded run will generally
# differ from the one this cell now produces.
number_of_samples_english = 2000
random.seed(45)
indices_of_english = random.sample(
    range(len(ds_in_english_with_comments)),
    number_of_samples_english
)
samples_english = ds_in_english_with_comments.select(indices_of_english)
save_path = external_root / f"heap_{language}_sampled_english"
samples_english.save_to_disk(save_path)

In [62]:
# Second, independent draw of 1000 row indices from the non-English subset,
# saved under the "_FIM" name. Re-seeding with 112 makes this draw repeatable
# regardless of earlier sampling cells.
random.seed(112)
number_of_samples_non_english_fim = 1000

non_english_fim_population = range(len(ds_in_non_english_with_comments))
indices_of_non_english_fim = random.sample(
    non_english_fim_population, k=number_of_samples_non_english_fim
)

samples_non_english_fim = ds_in_non_english_with_comments.select(indices_of_non_english_fim)
save_path = external_root / f"heap_{language}_sampled_non_english_FIM"
samples_non_english_fim.save_to_disk(save_path)

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [63]:
# Draw of 1000 row indices from the English subset, saved under the "_FIM"
# name. Re-seeding with 112 makes this draw repeatable regardless of earlier
# sampling cells.
random.seed(112)
number_of_samples_english_fim = 1000

english_fim_population = range(len(ds_in_english_with_comments))
indices_of_english_fim = random.sample(
    english_fim_population, k=number_of_samples_english_fim
)

samples_english_fim = ds_in_english_with_comments.select(indices_of_english_fim)
save_path = external_root / f"heap_{language}_sampled_english_FIM"
samples_english_fim.save_to_disk(save_path)

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]