In [1]:
import os
import sys
import json
import random
import pathlib
from collections import Counter

sys.path.append("/home/arnaik/OracleProject")
random.seed(42) # seed for deterministic behavior

from src.datautils import MetaLinterDataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!nvidia-smi

Tue Mar 11 18:09:07 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          On  |   00000000:41:00.0 Off |                    0 |
| N/A   40C    P0             55W /  270W |       1MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA A100 80GB PCIe          On  |   00

In [3]:
cd "/home/arnaik/OracleProject"

/home/arnaik/OracleProject


In [4]:
# Build the project's MetaLinterDataset for the "ruff" linter from ./data/ruff_results/.
# The path is relative, so this depends on the working directory set by the earlier `cd` cell.
# NOTE(review): judging by the progress bars below, construction appears to take several minutes — confirm.
dataset = MetaLinterDataset("ruff", "./data/ruff_results/")

112it [00:39,  2.86it/s]
5000it [00:01, 3773.88it/s]    | 0/129 [00:00<?, ?it/s]
100%|██████████| 5000/5000 [00:01<00:00, 3729.25it/s]
2it [00:00, 2130.71it/s]       | 1/129 [00:02<05:54,  2.77s/it]
100%|██████████| 2/2 [00:00<00:00, 11650.84it/s]
5000it [00:01, 2796.81it/s]
100%|██████████| 5000/5000 [00:01<00:00, 2506.22it/s]
5000it [00:00, 5719.63it/s]    | 3/129 [00:06<04:33,  2.17s/it]
100%|██████████| 5000/5000 [00:02<00:00, 1957.99it/s]
5000it [00:00, 5483.99it/s]    | 4/129 [00:10<05:31,  2.65s/it]
100%|██████████| 5000/5000 [00:01<00:00, 2741.22it/s]
5000it [00:02, 2027.06it/s]    | 5/129 [00:13<05:38,  2.73s/it]
100%|██████████| 5000/5000 [00:00<00:00, 8887.94it/s]
5000it [00:02, 1798.49it/s]    | 6/129 [00:16<05:53,  2.87s/it]
100%|██████████| 5000/5000 [00:00<00:00, 8535.66it/s]
5000it [00:03, 1619.89it/s]    | 7/129 [00:19<06:15,  3.08s/it]
100%|██████████| 5000/5000 [00:00<00:00, 8306.57it/s]
5000it [00:03, 1572.44it/s]    | 8/129 [00:23<06:41,  3.32s/it]
100%|██████████|

In [1]:
def balance_neutral_and_flagged_files(
        data: list,
        neutral_file_to_flagged_file_ratio: float=1.0,
    ):
    """Down-sample neutral records relative to flagged ones and shuffle the result.

    A record is "neutral" when its response (``rec['messages'][1]['content']``),
    stripped of surrounding whitespace, equals "NO VIOLATIONS FOUND"; every
    other record is "flagged".

    Args:
        data: records shaped like ``{'messages': [_, {'content': str}], ...}``.
        neutral_file_to_flagged_file_ratio: target number of neutral records
            per flagged record. The neutral count never exceeds what is available.

    Returns:
        A new, randomly ordered list with all flagged records plus the sampled
        neutral ones. Uses the global ``random`` state.
    """
    # Single pass: bucket records by whether the response reports no violations.
    buckets = {True: [], False: []}
    for record in data:
        reply = record['messages'][1]['content']
        buckets[reply.strip() == "NO VIOLATIONS FOUND"].append(record)
    clean_records, violating_records = buckets[True], buckets[False]

    # Cap the neutral sample by both the requested ratio and the available pool.
    neutral_budget = int(neutral_file_to_flagged_file_ratio*len(violating_records))
    kept_clean = random.sample(clean_records, k=min(neutral_budget, len(clean_records)))

    # Sampling the full length is a shuffle that returns a new list.
    combined = kept_clean + violating_records
    return random.sample(combined, k=len(combined))
    

In [8]:
# Idiom mixes for the training pool. Each inner list is one mix of five rule codes;
# a later cell passes each mix to dataset.generate_data_mix().
train_idiom_mix = [
    ["F405", "F501", "F502", "F601", "F621"],
    ["E402", "E701", "E721", "E741", "E743"],
    ["N801", "N802", "N803", "N804", "N805"],
    ["N806", "N807", "N811", "N812", "N813"],
    ["UP001", "UP003", "UP004", "UP005", "UP006"],
    ["UP007", "UP008", "UP009", "UP010", "UP011"],
    ["UP044", "UP045", "UP046", "UP047", "UP040"],
    ["ERA001", "C901", "I001", "I002", "BLE001"],
    ["B002", "B003", "B004", "B005", "B006"],
    ["B007", "B008", "B009", "B010", "B012"],
]
# Idiom mixes for the test pool, in the same format.
# NOTE: the ERA001/C901/I001/I002/BLE001 mix appears in BOTH lists; every other mix is unique to its list.
test_idiom_mix = [
    ["F406", "F403", "F503", "F602", "F622"],
    ["E401", "E702", "E722", "E731", "E742"],
    ["ERA001", "C901", "I001", "I002", "BLE001"],
    ["ANN001", "ANN002", "ANN003", "ANN201", "ANN202"],
    ["ASYNC100", "ASYNC105", "ASYNC109", "ASYNC110", "ASYNC115"],
    ["ASYNC116", "ASYNC210", "ASYNC220", "ASYNC221", "ASYNC222"],
    ["ASYNC230", "ASYNC251", "ANN204", "ANN205", "ANN206"],
    ["S102", "S103", "S104", "S105", "S106"],
    ["S107", "S108", "S110", "S112", "S113"],
    ["S201", "S202", "S301", "S302", "S303"],
]

In [9]:
random.seed(42)
all_train_data, all_test_data = [], []

# Generate every mix for the train pool first, then for the test pool, printing the
# number of records with at least one violation next to each mix.
for mixes, destination in ((train_idiom_mix, all_train_data), (test_idiom_mix, all_test_data)):
    for idiom_mix in mixes:
        mix_data = dataset.generate_data_mix(idiom_mix, max_code_lines=200)
        flagged_count = sum(1 for rec in mix_data if rec['messages'][1]['content'] != 'NO VIOLATIONS FOUND')
        print(idiom_mix, flagged_count)
        destination.extend(mix_data)

# Total pool sizes: train first, then test.
for collected in (all_train_data, all_test_data):
    print(len(collected))

['F405', 'F501', 'F502', 'F601', 'F621'] 21155
['E402', 'E701', 'E721', 'E741', 'E743'] 32186
['N801', 'N802', 'N803', 'N804', 'N805'] 68822
['N806', 'N807', 'N811', 'N812', 'N813'] 43329
['UP001', 'UP003', 'UP004', 'UP005', 'UP006'] 26420
['UP007', 'UP008', 'UP009', 'UP010', 'UP011'] 82555
['UP044', 'UP045', 'UP046', 'UP047', 'UP040'] 14
['ERA001', 'C901', 'I001', 'I002', 'BLE001'] 296494
['B002', 'B003', 'B004', 'B005', 'B006'] 3377
['B007', 'B008', 'B009', 'B010', 'B012'] 21277
['F406', 'F403', 'F503', 'F602', 'F622'] 24595


: 

In [6]:
!ls

README.md	    experiments			 plots	    vllm_env.yaml
access_tokens.json  filter_codereviewer_data.py  ruff.toml
alignment-handbook  handbook.yml		 scripts
data		    peft_requirements.txt	 src


In [None]:
# mix_data = dataset.generate_data_mix(['ERA001'])
# print(len(mix_data))
# mix_data[2]['messages'][1]['content']
# print(len([rec for rec in mix_data if rec['messages'][1]['content'] != 'NO VIOLATIONS FOUND']))

In [9]:
random.seed(42)
from collections import defaultdict

def balance_neutral_and_flagged_files(
        data: list,
        neutral_file_to_flagged_file_ratio: float=1.0,
    ):
    """Keep all flagged records, sample neutral ones to the given ratio, and shuffle.

    Neutral records are those whose response (``rec['messages'][1]['content']``)
    is "NO VIOLATIONS FOUND" after stripping whitespace. At most
    ``int(ratio * number_of_flagged)`` neutral records are kept, and never more
    than exist.

    Args:
        data: list of chat-style records.
        neutral_file_to_flagged_file_ratio: neutral-to-flagged ratio to aim for.

    Returns:
        A new shuffled list; the global ``random`` state drives both the
        sampling and the shuffle.
    """
    without_violations, with_violations = [], []
    for entry in data:
        answer = entry['messages'][1]['content']
        target = without_violations if answer.strip() == "NO VIOLATIONS FOUND" else with_violations
        target.append(entry)

    # Number of neutral records to keep: ratio-based quota, bounded by availability.
    quota = int(neutral_file_to_flagged_file_ratio*len(with_violations))
    quota = min(quota, len(without_violations))
    sampled_neutral = random.sample(without_violations, k=quota)

    merged = sampled_neutral + with_violations
    # random.sample over the full length returns a shuffled copy.
    return random.sample(merged, k=len(merged))

def impose_idiom_mix_ceilings(data, ceiling: int=5000):
    """Cap the number of records per (source, violation/no-violation) category.

    Records are grouped by ``rec['source']`` and by whether the response
    (``rec['messages'][1]['content']``) differs from "NO VIOLATIONS FOUND".
    Up to `ceiling` records are drawn at random from each group.

    Args:
        data: iterable of records with 'source' and 'messages' keys.
        ceiling: maximum number of records kept per category.

    Returns:
        A list of the sampled records, grouped by category in first-seen order.
        Uses the global ``random`` state.
    """
    buckets = {}
    for record in data:
        has_violation = record['messages'][1]['content'] != "NO VIOLATIONS FOUND"
        bucket_key = record['source'] + "-" + ("yes" if has_violation else "no")
        buckets.setdefault(bucket_key, []).append(record)

    capped = []
    for members in buckets.values():
        # Small categories are kept whole (in random order).
        capped += random.sample(members, k=min(len(members), ceiling))
    return capped

def split_train_and_test_data(train_data, test_data):
    """Build capped train/test sets from two record pools that may share record ids.

    Ids found only in `train_data` feed the train set (ceiling 5000 per category)
    and ids found only in `test_data` feed the test set (ceiling 500 per category).
    Ids present in both pools are split randomly in half: one half goes to train
    and the other to test, each with the same respective ceiling.

    Ids are tracked in insertion-ordered dicts and lists rather than sets.
    Set iteration order for string ids changes between interpreter runs because
    of hash randomization, which made the sampled output differ from run to run
    even with ``random.seed`` fixed.

    Args:
        train_data: records with at least 'id', 'source' and 'messages' keys.
        test_data: records in the same format.

    Returns:
        (train_records, test_records). Also prints, in order, the sizes of the
        train-from-common, test-from-common, train-only and test-only subsets.
    """
    # Dicts act as ordered sets: first-seen order is kept and membership tests stay O(1).
    train_ids = {}
    test_ids = {}
    id_to_data = {}

    for rec in train_data:
        train_ids[rec['id']] = None
        id_to_data[rec['id']] = rec
    for rec in test_data:
        test_ids[rec['id']] = None
        # For an id present in both pools, the test_data record replaces the train_data one.
        id_to_data[rec['id']] = rec

    common_ids = [ID for ID in train_ids if ID in test_ids]
    train_only_ids = [ID for ID in train_ids if ID not in test_ids]
    test_only_ids = [ID for ID in test_ids if ID not in train_ids]

    train_only_data = impose_idiom_mix_ceilings([id_to_data[ID] for ID in train_only_ids], ceiling=5000)
    test_only_data = impose_idiom_mix_ceilings([id_to_data[ID] for ID in test_only_ids], ceiling=500)

    # Randomly assign half of the shared ids to train; the rest go to test.
    common_data_split_1_IDs = set(random.sample(common_ids, k=len(common_ids)//2))
    common_data_split_1 = [id_to_data[ID] for ID in common_ids if ID in common_data_split_1_IDs]
    common_data_split_2 = [id_to_data[ID] for ID in common_ids if ID not in common_data_split_1_IDs]

    train_from_common_data = impose_idiom_mix_ceilings(common_data_split_1, ceiling=5000)
    test_from_common_data = impose_idiom_mix_ceilings(common_data_split_2, ceiling=500)

    print(len(train_from_common_data))
    print(len(test_from_common_data))
    print(len(train_only_data))
    print(len(test_only_data))

    return train_only_data+train_from_common_data, test_only_data+test_from_common_data

# Build the filtered train/test sets from the pools generated above. The four numbers
# printed by the call are the sizes of the train-from-common, test-from-common,
# train-only and test-only subsets, in that order.
filt_train_data, filt_test_data = split_train_and_test_data(all_train_data, all_test_data)

10000
1000
81074
8048


In [13]:
def shuffle_data(data: list[dict]):
    """Return a new list with every element of `data` in random order.

    The input list is not modified; the permutation comes from the global
    ``random`` state.
    """
    item_count = len(data)
    shuffled_copy = random.sample(data, k=item_count)
    return shuffled_copy

In [19]:
# Spot-check: print the response (messages[1]) of one randomly chosen training record.
# NOTE(review): this shuffles a full copy of the list just to pick one record, and it
# advances the global RNG state, so re-running this cell changes later random draws.
# print(filt_train_data[0]['messages'][0]['content'])
print(shuffle_data(filt_train_data)[0]['messages'][1]['content'])

{"code": "UP009", "code_spans_and_lines": [{"line": "# -*- coding: utf-8 -*-", "span": "# -*- coding: utf-8 -*-"}], "fix": {"edits": [{"content": "", "code_spans_and_lines": [{"line": "# -*- coding: utf-8 -*-", "span": ""}, {"line": "\"\"\"DiabtetesPredictions.ipynb", "span": ""}]}]}}


In [6]:
# with open("./data/ruff_meta_linting/train_v3.json", "w") as f:
#     print(f"train data len: {len(filt_train_data)}")
#     # print(dict(Counter([rec['source'] for rec in filt_train_data if rec['messages'][1]['content'] != 'NO VIOLATIONS FOUND']).most_common()))
#     # print(dict(Counter([rec['source'] for rec in filt_train_data if rec['messages'][1]['content'] == 'NO VIOLATIONS FOUND']).most_common()))
#     print(dict(Counter([rec['source'] for rec in filt_train_data]).most_common()))
#     json.dump(shuffle_data(filt_train_data), f, indent=4)
# with open("./data/ruff_meta_linting/test_v3.json", "w") as f:
#     print(f"test data len: {len(filt_test_data)}")
#     # print(dict(Counter([rec['source'] for rec in filt_test_data if rec['messages'][1]['content'] != 'NO VIOLATIONS FOUND']).most_common()))
#     # print(dict(Counter([rec['source'] for rec in filt_test_data if rec['messages'][1]['content'] == 'NO VIOLATIONS FOUND']).most_common()))
#     print(dict(Counter([rec['source'] for rec in filt_test_data]).most_common()))
#     json.dump(shuffle_data(filt_test_data), f, indent=4)

## Idiom Learnability/Hardness Experiment Data Creation
This dataset uses the same idioms in the train and test splits (and the same idiom mixes as the original test set) to evaluate three things:
1. Is the task learnable/doable?
2. Is the training framework working properly?
3. Are some idioms harder to learn than others, and if so, which ones? (Also, is hardness related to training frequency?)

In [8]:
# Idiom mixes used for BOTH the train and the test split of the hardness experiment.
idiom_mixes = [
    ["F406", "F403", "F503", "F602", "F622"],
    ["E401", "E702", "E722", "E731", "E742"],
    ["ERA001", "C901", "I001", "I002", "BLE001"],
    ["ANN001", "ANN002", "ANN003", "ANN201", "ANN202"],
    ["ASYNC100", "ASYNC105", "ASYNC109", "ASYNC110", "ASYNC115"],
    ["ASYNC116", "ASYNC210", "ASYNC220", "ASYNC221", "ASYNC222"],
    ["ASYNC230", "ASYNC251", "ANN204", "ANN205", "ANN206"],
    ["S102", "S103", "S104", "S105", "S106"],
    ["S107", "S108", "S110", "S112", "S113"],
    ["S201", "S202", "S301", "S302", "S303"],
]
random.seed(42)
all_data = []

# Generate each mix, print how many of its records contain at least one violation,
# and pool everything into all_data.
for idiom_mix in idiom_mixes:
    mix_data = dataset.generate_data_mix(idiom_mix, max_code_lines=200)
    flagged_count = sum(1 for rec in mix_data if rec['messages'][1]['content'] != 'NO VIOLATIONS FOUND')
    print(idiom_mix, flagged_count)
    all_data += mix_data

print(len(all_data))

['F406', 'F403', 'F503', 'F602', 'F622'] 24595
['E401', 'E702', 'E722', 'E731', 'E742'] 23983
['ERA001', 'C901', 'I001', 'I002', 'BLE001'] 296494
['ANN001', 'ANN002', 'ANN003', 'ANN201', 'ANN202'] 43702


['ASYNC100', 'ASYNC105', 'ASYNC109', 'ASYNC110', 'ASYNC115'] 46
['ASYNC116', 'ASYNC210', 'ASYNC220', 'ASYNC221', 'ASYNC222'] 89
['ASYNC230', 'ASYNC251', 'ANN204', 'ANN205', 'ANN206'] 77455
['S102', 'S103', 'S104', 'S105', 'S106'] 14756
['S107', 'S108', 'S110', 'S112', 'S113'] 12301
['S201', 'S202', 'S301', 'S302', 'S303'] 5357
4727890


In [5]:
random.seed(42)
from collections import defaultdict

def create_random_split_of_given_size(data, split_1_size: int):
    """Randomly partition `data` into two lists, the first holding `split_1_size` items.

    Both returned lists keep the relative order the items had in `data`.

    Args:
        data: indexable sequence with a length.
        split_1_size: number of items for the first split. ``random.sample``
            raises ValueError if it is negative or larger than ``len(data)``.

    Returns:
        (split_1, split_2): together they contain every item of `data` exactly once.
    """
    chosen_indices = set(random.sample(range(len(data)), k=split_1_size))
    split_1, split_2 = [], []
    for index in range(len(data)):
        (split_1 if index in chosen_indices else split_2).append(data[index])
    return split_1, split_2

def impose_idiom_mix_ceilings_and_split_data(data, train_ceiling: int=5000, test_ceiling: int=500):
    """Cap each (source, violation/no-violation) category and split it into train/test.

    Records are grouped by ``rec['source']`` and by whether the response
    (``rec['messages'][1]['content']``) differs from "NO VIOLATIONS FOUND".
    From every category at most ``train_ceiling + test_ceiling`` records are
    sampled. The sample is also capped by the number of violation records of
    the same source, so neutral records never outnumber flagged ones, and by
    the category's own size. The sample is then split so that roughly
    ``train_ceiling / (train_ceiling + test_ceiling)`` of it goes to train.

    Args:
        data: iterable of records with 'source' and 'messages' keys.
        train_ceiling: maximum train records per category.
        test_ceiling: maximum test records per category.

    Returns:
        (train_records, test_records). Uses the global ``random`` state.

    Raises:
        ValueError: if either ceiling is negative.
    """
    if train_ceiling < 0 or test_ceiling < 0:
        raise ValueError("train_ceiling and test_ceiling must be non-negative")
    total_ceiling = train_ceiling + test_ceiling
    if total_ceiling == 0:
        # Nothing may be selected; returning early also avoids dividing by zero below.
        return [], []

    # Tuple keys keep the source name intact. Stripping "-yes"/"-no" from a joined
    # string key would corrupt any source that itself contains those substrings.
    category_to_data = defaultdict(list)
    for rec in data:
        violation_present = rec['messages'][1]['content'] != "NO VIOLATIONS FOUND"
        category_to_data[(rec['source'], violation_present)].append(rec)

    final_train_data = []
    final_test_data = []
    for (source, _violation_present), data_subset in category_to_data.items():
        # Number of records of this source with at least one violation (0 if none).
        violation_count = len(category_to_data.get((source, True), ()))
        # len(data_subset) guards the neutral category: it can be smaller than the
        # violation count, and sampling more than the population raises ValueError.
        sample_size = min(len(data_subset), violation_count, total_ceiling)
        selected_data = random.sample(data_subset, k=sample_size)
        # Proportional train share, capped by train_ceiling.
        train_share = min(train_ceiling, int(train_ceiling*len(selected_data)/total_ceiling))
        data_selected_for_train, data_selected_for_test = create_random_split_of_given_size(
            selected_data, split_1_size=train_share
        )
        final_train_data.extend(data_selected_for_train)
        final_test_data.extend(data_selected_for_test)
    return final_train_data, final_test_data

def impose_idiom_mix_ceilings(data, ceiling: int=5000):
    """Cap each (source, violation/no-violation) category, keeping neutrals balanced.

    Records are grouped by ``rec['source']`` and by whether the response
    (``rec['messages'][1]['content']``) differs from "NO VIOLATIONS FOUND".
    From each category at most `ceiling` records are sampled. The sample is also
    capped by the number of violation records of the same source, so neutral
    records never outnumber flagged ones and a source without violations
    contributes nothing. It is further capped by the category's own size.

    NOTE: this re-defines the earlier `impose_idiom_mix_ceilings` from this
    notebook. The earlier version capped only by `ceiling` and category size.

    Args:
        data: iterable of records with 'source' and 'messages' keys.
        ceiling: maximum number of records kept per category.

    Returns:
        A list of sampled records grouped by category in first-seen order.
        Uses the global ``random`` state.
    """
    # Tuple keys keep the source name intact. Stripping "-yes"/"-no" from a joined
    # string key would corrupt any source that itself contains those substrings.
    category_to_data = defaultdict(list)
    for rec in data:
        violation_present = rec['messages'][1]['content'] != "NO VIOLATIONS FOUND"
        category_to_data[(rec['source'], violation_present)].append(rec)

    final_data = []
    for (source, _violation_present), data_subset in category_to_data.items():
        # Number of records of this source with at least one violation (0 if none).
        violation_count = len(category_to_data.get((source, True), ()))
        # len(data_subset) guards the neutral category: it can be smaller than the
        # violation count, and sampling more than the population raises ValueError.
        sample_size = min(len(data_subset), violation_count, ceiling)
        final_data.extend(random.sample(data_subset, k=sample_size))
    return final_data

# filt_train_data, filt_test_data = impose_idiom_mix_ceilings_and_split_data(all_data)

In [13]:
# Draw the capped train/test samples for the hardness experiment, then for each split
# print its size and per-source counts (flagged, neutral, all) and save it shuffled as JSON.
filt_train_data, filt_test_data = impose_idiom_mix_ceilings_and_split_data(all_data)
with open("./data/ruff_meta_linting/hardness_experiment/train.json", "w") as f:
    print(f"train data len: {len(filt_train_data)}")
    flagged_sources = Counter(rec['source'] for rec in filt_train_data if rec['messages'][1]['content'] != 'NO VIOLATIONS FOUND')
    neutral_sources = Counter(rec['source'] for rec in filt_train_data if rec['messages'][1]['content'] == 'NO VIOLATIONS FOUND')
    all_sources = Counter(rec['source'] for rec in filt_train_data)
    for source_counts in (flagged_sources, neutral_sources, all_sources):
        print(dict(source_counts.most_common()))
    json.dump(shuffle_data(filt_train_data), f, indent=4)
with open("./data/ruff_meta_linting/hardness_experiment/test.json", "w") as f:
    print(f"test data len: {len(filt_test_data)}")
    flagged_sources = Counter(rec['source'] for rec in filt_test_data if rec['messages'][1]['content'] != 'NO VIOLATIONS FOUND')
    neutral_sources = Counter(rec['source'] for rec in filt_test_data if rec['messages'][1]['content'] == 'NO VIOLATIONS FOUND')
    all_sources = Counter(rec['source'] for rec in filt_test_data)
    for source_counts in (flagged_sources, neutral_sources, all_sources):
        print(dict(source_counts.most_common()))
    json.dump(shuffle_data(filt_test_data), f, indent=4)

train data len: 79982
{'rull_linter/F406-F403-F503-F602-F622': 5000, 'rull_linter/E401-E702-E722-E731-E742': 5000, 'rull_linter/ERA001-C901-I001-I002-BLE001': 5000, 'rull_linter/ANN001-ANN002-ANN003-ANN201-ANN202': 5000, 'rull_linter/ASYNC230-ASYNC251-ANN204-ANN205-ANN206': 5000, 'rull_linter/S102-S103-S104-S105-S106': 5000, 'rull_linter/S107-S108-S110-S112-S113': 5000, 'rull_linter/S201-S202-S301-S302-S303': 4870, 'rull_linter/ASYNC116-ASYNC210-ASYNC220-ASYNC221-ASYNC222': 80, 'rull_linter/ASYNC100-ASYNC105-ASYNC109-ASYNC110-ASYNC115': 41}
{'rull_linter/F406-F403-F503-F602-F622': 5000, 'rull_linter/E401-E702-E722-E731-E742': 5000, 'rull_linter/ERA001-C901-I001-I002-BLE001': 5000, 'rull_linter/ANN001-ANN002-ANN003-ANN201-ANN202': 5000, 'rull_linter/ASYNC230-ASYNC251-ANN204-ANN205-ANN206': 5000, 'rull_linter/S102-S103-S104-S105-S106': 5000, 'rull_linter/S107-S108-S110-S112-S113': 5000, 'rull_linter/S201-S202-S301-S302-S303': 4870, 'rull_linter/ASYNC116-ASYNC210-ASYNC220-ASYNC221-ASYNC22

## Idiom Transfer Learning
Dataset with some of the same idioms in both splits, some paired near-transfer idioms, and some unseen test-only idioms/meta-tasks, used to evaluate the tendency of LLMs towards memorization vs. instruction following/reasoning.

In [9]:
# Idiom mixes that appear only in the training split (each inner list is one mix of five rule codes).
train_only_idiom_mix = [
    ["F405", "F501", "F502", "F601", "F621"],
    ["E402", "E701", "E721", "E741", "E743"],
    ["N801", "N802", "N803", "N804", "N805"],
    ["N806", "N807", "N811", "N812", "N813"],
    ["UP001", "UP003", "UP004", "UP005", "UP006"],
    ["UP007", "UP008", "UP009", "UP010", "UP011"],
    ["UP044", "UP045", "UP046", "UP047", "UP040"],
    ["B002", "B003", "B004", "B005", "B006"],
    ["B007", "B008", "B009", "B010", "B012"],
]
# Idiom mixes that appear only in the test split.
test_only_idiom_mix = [
    ["F406", "F403", "F503", "F602", "F622"],
    ["E401", "E702", "E722", "E731", "E742"],
    ["ANN001", "ANN002", "ANN003", "ANN201", "ANN202"],
    ["ASYNC100", "ASYNC105", "ASYNC109", "ASYNC110", "ASYNC115"],
    ["ASYNC116", "ASYNC210", "ASYNC220", "ASYNC221", "ASYNC222"],
    ["ASYNC230", "ASYNC251", "ANN204", "ANN205", "ANN206"],
    ["S102", "S103", "S104", "S105", "S106"],
    ["S107", "S108", "S110", "S112", "S113"],
    ["S201", "S202", "S301", "S302", "S303"],
]

shared_data = [] # the shared mix is the no-transfer setting; this placeholder is overwritten below
random.seed(42)

# The one idiom mix present in both the train and the test split.
shared_idiom_mix = ["ERA001", "C901", "I001", "I002", "BLE001"]
shared_data = dataset.generate_data_mix(shared_idiom_mix, max_code_lines=200)
# Print the mix next to the number of records that contain at least one violation.
print(shared_idiom_mix, len([rec for rec in shared_data if rec['messages'][1]['content'] != 'NO VIOLATIONS FOUND']))
# Cap the shared mix per category and split it into train/test (default ceilings: 5000 / 500).
shared_train_data, shared_test_data = impose_idiom_mix_ceilings_and_split_data(shared_data)
print(len(shared_train_data))
print(len(shared_test_data))

['ERA001', 'C901', 'I001', 'I002', 'BLE001'] 296494
10000
1000


In [7]:
# Drop the reference to the full shared-mix list now that its train/test samples have
# been drawn, presumably so its memory can be reclaimed.
del shared_data

In [10]:
train_only_data, test_only_data = [], []

# For each side of the transfer split: (idiom mixes, per-category ceiling, destination list).
# Each mix is generated, capped right away, reported (count of records with at least one
# violation), appended to its destination, and then released.
for mixes, mix_ceiling, destination in (
    (train_only_idiom_mix, 5000, train_only_data),
    (test_only_idiom_mix, 500, test_only_data),
):
    for idiom_mix in mixes:
        mix_data = impose_idiom_mix_ceilings(
            dataset.generate_data_mix(idiom_mix, max_code_lines=200),
            ceiling=mix_ceiling,
        )
        flagged_count = sum(1 for rec in mix_data if rec['messages'][1]['content'] != 'NO VIOLATIONS FOUND')
        print(idiom_mix, flagged_count)
        destination.extend(mix_data)
        del mix_data

# train_only_data = impose_idiom_mix_ceilings(all_train_data, ceiling=5000)
# del all_train_data
# test_only_data = impose_idiom_mix_ceilings(all_test_data, ceiling=500)
# del all_test_data

for collected in (train_only_data, test_only_data):
    print(len(collected))

['F405', 'F501', 'F502', 'F601', 'F621'] 5000
['E402', 'E701', 'E721', 'E741', 'E743'] 5000
['N801', 'N802', 'N803', 'N804', 'N805'] 5000
['N806', 'N807', 'N811', 'N812', 'N813'] 5000
['UP001', 'UP003', 'UP004', 'UP005', 'UP006'] 5000
['UP007', 'UP008', 'UP009', 'UP010', 'UP011'] 5000
['UP044', 'UP045', 'UP046', 'UP047', 'UP040'] 14
['B002', 'B003', 'B004', 'B005', 'B006'] 3377
['B007', 'B008', 'B009', 'B010', 'B012'] 5000
['F406', 'F403', 'F503', 'F602', 'F622'] 500
['E401', 'E702', 'E722', 'E731', 'E742'] 500
['ANN001', 'ANN002', 'ANN003', 'ANN201', 'ANN202'] 500
['ASYNC100', 'ASYNC105', 'ASYNC109', 'ASYNC110', 'ASYNC115'] 46
['ASYNC116', 'ASYNC210', 'ASYNC220', 'ASYNC221', 'ASYNC222'] 89
['ASYNC230', 'ASYNC251', 'ANN204', 'ANN205', 'ANN206'] 500
['S102', 'S103', 'S104', 'S105', 'S106'] 500
['S107', 'S108', 'S110', 'S112', 'S113'] 500
['S201', 'S202', 'S301', 'S302', 'S303'] 500
76782
7270


In [14]:
# Final transfer-learning splits: the train-only / test-only mixes plus the matching
# half of the shared (no-transfer) mix.
filt_train_data = train_only_data + shared_train_data
filt_test_data = test_only_data + shared_test_data

# # shuffle data
# filt_train_data = random.sample(filt_train_data, k=len(filt_train_data))
# filt_test_data = random.sample(filt_test_data, k=len(filt_test_data))

In [15]:
# For each split: print its size and per-source counts (flagged, neutral, all),
# then save it shuffled as JSON.
with open("./data/ruff_meta_linting/train_v4.json", "w") as f:
    print(f"train data len: {len(filt_train_data)}")
    flagged_sources = Counter(rec['source'] for rec in filt_train_data if rec['messages'][1]['content'] != 'NO VIOLATIONS FOUND')
    neutral_sources = Counter(rec['source'] for rec in filt_train_data if rec['messages'][1]['content'] == 'NO VIOLATIONS FOUND')
    all_sources = Counter(rec['source'] for rec in filt_train_data)
    for source_counts in (flagged_sources, neutral_sources, all_sources):
        print(dict(source_counts.most_common()))
    json.dump(shuffle_data(filt_train_data), f, indent=4)
with open("./data/ruff_meta_linting/test_v4.json", "w") as f:
    print(f"test data len: {len(filt_test_data)}")
    flagged_sources = Counter(rec['source'] for rec in filt_test_data if rec['messages'][1]['content'] != 'NO VIOLATIONS FOUND')
    neutral_sources = Counter(rec['source'] for rec in filt_test_data if rec['messages'][1]['content'] == 'NO VIOLATIONS FOUND')
    all_sources = Counter(rec['source'] for rec in filt_test_data)
    for source_counts in (flagged_sources, neutral_sources, all_sources):
        print(dict(source_counts.most_common()))
    json.dump(shuffle_data(filt_test_data), f, indent=4)

train data len: 86782
{'rull_linter/F405-F501-F502-F601-F621': 5000, 'rull_linter/E402-E701-E721-E741-E743': 5000, 'rull_linter/N801-N802-N803-N804-N805': 5000, 'rull_linter/N806-N807-N811-N812-N813': 5000, 'rull_linter/UP001-UP003-UP004-UP005-UP006': 5000, 'rull_linter/UP007-UP008-UP009-UP010-UP011': 5000, 'rull_linter/B007-B008-B009-B010-B012': 5000, 'rull_linter/ERA001-C901-I001-I002-BLE001': 5000, 'rull_linter/B002-B003-B004-B005-B006': 3377, 'rull_linter/UP044-UP045-UP046-UP047-UP040': 14}
{'rull_linter/F405-F501-F502-F601-F621': 5000, 'rull_linter/E402-E701-E721-E741-E743': 5000, 'rull_linter/N801-N802-N803-N804-N805': 5000, 'rull_linter/N806-N807-N811-N812-N813': 5000, 'rull_linter/UP001-UP003-UP004-UP005-UP006': 5000, 'rull_linter/UP007-UP008-UP009-UP010-UP011': 5000, 'rull_linter/B007-B008-B009-B010-B012': 5000, 'rull_linter/ERA001-C901-I001-I002-BLE001': 5000, 'rull_linter/B002-B003-B004-B005-B006': 3377, 'rull_linter/UP044-UP045-UP046-UP047-UP040': 14}
{'rull_linter/F405-F50