In [1]:
import os
import sys
import json
import random
import pathlib
from collections import Counter

# Put the project root on the import path so the `src` package imported below resolves.
# NOTE(review): hard-coded absolute path — this only works where the project lives at
# exactly this location; consider making it configurable.
sys.path.append("/home/arnaik/OracleProject")
random.seed(42) # seed the global `random` generator for deterministic behavior

from src.datautils import MetaLinterDataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Shell out to nvidia-smi to record the GPU/driver visible to this session.
!nvidia-smi

Sat Feb 15 00:46:43 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          On  |   00000000:C1:00.0 Off |                    0 |
| N/A   37C    P0             56W /  270W |       1MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
# Change the working directory so the relative "./data/..." paths used in later cells resolve.
# NOTE(review): hard-coded absolute path.
cd "/home/arnaik/OracleProject"

/home/arnaik/OracleProject


In [4]:
# Construct the dataset object used by the generate_data_mix calls in later cells.
# presumably "ruff" names the linter and the second argument is the directory holding its
# results — TODO confirm against MetaLinterDataset in src/datautils.
dataset = MetaLinterDataset("ruff", "./data/ruff_results/")

0it [00:00, ?it/s]

112it [01:08,  1.64it/s]
5000it [00:01, 3490.15it/s]    | 0/129 [00:00<?, ?it/s]
100%|██████████| 5000/5000 [00:01<00:00, 3795.74it/s]
2it [00:00, 37.88it/s]         | 1/129 [00:02<06:03,  2.84s/it]
100%|██████████| 2/2 [00:00<00:00, 7738.57it/s]
5000it [00:01, 2673.93it/s]
100%|██████████| 5000/5000 [00:01<00:00, 2519.73it/s]
5000it [00:01, 4532.52it/s]    | 3/129 [00:06<04:40,  2.23s/it]
100%|██████████| 5000/5000 [00:02<00:00, 2019.58it/s]
5000it [00:00, 5229.64it/s]    | 4/129 [00:10<05:42,  2.74s/it]
100%|██████████| 5000/5000 [00:01<00:00, 2868.34it/s]
5000it [00:02, 2062.00it/s]    | 5/129 [00:13<05:43,  2.77s/it]
100%|██████████| 5000/5000 [00:00<00:00, 9705.33it/s] 
5000it [00:02, 1735.82it/s]    | 6/129 [00:16<05:53,  2.87s/it]
100%|██████████| 5000/5000 [00:00<00:00, 9364.23it/s]
5000it [00:03, 1627.11it/s]    | 7/129 [00:20<06:16,  3.09s/it]
100%|██████████| 5000/5000 [00:00<00:00, 9101.93it/s]
5000it [00:03, 1582.12it/s]    | 8/129 [00:23<06:38,  3.30s/it]
100%|██████████|

In [5]:
def balance_neutral_and_flagged_files(
        data: list,
        neutral_file_to_flagged_file_ratio: float=1.0,
    ):
    """Downsample the neutral records relative to the flagged ones and shuffle.

    A record is "neutral" when the content of its second message, after
    stripping whitespace, equals "NO VIOLATIONS FOUND"; every other record is
    "flagged".

    Args:
        data: records shaped like ``{'messages': [<first>, {'content': str}, ...]}``.
        neutral_file_to_flagged_file_ratio: number of neutral records to keep per
            flagged record. The resulting count is truncated to an int and capped
            at the number of neutral records available.

    Returns:
        A new list holding all flagged records plus the sampled neutral records,
        in random order (drawn from the global ``random`` generator).
    """
    # Partition in a single pass; the bucket key is "is this record neutral?".
    buckets = {True: [], False: []}
    for record in data:
        answer = record['messages'][1]['content']
        buckets[answer.strip() == "NO VIOLATIONS FOUND"].append(record)
    clean_records = buckets[True]
    violating_records = buckets[False]

    # Never request more neutral records than exist.
    neutral_budget = int(neutral_file_to_flagged_file_ratio*len(violating_records))
    if neutral_budget > len(clean_records):
        neutral_budget = len(clean_records)
    kept_clean = random.sample(clean_records, k=neutral_budget)

    # Sampling the full length yields a shuffled copy.
    combined = kept_clean + violating_records
    return random.sample(combined, k=len(combined))
    
    

In [6]:
# Each row is one "idiom mix": a group of five rule codes that is handed to
# dataset.generate_data_mix as a unit. Rows are written as space-separated
# strings and split into lists of codes.
# Note: the ERA001/C901/I001/I002/BLE001 mix appears in BOTH lists.
train_idiom_mix = [
    codes.split()
    for codes in (
        "F405 F501 F502 F601 F621",
        "E402 E701 E721 E741 E743",
        "N801 N802 N803 N804 N805",
        "N806 N807 N811 N812 N813",
        "UP001 UP003 UP004 UP005 UP006",
        "UP007 UP008 UP009 UP010 UP011",
        "UP044 UP045 UP046 UP047 UP040",
        "ERA001 C901 I001 I002 BLE001",
        "B002 B003 B004 B005 B006",
        "B007 B008 B009 B010 B012",
    )
]
test_idiom_mix = [
    codes.split()
    for codes in (
        "F406 F403 F503 F602 F622",
        "E401 E702 E722 E731 E742",
        "ERA001 C901 I001 I002 BLE001",
        "ANN001 ANN002 ANN003 ANN201 ANN202",
        "ASYNC100 ASYNC105 ASYNC109 ASYNC110 ASYNC115",
        "ASYNC116 ASYNC210 ASYNC220 ASYNC221 ASYNC222",
        "ASYNC230 ASYNC251 ANN204 ANN205 ANN206",
        "S102 S103 S104 S105 S106",
        "S107 S108 S110 S112 S113",
        "S201 S202 S301 S302 S303",
    )
]

In [7]:
# Generate the records for every idiom mix and pool them per split.
all_train_data = []
all_test_data = []
random.seed(42)

# Same procedure for both splits (train mixes first, then test mixes): generate
# one mix, report how many of its records are flagged, and append to the pool.
for idiom_mixes, pooled_records in (
    (train_idiom_mix, all_train_data),
    (test_idiom_mix, all_test_data),
):
    for idiom_mix in idiom_mixes:
        mix_data = dataset.generate_data_mix(idiom_mix, max_code_lines=200)
        flagged_count = sum(
            rec['messages'][1]['content'] != 'NO VIOLATIONS FOUND' for rec in mix_data
        )
        print(idiom_mix, flagged_count)
        pooled_records.extend(mix_data)

print(len(all_train_data))
print(len(all_test_data))

['F405', 'F501', 'F502', 'F601', 'F621'] 6664
['E402', 'E701', 'E721', 'E741', 'E743'] 10151
['N801', 'N802', 'N803', 'N804', 'N805'] 21798
['N806', 'N807', 'N811', 'N812', 'N813'] 13654
['UP001', 'UP003', 'UP004', 'UP005', 'UP006'] 8280
['UP007', 'UP008', 'UP009', 'UP010', 'UP011'] 25805
['UP044', 'UP045', 'UP046', 'UP047', 'UP040'] 4
['ERA001', 'C901', 'I001', 'I002', 'BLE001'] 93209
['B002', 'B003', 'B004', 'B005', 'B006'] 1070
['B007', 'B008', 'B009', 'B010', 'B012'] 6773
['F406', 'F403', 'F503', 'F602', 'F622'] 7743
['E401', 'E702', 'E722', 'E731', 'E742'] 7544
['ERA001', 'C901', 'I001', 'I002', 'BLE001'] 93209
['ANN001', 'ANN002', 'ANN003', 'ANN201', 'ANN202'] 13753
['ASYNC100', 'ASYNC105', 'ASYNC109', 'ASYNC110', 'ASYNC115'] 14
['ASYNC116', 'ASYNC210', 'ASYNC220', 'ASYNC221', 'ASYNC222'] 34
['ASYNC230', 'ASYNC251', 'ANN204', 'ANN205', 'ANN206'] 24476
['S102', 'S103', 'S104', 'S105', 'S106'] 4598
['S107', 'S108', 'S110', 'S112', 'S113'] 3952
['S201', 'S202', 'S301', 'S302', 'S303

In [8]:
# List the current working directory.
# NOTE(review): looks like a leftover exploratory cell; nothing later reads its output.
!ls

README.md	    experiments			 plots	    vllm_env.yaml
access_tokens.json  filter_codereviewer_data.py  ruff.toml
alignment-handbook  handbook.yml		 scripts
data		    peft_requirements.txt	 src


In [None]:
# mix_data = dataset.generate_data_mix(['ERA001'])
# print(len(mix_data))
# mix_data[2]['messages'][1]['content']
# print(len([rec for rec in mix_data if rec['messages'][1]['content'] != 'NO VIOLATIONS FOUND']))

In [9]:
# Re-seed the global RNG so the sampling done by the call at the end of this cell starts from a fixed state.
random.seed(42)
# NOTE(review): import placed mid-notebook; defaultdict is needed by impose_idiom_mix_ceilings below.
from collections import defaultdict

# NOTE: this redefines the same-named function from an earlier cell; whichever
# definition ran last is the one in effect.
def balance_neutral_and_flagged_files(
        data: list,
        neutral_file_to_flagged_file_ratio: float=1.0,
    ):
    """Keep all flagged records, subsample the neutral ones, and shuffle the result.

    Neutral means the second message's content, stripped of surrounding
    whitespace, is exactly "NO VIOLATIONS FOUND".

    Args:
        data: records exposing ``rec['messages'][1]['content']`` as a string.
        neutral_file_to_flagged_file_ratio: neutral-to-flagged ratio to aim for;
            the neutral count is ``int(ratio * num_flagged)`` capped at the number
            of neutral records present.

    Returns:
        A freshly built, randomly ordered list (global ``random`` generator).
    """
    def _is_neutral(record):
        return record['messages'][1]['content'].strip() == "NO VIOLATIONS FOUND"

    # Two order-preserving passes give the same partitions as a single loop.
    clean_records = [record for record in data if _is_neutral(record)]
    violating_records = [record for record in data if not _is_neutral(record)]

    wanted = int(neutral_file_to_flagged_file_ratio*len(violating_records))
    available = len(clean_records)
    sampled_clean = random.sample(clean_records, k=wanted if wanted < available else available)

    # A full-length sample is a shuffled copy of the merged list.
    merged = sampled_clean + violating_records
    shuffled = random.sample(merged, k=len(merged))
    return shuffled

def impose_idiom_mix_ceilings(data, ceiling: int=5000):
    """Cap the number of records kept per (source, violation / no-violation) category.

    Records are grouped by their ``'source'`` value combined with whether the
    second message's content is the neutral marker "NO VIOLATIONS FOUND". Each
    group larger than ``ceiling`` is randomly downsampled to ``ceiling``
    records; smaller groups are kept whole (in sampled order).

    Args:
        data: iterable of records with a string ``'source'`` and a string at
            ``rec['messages'][1]['content']``.
        ceiling: maximum number of records to keep per category.

    Returns:
        A new list with the selected records, grouped by category in
        first-seen order. Sampling uses the global ``random`` generator.
    """
    category_to_data = defaultdict(list)
    for rec in data:
        # Strip before comparing, exactly as balance_neutral_and_flagged_files does,
        # so a neutral response with stray whitespace is not counted as a violation.
        is_neutral = rec['messages'][1]['content'].strip() == "NO VIOLATIONS FOUND"
        violation_present = "no" if is_neutral else "yes"
        category_to_data[rec['source']+"-"+violation_present].append(rec)

    final_data = []
    for data_subset in category_to_data.values():
        final_data.extend(random.sample(data_subset, k=min(len(data_subset), ceiling)))
    return final_data

def split_train_and_test_data(train_data, test_data):
    """Build id-disjoint train and test sets from two possibly overlapping pools.

    Ids that occur only in ``train_data`` go to train and ids that occur only
    in ``test_data`` go to test. Ids present in both pools are randomly split
    in half, one half per side. Each of the four groups is then capped per
    category by ``impose_idiom_mix_ceilings`` (5000 for train, 500 for test).

    Ids are tracked in first-seen order rather than in ``set``s. Iterating a
    set of strings gives an order that can differ between interpreter sessions
    (hash randomization), which would feed differently ordered populations to
    ``random.sample`` and make the split non-reproducible despite a fixed seed.

    Args:
        train_data: records with a hashable ``'id'``, plus the fields that
            ``impose_idiom_mix_ceilings`` reads.
        test_data: same record shape as ``train_data``.

    Returns:
        ``(train_records, test_records)`` as two lists. The sizes of the four
        groups are printed in the order train-from-common, test-from-common,
        train-only, test-only.
    """
    # dicts serve as insertion-ordered sets of ids.
    train_ids = {}
    test_ids = {}
    id_to_data = {}

    for rec in train_data:
        train_ids[rec['id']] = None
        id_to_data[rec['id']] = rec
    for rec in test_data:
        test_ids[rec['id']] = None
        # For an id present in both pools the test_data record is the one kept.
        id_to_data[rec['id']] = rec

    common_ids = [ID for ID in train_ids if ID in test_ids]
    train_only_ids = [ID for ID in train_ids if ID not in test_ids]
    test_only_ids = [ID for ID in test_ids if ID not in train_ids]

    train_only_data = impose_idiom_mix_ceilings([id_to_data[ID] for ID in train_only_ids], ceiling=5000)
    test_only_data = impose_idiom_mix_ceilings([id_to_data[ID] for ID in test_only_ids], ceiling=500)

    # Randomly send half of the shared ids to train; the rest go to test.
    common_data_split_1_IDs = set(random.sample(common_ids, k=len(common_ids)//2))
    common_data_split_1 = [id_to_data[ID] for ID in common_ids if ID in common_data_split_1_IDs]
    common_data_split_2 = [id_to_data[ID] for ID in common_ids if ID not in common_data_split_1_IDs]

    train_from_common_data = impose_idiom_mix_ceilings(common_data_split_1, ceiling=5000)
    test_from_common_data = impose_idiom_mix_ceilings(common_data_split_2, ceiling=500)

    print(len(train_from_common_data))
    print(len(test_from_common_data))
    print(len(train_only_data))
    print(len(test_only_data))

    return train_only_data+train_from_common_data, test_only_data+test_from_common_data

# Split the pooled records into id-disjoint, per-category-capped train/test sets.
# The call prints four sizes: train-from-common, test-from-common, train-only, test-only.
filt_train_data, filt_test_data = split_train_and_test_data(all_train_data, all_test_data)

10000
1000
81074
8048


In [10]:
def shuffle_data(data: list[dict]):
    """Return a new list holding the elements of ``data`` in random order.

    The input list is not modified. The order is drawn from the global
    ``random`` generator, so it is reproducible only for a fixed seed and
    call sequence.
    """
    # A sample as long as the population is a random permutation of it.
    permuted = random.sample(data, k=len(data))
    return permuted

In [19]:
# Spot-check: print the second message's content of one randomly drawn training record.
# (To see the first message of a record instead: print(filt_train_data[0]['messages'][0]['content']))
# Note: shuffle_data draws from the global RNG, so running this cell advances its state.
random_record = shuffle_data(filt_train_data)[0]
print(random_record['messages'][1]['content'])

{"code": "UP009", "code_spans_and_lines": [{"line": "# -*- coding: utf-8 -*-", "span": "# -*- coding: utf-8 -*-"}], "fix": {"edits": [{"content": "", "code_spans_and_lines": [{"line": "# -*- coding: utf-8 -*-", "span": ""}, {"line": "\"\"\"DiabtetesPredictions.ipynb", "span": ""}]}]}}


In [20]:
# Write the filtered train/test splits to disk as shuffled, indented JSON.
# Neutral/flagged re-balancing (balance_neutral_and_flagged_files) was disabled in
# the original cell and is not applied here.
output_dir = "./data/ruff_meta_linting"
# Create the target directory up front; open(..., "w") fails if it is missing.
os.makedirs(output_dir, exist_ok=True)

# One loop replaces two copy-pasted blocks; train is written first, then test.
for split_name, split_records in (("train", filt_train_data), ("test", filt_test_data)):
    with open(f"{output_dir}/{split_name}_v3.json", "w") as f:
        print(f"{split_name} data len: {len(split_records)}")
        # Records per source, most common first.
        print(dict(Counter([rec['source'] for rec in split_records]).most_common()))
        json.dump(shuffle_data(split_records), f, indent=4)

train data len: 91074
{'rull_linter/B007-B008-B009-B010-B012': 10000, 'rull_linter/E402-E701-E721-E741-E743': 10000, 'rull_linter/UP007-UP008-UP009-UP010-UP011': 10000, 'rull_linter/UP001-UP003-UP004-UP005-UP006': 10000, 'rull_linter/N806-N807-N811-N812-N813': 10000, 'rull_linter/F405-F501-F502-F601-F621': 10000, 'rull_linter/N801-N802-N803-N804-N805': 10000, 'rull_linter/ERA001-C901-I001-I002-BLE001': 10000, 'rull_linter/B002-B003-B004-B005-B006': 6070, 'rull_linter/UP044-UP045-UP046-UP047-UP040': 5004}
test data len: 9048
{'rull_linter/F406-F403-F503-F602-F622': 1000, 'rull_linter/ASYNC230-ASYNC251-ANN204-ANN205-ANN206': 1000, 'rull_linter/S102-S103-S104-S105-S106': 1000, 'rull_linter/E401-E702-E722-E731-E742': 1000, 'rull_linter/S201-S202-S301-S302-S303': 1000, 'rull_linter/ANN001-ANN002-ANN003-ANN201-ANN202': 1000, 'rull_linter/S107-S108-S110-S112-S113': 1000, 'rull_linter/ERA001-C901-I001-I002-BLE001': 1000, 'rull_linter/ASYNC116-ASYNC210-ASYNC220-ASYNC221-ASYNC222': 534, 'rull_li