In [1]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import torch
from code_function import preprocess_script, make_dataset, reduction_dataset,indentation
from sklearn.model_selection import train_test_split , KFold, StratifiedKFold

In [2]:
from glob import glob
problem_folders = glob("D:/code_preprocessing/clean/*.py")

In [65]:
augment_folders = glob("D:/code_preprocessing/executable/*.py")

In [3]:
def make_df(problem_folders):
    preproc_scripts = []
    problem_nums = []
    for problem_folder in tqdm(problem_folders):
        problem_num = os.path.basename(problem_folder).split(".")[0]
        with open(problem_folder, "rt", encoding='utf-8') as file:
            text = file.read()
            preproc_scripts.append(text)
        problem_nums.append(problem_num)
    df = pd.DataFrame(data = {'code':preproc_scripts, 'problem_script':problem_nums})
    df['problem_num'] = df['problem_script'].apply(lambda x: x.split("_")[0])
    return df

In [4]:
df = make_df(problem_folders)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45101/45101 [03:55<00:00, 191.50it/s]


In [5]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
df['tokens'] = df['code'].apply(tokenizer.tokenize)
df['len'] = df['tokens'].apply(len)
df['problem_num'] = df['problem_script'].apply(lambda x: x.split("_")[0])

Token indices sequence length is longer than the specified maximum sequence length for this model (922 > 512). Running this sequence through the model will result in indexing errors


In [13]:
global train_df
global valid_df

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=33)

for e, (train_index, test_index) in enumerate(skf.split(df, df['problem_num'])):
    if e==0:
        train_df, valid_df = df.iloc[train_index], df.iloc[test_index]

In [82]:
def aug_list(df):
    aug_train = []
    aug_label = []
    for t in tqdm(df.problem_script):
        lf = f"D:/code_preprocessing/executable/{t}.py"
        if os.path.exists(lf):
            with open(lf, "rt", encoding = 'utf-8') as file:
                lines = file.readlines()
                relines = []
                for line in lines:
                    if "sys.stdout = open(" in line:
                        continue
                    if "from unittest.mock import patch" in line:
                        continue
                    if line.startswith("for FFF in"):
                        continue
                    if "with patch('builtins.input') as input_mock:" in line:
                        continue
                    if "input_mock.side_effect = FFF" in line:
                        continue
                    if "print('GOODJOBANDSUCCESS')" in line:
                        continue
                    if "sys.stdout.close()" in line:
                        continue
                    else:
                        relines.append(line)
                relines = indentation(relines)
                aug_train.append(relines)
                aug_label.append(t)
    aug = pd.DataFrame({"code":aug_train, "problem_script":aug_label})
    aug['problem_num'] = aug['problem_script'].apply(lambda x: x.split("_")[0])
    return aug

aug_train = aug_list(train_df)
aug_valid = aug_list(valid_df)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40590/40590 [00:58<00:00, 689.58it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4511/4511 [00:24<00:00, 181.81it/s]


In [84]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
aug_train['tokens'] = aug_train['code'].apply(tokenizer.tokenize)
aug_train['len'] = aug_train['tokens'].apply(len)

aug_valid['tokens'] = aug_valid['code'].apply(tokenizer.tokenize)
aug_valid['len'] = aug_valid['tokens'].apply(len)

Token indices sequence length is longer than the specified maximum sequence length for this model (832 > 512). Running this sequence through the model will result in indexing errors


In [107]:
add_df = train_df[train_df['len']>2048]
train_df = train_df[train_df["len"] <= 2048]
add_df["code"] = add_df['code'].apply(lambda x :" ".join(x.split(" ")[-2048:]))
train_df = pd.concat([train_df, add_df], ignore_index=True)
train_df = train_df.reset_index(drop=True)

add_df = valid_df[valid_df['len']>2048]
valid_df = valid_df[valid_df["len"] <= 2048]
add_df["code"] = add_df['code'].apply(lambda x :" ".join(x.split(" ")[-2048:]))
# valid = pd.concat([valid_df, add_df], ignore_index=True) #코드실수함
valid_df = pd.concat([valid_df, add_df], ignore_index=True)
valid_df = valid_df.reset_index(drop=True)

In [14]:
add_df = train_df[train_df['len']>1026]
train_df = train_df[train_df["len"] <= 1026]
add_df["code"] = add_df['code'].apply(lambda x :" ".join(x.split(" ")[-1026:]))
train_df = pd.concat([train_df, add_df], ignore_index=True)
train_df = train_df.reset_index(drop=True)

add_df = valid_df[valid_df['len']>1026]
valid_df = valid_df[valid_df["len"] <= 1026]
add_df["code"] = add_df['code'].apply(lambda x :" ".join(x.split(" ")[-1026:]))
# valid = pd.concat([valid_df, add_df], ignore_index=True) #코드실수함
valid_df = pd.concat([valid_df, add_df], ignore_index=True)
valid_df = valid_df.reset_index(drop=True)

In [87]:
add_df = aug_train[aug_train['len']>2048]
aug_train = aug_train[aug_train["len"] <= 2048]
aug_train["code"] = aug_train['code'].apply(lambda x :" ".join(x.split(" ")[-2048:]))
aug_train = pd.concat([aug_train, add_df], ignore_index=True)
aug_train = aug_train.reset_index(drop=True)

add_df = aug_valid[aug_valid['len']>2048]
aug_valid = aug_valid[aug_valid["len"] <= 2048]
add_df["code"] = add_df['code'].apply(lambda x :" ".join(x.split(" ")[-2048:]))
aug_valid = pd.concat([aug_valid, add_df], ignore_index=True)
aug_valid = aug_valid.reset_index(drop=True)

In [15]:
#version1
train_code = []
for name, code in zip(train_df["problem_script"], train_df["code"]):
#     if os.path.exists(f"D:/open/executable/{name}.txt"):
#         temp = open(f"D:/open/executable/{name}.txt", "r").read()
    if os.path.exists(f"D:/code_preprocessing/executable/{name}.txt"):
        temp = open(f"D:/code_preprocessing/executable/{name}.txt", "r").read()
        temp = temp.replace('GOODJOBANDSUCCESS', ' ')
        temp = temp.replace("**START**", " ")
        temp = temp[:100]
        code = code +"\n" + temp
        train_code.append(code)
    
    else:
        train_code.append(code)

In [8]:
#version2
train_code = []
for name, code in zip(train_df["problem_script"], train_df["code"]):
#     if os.path.exists(f"D:/open/executable/{name}.txt"):
#         temp = open(f"D:/open/executable/{name}.txt", "r").read()
    if (os.path.exists(f"D:/code_preprocessing/executable/{name}.txt")) and \
        (os.path.exists(f"D:/code_preprocessing/executable2/{name}.txt")) and \
        (os.path.exists(f"D:/code_preprocessing/executable3/{name}.txt")):
        temp = open(f"D:/code_preprocessing/executable/{name}.txt", "r").read()
        temp = temp.replace('GOODJOBANDSUCCESS', ' ')
        temp = temp.replace("**START**", " ")
        temp = temp[:100]
        temp2 =  open(f"D:/code_preprocessing/executable2/{name}.txt", "r").read()
        temp2 = temp2[:100]
        temp3 =  open(f"D:/code_preprocessing/executable3/{name}.txt", "r").read()
        temp3 = temp3[:100]
        code = code +"\n" + temp +"\n" + temp2 + "\n" + temp3
        train_code.append(code)
    elif (os.path.exists(f"D:/code_preprocessing/executable/{name}.txt")):
        temp = open(f"D:/code_preprocessing/executable/{name}.txt", "r").read()
        temp = temp.replace('GOODJOBANDSUCCESS', ' ')
        temp = temp.replace("**START**", " ")
        temp = temp[:100]
        code = code +"\n" + temp 
        train_code.append(code)
    else:
        train_code.append(code)

In [112]:
#version3
train_code = []
no_print = []
for name, code in tqdm(zip(train_df["problem_script"], train_df["code"]), total=len(train_df["code"])):
    if len(code) > 200:
        temp = code.split("\n")    
        temp_list = []
        for t in temp:
            s = t.strip()
            if s.startswith("print"):
                continue
            else:
                temp_list.append(t)
        temp_result = "\n".join(temp_list)
        if temp_result:
            no_print.append(temp_result)
        else:
            no_print.append(code)
    else:
        no_print.append(code)
    
    if (os.path.exists(f"D:/code_preprocessing/executable/{name}.txt")) and \
        (os.path.exists(f"D:/code_preprocessing/executable2/{name}.txt")) and \
        (os.path.exists(f"D:/code_preprocessing/executable3/{name}.txt")):
        temp = open(f"D:/code_preprocessing/executable/{name}.txt", "r").read()
        temp = temp.replace('GOODJOBANDSUCCESS', ' ')
        temp = temp.replace("**START**", " ")
        temp = temp[:100]
        temp2 =  open(f"D:/code_preprocessing/executable2/{name}.txt", "r").read()
        temp2 = temp2[:100]
        temp3 =  open(f"D:/code_preprocessing/executable3/{name}.txt", "r").read()
        temp3 = temp3[:100]
        code2 = aug_train[aug_train["problem_script"]==name]["code"].values[0]
        if code2 = aug_train[aug_train["problem_script"]==name]["code"].values[0]:
            code2 = code2 +"\n" + temp +"\n" + temp2 + "\n" + temp3
            train_code.append(code2)
        else:
            code = code +"\n" + temp +"\n" + temp2 + "\n" + temp3
            train_code.append(code)
    elif (os.path.exists(f"D:/code_preprocessing/executable/{name}.txt")):
        temp = open(f"D:/code_preprocessing/executable/{name}.txt", "r").read()
        temp = temp.replace('GOODJOBANDSUCCESS', ' ')
        temp = temp.replace("**START**", " ")
        temp = temp[:100]
        code2 = aug_train[aug_train["problem_script"]==name]["code"].values
        if code2:
            code2 = code2 +"\n" + temp 
            train_code.append(code2)
        else:
            code = code +"\n" + temp 
            train_code.append(code)
    else:
        train_code.append(code)
        
                    

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40590/40590 [05:51<00:00, 115.42it/s]


In [16]:
train_df["code_"] = train_code
# train_df["code_no"] = no_print

In [17]:
train_df1 = train_df[["code", "problem_num"]]
train_df2 = train_df[["code_", "problem_num"]]
# train_df3 = train_df[["code_no", "problem_num"]]
train_df2.columns = ["code", "problem_num"]
# train_df3.columns = ["code", "problem_num"]
# train_df = pd.concat([train_df1, train_df2, train_df3], ignore_index=True)
train_df = pd.concat([train_df1, train_df2], ignore_index=True)

In [18]:
train_df = train_df.sample(frac=1, random_state=103).reset_index(drop=True)

In [115]:
train_df = train_df.sample(frac=1, random_state=20220609).reset_index(drop=True)

In [118]:
train_df.code = train_df.code.map(str)

In [119]:
train_df = train_df.drop_duplicates("code")
train_df = train_df.sample(frac=1, random_state=2022).reset_index(drop=True)

In [130]:

def make_fest_dataset(train_df, tokenizer):
    from rank_bm25 import BM25Okapi
    from itertools import combinations
    import random
    codes = train_df['code'].to_list()
    problems = train_df['problem_num'].unique().tolist()
    problems.sort()

    tokenized_corpus = [tokenizer.tokenize(code) for code in codes]
    bm25 = BM25Okapi(tokenized_corpus)

    total_positive_pairs = []
    total_negative_pairs = []

    for problem in tqdm(problems):
        solution_codes = train_df[train_df['problem_num'] == problem]['code']
        ppl = solution_codes.to_list()
        random.seed(1)
        random.shuffle(ppl)
        lp = len(ppl) // 10
        ppl = ppl[:lp]
        positive_pairs = list(combinations(ppl,2))        
        solution_codes_indices = solution_codes.index.to_list()
        negative_pairs = []

        first_tokenized_code = tokenizer.tokenize(positive_pairs[0][0])
        negative_code_scores = bm25.get_scores(first_tokenized_code)
        negative_code_ranking = negative_code_scores.argsort()[::-1] # 내림차순
        ranking_idx = 0

        for solution_code in solution_codes:
            negative_solutions = []
            while len(negative_solutions) < len(positive_pairs) // len(solution_codes):
                high_score_idx = negative_code_ranking[ranking_idx]

                if high_score_idx not in solution_codes_indices:
                    negative_solutions.append(train_df['code'].iloc[high_score_idx])
                ranking_idx += 1

            for negative_solution in negative_solutions:
                negative_pairs.append((solution_code, negative_solution))

        total_positive_pairs.extend(positive_pairs)
        total_negative_pairs.extend(negative_pairs)
        
        temp_len1 = len(total_positive_pairs)
        temp_len2 = len(total_negative_pairs)
        if temp_len1 > temp_len2:
            total_positive_pairs = total_positive_pairs[:temp_len2]
        elif temp_len1 < temp_len2:
            total_negative_pairs = total_negative_pairs[:temp_len1]
        assert len(total_positive_pairs) == len(total_negative_pairs), 'length is different'

    pos_code1 = list(map(lambda x:x[0],total_positive_pairs))
    pos_code2 = list(map(lambda x:x[1],total_positive_pairs))

    neg_code1 = list(map(lambda x:x[0],total_negative_pairs))
    neg_code2 = list(map(lambda x:x[1],total_negative_pairs))

    pos_label = [1]*len(pos_code1)
    neg_label = [0]*len(neg_code1)

    pos_code1.extend(neg_code1)
    total_code1 = pos_code1
    pos_code2.extend(neg_code2)
    total_code2 = pos_code2
    pos_label.extend(neg_label)
    total_label = pos_label
    pair_data = pd.DataFrame(data={
        'code1':total_code1,
        'code2':total_code2,
        'similar':total_label})
    pair_data = pair_data.sample(frac=1).reset_index(drop=True)
    return pair_data

In [132]:
train_data = make_fest_dataset(train_df, tokenizer)
valid_data = make_dataset(valid_df, tokenizer)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [2:04:23<00:00, 24.88s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [01:21<00:00,  3.68it/s]


In [144]:
len(train_data["code2"].iloc[0])

2599

In [142]:
code1 =[]
code2 = []
similar = []
for i, j, s in zip(train_data["code1"],train_data["code2"], train_data["similar"]):
    if not (i.startswith("[")) and not (j.startswith("[")):
        code1.append(i)
        code2.append(j)
        similar.append(s)

In [143]:
len(code1)

86666

In [146]:
train_pair_data = pd.DataFrame({"code1":code1, "code2":code2, "similar":similar})

In [None]:
def reduction_1(data):
    data1 = data.drop_duplicates("code1")
    data1 = data1.reset_index(drop=True)
    return data1

In [21]:
def reduction_xdataset(data):
    data1 = data.drop_duplicates("code1")
    data2 = data.drop_duplicates("code2")
    data3 = data.drop_duplicates("code1", keep="last")
    data4 = data.drop_duplicates("code2", keep="last")
    re_data = pd.concat([data1, data2, data3, data4], ignore_index=True)
    re_data = re_data.drop_duplicates(["code1", "code2"])
    re_data = re_data.reset_index(drop=True)
    return re_data

In [59]:
ce = glob("D:/meta2/DACON/Code_classification/list_comprehension/ex*py")
ced = {}
for c in ce:
    re_lines = []
    with open(c, "rt") as f:
        lines = f.readlines()
        name = os.path.basename(c).split('.')[0]
        for line in lines:
            line = line.replace("    ","\t")
            re_lines.append(line)
        text = "\n".join(re_lines)
        ced[name] = text
        
ced1 = [ced["ex1_1"], ced["ex2_1"], ced["ex3_1"], ced["ex4_1"], ced["ex5_1"], ced["ex6_1"], ced["ex7_1"]]
ced2 = [ced["ex1_2"], ced["ex2_2"], ced["ex3_2"], ced["ex4_2"], ced["ex5_2"], ced["ex6_2"], ced["ex7_2"]]

add = pd.DataFrame({"code1":ced1, "code2":ced2, "similar":[1, 1, 1, 1, 0, 1, 1]})
add

Unnamed: 0,code1,code2,similar
0,e1 = [i for i in range(10)]\n\nprint(e1),x = []\n\nfor i in range(10):\n\n\t x.append(...,1
1,e2 = [i*i for i in range(10)]\n\nprint(e2),x2 = []\n\nfor i in range(10):\n\n\t x2.appen...,1
2,"e3 = [i for i in range(10, -1, -1)]\n\nprint(e3)","x3 = []\n\nfor i in range(10, -1, -1):\n\n\t ...",1
3,f1 = [i for i in [i for i in range(20)] if i %...,f2 = []\n\nfor i in range(20):\n\n\t if i % 2...,1
4,"k = [[i for i in range(10)],[i for i in range(...",t = []\n\nfor r in range(3):\n\n\t if r==0:\n...,0
5,"#same\n\nk = [[i for i in range(10)],[i for i ...",t = []\n\nfor r in range(3):\n\n\t if r==0:\n...,1
6,"m = {""col"":1, ""ind"":2}\n\nq = {k:v for k,v in ...","t = {}\n\nt[""col""] =1\n\nt[""ind""] =2\n\nz = {}...",1


In [61]:
temp = pd.concat([add, train_data], ignore_index=True)
temp

Unnamed: 0,code1,code2,similar
0,e1 = [i for i in range(10)]\n\nprint(e1),x = []\n\nfor i in range(10):\n\n\t x.append(...,1
1,e2 = [i*i for i in range(10)]\n\nprint(e2),x2 = []\n\nfor i in range(10):\n\n\t x2.appen...,1
2,"e3 = [i for i in range(10, -1, -1)]\n\nprint(e3)","x3 = []\n\nfor i in range(10, -1, -1):\n\n\t ...",1
3,f1 = [i for i in [i for i in range(20)] if i %...,f2 = []\n\nfor i in range(20):\n\n\t if i % 2...,1
4,"k = [[i for i in range(10)],[i for i in range(...",t = []\n\nfor r in range(3):\n\n\t if r==0:\n...,0
...,...,...,...
21845896,from operator import mul\nfrom functools impor...,import math\nfrom math import gcd\nINF = float...,1
21845897,"n, d = list(map(int, input().split(' ')))\nres...","n, m, q = list(map(int, input().split()))\na=[...",0
21845898,"H,N = list(map(int,input().split()))\nINF = 10...","import sys\nN,M = list(map(int,input().split()...",0
21845899,from collections import defaultdict\nfrom math...,import math\ndef main():\n\t\tmod = 1000000007...,1


In [62]:
train_data = temp

In [19]:
train_data = make_dataset(train_df, tokenizer)
valid_data = make_dataset(valid_df, tokenizer)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [46:27<00:00,  9.29s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [01:17<00:00,  3.87it/s]


In [23]:
train_pair_data = reduction_xdataset(train_data)
valid_pair_data = reduction_dataset(valid_data)
# train_pair_data["similar"] = train_pair_data["similar"].map(float)
# valid_pair_data["similar"] = valid_pair_data["similar"].map(float)

#그냥데이터
#3_txt파일 몇개 붙인거
#4augmentation
#5_1024데이터셋
#6_1024_last
#7clean_augmentation
#8clean_new_augmentation
#9clean_new_augmentation_1024
#10clean_new_augmentation_2048
#11clean_new_augmentation_2048
#12 75%
#13 n0_print, aug
#14 noPorit
#15 1026
train_pair_data.to_csv("D:/code_classification/python3_train15.csv", index=False)
valid_pair_data.to_csv("D:/code_classification/python3_valid15.csv", index=False)

In [None]:
train_pair_data.shape

In [134]:

valid_pair_data = reduction_dataset(valid_data)
train_data["similar"] = train_data["similar"].map(float)
valid_pair_data["similar"] = valid_pair_data["similar"].map(float)
#그냥데이터
#3_txt파일 몇개 붙인거
#4augmentation
#5_1024데이터셋
#6_1024_last
#7clean_augmentation
#8clean_new_augmentation
#9clean_new_augmentation_1024
#10clean_new_augmentation_2048
#11clean_new_augmentation_2048
#12 75%
#13 no_print, print_all
train_data.to_csv("D:/code_classification/python3_train13.csv", index=False)
valid_pair_data.to_csv("D:/code_classification/python3_valid13.csv", index=False)

In [147]:
train_pair_data.to_csv("D:/code_classification/python3_train14.csv", index=False)

In [27]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
df['encode2'] = df['code'].apply(tokenizer.encode)
tokenizer = AutoTokenizer.from_pretrained("microsoft/unixcoder-base")
df['encode'] = df['code'].apply(tokenizer.encode)

Token indices sequence length is longer than the specified maximum sequence length for this model (924 > 512). Running this sequence through the model will result in indexing errors


In [28]:
if df["encode"].all() == df["encode2"].all():
    print(True)

True
