In [115]:
import torch
import codenet

import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from scipy import sparse

pd.set_option('max_columns', None)

codenet.P = 4

input_path = "../input/generated/"
data_path = input_path + "data/"

generate_labels_path = input_path + "generate_labels.csv"

In [11]:
generate_labels_df = pd.read_csv(generate_labels_path)
generate_labels_df = generate_labels_df[generate_labels_df['language'] == 'Python']
generate_labels_df.head()

Unnamed: 0,tag,i1,i2,j1,j2,problem_id,original_id,changed_id,language,extension,original_language,original_status,output,error,returncode,error_class,error_class_extra
4,insert,9,9,9,13,p02628,s000778835,s833669381,Python,py,Python (3.8.2),Runtime Error,,"Traceback (most recent call last):\n File ""/h...",1,ValueError,ValueError: invalid literal for int() with bas...
5,replace,136,137,136,137,p02406,s000873533,s357405506,Python,py,Python,Time Limit Exceeded,,"File ""/home/alex/Documents/research/bug-dete...",1,SyntaxError,SyntaxError: Missing parentheses in call to 'p...
7,replace,35,37,35,36,p02681,s001183728,s212071531,Python,py,Python (3.8.2),Runtime Error,,"File ""/home/alex/Documents/research/bug-dete...",1,SyntaxError,SyntaxError: invalid syntax
9,replace,86,87,86,87,p02314,s001192878,s967652203,Python,py,Python,Runtime Error,,"File ""/home/alex/Documents/research/bug-dete...",1,SyntaxError,SyntaxError: Missing parentheses in call to 'p...
11,replace,38,39,38,39,p03694,s001344748,s267165124,Python,py,Python (3.8.2),Runtime Error,,"Traceback (most recent call last):\n File ""/h...",1,NameError,NameError: name 'b' is not defined


In [4]:
X, y = codenet.detection_X_y(generate_labels_df)

Processing p02677 s999352609: 100%|█| 3851/38


In [54]:
stratify = [a[0][0] if a else "Accepted" for a in y]
label_enc = LabelEncoder(random_state=42)
stratify = label_enc.fit_transform(stratify)
y = [[(s, *a[0][1:])] if a else [] for s, a in zip(stratify, y)]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, stratify=stratify, random_state=42)

feat_extraction = Pipeline([
    ('vect', CountVectorizer(analyzer=lambda x: x)),
    ('tfidf', TfidfTransformer()),
])

X_train = feat_extraction.fit_transform(X_train)
X_test = feat_extraction.transform(X_test)

in_feat = X_train.shape[1]
num_classes = len(np.unique(stratify))

in_feat, num_classes

(2769, 17)

In [89]:
class Model(nn.Module):
    def __init__(self, in_feat, num_classes):
        super(Model, self).__init__()

        self.in_feat = in_feat
        self.num_classes = num_classes
        self.out_feat = 1 + 2 + num_classes
        
        self.linear = nn.Linear(in_feat, self.out_feat)
        
    def forward(self, x):
        x = self.linear(x)
        return x

class Loss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
        self.mce = nn.CrossEntropyLoss()
        
    def forward(self, output, target):      
        if len(target) == 0:
            return self.mse(output[0, 0], torch.tensor(0.0))
        
        obj_loss = self.mse(output[0, 0], torch.tensor(1.0))
        pos_loss = self.mse(output[0, 1:3], target[0, 1:3])
        mce_loss = self.mce(output[0:1, 2:], target[0:1, 0].long())

        return obj_loss + pos_loss + mce_loss

In [92]:
model = Model(in_feat, num_classes)
loss_fn = Loss()
solver = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))

In [93]:
dataloader = list(zip(X_train, y_train))

for epoch in range(50):
    with tqdm(dataloader, position=0, total=len(dataloader)) as loop:
        for i, (x, target) in enumerate(loop):
            x = torch.tensor(x.toarray()).float()
            target = torch.tensor(target).float()

            predictions = model(x)
            loss = loss_fn(predictions, target)

            model.zero_grad()
            loss.backward()
            solver.step()

            loop.set_postfix(loss=loss.item(), epoch=epoch)

100%|█| 6161/6161 [00:17<00:00, 343.63it/s, e
100%|█| 6161/6161 [00:17<00:00, 345.39it/s, e
100%|█| 6161/6161 [00:18<00:00, 338.02it/s, e
100%|█| 6161/6161 [00:19<00:00, 322.27it/s, e
100%|█| 6161/6161 [00:18<00:00, 326.80it/s, e
100%|█| 6161/6161 [00:18<00:00, 330.05it/s, e
100%|█| 6161/6161 [00:18<00:00, 324.80it/s, e
100%|█| 6161/6161 [00:18<00:00, 325.77it/s, e
100%|█| 6161/6161 [00:19<00:00, 323.40it/s, e
100%|█| 6161/6161 [00:18<00:00, 329.94it/s, e
100%|█| 6161/6161 [00:18<00:00, 339.21it/s, e
100%|█| 6161/6161 [00:19<00:00, 320.40it/s, e
100%|█| 6161/6161 [00:19<00:00, 316.79it/s, e
100%|█| 6161/6161 [00:19<00:00, 322.50it/s, e
100%|█| 6161/6161 [00:18<00:00, 328.49it/s, e
100%|█| 6161/6161 [00:18<00:00, 328.52it/s, e
100%|█| 6161/6161 [00:18<00:00, 326.78it/s, e
100%|█| 6161/6161 [00:18<00:00, 328.36it/s, e
100%|█| 6161/6161 [00:18<00:00, 327.67it/s, e
100%|█| 6161/6161 [00:18<00:00, 326.32it/s, e
100%|█| 6161/6161 [00:18<00:00, 328.52it/s, e
100%|█| 6161/6161 [00:18<00:00, 32

In [120]:
dataloader = list(zip(X_test, y_test))

def get_prediction(predictions):
    if predictions[0, 0] < 0.5:
        return []

    line, column = predictions[0, 1:3].long().tolist()
    label = torch.argmax(predictions[0, 3:]).item()
    
    return [(label, line, column)]
    
result = []

with tqdm(dataloader, position=0, total=len(dataloader)) as loop:
    for i, (x, target) in enumerate(loop):
        x = torch.tensor(x.toarray()).float()
        target = torch.tensor(target).float()

        predictions = model(x)
        predictions = get_prediction(predictions)
        
        result.append(predictions)

100%|█| 1541/1541 [00:00<00:00, 10997.41it/s]


In [121]:
a_pred = [a[0][0] if a else label_enc.transform(["Accepted"])[0] for a in result]
a_test = [a[0][0] if a else label_enc.transform(["Accepted"])[0] for a in y_test]

In [122]:
print(f"Accuracy: {accuracy_score(a_test, a_pred)}")
print(f"Precision: {precision_score(a_test, a_pred, average='weighted', zero_division=0)}")
print(f"Recall: {recall_score(a_test, a_pred, average='weighted', zero_division=0)}")
print(f"F1: {f1_score(a_test, a_pred, average='weighted')}")

Accuracy: 0.25243348475016225
Precision: 0.21452804626751823
Recall: 0.2524334847501622
F1: 0.23150146491345164
