# Cleaning data

In [1]:
import pandas as pd
import numpy as np
data = pd.read_csv("/kaggle/input/uos-ai-week-2023where-theres-code-theres-bug/train.csv")

In [2]:
import re
def remove_empty(data,column):
    idxs = data[(data[column].str.contains(r'(?m)^.*\n.*\n\s*\n.*\n.*')) & (data["label"]==1)].index
    return data.drop(idxs)
def clean_code(code):
    # Remove comments
    code = re.sub(r'(?m)^\s*#.*\n?', '', code) # python
    code = re.sub(r'(?m)^\s*//.*\n?', '', code) # //
    code = re.sub(r'(?m)^\s*(\/\*)?[\S\s]*\*\/', '', code) # comment blocks
    
    # Remove extra whitespace
    code = re.sub(r'\s+', ' ', code)
    # split
    code = ' '.join(re.split(r'([\(\)\{\}\;\:\[\]\&\=\<\>\,\.\+\-\/\*])',code))
    
    # remove numericals
    def numrepl(matchobj):
        return matchobj.group(1) + '<NUM>'
    code = re.sub(r'([^a-zA-Z0-9_]){1}\d+(?:\.\d+)?f?',numrepl,code)
    # remove strings
    code = re.sub('".*"','<STR>',code)
    # Remove leading and trailing whitespace
    code = code.strip()
    return code

In [3]:
data["code"].apply(lambda x: len(x.split(" "))).describe()

count    209786.000000
mean         39.616347
std          30.473218
min           1.000000
25%          17.000000
50%          35.000000
75%          54.000000
max        1467.000000
Name: code, dtype: float64

In [4]:
data.file_extension.value_counts()

java    98763
py      59337
cpp     32821
rs      12761
js       4673
ts        845
kt        481
c         105
Name: file_extension, dtype: int64

In [5]:
data.label.value_counts()

0    144123
1     65663
Name: label, dtype: int64

In [6]:
data = remove_empty(data,'code')
data['clean_code'] = data['code'].apply(lambda x : clean_code(x))
data.head()

Unnamed: 0,id,code,file_extension,label,clean_code
0,8e7e71745d,a0 = alpha_winder(0)\n ...,py,0,a0 = alpha_winder ( <NUM> ) rebased_winder ...
1,f67fb60d2f,"[&](const ExprHandle& m, const ExprHandl...",cpp,0,"[ & ] ( const ExprHandle & m , const Expr..."
3,ac09060d54,def test_is_fast(self):\n for token...,py,1,"def test_is_fast ( self ) : for tokenizer , ..."
4,c746c1badf,public Exception getException() {\n ...,java,0,public Exception getException ( ) { return...
5,08ae206799,"groups=groups,\n bias=False,\n ...",py,0,"groups = groups , bias = False , dilation = ..."


In [7]:
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(data['clean_code'], data['label'], test_size=0.0)

In [8]:
X_train, y_train = (data['clean_code'], data['label'])

In [9]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [10]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
#X_test_seq = tokenizer.texts_to_sequences(X_test)

In [11]:
# Pad the sequences so each sequence is the same length
max_len = 50
X_train_seq_padded = pad_sequences(X_train_seq, max_len,padding='post',truncating='post')
#X_test_seq_padded = pad_sequences(X_test_seq, max_len,padding='post',truncating='post')

In [12]:
import keras.backend as K
from keras.layers import Dense, Embedding, LSTM
from keras.models import Sequential

def recall_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

# RNN

In [13]:
from tensorflow import keras
from keras import layers
from tensorflow import keras
from keras.layers import Dropout, BatchNormalization, Bidirectional,Dense, Embedding, LSTM
from keras.models import Sequential

inputs = keras.Input(shape=(None,), dtype="int64")
embedded = layers.Embedding(input_dim=len(tokenizer.index_word)+1, output_dim=128)(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy','AUC'])
model.summary()


2023-01-22 17:34:34.955537: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-22 17:34:35.049973: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-22 17:34:35.050724: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-22 17:34:35.051829: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 128)         13380992  
_________________________________________________________________
bidirectional (Bidirectional (None, 64)                41216     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 13,422,273
Trainable params: 13,422,273
Non-trainable params: 0
_________________________________________________________________


In [14]:
y_train

0         0
1         0
3         1
4         0
5         0
         ..
209781    0
209782    0
209783    0
209784    0
209785    0
Name: label, Length: 207409, dtype: int64

In [15]:
# Plot basic evaluation metrics across epochs
#import matplotlib.pyplot as plt
#%matplotlib inline

#for i in ['accuracy', 'precision_m', 'recall_m']:
 #   acc = history.history[i]
 #   val_acc = history.history['val_{}'.format(i)]
 #   epochs = range(1, len(acc) + 1)

 #   plt.figure()
 #   plt.plot(epochs, acc, label='Training Accuracy')
 #   plt.plot(epochs, val_acc, label='Validation Accuracy')
 #   plt.title('Results for {}'.format(i))
 #   plt.legend()
 #   plt.show()

## Transformer

In [16]:
from tensorflow.keras import layers
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [17]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [18]:
import tensorflow as tf
from tensorflow import keras
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

inputs = layers.Input(shape=(max_len,))
embedding_layer = TokenAndPositionEmbedding(max_len, len(tokenizer.index_word)+1, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.25)(x)
x = layers.Dense(40, activation="relu")(x)
x = layers.LayerNormalization()(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

In [19]:
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
token_and_position_embedding (None, 50, 32)            3346848   
_________________________________________________________________
transformer_block (Transform (None, 50, 32)            10656     
_________________________________________________________________
global_average_pooling1d (Gl (None, 32)                0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 40)                1320      
_________________________________________________________________
layer_normalization_2 (Layer (None, 40)                80  

## Train

In [20]:
# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy','AUC'])

In [21]:
callbacks = [
    keras.callbacks.ModelCheckpoint("bi_rnn.keras",
                                    save_best_only=True)
]
history = model.fit(X_train_seq_padded, y_train, 
                    batch_size=32, epochs=5,
                    callbacks=callbacks)
                    #validation_data=(X_test_seq_padded, y_test))

Epoch 1/5


2023-01-22 17:34:39.060287: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


   1/6482 [..............................] - ETA: 6:26:23 - loss: 0.7154 - accuracy: 0.5938 - auc: 0.5841

2023-01-22 17:34:42.509407: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Submission

In [22]:
# load test.csv,
submission = pd.read_csv(r'/kaggle/input/uos-ai-week-2023where-theres-code-theres-bug/test.csv')
submission

Unnamed: 0,id,code,file_extension
0,9125d50679,self.play(\n pixels_to_dete...,py
1,dc3b7fbe81,*[\n Changi...,py
2,3276506966,...,cpp
3,556239b7c2,"""""""\n A = torch.randn(*(batch_dims + (m...",py
4,15c4f39064,abstract fun getAllOutdated(outdatedThresh...,kt
...,...,...,...
22994,13696067b8,"help=""The root of the backing fi...",py
22995,59c99cf33d,\n public void doLeftDeletes(LeftTupleSink ...,java
22996,4755e18b90,"gpu_last_map_locations,\n ...",py
22997,6de73b77c1,break;\r\n ...,java


In [23]:
import tensorflow as tf
# create a function, that given a string comment, it predicts topic as a string,
def predict_bug(code):
    'input: "x=y\nz=w\n for b in range(5,4):\nz+=b\nprint(z)"'
    'returns: 1 (buggy)'
    code = clean_code(code)
    tokens = tokenizer.texts_to_sequences(code)
    if tokens == None or tokens == [] or len(tokens) == 0:
        return 0
    #print(code)
    seq = pad_sequences(tokens,maxlen=max_len,padding='post',truncating='post')
    #print(seq)
    return model.predict(seq)

#from keras.models import load_model
#model = load_model(r"/kaggle/working/bi_rnn.keras")

# store all predicted topics in one new column
submission['clean_code'] = submission['code'].apply(lambda code: clean_code(code))
sequences = tokenizer.texts_to_sequences(submission['clean_code'])
sequences = pad_sequences(sequences,maxlen=max_len,padding='post',truncating='post')
submission['label'] = tf.cast(tf.round(model.predict(sequences)),dtype=tf.int64)

# drop the Comments column
submission = submission.drop(labels=['clean_code','code','file_extension'], axis=1)

# save the dataframe as a .csv file for submission
submission.to_csv('submission.csv',index=False)

In [24]:
submission

Unnamed: 0,id,label
0,9125d50679,0
1,dc3b7fbe81,0
2,3276506966,0
3,556239b7c2,1
4,15c4f39064,0
...,...,...
22994,13696067b8,1
22995,59c99cf33d,0
22996,4755e18b90,1
22997,6de73b77c1,0
