# Cleaning data

In [1]:
import pandas as pd
import numpy as np
data = pd.read_csv("/kaggle/input/uos-ai-week-2023where-theres-code-theres-bug/train.csv")

In [2]:
import re
def remove_empty(data,column):
    idxs = data[(data[column].str.contains(r'(?m)^.*\n.*\n\s*\n.*\n.*')) & (data["label"]==1)].index
    return data.drop(idxs)
def clean_code(code):
    # Remove comments
    code = re.sub(r'(?m)^\s*#.*\n?', '', code) # python
    code = re.sub(r'(?m)^\s*//.*\n?', '', code) # //
    code = re.sub(r'(?m)^\s*(\/\*)?[\S\s]*\*\/', '', code) # comment blocks
    
    # Remove extra whitespace
    code = re.sub(r'\s+', ' ', code)
    # split
    code = ' '.join(re.split(r'([\(\)\{\}\;\:\[\]\&\=\<\>\,\.\+\-\/\*])',code))
    
    # remove numericals
    def numrepl(matchobj):
        return matchobj.group(1) + '<NUM>'
    code = re.sub(r'([^a-zA-Z0-9_]){1}\d+(?:\.\d+)?f?',numrepl,code)
    # remove strings
    code = re.sub('".*"','<STR>',code)
    # Remove leading and trailing whitespace
    code = code.strip()
    return code

In [3]:
data["code"].apply(lambda x: len(x.split(" "))).describe()

count    209786.000000
mean         39.616347
std          30.473218
min           1.000000
25%          17.000000
50%          35.000000
75%          54.000000
max        1467.000000
Name: code, dtype: float64

In [4]:
data.file_extension.value_counts()

java    98763
py      59337
cpp     32821
rs      12761
js       4673
ts        845
kt        481
c         105
Name: file_extension, dtype: int64

In [5]:
data.label.value_counts()

0    144123
1     65663
Name: label, dtype: int64

In [6]:
data = remove_empty(data,'code')
data['clean_code'] = data['code'].apply(lambda x : clean_code(x))
data.head()

Unnamed: 0,id,code,file_extension,label,clean_code
0,8e7e71745d,a0 = alpha_winder(0)\n ...,py,0,a0 = alpha_winder ( <NUM> ) rebased_winder ...
1,f67fb60d2f,"[&](const ExprHandle& m, const ExprHandl...",cpp,0,"[ & ] ( const ExprHandle & m , const Expr..."
3,ac09060d54,def test_is_fast(self):\n for token...,py,1,"def test_is_fast ( self ) : for tokenizer , ..."
4,c746c1badf,public Exception getException() {\n ...,java,0,public Exception getException ( ) { return...
5,08ae206799,"groups=groups,\n bias=False,\n ...",py,0,"groups = groups , bias = False , dilation = ..."


## Segregate Data based on file extension

In [7]:
def segregate_extensions(data):
    df_java = data[data['file_extension'] == 'java'].copy()
    df_py = data[data['file_extension'] == 'py'].copy()
    df_cpp = data[data['file_extension'] == 'cpp'].copy()
    df_rs = data[data['file_extension'] == 'rs'].copy()
    df_js = data[data['file_extension'] == 'js'].copy()
    df_ts = data[data['file_extension'] == 'ts'].copy()
    df_kt = data[data['file_extension'] == 'kt'].copy()
    df_c = data[data['file_extension'] == 'c'].copy()    
    return (df_java, df_py, df_cpp, df_rs, df_js, df_ts, df_kt, df_c)

In [8]:
(df_java, df_py, df_cpp, df_rs, df_js, df_ts, df_kt, df_c) = segregate_extensions(data)

In [9]:
df_java.value_counts()

id          code                                                                                                                                                                                                                                                                                                                                                        file_extension  label  clean_code                                                                                                                                                                                                                              
0000264569      builder.requestStaticInjection(StaticInjectionTest.Static.class);\n\n    Container c = builder.createContainer();\n\n    assertEquals("test", StaticInjectionTest.Static.s);                                                                                                                                                                            java            0      b

In [10]:
df_java

Unnamed: 0,id,code,file_extension,label,clean_code
4,c746c1badf,public Exception getException() {\n ...,java,0,public Exception getException ( ) { return...
6,e852600d11,* @return hubVirtualNetworkConnection Res...,java,0,@ServiceMethod ( returns = ReturnType . SING...
7,2d310cad6e,\r\nimport io.onedev.server.web.behavior.UserM...,java,1,import io . onedev . server . web . behavior ....
8,2cee71c5c5,\n /**\n * Get the verifierType propert...,java,0,/ * * * Get the verifierType property : ...
12,d7657ec588,* @return a common class for general reso...,java,0,@ServiceMethod ( returns = ReturnType . SING...
...,...,...,...,...,...
209776,ddca4556ee,\n initEntityResolver();\n\n ...,java,0,initEntityResolver ( ) ; }
209777,e9337332d7,"\n // $ANTLR start ""andRestriction""\n //...",java,1,public final BaseDescr andRestriction ( ) th...
209778,e50517a763,.setKeyOps(createKeyOptions.getKey...,java,1,. setKeyOps ( createKeyOptions . getKeyOperati...
209779,7cda935786,import com.pmease.gitplex.core.gatekeeper.AndG...,java,0,import com . pmease . gitplex . core . gatekee...


In [11]:
class Extension():
    def __init__(self,df):
        self.df = df

In [12]:
extensions = {ext:{} for ext in  data['file_extension'].unique()}

In [13]:
extensions

{'py': {},
 'cpp': {},
 'java': {},
 'rs': {},
 'ts': {},
 'js': {},
 'kt': {},
 'c': {}}

In [14]:
idx=0
test_size=0
from sklearn.model_selection import train_test_split
for ext in data['file_extension'].unique():
    extensions[ext] = {'data':data[data['file_extension'] == ext].copy()}
    if test_size > 0:
        X_train, X_test, y_train, y_test = \
        train_test_split(extensions[ext]['data']['clean_code'],extensions[ext]['data']['label'], test_size=0.2)
        extensions[ext]['X_train'] = X_train
        extensions[ext]['X_test'] = X_test
        extensions[ext]['y_train'] = y_train
        extensions[ext]['y_test'] = y_test
    else:
        extensions[ext]['X_train'] = extensions[ext]['data']['clean_code']
        extensions[ext]['y_train'] = extensions[ext]['data']['label']
    idx+=1

In [15]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(data['clean_code'], data['label'], test_size=0.2)

In [16]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [17]:
for et in extensions:
    extensions[et]['tokenizer'] = Tokenizer()
    extensions[et]['tokenizer'].fit_on_texts(extensions[et]['X_train'])
    extensions[et]['X_train_seq'] = extensions[et]['tokenizer'].texts_to_sequences(extensions[et]['X_train'])
    if test_size > 0:
        et['X_test_seq'] = extensions[et]['tokenizer'].texts_to_sequences(extensions[et]['X_test'])

In [18]:
# Pad the sequences so each sequence is the same length
max_len = 50
#X_train_seq_padded = pad_sequences(X_train_seq, max_len,padding='post',truncating='post')
#X_test_seq_padded = pad_sequences(X_test_seq, max_len,padding='post',truncating='post')
for et in extensions:
    extensions[et]['X_train_seq_padded'] = pad_sequences(extensions[et]['X_train_seq'], max_len,padding='post',truncating='post')
    if test_size > 0:
        extensions[et]['X_test_seq_padded'] = pad_sequences(extensions[et]['X_test_seq'], max_len,padding='post',truncating='post')

In [19]:
import keras.backend as K
from keras.layers import Dense, Embedding, LSTM
from keras.models import Sequential
import tensorflow as tf
from tensorflow import keras

def recall_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

# RNN

In [20]:
from tensorflow import keras
from keras import layers
from tensorflow import keras
from keras.layers import Dropout, BatchNormalization, Bidirectional,Dense, Embedding, LSTM
from keras.models import Sequential
for et in extensions:
    inputs = keras.Input(shape=(None,), dtype="int64")
    embedded = layers.Embedding(input_dim=len(extensions[et]['tokenizer'].index_word)+1, output_dim=128)(inputs)
    x = layers.Bidirectional(layers.LSTM(32))(embedded)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    extensions[et]['model'] = keras.Model(inputs, outputs)
    extensions[et]['model'].compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy','AUC'])
    extensions[et]['model'].summary()

2023-01-24 14:40:08.089406: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-24 14:40:08.090332: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-24 14:40:08.279983: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-24 14:40:08.280798: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-24 14:40:08.281550: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from S

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 128)         3016192   
_________________________________________________________________
bidirectional (Bidirectional (None, 64)                41216     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 3,057,473
Trainable params: 3,057,473
Non-trainable params: 0
_________________________________________________________________
Model: "model_1"
_________________________________________________________________
Layer (type)                 Outpu

In [21]:
from tensorflow.keras import layers
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

embed_dim = 64  # Embedding size for each token
num_heads = 4  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

for et in extensions:
    inputs = layers.Input(shape=(max_len,))
    embedding_layer = TokenAndPositionEmbedding(max_len, len(extensions[et]['tokenizer'].index_word)+1, embed_dim)
    x = embedding_layer(inputs)
    transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
    x = transformer_block(x)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.LayerNormalization()(x)
    x = layers.Dropout(0.5)(x)
    x = layers.Dense(64, activation="relu")(x)
    x = layers.Dropout(0.1)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)

    extensions[et]['model'] = keras.Model(inputs=inputs, outputs=outputs)
    extensions[et]['model'].compile(optimizer='adam',
                          loss='binary_crossentropy',
                          metrics=['accuracy','AUC'])
    extensions[et]['model'].summary()

Model: "model_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
token_and_position_embedding (None, 50, 64)            1511296   
_________________________________________________________________
transformer_block (Transform (None, 50, 64)            70816     
_________________________________________________________________
global_average_pooling1d (Gl (None, 64)                0         
_________________________________________________________________
layer_normalization_2 (Layer (None, 64)                128       
_________________________________________________________________
dropout_10 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 64)                4160

In [22]:

for et in extensions:
    callbacks = [
        keras.callbacks.ModelCheckpoint("model_{}.keras".format(et),save_best_only=True)
    ]
    extensions[et]['model_path'] = "model_{}.keras".format(et)
    if test_size > 0:
        extensions[et]['history'] = extensions[et]['model'] .fit(extensions[et]['X_train_seq_padded'], extensions[et]['y_train'], 
                                            batch_size=32, epochs=10,
                                            callbacks=callbacks,
                                            validation_data=(extensions[et]['X_test_seq_padded'], extensions[et]['y_test']))
    else:
        extensions[et]['history'] = extensions[et]['model'] .fit(extensions[et]['X_train_seq_padded'], extensions[et]['y_train'], 
                                            batch_size=32, epochs=10,
                                            callbacks=callbacks)

2023-01-24 14:40:18.253335: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10


2023-01-24 14:40:23.450948: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


##### 

labels, axis=1 is## Submission

In [23]:
# load test.csv,
submission = pd.read_csv(r'/kaggle/input/uos-ai-week-2023where-theres-code-theres-bug/test.csv')
submission

Unnamed: 0,id,code,file_extension
0,9125d50679,self.play(\n pixels_to_dete...,py
1,dc3b7fbe81,*[\n Changi...,py
2,3276506966,...,cpp
3,556239b7c2,"""""""\n A = torch.randn(*(batch_dims + (m...",py
4,15c4f39064,abstract fun getAllOutdated(outdatedThresh...,kt
...,...,...,...
22994,13696067b8,"help=""The root of the backing fi...",py
22995,59c99cf33d,\n public void doLeftDeletes(LeftTupleSink ...,java
22996,4755e18b90,"gpu_last_map_locations,\n ...",py
22997,6de73b77c1,break;\r\n ...,java


In [24]:
import tensorflow as tf

# create a function, that given a string comment, it predicts topic as a string,
def predict_bug(code):
    'input: "x=y\nz=w\n for b in range(5,4):\nz+=b\nprint(z)"'
    'returns: 1 (buggy)'
    code = clean_code(code)
    tokens = tokenizer.texts_to_sequences(code)
    if tokens == None or tokens == [] or len(tokens) == 0:
        return 0
    #print(code)
    seq = pad_sequences(tokens,maxlen=max_len,padding='post',truncating='post')
    #print(seq)
    return model.predict(seq)

from keras.models import load_model
#model = load_model(r"/kaggle/working/bi_rnn.keras")

# store all predicted topics in one new column
submission['clean_code'] = submission['code'].apply(lambda code: clean_code(code))
submission['label'] = 0
submission_df = pd.DataFrame(columns=['id','label','code','file_extension'])
for et in submission['file_extension'].unique():
    ext = extensions[et]
    submissions_et = submission[submission['file_extension'] == et].copy()
    sequences = extensions[et]['tokenizer'].texts_to_sequences(submissions_et['clean_code'])
    sequences = pad_sequences(sequences,maxlen=max_len,padding='post',truncating='post')
    model = extensions[et]['model'] #load_model(extensions[et]['model_path'])
    submissions_et['label'] = tf.cast(tf.round(model.predict(sequences)),dtype=tf.int64)
    # now assign correct labels to submission based on ids
    submission_df = pd.concat([submission_df,submissions_et],ignore_index=True)

# drop the Comments column
submission_df = submission_df.drop(columns=['code','clean_code','file_extension'])

# save the dataframe as a .csv file for submission
submission_df.to_csv('submission.csv',index=False)

In [25]:
submission_df

Unnamed: 0,id,label
0,9125d50679,0
1,dc3b7fbe81,0
2,556239b7c2,1
3,d968be6e2b,0
4,8001cd915d,1
...,...,...
22994,727bc99b9b,0
22995,a477322861,0
22996,53c603d3c0,0
22997,57f13c9cae,0
