In [None]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout, Bidirectional, TimeDistributed,Conv1D, MaxPooling1D, Input, concatenate
from keras.layers.recurrent import SimpleRNN
from keras.layers import GRU, LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Model
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
import keras
import os
import tarfile
import re
import pandas as pd
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.model_selection import KFold
from keras_self_attention import SeqSelfAttention
import statistics

# Sequence length handed to the model builders as `maxlen`; the read_* helpers
# pad every document to the same length (50).
max_length = 50
# Handed to the model builders as `max_features` (embedding input size); the
# read_* helpers hash words into the same range with one_hot.
vocab_size = 2000

In [None]:
def RNN(maxlen = 50, max_features = 4590, embed_size =32):
    """Build an (uncompiled) GRU + self-attention binary classifier.

    Layers: Embedding -> GRU(16, returning the full sequence) ->
    SeqSelfAttention -> Flatten -> Dense(1, sigmoid).

    Args:
        maxlen: Length of each padded input sequence.
        max_features: Input dimension of the embedding layer.
        embed_size: Dimension of each embedding vector.

    Returns:
        The keras ``Sequential`` model; the caller compiles it.
    """
    model = Sequential()
    model.add(Embedding(max_features, embed_size, input_length=maxlen))
#     model.add(Dropout(0.5))
    # return_sequences=True keeps one output per time step for the attention layer.
    model.add(GRU(16,dropout=0.2,return_sequences=True))
    model.add(SeqSelfAttention(attention_activation='sigmoid'))
    model.add(Flatten())
#     model.add(Dense(8, activation='relu'))
#     model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    # summary() prints the table itself and returns None, so it is not wrapped
    # in print() (which added a stray "None" line to the output).
    model.summary()

    return model

def CNN(maxlen=50, max_features=4590, embed_size=32):
    """Build an (uncompiled) multi-kernel 1D-CNN binary classifier.

    The embedded sequence is fed to one Conv1D branch per kernel size (2 and 3);
    each branch is max-pooled and flattened, the branches are concatenated and
    passed through Dropout -> Dense(16, relu) -> Dense(1, sigmoid).

    Args:
        maxlen: Length of each padded input sequence.
        max_features: Input dimension of the embedding layer.
        embed_size: Dimension of each embedding vector.

    Returns:
        The keras functional ``Model``; the caller compiles it.
    """
    # Inputs
    comment_seq = Input(shape=[maxlen], name='x_seq')

    # Embeddings layers
    emb_comment = Embedding(max_features, embed_size)(comment_seq)

    # conv layers
    convs = []
#     filter_sizes = [2, 3, 4, 5]
    filter_sizes = [2, 3]

    for fsz in filter_sizes:
        l_conv = Conv1D(filters=10, kernel_size=fsz, activation='relu')(emb_comment)
        # With Conv1D's default 'valid' padding the output has maxlen - fsz + 1
        # steps, so a pool of that size reduces each filter to a single value.
        l_pool = MaxPooling1D(maxlen - fsz + 1)(l_conv)
        l_pool = Flatten()(l_pool)
        convs.append(l_pool)
    merge = concatenate(convs, axis=1)

    out = Dropout(0.2)(merge)
    output = Dense(16, activation='relu')(out)

    output = Dense(units=1, activation='sigmoid')(output)

    model = Model([comment_seq], output)
    # summary() prints the table itself and returns None, so it is not wrapped
    # in print() (which added a stray "None" line to the output).
    model.summary()
    return model


def embedding(maxlen=50, max_features=4590, embed_size=16):
    """Build an (uncompiled) baseline classifier: Embedding -> Flatten -> Dense(1, sigmoid).

    Args:
        maxlen: Length of each padded input sequence.
        max_features: Input dimension of the embedding layer.
        embed_size: Dimension of each embedding vector.

    Returns:
        The keras ``Sequential`` model; the caller compiles it.
    """
    # define the model
    model = Sequential()
    model.add(Embedding(max_features, embed_size, input_length=maxlen))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    # summarize the model: summary() prints the table itself and returns None,
    # so it is not wrapped in print() (which added a stray "None" line).
    model.summary()
    return model

    
def model_fit(model,padded_docs,labels,train_index,test_index, epochs = 5):
    """Train ``model`` on one train/test split and report its test metrics.

    Args:
        model: Compiled model exposing ``fit(x, y, epochs=...)`` and
            ``evaluate(x, y)``.
        padded_docs: 2-D array-like of equally long encoded documents.
        labels: Sequence of 0/1 targets aligned with ``padded_docs``.
        train_index: Row indices used for training.
        test_index: Row indices used for evaluation (must not be empty).
        epochs: Number of training epochs.

    Returns:
        ``(model, eva)`` where ``eva`` is whatever ``model.evaluate`` returned;
        it is read as ``[loss, accuracy, precision, recall]``.
    """
    # float64 matches the dtype of the np.zeros buffers this function used to
    # fill row by row, so the model receives exactly the same values as before.
    docs = np.asarray(padded_docs, dtype=float)
    targets = np.asarray(labels, dtype=float)
    x_train, y_train = docs[train_index], targets[train_index]
    x_test, y_test = docs[test_index], targets[test_index]

    print("Test set from ",test_index[0], " to ",test_index[-1])

    model.fit(x_train, y_train,epochs=epochs)
    eva = model.evaluate(x_test,y_test)
    print('loss: ',eva[0])
    print('accuracy: ',eva[1])
    print('precision: ',eva[2])
    print('recall: ',eva[3])
    # A fold without any true/predicted positives has precision + recall == 0;
    # report an F1 of 0.0 instead of raising ZeroDivisionError.
    pr_sum = eva[3] + eva[2]
    f1 = (2*eva[3]*eva[2])/pr_sum if pr_sum != 0 else 0.0
    print('f1-score: ',f1)
    return model ,eva

def model_fit_no_test(model,x_train,y_train, epochs = 5):
    """Train ``model`` on the complete training set, without any evaluation.

    Args:
        model: Compiled model exposing ``fit(x, y, epochs=...)``.
        x_train: 2-D array-like of padded documents.
        y_train: Sequence of 0/1 targets (a plain list is accepted).
        epochs: Number of training epochs.

    Returns:
        The same ``model`` object after fitting.
    """
    # Callers pass the labels as a plain Python list; converting both inputs to
    # ndarrays avoids depending on whether the installed Keras accepts list targets.
    model.fit(np.asarray(x_train), np.asarray(y_train), epochs=epochs)
    return model

import numpy
def predict_test(model, testData):
    """Classify one encoded document as ``"yes"`` or ``"no"``.

    Args:
        model: Object exposing ``predict(batch)``; it is called with a batch
            of shape ``(1, len(testData))``.
        testData: 1-D array-like holding a single padded document.

    Returns:
        ``"yes"`` when the first predicted value is >= 0.5, otherwise ``"no"``.
    """
    sample = numpy.asarray(testData)
    batch = numpy.reshape(sample, (1, sample.shape[0]))
    pr = model.predict(batch)
    # One sample goes in, so only the first predicted value is relevant; turn
    # it into a plain float instead of relying on the truthiness of an array.
    score = float(numpy.ravel(pr)[0])
    return "yes" if score >= 0.5 else "no"
    
def predict_train(model, testData, truth):
    """Predict one sample, print a comparison with ``truth`` and score it.

    Args:
        model: Object exposing ``predict(batch)``.
        testData: 1-D array holding a single padded document.
        truth: Expected label, 1 or 0.

    Returns:
        1 when the prediction falls on the same side of 0.5 as ``truth``
        (>= 0.5 for 1, < 0.5 for 0), otherwise 0.
    """
    batch = numpy.reshape(testData, (1, testData.shape[0]))
    pr = model.predict(batch)
    print("==================")
    print("truth = ", truth)
    print("predict = ", pr)
    agrees = (pr >= 0.5 and truth == 1) or (pr < 0.5 and truth == 0)
    if not agrees:
        print("false")
        return 0
    print("true")
    return 1
    
def write_result(predict, outPath,inpath ="assets/test_set_0520.csv"):
    """Store predictions next to the rows of the input CSV.

    Reads ``inpath`` with NA filtering switched off, puts ``predict`` into an
    ``Expected`` column and writes the frame (index included) to ``outPath``.

    Args:
        predict: One prediction per row of the input file.
        outPath: Destination of the resulting CSV.
        inpath: CSV file the predictions belong to.
    """
    frame = pd.read_csv(inpath, na_filter=False)
    frame["Expected"] = predict
    frame.to_csv(outPath)

def getMedian(textList):
    """Return the median of the numbers in ``textList`` (via ``statistics.median``)."""
    middle = statistics.median(textList)
    return middle

In [None]:
def demostrateData():
    SPL = "<SPL>"
    path = "csv/trimedValues2.csv"
    train_file = path
    dataframe = pd.read_csv(path, na_filter = False)
    docs = []
    labels = []
    for i,data in dataframe.iterrows():
        d = data["comment"]
        c = data["code"]
        s = []
        s =  d+" "+SPL+" "+c # comment and code together
        docs.append(s)
        l = 1 if data["non-information"] == "yes" else 0
        labels.append(l)
    # integer encode the documents
    vocab_size = 2000
    encoded_docs = [one_hot(d, vocab_size) for d in docs]
    # pad documents to a max length of 41 words
    # The langest sentence contains 200 words but the Average is 21. If  padding all sentence to length of langest one, most of sentence
    # will be 0. That is not good for RNN, so we set padding length to 50
    summ = 0 
    li = []
    for t in encoded_docs:
        li.append(len(t))
        summ += len(t)
    avg = summ/len(encoded_docs)
    print("avg =", avg )
    m = getMedian(li)
    print("median = ",m)
    maxLen = len(max(docs, key=len).split())
    print("max = ",maxLen)
    li.sort()
    ## plot graph
    import matplotlib.pyplot as plt
    %matplotlib inline
    plt.style.use('ggplot')

    x = range(len(encoded_docs))

    plt.bar(x, li, color='green')

    plt.show()

In [None]:
# Print the document-length statistics and draw the bar chart of sorted lengths.
demostrateData()

Most of the training sentences are shorter than 50 tokens, so we use 50 as the padding length.

In [None]:
# define documents
## read train data
def read_train_Data(path="csv/trimedValues2.csv", vocab_size=2000, max_length=50):
    """Load the labelled training CSV and encode it as padded id sequences.

    Each row's ``comment`` and ``code`` are joined with a ``<SPL>`` marker,
    encoded with ``one_hot`` and padded to ``max_length``.

    Args:
        path: CSV file with ``comment``, ``code`` and ``non-information`` columns.
        vocab_size: Size of the id range used by ``one_hot``.
        max_length: Length every encoded document is padded to.

    Returns:
        ``(padded_docs, labels)``: the array produced by ``pad_sequences`` and a
        list with 1 for rows whose ``non-information`` is ``"yes"``, else 0.
    """
    SPL = "<SPL>"
    dataframe = pd.read_csv(path, na_filter = False)
    docs = []
    labels = []
    rows = zip(dataframe["comment"], dataframe["code"], dataframe["non-information"])
    for comment, code, flag in rows:
        docs.append(comment + " " + SPL + " " + code) # comment and code together
        labels.append(1 if flag == "yes" else 0)
    # integer encode the documents
    encoded_docs = [one_hot(d, vocab_size) for d in docs]
    # Pad documents to max_length tokens (50 by default).
    # The longest sentence contains 200 words but the average is 21. If all
    # sentences were padded to the length of the longest one, most of each
    # sequence would be 0. That is not good for the RNN, so the padding length is 50.
    # Zeros are appended at the end (padding='post'); the truncation side for
    # longer documents is left at pad_sequences' default.
    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    return padded_docs, labels

In [None]:
# define documents
## read test data
def read_test_Data(tes_path="csv/trimedValues3.csv", vocab_size=2000, max_length=50):
    """Load the unlabelled test CSV and encode it as padded id sequences.

    Each row's ``comment`` and ``code`` are joined with a ``<SPL>`` marker,
    encoded with ``one_hot`` and padded to ``max_length``.

    Args:
        tes_path: CSV file with ``comment`` and ``code`` columns.
        vocab_size: Size of the id range used by ``one_hot``.
        max_length: Length every encoded document is padded to.

    Returns:
        The array produced by ``pad_sequences``, one row per CSV row.
    """
    SPL = "<SPL>"
    dataframe = pd.read_csv(tes_path, na_filter = False)
    # comment and code together, separated by the marker
    test_docs = [
        comment + " " + SPL + " " + code
        for comment, code in zip(dataframe["comment"], dataframe["code"])
    ]
    # integer encode the documents
    # NOTE(review): one_hot presumably relies on Python's built-in hash by
    # default, which may differ between processes - encode train and test data
    # in the same kernel session; confirm against the Keras documentation.
    test_encoded_docs = [one_hot(d, vocab_size) for d in test_docs]
    # Pad documents to max_length tokens (50 by default), appending zeros at
    # the end, so the test data has the same layout as the training data.
    test_padded_docs = pad_sequences(test_encoded_docs, maxlen=max_length, padding='post')
    return test_padded_docs

In [None]:
# Train the CNN on the complete training set (no hold-out split).
# The fitted model stays in `model_CNN` for the CNN test cell.

padded_docs,labels = read_train_Data()
# 32 is the embedding dimension.
model_CNN = CNN(max_length,vocab_size,32)
model_CNN.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy',keras.metrics.Precision(),keras.metrics.Recall()])
# No epochs argument: model_fit_no_test falls back to its default of 5.
model_CNN = model_fit_no_test(model_CNN,padded_docs,labels)


In [None]:
## CNN test: label every test document with the trained CNN and save the result.
## Requires `model_CNN` from the CNN training cell.
test_padded_docs = read_test_Data()
outPath = "result/CNN-Result.txt"
# `i` collects one "yes"/"no" answer per test row, in file order.
i = []
for tr in (test_padded_docs):
    i.append(predict_test(model_CNN,tr))
print("CNN Predicted non-informative ",i)
# NOTE(review): write_result attaches the answers to its default input file
# (assets/test_set_0520.csv) while the documents were read by read_test_Data;
# presumably both files hold the same rows in the same order - confirm.
write_result(i, outPath)

In [None]:
# Train the RNN on the complete training set (no hold-out split).
# The fitted model stays in the global `model` for the RNN test cell.
padded_docs,labels = read_train_Data()
# 32 is the embedding dimension.
model = RNN(max_length,vocab_size,32)
# Explicit Adam optimizer with a learning rate of 0.0002.
learning_rate = 0.0002
adam = Adam(
    learning_rate=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False
    )
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy',keras.metrics.Precision(),keras.metrics.Recall()])
# 20 epochs instead of the helper's default of 5.
model = model_fit_no_test(model,padded_docs,labels, 20)

In [None]:
## RNN test: label every test document with the trained RNN and save the result.
## Uses the global `model`; the "train with split" cells rebind that name too,
## so run this cell right after the RNN training cell.
test_padded_docs = read_test_Data()

# `i` collects one "yes"/"no" answer per test row, in file order.
i = []
for tr in (test_padded_docs):
    i.append(predict_test(model,tr))
print("RNN Predicted non-informative ",i)
outPath = "result/RNN-Result.txt"
# NOTE(review): write_result attaches the answers to its default input file
# (assets/test_set_0520.csv); presumably it holds the same rows in the same
# order as the file read by read_test_Data - confirm.
write_result(i, outPath,)

In [None]:
# Train the embedding-only baseline on the complete training set (no hold-out split).
# The fitted model stays in `model_em` for the embedding test cell.
padded_docs,labels = read_train_Data()
# 32 is the embedding dimension (overrides the builder's default of 16).
model_em = embedding(max_length,vocab_size,32)
model_em.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy',keras.metrics.Precision(),keras.metrics.Recall()])
# No epochs argument: model_fit_no_test falls back to its default of 5.
model_em = model_fit_no_test(model_em,padded_docs,labels)


In [None]:
## Embedding test: label every test document with the trained baseline and save the result.
## Requires `model_em` from the embedding training cell.
test_padded_docs = read_test_Data()

# `i` collects one "yes"/"no" answer per test row, in file order.
i = []
for tr in (test_padded_docs):
    i.append(predict_test(model_em,tr))
print("Embedding Predicted non-informative ",i)
outPath = "result/Embedding-Result.txt"
# NOTE(review): write_result attaches the answers to its default input file
# (assets/test_set_0520.csv); presumably it holds the same rows in the same
# order as the file read by read_test_Data - confirm.
write_result(i, outPath)

In [None]:
# CNN: 5-fold cross-validation on the training data.
# A fresh model is built and trained for every fold; the metrics returned by
# model_fit ([loss, accuracy, precision, recall]) are summed and then averaged.
max_length = 50
vocab_size = 2000
padded_docs, labels = read_train_Data()
i = 0
n_split=5
loss = 0
accuracy = 0
precision = 0
recall = 0
f1_score = 0
for train_index,test_index in KFold(n_split).split(padded_docs):
    # define the model
    model = CNN(max_length,vocab_size,32)
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy',keras.metrics.Precision(),keras.metrics.Recall()])

    model,eva = model_fit(model,padded_docs,labels,train_index,test_index)
    loss += eva[0]
    accuracy += eva[1]
    precision += eva[2]
    recall += eva[3]
    # Guard on THIS fold's precision and recall. The running totals are
    # non-zero as soon as one earlier fold scored, so testing them would not
    # prevent a division by zero here. A fold with 0/0 contributes an F1 of 0.
    if eva[2] + eva[3] != 0:
        f1_score += (2*eva[3]*eva[2])/(eva[3]+eva[2])

# The printed values are the means over the folds.
print('total loss: ',loss/n_split)
print('total accuracy: ',accuracy/n_split)
print('total precision: ',precision/n_split)
print('total recall: ',recall/n_split)
print('total f1-score: ',f1_score/n_split)


In [None]:
# Embedding baseline: 5-fold cross-validation on the training data.
# A fresh model is built and trained for every fold; the metrics returned by
# model_fit ([loss, accuracy, precision, recall]) are summed and then averaged.
max_length = 50
vocab_size = 2000
padded_docs, labels = read_train_Data()
i = 0
n_split=5
loss = 0
accuracy = 0
precision = 0
recall = 0
f1_score = 0
for train_index,test_index in KFold(n_split).split(padded_docs):
    # define the model
    model = embedding(max_length,vocab_size,32)
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy',keras.metrics.Precision(),keras.metrics.Recall()])

    model,eva = model_fit(model,padded_docs,labels,train_index,test_index)
    loss += eva[0]
    accuracy += eva[1]
    precision += eva[2]
    recall += eva[3]
    # A fold whose precision and recall are both 0 would divide by zero;
    # it contributes an F1 of 0 instead.
    if eva[2] + eva[3] != 0:
        f1_score += (2*eva[3]*eva[2])/(eva[3]+eva[2])

# The printed values are the means over the folds.
print('total loss: ',loss/n_split)
print('total accuracy: ',accuracy/n_split)
print('total precision: ',precision/n_split)
print('total recall: ',recall/n_split)
print('total f1-score: ',f1_score/n_split)


In [None]:
# RNN: 5-fold cross-validation on the training data.
# A fresh model is built and trained for every fold; the metrics returned by
# model_fit ([loss, accuracy, precision, recall]) are summed and then averaged.
max_length = 50
vocab_size = 2000

padded_docs, labels = read_train_Data()
# attention will focus on specific words for example comment: auto generated method stub
i = 0
n_split=5
loss = 0
accuracy = 0
precision = 0
recall = 0
f1_score = 0
for train_index,test_index in KFold(n_split).split(padded_docs):
    # define the model
    model = RNN(max_length,vocab_size,32)
    learning_rate = 0.0002
    adam = Adam(
        learning_rate=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False
        )
    # NOTE(review): this callback is created but never handed to the training
    # call below, so no early stopping takes place; it also monitors
    # 'val_loss', which needs validation data during fit.
    early_stopping_callback = keras.callbacks.EarlyStopping(monitor='val_loss',
                                                    min_delta=0,
                                                    patience=5,
                                                    verbose=0, mode='auto')
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy',keras.metrics.Precision(),keras.metrics.Recall()])

    model,eva = model_fit(model,padded_docs,labels,train_index,test_index, epochs=20)
    loss += eva[0]
    accuracy += eva[1]
    precision += eva[2]
    recall += eva[3]
    # A fold whose precision and recall are both 0 would divide by zero;
    # it contributes an F1 of 0 instead.
    if eva[2] + eva[3] != 0:
        f1_score += (2*eva[3]*eva[2])/(eva[3]+eva[2])

# The printed values are the means over the folds.
print('total loss: ',loss/n_split)
print('total accuracy: ',accuracy/n_split)
print('total precision: ',precision/n_split)
print('total recall: ',recall/n_split)
print('total f1-score: ',f1_score/n_split)