In [None]:
!pip install datasets
from datasets import load_dataset
import re

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Bidirectional, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [None]:
# Fetch the 'subtask-1' configuration of the 'humicroedit' dataset; the
# result is later indexed by split name ('train', 'validation', 'test').
datasets = load_dataset(path='humicroedit', name='subtask-1')

In [None]:
def preprocessing(key, threshold=0.94):
  """Build edited sentences and labels for one split of ``datasets``.

  Every row's ``original`` text has its ``<.../>`` placeholder replaced by
  the row's ``edit`` text.

  Args:
    key: Split name used to index the module-level ``datasets`` object.
    threshold: ``meanGrade`` values strictly greater than this are labelled
      1; all others are labelled 0.

  Returns:
    A tuple ``(x, y, y_reg)``: the edited sentences, the binary labels and
    the raw ``meanGrade`` values of the split.
  """
  data = datasets[key]
  # Read each column once up front; indexing data[column][i] inside a loop
  # may re-materialise the whole column on every iteration.
  originals = data['original']
  edits = data['edit']
  y_reg = data['meanGrade']
  y = [int(s > threshold) for s in y_reg]
  # The non-greedy pattern keeps a match confined to a single tag, and the
  # callable replacement inserts the edit text literally (a plain string
  # replacement would have backslashes / group references interpreted).
  x = [
      re.sub("<.+?/>", lambda _match, word=edit: word, original)
      for original, edit in zip(originals, edits)
  ]
  return x, y, y_reg

In [None]:
# Apply the same preprocessing to every split; each call yields the edited
# sentences, the binary labels and the raw grades, in that order.
(
    (train_x_ori, train_y, train_y_reg),
    (val_x_ori, val_y, val_y_reg),
    (test_x_ori, test_y, test_y_reg),
) = [preprocessing(split) for split in ('train', 'validation', 'test')]

In [None]:
# The vocabulary is fitted on the training sentences only; the validation
# and test splits are encoded with that same tokenizer.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_x_ori)


def _to_padded(texts):
    """Return (index sequences, those sequences padded/truncated to 256)."""
    sequences = tokenizer.texts_to_sequences(texts)
    return sequences, pad_sequences(sequences, maxlen = 256)


# Training split
train_sequence, train_padded_sequence = _to_padded(train_x_ori)
train_x = np.array(train_padded_sequence)
train_y = np.array(train_y)

# Test split
test_sequence, test_padded_sequence = _to_padded(test_x_ori)
test_x = np.array(test_padded_sequence)
test_y = np.array(test_y)

# Validation split
val_sequence, val_padded_sequence = _to_padded(val_x_ori)
val_x = np.array(val_padded_sequence)
val_y = np.array(val_y)

In [None]:
# Embedding -> GRU (final state only) -> dropout -> single sigmoid unit.
model = Sequential([
    Embedding(10000, 128, input_length=256),
    GRU(128, return_sequences = False),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

In [None]:
from keras import backend as K
def recall_m(y_true, y_pred):
    """Recall of the rounded predictions over the tensors passed in.

    Counts entries where the rounded, clipped product ``y_true * y_pred`` is
    1 and divides by the count of rounded, clipped ``y_true`` entries.
    ``K.epsilon()`` in the denominator guards against division by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual_positives + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision of the rounded predictions over the tensors passed in.

    Counts entries where the rounded, clipped product ``y_true * y_pred`` is
    1 and divides by the count of rounded, clipped ``y_pred`` entries.
    ``K.epsilon()`` in the denominator guards against division by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (flagged_positives + K.epsilon())

def f1_m(y_true, y_pred):
    """Combine ``precision_m`` and ``recall_m`` into an F1 score.

    Computes ``2 * (p * r) / (p + r + K.epsilon())`` where ``p`` and ``r``
    are the precision and recall of the tensors passed in.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    numerator = p * r
    denominator = p + r + K.epsilon()
    return 2 * (numerator / denominator)

In [None]:
# Adam on binary cross-entropy; the listed metrics are tracked alongside it.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.RootMeanSquaredError(),
        tf.keras.metrics.Recall(),
        tf.keras.metrics.Precision(),
        f1_m,
    ],
)

In [None]:
# Train for 20 epochs, passing the validation split as validation_data.
history = model.fit(
    x=train_x,
    y=train_y,
    validation_data=(val_x, val_y),
    epochs=20,
)

In [None]:
# Plot per-epoch RMSE (left axis) and accuracy (right axis) for the training
# and validation data recorded in `history`.
import matplotlib.pyplot as plt

# NOTE: these two series hold the root-mean-squared-error metric, not the
# binary cross-entropy the model is optimised on. The variable names are kept
# in case other cells use them; the plot labels say what is actually shown.
train_loss = history.history['root_mean_squared_error']
val_loss = history.history['val_root_mean_squared_error']
train_acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
# Take the epoch count from the recorded history so the x-axis always matches
# the data, whatever `epochs` value was given to fit().
EPOCH_NUM = len(train_loss)
epochs = range(1, EPOCH_NUM+1, 1)
max_loss = max(max(train_loss), max(val_loss))

# plot RMSE
fig, ax1 = plt.subplots()
ax1.set_xlabel("Epoch")
ax1.set_ylabel("RMSE")
ax1.set_ylim([0, max_loss+0.5])
plt1 = ax1.plot(epochs, train_loss, 'yo-', label='Training RMSE')
plt2 = ax1.plot(epochs, val_loss, 'ro-', label='Validation RMSE')

# plot accuracy on a second y-axis sharing the same x-axis
ax2 = ax1.twinx()
ax2.set_ylabel("Accuracy")
ax2.set_ylim([0,1])
plt3 = ax2.plot(epochs, train_acc, 'bo-', label='Training Accuracy')
plt4 = ax2.plot(epochs, val_acc, 'co-', label='Validation Accuracy')

# One combined legend for the lines drawn on both axes.
plts = plt1 + plt2 + plt3 + plt4
labs = [p.get_label() for p in plts]
ax2.legend(plts, labs, loc=0)
fig_name = "Training and Validation Metrics"
ax1.set_title(fig_name)

plt.show()


In [None]:
# Score the trained model on the held-out test split; the returned values
# are displayed as the cell output.
model.evaluate(x=test_x, y=test_y)