In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from keras import callbacks
from keras import optimizers
from keras.layers import Dense, Dropout, Activation, GRU
from keras.models import Sequential
from keras.utils import to_categorical
from keras.wrappers.scikit_learn import KerasClassifier
from keras_preprocessing.sequence import pad_sequences

In [None]:
# Locations of the two training sets (JSON Lines: one record per line).
file_path_1 = 'data/domain1_train.json'
file_path_2 = 'data/domain2_train.json'

df1 = pd.read_json(file_path_1, lines=True)
# The 'model' column is removed from the domain-2 frame before stacking
# (presumably because domain 1 has no such column -- confirm against the data).
df2 = pd.read_json(file_path_2, lines=True).drop(columns='model')

# Stack both domains row-wise under a fresh 0..n-1 index.
df_comb = pd.concat([df1, df2], axis=0, ignore_index=True)

# Features and labels as plain NumPy arrays.
X, y = (np.array(df_comb[column]) for column in ('text', 'label'))

In [None]:


# Hold out 20% of the combined data for evaluation. The split is stratified on
# the labels so both partitions keep the same class proportions, and the seed is
# fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# --- Data ---
# Number of target classes (width of the one-hot labels and of the output layer).
num_classes = 2

# --- Sequence length ---
# Every sequence is padded to this many timesteps.
# Author's guidance: start at the median length, go no higher than the 75th percentile.
maxlen = 120

# --- Network size ---
# Passed to the GRU layer as its number of units. Start small and increase gradually.
hidden_layers = 32

# --- Optimisation ---
# Recommended learning-rate range is 0.001 to 0.005; 0.002 or 0.003 usually works best.
learning_rate = 0.002
# Based on the author's experiments, 32 is a reasonable starting point (the best so far).
batch_size = 32
n_epochs = 20
# Epochs without improvement that early stopping tolerates.
early_stop_patience = 2

In [None]:
# Bring every sequence to `maxlen` entries, padding at the end ('post').
# Both splits share the same options so they end up with the same width.
_pad_options = {'padding': 'post', 'maxlen': maxlen}
X_train = pad_sequences(X_train, **_pad_options)
X_test = pad_sequences(X_test, **_pad_options)

# Spot-check one padded training sequence.
print(X_train[3])

In [None]:
# One-hot encode the labels of both splits with `num_classes` columns.
# NOTE(review): this rebinds y_train / y_test, so the cell is meant to be run
# once per fresh split.
y_train, y_test = (
    to_categorical(labels, num_classes) for labels in (y_train, y_test)
)

# Sanity check: overall shape and the first encoded label.
print(y_train.shape)
print(y_train[0])

In [None]:
# Recurrent layers take 3-D input laid out as (samples, timesteps, features),
# i.e. (batch_size, timesteps, input_dim). Here each timestep has one feature.
def _with_feature_axis(batch):
    """Return a copy of `batch` reshaped to (rows, steps, 1)."""
    rows, steps = batch.shape[0], batch.shape[1]
    return np.array(batch).reshape((rows, steps, 1))

X_train = _with_feature_axis(X_train)
print(X_train.shape)

X_test = _with_feature_axis(X_test)
print(X_test.shape)

In [None]:
def vanilla_rnn():
    """Build, summarise and compile the recurrent classifier.

    Despite the name, the recurrent layer is a GRU. The architecture is
    GRU -> Dense(num_classes) -> softmax, fed with sequences of shape
    (maxlen, 1). Sizes and the learning rate come from the notebook-level
    settings `hidden_layers`, `maxlen`, `num_classes` and `learning_rate`.

    Returns:
        The compiled Sequential model (categorical cross-entropy loss,
        Adam optimizer, accuracy metric). The model summary is printed as a
        side effect.
    """
    layer_stack = [
        # Only the final GRU state is passed on (return_sequences=False).
        GRU(hidden_layers, input_shape=(maxlen, 1), return_sequences=False),
        Dense(num_classes),
        Activation('softmax'),
    ]
    network = Sequential(layer_stack)
    network.summary()

    network.compile(
        loss='categorical_crossentropy',
        optimizer=optimizers.Adam(learning_rate=learning_rate),
        metrics=['accuracy'],
    )
    return network

In [None]:
# Fraction of the training data that Keras holds out for validation during fit.
validation_fraction = 0.1

# Early stopping watches the loss on the held-out validation data rather than
# the training accuracy. Training accuracy tends to keep improving as the model
# overfits, so monitoring it would select the most overfit weights.
earlystopping = callbacks.EarlyStopping(monitor="val_loss",
                                        mode="min", patience=early_stop_patience,
                                        restore_best_weights=True)

# scikit-learn style wrapper; vanilla_rnn is called to build the network at fit time.
model = KerasClassifier(build_fn = vanilla_rnn, epochs = n_epochs, batch_size = batch_size)

# Extra keyword arguments are forwarded to the underlying Keras fit call.
model.fit(X_train, y_train, callbacks=[earlystopping],
          validation_split=validation_fraction)

In [None]:
# Predictions for the held-out split.
y_pred = model.predict(X_test)
# y_test is one-hot encoded; argmax recovers the integer class labels.
y_test_ = np.argmax(y_test, axis = 1)

# accuracy_score expects (y_true, y_pred) in that order. Accuracy itself is
# symmetric, but the documented order keeps this cell correct if the metric is
# later replaced by an asymmetric one (precision, recall, ...).
print(accuracy_score(y_test_, y_pred))

# Kaggle Dataset

In [None]:
# Load the unlabeled test set (JSON Lines).
file_path_test = 'data/test_set.json'
df_test = pd.read_json(file_path_test, lines=True)

# Apply the same preprocessing as for the training data:
# pad to `maxlen` at the end, then add a trailing feature axis of size 1.
kaggle_padded = pad_sequences(np.array(df_test['text']), padding='post', maxlen=maxlen)
kaggle_rows, kaggle_steps = kaggle_padded.shape[0], kaggle_padded.shape[1]
X_Kaggle = np.array(kaggle_padded).reshape((kaggle_rows, kaggle_steps, 1))

# Predictions from the fitted classifier.
y_Kaggle = model.predict(X_Kaggle)

In [None]:
# Add predictions to the test DataFrame.
df_test['class'] = y_Kaggle

# Only these columns are written to the CSV; include other columns as needed.
selected_columns = ['id', 'class']

# Create the output directory first: to_csv does not create missing parent
# directories, and failing here would throw away a finished training run.
output_path = Path('prediction/GRU_RNN.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

# Save the selected columns to a CSV file without the index column.
df_test[selected_columns].to_csv(output_path, index=False)