In [86]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from keras_preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, GRU
from keras import optimizers
from keras.wrappers.scikit_learn import KerasClassifier
from keras import callbacks

In [87]:
file_path_1 = 'data/domain1_train.json'
df1 = pd.read_json(file_path_1, lines=True)


file_path_2 = 'data/domain2_train.json'
df2 = pd.read_json(file_path_2, lines=True).drop('model', axis=1)


df_comb = pd.concat([df1, df2],axis=0,ignore_index=True)

df_comb

Unnamed: 0,text,label
0,"[70, 746, 825, 109, 2083, 0, 2, 0, 0, 0, 9, 0,...",1
1,"[1209, 179, 1952, 4, 4959, 7, 0, 2, 978, 1522,...",1
2,"[287, 3, 3330, 0, 23, 12, 13, 465, 74, 8, 0, 8...",1
3,"[0, 0, 3, 592, 19, 2, 706, 1439, 2575, 7, 2, 0...",1
4,"[9, 2, 110, 12, 42, 32, 44, 361, 9, 3860, 2358...",1
...,...,...
34395,"[175, 1317, 38, 754, 9, 5, 0, 228, 1, 45, 6, 2...",0
34396,"[466, 5, 70, 1242, 6, 3888, 1, 34, 43, 5, 70, ...",0
34397,"[10, 0, 21, 1650, 18, 5, 1335, 1, 208, 5, 997,...",0
34398,"[18, 39, 316, 133, 365, 2019, 1, 27, 10, 5, 61...",0


In [88]:
X = np.array(df_comb['text'])
y = np.array(df_comb['label'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train[27519]) 
print(y_train[27519])

[2, 48, 124, 23, 63, 637, 1, 682, 62]
0


In [89]:
len_lis = [len(x) for x in X]
print(f"The average length of each sentence is {np.mean(len_lis)}")
print(f"The max length of the sentences is {np.max(len_lis)}")
print(f"The min length of the sentence is {np.min(len_lis)}")
print(f"The median is {np.median(len_lis)}")
print(f"The 75th percentile is {np.percentile(len_lis, 75)}")
print(f"The 25th percentile is {np.percentile(len_lis, 25)}")


The average length of each sentence is 90.3398546511628
The max length of the sentences is 1075
The min length of the sentence is 0
The median is 44.0
The 75th percentile is 114.0
The 25th percentile is 26.0


In [90]:
# Data parameters
num_classes = 2

# Hyperparameters
maxlen = 80  #Start with the median, max at 75th percentile
batch_size = 32 # base on experiment 32 is reasonable starting point (the best)
n_epochs = 20
# recommened learning rate is 0.001 to 0.005. usually 0.002 and 0.003 is the best
learning_rate = 0.002
# start small and increase gradually 
hidden_layers = 32
early_stop_patience = 2

In [91]:
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

print(X_train[3]) 

[   0 1225    7    0    0 1813 3042   38   24 3887   90    5    0    6
 3059  953    3    0    2    0    9 1080    7 3708    9   73   34  405
    0 2129    1    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0]


In [92]:
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)

print(y_train.shape)
print(y_train[0])

(27520, 2)
[1. 0.]


In [93]:
#(samples, timesteps, features)
#(batch_size, timesteps, input_dim)
X_train = np.array(X_train).reshape((X_train.shape[0], X_train.shape[1], 1))
print(X_train.shape)  #(750, 100, 1)

X_test = np.array(X_test).reshape((X_test.shape[0], X_test.shape[1], 1))
print(X_test.shape)

(27520, 80, 1)
(6880, 80, 1)


In [94]:
def vanilla_rnn():
    model = Sequential()
    model.add(GRU(hidden_layers, input_shape = (maxlen,1), return_sequences = False))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    model.summary()
    
    adam = optimizers.Adam(learning_rate = learning_rate)
    model.compile(loss = 'categorical_crossentropy', optimizer = adam, metrics = ['accuracy'])
    
    return model

In [95]:
earlystopping = callbacks.EarlyStopping(monitor="accuracy",
                                        mode="max", patience=early_stop_patience,
                                        restore_best_weights=True)
model = KerasClassifier(build_fn = vanilla_rnn, epochs = n_epochs, batch_size = batch_size)
model.fit(X_train, y_train, callbacks=[earlystopping])

  model = KerasClassifier(build_fn = vanilla_rnn, epochs = n_epochs, batch_size = batch_size)


Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_8 (GRU)                 (None, 32)                3360      
                                                                 
 dense_8 (Dense)             (None, 2)                 66        
                                                                 
 activation_8 (Activation)   (None, 2)                 0         
                                                                 
Total params: 3,426
Trainable params: 3,426
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20


<keras.callbacks.History at 0x17f92554040>

In [100]:
y_pred = model.predict(X_test)
y_test_ = np.argmax(y_test, axis = 1)

print(accuracy_score(y_pred, y_test_))

0.8738372093023256


# Kaggle Dataset

In [97]:
file_path_test = 'data/test_set.json'
df_test = pd.read_json(file_path_test, lines=True)
X_Kaggle = np.array(df_test['text'])
X_Kaggle = pad_sequences(X_Kaggle, padding='post', maxlen=maxlen)
X_Kaggle = np.array(X_Kaggle).reshape((X_Kaggle.shape[0], X_Kaggle.shape[1], 1))
y_Kaggle = model.predict(X_Kaggle)



In [98]:
# # Add predictions to the test DataFrame
df_test['class'] = y_Kaggle

# # Select only the columns you want to include in the CSV
selected_columns = ['id', 'class']  # Include other columns as needed

# # Save the selected columns to a CSV file
df_test[selected_columns].to_csv('prediction/GRU_RNN.csv', index=False)

In [99]:
file_path_test = 'data/test_set.json'
df_test = pd.read_json(file_path_test, lines=True)
X_Kaggle = np.array(df_test['text'])
len_lis = [len(x) for x in X_Kaggle]
print(f"The average length of each sentence is {np.mean(len_lis)}")
print(f"The max length of the sentences is {np.max(len_lis)}")
print(f"The min length of the sentence is {np.min(len_lis)}")
print(f"The median is {np.median(len_lis)}")
print(f"The 75th percentile is {np.percentile(len_lis, 75)}")
print(f"The 25th percentile is {np.percentile(len_lis, 25)}")

The average length of each sentence is 110.615
The max length of the sentences is 933
The min length of the sentence is 4
The median is 48.0
The 75th percentile is 148.0
The 25th percentile is 28.0
