In [1]:
import tensorflow as tf
import numpy as np
from tensorflow import keras
from keras import layers
from keras import regularizers


class TextCNN():
    """
    Multi-branch 1D convolutional text classifier built with the Keras functional API.

                    -> Conv1D (kernel size 3, num_filters filters), global max-pooling ->
    Embedding layer -> Conv1D (kernel size 4, num_filters filters), global max-pooling -> concatenate -> 1 dense layer -> dropout -> softmax layer.
                    -> Conv1D (kernel size 5, num_filters filters), global max-pooling ->

    The numbers 3/4/5 are convolution kernel sizes (how many consecutive token
    embeddings each filter sees), not strides.

    Parameters
    ----------
    sequence_length : int
        Number of token ids in every input row (the Input layer's shape).
    num_classes : int
        Width of the softmax output; also passed to the AUC metric as num_labels.
    vocab_size : int
        Passed to the Embedding layer as its input dimension. Keras requires
        this to be strictly greater than the largest token id fed to the model.
    embedding_size : int
        Dimensionality of each token embedding.
    dropout_prob : float
        Rate of the Dropout layer that follows the hidden dense layer.
    num_filters : int
        Number of filters in each convolutional branch.
    kernel_sizes : iterable of int, optional
        One convolutional branch is built per entry, in order. Defaults to
        (3, 4, 5), the original architecture.
    dense_units : int, optional
        Units in the hidden dense layer. Defaults to 64.

    Attributes
    ----------
    model : keras.Model
        The compiled model, named "CNN".

    Raises
    ------
    ValueError
        If kernel_sizes is empty.
    """
    def __init__(
            self, sequence_length, num_classes, vocab_size,
            embedding_size, dropout_prob, num_filters,
            kernel_sizes=(3, 4, 5), dense_units=64):
        kernel_sizes = tuple(kernel_sizes)
        if not kernel_sizes:
            raise ValueError("kernel_sizes must contain at least one kernel size")

        inputs = keras.Input(shape=(sequence_length,), dtype="int64")
        embedded = layers.Embedding(
            vocab_size,
            embedding_size,
            activity_regularizer=regularizers.L1(1e-4),
        )(inputs)

        # One Conv1D + global-max-pool branch per kernel size. The layers are
        # created in the same order as the original hand-written branches, so
        # Keras assigns them the same auto-generated names.
        pooled_branches = []
        for kernel_size in kernel_sizes:
            conv = layers.Conv1D(num_filters, kernel_size, activation='relu')(embedded)
            pooled_branches.append(layers.GlobalMaxPooling1D()(conv))

        # A single branch needs no merge step.
        if len(pooled_branches) == 1:
            combined = pooled_branches[0]
        else:
            combined = layers.concatenate(pooled_branches)

        hidden = layers.Dense(
            units=dense_units,
            activation='relu',
            bias_regularizer=regularizers.L2(1e-4),
            activity_regularizer=regularizers.L2(1e-5),
        )(combined)
        hidden = layers.Dropout(dropout_prob)(hidden)
        outputs = layers.Dense(num_classes, activation='softmax')(hidden)

        self.model = keras.Model(inputs, outputs, name="CNN")
        # categorical_crossentropy expects one-hot encoded targets.
        # F1Score(average=None) reports one score per class instead of a scalar.
        self.model.compile(
            optimizer='adam',
            loss='categorical_crossentropy',
            metrics=[
                keras.metrics.CategoricalAccuracy(),
                keras.metrics.F1Score(average=None, threshold=None, name="f1_score", dtype=None),
                keras.metrics.AUC(multi_label=True, num_labels=num_classes),
            ],
        )

    def getModel(self):
        """Return the compiled keras.Model built by the constructor."""
        return self.model


In [2]:
import tensorflow as tf
import numpy as np
from tensorflow.keras import utils
from sklearn.utils import shuffle
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Configuration
# ==================================================

# Input / output geometry
max_seq = 500        # token ids per document; used as pad_sequences maxlen and the model's sequence length
num_classes = 4      # width of the softmax output

# Network hyperparameters
embedding_dim = 128
num_filters = 128
dropout_prob = 0.2   # rate handed to the Dropout layer that TextCNN builds
l2_reg_lambda = 0.0  # not read by any cell shown in this notebook

# Training schedule
epochs = 2

# Device-placement flags; not read by any cell shown in this notebook
allow_soft_placement = True
log_device_placement = False

# Tokenizer shared by every preprocess() call: fitted once on the training
# texts, then reused unchanged for evaluation data.
tokenizer = Tokenizer(oov_token="<OOV>")

def preprocess(file, training=False):
    """Turn a labelled CSV into padded token-id sequences and one-hot labels.

    Parameters
    ----------
    file : str or file-like
        CSV with a "Label" column and, when training is False, a "Sentence"
        column. Labels are shifted down by one before one-hot encoding, so
        they are assumed to be integers in 1..num_classes (TODO confirm
        against the data files).
    training : bool
        True: the texts are read from the pre-cleaned
        "../strip_punct_lower/strip_punct_lower.csv" (column "1"), the
        module-level tokenizer is fitted on them, and the vocabulary size is
        printed. Labels still come from `file`, so both files must list the
        same rows in the same order.
        False: the "Sentence" column of `file` is lower-cased and encoded
        with the already-fitted tokenizer.

    Returns
    -------
    tuple
        (padded, labels_one_hot, word_index). word_index is the tokenizer's
        vocabulary mapping when training is True, otherwise None.

    Raises
    ------
    ValueError
        In training mode, if the pre-cleaned text file and `file` have a
        different number of rows.
    RuntimeError
        In evaluation mode, if the tokenizer has not been fitted yet.
    """
    # Data Preparation
    # ==================================================
    df = pd.read_csv(file, index_col=False)

    if training:
        df_processed = pd.read_csv("../strip_punct_lower/strip_punct_lower.csv", index_col=False)
        texts = df_processed["1"]
        # Texts and labels come from two different files; a row-count mismatch
        # would otherwise only surface later as a shape error inside model.fit.
        if len(texts) != len(df):
            raise ValueError(
                "Pre-cleaned text file has {:d} rows but {!r} has {:d}; "
                "they must be row-aligned.".format(len(texts), file, len(df))
            )
        tokenizer.fit_on_texts(texts)
        word_index = tokenizer.word_index
    else:
        # An unfitted tokenizer has an empty vocabulary and would silently
        # encode every sentence as an empty sequence.
        if not tokenizer.word_index:
            raise RuntimeError(
                "Tokenizer is not fitted; call preprocess(..., training=True) first."
            )
        texts = df["Sentence"].str.lower()
        word_index = None

    sequences = tokenizer.texts_to_sequences(texts)

    # Cap every document at max_seq tokens: longer ones are rare and a larger
    # cap would leave the other rows mostly padding. Padding is appended at
    # the end; pad_sequences truncates from the front by default, so over-long
    # documents keep their last max_seq tokens.
    padded = pad_sequences(sequences, padding='post', maxlen=max_seq)
    # Use the shared num_classes constant so the label width always matches
    # the model's softmax layer.
    labels_one_hot = utils.to_categorical(df["Label"] - 1, num_classes=num_classes)

    if training:
        print("Vocabulary Size: {:d}".format(len(word_index)))
    return padded, labels_one_hot, word_index


def train(x_train, y_train, word_index, epochs):
    """Build a TextCNN, fit it on the given data, save it and return it.

    Parameters
    ----------
    x_train : array-like
        Padded token-id sequences, one row per document.
    y_train : array-like
        One-hot labels aligned with x_train.
    word_index : dict
        Tokenizer vocabulary (word -> id); only its size is used here.
    epochs : int
        Number of training epochs.

    Returns
    -------
    keras.Model
        The fitted model. It is also written to "CNN_model.keras" in the
        current working directory, and its summary is printed.
    """
    # Training
    # ==================================================

    # The tokenizer hands out ids 1..len(word_index) and 0 is the padding
    # value, so the embedding table needs len(word_index) + 1 rows. Without
    # the +1 the highest id falls outside the table.
    vocab_size = len(word_index) + 1
    model = TextCNN(max_seq, num_classes, vocab_size, embedding_dim, dropout_prob, num_filters).getModel()
    # summary() prints by itself and returns None; wrapping it in print()
    # only added a stray "None" line to the output.
    model.summary()
    # utils.plot_model(model, "my_first_model_with_shape_info.png", show_shapes=True)

    # validation_split takes the held-out rows from the end of the arrays, so
    # shuffle first; otherwise the validation set is whatever the source file
    # happens to end with.
    x_train, y_train = shuffle(x_train, y_train)
    model.fit(x_train, y_train, validation_split=0.2, epochs=epochs)
    model.save("CNN_model.keras")
    return model
    
def test(file, model):
    """Evaluate `model` on the labelled CSV at `file` and print the scores.

    The file is encoded through preprocess() in evaluation mode, so the
    module-level tokenizer must already be fitted. Prints whatever
    model.evaluate returns; nothing is returned.
    """
    features, labels, _ = preprocess(file, training=False)
    print(model.evaluate(features, labels, verbose=1))



In [3]:
# Fit the shared tokenizer on the training corpus and build the padded id
# matrix, one-hot labels and vocabulary used by the cells below.
x_train, y_train, word_index = preprocess(
    file="../train_and_balancedtest/fulltrain.csv",
    training=True,
)

Vocabulary Size: 229659


In [4]:
# Take the epoch count from the configuration cell instead of repeating the
# literal here, so changing `epochs` there actually changes this run.
model = train(x_train, y_train, word_index, epochs=epochs)
print("===============================Model Performance on Train Set======================================")
# Scored on the full arrays passed to train(), i.e. including the 20% of rows
# Keras held out for validation during fitting.
model.evaluate(x_train, y_train, verbose=1)

None
Epoch 1/2
[1m1222/1222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m239s[0m 194ms/step - auc: 0.8645 - categorical_accuracy: 0.6733 - f1_score: 0.6082 - loss: 2.2212 - val_auc: 0.9906 - val_categorical_accuracy: 0.9219 - val_f1_score: 0.9163 - val_loss: 1.4228
Epoch 2/2
[1m1222/1222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m236s[0m 193ms/step - auc: 0.9970 - categorical_accuracy: 0.9653 - f1_score: 0.9625 - loss: 1.0907 - val_auc: 0.9924 - val_categorical_accuracy: 0.9307 - val_f1_score: 0.9264 - val_loss: 1.3533
[1m1527/1527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 15ms/step - auc: 0.5504 - categorical_accuracy: 0.9766 - f1_score: 0.6076 - loss: 1.3426


[1.2280268669128418,
 0.9802677631378174,
 <tf.Tensor: shape=(4,), dtype=float32, numpy=array([0.97408617, 0.9803186 , 0.9910357 , 0.9696182 ], dtype=float32)>,
 0.9989985227584839]

In [7]:
# Reload the checkpoint written by train() so this cell scores the saved file
# rather than whatever `model` happens to be in memory.
# NOTE: test() still encodes text with the in-memory tokenizer, so the
# training preprocess cell must have run in this kernel first.
model = keras.models.load_model("CNN_model.keras")
print("===============================Model Performance on Test Set======================================")
test("../train_and_balancedtest/balancedtest.csv", model)

[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - auc: 0.4451 - categorical_accuracy: 0.5760 - f1_score: 0.3600 - loss: 2.5726
[2.556088447570801, 0.559333324432373, <tf.Tensor: shape=(4,), dtype=float32, numpy=array([0.57160264, 0.44910637, 0.565367  , 0.64307225], dtype=float32)>, 0.800629734992981]
