In [120]:
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras import regularizers

# Seed Python's `random`, NumPy and TensorFlow in one call. Seeding only
# TensorFlow leaves NumPy-backed randomness (e.g. sklearn's shuffle with no
# explicit random_state) different on every run.
keras.utils.set_random_seed(10)

class TextCNN():
    """
    Multi-branch 1-D convolutional text classifier.

    With the default ``kernel_sizes=(2, 3)`` the graph is:

                    -> Conv1D (kernel size 2, num_filters filters) -> global max-pooling ->
    Embedding layer                                                                        concatenate -> dense (64, ReLU) -> dropout -> softmax layer.
                    -> Conv1D (kernel size 3, num_filters filters) -> global max-pooling ->

    One Conv1D + GlobalMaxPooling1D branch is created per entry of
    ``kernel_sizes``; every branch reads the same embedding output.
    The model is compiled with Adam, categorical cross-entropy, and the
    categorical-accuracy and per-class F1 metrics.
    """
    def __init__(
        self, sequence_length, num_classes, vocab_size,
        embedding_size, dropout_prob, num_filters, kernel_sizes=(2, 3)):
        """
        Build and compile the Keras model; it is stored on ``self.model``.

        Args:
            sequence_length: length of every (padded) input token sequence.
            num_classes: number of units in the softmax output layer.
            vocab_size: ``input_dim`` of the Embedding layer. Keras requires
                this to be the largest token id fed to the model plus one.
            embedding_size: dimensionality of each embedding vector.
            dropout_prob: rate of the Dropout layer placed before the output.
            num_filters: number of filters in each convolution branch.
            kernel_sizes: iterable of Conv1D kernel widths, one branch each.
                Defaults to ``(2, 3)``.

        Raises:
            ValueError: if ``kernel_sizes`` is empty.
        """
        kernel_sizes = tuple(kernel_sizes)
        if not kernel_sizes:
            raise ValueError("kernel_sizes must contain at least one kernel width")

        inputs = keras.Input(shape=(sequence_length,), dtype="int64")
        embedded = layers.Embedding(vocab_size, embedding_size)(inputs)

        # One convolution + global-max-pool branch per kernel width.
        pooled_branches = []
        for kernel_size in kernel_sizes:
            conv = layers.Conv1D(num_filters, kernel_size, activation='relu')(embedded)
            pooled_branches.append(layers.GlobalMaxPooling1D()(conv))

        # Concatenation is only meaningful (and only accepted) for 2+ tensors.
        if len(pooled_branches) == 1:
            combined = pooled_branches[0]
        else:
            combined = layers.concatenate(pooled_branches)

        hidden = layers.Dense(units=64, activation='relu')(combined)
        # Fixed seed so the dropout mask sequence is repeatable between runs.
        hidden = layers.Dropout(dropout_prob, seed=10)(hidden)
        outputs = layers.Dense(num_classes, activation='softmax')(hidden)

        self.model = keras.Model(inputs, outputs, name="CNN")
        self.model.compile(
            optimizer='adam',
            loss='categorical_crossentropy',
            metrics=[
                keras.metrics.CategoricalAccuracy(),
                # average=None makes the metric report one F1 value per class.
                keras.metrics.F1Score(average=None, threshold=None, name="f1_score", dtype=None),
            ],
        )

    def getModel(self):
        """Return the compiled ``keras.Model`` built in ``__init__``."""
        return self.model


In [121]:
import numpy as np
from tensorflow.keras import utils
from sklearn.utils import shuffle
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
import io
import json
import os

# Parameters
# ==================================================
# Module-level configuration read by preprocess() and train() below.

# Model Hyperparameters
embedding_dim = 128  # passed to TextCNN as embedding_size by train()
num_filters = 128  # passed to TextCNN as num_filters by train()
dropout_prob = 0.4  # passed to TextCNN as dropout_prob by train()
l2_reg_lambda = 0.0  # NOTE(review): not referenced anywhere in the visible notebook
max_seq = 1000  # pad_sequences maxlen in preprocess() and the model's sequence_length in train()
num_classes = 4  # passed to TextCNN as num_classes by train()

# Training parameters
epochs = 2  # passed to train() by the training cells

# Misc Parameters
# NOTE(review): neither flag is referenced in the visible notebook; they look
# like leftovers from a TF1-style session config — TODO confirm and remove.
allow_soft_placement = True
log_device_placement = False

# Module-level Keras Tokenizer; oov_token="<OOV>" is the placeholder used for
# words that are not in its fitted vocabulary.
tokenizer = Tokenizer(oov_token="<OOV>")

def generateTokenizer(file):
    """
    Fit a tokenizer on the text column of a CSV and save it next to the notebook.

    The CSV must contain a ``Sentence`` or ``sentence`` column; its values are
    lower-cased before fitting. The tokenizer is written to
    ``<basename of file>_tokenizer.json`` in the current working directory.

    Args:
        file: path of the CSV file to read.

    Returns:
        The name of the tokenizer file that was written.

    Raises:
        ValueError: if the CSV has neither a ``Sentence`` nor a ``sentence``
            column (previously an unfitted tokenizer was silently written).
    """
    df_processed = pd.read_csv(file, index_col=False)

    text_column = next(
        (name for name in ("Sentence", "sentence") if name in df_processed), None)
    if text_column is None:
        raise ValueError(
            f"Check columns of data: {file!r} has no 'Sentence' or 'sentence' column")

    # Fit a fresh tokenizer on every call. Re-fitting one shared instance would
    # accumulate the vocabulary of every corpus processed in the same kernel.
    fitted_tokenizer = Tokenizer(oov_token="<OOV>")
    fitted_tokenizer.fit_on_texts(df_processed[text_column].str.lower())

    output_path = f'{os.path.basename(file)}_tokenizer.json'
    # to_json() already returns a JSON string; it is intentionally wrapped in a
    # second json.dumps, so the file holds a JSON-encoded string. Readers must
    # json.load() the file and pass the resulting string to tokenizer_from_json.
    with io.open(output_path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(fitted_tokenizer.to_json(), ensure_ascii=False))
    return output_path


def preprocess(file, tokenizer):
    """
    Turn a labelled CSV into padded token-id sequences and one-hot labels.

    Args:
        file: path of a CSV with a ``Sentence``/``sentence`` text column and a
            ``Label``/``label`` column.
        tokenizer: path of a tokenizer file written by generateTokenizer().

    Returns:
        A tuple ``(padded, y_one_hot, word_index)``:
        ``padded`` is the output of pad_sequences with ``maxlen=max_seq`` and
        post-padding, ``y_one_hot`` is the one-hot encoding of ``label - 1``
        over ``num_classes`` classes, and ``word_index`` is the loaded
        tokenizer's word-to-id mapping.

    Raises:
        ValueError: if the text or label column is missing (previously this
            surfaced later as a NameError / UnboundLocalError).
    """
    # Data Preparation
    # ==================================================
    df = pd.read_csv(file, index_col=False)

    text_column = next((name for name in ("Sentence", "sentence") if name in df), None)
    if text_column is None:
        raise ValueError(
            f"Check columns of data: {file!r} has no 'Sentence' or 'sentence' column")

    label_column = next((name for name in ("Label", "label") if name in df), None)
    if label_column is None:
        raise ValueError(
            f"Check columns of data: {file!r} has no 'Label' or 'label' column")

    # The file holds a JSON-encoded string (see generateTokenizer), so
    # json.load() yields the string that tokenizer_from_json expects.
    # It was written as UTF-8, so read it back the same way.
    with open(tokenizer, encoding='utf-8') as f:
        fitted_tokenizer = tokenizer_from_json(json.load(f))

    word_index = fitted_tokenizer.word_index

    # Lower-case to match how the tokenizer was fitted.
    sequences = fitted_tokenizer.texts_to_sequences(df[text_column].str.lower())

    # Pads at the end; truncation of over-long sequences uses pad_sequences'
    # default (Keras documents that default as removing tokens from the start).
    padded = pad_sequences(sequences, padding='post', maxlen=max_seq)

    # Subtracting 1 assumes the labels are 1-based integers in 1..num_classes.
    y_one_hot = utils.to_categorical(df[label_column] - 1, num_classes=num_classes)

    print("Vocabulary Size: {:d}".format(len(word_index)))
    return padded, y_one_hot, word_index


def train(x_train, y_train, word_index, epochs, filename="CNN_model.keras", seed=10):
    """
    Build a TextCNN, fit it, save it and report its performance on the data.

    Args:
        x_train: padded token-id sequences, as returned by preprocess().
        y_train: one-hot labels, as returned by preprocess().
        word_index: the tokenizer's word-to-id mapping; sizes the embedding.
        epochs: number of training epochs.
        filename: path the fitted model is saved to.
        seed: random_state used when shuffling the data before fitting.

    Returns:
        The fitted Keras model.
    """
    # Training
    # ==================================================
    # Keras Tokenizer ids run from 1 to len(word_index) and 0 is reserved (it is
    # also the padding value here), so the embedding needs len(word_index) + 1
    # rows. Sizing it len(word_index) leaves the highest id out of range.
    vocab_size = len(word_index) + 1
    model = TextCNN(max_seq, num_classes, vocab_size, embedding_dim, dropout_prob, num_filters).getModel()
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()

    # validation_split takes the tail of the arrays as given, so shuffle first;
    # otherwise the validation slice is not representative and scores poorly.
    x_train, y_train = shuffle(x_train, y_train, random_state=seed)
    model.fit(x_train, y_train, validation_split=0.4, epochs=epochs)
    model.save(filename)

    # NOTE: these figures cover every row passed in, including the 40% that
    # fit() held out for validation.
    print("===============================Model Performance on Train Set======================================")
    print(model.evaluate(x_train, y_train, verbose=1))
    Y_train = np.argmax(y_train, axis=1)  # Convert one-hot to index
    y_pred = np.argmax(model.predict(x_train), axis=1)
    print(classification_report(Y_train, y_pred))

    return model
    
def test(file, model, tokenizer):
    """
    Evaluate ``model`` on the labelled CSV at ``file`` and print the results.

    The CSV is run through preprocess() with the tokenizer file at
    ``tokenizer``. The output of ``model.evaluate`` is printed first, followed
    by a scikit-learn classification report. Nothing is returned.
    """
    features, one_hot_labels, _ = preprocess(file, tokenizer=tokenizer)

    print(model.evaluate(features, one_hot_labels, verbose=1))

    # Collapse one-hot rows and predicted probabilities to class indices.
    true_classes = np.argmax(one_hot_labels, axis=1)
    class_probabilities = model.predict(features)
    predicted_classes = np.argmax(class_probabilities, axis=1)
    print(classification_report(true_classes, predicted_classes))



In [122]:
# Build the tokenizer file only when it is not already in the working directory.
# generateTokenizer() names its output "<CSV basename>_tokenizer.json", which is
# the name this guard checks for.
if not os.path.isfile("strip_punct_stop.csv_tokenizer.json"):
    generateTokenizer("../strip_punct_stop/strip_punct_stop.csv")

In [123]:
# Add the Label column from fulltrain to the processed CSV.
# The two columns are joined on row position, which assumes row i of both files
# describes the same document — TODO confirm the files are row-aligned.
df = pd.read_csv("../strip_punct_stop/strip_punct_stop.csv", index_col=False)
df_train = pd.read_csv("../train_and_balancedtest/fulltrain.csv")
df = pd.concat([df_train["Label"], df["Sentence"]], axis=1, ignore_index=True)
df.columns = ["Label", "Sentence"]

# index=False keeps the row index out of the file. This cell overwrites its own
# input, so without it every re-run would add another "Unnamed: 0" column.
df.to_csv('../strip_punct_stop/strip_punct_stop.csv', index=False)

In [124]:
# Encode the processed training CSV with the tokenizer file generated for it.
x_train, y_train, word_index = preprocess(file="../strip_punct_stop/strip_punct_stop.csv", tokenizer="strip_punct_stop.csv_tokenizer.json")

Vocabulary Size: 229614


In [125]:
# Train on the processed data; with no filename given the model is saved to
# train()'s default, "CNN_model.keras".
model = train(x_train, y_train, word_index, epochs)

None
Epoch 1/2
[1m916/916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 208ms/step - categorical_accuracy: 0.7456 - f1_score: 0.7103 - loss: 0.6062 - val_categorical_accuracy: 0.9637 - val_f1_score: 0.9607 - val_loss: 0.1021
Epoch 2/2
[1m916/916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 208ms/step - categorical_accuracy: 0.9754 - f1_score: 0.9732 - loss: 0.0749 - val_categorical_accuracy: 0.9691 - val_f1_score: 0.9670 - val_loss: 0.0960
[1m1527/1527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 15ms/step - categorical_accuracy: 0.9969 - f1_score: 0.9967 - loss: 0.0109
[0.03966376557946205, 0.9874523878097534, <tf.Tensor: shape=(4,), dtype=float32, numpy=array([0.98436  , 0.9874675, 0.9934358, 0.9811301], dtype=float32)>]
[1m1527/1527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 15ms/step
              precision    recall  f1-score   support

           0       0.98      0.99      0.98     14047
           1       0.99      0.99      0.99  

In [133]:
# Evaluate a previously saved model file, not the "CNN_model.keras" written by
# the training cell above (the commented-out line loads that one instead).
# NOTE(review): presumably a checkpoint renamed after its test F1 — TODO confirm
# how "CNN_model_f1_061.keras" was produced.
model = keras.models.load_model("CNN_model_f1_061.keras")
# model = keras.models.load_model("CNN_model.keras")
print("===============================Model Performance on Test Set======================================")
test(file="../train_and_balancedtest/balancedtest.csv", model = model, tokenizer="strip_punct_stop.csv_tokenizer.json")

Vocabulary Size: 229614
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - categorical_accuracy: 0.4848 - f1_score: 0.3297 - loss: 2.0986
[1.5907927751541138, 0.6176666617393494, <tf.Tensor: shape=(4,), dtype=float32, numpy=array([0.64055693, 0.3121951 , 0.6380133 , 0.75866044], dtype=float32)>]
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step
              precision    recall  f1-score   support

           0       0.92      0.49      0.64       750
           1       0.58      0.21      0.31       750
           2       0.50      0.89      0.64       750
           3       0.67      0.88      0.76       750

    accuracy                           0.62      3000
   macro avg       0.67      0.62      0.59      3000
weighted avg       0.67      0.62      0.59      3000



In [134]:
# Evaluate whatever `model` currently holds on a second CSV, using the same
# tokenizer file as the balanced test set.
print("===============================Model Performance on External Test Set======================================")
test(file="../opensources_fakenewscorpus_balancedtest/opensources_fakenewscorpus_modified_undersampled.csv", model = model, tokenizer="strip_punct_stop.csv_tokenizer.json")

Vocabulary Size: 229614
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 18ms/step - categorical_accuracy: 0.2864 - f1_score: 0.2076 - loss: 3.4151
[2.5740530490875244, 0.4050000011920929, <tf.Tensor: shape=(4,), dtype=float32, numpy=array([0.3246326 , 0.20883214, 0.4156037 , 0.5819502 ], dtype=float32)>]
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 16ms/step
              precision    recall  f1-score   support

           0       0.41      0.27      0.32      3500
           1       0.33      0.15      0.21      3500
           2       0.33      0.56      0.42      3500
           3       0.53      0.64      0.58      3500

    accuracy                           0.41     14000
   macro avg       0.40      0.41      0.38     14000
weighted avg       0.40      0.41      0.38     14000



In [128]:
# Build the tokenizer file for the augmented training set only when it is not
# already in the working directory.
if not os.path.isfile("synonym_augmented_train.csv_tokenizer.json"):
    generateTokenizer("../synonym_augmented_train.csv")

In [129]:
# Encode the augmented training CSV with its own tokenizer file.
# This rebinds x_train, y_train and word_index from the earlier preprocess cell.
x_train, y_train, word_index = preprocess(file="../synonym_augmented_train.csv", tokenizer="synonym_augmented_train.csv_tokenizer.json")

Vocabulary Size: 334900


In [130]:
# Train on the augmented data and save under a separate file name, so the model
# saved by the earlier training cell is not overwritten.
model = train(x_train, y_train, word_index, epochs, filename="CNN_model_augmented_train.keras")

None
Epoch 1/2
[1m1475/1475[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m400s[0m 271ms/step - categorical_accuracy: 0.7961 - f1_score: 0.7936 - loss: 0.5027 - val_categorical_accuracy: 0.9794 - val_f1_score: 0.9794 - val_loss: 0.0633
Epoch 2/2
[1m1475/1475[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m400s[0m 271ms/step - categorical_accuracy: 0.9856 - f1_score: 0.9857 - loss: 0.0459 - val_categorical_accuracy: 0.9835 - val_f1_score: 0.9835 - val_loss: 0.0538
[1m2458/2458[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 17ms/step - categorical_accuracy: 0.9979 - f1_score: 0.9979 - loss: 0.0075
[0.02288159355521202, 0.9930686354637146, <tf.Tensor: shape=(4,), dtype=float32, numpy=array([0.99195915, 0.9959241 , 0.99267185, 0.9917233 ], dtype=float32)>]
[1m2458/2458[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 17ms/step
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     19657
           1       1.00      0.99    

In [137]:
# Evaluate a previously saved model file, not the
# "CNN_model_augmented_train.keras" written by the training cell above (the
# commented-out line loads that one instead).
# NOTE(review): presumably a checkpoint renamed after its test F1 — TODO confirm
# how "CNN_model_augmented_train_f1_064.keras" was produced.
model = keras.models.load_model("CNN_model_augmented_train_f1_064.keras")
# model = keras.models.load_model("CNN_model_augmented_train.keras")
print("===============================Model Performance on Test Set======================================")
test(file="../train_and_balancedtest/balancedtest.csv", model = model, tokenizer="synonym_augmented_train.csv_tokenizer.json")

Vocabulary Size: 334900
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - categorical_accuracy: 0.5845 - f1_score: 0.3759 - loss: 2.0581
[1.8109004497528076, 0.6453333497047424, <tf.Tensor: shape=(4,), dtype=float32, numpy=array([0.6726643 , 0.43396226, 0.6549423 , 0.7367281 ], dtype=float32)>]
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step
              precision    recall  f1-score   support

           0       0.70      0.65      0.67       750
           1       0.74      0.31      0.43       750
           2       0.60      0.72      0.65       750
           3       0.62      0.91      0.74       750

    accuracy                           0.65      3000
   macro avg       0.67      0.65      0.62      3000
weighted avg       0.67      0.65      0.62      3000



In [138]:
# Evaluate whatever `model` currently holds on the external CSV, using the
# tokenizer file of the augmented training set.
print("===============================Model Performance on External Test Set======================================")
test(file="../opensources_fakenewscorpus_balancedtest/opensources_fakenewscorpus_modified_undersampled.csv", model = model, tokenizer="synonym_augmented_train.csv_tokenizer.json")

Vocabulary Size: 334900
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - categorical_accuracy: 0.3795 - f1_score: 0.2372 - loss: 3.4229
[2.9159815311431885, 0.4480714201927185, <tf.Tensor: shape=(4,), dtype=float32, numpy=array([0.44670042, 0.11510183, 0.4740605 , 0.6170238 ], dtype=float32)>]
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 16ms/step
              precision    recall  f1-score   support

           0       0.43      0.47      0.45      3500
           1       0.22      0.08      0.12      3500
           2       0.43      0.52      0.47      3500
           3       0.54      0.73      0.62      3500

    accuracy                           0.45     14000
   macro avg       0.40      0.45      0.41     14000
weighted avg       0.40      0.45      0.41     14000

