In [125]:
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras import regularizers

SEED = 42

tf.random.set_seed(SEED)

class TextCNN():
    """
                    -> convolutional, max-pooling (stride 2, 128 filters) ->
    Embedding layer -> convolutional, max-pooling (stride 3, 128 filters) -> 1 dense layer -> softmax layer.
    """
    def __init__(
      self, sequence_length, num_classes, vocab_size,
      embedding_size, dropout_prob, num_filters):
       
      inputs = keras.Input(shape=(sequence_length,), dtype="int64")
      x = layers.Embedding(vocab_size, embedding_size)(inputs)
      conv1 = layers.Conv1D(num_filters, 2, activation='relu')(x)
      conv1pooled = layers.GlobalMaxPooling1D()(conv1)
      conv2 = layers.Conv1D(num_filters, 3, activation='relu')(x)
      conv2pooled = layers.GlobalMaxPooling1D()(conv2)
      combined = layers.concatenate([conv1pooled, conv2pooled])
      x = layers.Dense(
         units=64, 
         activation='relu', 
        #  bias_regularizer=regularizers.L2(1e-4),
         activity_regularizer=regularizers.L2(1e-7)
        )(combined)
      x = layers.Dropout(dropout_prob, seed=SEED)(x)
      outputs = layers.Dense(num_classes, activation='softmax')(x)    
      self.model = keras.Model(inputs, outputs, name="CNN")
      self.model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=[keras.metrics.CategoricalAccuracy(), 
                           keras.metrics.F1Score(average=None, threshold=None, name="f1_score", dtype=None),
                          #  keras.metrics.AUC(multi_label=True, num_labels=num_classes,)
                          ]
                  )

    def getModel(self):
        return self.model


In [135]:
import numpy as np
from tensorflow.keras import utils
from sklearn.utils import shuffle
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import train_test_split
import io
import json
import os

# Parameters
# ==================================================

# Model Hyperparameters
embedding_dim = 128
num_filters = 128
dropout_prob = 0.4 
l2_reg_lambda = 0.0
max_seq = 1000
num_classes = 4

# Training parameters
epochs = 2

# Misc Parameters
allow_soft_placement = True
log_device_placement = False

tokenizer = Tokenizer(oov_token="<OOV>")

def generateTokenizer(file):
    df_processed = pd.read_csv(file, index_col = False)
    if "Sentence" in df_processed:
        tokenizer.fit_on_texts(df_processed["Sentence"].str.lower())
    elif "sentence" in df_processed:
        tokenizer.fit_on_texts(df_processed["sentence"].str.lower())
    else:
        df_processed.loc[-1] = df_processed.columns  
        df_processed.index = df_processed.index + 1  # shifting index
        df_processed.sort_index(inplace=True) 
        df_processed.columns = [*df_processed.columns[:-1], 'Sentence']
        tokenizer.fit_on_texts(df_processed["Sentence"].str.lower())
        
    tokenizer_json = tokenizer.to_json()
    with io.open(f'{os.path.basename(file)}_tokenizer.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(tokenizer_json, ensure_ascii=False))
    return f'{os.path.basename(file)}_tokenizer.json'


def preprocess(file, tokenizer, df=None):
    # Data Preparation
    # ==================================================
    if df is None:
        df = pd.read_csv(file, index_col = False)

    with open(tokenizer) as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)
    
    word_index = tokenizer.word_index
    # print(word_index)
    
    if "Sentence" in df:
        sequences = tokenizer.texts_to_sequences(df["Sentence"].str.lower())
    elif "sentence" in df:
        sequences = tokenizer.texts_to_sequences(df["sentence"].str.lower())
    else:
        print("Check columns of data")
   
    padded = pad_sequences(sequences, padding='post', maxlen=max_seq)
    if "Label" in df:
        y_train_one_hot = utils.to_categorical(df["Label"]-1, num_classes=4)
    elif "label" in df:
        y_train_one_hot = utils.to_categorical(df["label"]-1, num_classes=4)

    print("Vocabulary Size: {:d}".format(len(word_index)))
    return padded, y_train_one_hot, word_index 


def train(x, y, word_index, epochs, filename="CNN_model.keras", validation=False):
    # Training
    # ==================================================
    model = TextCNN(max_seq, num_classes, len(word_index), embedding_dim, dropout_prob, num_filters).getModel()
    print(model.summary())

    #Used when training on the base LUN dataset, everything else is just the full data 
    if validation:
        x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=SEED)
        model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=epochs)
    else:
        model.fit(x, y, epochs=epochs)
    model.save(filename)

    if validation: 
        print("===============================Model Performance on Validation Set======================================")
        y_val = np.argmax(y_val, axis=1) # Convert probabilities to index
        y_pred = np.argmax(model.predict(x_val), axis=1)
        print(classification_report(y_val, y_pred, digits=8))

    print("===============================Model Performance on Full Training Set======================================")
    y = np.argmax(y, axis=1) # Convert probabilities to index
    y_pred = np.argmax(model.predict(x), axis=1)
    print("Prediction distribution:")
    print(pd.Series(y_pred).value_counts())
    print(classification_report(y, y_pred, digits=8))
    
    return model
    
def test(file, model, tokenizer, df=None):
    x_test, y_test, _ = preprocess(file, tokenizer=tokenizer, df=df)
    test_scores = model.evaluate(x_test, y_test, verbose=1)
    print(test_scores)

    y_test = np.argmax(y_test, axis=1) # Convert probabilities to index
    y_pred = np.argmax(model.predict(x_test), axis=1)
    print("Prediction distribution:")
    print(pd.Series(y_pred).value_counts())
    print(classification_report(y_test, y_pred, digits=8))



### Base LUN dataset

In [89]:
#Add label column from fulltrain to processed csv 

#if fulltrain doesnt have the label, sentence columns yet
df_train = pd.read_csv("../raw_data/fulltrain.csv")
if "Sentence" not in df_train.columns: 
    df_train.loc[-1] = df_train.columns  
    df_train.index = df_train.index + 1  # shifting index
    df_train.sort_index(inplace=True) 
    df_train.columns = ["Label", "Sentence"]
    df_train.to_csv('../raw_data/fulltrain.csv',index=False)

df_test = pd.read_csv("../raw_data/balancedtest.csv")
if "Sentence" not in df_test.columns: 
    df_test.loc[-1] = df_test.columns  
    df_test.index = df_test.index + 1  # shifting index
    df_test.sort_index(inplace=True) 
    df_test.columns = ["Label", "Sentence"]
    df_test.to_csv('../raw_data/balancedtest.csv',index=False)

df = pd.read_csv("../preprocessed_data/strip_punct_stop.csv", index_col=False)
df_modified = pd.concat([df_train["Label"], df["Sentence"]], axis=1, ignore_index=True)
df_modified.columns = ["Label", "Sentence"]
df_modified["Label"] = pd.to_numeric(df_modified["Label"])


In [90]:
if not os.path.isfile("strip_punct_stop.csv_tokenizer.json"):
    generateTokenizer("../preprocessed_data/strip_punct_stop.csv")

In [91]:
x, y, word_index = preprocess(file="", tokenizer="strip_punct_stop.csv_tokenizer.json", df=df_modified)

Vocabulary Size: 229614


In [92]:
#This model has validation
model = train(x, y, word_index, epochs, validation=True)

None
Epoch 1/2
[1m1222/1222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m241s[0m 195ms/step - categorical_accuracy: 0.7792 - f1_score: 0.7472 - loss: 0.5439 - val_categorical_accuracy: 0.9582 - val_f1_score: 0.9549 - val_loss: 0.1144
Epoch 2/2
[1m1222/1222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m234s[0m 191ms/step - categorical_accuracy: 0.9809 - f1_score: 0.9795 - loss: 0.0618 - val_categorical_accuracy: 0.9720 - val_f1_score: 0.9696 - val_loss: 0.0829
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step
              precision    recall  f1-score   support

           0  0.96113074 0.97386323 0.96745510      2793
           1  0.96800000 0.97082422 0.96941005      1371
           2  0.98658093 0.98383050 0.98520380      3587
           3  0.96380090 0.94900990 0.95634822      2020

    accuracy                      0.97195783      9771
   macro avg  0.96987815 0.96938196 0.96960429      9771
weighted avg  0.97198954 0.97195783 0.97194890      9771


In [113]:
model = keras.models.load_model("CNN_model.keras")
print("===============================Model Performance on Test Set======================================")
test(file="../raw_data/balancedtest.csv", model = model, tokenizer="strip_punct_stop.csv_tokenizer.json")

Vocabulary Size: 229614
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - categorical_accuracy: 0.5094 - f1_score: 0.3400 - loss: 2.3223
[1.883949875831604, 0.6196666955947876, <tf.Tensor: shape=(4,), dtype=float32, numpy=array([0.6621391 , 0.31163704, 0.6228495 , 0.78520435], dtype=float32)>]
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step
Prediction distribution:
2    1517
3     791
0     428
1     264
Name: count, dtype: int64
              precision    recall  f1-score   support

           0  0.91121495 0.52000000 0.66213922       750
           1  0.59848485 0.21066667 0.31163708       750
           2  0.46539222 0.94133333 0.62284958       750
           3  0.76485461 0.80666667 0.78520441       750

    accuracy                      0.61966667      3000
   macro avg  0.68498666 0.61966667 0.59545757      3000
weighted avg  0.68498666 0.61966667 0.59545757      3000



In [114]:
print("===============================Model Performance on External Test Set======================================")
test_df = pd.read_csv("../external-dataset/opensources_fakenewscorpus_modified_undersampled.csv")
test_df.loc[-1] = test_df.columns  
test_df.index = test_df.index + 1  # shifting index
test_df.sort_index(inplace=True) 
test_df.columns = ["Label", "Sentence"]
test_df["Label"] = pd.to_numeric(test_df["Label"])
test(file="", model = model, tokenizer="strip_punct_stop.csv_tokenizer.json", df=test_df)

Vocabulary Size: 229614
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - categorical_accuracy: 0.2710 - f1_score: 0.1927 - loss: 4.1349
[3.174232244491577, 0.38428571820259094, <tf.Tensor: shape=(4,), dtype=float32, numpy=array([0.33161572, 0.14755479, 0.41609633, 0.53883135], dtype=float32)>]
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step
Prediction distribution:
2    7459
3    3363
0    1934
1    1244
Name: count, dtype: int64
              precision    recall  f1-score   support

           0  0.46587384 0.25742857 0.33161575      3500
           1  0.28135048 0.10000000 0.14755481      3500
           2  0.30567100 0.65142857 0.41609636      3500
           3  0.54980672 0.52828571 0.53883141      3500

    accuracy                      0.38428571     14000
   macro avg  0.40067551 0.38428571 0.35852458     14000
weighted avg  0.40067551 0.38428571 0.35852458     14000



### Synonym augmented training set

In [95]:
if not os.path.isfile("synonym_augmented_train.csv_tokenizer.json"):
    generateTokenizer("../synonym-creation/synonym_augmented_train.csv")

In [96]:
df_synonym = pd.read_csv("../synonym-creation/synonym_augmented_train.csv")
df_synonym.loc[-1] = df_synonym.columns  
df_synonym.index = df_synonym.index + 1  # shifting index
df_synonym.sort_index(inplace=True) 
df_synonym.columns = ["Label", "Sentence"]
df_synonym["Label"] = pd.to_numeric(df_synonym["Label"])
x, y, word_index = preprocess(file="", tokenizer="synonym_augmented_train.csv_tokenizer.json", df=df_synonym)

Vocabulary Size: 334900


In [97]:
#trained on full set
model = train(x, y, word_index, epochs, filename="CNN_model_augmented_train.keras")

None
Epoch 1/2
[1m2458/2458[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m616s[0m 250ms/step - categorical_accuracy: 0.8504 - f1_score: 0.8493 - loss: 0.3877
Epoch 2/2
[1m2458/2458[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m614s[0m 250ms/step - categorical_accuracy: 0.9927 - f1_score: 0.9927 - loss: 0.0243
[1m2458/2458[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 13ms/step
Prediction distribution:
3    19684
2    19656
0    19650
1    19638
Name: count, dtype: int64
              precision    recall  f1-score   support

           0  0.99984733 0.99949128 0.99966927     19657
           1  0.99994908 0.99898255 0.99946558     19657
           2  0.99954212 0.99949128 0.99951670     19657
           3  0.99857752 0.99994913 0.99926286     19657

    accuracy                      0.99947856     78628
   macro avg  0.99947901 0.99947856 0.99947860     78628
weighted avg  0.99947901 0.99947856 0.99947860     78628



In [115]:
model = keras.models.load_model("CNN_model_augmented_train.keras")
print("===============================Model Performance on Test Set======================================")
test(file="../raw_data/balancedtest.csv", model = model, tokenizer="synonym_augmented_train.csv_tokenizer.json")

Vocabulary Size: 334900
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - categorical_accuracy: 0.5050 - f1_score: 0.3404 - loss: 3.1866
[2.4706013202667236, 0.6203333139419556, <tf.Tensor: shape=(4,), dtype=float32, numpy=array([0.6561461 , 0.35892317, 0.6253968 , 0.7304256 ], dtype=float32)>]
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step
Prediction distribution:
3    1153
2    1140
0     454
1     253
Name: count, dtype: int64
              precision    recall  f1-score   support

           0  0.87004405 0.52666667 0.65614618       750
           1  0.71146245 0.24000000 0.35892323       750
           2  0.51842105 0.78800000 0.62539683       750
           3  0.60277537 0.92666667 0.73042564       750

    accuracy                      0.62033333      3000
   macro avg  0.67567573 0.62033333 0.59272297      3000
weighted avg  0.67567573 0.62033333 0.59272297      3000



In [116]:
print("===============================Model Performance on External Test Set======================================")
test_df = pd.read_csv("../external-dataset/opensources_fakenewscorpus_modified_undersampled.csv")
test_df.loc[-1] = test_df.columns  
test_df.index = test_df.index + 1  # shifting index
test_df.sort_index(inplace=True) 
test_df.columns = ["Label", "Sentence"]
test_df["Label"] = pd.to_numeric(test_df["Label"])
test(file="", model = model, tokenizer="synonym_augmented_train.csv_tokenizer.json", df=test_df)

Vocabulary Size: 334900
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - categorical_accuracy: 0.3004 - f1_score: 0.2075 - loss: 5.5427
[4.077095031738281, 0.4424999952316284, <tf.Tensor: shape=(4,), dtype=float32, numpy=array([0.38884833, 0.10258618, 0.4837428 , 0.61639184], dtype=float32)>]
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step
Prediction distribution:
2    5819
3    5053
0    1988
1    1140
Name: count, dtype: int64
              precision    recall  f1-score   support

           0  0.53672032 0.30485714 0.38884840      3500
           1  0.20877193 0.06800000 0.10258621      3500
           2  0.38735178 0.64400000 0.48374289      3500
           3  0.52167029 0.75314286 0.61639191      3500

    accuracy                      0.44250000     14000
   macro avg  0.41362858 0.44250000 0.39789235     14000
weighted avg  0.41362858 0.44250000 0.39789235     14000



### NER

In [100]:
#Add label column from fulltrain to processed csv 
df = pd.read_csv("../NER-masking/NER_masked_train.csv", index_col=False)
if "Sentence" not in df.columns: 
    df_train = pd.read_csv("../raw_data/fulltrain.csv")
    df.loc[-1] = df.columns  
    df.index = df.index + 1  # shifting index
    df.sort_index() 
    df_NER_train = pd.concat([df_train["Label"], df], axis=1, ignore_index=True)
    df_NER_train.columns = ["Label", "Sentence"]
else:
    print("Skipping")

df = pd.read_csv("../NER-masking/NER_masked_test.csv", index_col=False)
if "Sentence" not in df.columns: 
    df_test = pd.read_csv("../external-dataset/opensources_fakenewscorpus_modified_undersampled.csv")
    df_test.loc[-1] = df_test.columns  
    df_test.index = df_test.index + 1  # shifting index
    df_test.sort_index(inplace=True) 
    df_test.columns = ["Label", "Sentence"]
    df_test["Label"] = pd.to_numeric(df_test["Label"])
    df_NER_test = pd.concat([df_test["Label"], df["1"]], axis=1, ignore_index=True)
    df_NER_test.columns = ["Label", "Sentence"]


else:
    print("Skipping")

In [101]:
if not os.path.isfile("NER_masked_train.csv_tokenizer.json"):
    generateTokenizer("../NER-masking/NER_masked_train.csv")

In [130]:
x, y, word_index = preprocess(file="", tokenizer="NER_masked_train.csv_tokenizer.json", df=df_NER_train)

Vocabulary Size: 367143


In [136]:
#trained on full set
model = train(x, y, word_index, epochs, filename="CNN_model_NER.keras")

None
Epoch 1/2
[1m1222/1222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m334s[0m 272ms/step - categorical_accuracy: 0.8027 - f1_score: 0.7778 - loss: 0.4845
Epoch 2/2
[1m1222/1222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m331s[0m 271ms/step - categorical_accuracy: 0.9758 - f1_score: 0.9738 - loss: 0.0714
[1m1222/1222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 14ms/step
Prediction distribution:
2    14345
0    11261
3     7938
1     5539
Name: count, dtype: int64
              precision    recall  f1-score   support

           0  0.99751354 0.99813400 0.99782367     11254
           1  0.99873623 0.99299946 0.99585959      5571
           2  0.99553852 0.99985997 0.99769456     14283
           3  0.99949609 0.99485893 0.99717212      7975

    accuracy                      0.99736458     39083
   macro avg  0.99782110 0.99646309 0.99713749     39083
weighted avg  0.99737059 0.99736458 0.99736357     39083



In [139]:
model = keras.models.load_model("CNN_model_NER.keras")
print("===============================Model Performance on Test Set======================================")
test(file="../raw_data/balancedtest.csv", model = model, tokenizer="NER_masked_train.csv_tokenizer.json", df=test_df)

Vocabulary Size: 367143
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - categorical_accuracy: 0.2998 - f1_score: 0.1961 - loss: 3.9014
[3.3834574222564697, 0.3578571379184723, <tf.Tensor: shape=(4,), dtype=float32, numpy=array([0.37073162, 0.10228506, 0.4210625 , 0.42175218], dtype=float32)>]
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step
Prediction distribution:
2    7135
3    3120
0    2650
1    1095
Name: count, dtype: int64
              precision    recall  f1-score   support

           0  0.43018868 0.32571429 0.37073171      3500
           1  0.21461187 0.06714286 0.10228509      3500
           2  0.31380519 0.63971429 0.42106253      3500
           3  0.44743590 0.39885714 0.42175227      3500

    accuracy                      0.35785714     14000
   macro avg  0.35151041 0.35785714 0.32895790     14000
weighted avg  0.35151041 0.35785714 0.32895790     14000



In [140]:
print("===============================Model Performance on External Test Set======================================")
test(file="", model = model, tokenizer="NER_masked_train.csv_tokenizer.json", df=df_NER_test)

Vocabulary Size: 367143
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - categorical_accuracy: 0.3003 - f1_score: 0.2018 - loss: 3.9042
[3.191516160964966, 0.40071427822113037, <tf.Tensor: shape=(4,), dtype=float32, numpy=array([0.37580135, 0.09808557, 0.44449615, 0.53691113], dtype=float32)>]
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 15ms/step
Prediction distribution:
3    5183
2    5085
0    2583
1    1149
Name: count, dtype: int64
              precision    recall  f1-score   support

           0  0.44250871 0.32657143 0.37580141      3500
           1  0.19843342 0.06514286 0.09808561      3500
           2  0.37522124 0.54514286 0.44449621      3500
           3  0.44973953 0.66600000 0.53691121      3500

    accuracy                      0.40071429     14000
   macro avg  0.36647573 0.40071429 0.36382361     14000
weighted avg  0.36647573 0.40071429 0.36382361     14000

