In [None]:
#################################### Libraries #######################################
#%% libraries
import os # setting working directory
import numpy as np # for generating random embeddings
import pandas as pd # importing a csv
import re
import random
import spacy # basic text processing
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import *
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow_addons.metrics import F1Score
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
plt.style.use('ggplot')
from statistics import harmonic_mean
import pickle
from tensorflow.keras.layers import Embedding, LSTM, Conv1D, Bidirectional, Dense, Concatenate, GlobalMaxPooling1D, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support 

In [None]:
# Sets the working directory to the Kaggle input dataset so that the CSV
# files read in later cells can be opened by bare file name.
os.chdir('/kaggle/input/largedatasets2')

# Seeds every pseudorandom generator in use (Python, NumPy, TensorFlow).
random.seed(0)
np.random.seed(0)
tf.random.set_seed(0)

### TPU usage
# Detects the TPU cluster and connects to it.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()

# Instantiates the distribution strategy.  `tf.distribute.TPUStrategy` is the
# stable replacement for the deprecated `tf.distribute.experimental.TPUStrategy`.
tpu_strategy = tf.distribute.TPUStrategy(tpu)


In [None]:
# Loads the training data (file name is relative to the working directory).
train = pd.read_csv('202011_202110_train_dataset_3.csv')

# Length, in characters, of the longest text in the training data
# (0 when there are no rows).
max_len = max((len(text) for text in train['text'].values), default = 0)

# Holds out 20% of the rows for validation, with a fixed seed for
# reproducibility.
x_train, x_valid, y_train, y_valid = train_test_split(
    train['text'].values,
    train['popular'].values,
    test_size = 0.2,
    random_state = 0,
)


In [None]:
# Tokenizer whose vocabulary is learned from the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(x_train))

# Converts texts into integer (word-index) sequences.
x_train_seq = tokenizer.texts_to_sequences(x_train)
x_valid_seq = tokenizer.texts_to_sequences(x_valid)

# Longest training sequence measured in TOKENS.  This value is used later as
# the Embedding `input_length`, so it has to describe the padded sequences
# (it was previously the character count of the longest raw text).
max_l = max((len(seq) for seq in x_train_seq), default = 0)

# Pads both splits to the same width so the model sees one input shape.
# Validation sequences longer than `max_l` are truncated by `pad_sequences`.
x_train_seq = pad_sequences(x_train_seq, maxlen = max_l)
x_valid_seq = pad_sequences(x_valid_seq, maxlen = max_l)

# +1 because Tokenizer word indices start at 1 (index 0 is the padding value).
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Loads the held-out test data (file name is relative to the working directory).
test = pd.read_csv('202011_202110_test_dataset_3.csv')
x_test = test['text'].values
y_test = test['popular'].values

# The test texts must be encoded with the tokenizer fitted on the TRAINING
# texts: fitting a fresh Tokenizer here would assign different integer ids to
# the same words, so they would no longer line up with the training vocabulary
# (and with any embedding rows indexed by `tokenizer.word_index`).
# `t` is kept as an alias so existing references to it continue to work.
t = tokenizer

x_test_seq = t.texts_to_sequences(x_test)

# Pads/truncates to the width of the training sequences so the model receives
# the same input shape at prediction time as during training.
x_test_seq = pad_sequences(x_test_seq, maxlen = x_train_seq.shape[1])

In [None]:
# Switches to the GloVe dataset directory and reads the 300-dimensional
# vectors into a dict: word -> float32 vector.
os.chdir('/kaggle/input/glove6b')
embeddings_index = dict()

# The context manager closes the file even if a line fails to parse; the
# encoding is given explicitly so reading does not depend on the locale.
with open('glove.6B.300d.txt', encoding = 'utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # skips blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Loaded %s word vectors.' % len(embeddings_index))

In [None]:
########################## custom CNN with multiple filter window sizes ##############################
class Conv1D_multiple_filters(keras.layers.Layer):
    """Parallel 1-D convolutions with several window sizes, pooled and concatenated.

    The same input is passed through one `Conv1D` per window size; every
    convolution output is reduced by the pooling layer and the pooled results
    are concatenated into a single tensor.

    Args:
        filter_size_list: window size (kernel size) of each convolution.
        filter_num_list: number of filters of each convolution; must have the
            same length as `filter_size_list`.
        activation: activation passed to every `Conv1D`.
        pooling_fun: pooling layer class (not an instance), called with no
            arguments to build the pooling layer, e.g. `GlobalMaxPooling1D`.
        **kwargs: forwarded to `keras.layers.Layer` (e.g. `name`).

    Raises:
        ValueError: if the two lists differ in length or are empty.
    """

    def __init__(self, filter_size_list, filter_num_list, activation, pooling_fun, **kwargs):
        super().__init__(**kwargs)

        if len(filter_size_list) != len(filter_num_list):
            raise ValueError(
                'filter_size_list and filter_num_list must have the same length, got %d and %d'
                % (len(filter_size_list), len(filter_num_list)))
        if len(filter_size_list) == 0:
            raise ValueError('at least one filter size is required')

        self.num_window_sizes = len(filter_size_list)
        self.convolutions_list = [
            Conv1D(n_filters, window_size, activation = activation)
            for window_size, n_filters in zip(filter_size_list, filter_num_list)
        ]
        # one pooling layer instance is applied to every convolution output
        self.pooling_fun = pooling_fun()
        self.concat = Concatenate()

    def call(self, x):
        """Applies every convolution to `x`, pools each result and joins them."""
        # runs the input through each convolution (with its activation)
        convolved = [conv(x) for conv in self.convolutions_list]

        # pooling
        pooled = [self.pooling_fun(feature_map) for feature_map in convolved]

        # with a single window size there is nothing to concatenate
        if len(pooled) == 1:
            return pooled[0]

        return self.concat(pooled)

In [None]:
#%% F-score metric
class F1(tf.keras.metrics.Metric):
    """Streaming F1 score for binary classification.

    True positives, false positives and false negatives are accumulated over
    all batches seen since the last reset; `result()` combines them into
    F1 = 2*TP / (2*TP + FP + FN), which is algebraically the harmonic mean of
    precision and recall.  Predictions strictly greater than 0.5 count as
    positive; non-zero labels count as positive.
    """

    def __init__(self, name='F1', **kwargs):
        super(F1, self).__init__(name=name, **kwargs)

        # running counts, all starting at 0
        self.TP = self.add_weight(name='TP', initializer='zeros')
        self.FP = self.add_weight(name='FP', initializer='zeros')
        self.FN = self.add_weight(name='FN', initializer='zeros')

    def _count(self, mask):
        """Returns the number of True entries in a boolean tensor, in the metric dtype."""
        return tf.reduce_sum(tf.cast(mask, self.dtype))

    def update_state(self, y_true, y_pred, sample_weight = None):
        """Adds the TP/FP/FN counts of one batch to the running totals.

        `sample_weight` is accepted for API compatibility but is not used:
        every sample counts once.
        """
        # Booleans, flattened to 1-D so that labels of shape (N,) and
        # predictions of shape (N, 1) are compared element-wise instead of
        # being broadcast against each other to (N, N).
        y_true = tf.reshape(tf.cast(y_true, tf.bool), [-1])
        y_pred = tf.reshape(tf.greater(y_pred, 0.5), [-1])

        true_negated = tf.logical_not(y_true)
        pred_negated = tf.logical_not(y_pred)

        self.TP.assign_add(self._count(tf.logical_and(y_true, y_pred)))
        self.FP.assign_add(self._count(tf.logical_and(true_negated, y_pred)))
        self.FN.assign_add(self._count(tf.logical_and(y_true, pred_negated)))

    def result(self):
        """Returns the F1 score of everything accumulated so far.

        Returns 0 (instead of NaN) when the score is undefined, i.e. when
        there are no true positives, false positives or false negatives, so
        that callbacks monitoring this metric always compare finite numbers.
        """
        numerator = 2.0 * self.TP
        denominator = numerator + self.FP + self.FN

        return tf.math.divide_no_nan(numerator, denominator)

    def reset_state(self):
        """Resets the running TP/FP/FN counts to 0."""
        for counter in (self.TP, self.FP, self.FN):
            counter.assign(0.0)

In [None]:
############################## tracking model fitting ####################################
# plots validation loss
def plot_loss(history):
    """Plots the validation loss per epoch with the pyplot state machine.

    NOTE(review): a second function named `plot_loss` is defined further down
    in this file; being defined later, it replaces this one at run time.

    Args:
        history: object whose `.history` dict holds a 'val_loss' sequence
            (one value per epoch).
    """
    val_loss = history.history['val_loss']
    epochs = range(1, len(val_loss) + 1)

    # The line needs a label: `plt.legend()` has nothing to display otherwise.
    plt.plot(epochs, val_loss, 'b', label = 'val')
    plt.xlabel("Epoch")
    plt.ylabel("Validation Loss")
    plt.legend()

def plot_F1(history):
    """Plots the validation F1 score per epoch with the pyplot state machine.

    Args:
        history: object whose `.history` dict holds a 'val_F1' sequence
            (one value per epoch).
    """
    val_f1 = history.history['val_F1']
    epochs = range(1, len(val_f1) + 1)

    # The line needs a label: `plt.legend()` has nothing to display otherwise.
    plt.plot(epochs, val_f1, 'b', label = 'val')
    plt.xlabel("Epoch")
    plt.ylabel("Validation F1")
    plt.legend()

def plot_acc(history):
    """Plots training and validation precision per epoch.

    Despite its name, this function reads the 'precision' and 'val_precision'
    entries of `history.history`, not accuracy.
    """
    recorded = history.history

    # training curve first, validation second, matching the legend order below
    for key in ('precision', 'val_precision'):
        plt.plot(recorded[key])

    plt.title('Model Precision')
    plt.ylabel('precision')
    plt.xlabel('Epoch')
    plt.legend(['train', 'val'], loc ='upper left')
    
def plot_loss(history):
    """Plots training and validation loss per epoch.

    Reads the 'loss' and 'val_loss' entries of `history.history`.
    """
    recorded = history.history

    # training curve first, validation second, matching the legend order below
    for key in ('loss', 'val_loss'):
        plt.plot(recorded[key])

    plt.title('Training/Validation loss')
    plt.ylabel('loss')
    plt.xlabel('Epoch')
    plt.legend(['train', 'val'], loc ='upper left')
    
def plot_f1(history):
    """Plots training and validation F scores per epoch.

    Reads the 'F1' and 'val_F1' entries of `history.history`.
    """
    recorded = history.history

    # training curve first, validation second, matching the legend order below
    for key in ('F1', 'val_F1'):
        plt.plot(recorded[key])

    plt.title('Model F scores')
    plt.ylabel('F scores')
    plt.xlabel('Epoch')
    plt.legend(['train', 'val'], loc ='upper left')

In [None]:
########################### Modeling choices ##############################
# Embedding matrix: row i receives the pretrained vector of the word whose
# tokenizer index is i.  Words without an entry in `embeddings_index` keep an
# all-zero row.
embedding_matrix = np.zeros((vocab_size, 300))
num_embed, dim_embed = embedding_matrix.shape

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

# CNN hyperparameter choices
num_filters = 1600  # total number of filters, shared out over the window sizes
max_ngram = 5       # largest convolution window size

# mini-batch size and maximum number of training epochs
batch_size = 512
num_epochs = 300

# dropout probabilities
prob_dropout_input = 0.6
prob_dropout_after_NN = 0.6
prob_dropout_rnn = 0.55


In [None]:
##################################### model definition #############################################
from tensorflow.keras import regularizers

# Window sizes 2..max_ngram.  The filters are shared out evenly among them;
# when the division leaves a remainder, the smallest window sizes get one
# extra filter each until the remainder is used up.
filter_size_list = list(range(2, max_ngram + 1))
base_filters, remainder = divmod(num_filters, max_ngram - 1)
filter_num_list = [base_filters + 1 if position < remainder else base_filters
                   for position in range(max_ngram - 1)]

# sets pseudorandom seeds
random.seed(1)
np.random.seed(1)
tf.random.set_seed(1)

with tpu_strategy.scope():

    model = Sequential([
        # frozen embedding layer initialised from the pretrained matrix
        Embedding(num_embed, dim_embed,
                  weights=[embedding_matrix],
                  input_length = max_l,
                  trainable = False),

        # dropout on the embedded input
        Dropout(prob_dropout_input),

        # CNN with several window sizes, max-pooled over the sequence
        Conv1D_multiple_filters(filter_size_list, filter_num_list,
                                activation = 'relu',
                                pooling_fun = GlobalMaxPooling1D),

        # dropout on the pooled features
        Dropout(prob_dropout_after_NN),

        # single sigmoid output unit with L2 weight penalty
        Dense(1, activation = 'sigmoid', kernel_regularizer=regularizers.l2(0.01)),
    ])

    # chooses loss function, optimizer, and evaluation metrics
    fscore = F1()
    model.compile(optimizer = 'adam',
                  loss = 'binary_crossentropy',
                  metrics = ['Precision', 'Recall', fscore])

    # prints model structure summary (just for user)
    model.summary()


In [None]:
################################ model fitting ###############################
# Stops training when the validation F1 ('val_F1', higher is better) has not
# improved for 15 epochs, and restores the weights of the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor = 'val_F1',
    mode = 'max',
    patience = 15,
    restore_best_weights = True,
)

# class weights passed to `fit`: class 1 is weighted 2.47 times class 0
pop_weight = {0:1.0, 1:2.47}

fit_options = dict(
    epochs = num_epochs,
    verbose = 1, # change to 0 if you don't want to print intermediate results during training
    validation_data = (x_valid_seq, y_valid),
    batch_size = batch_size,
    validation_batch_size = len(y_valid), # whole validation set as one batch
    callbacks = [early_stopping],
    class_weight = pop_weight,
)

history = model.fit(x_train_seq, y_train, **fit_options)


In [None]:
###################### Performance #############################
# model outputs on the test set, rounded to hard labels and flattened to 1-D
test_scores = model.predict(x_test_seq)
y_pred = np.round(test_scores).flatten()

# precision / recall / F1 of the positive class (average = 'binary')
precision, recall, f1, support = precision_recall_fscore_support(y_test, y_pred, average = 'binary')

# printing
for caption, score in (('precision : ', precision),
                       ('recall : ', recall),
                       ('F1 : ', f1)):
    print(caption, score)

In [None]:
#################### Plotting #################
# one figure per curve: loss, validation F1, precision
for draw_curve in (plot_loss, plot_F1, plot_acc):
    draw_curve(history)
    plt.show()

# names of the metrics recorded during training
print(history.history.keys())

# training vs. validation F scores
plot_f1(history)
plt.show()

In [None]:
##### accessing examples for error analysis

# positions of the false positives (predicted 1, labelled 0) and of the
# false negatives (predicted 0, labelled 1) in the test set
FPs = np.flatnonzero((y_pred == 1) & (y_test == 0)).tolist()
FNs = np.flatnonzero((y_pred == 0) & (y_test == 1)).tolist()

# prints out the first several indices
print('FP: ', FPs[:5])
print('FN: ', FNs[:5])

# example index to inspect
i = 2701

# its label, its prediction and its text
print('label: ', y_test[i])
print('prediction: ', y_pred[i])
print(test['text'].loc[i])