# Import packages and Libraries

In [None]:
import pandas as pd
import numpy as np
import pickle as pkl
import time

from sklearn.metrics import accuracy_score, confusion_matrix, cohen_kappa_score, precision_score, matthews_corrcoef, roc_auc_score, balanced_accuracy_score, recall_score, f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
from tensorflow import keras
from tensorflow.python.client import device_lib

from tqdm import tqdm

# ---- Call tqdm to see progress bar with pandas
tqdm().pandas()

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Display the installed TensorFlow version (shown as the notebook cell's output).
tf.__version__

In [None]:
# Ask TensorFlow for the physical GPU devices and print the name and
# device type of each one (nothing is printed when the list is empty).
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    print("Name:", gpu.name, "  Type:", gpu.device_type)

In [None]:
# Display the local devices reported by TensorFlow's device_lib (cell output).
device_lib.list_local_devices()

In [None]:
# List of parameters for the notebook: toggle the options and the models to run.
# Each model flag guards the cells that train/evaluate that model.
# NOTE(review): in the part of the notebook reviewed here only the flags from
# `shallow_network` down to `rcnn`, plus `pre_trained`, are read; the others
# (save_results, lang, sample and the classical-ML flags) are presumably used
# elsewhere -- confirm.
save_results           = True
lang                   = False
sample                 = False
multinomial_naive_bayes= True
logistic_regression    = True
svm_model              = True
k_nn_model             = True
sgd                    = True
random_forest          = True
gradient_boosting      = True
xgboost_classifier     = True
shallow_network        = True
deep_nn                = True
rnn                    = True
lstm                   = True
cnn                    = True
gru                    = True
cnn_lstm               = True
cnn_gru                = True
bidirectional_rnn      = True
bidirectional_lstm     = True
bidirectional_gru      = True
rcnn                   = True
# Passed to every model builder: True selects the frozen embedding initialised
# from the loaded embedding matrix, False a trainable embedding.
pre_trained            = True

# Import Data

In [None]:
# Directory holding the preprocessed pickles.
_DATA_DIR = '/kaggle/input/sarcasm-preprocessed/'

# For each pickle file (name template, filled with the dataset prefix): the
# dictionary keys of the objects stored in it, in the order they were dumped.
# Every file holds several consecutive pickles, so they must be read back
# with successive pkl.load calls in exactly this order.
_PICKLE_LAYOUT = (
    ('{}_x_values.pkl', (
        "xtrain_count", "xtrain_tfidf", "xtrain_tfidf_ngram",
        "xtrain_tfidf_ngram_chars", "train_seq_x",
        "xvalid_count", "xvalid_tfidf", "xvalid_tfidf_ngram",
        "xvalid_tfidf_ngram_chars", "valid_seq_x",
    )),
    ('{}_y_values.pkl', (
        "train_y_sw", "train_y", "valid_y_sw", "valid_y",
    )),
    ('{}_word_index_labels_weights.pkl', (
        "word_index", "labels", "class_weights",
    )),
    ('{}_embedding_matrix.pkl', (
        "embedding_matrix",
    )),
)


def _load_preprocessed(prefix):
    """Load every preprocessed object of one dataset into a dictionary.

    Args:
        prefix: Dataset name used as file-name prefix ('tweet' or 'news').

    Returns:
        dict mapping each key of ``_PICKLE_LAYOUT`` to the unpickled object.
    """
    data = {}
    for template, keys in _PICKLE_LAYOUT:
        # NOTE: pickle executes arbitrary code on load; only use trusted files.
        with open(_DATA_DIR + template.format(prefix), 'rb') as f:
            for key in keys:
                data[key] = pkl.load(f)
    return data


tweet_x_y = _load_preprocessed('tweet')
news_x_y = _load_preprocessed('news')

# Functions and Data Structures 

In [None]:
# One (initially empty) results table per dataset; each evaluated model adds a row.
tweet_results, news_results = pd.DataFrame(), pd.DataFrame()

In [None]:
def report(clf, x, y, name='classifier', cv=5, dict_scoring=None, fit_params=None):
    """Cross-validate an estimator and return its scores as a one-row DataFrame.

    Args:
        clf: Estimator passed to ``cross_validate``.
        x: Features.
        y: Targets.
        name: Value stored in the "Model" column.
        cv: Cross-validation setting forwarded to ``cross_validate``.
        dict_scoring: Optional mapping ``metric name -> metric function``; each
            function is wrapped with ``make_scorer``. When ``None`` the
            estimator's default scoring is used.
        fit_params: Extra fit parameters forwarded to ``cross_validate``.

    Returns:
        A one-row ``pd.DataFrame`` with the "Model" column followed, for every
        entry returned by ``cross_validate``, by its per-fold values
        (``<key>_cv<k>``), mean (``<key>_mean``) and standard deviation
        (``<key>_std``).
    """
    # Imported here because the notebook's import cell does not import it.
    from sklearn.model_selection import cross_validate

    # Stays None (estimator default scoring) when no metric mapping is supplied.
    score = None
    if dict_scoring is not None:
        score = {metric: make_scorer(func) for metric, func in dict_scoring.items()}

    # NOTE(review): recent scikit-learn releases replace `fit_params` with
    # `params` -- confirm against the installed version.
    scores = cross_validate(clf, x, y, scoring=score,
                            cv=cv, return_train_score=False, n_jobs=-1,
                            fit_params=fit_params)

    index = ["Model"]
    value = [name]
    for key, fold_values in scores.items():
        if key == "estimator":
            continue
        for fold, fold_value in enumerate(fold_values, start=1):
            index.append(key + "_cv" + str(fold))
            value.append(fold_value)

        index.append(key + "_mean")
        value.append(np.mean(fold_values))
        index.append(key + "_std")
        value.append(np.std(fold_values))

    return pd.DataFrame(data=value, index=index).T

In [None]:
def tn(y_true, y_pred):
    """Return cell [0, 0] of the confusion matrix."""
    matrix = confusion_matrix(y_true, y_pred)
    return matrix[0, 0]


def fp(y_true, y_pred):
    """Return cell [0, 1] of the confusion matrix."""
    matrix = confusion_matrix(y_true, y_pred)
    return matrix[0, 1]


def fn(y_true, y_pred):
    """Return cell [1, 0] of the confusion matrix."""
    matrix = confusion_matrix(y_true, y_pred)
    return matrix[1, 0]


def tp(y_true, y_pred):
    """Return cell [1, 1] of the confusion matrix."""
    matrix = confusion_matrix(y_true, y_pred)
    return matrix[1, 1]


# Metric name -> function(y_true, y_pred). The insertion order fixes the
# column order of the result tables.
score_metrics = dict([
    ('acc', accuracy_score),
    ('balanced_accuracy', balanced_accuracy_score),
    ('prec', precision_score),
    ('recall', recall_score),
    ('f1-score', f1_score),
    ('tp', tp),
    ('tn', tn),
    ('fp', fp),
    ('fn', fn),
    ('cohens_kappa', cohen_kappa_score),
    ('matthews_corrcoef', matthews_corrcoef),
    ("roc_auc", roc_auc_score),
])

In [None]:
# Early stopping on the 'loss' quantity (the training loss) with a patience of
# 3 epochs.
# NOTE(review): fit() is called with validation_split=0.2, so 'val_loss' may
# be the intended monitor -- confirm.
es = tf.keras.callbacks.EarlyStopping(monitor='loss', mode='auto', patience=3)
# Checkpoint callback targeting save_models/model.h5, saving only when the
# model is the best seen so far.
# NOTE(review): not passed to any fit() call in the code reviewed here.
check_p = tf.keras.callbacks.ModelCheckpoint("save_models/model.h5", save_best_only=True)

In [None]:
def cross_validate_NN(model, X, y, X_test, y_test, callbacks, name="NN", fit_params=None, scoring=None, n_splits=5):
    """Train a Keras model on each stratified fold and score it on a fixed test set.

    For every fold the model is reset to its initial weights, fitted on the
    fold's training indices of ``X``/``y`` and scored on ``X_test``/``y_test``
    (the fold's own held-out indices are not used for scoring).

    Args:
        model: Compiled Keras model. It is trained in place.
        X: Training inputs, indexable with an integer index array.
        y: Training targets, indexable with an integer index array.
        X_test: Inputs used to score every fold.
        y_test: Targets used to score every fold.
        callbacks: A single Keras callback passed to ``fit``.
        name: Value stored in the "Model" column.
        fit_params: Optional dict of keyword arguments for ``model.fit``; they
            override the defaults (epochs=1000, validation_split=0.2,
            verbose=False and the given callback).
        scoring: Optional mapping ``metric name -> function(y_true, y_pred)``.
        n_splits: Number of stratified folds.

    Returns:
        A one-row ``pd.DataFrame``: "Model", then per fold ``test_<metric>_cv<k>``,
        ``fit_time_cv<k>`` and ``score_time_cv<k>``, then the ``_mean`` and
        ``_std`` of every metric and timing.
    """
    # ---- Parameters initialisation
    seed = 42
    np.random.seed(seed)
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    # Metric functions first, timings last: this order defines the column order.
    dic_score = dict(scoring) if scoring is not None else {}
    dic_score["fit_time"] = None
    dic_score["score_time"] = None
    scorer = {key: [] for key in dic_score}

    def column(key):
        # Timing columns keep their bare name, metric columns get a "test_" prefix.
        return key if key in ("fit_time", "score_time") else "test_" + key

    fit_kwargs = {"epochs": 1000, "callbacks": [callbacks],
                  "validation_split": 0.2, "verbose": False}
    if fit_params:
        fit_kwargs.update(fit_params)

    # A model without a declared input shape has no weights until it is first
    # called, so build it before taking the snapshot of the untrained weights.
    if not model.built:
        model.predict(X[:1], verbose=0)
    initial_weights = model.get_weights()

    index = ["Model"]
    results = [name]
    # ---- Loop on k-fold for cross-validation
    for k, (train, _) in enumerate(kfold.split(X, y), start=1):
        print(f"k-fold : {k}")
        # Start every fold from the same initial weights; otherwise each fold
        # would continue training the model fitted on the previous folds.
        # NOTE(review): the optimizer state is not reset between folds.
        model.set_weights(initial_weights)

        fit_start = time.time()
        with tf.device('/GPU:0'):
            model.fit(X[train], y[train], **fit_kwargs)
        fit_end = time.time() - fit_start

        score_start = time.time()
        proba = np.asarray(model.predict(X_test))
        if proba.ndim > 1 and proba.shape[-1] > 1:
            # One column per class: predict the most probable class.
            y_pred = proba.argmax(axis=-1)
        else:
            # Single sigmoid output: threshold the probability at 0.5.
            y_pred = (proba > 0.5).astype(int).ravel()
        score_end = time.time() - score_start

        # ---- save each metric
        for key, func in dic_score.items():
            if key == "fit_time":
                fold_value = fit_end
            elif key == "score_time":
                fold_value = score_end
            else:
                fold_value = func(y_test, y_pred)
            scorer[key].append(fold_value)
            index.append(column(key) + '_cv' + str(k))
            results.append(fold_value)

    # Compute mean and std for each metric
    for key, fold_values in scorer.items():
        index.append(column(key) + "_mean")
        results.append(np.mean(fold_values))
        index.append(column(key) + "_std")
        results.append(np.std(fold_values))

    return pd.DataFrame(results, index=index).T

# Deep Learning Models

### Shallow Neural Networks 

In [None]:
def shallow_neural_networks(word_index, label, embedding_matrix, pre_trained=False):
    """Build and compile an embedding + global-average-pooling classifier.

    Args:
        word_index: Vocabulary collection; only its length is used
            (embedding input size is ``len(word_index) + 1``).
        label: Collection of class labels; only its length is used. Two or
            fewer labels give one sigmoid unit, otherwise one softmax unit
            per label.
        embedding_matrix: Initial weights of the frozen 300-dimensional
            embedding used when ``pre_trained`` is true (presumably shaped
            ``(len(word_index) + 1, 300)`` -- TODO confirm).
        pre_trained: Use the frozen pre-trained embedding instead of a
            trainable 16-dimensional one.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    vocab_size = len(word_index) + 1
    # Same test for the output head and the loss so they can never disagree.
    is_binary = len(label) <= 2

    if not pre_trained:
        embedded = keras.layers.Embedding(vocab_size, 16)
    else:
        embedded = keras.layers.Embedding(vocab_size, 300, weights=[embedding_matrix], trainable=False)

    model = keras.Sequential([
        embedded,
        keras.layers.GlobalAveragePooling1D(),
        keras.layers.Dense(1 if is_binary else len(label),
                           activation='sigmoid' if is_binary else 'softmax'),
    ])

    # The output layer already applies sigmoid/softmax, so the loss receives
    # probabilities and must not treat them as logits.
    if is_binary:
        loss = tf.losses.BinaryCrossentropy(from_logits=False)
    else:
        loss = tf.losses.SparseCategoricalCrossentropy(from_logits=False)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

    return model

In [None]:
%%time
if shallow_network:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    tweet_results = pd.concat([
        tweet_results,
        cross_validate_NN(
            shallow_neural_networks(tweet_x_y['word_index'], label=tweet_x_y['labels'],
                                    embedding_matrix=tweet_x_y['embedding_matrix'],
                                    pre_trained=pre_trained),
            tweet_x_y['train_seq_x'], tweet_x_y['train_y'],
            tweet_x_y['valid_seq_x'], tweet_x_y['valid_y'],
            es, name="Shallow_NN_WE", scoring=score_metrics, n_splits=5),
    ])

In [None]:
%%time
if shallow_network:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    news_results = pd.concat([
        news_results,
        cross_validate_NN(
            shallow_neural_networks(news_x_y['word_index'], label=news_x_y['labels'],
                                    embedding_matrix=news_x_y['embedding_matrix'],
                                    pre_trained=pre_trained),
            news_x_y['train_seq_x'], news_x_y['train_y'],
            news_x_y['valid_seq_x'], news_x_y['valid_y'],
            es, name="Shallow_NN_WE", scoring=score_metrics, n_splits=5),
    ])

### Deep Neural Networks

In [None]:
def deep_neural_networks(word_index, label, embedding_matrix, pre_trained=False):
    """Build and compile an embedding classifier with one hidden dense layer.

    Architecture: embedding -> global average pooling -> Dense(16, relu) ->
    output layer.

    Args:
        word_index: Vocabulary collection; only its length is used
            (embedding input size is ``len(word_index) + 1``).
        label: Collection of class labels; only its length is used. Two or
            fewer labels give one sigmoid unit, otherwise one softmax unit
            per label.
        embedding_matrix: Initial weights of the frozen 300-dimensional
            embedding used when ``pre_trained`` is true (presumably shaped
            ``(len(word_index) + 1, 300)`` -- TODO confirm).
        pre_trained: Use the frozen pre-trained embedding instead of a
            trainable 50-dimensional one.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    vocab_size = len(word_index) + 1
    # Same test for the output head and the loss so they can never disagree.
    is_binary = len(label) <= 2

    if not pre_trained:
        embedded = keras.layers.Embedding(vocab_size, 50)
    else:
        embedded = keras.layers.Embedding(vocab_size, 300, weights=[embedding_matrix], trainable=False)

    model = keras.Sequential([
        embedded,
        keras.layers.GlobalAveragePooling1D(),
        keras.layers.Dense(16, activation='relu'),
        keras.layers.Dense(1 if is_binary else len(label),
                           activation='sigmoid' if is_binary else 'softmax'),
    ])

    # The output layer already applies sigmoid/softmax, so the loss receives
    # probabilities and must not treat them as logits.
    if is_binary:
        loss = tf.losses.BinaryCrossentropy(from_logits=False)
    else:
        loss = tf.losses.SparseCategoricalCrossentropy(from_logits=False)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

    return model

In [None]:
%%time
if deep_nn:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    tweet_results = pd.concat([
        tweet_results,
        cross_validate_NN(
            deep_neural_networks(tweet_x_y['word_index'], label=tweet_x_y['labels'],
                                 embedding_matrix=tweet_x_y['embedding_matrix'],
                                 pre_trained=pre_trained),
            tweet_x_y['train_seq_x'], tweet_x_y['train_y'],
            tweet_x_y['valid_seq_x'], tweet_x_y['valid_y'],
            es, name="Deep_NN_WE", scoring=score_metrics, n_splits=5),
    ])

In [None]:
%%time
if deep_nn:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    news_results = pd.concat([
        news_results,
        cross_validate_NN(
            deep_neural_networks(news_x_y['word_index'], label=news_x_y['labels'],
                                 embedding_matrix=news_x_y['embedding_matrix'],
                                 pre_trained=pre_trained),
            news_x_y['train_seq_x'], news_x_y['train_y'],
            news_x_y['valid_seq_x'], news_x_y['valid_y'],
            es, name="Deep_NN_WE", scoring=score_metrics, n_splits=5),
    ])

In [None]:
def deep_neural_networks_var1(word_index, label, embedding_matrix, pre_trained=False):
    """Build and compile an embedding classifier with two 16-unit hidden layers.

    Architecture: embedding -> global average pooling -> Dense(16, relu) ->
    Dense(16, relu) -> output layer.

    Args:
        word_index: Vocabulary collection; only its length is used
            (embedding input size is ``len(word_index) + 1``).
        label: Collection of class labels; only its length is used. Two or
            fewer labels give one sigmoid unit, otherwise one softmax unit
            per label.
        embedding_matrix: Initial weights of the frozen 300-dimensional
            embedding used when ``pre_trained`` is true (presumably shaped
            ``(len(word_index) + 1, 300)`` -- TODO confirm).
        pre_trained: Use the frozen pre-trained embedding instead of a
            trainable 100-dimensional one.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    vocab_size = len(word_index) + 1
    # Same test for the output head and the loss so they can never disagree.
    is_binary = len(label) <= 2

    if not pre_trained:
        embedded = keras.layers.Embedding(vocab_size, 100)
    else:
        embedded = keras.layers.Embedding(vocab_size, 300, weights=[embedding_matrix], trainable=False)

    model = keras.Sequential([
        embedded,
        keras.layers.GlobalAveragePooling1D(),
        keras.layers.Dense(16, activation='relu'),
        keras.layers.Dense(16, activation='relu'),
        keras.layers.Dense(1 if is_binary else len(label),
                           activation='sigmoid' if is_binary else 'softmax'),
    ])

    # The output layer already applies sigmoid/softmax, so the loss receives
    # probabilities and must not treat them as logits.
    if is_binary:
        loss = tf.losses.BinaryCrossentropy(from_logits=False)
    else:
        loss = tf.losses.SparseCategoricalCrossentropy(from_logits=False)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

    return model

In [None]:
%%time
if deep_nn:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    tweet_results = pd.concat([
        tweet_results,
        cross_validate_NN(
            deep_neural_networks_var1(tweet_x_y['word_index'], label=tweet_x_y['labels'],
                                      embedding_matrix=tweet_x_y['embedding_matrix'],
                                      pre_trained=pre_trained),
            tweet_x_y['train_seq_x'], tweet_x_y['train_y'],
            tweet_x_y['valid_seq_x'], tweet_x_y['valid_y'],
            es, name="Deep_NN_var1_WE", scoring=score_metrics, n_splits=5),
    ])

In [None]:
%%time
if deep_nn:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    news_results = pd.concat([
        news_results,
        cross_validate_NN(
            deep_neural_networks_var1(news_x_y['word_index'], label=news_x_y['labels'],
                                      embedding_matrix=news_x_y['embedding_matrix'],
                                      pre_trained=pre_trained),
            news_x_y['train_seq_x'], news_x_y['train_y'],
            news_x_y['valid_seq_x'], news_x_y['valid_y'],
            es, name="Deep_NN_var1_WE", scoring=score_metrics, n_splits=5),
    ])

In [None]:
def deep_neural_networks_var2(word_index, label, embedding_matrix, pre_trained=False):
    """Build and compile an embedding classifier with 32- and 16-unit hidden layers.

    Architecture: embedding -> global average pooling -> Dense(32, relu) ->
    Dense(16, relu) -> output layer.

    Args:
        word_index: Vocabulary collection; only its length is used
            (embedding input size is ``len(word_index) + 1``).
        label: Collection of class labels; only its length is used. Two or
            fewer labels give one sigmoid unit, otherwise one softmax unit
            per label.
        embedding_matrix: Initial weights of the frozen 300-dimensional
            embedding used when ``pre_trained`` is true (presumably shaped
            ``(len(word_index) + 1, 300)`` -- TODO confirm).
        pre_trained: Use the frozen pre-trained embedding instead of a
            trainable 100-dimensional one.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    vocab_size = len(word_index) + 1
    # Same test for the output head and the loss so they can never disagree.
    is_binary = len(label) <= 2

    if not pre_trained:
        embedded = keras.layers.Embedding(vocab_size, 100)
    else:
        embedded = keras.layers.Embedding(vocab_size, 300, weights=[embedding_matrix], trainable=False)

    model = keras.Sequential([
        embedded,
        keras.layers.GlobalAveragePooling1D(),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dense(16, activation='relu'),
        keras.layers.Dense(1 if is_binary else len(label),
                           activation='sigmoid' if is_binary else 'softmax'),
    ])

    # The output layer already applies sigmoid/softmax, so the loss receives
    # probabilities and must not treat them as logits.
    if is_binary:
        loss = tf.losses.BinaryCrossentropy(from_logits=False)
    else:
        loss = tf.losses.SparseCategoricalCrossentropy(from_logits=False)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

    return model


In [None]:
%%time
if deep_nn:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    tweet_results = pd.concat([
        tweet_results,
        cross_validate_NN(
            deep_neural_networks_var2(tweet_x_y['word_index'], label=tweet_x_y['labels'],
                                      embedding_matrix=tweet_x_y['embedding_matrix'],
                                      pre_trained=pre_trained),
            tweet_x_y['train_seq_x'], tweet_x_y['train_y'],
            tweet_x_y['valid_seq_x'], tweet_x_y['valid_y'],
            es, name="Deep_NN_var2_WE", scoring=score_metrics, n_splits=5),
    ])

In [None]:
%%time
if deep_nn:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    news_results = pd.concat([
        news_results,
        cross_validate_NN(
            deep_neural_networks_var2(news_x_y['word_index'], label=news_x_y['labels'],
                                      embedding_matrix=news_x_y['embedding_matrix'],
                                      pre_trained=pre_trained),
            news_x_y['train_seq_x'], news_x_y['train_y'],
            news_x_y['valid_seq_x'], news_x_y['valid_y'],
            es, name="Deep_NN_var2_WE", scoring=score_metrics, n_splits=5),
    ])

### Convolutional Neural Network

In [None]:
def create_conv_model(word_index, label, embedding_matrix, pre_trained=False):
    """Build and compile a three-stage 1D convolutional classifier.

    Architecture: embedding -> [Conv1D(100, 5) -> Dropout(0.2) -> MaxPool(4)]
    -> [Conv1D(64, 5) -> Dropout(0.2) -> MaxPool(4)] -> Conv1D(32, 5) ->
    Dropout(0.2) -> global max pooling -> output layer.

    Args:
        word_index: Vocabulary collection; only its length is used
            (embedding input size is ``len(word_index) + 1``).
        label: Collection of class labels; only its length is used. Two or
            fewer labels give one sigmoid unit, otherwise one softmax unit
            per label.
        embedding_matrix: Initial weights of the frozen 300-dimensional
            embedding used when ``pre_trained`` is true (presumably shaped
            ``(len(word_index) + 1, 300)`` -- TODO confirm).
        pre_trained: Use the frozen pre-trained embedding instead of a
            trainable 100-dimensional one.

    Returns:
        The compiled ``keras.Sequential`` model (RMSprop with learning rate
        1e-4 in the binary case, Adam otherwise).
    """
    vocab_size = len(word_index) + 1
    # Same test for the output head and the loss so they can never disagree.
    is_binary = len(label) <= 2

    if not pre_trained:
        embedded = keras.layers.Embedding(vocab_size, 100)
    else:
        embedded = keras.layers.Embedding(vocab_size, 300, weights=[embedding_matrix], trainable=False)

    model = keras.Sequential([
        embedded,
        keras.layers.Conv1D(100, 5, activation='relu'),  # padding='same'
        keras.layers.Dropout(0.2),
        keras.layers.MaxPooling1D(pool_size=4),
        keras.layers.Conv1D(64, 5, activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.MaxPooling1D(pool_size=4),
        keras.layers.Conv1D(32, 5, activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.GlobalMaxPooling1D(),
        keras.layers.Dense(1 if is_binary else len(label),
                           activation='sigmoid' if is_binary else 'softmax'),
    ])

    # The output layer already applies sigmoid/softmax, so the loss receives
    # probabilities and must not treat them as logits.
    if is_binary:
        # `learning_rate` is the supported name of the deprecated `lr` argument.
        optimizer = tf.keras.optimizers.RMSprop(learning_rate=1e-4)
        loss = tf.losses.BinaryCrossentropy(from_logits=False)
    else:
        optimizer = 'adam'
        loss = tf.losses.SparseCategoricalCrossentropy(from_logits=False)
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    return model

In [None]:
%%time
if cnn:
    # Build the convolutional model here (the RNN builder was called by
    # mistake, so the "CNN_WE" row did not describe a CNN).
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    tweet_results = pd.concat([
        tweet_results,
        cross_validate_NN(
            create_conv_model(tweet_x_y['word_index'], label=tweet_x_y['labels'],
                              embedding_matrix=tweet_x_y['embedding_matrix'],
                              pre_trained=pre_trained),
            tweet_x_y['train_seq_x'], tweet_x_y['train_y'],
            tweet_x_y['valid_seq_x'], tweet_x_y['valid_y'],
            es, name="CNN_WE", scoring=score_metrics, n_splits=5),
    ])

In [None]:
%%time
if cnn:
    # Build the convolutional model here (the RNN builder was called by
    # mistake, so the "CNN_WE" row did not describe a CNN).
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    news_results = pd.concat([
        news_results,
        cross_validate_NN(
            create_conv_model(news_x_y['word_index'], label=news_x_y['labels'],
                              embedding_matrix=news_x_y['embedding_matrix'],
                              pre_trained=pre_trained),
            news_x_y['train_seq_x'], news_x_y['train_y'],
            news_x_y['valid_seq_x'], news_x_y['valid_y'],
            es, name="CNN_WE", scoring=score_metrics, n_splits=5),
    ])

### Recurrent Neural Network

In [None]:
def create_rnn_model(word_index, label, embedding_matrix, pre_trained=False):
    """Build and compile a classifier made of four stacked SimpleRNN(40) layers.

    The first three recurrent layers return full sequences; the last one
    returns only its final state, which feeds the output layer.

    Args:
        word_index: Vocabulary collection; only its length is used
            (embedding input size is ``len(word_index) + 1``).
        label: Collection of class labels; only its length is used. Two or
            fewer labels give one sigmoid unit, otherwise one softmax unit
            per label.
        embedding_matrix: Initial weights of the frozen 300-dimensional
            embedding used when ``pre_trained`` is true (presumably shaped
            ``(len(word_index) + 1, 300)`` -- TODO confirm).
        pre_trained: Use the frozen pre-trained embedding instead of a
            trainable 100-dimensional one.

    Returns:
        The compiled ``keras.Sequential`` model (RMSprop with learning rate
        1e-4 in the binary case, Adam otherwise).
    """
    vocab_size = len(word_index) + 1
    # Same test for the output head and the loss so they can never disagree.
    is_binary = len(label) <= 2

    if not pre_trained:
        embedded = keras.layers.Embedding(vocab_size, 100)
    else:
        embedded = keras.layers.Embedding(vocab_size, 300, weights=[embedding_matrix], trainable=False)

    model = keras.Sequential([
        embedded,
        keras.layers.SimpleRNN(40, return_sequences=True),
        keras.layers.SimpleRNN(40, return_sequences=True),
        keras.layers.SimpleRNN(40, return_sequences=True),
        keras.layers.SimpleRNN(40),
        keras.layers.Dense(1 if is_binary else len(label),
                           activation='sigmoid' if is_binary else 'softmax'),
    ])

    # The output layer already applies sigmoid/softmax, so the loss receives
    # probabilities and must not treat them as logits.
    if is_binary:
        # `learning_rate` is the supported name of the deprecated `lr` argument.
        optimizer = tf.keras.optimizers.RMSprop(learning_rate=1e-4)
        loss = tf.losses.BinaryCrossentropy(from_logits=False)
    else:
        optimizer = 'adam'
        loss = tf.losses.SparseCategoricalCrossentropy(from_logits=False)
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    return model

In [None]:
%%time
if rnn:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    tweet_results = pd.concat([
        tweet_results,
        cross_validate_NN(
            create_rnn_model(tweet_x_y['word_index'], label=tweet_x_y['labels'],
                             embedding_matrix=tweet_x_y['embedding_matrix'],
                             pre_trained=pre_trained),
            tweet_x_y['train_seq_x'], tweet_x_y['train_y'],
            tweet_x_y['valid_seq_x'], tweet_x_y['valid_y'],
            es, name="RNN_WE", scoring=score_metrics, n_splits=5),
    ])

In [None]:
%%time
if rnn:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    news_results = pd.concat([
        news_results,
        cross_validate_NN(
            create_rnn_model(news_x_y['word_index'], label=news_x_y['labels'],
                             embedding_matrix=news_x_y['embedding_matrix'],
                             pre_trained=pre_trained),
            news_x_y['train_seq_x'], news_x_y['train_y'],
            news_x_y['valid_seq_x'], news_x_y['valid_y'],
            es, name="RNN_WE", scoring=score_metrics, n_splits=5),
    ])

#### LSTM

In [None]:
def create_lstm_model(word_index, label, embedding_matrix, pre_trained=False):
    """Build and compile an embedding -> LSTM(32) -> Dropout(0.2) classifier.

    Args:
        word_index: Vocabulary collection; only its length is used
            (embedding input size is ``len(word_index) + 1``).
        label: Collection of class labels; only its length is used. Two or
            fewer labels give one sigmoid unit, otherwise one softmax unit
            per label.
        embedding_matrix: Initial weights of the frozen 300-dimensional
            embedding used when ``pre_trained`` is true (presumably shaped
            ``(len(word_index) + 1, 300)`` -- TODO confirm).
        pre_trained: Use the frozen pre-trained embedding instead of a
            trainable 100-dimensional one.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    vocab_size = len(word_index) + 1
    # Same test for the output head and the loss so they can never disagree.
    is_binary = len(label) <= 2

    if not pre_trained:
        embedded = keras.layers.Embedding(vocab_size, 100)
    else:
        embedded = keras.layers.Embedding(vocab_size, 300, weights=[embedding_matrix], trainable=False)

    model = keras.Sequential([
        embedded,
        keras.layers.LSTM(32),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(1 if is_binary else len(label),
                           activation='sigmoid' if is_binary else 'softmax'),
    ])

    # The output layer already applies sigmoid/softmax, so the loss receives
    # probabilities and must not treat them as logits.
    if is_binary:
        loss = tf.losses.BinaryCrossentropy(from_logits=False)
    else:
        loss = tf.losses.SparseCategoricalCrossentropy(from_logits=False)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

    return model

In [None]:
%%time
if lstm:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    tweet_results = pd.concat([
        tweet_results,
        cross_validate_NN(
            create_lstm_model(tweet_x_y['word_index'], label=tweet_x_y['labels'],
                              embedding_matrix=tweet_x_y['embedding_matrix'],
                              pre_trained=pre_trained),
            tweet_x_y['train_seq_x'], tweet_x_y['train_y'],
            tweet_x_y['valid_seq_x'], tweet_x_y['valid_y'],
            es, name="LSTM_WE", scoring=score_metrics, n_splits=5),
    ])

In [None]:
%%time
if lstm:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    news_results = pd.concat([
        news_results,
        cross_validate_NN(
            create_lstm_model(news_x_y['word_index'], label=news_x_y['labels'],
                              embedding_matrix=news_x_y['embedding_matrix'],
                              pre_trained=pre_trained),
            news_x_y['train_seq_x'], news_x_y['train_y'],
            news_x_y['valid_seq_x'], news_x_y['valid_y'],
            es, name="LSTM_WE", scoring=score_metrics, n_splits=5),
    ])

#### GRU

In [None]:
def create_gru_model(word_index, label, embedding_matrix, pre_trained=False):
    """Build and compile an embedding -> GRU(32) -> Dropout(0.2) classifier.

    Args:
        word_index: Vocabulary collection; only its length is used
            (embedding input size is ``len(word_index) + 1``).
        label: Collection of class labels; only its length is used. Two or
            fewer labels give one sigmoid unit, otherwise one softmax unit
            per label.
        embedding_matrix: Initial weights of the frozen 300-dimensional
            embedding used when ``pre_trained`` is true (presumably shaped
            ``(len(word_index) + 1, 300)`` -- TODO confirm).
        pre_trained: Use the frozen pre-trained embedding instead of a
            trainable 100-dimensional one.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    vocab_size = len(word_index) + 1
    # Same test for the output head and the loss so they can never disagree.
    is_binary = len(label) <= 2

    if not pre_trained:
        embedded = keras.layers.Embedding(vocab_size, 100)
    else:
        embedded = keras.layers.Embedding(vocab_size, 300, weights=[embedding_matrix], trainable=False)

    model = keras.Sequential([
        embedded,
        keras.layers.GRU(32),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(1 if is_binary else len(label),
                           activation='sigmoid' if is_binary else 'softmax'),
    ])

    # The output layer already applies sigmoid/softmax, so the loss receives
    # probabilities and must not treat them as logits.
    if is_binary:
        loss = tf.losses.BinaryCrossentropy(from_logits=False)
    else:
        loss = tf.losses.SparseCategoricalCrossentropy(from_logits=False)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

    return model

In [None]:
%%time
if gru:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    tweet_results = pd.concat([
        tweet_results,
        cross_validate_NN(
            create_gru_model(tweet_x_y['word_index'], label=tweet_x_y['labels'],
                             embedding_matrix=tweet_x_y['embedding_matrix'],
                             pre_trained=pre_trained),
            tweet_x_y['train_seq_x'], tweet_x_y['train_y'],
            tweet_x_y['valid_seq_x'], tweet_x_y['valid_y'],
            es, name="GRU_WE", scoring=score_metrics, n_splits=5),
    ])

In [None]:
%%time
if gru:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    news_results = pd.concat([
        news_results,
        cross_validate_NN(
            create_gru_model(news_x_y['word_index'], label=news_x_y['labels'],
                             embedding_matrix=news_x_y['embedding_matrix'],
                             pre_trained=pre_trained),
            news_x_y['train_seq_x'], news_x_y['train_y'],
            news_x_y['valid_seq_x'], news_x_y['valid_y'],
            es, name="GRU_WE", scoring=score_metrics, n_splits=5),
    ])

#### CNN - LSTM

In [None]:
def create_cnn_lstm_model(word_index, label, embedding_matrix, pre_trained=False):
    """Build and compile a convolution-then-LSTM classifier.

    Architecture: embedding -> Conv1D(128, 5) -> Dropout(0.2) -> MaxPool(4)
    -> LSTM(32) -> output layer.

    Args:
        word_index: Vocabulary collection; only its length is used
            (embedding input size is ``len(word_index) + 1``).
        label: Collection of class labels; only its length is used. Two or
            fewer labels give one sigmoid unit, otherwise one softmax unit
            per label.
        embedding_matrix: Initial weights of the frozen 300-dimensional
            embedding used when ``pre_trained`` is true (presumably shaped
            ``(len(word_index) + 1, 300)`` -- TODO confirm).
        pre_trained: Use the frozen pre-trained embedding instead of a
            trainable 100-dimensional one.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    vocab_size = len(word_index) + 1
    # Same test for the output head and the loss so they can never disagree.
    is_binary = len(label) <= 2

    if not pre_trained:
        embedded = keras.layers.Embedding(vocab_size, 100)
    else:
        embedded = keras.layers.Embedding(vocab_size, 300, weights=[embedding_matrix], trainable=False)

    model = keras.Sequential([
        embedded,
        keras.layers.Conv1D(128, 5, activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.MaxPooling1D(pool_size=4),
        keras.layers.LSTM(32),
        keras.layers.Dense(1 if is_binary else len(label),
                           activation='sigmoid' if is_binary else 'softmax'),
    ])

    # The output layer already applies sigmoid/softmax, so the loss receives
    # probabilities and must not treat them as logits.
    if is_binary:
        loss = tf.losses.BinaryCrossentropy(from_logits=False)
    else:
        loss = tf.losses.SparseCategoricalCrossentropy(from_logits=False)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

    return model

In [None]:
%%time
if cnn_lstm:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    tweet_results = pd.concat([
        tweet_results,
        cross_validate_NN(
            create_cnn_lstm_model(tweet_x_y['word_index'], label=tweet_x_y['labels'],
                                  embedding_matrix=tweet_x_y['embedding_matrix'],
                                  pre_trained=pre_trained),
            tweet_x_y['train_seq_x'], tweet_x_y['train_y'],
            tweet_x_y['valid_seq_x'], tweet_x_y['valid_y'],
            es, name="CNN_LSTM_WE", scoring=score_metrics, n_splits=5),
    ])

In [None]:
%%time
if cnn_lstm:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    news_results = pd.concat([
        news_results,
        cross_validate_NN(
            create_cnn_lstm_model(news_x_y['word_index'], label=news_x_y['labels'],
                                  embedding_matrix=news_x_y['embedding_matrix'],
                                  pre_trained=pre_trained),
            news_x_y['train_seq_x'], news_x_y['train_y'],
            news_x_y['valid_seq_x'], news_x_y['valid_y'],
            es, name="CNN_LSTM_WE", scoring=score_metrics, n_splits=5),
    ])

#### CNN - GRU

In [None]:
def create_cnn_gru_model(word_index, label, embedding_matrix, pre_trained=False):
    """Build and compile a convolution-then-GRU classifier.

    Architecture: embedding -> Conv1D(128, 5) -> Dropout(0.2) -> MaxPool(4)
    -> GRU(32) -> output layer.

    Args:
        word_index: Vocabulary collection; only its length is used
            (embedding input size is ``len(word_index) + 1``).
        label: Collection of class labels; only its length is used. Two or
            fewer labels give one sigmoid unit, otherwise one softmax unit
            per label.
        embedding_matrix: Initial weights of the frozen 300-dimensional
            embedding used when ``pre_trained`` is true (presumably shaped
            ``(len(word_index) + 1, 300)`` -- TODO confirm).
        pre_trained: Use the frozen pre-trained embedding instead of a
            trainable 100-dimensional one.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    vocab_size = len(word_index) + 1
    # Same test for the output head and the loss so they can never disagree.
    is_binary = len(label) <= 2

    if not pre_trained:
        embedded = keras.layers.Embedding(vocab_size, 100)
    else:
        embedded = keras.layers.Embedding(vocab_size, 300, weights=[embedding_matrix], trainable=False)

    model = keras.Sequential([
        embedded,
        keras.layers.Conv1D(128, 5, activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.MaxPooling1D(pool_size=4),
        keras.layers.GRU(32),
        keras.layers.Dense(1 if is_binary else len(label),
                           activation='sigmoid' if is_binary else 'softmax'),
    ])

    # The output layer already applies sigmoid/softmax, so the loss receives
    # probabilities and must not treat them as logits.
    if is_binary:
        loss = tf.losses.BinaryCrossentropy(from_logits=False)
    else:
        loss = tf.losses.SparseCategoricalCrossentropy(from_logits=False)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

    return model

In [None]:
%%time
if cnn_gru:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    tweet_results = pd.concat([
        tweet_results,
        cross_validate_NN(
            create_cnn_gru_model(tweet_x_y['word_index'], label=tweet_x_y['labels'],
                                 embedding_matrix=tweet_x_y['embedding_matrix'],
                                 pre_trained=pre_trained),
            tweet_x_y['train_seq_x'], tweet_x_y['train_y'],
            tweet_x_y['valid_seq_x'], tweet_x_y['valid_y'],
            es, name="CNN_GRU_WE", scoring=score_metrics, n_splits=5),
    ])

In [None]:
%%time
if cnn_gru:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    news_results = pd.concat([
        news_results,
        cross_validate_NN(
            create_cnn_gru_model(news_x_y['word_index'], label=news_x_y['labels'],
                                 embedding_matrix=news_x_y['embedding_matrix'],
                                 pre_trained=pre_trained),
            news_x_y['train_seq_x'], news_x_y['train_y'],
            news_x_y['valid_seq_x'], news_x_y['valid_y'],
            es, name="CNN_GRU_WE", scoring=score_metrics, n_splits=5),
    ])

### Bidirectional RNN

In [None]:
def create_bidirec_rnn_model(word_index, label, embedding_matrix, pre_trained=False):
    """Build and compile a classifier of four stacked bidirectional SimpleRNN(32) layers.

    The first three recurrent layers return full sequences; the last one
    returns only its final state, followed by Dropout(0.2) and the output
    layer.

    Args:
        word_index: Vocabulary collection; only its length is used
            (embedding input size is ``len(word_index) + 1``).
        label: Collection of class labels; only its length is used. Two or
            fewer labels give one sigmoid unit, otherwise one softmax unit
            per label.
        embedding_matrix: Initial weights of the frozen 300-dimensional
            embedding used when ``pre_trained`` is true (presumably shaped
            ``(len(word_index) + 1, 300)`` -- TODO confirm).
        pre_trained: Use the frozen pre-trained embedding instead of a
            trainable 100-dimensional one.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    vocab_size = len(word_index) + 1
    # Same test for the output head and the loss so they can never disagree.
    is_binary = len(label) <= 2

    if not pre_trained:
        embedded = keras.layers.Embedding(vocab_size, 100)
    else:
        embedded = keras.layers.Embedding(vocab_size, 300, weights=[embedding_matrix], trainable=False)

    model = keras.Sequential([
        embedded,
        keras.layers.Bidirectional(keras.layers.SimpleRNN(32, return_sequences=True)),
        keras.layers.Bidirectional(keras.layers.SimpleRNN(32, return_sequences=True)),
        keras.layers.Bidirectional(keras.layers.SimpleRNN(32, return_sequences=True)),
        keras.layers.Bidirectional(keras.layers.SimpleRNN(32)),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(1 if is_binary else len(label),
                           activation='sigmoid' if is_binary else 'softmax'),
    ])

    # The output layer already applies sigmoid/softmax, so the loss receives
    # probabilities and must not treat them as logits.
    if is_binary:
        loss = tf.losses.BinaryCrossentropy(from_logits=False)
    else:
        loss = tf.losses.SparseCategoricalCrossentropy(from_logits=False)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

    return model

In [None]:
%%time
if bidirectional_rnn:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    tweet_results = pd.concat([
        tweet_results,
        cross_validate_NN(
            create_bidirec_rnn_model(tweet_x_y['word_index'], label=tweet_x_y['labels'],
                                     embedding_matrix=tweet_x_y['embedding_matrix'],
                                     pre_trained=pre_trained),
            tweet_x_y['train_seq_x'], tweet_x_y['train_y'],
            tweet_x_y['valid_seq_x'], tweet_x_y['valid_y'],
            es, name="BiRNN_WE", scoring=score_metrics, n_splits=5),
    ])

In [None]:
%%time
if bidirectional_rnn:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    news_results = pd.concat([
        news_results,
        cross_validate_NN(
            create_bidirec_rnn_model(news_x_y['word_index'], label=news_x_y['labels'],
                                     embedding_matrix=news_x_y['embedding_matrix'],
                                     pre_trained=pre_trained),
            news_x_y['train_seq_x'], news_x_y['train_y'],
            news_x_y['valid_seq_x'], news_x_y['valid_y'],
            es, name="BiRNN_WE", scoring=score_metrics, n_splits=5),
    ])

#### Bidirectional LSTM

In [None]:
def create_bidirec_lstm_model(word_index, label, embedding_matrix, pre_trained=False):
    """Build and compile an embedding -> Bidirectional(LSTM(32)) -> Dropout(0.2) classifier.

    Args:
        word_index: Vocabulary collection; only its length is used
            (embedding input size is ``len(word_index) + 1``).
        label: Collection of class labels; only its length is used. Two or
            fewer labels give one sigmoid unit, otherwise one softmax unit
            per label.
        embedding_matrix: Initial weights of the frozen 300-dimensional
            embedding used when ``pre_trained`` is true (presumably shaped
            ``(len(word_index) + 1, 300)`` -- TODO confirm).
        pre_trained: Use the frozen pre-trained embedding instead of a
            trainable 100-dimensional one.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    vocab_size = len(word_index) + 1
    # Same test for the output head and the loss so they can never disagree.
    is_binary = len(label) <= 2

    if not pre_trained:
        embedded = keras.layers.Embedding(vocab_size, 100)
    else:
        embedded = keras.layers.Embedding(vocab_size, 300, weights=[embedding_matrix], trainable=False)

    model = keras.Sequential([
        embedded,
        keras.layers.Bidirectional(keras.layers.LSTM(32)),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(1 if is_binary else len(label),
                           activation='sigmoid' if is_binary else 'softmax'),
    ])

    # The output layer already applies sigmoid/softmax, so the loss receives
    # probabilities and must not treat them as logits.
    if is_binary:
        loss = tf.losses.BinaryCrossentropy(from_logits=False)
    else:
        loss = tf.losses.SparseCategoricalCrossentropy(from_logits=False)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

    return model

In [None]:
%%time
if bidirectional_lstm:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    tweet_results = pd.concat([
        tweet_results,
        cross_validate_NN(
            create_bidirec_lstm_model(tweet_x_y['word_index'], label=tweet_x_y['labels'],
                                      embedding_matrix=tweet_x_y['embedding_matrix'],
                                      pre_trained=pre_trained),
            tweet_x_y['train_seq_x'], tweet_x_y['train_y'],
            tweet_x_y['valid_seq_x'], tweet_x_y['valid_y'],
            es, name="BiLSTM_WE", scoring=score_metrics, n_splits=5),
    ])

In [None]:
%%time
if bidirectional_lstm:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    news_results = pd.concat([
        news_results,
        cross_validate_NN(
            create_bidirec_lstm_model(news_x_y['word_index'], label=news_x_y['labels'],
                                      embedding_matrix=news_x_y['embedding_matrix'],
                                      pre_trained=pre_trained),
            news_x_y['train_seq_x'], news_x_y['train_y'],
            news_x_y['valid_seq_x'], news_x_y['valid_y'],
            es, name="BiLSTM_WE", scoring=score_metrics, n_splits=5),
    ])

#### Bidirectional GRU

In [None]:
def create_bidirec_gru_model(word_index, label, embedding_matrix, pre_trained=False):
    """Build and compile an embedding -> Bidirectional(GRU(32)) -> Dropout(0.2) classifier.

    Args:
        word_index: Vocabulary collection; only its length is used
            (embedding input size is ``len(word_index) + 1``).
        label: Collection of class labels; only its length is used. Two or
            fewer labels give one sigmoid unit, otherwise one softmax unit
            per label.
        embedding_matrix: Initial weights of the frozen 300-dimensional
            embedding used when ``pre_trained`` is true (presumably shaped
            ``(len(word_index) + 1, 300)`` -- TODO confirm).
        pre_trained: Use the frozen pre-trained embedding instead of a
            trainable 100-dimensional one.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    vocab_size = len(word_index) + 1
    # Same test for the output head and the loss so they can never disagree.
    is_binary = len(label) <= 2

    if not pre_trained:
        embedded = keras.layers.Embedding(vocab_size, 100)
    else:
        embedded = keras.layers.Embedding(vocab_size, 300, weights=[embedding_matrix], trainable=False)

    model = keras.Sequential([
        embedded,
        keras.layers.Bidirectional(keras.layers.GRU(32)),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(1 if is_binary else len(label),
                           activation='sigmoid' if is_binary else 'softmax'),
    ])

    # The output layer already applies sigmoid/softmax, so the loss receives
    # probabilities and must not treat them as logits.
    if is_binary:
        loss = tf.losses.BinaryCrossentropy(from_logits=False)
    else:
        loss = tf.losses.SparseCategoricalCrossentropy(from_logits=False)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

    return model

In [None]:
%%time
if bidirectional_gru:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    tweet_results = pd.concat([
        tweet_results,
        cross_validate_NN(
            create_bidirec_gru_model(tweet_x_y['word_index'], label=tweet_x_y['labels'],
                                     embedding_matrix=tweet_x_y['embedding_matrix'],
                                     pre_trained=pre_trained),
            tweet_x_y['train_seq_x'], tweet_x_y['train_y'],
            tweet_x_y['valid_seq_x'], tweet_x_y['valid_y'],
            es, name="BiGRU_WE", scoring=score_metrics, n_splits=5),
    ])

In [None]:
%%time
if bidirectional_gru:
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    news_results = pd.concat([
        news_results,
        cross_validate_NN(
            create_bidirec_gru_model(news_x_y['word_index'], label=news_x_y['labels'],
                                     embedding_matrix=news_x_y['embedding_matrix'],
                                     pre_trained=pre_trained),
            news_x_y['train_seq_x'], news_x_y['train_y'],
            news_x_y['valid_seq_x'], news_x_y['valid_y'],
            es, name="BiGRU_WE", scoring=score_metrics, n_splits=5),
    ])

#### RCNN

In [None]:
def create_rcnn(X, word_index, label, embedding_matrix, pre_trained=False):
    """Build and compile a recurrent-convolutional (BiGRU + Conv1D) classifier.

    The stack is: embedding -> SpatialDropout1D(0.3) -> bidirectional GRU(32)
    returning full sequences -> Conv1D(32, kernel 3, relu) -> global max
    pooling -> Dense(25, relu) -> Dropout(0.25) -> output layer.

    Args:
        X: Array-like with a ``shape`` attribute; only ``X.shape[1]`` is read,
            and only when ``pre_trained`` is truthy, where it is passed as the
            embedding layer's ``input_length``.
        word_index: Sized collection; only its length is read, and the
            embedding vocabulary is that length plus one.
        label: Sized collection; only its length is read and is treated as
            the number of classes.
        embedding_matrix: Initial weights for the frozen 300-dimensional
            embedding; read only when ``pre_trained`` is truthy.
        pre_trained: When truthy, use ``embedding_matrix`` as non-trainable
            embedding weights; otherwise learn a 100-dimensional embedding.

    Returns:
        A compiled ``keras.Sequential`` model. With at most two labels it ends
        in a single sigmoid unit trained with binary cross-entropy; otherwise
        it ends in a ``len(label)``-way softmax trained with sparse
        categorical cross-entropy.
    """
    vocab_size = len(word_index) + 1
    if pre_trained:
        embedded = keras.layers.Embedding(vocab_size, 300, input_length=X.shape[1], weights=[embedding_matrix], trainable=False)
    else:
        embedded = keras.layers.Embedding(vocab_size, 100)

    # A single flag drives both the output layer and the loss so the two
    # can never disagree about the problem type.
    binary = len(label) <= 2

    model = keras.Sequential([
        embedded,
        keras.layers.SpatialDropout1D(0.3),
        keras.layers.Bidirectional(keras.layers.GRU(32, return_sequences=True)),
        keras.layers.Convolution1D(32, 3, activation="relu"),
        keras.layers.GlobalMaxPool1D(),
        keras.layers.Dense(25, activation="relu"),
        keras.layers.Dropout(0.25),
        keras.layers.Dense(1 if binary else len(label), activation='sigmoid' if binary else 'softmax'),
    ])

    # The last layer already emits probabilities (sigmoid/softmax), so the
    # loss must not interpret them as raw logits.
    if binary:
        loss = tf.losses.BinaryCrossentropy(from_logits=False)
    else:
        loss = tf.losses.SparseCategoricalCrossentropy(from_logits=False)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

    return model

In [None]:
%%time
if rcnn:
    # create_rcnn takes the input sequences as its first positional argument
    # (it reads X.shape[1]); omitting it binds word_index to X and raises a
    # TypeError for the missing word_index, so pass the training sequences.
    tweet_results = tweet_results.append(
        cross_validate_NN(
            create_rcnn(
                tweet_x_y['train_seq_x'],
                tweet_x_y['word_index'],
                label=tweet_x_y['labels'],
                embedding_matrix=tweet_x_y['embedding_matrix'],
                pre_trained=pre_trained,
            ),
            tweet_x_y['train_seq_x'],
            tweet_x_y['train_y'],
            tweet_x_y['valid_seq_x'],
            tweet_x_y['valid_y'],
            es,
            name="RCNN_WE",
            scoring=score_metrics,
            n_splits=5,
        )
    )

In [None]:
%%time
if rcnn:
    # create_rcnn takes the input sequences as its first positional argument
    # (it reads X.shape[1]); omitting it binds word_index to X and raises a
    # TypeError for the missing word_index, so pass the training sequences.
    news_results = news_results.append(
        cross_validate_NN(
            create_rcnn(
                news_x_y['train_seq_x'],
                news_x_y['word_index'],
                label=news_x_y['labels'],
                embedding_matrix=news_x_y['embedding_matrix'],
                pre_trained=pre_trained,
            ),
            news_x_y['train_seq_x'],
            news_x_y['train_y'],
            news_x_y['valid_seq_x'],
            news_x_y['valid_y'],
            es,
            name="RCNN_WE",
            scoring=score_metrics,
            n_splits=5,
        )
    )

In [None]:
def create_rcnn_var1(word_index, label, embedding_matrix, pre_trained=False):
    """Build and compile an RCNN variant that uses a bidirectional LSTM.

    The stack is: embedding -> SpatialDropout1D(0.3) -> bidirectional LSTM(32)
    returning full sequences -> Conv1D(32, kernel 3, relu) -> global max
    pooling -> Dense(25, relu) -> Dropout(0.25) -> output layer.

    Args:
        word_index: Sized collection; only its length is read, and the
            embedding vocabulary is that length plus one.
        label: Sized collection; only its length is read and is treated as
            the number of classes.
        embedding_matrix: Initial weights for the frozen 300-dimensional
            embedding; read only when ``pre_trained`` is truthy.
        pre_trained: When truthy, use ``embedding_matrix`` as non-trainable
            embedding weights; otherwise learn a 100-dimensional embedding.

    Returns:
        A compiled ``keras.Sequential`` model. With at most two labels it ends
        in a single sigmoid unit trained with binary cross-entropy; otherwise
        it ends in a ``len(label)``-way softmax trained with sparse
        categorical cross-entropy.
    """
    vocab_size = len(word_index) + 1
    if pre_trained:
        embedded = keras.layers.Embedding(vocab_size, 300, weights=[embedding_matrix], trainable=False)
    else:
        embedded = keras.layers.Embedding(vocab_size, 100)

    # A single flag drives both the output layer and the loss so the two
    # can never disagree about the problem type.
    binary = len(label) <= 2

    model = keras.Sequential([
        embedded,
        keras.layers.SpatialDropout1D(0.3),
        keras.layers.Bidirectional(keras.layers.LSTM(32, return_sequences=True)),
        keras.layers.Convolution1D(32, 3, activation="relu"),
        keras.layers.GlobalMaxPool1D(),
        keras.layers.Dense(25, activation="relu"),
        keras.layers.Dropout(0.25),
        keras.layers.Dense(1 if binary else len(label), activation='sigmoid' if binary else 'softmax'),
    ])

    # The last layer already emits probabilities (sigmoid/softmax), so the
    # loss must not interpret them as raw logits.
    if binary:
        loss = tf.losses.BinaryCrossentropy(from_logits=False)
    else:
        loss = tf.losses.SparseCategoricalCrossentropy(from_logits=False)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

    return model

In [None]:
%%time
if rcnn:
    # 5-split cross-validation of RCNN variant 1 on the tweet data; whatever
    # cross_validate_NN returns is appended to the tweet results table.
    tweet_results = tweet_results.append(
        cross_validate_NN(
            create_rcnn_var1(
                tweet_x_y['word_index'],
                label=tweet_x_y['labels'],
                embedding_matrix=tweet_x_y['embedding_matrix'],
                pre_trained=pre_trained,
            ),
            tweet_x_y['train_seq_x'],
            tweet_x_y['train_y'],
            tweet_x_y['valid_seq_x'],
            tweet_x_y['valid_y'],
            es,
            name="RCNN_var1_WE",
            scoring=score_metrics,
            n_splits=5,
        )
    )

In [None]:
%%time
if rcnn:
    # 5-split cross-validation of RCNN variant 1 on the news data; whatever
    # cross_validate_NN returns is appended to the news results table.
    news_results = news_results.append(
        cross_validate_NN(
            create_rcnn_var1(
                news_x_y['word_index'],
                label=news_x_y['labels'],
                embedding_matrix=news_x_y['embedding_matrix'],
                pre_trained=pre_trained,
            ),
            news_x_y['train_seq_x'],
            news_x_y['train_y'],
            news_x_y['valid_seq_x'],
            news_x_y['valid_y'],
            es,
            name="RCNN_var1_WE",
            scoring=score_metrics,
            n_splits=5,
        )
    )

In [None]:
def create_rcnn_var2(word_index, label, embedding_matrix, pre_trained=False):
    """Build and compile an RCNN variant with two stacked bidirectional GRUs.

    The stack is: embedding -> SpatialDropout1D(0.3) -> two bidirectional
    GRU(32) layers, each returning full sequences -> Conv1D(32, kernel 3,
    relu) -> global max pooling -> Dense(25, relu) -> Dropout(0.25) -> output
    layer.

    Args:
        word_index: Sized collection; only its length is read, and the
            embedding vocabulary is that length plus one.
        label: Sized collection; only its length is read and is treated as
            the number of classes.
        embedding_matrix: Initial weights for the frozen 300-dimensional
            embedding; read only when ``pre_trained`` is truthy.
        pre_trained: When truthy, use ``embedding_matrix`` as non-trainable
            embedding weights; otherwise learn a 100-dimensional embedding.

    Returns:
        A compiled ``keras.Sequential`` model. With at most two labels it ends
        in a single sigmoid unit trained with binary cross-entropy; otherwise
        it ends in a ``len(label)``-way softmax trained with sparse
        categorical cross-entropy.
    """
    vocab_size = len(word_index) + 1
    if pre_trained:
        embedded = keras.layers.Embedding(vocab_size, 300, weights=[embedding_matrix], trainable=False)
    else:
        embedded = keras.layers.Embedding(vocab_size, 100)

    # A single flag drives both the output layer and the loss so the two
    # can never disagree about the problem type.
    binary = len(label) <= 2

    model = keras.Sequential([
        embedded,
        keras.layers.SpatialDropout1D(0.3),
        keras.layers.Bidirectional(keras.layers.GRU(32, return_sequences=True)),
        keras.layers.Bidirectional(keras.layers.GRU(32, return_sequences=True)),
        keras.layers.Convolution1D(32, 3, activation="relu"),
        keras.layers.GlobalMaxPool1D(),
        keras.layers.Dense(25, activation="relu"),
        keras.layers.Dropout(0.25),
        keras.layers.Dense(1 if binary else len(label), activation='sigmoid' if binary else 'softmax'),
    ])

    # The last layer already emits probabilities (sigmoid/softmax), so the
    # loss must not interpret them as raw logits.
    if binary:
        loss = tf.losses.BinaryCrossentropy(from_logits=False)
    else:
        loss = tf.losses.SparseCategoricalCrossentropy(from_logits=False)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

    return model

In [None]:
%%time
if rcnn:
    # 5-split cross-validation of RCNN variant 2 on the tweet data; whatever
    # cross_validate_NN returns is appended to the tweet results table.
    tweet_results = tweet_results.append(
        cross_validate_NN(
            create_rcnn_var2(
                tweet_x_y['word_index'],
                label=tweet_x_y['labels'],
                embedding_matrix=tweet_x_y['embedding_matrix'],
                pre_trained=pre_trained,
            ),
            tweet_x_y['train_seq_x'],
            tweet_x_y['train_y'],
            tweet_x_y['valid_seq_x'],
            tweet_x_y['valid_y'],
            es,
            name="RCNN_var2_WE",
            scoring=score_metrics,
            n_splits=5,
        )
    )

In [None]:
%%time
if rcnn:
    # 5-split cross-validation of RCNN variant 2 on the news data; whatever
    # cross_validate_NN returns is appended to the news results table.
    news_results = news_results.append(
        cross_validate_NN(
            create_rcnn_var2(
                news_x_y['word_index'],
                label=news_x_y['labels'],
                embedding_matrix=news_x_y['embedding_matrix'],
                pre_trained=pre_trained,
            ),
            news_x_y['train_seq_x'],
            news_x_y['train_y'],
            news_x_y['valid_seq_x'],
            news_x_y['valid_y'],
            es,
            name="RCNN_var2_WE",
            scoring=score_metrics,
            n_splits=5,
        )
    )

In [None]:
def create_rcnn_var3(word_index, label, embedding_matrix, pre_trained=False):
    """Build and compile an RCNN variant stacking a BiGRU and a BiLSTM.

    The stack is: embedding -> SpatialDropout1D(0.3) -> bidirectional GRU(32)
    -> bidirectional LSTM(32), both returning full sequences -> Conv1D(32,
    kernel 3, relu) -> global max pooling -> Dense(25, relu) -> Dropout(0.25)
    -> output layer.

    Args:
        word_index: Sized collection; only its length is read, and the
            embedding vocabulary is that length plus one.
        label: Sized collection; only its length is read and is treated as
            the number of classes.
        embedding_matrix: Initial weights for the frozen 300-dimensional
            embedding; read only when ``pre_trained`` is truthy.
        pre_trained: When truthy, use ``embedding_matrix`` as non-trainable
            embedding weights; otherwise learn a 100-dimensional embedding.

    Returns:
        A compiled ``keras.Sequential`` model. With at most two labels it ends
        in a single sigmoid unit trained with binary cross-entropy; otherwise
        it ends in a ``len(label)``-way softmax trained with sparse
        categorical cross-entropy.
    """
    vocab_size = len(word_index) + 1
    if pre_trained:
        embedded = keras.layers.Embedding(vocab_size, 300, weights=[embedding_matrix], trainable=False)
    else:
        embedded = keras.layers.Embedding(vocab_size, 100)

    # A single flag drives both the output layer and the loss so the two
    # can never disagree about the problem type.
    binary = len(label) <= 2

    model = keras.Sequential([
        embedded,
        keras.layers.SpatialDropout1D(0.3),
        keras.layers.Bidirectional(keras.layers.GRU(32, return_sequences=True)),
        keras.layers.Bidirectional(keras.layers.LSTM(32, return_sequences=True)),
        keras.layers.Convolution1D(32, 3, activation="relu"),
        keras.layers.GlobalMaxPool1D(),
        keras.layers.Dense(25, activation="relu"),
        keras.layers.Dropout(0.25),
        keras.layers.Dense(1 if binary else len(label), activation='sigmoid' if binary else 'softmax'),
    ])

    # The last layer already emits probabilities (sigmoid/softmax), so the
    # loss must not interpret them as raw logits.
    if binary:
        loss = tf.losses.BinaryCrossentropy(from_logits=False)
    else:
        loss = tf.losses.SparseCategoricalCrossentropy(from_logits=False)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

    return model

In [None]:
%%time
if rcnn:
    # 5-split cross-validation of RCNN variant 3 on the tweet data; whatever
    # cross_validate_NN returns is appended to the tweet results table.
    tweet_results = tweet_results.append(
        cross_validate_NN(
            create_rcnn_var3(
                tweet_x_y['word_index'],
                label=tweet_x_y['labels'],
                embedding_matrix=tweet_x_y['embedding_matrix'],
                pre_trained=pre_trained,
            ),
            tweet_x_y['train_seq_x'],
            tweet_x_y['train_y'],
            tweet_x_y['valid_seq_x'],
            tweet_x_y['valid_y'],
            es,
            name="RCNN_var3_WE",
            scoring=score_metrics,
            n_splits=5,
        )
    )

In [None]:
%%time
if rcnn:
    # 5-split cross-validation of RCNN variant 3 on the news data; whatever
    # cross_validate_NN returns is appended to the news results table.
    news_results = news_results.append(
        cross_validate_NN(
            create_rcnn_var3(
                news_x_y['word_index'],
                label=news_x_y['labels'],
                embedding_matrix=news_x_y['embedding_matrix'],
                pre_trained=pre_trained,
            ),
            news_x_y['train_seq_x'],
            news_x_y['train_y'],
            news_x_y['valid_seq_x'],
            news_x_y['valid_y'],
            es,
            name="RCNN_var3_WE",
            scoring=score_metrics,
            n_splits=5,
        )
    )

## Results

In [None]:
# Renumber both result tables; reset_index() without drop=True keeps the
# previous index as an extra column.
tweet_results, news_results = (
    tweet_results.reset_index(),
    news_results.reset_index(),
)

In [None]:
# Tweet leaderboard: keep rows whose mean precision is below 1, show the
# model name plus mean/std of every metric, best precision (then recall) first.
tweet_results.loc[
    tweet_results["test_prec_mean"] < 1,
    [
        "Model",
        "test_acc_mean", "test_acc_std",
        "test_balanced_accuracy_mean", "test_balanced_accuracy_std",
        "test_prec_mean", "test_prec_std",
        "test_recall_mean", "test_recall_std",
        "test_f1-score_mean", "test_f1-score_std",
        "test_cohens_kappa_mean", "test_cohens_kappa_std",
        "test_matthews_corrcoef_mean", "test_matthews_corrcoef_std",
        "test_roc_auc_mean", "test_roc_auc_std",
    ],
].sort_values(by=["test_prec_mean", "test_recall_mean"], ascending=False)

In [None]:
# News leaderboard: keep rows whose mean precision is below 1, show the
# model name plus mean/std of every metric, best precision (then recall) first.
news_results.loc[
    news_results["test_prec_mean"] < 1,
    [
        "Model",
        "test_acc_mean", "test_acc_std",
        "test_balanced_accuracy_mean", "test_balanced_accuracy_std",
        "test_prec_mean", "test_prec_std",
        "test_recall_mean", "test_recall_std",
        "test_f1-score_mean", "test_f1-score_std",
        "test_cohens_kappa_mean", "test_cohens_kappa_std",
        "test_matthews_corrcoef_mean", "test_matthews_corrcoef_std",
        "test_roc_auc_mean", "test_roc_auc_std",
    ],
].sort_values(by=["test_prec_mean", "test_recall_mean"], ascending=False)

In [None]:
if save_results:
    # Write each table, ranked by mean precision then mean recall
    # (descending), to its own CSV without the index column.
    for frame, csv_name in (
        (tweet_results, "tweet_results_DL.csv"),
        (news_results, "news_results_DL.csv"),
    ):
        ranked = frame.sort_values(by=["test_prec_mean", "test_recall_mean"], ascending=False)
        ranked.to_csv(csv_name, index=False)