In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
import tensorflow as tf
# import tensorflow_hub as hub
import nltk
import matplotlib.pyplot as plt
import tqdm
import transformers

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

2023-10-13 15:01:31.681186: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
[nltk_data] Downloading package punkt to /home/artem/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/artem/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/artem/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# load data
df = pd.read_csv("sarcasm_data_clean_v1.csv")

In [3]:
df.head()

Unnamed: 0,is_sarcastic,headline,headline_clean
0,1,thirtysomething scientists unveil doomsday clo...,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes,eat your veggies 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...,mother comes pretty close to using word stream...


In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df["headline_clean"], df["is_sarcastic"], test_size=0.3, random_state=42)
X_train.shape, X_test.shape

((19952,), (8551,))

In [5]:
# train-validation split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)
X_train.shape, X_val.shape

((14964,), (4988,))

In [6]:
print(X_train.shape)
X_train

(14964,)


5788        nesting sea turtle escorted from private beach
12502    new ted nugent cologne tested on every goddamn...
25826    clinton already working on follow up book cast...
15280    nipsey russell estate releases volume of previ...
381      kanye west i would have ridden away from a sla...
                               ...                        
20678        yacht name conveys owners easygoing lifestyle
8949     new nba starter jackets to come with unwanted ...
16993    trump calls the health care bill he is been pr...
15547    deep down woman knows she is watching entire t...
10576    this is what happens when the pavement is too ...
Name: headline_clean, Length: 14964, dtype: object

In [7]:

cv = CountVectorizer(binary=False, min_df=2, max_df=1.0, ngram_range=(1, 3))

cv_train_features = cv.fit_transform(X_train)
cv_val_features = cv.transform(X_val)
cv_test_features = cv.transform(X_test)
print('BOW model:> Train features shape:', cv_train_features.shape,  ' Val features shape:', cv_val_features.shape, ' Test features shape:', cv_test_features.shape)

BOW model:> Train features shape: (14964, 26888)  Val features shape: (4988, 26888)  Test features shape: (8551, 26888)


In [8]:
# Logistic Regression model on BOW features
from sklearn.linear_model import LogisticRegression

# instantiate model
lr = LogisticRegression(penalty='l2', max_iter=500, C=1, solver='lbfgs', random_state=42, class_weight="balanced")

# train model
lr.fit(cv_train_features, y_train)

# predict on test data
lr_bow_predictions = lr.predict(cv_test_features)

print(classification_report(y_test, lr_bow_predictions))
pd.DataFrame(confusion_matrix(y_test, lr_bow_predictions))

              precision    recall  f1-score   support

           0       0.84      0.84      0.84      4551
           1       0.82      0.82      0.82      4000

    accuracy                           0.83      8551
   macro avg       0.83      0.83      0.83      8551
weighted avg       0.83      0.83      0.83      8551



Unnamed: 0,0,1
0,3845,706
1,718,3282


## BERT model


In [9]:
# limit gpu memory (only relevant for very large models or if computing on local gpu) - taken from the tensorflow website
# will insure that tensorflow only uses as much memory as is absolutely necessary

gpus = tf.config.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)

1 Physical GPUs, 1 Logical GPUs


2023-10-13 15:01:35.263269: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-10-13 15:01:35.264380: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2023-10-13 15:01:35.282333: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-10-13 15:01:35.282598: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce MX150 computeCapability: 6.1
coreClock: 1.5315GHz coreCount: 3 deviceMemorySize: 3.95GiB deviceMemoryBandwidth: 44.76GiB/s
2023-10-13 15:01:35.282680: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2023-10-13 15:01:35.285125: I tensorflow/stream_executor/platform/defaul

In [10]:
# load tokenizer
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

# helper function to tokenize
def create_bert_input_features(tokenizer, docs, max_seq_length):

    all_ids, all_masks, all_segments= [], [], []
    for doc in tqdm.tqdm(docs, desc="Converting docs to features"):

        tokens = tokenizer.tokenize(doc)

        if len(tokens) > max_seq_length-2:
            tokens = tokens[0 : (max_seq_length-2)]
        # with newer versions of transformers you don't need to explicitely add CLS and SEP
        # they are automatically added by the tokenizer
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        ids = tokenizer.convert_tokens_to_ids(tokens)
        masks = [1] * len(ids) # [1,1,1.....] # < 500 ones

        # Zero-pad up to the sequence length.
        while len(ids) < max_seq_length:
            ids.append(0)
            masks.append(0)

        segments = [0] * max_seq_length # [0,0,0...] # 500 zeros
        all_ids.append(ids)
        all_masks.append(masks)
        all_segments.append(segments)

    encoded = np.array([all_ids, all_masks, all_segments])

    return encoded

In [11]:
# encode train and test data
train_features_ids, train_features_masks, train_features_segments = create_bert_input_features(tokenizer, X_train, max_seq_length=35)
val_features_ids, val_features_masks, val_features_segments = create_bert_input_features(tokenizer, X_val, max_seq_length=35)
test_features_ids, test_features_masks, test_features_segments = create_bert_input_features(tokenizer, X_test, max_seq_length=35)

Converting docs to features: 100%|██████████| 14964/14964 [00:04<00:00, 3105.97it/s]
Converting docs to features: 100%|██████████| 4988/4988 [00:01<00:00, 3416.27it/s]
Converting docs to features: 100%|██████████| 8551/8551 [00:02<00:00, 3326.96it/s]


In [12]:
MAX_SEQ_LENGTH = 35

inp_id = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), dtype='int32', name="bert_input_ids")
inp_mask = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), dtype='int32', name="bert_input_masks")
inp_segment = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), dtype='int32', name="bert_segment_ids")
inputs = [inp_id, inp_mask, inp_segment]

hidden_state = transformers.TFBertModel.from_pretrained('bert-base-uncased')(inputs)
pooled_output = hidden_state[1]

dense1 = tf.keras.layers.Dense(128, activation='relu')(pooled_output)
drop1 = tf.keras.layers.Dropout(0.25)(dense1)
dense2 = tf.keras.layers.Dense(128, activation='relu')(drop1)
drop2 = tf.keras.layers.Dropout(0.25)(dense2)

output = tf.keras.layers.Dense(1, activation='sigmoid')(drop2)

model = tf.keras.Model(inputs=inputs, outputs=output)

model.compile(optimizer=tf.optimizers.Adam(learning_rate=2e-5,
                                           epsilon=1e-08),
              loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
2023-10-13 15:01:55.571984: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.10
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['distilbert.transformer.layer.1.attention.out_lin.weight', 'distilbert.transformer.layer.3.sa_layer_norm.bias', 'distilbert.transformer.layer.1.ffn.lin2.bias', 'distilbert.transformer.layer.5.sa_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.3.attention.out_lin.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.5.attention.v_lin.weight', 'distilbert.transformer.layer.4.output_layer_norm.bias', 'distilbert.transformer.layer.2.attention.q_lin.weigh

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'g

In [13]:
# set bert model to untrainable
model.layers[3].trainable = False
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
bert_input_ids (InputLayer)     [(None, 35)]         0                                            
__________________________________________________________________________________________________
bert_input_masks (InputLayer)   [(None, 35)]         0                                            
__________________________________________________________________________________________________
bert_segment_ids (InputLayer)   [(None, 35)]         0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 109482240   bert_input_ids[0][0]             
                                                                 bert_input_masks[0][0]       

In [14]:
# train model
EPOCHS=5
BATCH_SIZE=32

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True, verbose=1)
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath="./checkpoints/checkpoint_bert_v1", monitor="val_accuracy", save_best_only=True)

model.fit(x=[train_features_ids, train_features_masks, train_features_segments], y=y_train, validation_data=([val_features_ids, val_features_masks, val_features_segments], y_val), 
          batch_size=BATCH_SIZE, epochs=EPOCHS, shuffle=True, callbacks=[early_stopping, checkpoint])

2023-10-13 15:03:27.544477: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2023-10-13 15:03:27.545008: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 1999965000 Hz


Epoch 1/5


2023-10-13 15:14:19.632206: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ./checkpoints/checkpoint_distilbert_v1/assets


INFO:tensorflow:Assets written to: ./checkpoints/checkpoint_distilbert_v1/assets


Epoch 2/5
Epoch 3/5

KeyboardInterrupt: 

In [42]:
# test
predictions = model.predict([test_features_ids, test_features_masks, test_features_segments])
predictions = [1 if prob > 0.5 else 0 for prob in predictions]

print(classification_report(y_test, predictions))
pd.DataFrame(confusion_matrix(y_test, predictions))

              precision    recall  f1-score   support

           0       0.94      0.89      0.92      4551
           1       0.88      0.93      0.91      4000

    accuracy                           0.91      8551
   macro avg       0.91      0.91      0.91      8551
weighted avg       0.91      0.91      0.91      8551



Unnamed: 0,0,1
0,4065,486
1,264,3736
