In [3]:
!pip install transformers



In [4]:
import pandas as pd
import numpy as np
import os

import tensorflow as tf
import tensorflow.keras.backend as K
import transformers
from transformers import BertConfig, TFBertForSequenceClassification, BertTokenizer, XLNetConfig, TFXLNetForSequenceClassification, XLNetTokenizer, XLMConfig, TFXLMForSequenceClassification, XLMTokenizer, RobertaConfig, TFRobertaForSequenceClassification, RobertaTokenizer, DistilBertConfig, TFDistilBertForSequenceClassification, DistilBertTokenizer, AlbertConfig, TFAlbertForSequenceClassification, AlbertTokenizer
from sklearn import metrics
from sklearn.model_selection import KFold


print('Transformers version: ', transformers.__version__)
print('Tensorflow version: ', tf.__version__)

Transformers version:  4.12.5
Tensorflow version:  2.6.2


# Import Data

In [5]:
try:
    from google.colab import drive
    drive.mount('/content/drive')
    in_colab = True
except:
    in_colab = False
    
if os.environ.get('KAGGLE_KERNEL_RUN_TYPE', ''):
    env = 'Kaggle'

if in_colab:
    data_dir = '/content/drive/MyDrive/NLP Project/codes/input'
elif env == 'Kaggle':
    data_dir = '../input/nlp-getting-started/'
else:
    data_dir = ''
train_df = pd.read_csv(data_dir + '/train.csv')
test_df = pd.read_csv(data_dir + '/test.csv')

In [6]:
train_df = train_df.sample(n=len(train_df), random_state=42)
# sample_submission = pd.read_csv(data_dir+'sample_submission.csv')
print(train_df['target'].value_counts())
train_df.head(2)

0    4342
1    3271
Name: target, dtype: int64


Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0


# Data Prep Functions

In [7]:
from nltk.tokenize.treebank import TreebankWordTokenizer
tree_tokenizer = TreebankWordTokenizer()
def get_tree_tokens(x):
    x = tree_tokenizer.tokenize(x)
    x = ' '.join(x)
    return x
train_df.text = train_df.text.apply(get_tree_tokens)
test_df.text = test_df.text.apply(get_tree_tokens)

In [8]:
# from: https://www.kaggle.com/utsavnandi/roberta-using-huggingface-tf-implementation
def to_tokens(input_text, tokenizer):
    output = tokenizer.encode_plus(input_text, max_length=90, pad_to_max_length=True)
    return output

def select_field(features, field):
    return [feature[field] for feature in features]

import re
def clean_tweet(tweet):
    # Removing the @
    #tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)
    # Removing the URL links
    #tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    # Keeping only letters
    #tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Removing additional whitespaces
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

def preprocess_data(tokenizer, train_df, test_df):
    train_text = train_df['text'].apply(clean_tweet)
    test_text = test_df['text'].apply(clean_tweet)
    train_encoded = train_text.apply(lambda x: to_tokens(x, tokenizer))
    test_encoded = test_text.apply(lambda x: to_tokens(x, tokenizer))

    #create attention masks
    input_ids_train = np.array(select_field(train_encoded, 'input_ids'))
    attention_masks_train = np.array(select_field(train_encoded, 'attention_mask'))

    input_ids_test = np.array(select_field(test_encoded, 'input_ids'))
    attention_masks_test = np.array(select_field(test_encoded, 'attention_mask'))

    # concatonate masks
    train_X = [input_ids_train, attention_masks_train]
    test_X = [input_ids_test, attention_masks_test]
    #OHE target
    train_y = tf.keras.utils.to_categorical(train_df['target'].values.reshape(-1, 1))

    return train_X, train_y, test_X

# Function to load models

In [9]:
# code from https://github.com/huggingface/transformers
# Transformers has a unified API
# for 10 transformer architectures and 30 pretrained weights.
#          Model          | Tokenizer          | Pretrained weights shortcut
def load_pretrained_model(model_class='bert', model_name='bert-base-cased', task='binary', learning_rate=3e-5, epsilon=1e-8, lower_case=False):
    MODEL_CLASSES = {
      "bert": (BertConfig, TFBertForSequenceClassification, BertTokenizer),
      "xlnet": (XLNetConfig, TFXLNetForSequenceClassification, XLNetTokenizer),
      "xlm": (XLMConfig, TFXLMForSequenceClassification, XLMTokenizer),
      "roberta": (RobertaConfig, TFRobertaForSequenceClassification, RobertaTokenizer),
      "distilbert": (DistilBertConfig, TFDistilBertForSequenceClassification, DistilBertTokenizer),
      "albert": (AlbertConfig, TFAlbertForSequenceClassification, AlbertTokenizer),
      #"xlmroberta": (XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer), No tensorflow version yet
    }
    model_metrics = [
          tf.keras.metrics.TruePositives(name='tp'),
          tf.keras.metrics.FalsePositives(name='fp'),
          tf.keras.metrics.TrueNegatives(name='tn'),
          tf.keras.metrics.FalseNegatives(name='fn'), 
          tf.keras.metrics.BinaryAccuracy(name='accuracy'),
          tf.keras.metrics.Precision(name='precision'),
          tf.keras.metrics.Recall(name='recall'),
          tf.keras.metrics.AUC(name='auc'),
    ]
  
    
    config_class, model_class, tokenizer_class = MODEL_CLASSES[model_class]
  
    config = config_class.from_pretrained(model_name, num_labels=2, finetuning_task=task)
  
  
    model = model_class.from_pretrained(model_name)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon, clipnorm=1.0)
    loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    metric = tf.keras.metrics.BinaryAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    #model.summary()
  
    tokenizer = tokenizer_class.from_pretrained(model_name, lower_case = lower_case)
  
    return config, model, tokenizer

# Train Model

In [17]:
# load model, process data for model
_, _, tokenizer = load_pretrained_model(model_class='roberta', model_name='roberta-base', learning_rate=2e-5, lower_case=False)
train_X, train_y, test_X = preprocess_data(tokenizer=tokenizer, train_df=train_df, test_df=test_df)


kf = KFold(n_splits=6)
test_preds = []
i = 0
for train_idx, test_idx in kf.split(train_X[0]):
    i+=1
    if i not in [1, 5]: #only do 2 folds to save time
        continue
    train_split_X = [train_X[i][train_idx] for i in range(len(train_X))]
    test_split_X = [train_X[i][test_idx] for i in range(len(train_X))]

    train_split_y = train_y[train_idx]
    test_split_y = train_y[test_idx]
    #create class weights to account for inbalance
    positive = train_df.iloc[train_idx, :].target.value_counts()[0]
    negative = train_df.iloc[train_idx, :].target.value_counts()[1]
    pos_weight = positive / (positive + negative)
    neg_weight = negative / (positive + negative)

    # class_weight = [{0:pos_weight, 1:neg_weight}, {0:neg_weight, 1:pos_weight}]

    K.clear_session()
    config, model, tokenizer = load_pretrained_model(model_class='roberta', model_name='roberta-base', learning_rate=2e-5, lower_case=False)

    # fit, test model
    model.fit(train_split_X, train_split_y, batch_size=64, epochs=3, validation_data=(test_split_X, test_split_y))

    val_preds = model.predict(test_split_X, batch_size=32, verbose=1)
    val_preds = np.argmax(val_preds.logits, axis=1).flatten()
    print('accuracy: ', metrics.accuracy_score(train_df.iloc[test_idx, :].target.values, val_preds))
    print('f1 score: ', metrics.f1_score(train_df.iloc[test_idx, :].target.values, val_preds))

    preds1 = model.predict(test_X, batch_size=32, verbose=1)
    test_preds.append(preds1)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably 

Epoch 1/3
Epoch 2/3
Epoch 3/3
accuracy:  0.851063829787234
f1 score:  0.82186616399623


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3
accuracy:  0.8226950354609929
f1 score:  0.7867298578199051


In [18]:
# load model, process data for model
_, _, tokenizer = load_pretrained_model(model_class='albert', model_name='albert-base-v2', learning_rate=2e-5, lower_case=False)
train_X, train_y, test_X = preprocess_data(tokenizer=tokenizer, train_df=train_df, test_df=test_df)


kf = KFold(n_splits=6)
test_preds = []
i = 0
for train_idx, test_idx in kf.split(train_X[0]):
    i+=1
    if i not in [1, 5]: #only do 2 folds to save time
        continue
    train_split_X = [train_X[i][train_idx] for i in range(len(train_X))]
    test_split_X = [train_X[i][test_idx] for i in range(len(train_X))]

    train_split_y = train_y[train_idx]
    test_split_y = train_y[test_idx]
    #create class weights to account for inbalance
    positive = train_df.iloc[train_idx, :].target.value_counts()[0]
    negative = train_df.iloc[train_idx, :].target.value_counts()[1]
    pos_weight = positive / (positive + negative)
    neg_weight = negative / (positive + negative)

    # class_weight = [{0:pos_weight, 1:neg_weight}, {0:neg_weight, 1:pos_weight}]

    K.clear_session()
    config, model, tokenizer = load_pretrained_model(model_class='albert', model_name='albert-base-v2', learning_rate=2e-5, lower_case=False)

    # fit, test model
    model.fit(train_split_X, train_split_y, batch_size=64, epochs=3, validation_data=(test_split_X, test_split_y))

    val_preds = model.predict(test_split_X, batch_size=32, verbose=1)
    val_preds = np.argmax(val_preds.logits, axis=1).flatten()
    print('accuracy: ', metrics.accuracy_score(train_df.iloc[test_idx, :].target.values, val_preds))
    print('f1 score: ', metrics.f1_score(train_df.iloc[test_idx, :].target.values, val_preds))

    preds1 = model.predict(test_X, batch_size=32, verbose=1)
    test_preds.append(preds1)

All model checkpoint layers were used when initializing TFAlbertForSequenceClassification.

Some layers of TFAlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
All model checkpoint layers were used when initializing TFAlbertForSequenceClassification.

Some layers of TFAlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier']
You should probably 

Epoch 1/3
Epoch 2/3
Epoch 3/3
accuracy:  0.814026792750197
f1 score:  0.7521008403361344


All model checkpoint layers were used when initializing TFAlbertForSequenceClassification.

Some layers of TFAlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3
accuracy:  0.8171788810086682
f1 score:  0.7531914893617021


In [19]:
# load model, process data for model
_, _, tokenizer = load_pretrained_model(model_class='distilbert', model_name='distilbert-base-uncased', learning_rate=2e-5, lower_case=False)
train_X, train_y, test_X = preprocess_data(tokenizer=tokenizer, train_df=train_df, test_df=test_df)


kf = KFold(n_splits=6)
test_preds = []
i = 0
for train_idx, test_idx in kf.split(train_X[0]):
    i+=1
    if i not in [1, 5]: #only do 2 folds to save time
        continue
    train_split_X = [train_X[i][train_idx] for i in range(len(train_X))]
    test_split_X = [train_X[i][test_idx] for i in range(len(train_X))]

    train_split_y = train_y[train_idx]
    test_split_y = train_y[test_idx]
    #create class weights to account for inbalance
    positive = train_df.iloc[train_idx, :].target.value_counts()[0]
    negative = train_df.iloc[train_idx, :].target.value_counts()[1]
    pos_weight = positive / (positive + negative)
    neg_weight = negative / (positive + negative)

    # class_weight = [{0:pos_weight, 1:neg_weight}, {0:neg_weight, 1:pos_weight}]

    K.clear_session()
    config, model, tokenizer = load_pretrained_model(model_class='distilbert', model_name='distilbert-base-uncased', learning_rate=2e-5, lower_case=False)

    # fit, test model
    model.fit(train_split_X, train_split_y, batch_size=64, epochs=3, validation_data=(test_split_X, test_split_y))

    val_preds = model.predict(test_split_X, batch_size=32, verbose=1)
    val_preds = np.argmax(val_preds.logits, axis=1).flatten()
    print('accuracy: ', metrics.accuracy_score(train_df.iloc[test_idx, :].target.values, val_preds))
    print('f1 score: ', metrics.f1_score(train_df.iloc[test_idx, :].target.values, val_preds))

    preds1 = model.predict(test_X, batch_size=32, verbose=1)
    test_preds.append(preds1)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'activation_13', 'vocab_layer_norm', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'dropout_24', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

Epoch 1/3
Epoch 2/3
Epoch 3/3
accuracy:  0.830575256107171
f1 score:  0.7930702598652551


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'activation_13', 'vocab_layer_norm', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_19', 'pre_classifier', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

Epoch 1/3
Epoch 2/3
Epoch 3/3
accuracy:  0.8258471237194641
f1 score:  0.7959372114496768
