Useful resource for this notebook:
- https://www.kaggle.com/code/burhanuddinlatsaheb/transformer-model-comparision-for-disaster-tweet/notebook

In [1]:
import pandas as pd
import numpy as np

In [60]:
train_data = pd.read_csv('../input/sentiment-analysis-ssa/train.csv')
train_data.head()

Unnamed: 0,id,text,label
0,1154,a warm but realistic meditation on friendship ...,1
1,1134,"beautifully observed , miraculously unsentimen...",1
2,2373,some may choose to interpret the film's end as...,1
3,5691,"it's harmless , diverting fluff . but it's har...",0
4,2072,the touch is generally light enough and the pe...,1


In [15]:
test_data = pd.read_csv('../input/sentiment-analysis-ssa/test.csv')
test_data.head()

Unnamed: 0,id,text
0,0,entertainment more disposable than hanna-barbe...
1,1,shame on writer/director vicente aranda for ma...
2,2,although estela bravo's documentary is cloying...
3,3,"despite its visual virtuosity , 'naqoyqatsi' i..."
4,4,not once in the rush to save the day did i bec...


In [16]:
# Extra sentiment datasets
imdb_df = pd.read_csv('/kaggle/input/sentiment-labelled-sentences-data-set/sentiment labelled sentences/imdb_labelled.csv')
amazon_df = pd.read_csv('/kaggle/input/sentiment-labelled-sentences-data-set/sentiment labelled sentences/amazon_cells_labelled.csv')
yelp_df = pd.read_csv('/kaggle/input/sentiment-labelled-sentences-data-set/sentiment labelled sentences/yelp_labelled.csv')

In [17]:
# del train_data['id']
# del test_data['id']

# del imdb_df[' very slow-moving']
# del imdb_df[' aimless movie about a distressed']
# del imdb_df[' drifting young man.  ']
# del imdb_df['0']
# del imdb_df['Unnamed: 6']   
# del imdb_df['Unnamed: 7']

# del amazon_df['Unnamed: 2']
# del amazon_df['Unnamed: 3']
# del amazon_df['Unnamed: 4']
# del amazon_df['Unnamed: 5']

# del yelp_df['Unnamed: 2']
# del yelp_df['Unnamed: 3']
# del yelp_df['Unnamed: 4']
# del yelp_df['Unnamed: 5']

# imdb_df.head()
# test.head()


In [18]:
imdb_df.columns = ['text', 'label']
amazon_df.columns = ['text', 'label']
yelp_df.columns = ['text', 'label']

In [19]:
# removing labels not zero(0) or one(1)
imdb_df = imdb_df.copy()[(imdb_df['label'] == '0') | (imdb_df['label'] == '1')] 
amazon_df = amazon_df.copy()[(amazon_df['label'] == '0') | (amazon_df['label'] == '1')]
yelp_df = yelp_df.copy()[(yelp_df['label'] == '0') | (yelp_df['label'] == '1')]

imdb_df.label = imdb_df.label.astype('int64')
amazon_df.label = amazon_df.label.astype('int64')
yelp_df.label = yelp_df.label.astype('int64')

In [21]:
# concatenating train set and imbd

train_dataset = pd.concat([train_data.copy(), imdb_df, amazon_df, yelp_df], axis=0)
print('train_dataset length is', len(train_dataset))

train_dataset length is 10019


In [55]:
class config:
    MAX_LEN = 37
    LOWER_CASE = True
    RANDOM_STATE = 18
    TEST_SIZE = 0.21
    NUM_LABELS = 1
    BATCH_SIZE = 128
    LEARNING_RATE = 5e-5
    EPOCHS = 11
    WEIGHT_DECAY = 0.01
    DEVICE = "cuda"

In [56]:
# tokenize the data
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification, TFAutoModel
import tensorflow as tf

tokenizer = RobertaTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")
x_train = tokenizer(
    text = train_dataset["text"].tolist(),
    add_special_tokens=True,
    max_length=config.MAX_LEN,
    truncation=True,
    padding=True,
    return_tensors="tf",
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True
)

x_test = tokenizer(
    text = test_data["text"].tolist(),
    add_special_tokens=True,
    max_length=config.MAX_LEN,
    truncation=True,
    padding=True,
    return_tensors="tf",
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True
)

In [25]:
# setup pretrained roberta model
roberta_base = TFAutoModel.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some layers from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion were not used when initializing TFRobertaModel: ['classifier']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [61]:
# add custom layers at the end
input_ids = tf.keras.Input(shape=(config.MAX_LEN), dtype=tf.int32, name="input_ids")
input_mask = tf.keras.Input(shape=(config.MAX_LEN), dtype=tf.int32, name="attention_mask")

embeddings = roberta_base(input_ids, attention_mask=input_mask)[1]
x = tf.keras.layers.Dropout(0.3)(embeddings)
x = tf.keras.layers.Dense(512, activation="relu")(x)
x = tf.keras.layers.Dropout(0.3)(x)
x = tf.keras.layers.Dense(128, activation="relu")(x)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Dense(32, activation="relu")(x)
output = tf.keras.layers.Dense(config.NUM_LABELS, activation="sigmoid")(x)

model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=output)

In [62]:
model.layers[2].trainable = True
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 37)]         0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 37)]         0                                            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode multiple             124645632   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
dropout_47 (Dropout)            (None, 768)          0           tf_roberta_model[4][1]     

In [63]:
import os

# callback to store the best model during training
if os.path.isdir("./weights/roberta_base_weights") is None:
    os.makedirs("./weights/roberta_base_weights")

checkpoint_filepath_roberta_base = "./weights/roberta_base_weights"
checkpoint_callback_roberta_base = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_filepath_roberta_base,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='auto',
    save_best_only=True
)

In [64]:
optimizer = tf.keras.optimizers.Adam(learning_rate=config.LEARNING_RATE, epsilon=1e-8, decay=config.WEIGHT_DECAY, clipnorm=1.0)

In [65]:
# train the model
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), optimizer=optimizer, metrics=["accuracy"])
history = model.fit(x={"input_ids": x_train["input_ids"],
                       "attention_mask": x_train["attention_mask"]},
                    y=train_dataset["label"],
                    epochs=config.EPOCHS,
                    validation_split=0.01,
                    batch_size=128,
                    callbacks=[checkpoint_callback_roberta_base])

Epoch 1/11


  '"`binary_crossentropy` received `from_logits=True`, but the `output`'


Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11


In [66]:
history.history.keys()

dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])

### Apply model to test data set

In [67]:
preds = model.predict({"input_ids": x_test["input_ids"], "attention_mask": x_test["attention_mask"]})
class_preds = np.where(preds > 0.5, 1, 0)
print(preds.shape, class_preds.shape)

(2132, 1) (2132, 1)


In [68]:
sample = pd.read_csv('../input/sentiment-analysis-ssa/sample.csv')
sample['label'] = class_preds

In [69]:
sample.head()

Unnamed: 0,id,label
0,0,0
1,1,0
2,2,1
3,3,0
4,4,0


In [70]:
sample.to_csv('submit.csv', index=False)