In [81]:
#Imports

import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
!pip install evaluate
import evaluate
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [82]:
# Checkpoint name passed to both the tokenizer and the classifier loader.
MODEL = "bert-base-uncased"
# Per-device batch size, used for training and for evaluation alike.
BATCH_SIZE = 4
# Number of training epochs.
EPOCHS = 10

In [83]:
#Loading in train and test data

# Single place to change when the dataset is mounted somewhere else.
DATA_DIR = "/kaggle/input/english-sarcasm-detection"

train = pd.read_csv(f"{DATA_DIR}/train.En.csv")
train_C = pd.read_csv(f"{DATA_DIR}/english_task_c.csv")
train_a = pd.read_csv(f"{DATA_DIR}/english_task_a.csv")
# NOTE(review): train_B is read from a file named "task_B_En_test.csv" —
# confirm that treating a test-named file as training material is intended.
train_B = pd.read_csv(f"{DATA_DIR}/task_B_En_test.csv")

test = pd.read_csv(f"{DATA_DIR}/task_A_En_test.csv")

In [84]:
#Keeping only the columns that the later cells read from each frame

train_A = train[["tweet", "sarcastic", "rephrase"]]
train_C = train_C[["text_1", "text_0", "human_aggregated"]]
train_a = train_a[["text", "human_aggregated"]]
train_B = train_B[["text", "sarcasm", "irony", "satire", "understatement", "overstatement", "rhetorical_question"]]

test_A = test[["text", "sarcastic"]]

In [85]:
#Converting train/test frames to lists of {"label": ..., "text": ...} records
#
# Each column is pulled out once and the columns are iterated together.
# Indexing `frame.iloc[i][col]` per value builds a whole row Series for every
# single lookup, which is needlessly slow on anything but tiny frames.


def _to_records(labels, texts):
    """Pair labels with texts, in order, as {"label": ..., "text": ...} dicts."""
    return [{"label": label, "text": text} for label, text in zip(labels, texts)]


# Task A train: every row contributes its tweet with the row's own label,
# immediately followed by its rephrase with label 0. Nothing is filtered here,
# so a row without a rephrase still produces a record for it.
transformed_train_A = []
for label, tweet, rephrase in zip(
    train_A["sarcastic"].tolist(),
    train_A["tweet"].tolist(),
    train_A["rephrase"].tolist(),
):
    transformed_train_A.append({"label": label, "text": tweet})
    transformed_train_A.append({"label": 0, "text": rephrase})

# Task B: label 1 when at least one of the category columns equals 1, else 0.
task_B_flag_columns = [
    "sarcasm",
    "irony",
    "satire",
    "understatement",
    "overstatement",
    "rhetorical_question",
]
task_B_labels = train_B[task_B_flag_columns].eq(1).any(axis=1).astype(int).tolist()
transformed_train_B = _to_records(task_B_labels, train_B["text"].tolist())

# Task C: each row holds two texts. When human_aggregated == 0, text_0 gets
# label 1 and text_1 gets label 0; for any other value the labels are swapped.
# text_0 is always emitted before text_1.
transformed_train_C = []
for text_0, text_1, aggregated in zip(
    train_C["text_0"].tolist(),
    train_C["text_1"].tolist(),
    train_C["human_aggregated"].tolist(),
):
    text_0_label = 1 if aggregated == 0 else 0
    transformed_train_C.append({"label": text_0_label, "text": text_0})
    transformed_train_C.append({"label": 1 - text_0_label, "text": text_1})

# Task a: human_aggregated is used as the label as-is.
transformed_train_a = _to_records(
    train_a["human_aggregated"].tolist(), train_a["text"].tolist()
)

# Task A test set.
transformed_test_A = _to_records(
    test_A["sarcastic"].tolist(), test_A["text"].tolist()
)

In [86]:
#Tokenization

tokenizer = AutoTokenizer.from_pretrained(MODEL)


def preprocess_function(text):
    """Tokenize `text` with the notebook's tokenizer, with truncation enabled."""
    return tokenizer(text, truncation=True)


def tokenize_records(records):
    """Tokenize a list of {"label": ..., "text": ...} records.

    Records whose "text" is not a `str` are skipped without raising. Every
    kept record becomes the tokenizer's output for its text, with the
    record's label stored under the "label" key.

    Args:
        records: iterable of dicts that each have "text" and "label" keys.

    Returns:
        A list of tokenized examples, in the same order as the kept records.
    """
    tokenized = []
    for record in records:
        text = record["text"]
        if not isinstance(text, str):
            continue
        encoded = preprocess_function(text)
        encoded["label"] = record["label"]
        tokenized.append(encoded)
    return tokenized


tokenized_transformed_train_A = tokenize_records(transformed_train_A)
tokenized_transformed_train_C = tokenize_records(transformed_train_C)
tokenized_transformed_train_a = tokenize_records(transformed_train_a)
# NOTE(review): no later cell in view reads tokenized_transformed_train_B —
# confirm whether task B was meant to be part of the training data.
tokenized_transformed_train_B = tokenize_records(transformed_train_B)
tokenized_transformed_test_A = tokenize_records(transformed_test_A)

In [87]:
#Extending the A dataset with the tokenized a and C examples
# The list is extended in place, so running this cell a second time appends
# the same examples again.

for extra_examples in (tokenized_transformed_train_a, tokenized_transformed_train_C):
    tokenized_transformed_train_A.extend(extra_examples)

In [88]:
#Dynamic batchwise padding
# Padding is left to the collator; the tokenizer call in this notebook only
# enables truncation and sets no padding option.

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [89]:
#Evaluation metrics

# The metric loaded here is F1, so it is bound to a name that says so.
# `accuracy` is kept as an alias for any code that still uses the old name.
f1_metric = evaluate.load("f1")
accuracy = f1_metric


def compute_metrics(eval_pred):
    """Compute binary F1, with label 1 as the positive class.

    Args:
        eval_pred: a (predictions, labels) pair. The predictions are reduced
            with argmax over axis 1, so they are presumably per-class scores
            with one row per example — confirm against the caller.

    Returns:
        Whatever the F1 metric's `compute` returns for the predicted labels.
    """
    scores, labels = eval_pred
    predicted_labels = np.argmax(scores, axis=1)
    return f1_metric.compute(
        predictions=predicted_labels,
        references=labels,
        pos_label=1,
        average="binary",
    )

In [90]:
#Label maps: id -> name, and its inverse derived from it

id2label = {0: "non-sarcastic", 1: "sarcastic"}
label2id = {name: idx for idx, name in id2label.items()}

In [91]:
#Defining the model
# The number of output labels follows the label map (two entries), so the two
# cannot drift apart.

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [92]:
#Training and evaluation

training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Rank checkpoints by the F1 that compute_metrics reports. Without this,
    # the "best" checkpoint is the one with the lowest evaluation loss, which
    # need not be the one with the highest F1.
    metric_for_best_model="f1",
    greater_is_better=True,
)

# NOTE(review): eval_dataset is the task A test set and it also drives
# best-checkpoint selection, so the F1 reported here is not a clean held-out
# score. Consider carving a validation split out of the training data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_transformed_train_A,
    eval_dataset=tokenized_transformed_test_A,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,0.5382,0.28903,0.59
2,0.5203,0.486802,0.621723
3,0.3473,0.587522,0.649903
4,0.2091,0.792615,0.645283
5,0.0925,0.924092,0.652908
6,0.0718,0.868732,0.635161
7,0.0714,1.098429,0.629981
8,0.0389,1.232798,0.635161
9,0.0241,1.264945,0.640301
10,0.0142,1.327932,0.637736


TrainOutput(global_step=15340, training_loss=0.19346668064361133, metrics={'train_runtime': 929.1088, 'train_samples_per_second': 66.02, 'train_steps_per_second': 16.51, 'total_flos': 1310629138954920.0, 'train_loss': 0.19346668064361133, 'epoch': 10.0})

In [93]:
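# NOTE(review): the 40-character hex string below looks like an API token or a
# commit SHA. If it is a credential, revoke it and load it from a secret store
# instead of keeping it in the notebook.
#f461aa864bc375133e0abc309bda5598d66d00a2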

In [94]:
!rm -rf /kaggle/working/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
rm: cannot remove '/kaggle/working/': Device or resource busy
