In [8]:
#!pip install kaggle
!kaggle competitions download -c nlp-getting-started

Downloading nlp-getting-started.zip to /home/jovyan/QMML/Lessons
100%|████████████████████████████████████████| 593k/593k [00:00<00:00, 1.52MB/s]
100%|████████████████████████████████████████| 593k/593k [00:00<00:00, 1.52MB/s]


## Kaggle - Real or Not? NLP with Disaster Tweets

Today we will be working with some data from Kaggle. The goal is to predict whether a tweet is about a real disaster or not. 

The data has been split to a train test set, but there are some important steps we must take to make use of it. Firstly, we will need to tokenize the data and then train a model on it. I leave the model choise up to you.

In [2]:
!pip install transformers datasets evaluate accelerate

Collecting transformers
  Downloading transformers-4.39.1-py3-none-any.whl.metadata (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl.metadata (18 kB)
Collecting filelock (from transformers)
  Using cached filelock-3.13.1-py3-none-any.whl.metadata (2.8 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.21.4-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting 

In [91]:
from transformers import AutoTokenizer
from datasets import load_dataset
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
import pandas as pd

In [57]:
dataset = load_dataset("csv", data_files={"train": "../nlp-getting-started/train.csv"})
train_val_data = dataset['train'].train_test_split(test_size=0.2)
# Rename the target column in the training set
train_data = train_test_data['train'].rename_column('target', 'labels')

# Rename the target column in the test set
val_data = train_test_data['test'].rename_column('target', 'labels')

In [58]:
train_data[0]

{'id': 6779,
 'keyword': 'lightning',
 'location': 'Leesburg, FL',
 'text': '.@dantwitty52 shuts the door on the Boom in the bottom half. #Lightning coming up in the top of the eighth.',
 'labels': 0}

In [None]:
val_data[0]

In [60]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [61]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [62]:
tokenized_tweets_train = train_data.map(preprocess_function, batched=True)
tokenized_tweets_val = val_data.map(preprocess_function, batched=True)

Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

In [63]:
tokenized_tweets_train[0]

{'id': 6779,
 'keyword': 'lightning',
 'location': 'Leesburg, FL',
 'text': '.@dantwitty52 shuts the door on the Boom in the bottom half. #Lightning coming up in the top of the eighth.',
 'labels': 0,
 'input_ids': [101,
  1012,
  1030,
  4907,
  2102,
  9148,
  15353,
  25746,
  3844,
  2015,
  1996,
  2341,
  2006,
  1996,
  8797,
  1999,
  1996,
  3953,
  2431,
  1012,
  1001,
  7407,
  2746,
  2039,
  1999,
  1996,
  2327,
  1997,
  1996,
  5964,
  1012,
  102],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]}

In [64]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [65]:
accuracy = evaluate.load("accuracy")

In [66]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [67]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [113]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [115]:
training_args = TrainingArguments(
    output_dir="./lesson_15_model/pretrained_distilbert",
    learning_rate=5e-6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_tweets_train,
    eval_dataset=tokenized_tweets_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.419594,0.823375
2,0.452600,0.426288,0.819435
3,0.349000,0.432609,0.822718
4,0.313400,0.445114,0.824688
5,0.313400,0.529793,0.80302
6,0.278400,0.521098,0.80696
7,0.255700,0.532397,0.812869
8,0.240300,0.539087,0.818122
9,0.240300,0.559023,0.812869
10,0.222500,0.56338,0.813526


TrainOutput(global_step=3810, training_loss=0.29459012249323324, metrics={'train_runtime': 120.938, 'train_samples_per_second': 503.564, 'train_steps_per_second': 31.504, 'total_flos': 851370558415992.0, 'train_loss': 0.29459012249323324, 'epoch': 10.0})

In [116]:
test_file_path = '../nlp-getting-started/test.csv'

# Load the test data
test_data = pd.read_csv(test_file_path)
test_encodings = tokenizer(test_data['text'].tolist(), truncation=True, padding=True, return_tensors="pt")
test_encodings.to("cuda")

{'input_ids': tensor([[  101,  2074,  3047,  ...,     0,     0,     0],
        [  101,  2657,  2055,  ...,     0,     0,     0],
        [  101,  2045,  2003,  ...,     0,     0,     0],
        ...,
        [  101,  2665,  2240,  ...,     0,     0,     0],
        [  101, 12669,  3314,  ...,     0,     0,     0],
        [  101,  1001,  2103,  ...,     0,     0,     0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')}

In [117]:
import torch

model.eval()  # Set the model to evaluation mode

with torch.no_grad():  # Disable gradient calculation for inference
    outputs = model(**test_encodings)
    predictions = torch.argmax(outputs.logits, dim=-1)

In [118]:
predictions

tensor([1, 1, 1,  ..., 1, 1, 1], device='cuda:0')

In [119]:
numpy_predictions = predictions.cpu().numpy()
submission_df = pd.DataFrame({'id': test_data['id'], 'label': numpy_predictions})
submission_df.index = submission_df["id"]
#submission_df = submission_df.drop(columns=["id"])
submission_df.to_csv('submission.csv', index=False)