<a href="https://www.kaggle.com/code/ambrustorok/natural-language-processing-with-disaster-tweets?scriptVersionId=161441456" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Getting data

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


# Explore

In [2]:
df[df["location"].str.contains("0|1|2|3|4|5|6|7|8|9")==True].head(20)

Unnamed: 0,id,keyword,location,text,target
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
59,83,ablaze,"Edmonton, Alberta - Treaty 6",How the West was burned: Thousands of wildfire...,1
103,149,aftershock,304,'The man who can drive himself further once th...,0
105,153,aftershock,304,'There is no victory at bargain basement price...,0
107,157,aftershock,304,'Nobody remembers who came in second.' Charles...,0
109,159,aftershock,304,'The harder the conflict the more glorious the...,0
125,180,aftershock,304,Sometimes you face difficulties not because yo...,0
126,182,aftershock,304,'The only thing that stands between you and yo...,0
128,184,aftershock,304,'Remembering that you are going to die is the ...,0
133,193,aftershock,304,People who say it cannot be done should not in...,0


# Transform data

The original idea was to concatenate the BERT embedded texts with the categories, then train a classifier.

But the locations and the keywoards are kindof cluttered, so we contatenate before embedding

In [3]:
def combine_row_conditionally(row):
    keyword = "keyword: " + row["keyword"] + "; " if not pd.isna(row["keyword"]) else ""
    location = "location: " + row["location"] + "; " if not pd.isna(row["location"]) else ""
    return str(keyword) + str(location) + row["text"]
    
df["inputs"] = df.apply(lambda row: combine_row_conditionally(row), axis = 1)
df

Unnamed: 0,id,keyword,location,text,target,inputs
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake M...
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as ...
...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,Two giant cranes holding a bridge collapse int...
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,@aria_ahrary @TheTawniest The out of control w...
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611,10872,,,Police investigating after an e-bike collided ...,1,Police investigating after an e-bike collided ...


# Train the actual model

In [4]:
y = df["target"].to_numpy()
X = df["inputs"].to_numpy()

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training data shape:", X_train.shape)
print("Training labels shape:", y_train.shape)
print("Test data shape:", X_test.shape)
print("Test labels shape:", y_test.shape)



Training data shape: (6090,)
Training labels shape: (6090,)
Test data shape: (1523,)
Test labels shape: (1523,)


In [6]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm
import matplotlib.pyplot as plt

model_name = 'bert-base-uncased'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

# Define your custom dataset class
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoded_inputs = self.tokenizer.encode_plus(
            text,
            padding='max_length',
            truncation=True,
            max_length=128,
            return_tensors='pt'
        )
        return encoded_inputs, label

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
dataset = CustomDataset(X_train, y_train, tokenizer)
batch_size = 8
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Training loop
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
num_epochs = 4 # originally it was 5, but it was heavily overfitting...
print("Training started...")
for epoch in range(num_epochs):
    running_loss = 0.0
    correct = 0
    total = 0
    for i, batch in enumerate(train_loader, 1):
        inputs, labels = batch
        inputs = {k: v.squeeze(1).to(device) for k, v in inputs.items()}  # Move inputs to GPU
        labels = labels.to(device)  # Move labels to GPU
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Compute running loss
        running_loss += loss.item()

        # Compute accuracy
        predictions = outputs.logits.argmax(dim=1)
        total += labels.size(0)
        correct += (predictions == labels).sum().item()

        # Print progress
        if i % 100 == 0:  # Update every 10 batches
            avg_loss = running_loss / 100
            accuracy = correct / total
            print(f"Epoch [{epoch + 1}/{num_epochs}], Batch [{i}/{len(train_loader)}], Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")
            running_loss = 0.0
            correct = 0
            total = 0

print("Training completed.")

Training started...
Epoch [1/4], Batch [100/762], Loss: 0.5685, Accuracy: 0.7113
Epoch [1/4], Batch [200/762], Loss: 0.4661, Accuracy: 0.8113
Epoch [1/4], Batch [300/762], Loss: 0.4512, Accuracy: 0.8000
Epoch [1/4], Batch [400/762], Loss: 0.4100, Accuracy: 0.8300
Epoch [1/4], Batch [500/762], Loss: 0.3907, Accuracy: 0.8363
Epoch [1/4], Batch [600/762], Loss: 0.3954, Accuracy: 0.8337
Epoch [1/4], Batch [700/762], Loss: 0.3908, Accuracy: 0.8325
Epoch [2/4], Batch [100/762], Loss: 0.2532, Accuracy: 0.9050
Epoch [2/4], Batch [200/762], Loss: 0.2767, Accuracy: 0.9050
Epoch [2/4], Batch [300/762], Loss: 0.2479, Accuracy: 0.9087
Epoch [2/4], Batch [400/762], Loss: 0.2846, Accuracy: 0.8938
Epoch [2/4], Batch [500/762], Loss: 0.2345, Accuracy: 0.9200
Epoch [2/4], Batch [600/762], Loss: 0.2997, Accuracy: 0.8975
Epoch [2/4], Batch [700/762], Loss: 0.3123, Accuracy: 0.8750
Epoch [3/4], Batch [100/762], Loss: 0.1208, Accuracy: 0.9525
Epoch [3/4], Batch [200/762], Loss: 0.1155, Accuracy: 0.9600
Epoc

# Evaluation

In [8]:
# Evaluation on the test set
test_dataset = CustomDataset(X_test, y_test, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
model.eval()

with torch.no_grad():
    correct = 0
    total = 0
    for batch in test_loader:
        inputs, labels = batch
        inputs = {k: v.squeeze(1).to(device) for k, v in inputs.items()}  # Move inputs to GPU
        labels = labels.to(device)  # Move labels to GPU
        outputs = model(**inputs, labels=labels)
        predictions = outputs.logits.argmax(dim=1)
        total += labels.size(0)
        correct += (predictions == labels).sum().item()

accuracy = correct / total
print(f"Accuracy: {accuracy}")

Accuracy: 0.8207485226526592


# Submission

In [9]:
df_test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
df_test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [10]:
df_test["inputs"] = df_test.apply(lambda row: combine_row_conditionally(row), axis = 1)
df_test

Unnamed: 0,id,keyword,location,text,inputs
0,0,,,Just happened a terrible car crash,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...","Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are...","there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...,MEG issues Hazardous Weather Outlook (HWO) htt...


In [11]:
# Evaluation on the test set
submission_dataset = CustomDataset(df_test["inputs"].to_numpy(), np.array([0]*len(df_test)), tokenizer)
submission_loader = DataLoader(submission_dataset, batch_size=batch_size)
model.eval()

submission_outputs = []

with torch.no_grad():
    for batch in submission_loader:
        inputs, labels = batch
        inputs = {k: v.squeeze(1).to(device) for k, v in inputs.items()}  # Move inputs to GPU
        labels = labels.to(device)  # Move labels to GPU
        outputs = model(**inputs, labels=labels)
        predictions = outputs.logits.argmax(dim=1)
        submission_outputs += predictions

# Create a dataframe to store the predictions
submission = pd.DataFrame({'id': df_test.id, 'target': [int(o) for o in submission_outputs]})
submission

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [12]:
submission.to_csv("/kaggle/working/submission.csv", index=False)