In [2]:
import pandas as pd

# Load the labelled training tweets into a DataFrame.
# NOTE(review): absolute, user-specific path - this cell fails on any other
# machine; consider reading from a configurable data directory instead.
train_df = pd.read_csv('/Users/akshitaarora/train.csv')

In [3]:
# Load the tweets to be predicted; later code reads its 'text' and 'id' columns.
# NOTE(review): absolute, user-specific path - same portability caveat as the
# training file.
test_df = pd.read_csv('/Users/akshitaarora/test.csv')

In [4]:
# Preview the first rows of the training data to check the columns loaded as expected.
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [7]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Assuming train_df and test_df are already loaded

class TweetDataset(Dataset):
    """Map-style dataset that tokenizes tweet text on access.

    Args:
        df: DataFrame with a 'text' column and, unless ``is_test`` is true,
            a 'target' column holding integer class labels.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts the keyword arguments used in ``__getitem__`` and returns
            a mapping with 'input_ids' and 'attention_mask' tensors.
        max_len: Length every example is padded / truncated to.
        is_test: When true, no labels are read and items carry no 'targets'.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """

    def __init__(self, df, tokenizer, max_len=128, is_test=False):
        # Normalise once up front. Missing tweets (None / NaN) become an empty
        # string rather than the literal words 'None' / 'nan', which would
        # otherwise be tokenized as if they were real tweet content.
        self.texts = ['' if pd.isna(text) else str(text) for text in df['text']]
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_test = is_test
        if not is_test:
            self.targets = df['target'].tolist()

    def __len__(self):
        """Return the number of tweets in the dataset."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize the tweet at position ``item``.

        Returns:
            dict with 'input_ids' and 'attention_mask' (each flattened to 1-D)
            and, for labelled data, 'targets' as a ``torch.long`` scalar tensor.
        """
        # Calling the tokenizer directly replaces the deprecated `encode_plus`.
        encoding = self.tokenizer(
            self.texts[item],
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # flatten() drops the leading batch dimension of the returned tensors
        # so the DataLoader can stack examples itself.
        item_dict = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

        if not self.is_test:
            item_dict['targets'] = torch.tensor(self.targets[item], dtype=torch.long)

        return item_dict

# Fine-tune bert-base-uncased as a two-class tweet classifier, report
# validation metrics after every epoch, then write test-set predictions to CSV.

# One seed drives the train/validation split, the random initialisation of the
# new classifier head and the shuffling of training batches, so re-running
# this cell on a fresh kernel repeats the same experiment.
SEED = 42
torch.manual_seed(SEED)


def predict_labels(model, data_loader, device):
    """Return the predicted class index for every example in ``data_loader``.

    Puts ``model`` in eval mode, runs it without gradient tracking on each
    batch's 'input_ids' / 'attention_mask', and collects the argmax of the
    logits as plain Python ints in the order the loader yields them.
    """
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits
            predictions.extend(logits.argmax(dim=1).cpu().tolist())
    return predictions


# Split the training data (10% held out for validation)
train_texts, val_texts, train_targets, val_targets = train_test_split(
    train_df['text'].tolist(),
    train_df['target'].tolist(),
    test_size=0.1,
    random_state=SEED
)

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Create datasets and dataloaders
train_dataset = TweetDataset(pd.DataFrame({'text': train_texts, 'target': train_targets}), tokenizer)
val_dataset = TweetDataset(pd.DataFrame({'text': val_texts, 'target': val_targets}), tokenizer)
test_dataset = TweetDataset(test_df, tokenizer, is_test=True)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# Evaluation loaders are deliberately not shuffled: predictions must stay in
# dataset order so they line up with val_targets and with the rows of test_df.
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Move the model to its device before building the optimizer.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set up optimizer
# torch.optim.AdamW replaces the deprecated AdamW shipped with transformers
# (the name in the transformers import is no longer used by this cell).
# eps and weight_decay are pinned to that implementation's defaults so the
# hyper-parameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Training loop
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        # Passing labels makes the model compute the classification loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=targets)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation: score against the labels from the split instead of
    # re-collecting them from the loader under the same name.
    val_predictions = predict_labels(model, val_loader, device)
    val_accuracy = accuracy_score(val_targets, val_predictions)
    print(f'Epoch {epoch + 1}/{num_epochs}, Validation Accuracy: {val_accuracy:.4f}')

print('Training completed')

# Print classification report (for the final epoch's validation predictions)
print(classification_report(val_targets, val_predictions))

# Make predictions on test data
test_predictions = predict_labels(model, test_loader, device)

# Add predictions to test_df
test_df['prediction'] = test_predictions

# Save predictions to a CSV file
test_df[['id', 'prediction']].to_csv('bert_predictions.csv', index=False)
print("Predictions saved to 'bert_predictions.csv'")

# Function to predict on new data
def predict_tweet(text, max_len=128):
    """Classify a single tweet with the fine-tuned global ``model``.

    Args:
        text: Tweet text; coerced with ``str()`` before tokenization, as
            ``TweetDataset`` does for training examples.
        max_len: Length the input is padded / truncated to. Defaults to 128,
            the same default ``TweetDataset`` uses.

    Returns:
        int: Index of the highest-scoring class (argmax over the logits).
    """
    model.eval()
    # Calling the tokenizer directly replaces the deprecated `encode_plus`.
    encoding = tokenizer(
        str(text),
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits

    # A single example is passed in, so the argmax holds exactly one element.
    return logits.argmax(dim=1).item()

# Example usage: classify one hand-written tweet and print a readable verdict.
sample_tweet = "There's been a major earthquake in the city center. Many buildings have collapsed."
prediction = predict_tweet(sample_tweet)
# Class 1 is reported as a real disaster; any other value as not a disaster.
print(f"Prediction for the sample tweet: {'Real Disaster' if prediction == 1 else 'Not a Real Disaster'}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Validation Accuracy: 0.8333
Epoch 2/3, Validation Accuracy: 0.8346
Epoch 3/3, Validation Accuracy: 0.8268
Training completed
              precision    recall  f1-score   support

           0       0.83      0.86      0.85       426
           1       0.82      0.78      0.80       336

    accuracy                           0.83       762
   macro avg       0.83      0.82      0.82       762
weighted avg       0.83      0.83      0.83       762

Predictions saved to 'bert_predictions.csv'
Prediction for the sample tweet: Real Disaster
