In [10]:
# Sanity check: print the path of the Python interpreter backing this kernel.
# NOTE(review): the printed path is machine-specific; clear this cell's output
# before sharing the notebook.
import sys
print(sys.executable)

/Users/AnshulSrivastava/anaconda3/bin/python


In [48]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertModel
from tqdm import tqdm

In [36]:
# Load the training tweets (with a `target` column) from the working directory.
train = pd.read_csv('train.csv')
# NOTE(review): `validation` is not referenced again in the visible cells;
# test.csv is read a second time into `validation_set` further down.
validation = pd.read_csv('test.csv')
# Last expression of the cell: renders the first rows of `train`.
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [37]:
# Preprocess the text data
def preprocess_text(text):
    """Return ``text`` lower-cased, tolerating missing or non-string values.

    pandas represents empty CSV cells as NaN (a float), on which ``.lower()``
    raises ``AttributeError``. Missing values are therefore mapped to the
    empty string, and any other non-string value is converted with ``str``
    before lower-casing. Plain strings behave exactly as before.

    Args:
        text: A tweet as ``str``, or a missing / non-string cell value.

    Returns:
        str: The lower-cased text, or ``""`` when ``text`` is missing.
    """
    if isinstance(text, str):
        return text.lower()
    # Covers None, float NaN, pd.NA and NaT.
    if pd.isna(text):
        return ""
    return str(text).lower()

# Adds (or overwrites) the `cleaned_text` column on `train` in place by running
# preprocess_text over every value of the `text` column.
train['cleaned_text'] = train['text'].apply(preprocess_text)


In [40]:
# Hold out 20% of the labelled rows, stratified on `target`; random_state=42
# makes the split repeatable.
# NOTE(review): `test_df` is the hold-out split evaluated after every training
# epoch below (i.e. a validation split); the competition file test.csv is loaded
# separately as `validation_set`.
train_df, test_df = train_test_split(train, test_size=0.2, random_state=42, stratify=train['target'])

In [54]:
class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per item.

    Args:
        df: DataFrame with ``cleaned_text`` and ``id`` columns, plus a
            ``target`` column unless ``is_test`` is true.
        tokenizer: Callable tokenizer (e.g. a Hugging Face ``BertTokenizer``)
            that returns ``input_ids`` and ``attention_mask`` tensors of shape
            ``(1, max_length)`` when called with ``return_tensors='pt'``.
        max_length: Length every sequence is padded / truncated to.
        is_test: When true, no labels are read and items carry no ``labels``.
    """

    def __init__(self, df, tokenizer, max_length, is_test=False):
        self.texts = df['cleaned_text'].values
        self.ids = df['id'].values
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test

        if not is_test:
            self.labels = df['target'].values

    def __len__(self):
        """Number of tweets in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict.

        Keys: ``id`` (plain Python scalar), ``input_ids`` and
        ``attention_mask`` (1-D tensors of length ``max_length``) and, unless
        ``is_test``, ``labels`` (0-d ``torch.long`` tensor).
        """
        # Calling the tokenizer directly is the documented replacement for the
        # deprecated ``encode_plus``; the keyword arguments are unchanged.
        encoding = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        raw_id = self.ids[idx]
        item = {
            # Unwrap NumPy scalars so the id is an ordinary Python value.
            'id': raw_id.item() if hasattr(raw_id, 'item') else raw_id,
            # Drop the leading batch dimension added by return_tensors='pt'.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

        if not self.is_test:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)

        return item

In [44]:
from transformers import BertTokenizer

# Sequence length every tweet is padded/truncated to, and the mini-batch size.
MAX_LENGTH = 128
BATCH_SIZE = 16

# Pretrained tokenizer matching the uncased BERT checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap both splits of the labelled data in tokenizing datasets.
train_dataset = TweetDataset(df=train_df, tokenizer=tokenizer, max_length=MAX_LENGTH)
test_dataset = TweetDataset(df=test_df, tokenizer=tokenizer, max_length=MAX_LENGTH)

# Only the training batches are shuffled; the hold-out order stays fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [45]:
class DisasterClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        n_classes: Number of output logits.
        pretrained_name: Checkpoint name or path passed to
            ``BertModel.from_pretrained``. Defaults to ``'bert-base-uncased'``.
        dropout: Dropout probability applied to the pooled output before the
            linear layer. Defaults to ``0.3``.
    """

    def __init__(self, n_classes, pretrained_name='bert-base-uncased', dropout=0.3):
        super().__init__()
        # The attribute names below become the state_dict key prefixes, so
        # they must stay stable for previously saved weights to load.
        self.bert = BertModel.from_pretrained(pretrained_name)
        self.drop = nn.Dropout(p=dropout)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the head's raw logits for a batch of token ids.

        Args:
            input_ids: Token-id tensor forwarded to the BERT encoder.
            attention_mask: Attention mask forwarded to the BERT encoder.
        """
        # With return_dict=False the encoder returns a tuple; the second
        # element is the pooled output that feeds the classification head.
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False
        )
        return self.out(self.drop(pooled_output))
    
# Instantiate the classifier with two output logits (one per `target` class).
# Constructing it loads the pretrained 'bert-base-uncased' weights.
model = DisasterClassifier(n_classes=2)

In [49]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Cross-entropy over the class logits, optimised with AdamW at lr=2e-5.
loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = optim.AdamW(params=model.parameters(), lr=2e-5)

def train_epoch(model, data_loader, loss_fn, optimizer, device, n_examples, description=None):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning per-class logits.
        data_loader: Iterable of batches (dicts with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors).
        loss_fn: Loss callable applied to ``(logits, labels)``.
        optimizer: Optimizer stepping ``model``'s parameters.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the returned accuracy.
        description: Optional progress-bar label. When omitted, the label is
            built from the notebook-level ``epoch`` / ``EPOCHS`` variables if
            they exist (the previous behaviour) and falls back to
            ``'Training'`` otherwise, so the function also works outside the
            training loop.

    Returns:
        tuple: ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-d float64
        tensor equal to ``correct / n_examples`` and ``mean_loss`` is the mean
        of the per-batch losses (``nan`` if the loader yielded no batches).
    """
    model.train()

    if description is None:
        try:
            description = f'Epoch [{epoch + 1}/{EPOCHS}]'
        except NameError:
            description = 'Training'

    losses = []
    # A tensor accumulator keeps the return type stable for an empty loader.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    seen = 0

    loop = tqdm(data_loader, leave=True)
    loop.set_description(description)

    for d in loop:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)

        loss = loss_fn(outputs, labels)
        loss.backward()

        # Clip the global gradient norm to 1.0 before the optimizer step.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        correct_predictions += torch.sum(preds == labels)
        seen += labels.size(0)
        losses.append(loss.item())

        # Running accuracy uses the number of examples actually seen (exact
        # for a partial last batch) and is passed as a plain float so the bar
        # does not render "tensor(...)".
        loop.set_postfix(
            loss=np.mean(losses),
            accuracy=correct_predictions.item() / max(seen, 1),
        )

    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / n_examples, mean_loss

def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without updating its weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning per-class logits.
        data_loader: Iterable of batches (dicts with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors).
        loss_fn: Loss callable applied to ``(logits, labels)``.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the returned accuracy.

    Returns:
        tuple: ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-d float64
        tensor equal to ``correct / n_examples`` and ``mean_loss`` is the mean
        of the per-batch losses (``nan`` if the loader yielded no batches).
    """
    model.eval()
    losses = []
    # A tensor accumulator (instead of the int 0) keeps `.double()` valid even
    # when the loader is empty.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)

            loss = loss_fn(outputs, labels)

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / n_examples, mean_loss

# Number of full passes over the training split.
EPOCHS = 3

for epoch in range(EPOCHS):
    # Optimise on the training split ...
    train_acc, train_loss = train_epoch(
        model=model,
        data_loader=train_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device=device,
        n_examples=len(train_df),
    )
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print(f'Train loss {train_loss} accuracy {train_acc}')

    # ... then score the hold-out split with the weights just updated.
    val_acc, val_loss = eval_model(
        model=model,
        data_loader=test_loader,
        loss_fn=loss_fn,
        device=device,
        n_examples=len(test_df),
    )
    print(f'Validation loss {val_loss} accuracy {val_acc}')

  0%|          | 0/381 [00:00<?, ?it/s]

Epoch [1/3]: 100%|██████████| 381/381 [1:41:17<00:00, 15.95s/it, accuracy=tensor(0.8094, dtype=torch.float64), loss=0.44] 


Epoch 1/3
Train loss 0.4395990816196744 accuracy 0.8101806239737275
Validation loss 0.3741399595358719 accuracy 0.8463558765594222


Epoch [2/3]: 100%|██████████| 381/381 [1:21:56<00:00, 12.90s/it, accuracy=tensor(0.8747, dtype=torch.float64), loss=0.328]


Epoch 2/3
Train loss 0.3277613366232926 accuracy 0.8755336617405582
Validation loss 0.4781446174213973 accuracy 0.8430728824688115


Epoch [3/3]: 100%|██████████| 381/381 [1:27:57<00:00, 13.85s/it, accuracy=tensor(0.9188, dtype=torch.float64), loss=0.239]


Epoch 3/3
Train loss 0.23923118208141464 accuracy 0.9197044334975369
Validation loss 0.4806661558880781 accuracy 0.8378200919238346


In [50]:
# Load the competition file (no `target` column) and apply the same
# lower-casing that was applied to the training text.
# NOTE(review): despite its name, `validation_set` is the data predictions are
# generated for, not the hold-out split (`test_df`) scored during training.
validation_set = pd.read_csv('test.csv')
validation_set['cleaned_text'] = validation_set['text'].apply(preprocess_text)
# Last expression of the cell: renders the first rows for a quick visual check.
validation_set.head()

Unnamed: 0,id,keyword,location,text,cleaned_text
0,0,,,Just happened a terrible car crash,just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...","heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are...","there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting. #spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kills 28 in china and taiwan


In [55]:
# is_test=True: the dataset does not read a `target` column and its items
# carry no `labels` entry.
validation_dataset = TweetDataset(validation_set, tokenizer, MAX_LENGTH, is_test=True)
# shuffle=False keeps batches in row order so predictions line up with the ids.
validation_loader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [57]:
def predict(model, data_loader, device):
    """Predict a class index for every example yielded by ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning per-class logits.
        data_loader: Iterable of batches (dicts with ``id``, ``input_ids``
            and ``attention_mask`` entries).
        device: Device the input tensors are moved to.

    Returns:
        tuple: ``(ids, predictions)``, two equally long lists in loader order.
        Predictions are plain Python ints; ids are plain Python values too
        when the batch carries them as a tensor.
    """
    model.eval()
    predictions = []
    ids = []

    with torch.no_grad():
        # Wrap the data_loader with tqdm to add the progress bar
        for d in tqdm(data_loader, desc="Predicting", leave=True):
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            id_batch = d["id"]

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)

            # Numeric ids arrive collated into a tensor; iterating a tensor
            # yields 0-d tensors, which would later be written out as
            # "tensor(...)" instead of the bare number. Convert them first.
            if torch.is_tensor(id_batch):
                id_batch = id_batch.tolist()
            ids.extend(id_batch)
            predictions.extend(preds.cpu().tolist())

    return ids, predictions

# Make predictions on the competition file loaded as `validation_set`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

ids, predictions = predict(model, validation_loader, device)

# Create a DataFrame with the output format.
# Tensor / NumPy scalars are unwrapped to plain Python values first so the CSV
# contains bare numbers rather than reprs such as "tensor(0)".
output_df = pd.DataFrame({
    'id': [v.item() if hasattr(v, 'item') else v for v in ids],
    'target': [v.item() if hasattr(v, 'item') else v for v in predictions],
})

# Save the results as a CSV file
output_df.to_csv('predictions.csv', index=False)
print("Predictions saved to predictions.csv")

Predicting:   0%|          | 0/204 [00:00<?, ?it/s]

Predicting: 100%|██████████| 204/204 [11:53<00:00,  3.50s/it]

Predictions saved to predictions.csv





In [59]:
# File the fine-tuned weights are written to (relative to the working directory).
MODEL_PATH = "bert_model.pth"

# Only the state_dict (parameter tensors) is saved, not the model object, so
# reloading requires constructing a DisasterClassifier first and then calling
# load_state_dict on it.
torch.save(model.state_dict(), MODEL_PATH)

print(f"Model saved to {MODEL_PATH}")

Model saved to bert_model.pth
