In [None]:
!pip install torch transformers

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at the mount point below.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import os

# Work from the Drive folder that holds the CSV files, so that relative
# paths used later (e.g. 'train.csv') resolve against it.
DATA_DIR = '/content/drive/My Drive/colab/nlp-with-disaster-tweets'
os.chdir(DATA_DIR)

In [None]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# Load the training data (train.csv is resolved against the current working
# directory). The code below reads its 'text' and 'target' columns.
df = pd.read_csv('train.csv')

# Tokenize each text with the BERT tokenizer, adding the special tokens.
# truncation=True caps every sequence at the tokenizer's maximum model input
# length, so an unusually long text cannot produce an over-long sequence.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_texts = [
    tokenizer.encode(text, add_special_tokens=True, truncation=True)
    for text in df['text']
]

# Right-pad every sequence to the length of the longest one, using the
# tokenizer's own padding id instead of a hard-coded 0.
max_len = max(len(tokens) for tokens in tokenized_texts)
pad_id = tokenizer.pad_token_id
padded_sequences = [
    tokens + [pad_id] * (max_len - len(tokens)) for tokens in tokenized_texts
]

# Convert to PyTorch tensors: one row of token ids per text, and one label per row.
input_ids = torch.tensor(padded_sequences)
labels = torch.tensor(df['target'].values)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
split_parts = train_test_split(
    input_ids,
    labels,
    test_size=0.2,
    random_state=42,
)
train_inputs, test_inputs, train_labels, test_labels = split_parts

In [None]:
from torch.utils.data import TensorDataset, DataLoader

# Number of examples per optimisation step.
batch_size = 32

# Pair each training input row with its label, then serve the pairs in
# shuffled mini-batches.
train_data = TensorDataset(train_inputs, train_labels)
train_loader = DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    shuffle=True,
)

In [None]:
# Use PyTorch's AdamW: the implementation that used to ship with transformers
# was deprecated and is no longer importable from recent releases.
from torch.optim import AdamW
from transformers import BertForSequenceClassification

# Pretrained BERT with a sequence-classification head configured for 2 labels.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# eps and weight_decay are pinned to the defaults of the old transformers
# optimizer (torch's own defaults are 1e-8 and 0.01) so the update rule stays
# the one this cell originally asked for.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

In [None]:
num_epochs = 1

# Train on the GPU when one is available. Module.to() moves the parameters in
# place, so the optimizer built earlier keeps referring to the same parameters.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

batch_idx = 0
epoch_idx = 0
for epoch in range(num_epochs):
    epoch_idx = epoch + 1  # 1-based counter for the progress message
    print("Processing Epoch", epoch_idx)
    model.train()
    for batch_idx, batch in enumerate(train_loader, start=1):
        print("Processing Batch", batch_idx)

        # Batch-local names: reusing `labels` here would overwrite the
        # dataset-level `labels` tensor and break a re-run of the split cell.
        batch_inputs, batch_labels = (tensor.to(device) for tensor in batch)

        # Mask out padding positions so the model does not attend to them.
        batch_mask = (batch_inputs != tokenizer.pad_token_id).long()

        outputs = model(batch_inputs, attention_mask=batch_mask, labels=batch_labels)
        loss = outputs.loss

        # Standard step: clear stale gradients, backpropagate, update weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()