In [23]:
from transformers import BertTokenizer
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertForSequenceClassification, AdamW
import torch
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score

In [10]:
import os

# Recursively print the full path of every file found under the dataset folder.
data_root = '/Users/aymentiouiri/Desktop/workspace/Sentiment-Analysis/data/tweet-hatred-speech'
for current_dir, _subdirs, file_names in os.walk(data_root):
    for file_name in file_names:
        full_path = os.path.join(current_dir, file_name)
        print(full_path)

/Users/aymentiouiri/Desktop/workspace/Sentiment-Analysis/data/tweet-hatred-speech/test.csv
/Users/aymentiouiri/Desktop/workspace/Sentiment-Analysis/data/tweet-hatred-speech/train.csv


In [11]:
# Read the two CSV files of the dataset into pandas DataFrames.
train_csv_path = '/Users/aymentiouiri/Desktop/workspace/Sentiment-Analysis/data/tweet-hatred-speech/train.csv'
test_csv_path = '/Users/aymentiouiri/Desktop/workspace/Sentiment-Analysis/data/tweet-hatred-speech/test.csv'

df = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [12]:
# Instantiate the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
pretrained_tokenizer_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_tokenizer_name)

In [14]:
def preprocess_tweet(tweet, max_length=128):
    """Tokenize a single tweet into fixed-length BERT inputs.

    Relies on the notebook-level ``tokenizer`` object.

    Args:
        tweet: Text of the tweet to encode.
        max_length: Number of tokens every encoding is padded or truncated
            to. Defaults to 128, the value that used to be hard-coded.

    Returns:
        The tokenizer's encoding, requested as PyTorch tensors and including
        an attention mask. Presumably each tensor carries a leading batch
        dimension of 1 for a single string; callers in this notebook flatten
        it.
    """
    return tokenizer(
        tweet,                         # Text to encode
        add_special_tokens=True,       # Add '[CLS]' and '[SEP]'
        max_length=max_length,         # Target length of every encoding
        # Without an explicit truncation flag, tweets longer than max_length
        # are NOT cut, which yields tensors of different sizes that cannot be
        # stacked into a batch.
        truncation=True,
        padding='max_length',          # Pad shorter tweets to the same length
        return_attention_mask=True,    # Generate the attention mask
        return_tensors='pt',           # Return PyTorch tensors
    )

In [15]:
# Hold out 10% of the labelled tweets for validation. Stratifying on the label
# keeps the class ratio of the validation split in line with the full frame,
# which matters because the classes are imbalanced (hence the oversampling below).
train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
    stratify=df['label'],
)

# Oversample the training split only, so the validation split is left untouched
# by the resampling. RandomOverSampler needs a 2-D feature array, hence the
# reshape to a single column before resampling and the flatten afterwards.
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(
    train_df['tweet'].values.reshape(-1, 1),
    train_df['label'],
)
train_resampled_df = pd.DataFrame({
    'tweet': X_train_resampled.flatten(),
    'label': y_train_resampled,
})

In [17]:
class TweetsDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one tweet per item, on demand.

    Args:
        df: DataFrame with a 'tweet' column (text) and a 'label' column
            (integer class id). Rows are addressed by position.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) used to
            encode each tweet.
        max_length: Number of tokens every encoding is padded or truncated to.
    """

    def __init__(self, df, tokenizer, max_length=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Encode the tweet at position ``idx``.

        Returns:
            A dict with flattened 'input_ids' and 'attention_mask' tensors and
            a scalar ``torch.long`` 'labels' tensor.
        """
        # Fetch the row once instead of doing a positional lookup per column.
        row = self.df.iloc[idx]

        # Use the tokenizer and max_length given to this dataset rather than
        # notebook-level globals, so the constructor arguments take effect.
        encoded_tweet = self.tokenizer(
            row['tweet'],
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,           # keep every item the same length
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # The tensors are flattened so the DataLoader can stack items
            # into a batch.
            'input_ids': encoded_tweet['input_ids'].flatten(),
            'attention_mask': encoded_tweet['attention_mask'].flatten(),
            'labels': torch.tensor(int(row['label']), dtype=torch.long),
        }

In [18]:
# Wrap the oversampled training frame and the held-out validation frame.
train_dataset = TweetsDataset(df=train_resampled_df, tokenizer=tokenizer)
val_dataset = TweetsDataset(df=val_df, tokenizer=tokenizer)

# Serve both datasets in batches of 32; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32)

In [8]:
# Build a two-label sequence classifier on top of the pre-trained
# 'bert-base-uncased' checkpoint.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:

def _evaluate(model, data_loader, device):
    """Return (average loss, accuracy) of ``model`` over ``data_loader`` without updating weights."""
    was_training = model.training
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0
    with torch.no_grad():
        for batch in data_loader:
            inputs = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(inputs, attention_mask=attention_mask, labels=labels)
            loss_sum += outputs.loss.item()
            n_correct += (outputs.logits.argmax(dim=1) == labels).sum().item()
            n_seen += labels.numel()
    # Put the model back in whatever mode the caller had it in.
    model.train(was_training)
    return loss_sum / max(len(data_loader), 1), n_correct / max(n_seen, 1)


def fine_tune_bert(train_loader, val_loader, model, epochs, tokenizer):
    """Fine-tune a sequence-classification model and report per-epoch metrics.

    Args:
        train_loader: Iterable of batches; each batch is a dict with
            'input_ids', 'attention_mask' and 'labels' tensors. Must
            support ``len()``.
        val_loader: Iterable of batches in the same format, evaluated after
            every epoch. Pass ``None`` to skip validation.
        model: Model called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        epochs: Number of passes over ``train_loader``.
        tokenizer: Unused; kept so existing calls keep working.

    Returns:
        A list with one dict per epoch holding 'epoch', 'train_loss',
        'train_accuracy', 'val_loss' and 'val_accuracy' (the validation
        entries are ``None`` when ``val_loader`` is ``None``).
    """
    # torch.optim.AdamW replaces the deprecated transformers.AdamW;
    # weight_decay=0.0 mirrors the default of the class it replaces.
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)

    # Batches are moved to wherever the model's weights live, so the function
    # also works when the model was placed on an accelerator beforehand.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    history = []

    # Train model
    for epoch in range(epochs):
        print(f'Epoch {epoch + 1} has started')
        total_loss = 0.0
        n_correct, n_seen = 0, 0
        model.train()

        # Training loop
        for batch in train_loader:
            inputs = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            optimizer.zero_grad()
            outputs = model(inputs, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            # Count correct predictions directly on the tensors
            predictions = outputs.logits.detach().argmax(dim=1)
            n_correct += (predictions == labels).sum().item()
            n_seen += labels.numel()

            total_loss += loss.item()

        # max(..., 1) guards against division by zero on an empty loader
        train_loss = total_loss / max(len(train_loader), 1)
        train_accuracy = n_correct / max(n_seen, 1)

        print(f'Epoch {epoch + 1} has ended. Average Loss: {train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}')

        val_loss, val_accuracy = None, None
        if val_loader is not None:
            val_loss, val_accuracy = _evaluate(model, val_loader, device)
            print(f'Epoch {epoch + 1} validation. Average Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')

        history.append({
            'epoch': epoch + 1,
            'train_loss': train_loss,
            'train_accuracy': train_accuracy,
            'val_loss': val_loss,
            'val_accuracy': val_accuracy,
        })

    return history

# Example usage:
# fine_tune_bert(train_loader, val_loader, model, epochs, tokenizer)


In [25]:
# Number of full passes over the training loader.
epochs = 3

# Start fine-tuning with the loaders, model and tokenizer built above.
fine_tune_bert(
    train_loader=train_loader,
    val_loader=val_loader,
    model=model,
    epochs=epochs,
    tokenizer=tokenizer,
)

Epoch 1 has started
