In [6]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm

In [7]:
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x276b2aaa0d0>

In [8]:
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))


In [9]:
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove user @ references
    text = re.sub(r'\@\w+','', text)
    
    return text

In [10]:
df = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding="ISO-8859-1", 
                 names=["target", "id", "date", "flag", "user", "text"])

In [11]:
df = df.sample(frac=0.1, random_state=42)

In [12]:
df['target'] = df['target'].apply(lambda x: 1 if x == 4 else 0)

In [13]:
df['cleaned_text'] = df['text'].apply(clean_text)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'].values, 
    df['target'].values, 
    test_size=0.2, 
    random_state=42
)

In [15]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [16]:
MAX_LEN = 128

In [17]:
input_ids = []
attention_masks = []

for text in tqdm(X_train, desc="Tokenizing training data"):
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    input_ids.append(encoded['input_ids'])
    attention_masks.append(encoded['attention_mask'])

Tokenizing training data: 100%|██████████| 128000/128000 [00:33<00:00, 3817.56it/s]


In [18]:
train_input_ids = torch.cat(input_ids, dim=0)
train_attention_masks = torch.cat(attention_masks, dim=0)
train_labels = torch.tensor(y_train)

In [19]:
input_ids = []
attention_masks = []

for text in tqdm(X_test, desc="Tokenizing test data"):
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    input_ids.append(encoded['input_ids'])
    attention_masks.append(encoded['attention_mask'])


Tokenizing test data: 100%|██████████| 32000/32000 [00:09<00:00, 3214.72it/s]


In [20]:
test_input_ids = torch.cat(input_ids, dim=0)
test_attention_masks = torch.cat(attention_masks, dim=0)
test_labels = torch.tensor(y_test)

In [21]:
batch_size = 16

In [22]:
train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

In [23]:
test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [24]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")

Using device: cuda


In [26]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

In [28]:
def train_model(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    
    # Training loop
    for batch in tqdm(dataloader, desc="Training"):
        # Unpack the batch and move to device
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)
        
        # Clear gradients
        model.zero_grad()
        
        # Forward pass
        outputs = model(input_ids, 
                       attention_mask=attention_mask, 
                       labels=labels)
        
        loss = outputs.loss
        total_loss += loss.item()
        
        # Backward pass
        loss.backward()
        
        # Update parameters and learning rate
        optimizer.step()
    
    avg_loss = total_loss / len(dataloader)
    return avg_loss

In [None]:
def evaluate_model(model, dataloader, device):
    model.eval()
    predictions = []
    true_labels = []
    
    # Evaluation loop
    for batch in tqdm(dataloader, desc="Evaluating"):
        # Unpack the batch and move to device
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)
        
        # No gradient calculation needed for evaluation
        with torch.no_grad():
            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask)
        
        # Get predictions
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        
        # Store predictions and labels
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())
    
    # Calculate accuracy
    accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions)
    
    return accuracy, report, predictions

In [30]:
epochs = 3
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    avg_loss = train_model(model, train_dataloader, optimizer, device)
    print(f"Average training loss: {avg_loss:.4f}")
    
    # Evaluate after each epoch
    accuracy, report, _ = evaluate_model(model, test_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Classification Report:\n{report}")

# Final evaluation
print("\nFinal model evaluation:")
final_accuracy, final_report, predictions = evaluate_model(model, test_dataloader, device)
print(f"Test Accuracy: {final_accuracy:.4f}")
print(f"Classification Report:\n{final_report}")

Epoch 1/3


Training: 100%|██████████| 8000/8000 [42:59<00:00,  3.10it/s]


Average training loss: 0.3783


Evaluating: 100%|██████████| 2000/2000 [03:01<00:00, 11.00it/s]


Validation Accuracy: 0.8494
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.86      0.85     15878
           1       0.86      0.83      0.85     16122

    accuracy                           0.85     32000
   macro avg       0.85      0.85      0.85     32000
weighted avg       0.85      0.85      0.85     32000

Epoch 2/3


Training: 100%|██████████| 8000/8000 [43:56<00:00,  3.03it/s]


Average training loss: 0.2752


Evaluating: 100%|██████████| 2000/2000 [03:03<00:00, 10.93it/s]


Validation Accuracy: 0.8511
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.83      0.85     15878
           1       0.84      0.87      0.86     16122

    accuracy                           0.85     32000
   macro avg       0.85      0.85      0.85     32000
weighted avg       0.85      0.85      0.85     32000

Epoch 3/3


Training: 100%|██████████| 8000/8000 [43:34<00:00,  3.06it/s]


Average training loss: 0.1749


Evaluating: 100%|██████████| 2000/2000 [03:02<00:00, 10.94it/s]


Validation Accuracy: 0.8471
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.88      0.85     15878
           1       0.87      0.81      0.84     16122

    accuracy                           0.85     32000
   macro avg       0.85      0.85      0.85     32000
weighted avg       0.85      0.85      0.85     32000


Final model evaluation:


Evaluating: 100%|██████████| 2000/2000 [03:03<00:00, 10.92it/s]

Test Accuracy: 0.8471
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.88      0.85     15878
           1       0.87      0.81      0.84     16122

    accuracy                           0.85     32000
   macro avg       0.85      0.85      0.85     32000
weighted avg       0.85      0.85      0.85     32000




