In [2]:
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Load the cleaned dataset CSV from the current working directory.
data = pd.read_csv(filepath_or_buffer="cleaned_data.csv")


In [4]:
# Preview the first five rows to check which columns were loaded.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Category,Date,URL,Content,content_length,cleaned_content,label_encoded,binary_label
0,0,music,2016-09-22,http://www.usmagazine.com/entertainment/news/t...,"By clicking Sign In, you agree to our Terms an...",2509,clicking sign agree term condition read privac...,5,0
1,1,politics,2016-09-24,http://www.heraldscotland.com/opinion/14759994...,\n IF you are watching the Corbyn-Smith show ...,6499,watching corbynsmith show pondering whether sc...,6,1
2,2,mass media,2016-09-24,http://www.notebookreview.com/feature/nbr-flas...,Connect with more active buying teams and shap...,3138,connect active buying team shape decision make...,4,0
3,3,sideshow,2016-09-25,https://www.loc.gov/item/fsa1997018934/PP/,Top of page \n\n\n Back to Search Results\n...,3030,top page back search result content library co...,7,0
4,4,music,2016-09-26,http://musicfeeds.com.au/news/parquet-courts-a...,\n\tBy\n\t\t\t\n\t\t\tMike Hohnen\t\t\n UPDATE...,2188,mike hohnen update parquet court announced mel...,5,0


In [5]:
# One-vs-rest target: rows whose encoded category equals this id become the
# positive class (in the preview above, id 6 is the 'politics' row).
POLITICS_LABEL_ID = 6
is_positive = data['label_encoded'].eq(POLITICS_LABEL_ID)
data['binary_label'] = is_positive.astype('int64')

# Show the resulting class balance.
class_counts = data['binary_label'].value_counts()
print(class_counts)


0    654
1    183
Name: binary_label, dtype: int64


In [6]:
# Count NaN entries in the model input column and in the target column
# with a single pass over both columns.
missing_counts = data[['cleaned_content', 'binary_label']].isna().sum()
missing_cleaned_content = missing_counts['cleaned_content']
missing_label_encoded = missing_counts['binary_label']

print(f"'cleaned_content' miss value：{missing_cleaned_content}")
print(f"'binary label' miss value：{missing_label_encoded}")

'cleaned_content' miss value：0
'binary label' miss value：0


In [7]:
# Keep only rows that actually have text to tokenize, then renumber the index
# so positions run 0..n-1 again.
has_text = data['cleaned_content'].notna()
data = data.loc[has_text].reset_index(drop=True)
print(data.shape[0])

837


In [7]:
# Preview the first five rows again after removing rows without text.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Category,Date,URL,Content,content_length,cleaned_content,label_encoded,binary_label
0,0,music,2016-09-22,http://www.usmagazine.com/entertainment/news/t...,"By clicking Sign In, you agree to our Terms an...",2509,clicking sign agree term condition read privac...,5,0
1,1,politics,2016-09-24,http://www.heraldscotland.com/opinion/14759994...,\n IF you are watching the Corbyn-Smith show ...,6499,watching corbynsmith show pondering whether sc...,6,1
2,2,mass media,2016-09-24,http://www.notebookreview.com/feature/nbr-flas...,Connect with more active buying teams and shap...,3138,connect active buying team shape decision make...,4,0
3,3,sideshow,2016-09-25,https://www.loc.gov/item/fsa1997018934/PP/,Top of page \n\n\n Back to Search Results\n...,3030,top page back search result content library co...,7,0
4,4,music,2016-09-26,http://musicfeeds.com.au/news/parquet-courts-a...,\n\tBy\n\t\t\t\n\t\t\tMike Hohnen\t\t\n UPDATE...,2188,mike hohnen update parquet court announced mel...,5,0


In [10]:
# Persist the frame without writing the row index as an extra column.
# Note: this targets the same path the notebook loaded its data from.
data.to_csv(path_or_buf="cleaned_data.csv", index=False)

In [11]:
from transformers import BertTokenizer

# Instantiate the tokenizer that belongs to the uncased base BERT checkpoint
bert_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

# Define a function to tokenize text data with truncation and padding
def tokenize_data(texts, tokenizer, max_length=512):
    """
    Run a tokenizer over a batch of texts with fixed-length output.

    Every sequence is truncated and padded to exactly ``max_length`` tokens,
    and the tokenizer is asked to return PyTorch tensors.

    Parameters:
        texts (list of str): Texts to tokenize; passed to the tokenizer as-is.
        tokenizer (callable): Tokenizer to invoke (e.g. a BertTokenizer).
        max_length (int): Target sequence length for truncation and padding.
    Returns:
        Whatever the tokenizer call returns (for a BERT tokenizer, a mapping
        that includes input_ids and attention_mask).
    """
    tokenizer_options = {
        "truncation": True,         # cut sequences longer than max_length
        "padding": "max_length",    # pad shorter sequences up to max_length
        "max_length": max_length,
        "return_tensors": "pt",     # request PyTorch tensors
    }
    return tokenizer(texts, **tokenizer_options)

# Encode the whole cleaned_content column in one call
max_length = 512  # fixed sequence length used for truncation/padding
corpus_texts = data['cleaned_content'].tolist()
encodings = tokenize_data(corpus_texts, tokenizer, max_length=max_length)

# Sanity check: both tensors should be (number of documents, max_length)
input_ids_shape = encodings['input_ids'].shape
attention_mask_shape = encodings['attention_mask'].shape
print(f"Shape of input_ids: {input_ids_shape}")
print(f"Shape of attention_mask: {attention_mask_shape}")


Shape of input_ids: torch.Size([837, 512])
Shape of attention_mask: torch.Size([837, 512])


In [12]:
import torch
from torch.utils.data import Dataset

class TextClassificationDataset(Dataset):
    """
    Custom PyTorch Dataset for text classification.

    Pairs each row of the tokenized inputs with its label so that a
    DataLoader can batch them together.
    """
    def __init__(self, encodings, labels):
        """
        Initialize the dataset with tokenized inputs and labels.

        Parameters:
            encodings (mapping): Tokenized input data; each value must be
                indexable along its first dimension (e.g. input_ids and
                attention_mask tensors).
            labels (list or torch.Tensor): Corresponding labels for the data.
        Raises:
            ValueError: If any encoding field has a different number of rows
                than there are labels.
        """
        if isinstance(labels, torch.Tensor):
            # torch.tensor(existing_tensor) emits a copy-construct warning;
            # clone explicitly so the caller's tensor is not aliased either.
            label_tensor = labels.detach().clone().to(dtype=torch.long)
        else:
            label_tensor = torch.tensor(labels, dtype=torch.long)

        # Fail at construction rather than with an opaque IndexError
        # somewhere in the middle of a training epoch.
        for key, val in encodings.items():
            if len(val) != len(label_tensor):
                raise ValueError(
                    f"Encoding field '{key}' has {len(val)} rows but "
                    f"{len(label_tensor)} labels were provided."
                )

        self.encodings = encodings
        self.labels = label_tensor

    def __len__(self):
        """
        Return the total number of samples in the dataset.
        """
        return len(self.labels)

    def __getitem__(self, idx):
        """
        Retrieve a single sample from the dataset.

        Parameters:
            idx (int): Index of the sample to retrieve.
        Returns:
            dict: One entry per encoding field (indexed at ``idx``) plus a
            'labels' entry holding the matching label.
        """
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# Pull the binary target out of the frame as a plain Python list
labels = data['binary_label'].to_list()

# Bundle the tokenized inputs with their labels
dataset = TextClassificationDataset(encodings, labels)

# Quick inspection: size of the dataset and what one item looks like
sample_count = len(dataset)
first_sample = dataset[0]
print(f"Number of samples in the dataset: {sample_count}")
print("Example of a dataset sample:")
print(first_sample)


Number of samples in the dataset: 837
Example of a dataset sample:
{'input_ids': tensor([  101, 22042,  3696,  5993,  2744,  4650,  3191,  9394,  3343,  3696,
         1999,  6279,  2591,  4070,  2180,  2102,  2695,  4070, 20786,  2442,
         2421,  3696,  1999,  6279,  2591,  4070,  2180,  2102,  2695,  4070,
        20786,  2442,  2421,  2064,  2102,  2767,  2208,  5045,  2067, 28997,
         4971,  2047,  2299, 20739,  2491,  2028,  2154,  4971,  3333,  4487,
         2015,  2650,  6136,  2402,  5003,  1051,  9541,  2226,  2226,  2226,
         6160,  2225,  7110,  2102,  2166, 11623,  2994, 12065,  2102,  2175,
         2078,  2707,  2514,  2066,  3190,  2562,  4172,  2072, 19808,  3501,
         3903, 14406,  2208,  9680, 20739,  2491,  2207,  9857,  2244, 18592,
         3128,  4431,  4971, 11155, 13552,  7867, 15969,  4679,  7867,  6884,
         5745, 15994,  9680,  4691,  5598, 15969,  4679,  2123,  2102, 15121,
         2051,  2175,  2852, 10993,  9096,  2123,  2102,  968

In [13]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

labels = data['binary_label']
# The stratified split below pairs label i with dataset item i, so the two
# must line up one-to-one.
assert len(labels) == len(dataset), "labels and dataset are out of sync"

# Single seed shared by the split and the training shuffle so that a
# Restart & Run All reproduces the same batches.
SEED = 42

# Split dataset into training and validation sets
train_size = 0.8  # Proportion of data used for training
train_indices, val_indices = train_test_split(
    list(range(len(dataset))),
    test_size=1 - train_size,
    stratify=labels,       # keep the class ratio identical in both subsets
    random_state=SEED
)

# Subset the dataset for training and validation
train_dataset = torch.utils.data.Subset(dataset, train_indices)
val_dataset = torch.utils.data.Subset(dataset, val_indices)

# Define DataLoaders for training and validation
batch_size = 16  # Define batch size

# Without an explicit generator the shuffle order comes from the global RNG,
# which made the batch order differ between runs despite the seeded split.
shuffle_generator = torch.Generator().manual_seed(SEED)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    generator=shuffle_generator,
)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Check the size of each set
print(f"Number of training samples: {len(train_dataset)}")
print(f"Number of validation samples: {len(val_dataset)}")


Number of training samples: 669
Number of validation samples: 168


In [14]:
from transformers import BertForSequenceClassification

# Number of output classes = number of distinct values in the target column
num_labels = data['binary_label'].nunique(dropna=False)
print("num_labels are", num_labels)

# Pre-trained BERT encoder with a freshly initialised classification head
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
)

# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

print(f"Model loaded and moved to device: {device}")


num_labels are 2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded and moved to device: cpu


In [15]:
# transformers.AdamW is deprecated (and dropped from recent transformers
# releases); the PyTorch implementation is the supported replacement.
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
from torch.nn import CrossEntropyLoss

# Define optimizer.
# eps and weight_decay are pinned to the defaults of the old transformers
# implementation so the optimisation behaviour stays what it was;
# torch.optim.AdamW would otherwise use eps=1e-8 and weight_decay=0.01.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Define loss function (for classification tasks)
loss_fn = CrossEntropyLoss()

# Learning rate scheduler: multiplies the learning rate by 0.1 after every
# 2 calls to scheduler.step() (called once per epoch by the training loop).
# NOTE(review): over 7 epochs this shrinks the rate to 5e-8 — confirm that
# such an aggressive decay is intended.
scheduler = StepLR(optimizer, step_size=2, gamma=0.1)

print("Optimizer, loss function, and scheduler initialized.")


Optimizer, loss function, and scheduler initialized.




In [16]:
from tqdm import tqdm
import torch

# Define the training loop
def train_epoch(model, data_loader, loss_fn, optimizer, device):
    """
    Train the model for one epoch.

    Parameters:
        model (torch.nn.Module): Model whose forward pass accepts
            ``(input_ids, attention_mask=..., labels=...)`` and returns an
            object exposing ``.logits`` (and ``.loss`` when labels are given).
        data_loader (iterable): Yields dict batches with 'input_ids',
            'attention_mask' and 'labels' tensors.
        loss_fn (callable or None): Called as ``loss_fn(logits, labels)`` to
            compute the training loss. Pass ``None`` to fall back to the loss
            the model computes itself from the labels.
        optimizer (torch.optim.Optimizer): Optimizer (e.g., AdamW).
        device (torch.device): Device to run the training on (CPU or GPU).
    Returns:
        float: Mean of the per-batch losses over the epoch, or 0.0 when the
        loader yields no batches.
    """
    model.train()  # Set model to training mode
    total_loss = 0.0
    num_batches = 0

    for batch in tqdm(data_loader, desc="Training"):
        optimizer.zero_grad()  # Clear gradients from the previous step

        # Move batch data to the target device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass. The supplied loss_fn is honoured (previously it was
        # accepted but ignored), so e.g. a class-weighted loss takes effect.
        if loss_fn is None:
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
        else:
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Guard against an empty loader instead of dividing by zero
    if num_batches == 0:
        return 0.0
    return total_loss / num_batches


# Define the validation loop
def eval_model(model, data_loader, loss_fn, device):
    """
    Evaluate the model on the validation set.

    Parameters:
        model (torch.nn.Module): Model whose forward pass accepts
            ``(input_ids, attention_mask=..., labels=...)`` and returns an
            object exposing ``.logits`` (and ``.loss`` when labels are given).
        data_loader (iterable): Yields dict batches with 'input_ids',
            'attention_mask' and 'labels' tensors.
        loss_fn (callable or None): Called as ``loss_fn(logits, labels)`` to
            compute the loss. Pass ``None`` to fall back to the loss the
            model computes itself from the labels.
        device (torch.device): Device to run the evaluation on (CPU or GPU).
    Returns:
        tuple: (mean of the per-batch losses, fraction of correctly predicted
        samples). Both are 0.0 when the loader yields no batches.
    """
    model.eval()  # Set model to evaluation mode
    total_loss = 0.0
    num_batches = 0
    correct_predictions = 0
    total_samples = 0

    with torch.no_grad():  # Disable gradient computation
        for batch in tqdm(data_loader, desc="Evaluating"):
            # Move batch data to the target device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass. The supplied loss_fn is honoured (previously it
            # was accepted but ignored).
            if loss_fn is None:
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
            else:
                outputs = model(input_ids, attention_mask=attention_mask)
                loss = loss_fn(outputs.logits, labels)
            logits = outputs.logits

            total_loss += loss.item()
            num_batches += 1

            # Predicted class = index of the largest logit per sample
            preds = torch.argmax(logits, dim=1)
            correct_predictions += (preds == labels).sum().item()
            total_samples += labels.size(0)

    # Guard against an empty loader instead of dividing by zero
    if num_batches == 0 or total_samples == 0:
        return 0.0, 0.0

    avg_loss = total_loss / num_batches
    accuracy = correct_predictions / total_samples
    return avg_loss, accuracy


In [17]:
# How many full passes to make over the training set
num_epochs = 7  # You can adjust based on your dataset and task

# Per-epoch metrics; each list gains one entry per completed epoch
history = {metric: [] for metric in ("train_loss", "val_loss", "val_accuracy")}

separator = "-" * 30

# Main training loop: train, validate, record, then step the scheduler
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    print(separator)

    # One optimisation pass over the training batches
    train_loss = train_epoch(model, train_loader, loss_fn, optimizer, device)
    print(f"Training loss: {train_loss:.4f}")

    # Held-out performance after this epoch
    val_loss, val_accuracy = eval_model(model, val_loader, loss_fn, device)
    print(f"Validation loss: {val_loss:.4f}")
    print(f"Validation accuracy: {val_accuracy:.4f}")

    # Record this epoch's numbers for later plotting or analysis
    epoch_metrics = {
        "train_loss": train_loss,
        "val_loss": val_loss,
        "val_accuracy": val_accuracy,
    }
    for metric, value in epoch_metrics.items():
        history[metric].append(value)

    # Advance the learning-rate schedule once per epoch (if one is configured)
    if scheduler:
        scheduler.step()

# Summarise the run
print("\nTraining complete!")
best_val_accuracy = max(history['val_accuracy'])
print(f"Best validation accuracy: {best_val_accuracy:.4f}")



Epoch 1/7
------------------------------


Training:  19%|█▉        | 8/42 [05:36<25:21, 44.76s/it]