In [13]:
# Install necessary libraries
%pip install pandas torch transformers scikit-learn nltk

Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Downloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   -------------------- ------------------- 0.8/1.5 MB 6.9 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 6.2 MB/s  0:00:00
Installing collected packages: nltk
Successfully installed nltk-3.9.2
Note: you may need to restart the kernel to use updated packages.




# Step-by-step guide to preprocessing data

This reading will guide you through the following steps:

- **Step 1:** Data Preprocessing
- **Step 2:** Clean the text 
- **Step 3:** Tokenize
- **Step 4:** Handle missing data
- **Step 5:** Prepare the data for fine-tuning
- **Step 6:** Split the data

## Step 1: Data Preprocessing

Before diving into the cleaning and tokenization processes, it's essential to import and organize the raw data into a structured format. We begin by loading the dataset, defining necessary labels, and preparing the initial dataset.

In [14]:
import pandas as pd
import torch

# Load the dataset from the URL
url = "https://huggingface.co/datasets/stepp1/tweet_emotion_intensity/resolve/main/train.csv"
data = pd.read_csv(url)

print("Dataset loaded successfully.")
print(data.head())

# Preprocessing for this specific dataset:
# 1. Rename 'tweet' to 'text' so it works with our cleaning function later
if 'tweet' in data.columns:
    data = data.rename(columns={'tweet': 'text'})

# 2. Create numeric labels from the 'emotion' column
if 'emotion' in data.columns:
    # Create a mapping from emotion string to number (e.g., anger -> 0, fear -> 1)
    label_mapping = {label: idx for idx, label in enumerate(data['emotion'].unique())}
    data['label'] = data['emotion'].map(label_mapping)
    print(f"\nLabel mapping applied: {label_mapping}")

# Convert labels to PyTorch tensor
if 'label' in data.columns:
    labels = torch.tensor(data['label'].tolist())

print(f"\nTotal samples: {len(data)}")

Dataset loaded successfully.
      id                                              tweet    class  \
0  40815  Loved @Bethenny independence msg on @WendyWill...     fear   
1  10128  @mark_slifer actually maybe we were supposed t...  sadness   
2  40476  I thought the nausea and headaches had passed ...     fear   
3  20813  Anger, resentment, and hatred are the destroye...    anger   
4  40796  new tires &amp; an alarm system on my car. fwm...     fear   

  sentiment_intensity class_intensity  labels  
0                 low        fear_low       4  
1                high    sadness_high       9  
2              medium     fear_medium       5  
3                high      anger_high       0  
4                 low        fear_low       4  

Total samples: 3960


## Step 2: Clean the text

Text cleaning is the first step in preparing your dataset. It involves removing unwanted characters, URLs, and excess whitespace to ensure uniformity and cleanliness in the data. Text is also changed to lowercase to maintain consistency across all data points.

**Explanation:**
Cleaning the text by removing unnecessary characters and formatting it ensures that the data is consistent, making it easier for the model to understand.

In [15]:
import re

# Function to clean the text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespaces
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

# Apply cleaning function to your dataset
data['cleaned_text'] = data['text'].apply(clean_text)
print(data['cleaned_text'].head())

0    loved bethenny independence msg on wendywillia...
1    mark_slifer actually maybe we were supposed to...
2    i thought the nausea and headaches had passed ...
3    anger resentment and hatred are the destroyer ...
4      new tires amp an alarm system on my car fwm now
Name: cleaned_text, dtype: object


### Step 3: Tokenize

Tokenization is the process of converting text into individual tokens that a machine-learning model can understand. We use the tokenizer corresponding to the pretrained model (e.g., BERT) for this. This ensures that the data is properly formatted and ready for fine-tuning.

**Explanation:**
Tokenization converts the cleaned text into a format suitable for fine-tuning the model, ensuring that the input is ready for training.

In [16]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the cleaned text
tokens = tokenizer(
    data['cleaned_text'].tolist(), padding=True, truncation=True, return_tensors='pt', max_length=128
)

print(tokens['input_ids'][:5])  # Check the first 5 tokenized examples

tensor([[  101,  3866,  7014,  2368,  4890,  4336,  5796,  2290,  2006, 12815,
         29602,  6632,  5244,  2022,  3407, 23713, 16829,  2306,  4426, 23713,
         13433, 28032,  7730,  2097, 19311,  2000,  2017,  3407,  2981,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101,  2928,  1035, 22889, 23780,  2941,  2672,  2057,  2020,  4011,
          2000,  3280,  1998,  2026, 13445,  5552,  2256,  3268, 27451,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101,  1045,  2245,  1996, 19029,  1998, 14978,  2015,  2018,  2979,
          2021,  8840,  2140,  1045,  2514,  9643,  2651,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     

### Step 4: Handle missing data

Missing data is common in real-world datasets. You can handle it either by removing incomplete entries or by imputing missing values. This step is critical to preventing errors during the training process.

**Explanation:**
Handling missing data ensures that your dataset is complete, which prevents training interruptions or biases introduced by missing information.

In [18]:
# Check for missing data
print(data.isnull().sum())

# Option 1: Drop rows with missing data
# CRITICAL: We comment this out because dropping rows changes the dataframe length,
# but our 'tokens' variable (created in Step 3) still has the original length.
# This would cause a mismatch error in Step 5.
# data = data.dropna() 

# Option 2: Fill missing values with a placeholder (Safer)
data['cleaned_text'] = data['cleaned_text'].fillna('missing')

id                     0
text                   0
class                  0
sentiment_intensity    0
class_intensity        0
labels                 0
cleaned_text           0
dtype: int64


#### Explanation of code

- **`synonym_replacement`**: This function uses the `nltk` libraryâ€™s wordnet to retrieve synonyms of a given word. If synonyms are available, it randomly selects one. If not, the original word is returned.
- **`augment_text`**: This function iterates through each word in the text, replacing it with a synonym based on a random probability (here, a 20 percent chance for each word).
- **Applying augmentation**: We apply the `augment_text` function to the cleaned text in the dataset, creating a new column, `augmented_text`, which contains the augmented text samples.

In [19]:
# Import necessary modules
import random # Random module for generating random numbers and selections
import nltk
from nltk.corpus import wordnet # NLTK's WordNet corpus for finding synonyms

# Download WordNet data
nltk.download('wordnet')
nltk.download('omw-1.4')

# Define a function to find and replace a word with a synonym
def synonym_replacement(word):
    # Get all synsets (sets of synonyms) for the given word from WordNet
    synonyms = wordnet.synsets(word)
    
    # If the word has synonyms, randomly choose one synonym, otherwise return the original word
    if synonyms:
        synonym = synonyms[0].lemmas()[0].name()
        return synonym
    return word

def augment_text(text):
    words = text.split()
    new_words = [synonym_replacement(word) if random.random() < 0.2 else word for word in words]
    return ' '.join(new_words)

# Apply augmentation to create a new column
# We apply this to a small subset or the whole dataset depending on needs
# For demonstration, let's apply it to the first 5 rows
print("Original:", data['cleaned_text'].iloc[0])
print("Augmented:", augment_text(data['cleaned_text'].iloc[0]))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\amansahni\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\amansahni\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\amansahni\AppData\Roaming\nltk_data...


Original: loved bethenny independence msg on wendywilliams be happy amp fulfilled within yourself amp positivity will flock to you happy independent
Augmented: loved bethenny independence msg on wendywilliams be happy amp carry_through within yourself amp positivity will flock to you happy independent
Augmented: loved bethenny independence msg on wendywilliams be happy amp carry_through within yourself amp positivity will flock to you happy independent


### Data Augmentation

In certain cases, especially when data is limited, data augmentation techniques can be applied to generate new training examples by modifying the original dataset.

- **Paraphrasing:** Rewriting sentences in different ways while preserving the meaning.
- **Backtranslation:** Translating text into another language and back again to create variation.
- **Synonym replacement:** Replacing certain words in the text with their synonyms.

#### Code example for synonym replacement (augmentation)

The following example demonstrates how to implement synonym replacement using the `nltk` library. It randomly replaces words in the text with their synonyms to create new variations of sentences. This method can be applied when paraphrasing or backtranslation is not feasible.

### Step 5: Prepare the data for fine-tuning

After cleaning and tokenizing your text, the next step is to prepare the data for fine-tuning. This involves structuring the tokenized data and labels into a format suitable for training, such as PyTorch DataLoader objects.

# Example code

In [20]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Define a mapping function for sentiment intensity
def map_sentiment(value):
    if value == "high":
        return 1.0
    elif value == "medium":
        return 0.5
    elif value == "low":
        return 0.0
    return None

# Apply the mapping function to the 'intensity' column (assuming it exists in this dataset)
# Note: The dataset loaded in Step 1 has an 'intensity' column.
if 'intensity' in data.columns:
    data['sentiment_intensity'] = data['intensity'].apply(map_sentiment)
    
    # Drop rows where sentiment_intensity is None (invalid values)
    data = data.dropna(subset=['sentiment_intensity'])
    
    # Convert to tensor
    labels = torch.tensor(data['sentiment_intensity'].tolist())
    print("Labels created from 'intensity' column.")
else:
    # Fallback if 'intensity' column is missing (using previous logic)
    print("'intensity' column not found. Using 'label' column if available.")
    if 'label' in data.columns:
         labels = torch.tensor(data['label'].tolist())

# Re-tokenize to ensure alignment after potential dropna
tokens = tokenizer(
    data['cleaned_text'].tolist(), 
    padding=True, 
    truncation=True, 
    return_tensors='pt', 
    max_length=128
)

input_ids = tokens['input_ids']
attention_masks = tokens['attention_mask']

# Create a DataLoader for training
dataset = TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

print("DataLoader created successfully!")

'intensity' column not found. Using 'label' column if available.
DataLoader created successfully!
DataLoader created successfully!


### Step 6: Split the data

Before training, itâ€™s important to split your data into training, validation, and test sets. The training set is used to train the model, the validation set helps to tune model hyperparameters, and the test set is used for final evaluation to ensure that the model generalizes well to unseen data.

In [21]:
from sklearn.model_selection import train_test_split

# First split: 15% for test set, the rest for training/validation
train_val_inputs, test_inputs, train_val_masks, test_masks, train_val_labels, test_labels = train_test_split(
    input_ids, attention_masks, labels, test_size=0.15, random_state=42
)

# Second split: 20% for validation set from remaining data
train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
    train_val_inputs, train_val_masks, train_val_labels, test_size=0.20, random_state=42
)

# Create DataLoader objects for training, validation, and test sets
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
val_dataset = TensorDataset(val_inputs, val_masks, val_labels)
test_dataset = TensorDataset(test_inputs, test_masks, test_labels)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16)
test_dataloader = DataLoader(test_dataset, batch_size=16)

print("DataLoader objects for training, validation, and test sets created successfully!")

DataLoader objects for training, validation, and test sets created successfully!


### Explanation

The `train_test_split` method from the `sklearn.model_selection` module splits your data into training and validation (or test) sets. Here's a breakdown of how it works:

- **`input_ids` and `labels`**: These are the inputs and labels you are splitting.
- **`test_size=0.1`**: This indicates that 10 percent of the data will be set aside for the test set.
- **`random_state=42`**: This ensures the split is reproducibleâ€”using the same random state will produce the same split every time.

In this case, we first split the data into two sets:
1. **`train_val_inputs` and `test_inputs`**: A combined set of training + validation data and a test set.

Then, we further split the `train_val_inputs` into `train_inputs` and `val_inputs` to get a separate validation set.

This process allows us to train, validate, and test data.

### Conclusion

Following this walkthrough, youâ€™ve cleaned, tokenized, and structured your dataset for fine-tuning. With clean and well-prepared data, your model will have the best chance of achieving high performance during fine-tuning. You can use these preprocessing steps in your machine-learning projects to ensure optimal results.

## Step 7: Fine-tune the Model

Now that our data is prepared, we can proceed to fine-tune the BERT model. We will use the `Trainer` API from Hugging Face, which simplifies the training loop.

**Key Components:**
- **Model:** We load `bert-base-uncased` with a classification head.
- **Training Arguments:** We define hyperparameters like learning rate, batch size, and number of epochs.
- **Trainer:** Handles the training and evaluation process.

In [25]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset
import torch

# 1. Load the model
# We are doing regression (predicting a float value 0.0 - 1.0), so num_labels=1
# Explicitly set problem_type="regression" to ensure correct loss function (MSELoss)
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1, problem_type="regression")

# 2. Define Metrics Function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.flatten()
    
    # For regression, we can use Mean Squared Error (MSE)
    mse = ((preds - labels) ** 2).mean()
    return {
        'mse': mse,
    }

# 3. Define Training Arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    eval_strategy="epoch",           # Evaluate every epoch
    save_strategy="epoch",           # Save checkpoint every epoch
    load_best_model_at_end=True,     # Load the best model when finished
)

# 4. Initialize Trainer
# Note: We need to wrap our TensorDatasets into a format Trainer expects (dictionary with keys)
# Or simpler: Use the standard Dataset object which we didn't use earlier. 
# Since we have TensorDatasets, we can write a small wrapper or just use a custom loop.
# HOWEVER, for simplicity and standard practice, let's convert our Tensors back to HF Datasets quickly.

def create_hf_dataset(inputs, masks, labels):
    # Ensure labels are float32 for regression
    # Ensure inputs are long for embedding layers
    return Dataset.from_dict({
        'input_ids': inputs,
        'attention_mask': masks,
        'labels': labels.to(torch.float32) 
    })

train_hf_ds = create_hf_dataset(train_inputs, train_masks, train_labels)
val_hf_ds = create_hf_dataset(val_inputs, val_masks, val_labels)
test_hf_ds = create_hf_dataset(test_inputs, test_masks, test_labels)

trainer = Trainer(
    model=model,                         # the instantiated ðŸ¤— Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_hf_ds,           # training dataset
    eval_dataset=val_hf_ds,              # evaluation dataset
    compute_metrics=compute_metrics      # the callback that computes metrics of interest
)

# 5. Train the model
print("Starting training...")
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...




Epoch,Training Loss,Validation Loss,Mse
1,0.0044,0.000305,0.000305
2,0.0013,0.000332,0.000332
3,0.0009,4.6e-05,4.6e-05




TrainOutput(global_step=507, training_loss=0.004114732400677672, metrics={'train_runtime': 4045.7894, 'train_samples_per_second': 1.996, 'train_steps_per_second': 0.125, 'total_flos': 178455526558488.0, 'train_loss': 0.004114732400677672, 'epoch': 3.0})

## Step 8: Evaluate the Model

After training, it is crucial to evaluate the model's performance to ensure it has learned effectively and is not overfitting. We will visualize the training loss and validation metrics.

**What to look for:**
- **Training Loss:** Should decrease over time.
- **Validation Loss (MSE):** Should also decrease. If it starts rising, the model might be overfitting.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Extract training history
history = trainer.state.log_history
df_history = pd.DataFrame(history)

# Filter for training loss and eval loss
train_loss = df_history[df_history['loss'].notna()][['epoch', 'loss']]
eval_loss = df_history[df_history['eval_loss'].notna()][['epoch', 'eval_loss']]

# Plotting
plt.figure(figsize=(10, 6))
plt.plot(train_loss['epoch'], train_loss['loss'], label='Training Loss')
plt.plot(eval_loss['epoch'], eval_loss['eval_loss'], label='Validation Loss (MSE)', marker='o')

plt.title('Training and Validation Loss Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss (MSE)')
plt.legend()
plt.grid(True)
plt.show()

# Final Evaluation on Test Set
print("\nEvaluating on Test Set...")
test_results = trainer.evaluate(test_hf_ds)
print(f"Test MSE: {test_results['eval_mse']:.4f}")

## Step 9: Test with Random Examples

Finally, let's see the model in action! We will pick a few random examples from the test set, print the text, the actual intensity, and the model's predicted intensity.

In [None]:
import random

# Function to predict intensity for a single text
def predict_intensity(text):
    # Tokenize
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    # Move to same device as model
    inputs = {k: v.to(trainer.model.device) for k, v in inputs.items()}
    
    # Predict
    with torch.no_grad():
        outputs = trainer.model(**inputs)
    
    # Get prediction (it's a regression value)
    prediction = outputs.logits.item()
    return prediction

# Pick 5 random samples from the original dataframe (to get the text back easily)
# We use the 'test_inputs' indices if we tracked them, but for simplicity, let's just pick random texts 
# and see what the model thinks, comparing to our intuition.

print("--- Random Prediction Showcase ---")
samples = data.sample(5)

for index, row in samples.iterrows():
    text = row['cleaned_text']
    actual_intensity = row['sentiment_intensity']
    predicted_intensity = predict_intensity(text)
    
    print(f"Text: {text}")
    print(f"Actual Intensity: {actual_intensity}")
    print(f"Predicted Intensity: {predicted_intensity:.4f}")
    print("-" * 30)