In [2]:
try:
    import transformers
except ImportError:
    !pip install transformers -q
    import transformers

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from torch import nn
import os
from google.colab import files
from torch.optim import AdamW

# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


Using device: cuda


In [3]:
# Load the labelled dataset and preview its last rows.
# NOTE(review): the path lives under /content/drive, so this presumably needs
# Google Drive to be mounted by an earlier cell — confirm.
dataset_path = '/content/drive/MyDrive/Colab Notebooks/FearFactor/data/fear_factor_dataset.csv'
df = pd.read_csv(dataset_path)
df.tail()

Unnamed: 0,text,fear,stress,morale,trust
1885,Soldiers express low morale after extended dep...,-0.644562,-0.704294,0.87572,0.55445
1886,Public trust increases as clear communication ...,-0.675916,-0.668007,0.663791,0.669481
1887,A viral post falsely claims that water supplie...,-0.944199,-0.854009,0.991415,0.040135
1888,Citizens feel hopeful as relief convoys arrive...,-0.787442,-0.908382,0.048531,0.096918
1889,Disinformation campaigns create fear among vil...,0.246219,-0.65981,0.828001,0.850298


In [4]:
# Hyperparameters
MAX_LEN = 128      # Maximum tokens per sentence
BATCH_SIZE = 16    # Reduce to 8 if you run out of GPU memory
EPOCHS = 4         # BERT learns fast; 3-5 epochs is usually enough
LEARNING_RATE = 2e-5
SEED = 42          # Seed for the NumPy and PyTorch random number generators

# Seed the global RNGs before any model is built so that the randomly
# initialised regression head and the dropout masks are repeatable across
# "Restart & Run All" runs (GPU kernels may still add small nondeterminism).
np.random.seed(SEED)
torch.manual_seed(SEED)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
class FSMTDataset(Dataset):
    """Dataset that pairs each text with its row of regression targets.

    Items are tokenized lazily, on access, with the supplied tokenizer and are
    padded or truncated to ``max_len`` tokens.
    """

    def __init__(self, texts, targets, tokenizer, max_len):
        # texts/targets are indexed with the same position in __getitem__.
        self.texts, self.targets = texts, targets
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of samples, taken from the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return a dict with the raw text, token ids, attention mask and targets."""
        sample_text = str(self.texts[item])
        label_row = self.targets[item]

        tokenizer_kwargs = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(sample_text, **tokenizer_kwargs)

        sample = {'text': sample_text}
        # Flatten the tokenizer tensors to 1-D (presumably they come back with
        # a leading batch dimension of 1 because of return_tensors='pt').
        for key in ('input_ids', 'attention_mask'):
            sample[key] = encoded[key].flatten()
        sample['targets'] = torch.tensor(label_row, dtype=torch.float)
        return sample

# Model inputs (raw sentences) and the four regression targets per sentence
target_columns = ['fear', 'stress', 'morale', 'trust']
texts = df['text'].values
targets = df[target_columns].values

# Hold out 10% of the rows for validation; random_state pins the split
split_parts = train_test_split(texts, targets, test_size=0.1, random_state=42)
train_texts, val_texts, train_targets, val_targets = split_parts

# Create DataLoaders
def create_data_loader(texts, targets, tokenizer, max_len, batch_size, shuffle=False):
    """Wrap texts and targets in an FSMTDataset and return a DataLoader over it.

    Args:
        texts: Indexable collection of input sentences.
        targets: Indexable collection of target rows, aligned with ``texts``.
        tokenizer: Tokenizer handed to FSMTDataset.
        max_len: Token length every sample is padded/truncated to.
        batch_size: Number of samples per batch.
        shuffle: When True the loader draws a new sample order every epoch.
            Defaults to False, which preserves the previous behaviour.

    Returns:
        A DataLoader that yields batches using two worker processes.
    """
    ds = FSMTDataset(
        texts=texts,
        targets=targets,
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, num_workers=2)

# Training batches are reshuffled every epoch so the model does not see the
# exact same batch composition and order each time; validation order stays fixed.
train_data_loader = create_data_loader(
    train_texts, train_targets, tokenizer, MAX_LEN, BATCH_SIZE, shuffle=True
)
val_data_loader = create_data_loader(val_texts, val_targets, tokenizer, MAX_LEN, BATCH_SIZE)

print(f"Train batches: {len(train_data_loader)}")
print(f"Val batches: {len(val_data_loader)}")

Train batches: 107
Val batches: 12


In [6]:
class FSMTRegressor(nn.Module):
    """Pre-trained BERT encoder followed by dropout and a linear regression head."""

    def __init__(self, n_classes):
        super().__init__()
        # Pre-trained encoder weights
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Dropout (p=0.3) applied to the pooled representation to limit overfitting
        self.drop = nn.Dropout(p=0.3)
        # Linear head: encoder hidden size -> n_classes outputs
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the head's output for the encoder's pooled sentence representation."""
        encoder_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sentence_repr = self.drop(encoder_output.pooler_output)
        return self.out(sentence_repr)

# Build the four-output regressor and move it onto the selected device
model = FSMTRegressor(n_classes=4).to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [8]:
# The learning-rate schedule spans every batch of every epoch
total_steps = EPOCHS * len(train_data_loader)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Linear schedule over total_steps with no warm-up steps
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

# Mean Squared Error for Regression
loss_fn = nn.MSELoss()
loss_fn = loss_fn.to(device)

def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    Each batch is a mapping with "input_ids", "attention_mask" and "targets"
    tensors. Gradients are clipped to a norm of 1.0, then the optimizer and
    the scheduler are each stepped once per batch. ``n_examples`` is accepted
    but not used.
    """
    model = model.train()
    batch_losses = []

    for batch in data_loader:
        on_device = {
            key: batch[key].to(device)
            for key in ("input_ids", "attention_mask", "targets")
        }

        predictions = model(
            input_ids=on_device["input_ids"],
            attention_mask=on_device["attention_mask"],
        )
        batch_loss = loss_fn(predictions, on_device["targets"])
        batch_losses.append(batch_loss.item())

        batch_loss.backward()
        # Cap the gradient norm before the parameter update
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        # Clear gradients so they do not accumulate into the next batch
        optimizer.zero_grad()

    return np.mean(batch_losses)

def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Return the mean batch loss over ``data_loader`` without updating the model.

    The model is switched to eval mode and every forward pass runs under
    ``torch.no_grad()``. ``n_examples`` is accepted but not used.
    """
    model = model.eval()

    def batch_loss(batch):
        # Forward pass and scalar loss for one batch
        predictions = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        )
        return loss_fn(predictions, batch["targets"].to(device)).item()

    with torch.no_grad():
        batch_losses = [batch_loss(batch) for batch in data_loader]

    return np.mean(batch_losses)


In [9]:
print("Starting Training...")

# Example counts are forwarded to the helpers to match their signatures
n_train_examples, n_val_examples = len(train_texts), len(val_texts)

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    # One pass over the training batches, then a pass over the validation batches
    train_loss = train_epoch(
        model, train_data_loader, loss_fn, optimizer, device, scheduler, n_train_examples
    )
    val_loss = eval_model(model, val_data_loader, loss_fn, device, n_val_examples)

    print(f'Train loss (MSE): {train_loss:.4f}')
    print(f'Val   loss (MSE): {val_loss:.4f}')
    print()

Starting Training...
Epoch 1/4
----------
Train loss (MSE): 0.1434
Val   loss (MSE): 0.0745

Epoch 2/4
----------
Train loss (MSE): 0.0917
Val   loss (MSE): 0.0605

Epoch 3/4
----------
Train loss (MSE): 0.0752
Val   loss (MSE): 0.0562

Epoch 4/4
----------
Train loss (MSE): 0.0653
Val   loss (MSE): 0.0479



In [10]:
# Inference function
def predict_fsmt_bert(text):
    """Predict the four FSMT scores for a single piece of text.

    Uses the notebook-level ``tokenizer``, ``model``, ``device`` and
    ``MAX_LEN``. The model is forced into eval mode for the forward pass so
    that dropout is disabled regardless of the mode it was left in, and its
    previous mode is restored before returning.

    Args:
        text: The sentence to score.

    Returns:
        dict mapping "Fear", "Stress", "Morale" and "Trust" to the
        corresponding model output value.
    """
    encoded_text = tokenizer.encode_plus(
        text,
        max_length=MAX_LEN,
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded_text['input_ids'].to(device)
    attention_mask = encoded_text['attention_mask'].to(device)

    # Without eval mode, active dropout would make predictions vary between calls.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(input_ids, attention_mask)
            # Single input text, so take the first (only) row of the batch
            prediction = output.cpu().numpy()[0]
    finally:
        # Leave the model in whatever mode the caller had it in
        model.train(was_training)

    return {
        "Fear": prediction[0],
        "Stress": prediction[1],
        "Morale": prediction[2],
        "Trust": prediction[3]
    }

# Smoke-test the inference helper on one hand-written sentence
sample_text = "The enemy forces have retreated, and food supplies are finally arriving."
results = predict_fsmt_bert(sample_text)

print(f"Input: {sample_text}\n")
for label, score in results.items():
    print(f"{label}: {score:.4f}")

Input: The enemy forces have retreated, and food supplies are finally arriving.

Fear: -0.4517
Stress: -0.8871
Morale: 0.8240
Trust: 0.7467


In [11]:
# 1. Create a directory to hold the model files
output_dir = './fsmt_model_saved'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# 2. Save the model weights (The learned parameters)
model_save_path = os.path.join(output_dir, 'fsmt_bert_model.bin')
torch.save(model.state_dict(), model_save_path)

# 3. Save the tokenizer (MUST match the model)
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")

# 4. Zip the folder
!zip -r fsmt_model.zip {output_dir}

# 5. Download the zip file to your local computer
print("Downloading fsmt_model.zip...")
files.download('fsmt_model.zip')

Model and tokenizer saved to ./fsmt_model_saved
  adding: fsmt_model_saved/ (stored 0%)
  adding: fsmt_model_saved/tokenizer_config.json (deflated 75%)
  adding: fsmt_model_saved/fsmt_bert_model.bin (deflated 7%)
  adding: fsmt_model_saved/special_tokens_map.json (deflated 42%)
  adding: fsmt_model_saved/vocab.txt (deflated 53%)
Downloading fsmt_model.zip...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>