In [1]:
import pandas as pd
from IPython.display import clear_output
import gc
import sys
import time

In [2]:
df = pd.read_csv('/kaggle/input/youtube-videos-dataset/youtube.csv')
df

Unnamed: 0,link,title,description,category
0,JLZlCZ0,Ep 1| Travelling through North East India | Of...,Tanya Khanijow\n671K subscribers\nSUBSCRIBE\nT...,travel
1,i9E_Blai8vk,Welcome to Bali | Travel Vlog | Priscilla Lee,Priscilla Lee\n45.6K subscribers\nSUBSCRIBE\n*...,travel
2,r284c-q8oY,My Solo Trip to ALASKA | Cruising From Vancouv...,Allison Anderson\n588K subscribers\nSUBSCRIBE\...,travel
3,Qmi-Xwq-ME,Traveling to the Happiest Country in the World!!,Yes Theory\n6.65M subscribers\nSUBSCRIBE\n*BLA...,travel
4,_lcOX55Ef70,Solo in Paro Bhutan | Tiger's Nest visit | Bhu...,Tanya Khanijow\n671K subscribers\nSUBSCRIBE\nH...,travel
...,...,...,...,...
3594,#NAME?,21st Century Challenges: Crash Course European...,CrashCourse\n12.4M subscribers\nSUBSCRIBE\nThe...,history
3595,d-2Trw8bCa0,EU DataViz webinar - Barnaby Skinner - How to ...,Publications Office of the European Union\n3.2...,history
3596,RCKWarkUL,Stone Age Scandinavia: First People In the Nor...,History Time\n619K subscribers\nSUBSCRIBE\n- W...,history
3597,MF6F3BxJIY,AP European History - Interwar Period: Paris P...,Mr. Raymond's Civics and Social Studies Academ...,history


In [3]:
df.value_counts('category')

category
travel       1156
art_music     947
food          903
history       593
Name: count, dtype: int64

In [4]:
labels = set(df['category'].values)
n_labels = len(labels)

label_index_map = {l: i for i, l in enumerate(labels)}
index_label_map = {i: l for l, i in label_index_map.items()}

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from transformers import AutoModel, AutoConfig, AutoTokenizer
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score
import numpy as np

In [7]:
max_seq_length = 256
_pretrained_model = 'google-bert/bert-base-uncased'

config = AutoConfig.from_pretrained(_pretrained_model)
config.update({'output_hidden_states':True})
model = AutoModel.from_pretrained(_pretrained_model, config=config)
tokenizer = AutoTokenizer.from_pretrained(_pretrained_model)

clear_output()

model = model.to('cuda')

In [8]:
hidden_states_cls_dict = {}
for i in range(1, 13):
    hidden_states_cls_dict[f'layer_{i}'] = []

In [9]:
for i in range((len(df)//256)+1):
    texts = df['title'][256*i:256*(i+1)].tolist()
    features = tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        padding='max_length',
        max_length=max_seq_length,
        truncation=True,
        return_tensors='pt',
        return_attention_mask=True
    )
    features.to('cuda')
    with torch.no_grad():
        outputs = model(features['input_ids'], features['attention_mask'])
        all_hidden_states = torch.stack(outputs[2])
        for i in range(1, 13):
            layer_cls_list = []
            for ii in all_hidden_states[i]:
                layer_cls_list.append(ii[0].tolist())
            hidden_states_cls_dict[f'layer_{i}'].extend(layer_cls_list)  
        
        
        del outputs, all_hidden_states, features, layer_cls_list, texts
        gc.collect()
        torch.cuda.empty_cache()


In [10]:
for i in range(1, 13):
    df[f'layer_{i}'] = hidden_states_cls_dict[f'layer_{i}']

In [11]:
df.head(1)

Unnamed: 0,link,title,description,category,layer_1,layer_2,layer_3,layer_4,layer_5,layer_6,layer_7,layer_8,layer_9,layer_10,layer_11,layer_12
0,JLZlCZ0,Ep 1| Travelling through North East India | Of...,Tanya Khanijow\n671K subscribers\nSUBSCRIBE\nT...,travel,"[0.20696742832660675, 0.009053017012774944, -0...","[0.08044908940792084, -0.200675368309021, -0.1...","[0.06040492653846741, -0.22809070348739624, -0...","[0.32965075969696045, -0.6098545789718628, -0....","[0.022259468212723732, -0.7881401181221008, -0...","[0.2084280103445053, -1.0606553554534912, -0.5...","[0.17031367123126984, -1.2389631271362305, -0....","[-0.1554407924413681, -1.0762848854064941, -0....","[-0.24458223581314087, -0.6536688208580017, -0...","[-0.671563446521759, -0.9311846494674683, 0.03...","[-0.6237320899963379, -0.3326026499271393, -0....","[-0.5532158017158508, 0.03186248615384102, 0.0..."


In [12]:
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['category'], random_state=42)

In [17]:
def train(model, criterion, optimizer, train_loader):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    return running_loss / len(train_loader.dataset)

def validate(model, criterion, val_loader):
    model.eval()
    running_loss = 0.0
    y_true = []
    y_pred = []
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())
    return running_loss / len(val_loader.dataset), f1_score(y_true, y_pred, average='macro')

# Only 12th layer

In [66]:
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        batch = self.data.iloc[idx]
        features = torch.tensor(batch['layer_12'], dtype=torch.float)
        label = torch.tensor(label_index_map[batch['category']], dtype=torch.long)
        return features, label
    
batch_size = 128
train_dataset = CustomDataset(train_df)
test_dataset = CustomDataset(test_df)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)


class SimpleModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleModel, self).__init__()
        self.conv = nn.Conv1d(1, 1, kernel_size=3, padding=0, stride=3)
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = x.unsqueeze(1)
        x = self.conv(x).squeeze()
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x
    
    
input_size = 256
hidden_size = 512
output_size = n_labels
lr = 0.001
num_epochs = 100

model = SimpleModel(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = StepLR(optimizer, step_size=1, gamma=0.99) 


for epoch in range(num_epochs):
    train_loss = train(model, criterion, optimizer, train_dataloader)
    val_loss, val_f1 = validate(model, criterion, test_dataloader)
    if (epoch+1)%10==0:
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val F1 (macro): {val_f1:.4f}")
    scheduler.step()

Epoch 10/100, Train Loss: 0.8048, Val Loss: 0.8088, Val F1 (macro): 0.9480
Epoch 20/100, Train Loss: 0.7807, Val Loss: 0.7971, Val F1 (macro): 0.9496
Epoch 30/100, Train Loss: 0.7708, Val Loss: 0.7928, Val F1 (macro): 0.9531
Epoch 40/100, Train Loss: 0.7661, Val Loss: 0.7957, Val F1 (macro): 0.9487
Epoch 50/100, Train Loss: 0.7612, Val Loss: 0.7899, Val F1 (macro): 0.9565
Epoch 60/100, Train Loss: 0.7588, Val Loss: 0.7890, Val F1 (macro): 0.9565
Epoch 70/100, Train Loss: 0.7573, Val Loss: 0.7894, Val F1 (macro): 0.9565
Epoch 80/100, Train Loss: 0.7567, Val Loss: 0.7887, Val F1 (macro): 0.9565
Epoch 90/100, Train Loss: 0.7564, Val Loss: 0.7886, Val F1 (macro): 0.9578
Epoch 100/100, Train Loss: 0.7563, Val Loss: 0.7885, Val F1 (macro): 0.9565


# Only 11th layer

In [67]:
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        batch = self.data.iloc[idx]
        features = torch.tensor(batch['layer_11'], dtype=torch.float)
        label = torch.tensor(label_index_map[batch['category']], dtype=torch.long)
        return features, label
    
batch_size = 128
train_dataset = CustomDataset(train_df)
test_dataset = CustomDataset(test_df)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)


class SimpleModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleModel, self).__init__()
        self.conv = nn.Conv1d(1, 1, kernel_size=3, padding=0, stride=3)
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = x.unsqueeze(1)
        x = self.conv(x).squeeze()
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x
    
    
input_size = 256
hidden_size = 512
output_size = n_labels
lr = 0.001
num_epochs = 100

model = SimpleModel(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = StepLR(optimizer, step_size=1, gamma=0.99) 


for epoch in range(num_epochs):
    train_loss = train(model, criterion, optimizer, train_dataloader)
    val_loss, val_f1 = validate(model, criterion, test_dataloader)
    if (epoch+1)%10==0:
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val F1 (macro): {val_f1:.4f}")
    scheduler.step()

Epoch 10/100, Train Loss: 0.8709, Val Loss: 0.8785, Val F1 (macro): 0.8683
Epoch 20/100, Train Loss: 0.8253, Val Loss: 0.8530, Val F1 (macro): 0.8918
Epoch 30/100, Train Loss: 0.8092, Val Loss: 0.8427, Val F1 (macro): 0.9079
Epoch 40/100, Train Loss: 0.7962, Val Loss: 0.8410, Val F1 (macro): 0.9014
Epoch 50/100, Train Loss: 0.7900, Val Loss: 0.8434, Val F1 (macro): 0.8998
Epoch 60/100, Train Loss: 0.7835, Val Loss: 0.8411, Val F1 (macro): 0.9038
Epoch 70/100, Train Loss: 0.7803, Val Loss: 0.8424, Val F1 (macro): 0.9017
Epoch 80/100, Train Loss: 0.7782, Val Loss: 0.8402, Val F1 (macro): 0.9066
Epoch 90/100, Train Loss: 0.7770, Val Loss: 0.8401, Val F1 (macro): 0.8994
Epoch 100/100, Train Loss: 0.7755, Val Loss: 0.8371, Val F1 (macro): 0.9032


# Only 10th layer

In [68]:
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        batch = self.data.iloc[idx]
        features = torch.tensor(batch['layer_10'], dtype=torch.float)
        label = torch.tensor(label_index_map[batch['category']], dtype=torch.long)
        return features, label
    
batch_size = 128
train_dataset = CustomDataset(train_df)
test_dataset = CustomDataset(test_df)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)


class SimpleModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleModel, self).__init__()
        self.conv = nn.Conv1d(1, 1, kernel_size=3, padding=0, stride=3)
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = x.unsqueeze(1)
        x = self.conv(x).squeeze()
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x
    
    
input_size = 256
hidden_size = 512
output_size = n_labels
lr = 0.001
num_epochs = 100

model = SimpleModel(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = StepLR(optimizer, step_size=1, gamma=0.99) 


for epoch in range(num_epochs):
    train_loss = train(model, criterion, optimizer, train_dataloader)
    val_loss, val_f1 = validate(model, criterion, test_dataloader)
    if (epoch+1)%10==0:
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val F1 (macro): {val_f1:.4f}")
    scheduler.step()

Epoch 10/100, Train Loss: 0.8494, Val Loss: 0.8756, Val F1 (macro): 0.8724
Epoch 20/100, Train Loss: 0.8133, Val Loss: 0.8478, Val F1 (macro): 0.9043
Epoch 30/100, Train Loss: 0.8019, Val Loss: 0.8435, Val F1 (macro): 0.9026
Epoch 40/100, Train Loss: 0.7877, Val Loss: 0.8350, Val F1 (macro): 0.9050
Epoch 50/100, Train Loss: 0.7809, Val Loss: 0.8318, Val F1 (macro): 0.9136
Epoch 60/100, Train Loss: 0.7774, Val Loss: 0.8332, Val F1 (macro): 0.9055
Epoch 70/100, Train Loss: 0.7754, Val Loss: 0.8342, Val F1 (macro): 0.9093
Epoch 80/100, Train Loss: 0.7743, Val Loss: 0.8345, Val F1 (macro): 0.9102
Epoch 90/100, Train Loss: 0.7744, Val Loss: 0.8323, Val F1 (macro): 0.9081
Epoch 100/100, Train Loss: 0.7724, Val Loss: 0.8342, Val F1 (macro): 0.9034


# Only 9th layer

In [69]:
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        batch = self.data.iloc[idx]
        features = torch.tensor(batch['layer_9'], dtype=torch.float)
        label = torch.tensor(label_index_map[batch['category']], dtype=torch.long)
        return features, label
    
batch_size = 128
train_dataset = CustomDataset(train_df)
test_dataset = CustomDataset(test_df)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)


class SimpleModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleModel, self).__init__()
        self.conv = nn.Conv1d(1, 1, kernel_size=3, padding=0, stride=3)
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = x.unsqueeze(1)
        x = self.conv(x).squeeze()
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x
    
    
input_size = 256
hidden_size = 512
output_size = n_labels
lr = 0.001
num_epochs = 100

model = SimpleModel(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = StepLR(optimizer, step_size=1, gamma=0.99) 


for epoch in range(num_epochs):
    train_loss = train(model, criterion, optimizer, train_dataloader)
    val_loss, val_f1 = validate(model, criterion, test_dataloader)
    if (epoch+1)%10==0:
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val F1 (macro): {val_f1:.4f}")
    scheduler.step()

Epoch 10/100, Train Loss: 0.8871, Val Loss: 0.8946, Val F1 (macro): 0.8707
Epoch 20/100, Train Loss: 0.8337, Val Loss: 0.8641, Val F1 (macro): 0.8851
Epoch 30/100, Train Loss: 0.8122, Val Loss: 0.8546, Val F1 (macro): 0.8912
Epoch 40/100, Train Loss: 0.8002, Val Loss: 0.8512, Val F1 (macro): 0.8965
Epoch 50/100, Train Loss: 0.7919, Val Loss: 0.8516, Val F1 (macro): 0.8913
Epoch 60/100, Train Loss: 0.7887, Val Loss: 0.8465, Val F1 (macro): 0.9003
Epoch 70/100, Train Loss: 0.7843, Val Loss: 0.8482, Val F1 (macro): 0.8931
Epoch 80/100, Train Loss: 0.7812, Val Loss: 0.8455, Val F1 (macro): 0.9011
Epoch 90/100, Train Loss: 0.7787, Val Loss: 0.8447, Val F1 (macro): 0.8980
Epoch 100/100, Train Loss: 0.7771, Val Loss: 0.8442, Val F1 (macro): 0.9023


# 11th and 12th layers with aggregation using point-wise conv

In [70]:
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        batch = self.data.iloc[idx]
        features11 = torch.tensor(batch['layer_11'], dtype=torch.float)
        features12 = torch.tensor(batch['layer_12'], dtype=torch.float)
        features = torch.stack([features11, features12])
        label = torch.tensor(label_index_map[batch['category']], dtype=torch.long)
        return features, label
    
batch_size = 128
train_dataset = CustomDataset(train_df)
test_dataset = CustomDataset(test_df)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)


class SimpleModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleModel, self).__init__()
        self.conv_pw = nn.Conv1d(2, 1, kernel_size=1, padding=0, stride=1)
        self.conv = nn.Conv1d(1, 1, kernel_size=3, padding=0, stride=3)
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.conv_pw(x)
        x = self.conv(x).squeeze()
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x
    
    
input_size = 256
hidden_size = 512
output_size = n_labels
lr = 0.001
num_epochs = 100

model = SimpleModel(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = StepLR(optimizer, step_size=1, gamma=0.99) 


for epoch in range(num_epochs):
    train_loss = train(model, criterion, optimizer, train_dataloader)
    val_loss, val_f1 = validate(model, criterion, test_dataloader)
    if (epoch+1)%10==0:
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val F1 (macro): {val_f1:.4f}")
    scheduler.step()

Epoch 10/100, Train Loss: 0.9686, Val Loss: 0.9651, Val F1 (macro): 0.6370
Epoch 20/100, Train Loss: 0.7949, Val Loss: 0.8070, Val F1 (macro): 0.9366
Epoch 30/100, Train Loss: 0.7801, Val Loss: 0.8001, Val F1 (macro): 0.9430
Epoch 40/100, Train Loss: 0.7714, Val Loss: 0.7961, Val F1 (macro): 0.9516
Epoch 50/100, Train Loss: 0.7663, Val Loss: 0.7938, Val F1 (macro): 0.9532
Epoch 60/100, Train Loss: 0.7620, Val Loss: 0.7939, Val F1 (macro): 0.9546
Epoch 70/100, Train Loss: 0.7573, Val Loss: 0.7896, Val F1 (macro): 0.9554
Epoch 80/100, Train Loss: 0.7565, Val Loss: 0.7888, Val F1 (macro): 0.9579
Epoch 90/100, Train Loss: 0.7558, Val Loss: 0.7882, Val F1 (macro): 0.9563
Epoch 100/100, Train Loss: 0.7551, Val Loss: 0.7886, Val F1 (macro): 0.9546


# Last 4 layers with aggregation using point-wise conv

In [72]:
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        batch = self.data.iloc[idx]
        features9 = torch.tensor(batch['layer_9'], dtype=torch.float)
        features10 = torch.tensor(batch['layer_10'], dtype=torch.float)
        features11 = torch.tensor(batch['layer_11'], dtype=torch.float)
        features12 = torch.tensor(batch['layer_12'], dtype=torch.float)
        features = torch.stack([features9, features10, features11, features12])
        label = torch.tensor(label_index_map[batch['category']], dtype=torch.long)
        return features, label
    
batch_size = 128
train_dataset = CustomDataset(train_df)
test_dataset = CustomDataset(test_df)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)


class SimpleModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleModel, self).__init__()
        self.conv_pw = nn.Conv1d(4, 1, kernel_size=1, padding=0, stride=1)
        self.conv = nn.Conv1d(1, 1, kernel_size=3, padding=0, stride=3)
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.conv_pw(x)
        x = self.conv(x).squeeze()
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x
    
    
input_size = 256
hidden_size = 512
output_size = n_labels
lr = 0.001
num_epochs = 100

model = SimpleModel(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = StepLR(optimizer, step_size=1, gamma=0.99) 


for epoch in range(num_epochs):
    train_loss = train(model, criterion, optimizer, train_dataloader)
    val_loss, val_f1 = validate(model, criterion, test_dataloader)
    if (epoch+1)%10==0:
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val F1 (macro): {val_f1:.4f}")
    scheduler.step()

Epoch 10/100, Train Loss: 0.8704, Val Loss: 0.8635, Val F1 (macro): 0.8886
Epoch 20/100, Train Loss: 0.8106, Val Loss: 0.8400, Val F1 (macro): 0.9052
Epoch 30/100, Train Loss: 0.7879, Val Loss: 0.8297, Val F1 (macro): 0.9093
Epoch 40/100, Train Loss: 0.7778, Val Loss: 0.8267, Val F1 (macro): 0.9105
Epoch 50/100, Train Loss: 0.7732, Val Loss: 0.8247, Val F1 (macro): 0.9137
Epoch 60/100, Train Loss: 0.7715, Val Loss: 0.8235, Val F1 (macro): 0.9177
Epoch 70/100, Train Loss: 0.7700, Val Loss: 0.8250, Val F1 (macro): 0.9131
Epoch 80/100, Train Loss: 0.7679, Val Loss: 0.8245, Val F1 (macro): 0.9183
Epoch 90/100, Train Loss: 0.7674, Val Loss: 0.8248, Val F1 (macro): 0.9164
Epoch 100/100, Train Loss: 0.7658, Val Loss: 0.8260, Val F1 (macro): 0.9114
