In [17]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import ViTModel, ViTFeatureExtractor
from transformers import BertTokenizer, BertModel
import re
import glob

# Set device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load dataset
df = pd.read_csv('/kaggle/input/multilingual-meme-datasets/final_datasets.csv')
print(f"Dataset loaded with shape: {df.shape}")

# Image directory
image_dir = "/kaggle/input/multilingual-meme-datasets/datasets/datasets"

# Create a mapping between dataset 'name' and actual image filenames
def create_image_mapping(dataframe, image_directory):
    """
    Create a mapping between dataset 'name' column and actual image filenames in the directory.
    This handles the case where image filenames might have different formats.
    """
    # Get all image files in the directory
    image_files = glob.glob(os.path.join(image_directory, '*.*'))
    image_mapping = {}
    
    # Create a set of all available image filenames (without path)
    available_images = {os.path.basename(f) for f in image_files}
    print(f"Found {len(available_images)} images in directory")
    
    # Print some sample images to understand the naming pattern
    print("Sample image filenames:", list(available_images)[:5])
    
    # Print some sample names from the dataset
    print("Sample names from dataset:", dataframe['name'].iloc[:5].tolist())
    
    # Method 1: Exact match
    for name in dataframe['name'].unique():
        if name in available_images:
            image_mapping[name] = name
    
    # Method 2: Check if the id is part of the filename
    unmapped_names = set(dataframe['name']) - set(image_mapping.keys())
    for name in unmapped_names:
        # Extract ID from name (assuming name has some ID pattern)
        id_match = re.search(r'\d+', name)
        if id_match:
            id_value = id_match.group()
            # Look for files containing this ID
            matching_files = [f for f in available_images if id_value in f]
            if matching_files:
                image_mapping[name] = matching_files[0]
    
    # Method 3: Try matching using 'ids' or 'id' column if available
    if 'ids' in dataframe.columns or 'id' in dataframe.columns:
        id_col = 'ids' if 'ids' in dataframe.columns else 'id'
        id_to_name = dict(zip(dataframe[id_col], dataframe['name']))
        
        for id_value, name in id_to_name.items():
            if name not in image_mapping:
                # Look for files containing this ID
                matching_files = [f for f in available_images if str(id_value) in f]
                if matching_files:
                    image_mapping[name] = matching_files[0]
    
    print(f"Successfully mapped {len(image_mapping)} out of {len(dataframe['name'].unique())} unique names")
    return image_mapping

# Custom dataset class with image mapping
class HarmfulContentDataset(Dataset):
    def __init__(self, dataframe, image_dir, feature_extractor, tokenizer, image_mapping=None, max_len=128):
        self.dataframe = dataframe
        self.image_dir = image_dir
        self.feature_extractor = feature_extractor
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.image_mapping = image_mapping or {}
        
        # Create a default blank image to use when an image is not found
        self.blank_image = Image.new('RGB', (224, 224), color='white')
        
        # Image transformation
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
        ])
    
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        
        # Text features
        text_features = f"{row['gender']} {row['age']} {row['age_bucket']} {row['dominant_emotion']} {row['dominant_race']} {row['translated_text']}"
        
        # Tokenize text
        encoding = self.tokenizer.encode_plus(
            text_features,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        # Image processing
        try:
            # Get the correct image filename using the mapping
            image_filename = self.image_mapping.get(row['name'], row['name'])
            image_path = os.path.join(self.image_dir, image_filename)
            
            # Check if the file exists
            if os.path.exists(image_path):
                image = Image.open(image_path).convert('RGB')
            else:
                # Try alternate approach: check if the file exists with different extensions
                base_name = os.path.splitext(image_path)[0]
                for ext in ['.jpg', '.jpeg', '.png', '.gif']:
                    alt_path = base_name + ext
                    if os.path.exists(alt_path):
                        image = Image.open(alt_path).convert('RGB')
                        break
                else:
                    # If still not found, use blank image
                    image = self.blank_image
                    if idx % 100 == 0:  # Limit logging to avoid flooding
                        print(f"Image not found for {row['name']}, using blank image")
            
            # Process image for ViT
            image_features = self.feature_extractor(images=image, return_tensors="pt")
            pixel_values = image_features.pixel_values.squeeze()
            
        except Exception as e:
            if idx % 100 == 0:  # Limit logging
                print(f"Error processing image for {row['name']}: {e}")
            # Create blank image features
            pixel_values = torch.zeros((3, 224, 224))
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'pixel_values': pixel_values,
            'labels': torch.tensor(row['label'], dtype=torch.long)
        }

# Custom model class (ViT + BERT + classification head)
class MultimodalClassifier(nn.Module):
    def __init__(self, num_classes=2):
        super(MultimodalClassifier, self).__init__()
        
        # Load pre-trained Vision Transformer
        self.vit = ViTModel.from_pretrained("google/vit-base-patch16-224")
        
        # Load pre-trained BERT
        self.bert = BertModel.from_pretrained("bert-base-multilingual-cased")
        
        # Freeze ViT weights
        for param in self.vit.parameters():
            param.requires_grad = False
            
        # Unfreeze the last few layers of ViT
        for param in self.vit.encoder.layer[-2:].parameters():
            param.requires_grad = True
        
        # Freeze BERT weights
        for param in self.bert.parameters():
            param.requires_grad = False
            
        # Unfreeze the last few layers of BERT
        for param in self.bert.encoder.layer[-2:].parameters():
            param.requires_grad = True
        
        # Dimensionality of ViT and BERT embeddings
        vit_hidden_size = self.vit.config.hidden_size
        bert_hidden_size = self.bert.config.hidden_size
        
        # Enhanced classification head
        self.classifier = nn.Sequential(
            nn.Linear(vit_hidden_size + bert_hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, num_classes)
        )
        
    def forward(self, input_ids, attention_mask, pixel_values):
        # Process image with ViT
        vit_outputs = self.vit(pixel_values=pixel_values)
        vit_embeddings = vit_outputs.last_hidden_state[:, 0, :]  # CLS token
        
        # Process text with BERT
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_embeddings = bert_outputs.last_hidden_state[:, 0, :]  # CLS token
        
        # Concatenate image and text features
        combined_embeddings = torch.cat((vit_embeddings, bert_embeddings), dim=1)
        
        # Classification
        logits = self.classifier(combined_embeddings)
        
        return logits

# Training function
def train_model(model, train_loader, valid_loader, criterion, optimizer, num_epochs=10):
    train_losses = []
    valid_losses = []
    train_accuracies = []
    valid_accuracies = []
    
    best_valid_loss = float('inf')
    
    for epoch in range(num_epochs):
        # Training
        model.train()
        running_loss = 0.0
        correct_train = 0
        total_train = 0
        
        for batch_idx, batch in enumerate(train_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            pixel_values = batch['pixel_values'].to(device)
            labels = batch['labels'].to(device)
            
            optimizer.zero_grad()
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
            loss = criterion(outputs, labels)
            
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
            
            _, predicted = torch.max(outputs.data, 1)
            total_train += labels.size(0)
            correct_train += (predicted == labels).sum().item()
            
            # Print batch progress
            if (batch_idx + 1) % 10 == 0:
                print(f'Epoch {epoch+1}, Batch {batch_idx+1}/{len(train_loader)}, Loss: {loss.item():.4f}')
        
        train_loss = running_loss / len(train_loader)
        train_accuracy = correct_train / total_train
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)
        
        # Validation
        model.eval()
        running_valid_loss = 0.0
        correct_valid = 0
        total_valid = 0
        
        with torch.no_grad():
            for batch in valid_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                pixel_values = batch['pixel_values'].to(device)
                labels = batch['labels'].to(device)
                
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
                loss = criterion(outputs, labels)
                
                running_valid_loss += loss.item()
                
                _, predicted = torch.max(outputs.data, 1)
                total_valid += labels.size(0)
                correct_valid += (predicted == labels).sum().item()
        
        valid_loss = running_valid_loss / len(valid_loader)
        valid_accuracy = correct_valid / total_valid
        valid_losses.append(valid_loss)
        valid_accuracies.append(valid_accuracy)
        
        print(f'Epoch {epoch+1}/{num_epochs}, '
              f'Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f}, '
              f'Valid Loss: {valid_loss:.4f}, Valid Acc: {valid_accuracy:.4f}')
        
        # Save best model
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), 'best_multimodal_model.pth')
            print(f'Saved model with validation loss: {valid_loss:.4f}')
    
    return train_losses, valid_losses, train_accuracies, valid_accuracies

# Function for evaluation and metrics
def evaluate_model(model, test_loader):
    model.eval()
    y_true = []
    y_pred = []
    
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            pixel_values = batch['pixel_values'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
            _, predicted = torch.max(outputs.data, 1)
            
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())
    
    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    conf_matrix = confusion_matrix(y_true, y_pred)
    class_report = classification_report(y_true, y_pred, target_names=['Non-Harmful', 'Harmful'])
    
    print(f'Test Accuracy: {accuracy:.4f}')
    print('\nConfusion Matrix:')
    print(conf_matrix)
    print('\nClassification Report:')
    print(class_report)
    
    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', 
                xticklabels=['Non-Harmful', 'Harmful'], 
                yticklabels=['Non-Harmful', 'Harmful'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.savefig('confusion_matrix.png')
    plt.close()
    
    return accuracy, conf_matrix, class_report

# Function to plot training curves
def plot_training_curves(train_losses, valid_losses, train_accuracies, valid_accuracies):
    # Plot loss
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(train_losses, label='Training Loss')
    plt.plot(valid_losses, label='Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Training vs Validation Loss')
    plt.legend()
    
    # Plot accuracy
    plt.subplot(1, 2, 2)
    plt.plot(train_accuracies, label='Training Accuracy')
    plt.plot(valid_accuracies, label='Validation Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.title('Training vs Validation Accuracy')
    plt.legend()
    
    plt.tight_layout()
    plt.savefig('training_curves.png')
    plt.close()

# Function to inspect dataset
def inspect_dataset(df, image_dir):
    """
    Inspects the dataset and the image directory to understand the structure.
    """
    print("\n=== Dataset Inspection ===")
    print(f"Dataset columns: {df.columns.tolist()}")
    print(f"Number of rows: {len(df)}")
    print(f"Label distribution: {df['label'].value_counts().to_dict()}")
    
    # Check if image directory exists
    if not os.path.exists(image_dir):
        print(f"WARNING: Image directory does not exist: {image_dir}")
        return
    
    # Count image files
    image_files = [f for f in os.listdir(image_dir) if os.path.isfile(os.path.join(image_dir, f)) and 
                   f.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp'))]
    
    print(f"Number of image files in directory: {len(image_files)}")
    if len(image_files) > 0:
        print(f"Sample image filenames: {image_files[:5]}")
    
    # Check name column format
    print("\nSample names from dataset:")
    print(df['name'].head(5).tolist())
    
    # Check if any dataset names exactly match image filenames
    matching_names = [name for name in df['name'].unique() if name in image_files]
    print(f"Number of exact matches between 'name' column and image filenames: {len(matching_names)}")
    
    # Check if IDs are in the name column
    if 'id' in df.columns or 'ids' in df.columns:
        id_col = 'ids' if 'ids' in df.columns else 'id'
        print(f"\nSample values from '{id_col}' column:")
        print(df[id_col].head(5).tolist())
        
        # Check if IDs are in image filenames
        sample_ids = df[id_col].astype(str).head(5).tolist()
        for id_val in sample_ids:
            matches = [f for f in image_files if id_val in f]
            if matches:
                print(f"Found matches for ID {id_val}: {matches[:2]}")
                
    return image_files

# Main execution
def main():
    # Initialize feature extractor and tokenizer
    feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
    tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
    
    # Inspect dataset and image directory
    image_files = inspect_dataset(df, image_dir)
    
    # Create mapping between dataset names and image filenames
    image_mapping = create_image_mapping(df, image_dir)
    
    # Split dataset
    train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df['label'])
    valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['label'])
    
    print(f"Train set: {len(train_df)}, Validation set: {len(valid_df)}, Test set: {len(test_df)}")
    
    # Create datasets with image mapping
    train_dataset = HarmfulContentDataset(train_df, image_dir, feature_extractor, tokenizer, image_mapping)
    valid_dataset = HarmfulContentDataset(valid_df, image_dir, feature_extractor, tokenizer, image_mapping)
    test_dataset = HarmfulContentDataset(test_df, image_dir, feature_extractor, tokenizer, image_mapping)
    
    # Create dataloaders with appropriate batch size based on available GPU memory
    batch_size = 8  # Reduced batch size to handle larger model
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, num_workers=2)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=2)
    
    # Initialize model
    model = MultimodalClassifier(num_classes=2).to(device)
    
    # Loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5)
    
    # Learning rate scheduler
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)
    
    # Train model
    print("Starting training...")
    train_losses, valid_losses, train_accuracies, valid_accuracies = train_model(
        model, train_loader, valid_loader, criterion, optimizer, num_epochs=10
    )
    
    # Plot training curves
    plot_training_curves(train_losses, valid_losses, train_accuracies, valid_accuracies)
    
    # Load best model for evaluation
    model.load_state_dict(torch.load('best_multimodal_model.pth'))
    
    # Evaluate model
    print("\nEvaluating model on test set...")
    accuracy, conf_matrix, class_report = evaluate_model(model, test_loader)
    
    # Save final model with metadata
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'accuracy': accuracy,
        'conf_matrix': conf_matrix,
        'class_report': class_report,
    }, 'multimodal_harmful_content_classifier.pth')
    
    print("Model training and evaluation complete!")

if __name__ == "__main__":
    main()

Using device: cuda:0
Dataset loaded with shape: (25600, 11)





=== Dataset Inspection ===
Dataset columns: ['ids', 'name', 'text', 'label', 'id', 'gender', 'age', 'age_bucket', 'dominant_emotion', 'dominant_race', 'translated_text']
Number of rows: 25600
Label distribution: {1: 17388, 0: 8212}
Number of image files in directory: 25716
Sample image filenames: ['eng476.png', 'meme_184.png', 'tangaila (166).jpg', 'Image- (178).jpg', 'Image- (2026).jpg']

Sample names from dataset:
['tangaila (1).jpg', 'tangaila (2).jpg', 'tangaila (3).jpg', 'tangaila (4).jpg', 'tangaila (5).jpg']
Number of exact matches between 'name' column and image filenames: 18670

Sample values from 'ids' column:
[1, 2, 3, 4, 5]
Found matches for ID 1: ['meme_184.png', 'tangaila (166).jpg']
Found matches for ID 2: ['Image- (2026).jpg', 'image_ (2682).jpg']
Found matches for ID 3: ['Bangla Thug Life (374).jpg', '37825.png']
Found matches for ID 4: ['eng476.png', 'meme_184.png']
Found matches for ID 5: ['52691.png', '37825.png']
Found 25716 images in directory
Sample image filena

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 1, Batch 10/2240, Loss: 0.6757
Epoch 1, Batch 20/2240, Loss: 0.5741
Epoch 1, Batch 30/2240, Loss: 0.6345
Epoch 1, Batch 40/2240, Loss: 0.5359
Epoch 1, Batch 50/2240, Loss: 0.5310
Epoch 1, Batch 60/2240, Loss: 0.6471
Epoch 1, Batch 70/2240, Loss: 0.7478
Epoch 1, Batch 80/2240, Loss: 0.4174
Epoch 1, Batch 90/2240, Loss: 0.4602
Epoch 1, Batch 100/2240, Loss: 0.5193
Epoch 1, Batch 110/2240, Loss: 0.5933
Epoch 1, Batch 120/2240, Loss: 0.5653
Epoch 1, Batch 130/2240, Loss: 0.3392
Epoch 1, Batch 140/2240, Loss: 0.6926
Epoch 1, Batch 150/2240, Loss: 0.7081
Epoch 1, Batch 160/2240, Loss: 0.3805
Epoch 1, Batch 170/2240, Loss: 0.5094
Epoch 1, Batch 180/2240, Loss: 0.4571
Epoch 1, Batch 190/2240, Loss: 0.5129
Epoch 1, Batch 200/2240, Loss: 0.6089
Epoch 1, Batch 210/2240, Loss: 0.5802
Epoch 1, Batch 220/2240, Loss: 0.6602
Epoch 1, Batch 230/2240, Loss: 0.5168
Epoch 1, Batch 240/2240, Loss: 0.6039
Epoch 1, Batch 250/2240, Loss: 0.4914
Epoch 1, Batch 260/2240, Loss: 0.4597
Epoch 1, Batch 270/22



Epoch 1, Batch 940/2240, Loss: 0.3253
Epoch 1, Batch 950/2240, Loss: 0.3089
Epoch 1, Batch 960/2240, Loss: 0.5901
Epoch 1, Batch 970/2240, Loss: 0.6687
Epoch 1, Batch 980/2240, Loss: 0.7747
Epoch 1, Batch 990/2240, Loss: 0.4309
Epoch 1, Batch 1000/2240, Loss: 0.4488
Epoch 1, Batch 1010/2240, Loss: 0.6312
Epoch 1, Batch 1020/2240, Loss: 0.4252
Epoch 1, Batch 1030/2240, Loss: 0.4047
Epoch 1, Batch 1040/2240, Loss: 0.6677
Epoch 1, Batch 1050/2240, Loss: 0.3503
Epoch 1, Batch 1060/2240, Loss: 0.3032
Epoch 1, Batch 1070/2240, Loss: 0.5312
Epoch 1, Batch 1080/2240, Loss: 0.3762
Epoch 1, Batch 1090/2240, Loss: 0.4282
Epoch 1, Batch 1100/2240, Loss: 0.4268
Epoch 1, Batch 1110/2240, Loss: 0.3803
Epoch 1, Batch 1120/2240, Loss: 0.5010
Epoch 1, Batch 1130/2240, Loss: 1.0056
Epoch 1, Batch 1140/2240, Loss: 0.3192
Epoch 1, Batch 1150/2240, Loss: 0.8067
Epoch 1, Batch 1160/2240, Loss: 0.3430
Epoch 1, Batch 1170/2240, Loss: 0.3663
Epoch 1, Batch 1180/2240, Loss: 0.5317
Epoch 1, Batch 1190/2240, Loss:



Epoch 1, Batch 2160/2240, Loss: 0.4712
Epoch 1, Batch 2170/2240, Loss: 0.5092
Epoch 1, Batch 2180/2240, Loss: 0.1724
Epoch 1, Batch 2190/2240, Loss: 0.7835
Epoch 1, Batch 2200/2240, Loss: 0.4754
Epoch 1, Batch 2210/2240, Loss: 0.3262
Epoch 1, Batch 2220/2240, Loss: 0.3588
Epoch 1, Batch 2230/2240, Loss: 0.7274
Epoch 1, Batch 2240/2240, Loss: 0.3962


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 1/10, Train Loss: 0.4610, Train Acc: 0.7584, Valid Loss: 0.4097, Valid Acc: 0.7969
Saved model with validation loss: 0.4097


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 2, Batch 10/2240, Loss: 0.3980
Epoch 2, Batch 20/2240, Loss: 0.5079
Epoch 2, Batch 30/2240, Loss: 0.4378
Epoch 2, Batch 40/2240, Loss: 0.3560
Epoch 2, Batch 50/2240, Loss: 0.2851
Epoch 2, Batch 60/2240, Loss: 0.4006
Epoch 2, Batch 70/2240, Loss: 0.3237
Epoch 2, Batch 80/2240, Loss: 0.1248
Epoch 2, Batch 90/2240, Loss: 0.1026
Epoch 2, Batch 100/2240, Loss: 0.2092
Epoch 2, Batch 110/2240, Loss: 0.2588
Epoch 2, Batch 120/2240, Loss: 0.6170
Epoch 2, Batch 130/2240, Loss: 0.1668
Epoch 2, Batch 140/2240, Loss: 0.1932
Epoch 2, Batch 150/2240, Loss: 0.2652
Epoch 2, Batch 160/2240, Loss: 0.6042
Epoch 2, Batch 170/2240, Loss: 0.3773
Epoch 2, Batch 180/2240, Loss: 0.3564
Epoch 2, Batch 190/2240, Loss: 0.5586
Epoch 2, Batch 200/2240, Loss: 0.2675
Epoch 2, Batch 210/2240, Loss: 0.2829
Epoch 2, Batch 220/2240, Loss: 0.4062
Epoch 2, Batch 230/2240, Loss: 0.2422
Epoch 2, Batch 240/2240, Loss: 0.3784
Epoch 2, Batch 250/2240, Loss: 0.4484
Epoch 2, Batch 260/2240, Loss: 0.1809
Epoch 2, Batch 270/22



Epoch 2, Batch 1040/2240, Loss: 0.4165
Epoch 2, Batch 1050/2240, Loss: 0.2411
Epoch 2, Batch 1060/2240, Loss: 0.2515
Epoch 2, Batch 1070/2240, Loss: 0.3710
Epoch 2, Batch 1080/2240, Loss: 0.0758
Epoch 2, Batch 1090/2240, Loss: 0.6703
Epoch 2, Batch 1100/2240, Loss: 0.8295
Epoch 2, Batch 1110/2240, Loss: 0.1020
Epoch 2, Batch 1120/2240, Loss: 0.3714
Epoch 2, Batch 1130/2240, Loss: 0.3089
Epoch 2, Batch 1140/2240, Loss: 0.4208
Epoch 2, Batch 1150/2240, Loss: 0.4137
Epoch 2, Batch 1160/2240, Loss: 0.1873
Epoch 2, Batch 1170/2240, Loss: 0.0870
Epoch 2, Batch 1180/2240, Loss: 0.7496
Epoch 2, Batch 1190/2240, Loss: 0.4533
Epoch 2, Batch 1200/2240, Loss: 0.2540
Epoch 2, Batch 1210/2240, Loss: 0.3407
Epoch 2, Batch 1220/2240, Loss: 0.4232
Epoch 2, Batch 1230/2240, Loss: 0.2343
Epoch 2, Batch 1240/2240, Loss: 0.1623
Epoch 2, Batch 1250/2240, Loss: 0.6485
Epoch 2, Batch 1260/2240, Loss: 0.3532
Epoch 2, Batch 1270/2240, Loss: 0.4192
Epoch 2, Batch 1280/2240, Loss: 0.4605
Epoch 2, Batch 1290/2240,



Epoch 2, Batch 1990/2240, Loss: 0.3985
Epoch 2, Batch 2000/2240, Loss: 0.4704
Epoch 2, Batch 2010/2240, Loss: 0.4063
Epoch 2, Batch 2020/2240, Loss: 0.1730
Epoch 2, Batch 2030/2240, Loss: 0.2775
Epoch 2, Batch 2040/2240, Loss: 0.0733
Epoch 2, Batch 2050/2240, Loss: 0.2578
Epoch 2, Batch 2060/2240, Loss: 0.1455
Epoch 2, Batch 2070/2240, Loss: 0.4953
Epoch 2, Batch 2080/2240, Loss: 0.5461
Epoch 2, Batch 2090/2240, Loss: 0.2637
Epoch 2, Batch 2100/2240, Loss: 0.2598
Epoch 2, Batch 2110/2240, Loss: 0.4031
Epoch 2, Batch 2120/2240, Loss: 0.4043
Epoch 2, Batch 2130/2240, Loss: 0.0198
Epoch 2, Batch 2140/2240, Loss: 0.2489
Epoch 2, Batch 2150/2240, Loss: 0.0840
Epoch 2, Batch 2160/2240, Loss: 0.3625
Epoch 2, Batch 2170/2240, Loss: 0.2889
Epoch 2, Batch 2180/2240, Loss: 0.3972
Epoch 2, Batch 2190/2240, Loss: 0.2921
Epoch 2, Batch 2200/2240, Loss: 0.4073
Epoch 2, Batch 2210/2240, Loss: 0.2883
Epoch 2, Batch 2220/2240, Loss: 0.4712
Epoch 2, Batch 2230/2240, Loss: 0.1632
Epoch 2, Batch 2240/2240,

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 2/10, Train Loss: 0.3857, Train Acc: 0.8141, Valid Loss: 0.4126, Valid Acc: 0.8042


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 3, Batch 10/2240, Loss: 0.2204
Epoch 3, Batch 20/2240, Loss: 0.1694
Epoch 3, Batch 30/2240, Loss: 0.3819
Epoch 3, Batch 40/2240, Loss: 0.4826
Epoch 3, Batch 50/2240, Loss: 0.2031
Epoch 3, Batch 60/2240, Loss: 0.5790
Epoch 3, Batch 70/2240, Loss: 0.1353
Epoch 3, Batch 80/2240, Loss: 0.2745
Epoch 3, Batch 90/2240, Loss: 0.4042
Epoch 3, Batch 100/2240, Loss: 0.2451
Epoch 3, Batch 110/2240, Loss: 0.5527
Epoch 3, Batch 120/2240, Loss: 0.5649
Epoch 3, Batch 130/2240, Loss: 0.5798
Epoch 3, Batch 140/2240, Loss: 0.3348
Epoch 3, Batch 150/2240, Loss: 0.4051
Epoch 3, Batch 160/2240, Loss: 0.3781
Epoch 3, Batch 170/2240, Loss: 0.5915
Epoch 3, Batch 180/2240, Loss: 0.2566
Epoch 3, Batch 190/2240, Loss: 0.2187
Epoch 3, Batch 200/2240, Loss: 0.2900
Epoch 3, Batch 210/2240, Loss: 0.4754
Epoch 3, Batch 220/2240, Loss: 0.2149
Epoch 3, Batch 230/2240, Loss: 0.2012
Epoch 3, Batch 240/2240, Loss: 0.5474
Epoch 3, Batch 250/2240, Loss: 0.3327
Epoch 3, Batch 260/2240, Loss: 0.3914
Epoch 3, Batch 270/22



Epoch 3, Batch 1310/2240, Loss: 0.4825
Epoch 3, Batch 1320/2240, Loss: 0.1855
Epoch 3, Batch 1330/2240, Loss: 0.4026
Epoch 3, Batch 1340/2240, Loss: 0.1569
Epoch 3, Batch 1350/2240, Loss: 0.6221
Epoch 3, Batch 1360/2240, Loss: 0.0910
Epoch 3, Batch 1370/2240, Loss: 0.1890
Epoch 3, Batch 1380/2240, Loss: 0.2166
Epoch 3, Batch 1390/2240, Loss: 0.4681
Epoch 3, Batch 1400/2240, Loss: 0.7048
Epoch 3, Batch 1410/2240, Loss: 0.1573
Epoch 3, Batch 1420/2240, Loss: 0.3288
Epoch 3, Batch 1430/2240, Loss: 0.2936
Epoch 3, Batch 1440/2240, Loss: 0.7950
Epoch 3, Batch 1450/2240, Loss: 0.3450
Epoch 3, Batch 1460/2240, Loss: 0.4962
Epoch 3, Batch 1470/2240, Loss: 0.1857
Epoch 3, Batch 1480/2240, Loss: 0.6146




Epoch 3, Batch 1490/2240, Loss: 0.3278
Epoch 3, Batch 1500/2240, Loss: 0.1301
Epoch 3, Batch 1510/2240, Loss: 0.1668
Epoch 3, Batch 1520/2240, Loss: 0.2295
Epoch 3, Batch 1530/2240, Loss: 0.2108
Epoch 3, Batch 1540/2240, Loss: 0.5655
Epoch 3, Batch 1550/2240, Loss: 0.1845
Epoch 3, Batch 1560/2240, Loss: 0.3634
Epoch 3, Batch 1570/2240, Loss: 0.0767
Epoch 3, Batch 1580/2240, Loss: 0.3707
Epoch 3, Batch 1590/2240, Loss: 0.2155
Epoch 3, Batch 1600/2240, Loss: 0.2571
Epoch 3, Batch 1610/2240, Loss: 0.2083
Epoch 3, Batch 1620/2240, Loss: 0.7472
Epoch 3, Batch 1630/2240, Loss: 0.3211
Epoch 3, Batch 1640/2240, Loss: 0.2727
Epoch 3, Batch 1650/2240, Loss: 0.2387
Epoch 3, Batch 1660/2240, Loss: 0.2172
Epoch 3, Batch 1670/2240, Loss: 0.2899
Epoch 3, Batch 1680/2240, Loss: 0.2167
Epoch 3, Batch 1690/2240, Loss: 0.3408
Epoch 3, Batch 1700/2240, Loss: 0.3131
Epoch 3, Batch 1710/2240, Loss: 0.2044
Epoch 3, Batch 1720/2240, Loss: 0.3474
Epoch 3, Batch 1730/2240, Loss: 0.4523
Epoch 3, Batch 1740/2240,

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 3/10, Train Loss: 0.3301, Train Acc: 0.8458, Valid Loss: 0.3997, Valid Acc: 0.8102
Saved model with validation loss: 0.3997


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 4, Batch 10/2240, Loss: 0.1260
Epoch 4, Batch 20/2240, Loss: 0.1555
Epoch 4, Batch 30/2240, Loss: 0.1302
Epoch 4, Batch 40/2240, Loss: 0.0749
Epoch 4, Batch 50/2240, Loss: 0.1068
Epoch 4, Batch 60/2240, Loss: 0.2029
Epoch 4, Batch 70/2240, Loss: 0.5744
Epoch 4, Batch 80/2240, Loss: 0.2515
Epoch 4, Batch 90/2240, Loss: 0.5695
Epoch 4, Batch 100/2240, Loss: 0.2385
Epoch 4, Batch 110/2240, Loss: 0.0345
Epoch 4, Batch 120/2240, Loss: 0.1532
Epoch 4, Batch 130/2240, Loss: 0.1842
Epoch 4, Batch 140/2240, Loss: 0.3055
Epoch 4, Batch 150/2240, Loss: 0.3773
Epoch 4, Batch 160/2240, Loss: 0.1084
Epoch 4, Batch 170/2240, Loss: 0.2448
Epoch 4, Batch 180/2240, Loss: 0.1776
Epoch 4, Batch 190/2240, Loss: 0.2084
Epoch 4, Batch 200/2240, Loss: 0.3336
Epoch 4, Batch 210/2240, Loss: 0.3606
Epoch 4, Batch 220/2240, Loss: 0.3105
Epoch 4, Batch 230/2240, Loss: 0.0968
Epoch 4, Batch 240/2240, Loss: 0.1640
Epoch 4, Batch 250/2240, Loss: 0.0692
Epoch 4, Batch 260/2240, Loss: 0.4698
Epoch 4, Batch 270/22



Epoch 4, Batch 950/2240, Loss: 0.2484
Epoch 4, Batch 960/2240, Loss: 0.1275
Epoch 4, Batch 970/2240, Loss: 0.1681




Epoch 4, Batch 980/2240, Loss: 0.1096
Epoch 4, Batch 990/2240, Loss: 0.5306
Epoch 4, Batch 1000/2240, Loss: 0.2748
Epoch 4, Batch 1010/2240, Loss: 0.0955
Epoch 4, Batch 1020/2240, Loss: 0.2490
Epoch 4, Batch 1030/2240, Loss: 0.1764
Epoch 4, Batch 1040/2240, Loss: 0.9082
Epoch 4, Batch 1050/2240, Loss: 0.3553
Epoch 4, Batch 1060/2240, Loss: 0.4079
Epoch 4, Batch 1070/2240, Loss: 0.0590
Epoch 4, Batch 1080/2240, Loss: 0.1621
Epoch 4, Batch 1090/2240, Loss: 0.0554
Epoch 4, Batch 1100/2240, Loss: 0.1891
Epoch 4, Batch 1110/2240, Loss: 0.3588
Epoch 4, Batch 1120/2240, Loss: 0.3173
Epoch 4, Batch 1130/2240, Loss: 0.4483
Epoch 4, Batch 1140/2240, Loss: 0.1100
Epoch 4, Batch 1150/2240, Loss: 0.5388
Epoch 4, Batch 1160/2240, Loss: 0.0197
Epoch 4, Batch 1170/2240, Loss: 0.2633
Epoch 4, Batch 1180/2240, Loss: 0.0570
Epoch 4, Batch 1190/2240, Loss: 0.3175
Epoch 4, Batch 1200/2240, Loss: 0.1701
Epoch 4, Batch 1210/2240, Loss: 0.0752
Epoch 4, Batch 1220/2240, Loss: 0.3800
Epoch 4, Batch 1230/2240, L

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 4/10, Train Loss: 0.2534, Train Acc: 0.8876, Valid Loss: 0.4368, Valid Acc: 0.8049


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 5, Batch 10/2240, Loss: 0.1487
Epoch 5, Batch 20/2240, Loss: 0.0905
Epoch 5, Batch 30/2240, Loss: 0.0607
Epoch 5, Batch 40/2240, Loss: 0.1146
Epoch 5, Batch 50/2240, Loss: 0.0632
Epoch 5, Batch 60/2240, Loss: 0.0353
Epoch 5, Batch 70/2240, Loss: 0.0603
Epoch 5, Batch 80/2240, Loss: 0.0445
Epoch 5, Batch 90/2240, Loss: 0.0948
Epoch 5, Batch 100/2240, Loss: 0.2211
Epoch 5, Batch 110/2240, Loss: 0.1079
Epoch 5, Batch 120/2240, Loss: 0.0552
Epoch 5, Batch 130/2240, Loss: 0.1098
Epoch 5, Batch 140/2240, Loss: 0.0665
Epoch 5, Batch 150/2240, Loss: 0.1835
Epoch 5, Batch 160/2240, Loss: 0.1063
Epoch 5, Batch 170/2240, Loss: 0.0673
Epoch 5, Batch 180/2240, Loss: 0.2492
Epoch 5, Batch 190/2240, Loss: 0.1161
Epoch 5, Batch 200/2240, Loss: 0.1069
Epoch 5, Batch 210/2240, Loss: 0.2844
Epoch 5, Batch 220/2240, Loss: 0.1939
Epoch 5, Batch 230/2240, Loss: 0.0820
Epoch 5, Batch 240/2240, Loss: 0.0748
Epoch 5, Batch 250/2240, Loss: 0.1096
Epoch 5, Batch 260/2240, Loss: 0.5847
Epoch 5, Batch 270/22



Epoch 5, Batch 850/2240, Loss: 0.2565
Epoch 5, Batch 860/2240, Loss: 0.2470
Epoch 5, Batch 870/2240, Loss: 0.2692
Epoch 5, Batch 880/2240, Loss: 1.0967
Epoch 5, Batch 890/2240, Loss: 0.1085
Epoch 5, Batch 900/2240, Loss: 0.2103
Epoch 5, Batch 910/2240, Loss: 0.3245
Epoch 5, Batch 920/2240, Loss: 0.1876
Epoch 5, Batch 930/2240, Loss: 0.1292
Epoch 5, Batch 940/2240, Loss: 0.0019
Epoch 5, Batch 950/2240, Loss: 0.2493
Epoch 5, Batch 960/2240, Loss: 0.1449
Epoch 5, Batch 970/2240, Loss: 0.2734
Epoch 5, Batch 980/2240, Loss: 0.0516
Epoch 5, Batch 990/2240, Loss: 0.2169
Epoch 5, Batch 1000/2240, Loss: 0.0373
Epoch 5, Batch 1010/2240, Loss: 0.1416
Epoch 5, Batch 1020/2240, Loss: 0.1994
Epoch 5, Batch 1030/2240, Loss: 0.1919
Epoch 5, Batch 1040/2240, Loss: 0.2883
Epoch 5, Batch 1050/2240, Loss: 0.0650
Epoch 5, Batch 1060/2240, Loss: 0.0159
Epoch 5, Batch 1070/2240, Loss: 0.0426
Epoch 5, Batch 1080/2240, Loss: 0.1338
Epoch 5, Batch 1090/2240, Loss: 0.4278
Epoch 5, Batch 1100/2240, Loss: 0.4631
E



Epoch 5, Batch 1130/2240, Loss: 0.0593
Epoch 5, Batch 1140/2240, Loss: 0.0191
Epoch 5, Batch 1150/2240, Loss: 0.0889
Epoch 5, Batch 1160/2240, Loss: 0.4541
Epoch 5, Batch 1170/2240, Loss: 0.0218
Epoch 5, Batch 1180/2240, Loss: 0.0888
Epoch 5, Batch 1190/2240, Loss: 0.1142
Epoch 5, Batch 1200/2240, Loss: 0.0388
Epoch 5, Batch 1210/2240, Loss: 0.0856
Epoch 5, Batch 1220/2240, Loss: 0.0719
Epoch 5, Batch 1230/2240, Loss: 0.2120
Epoch 5, Batch 1240/2240, Loss: 0.4149
Epoch 5, Batch 1250/2240, Loss: 0.0593
Epoch 5, Batch 1260/2240, Loss: 0.2407
Epoch 5, Batch 1270/2240, Loss: 0.1444
Epoch 5, Batch 1280/2240, Loss: 0.0233
Epoch 5, Batch 1290/2240, Loss: 0.4111
Epoch 5, Batch 1300/2240, Loss: 0.1369
Epoch 5, Batch 1310/2240, Loss: 0.2816
Epoch 5, Batch 1320/2240, Loss: 0.2820
Epoch 5, Batch 1330/2240, Loss: 0.3685
Epoch 5, Batch 1340/2240, Loss: 0.0426
Epoch 5, Batch 1350/2240, Loss: 0.3189
Epoch 5, Batch 1360/2240, Loss: 0.2388
Epoch 5, Batch 1370/2240, Loss: 0.1570
Epoch 5, Batch 1380/2240,

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 5/10, Train Loss: 0.1632, Train Acc: 0.9353, Valid Loss: 0.5999, Valid Acc: 0.8065


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 6, Batch 10/2240, Loss: 0.0092
Epoch 6, Batch 20/2240, Loss: 0.1116
Epoch 6, Batch 30/2240, Loss: 0.1706
Epoch 6, Batch 40/2240, Loss: 0.0274
Epoch 6, Batch 50/2240, Loss: 0.0020
Epoch 6, Batch 60/2240, Loss: 0.0128
Epoch 6, Batch 70/2240, Loss: 0.0113
Epoch 6, Batch 80/2240, Loss: 0.4712
Epoch 6, Batch 90/2240, Loss: 0.0048
Epoch 6, Batch 100/2240, Loss: 0.0368
Epoch 6, Batch 110/2240, Loss: 0.0952
Epoch 6, Batch 120/2240, Loss: 0.0397
Epoch 6, Batch 130/2240, Loss: 0.0311
Epoch 6, Batch 140/2240, Loss: 0.0104
Epoch 6, Batch 150/2240, Loss: 0.0470
Epoch 6, Batch 160/2240, Loss: 0.1784
Epoch 6, Batch 170/2240, Loss: 0.7954
Epoch 6, Batch 180/2240, Loss: 0.0323
Epoch 6, Batch 190/2240, Loss: 0.0356
Epoch 6, Batch 200/2240, Loss: 0.0729
Epoch 6, Batch 210/2240, Loss: 0.0360
Epoch 6, Batch 220/2240, Loss: 0.0470
Epoch 6, Batch 230/2240, Loss: 0.0325
Epoch 6, Batch 240/2240, Loss: 0.1311
Epoch 6, Batch 250/2240, Loss: 0.0431
Epoch 6, Batch 260/2240, Loss: 0.1451
Epoch 6, Batch 270/22



Epoch 6, Batch 330/2240, Loss: 0.2068
Epoch 6, Batch 340/2240, Loss: 0.1209
Epoch 6, Batch 350/2240, Loss: 0.3345
Epoch 6, Batch 360/2240, Loss: 0.1960
Epoch 6, Batch 370/2240, Loss: 0.0453
Epoch 6, Batch 380/2240, Loss: 0.0865
Epoch 6, Batch 390/2240, Loss: 0.0624
Epoch 6, Batch 400/2240, Loss: 0.0641
Epoch 6, Batch 410/2240, Loss: 0.0344
Epoch 6, Batch 420/2240, Loss: 0.0675
Epoch 6, Batch 430/2240, Loss: 0.3616
Epoch 6, Batch 440/2240, Loss: 0.2081
Epoch 6, Batch 450/2240, Loss: 0.0111
Epoch 6, Batch 460/2240, Loss: 0.1190
Epoch 6, Batch 470/2240, Loss: 0.0115
Epoch 6, Batch 480/2240, Loss: 0.1257
Epoch 6, Batch 490/2240, Loss: 0.0371
Epoch 6, Batch 500/2240, Loss: 0.1091
Epoch 6, Batch 510/2240, Loss: 0.0111
Epoch 6, Batch 520/2240, Loss: 0.0501
Epoch 6, Batch 530/2240, Loss: 0.0035
Epoch 6, Batch 540/2240, Loss: 0.0426
Epoch 6, Batch 550/2240, Loss: 0.5322
Epoch 6, Batch 560/2240, Loss: 0.0268
Epoch 6, Batch 570/2240, Loss: 0.0448
Epoch 6, Batch 580/2240, Loss: 0.0066
Epoch 6, Bat



Epoch 6, Batch 2060/2240, Loss: 0.3062
Epoch 6, Batch 2070/2240, Loss: 0.2353
Epoch 6, Batch 2080/2240, Loss: 0.0354
Epoch 6, Batch 2090/2240, Loss: 0.0450
Epoch 6, Batch 2100/2240, Loss: 0.1702
Epoch 6, Batch 2110/2240, Loss: 0.0031
Epoch 6, Batch 2120/2240, Loss: 0.0080
Epoch 6, Batch 2130/2240, Loss: 0.0279
Epoch 6, Batch 2140/2240, Loss: 0.0241
Epoch 6, Batch 2150/2240, Loss: 0.0372
Epoch 6, Batch 2160/2240, Loss: 0.0704
Epoch 6, Batch 2170/2240, Loss: 0.2473
Epoch 6, Batch 2180/2240, Loss: 0.1398
Epoch 6, Batch 2190/2240, Loss: 0.4925
Epoch 6, Batch 2200/2240, Loss: 0.0874
Epoch 6, Batch 2210/2240, Loss: 0.0309
Epoch 6, Batch 2220/2240, Loss: 0.1788
Epoch 6, Batch 2230/2240, Loss: 0.0952
Epoch 6, Batch 2240/2240, Loss: 0.0492


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 6/10, Train Loss: 0.1047, Train Acc: 0.9611, Valid Loss: 0.6935, Valid Acc: 0.8021


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 7, Batch 10/2240, Loss: 0.0343
Epoch 7, Batch 20/2240, Loss: 0.0272
Epoch 7, Batch 30/2240, Loss: 0.0256
Epoch 7, Batch 40/2240, Loss: 0.0451
Epoch 7, Batch 50/2240, Loss: 0.4740
Epoch 7, Batch 60/2240, Loss: 0.0749
Epoch 7, Batch 70/2240, Loss: 0.0072
Epoch 7, Batch 80/2240, Loss: 0.0353
Epoch 7, Batch 90/2240, Loss: 0.0978
Epoch 7, Batch 100/2240, Loss: 0.0186
Epoch 7, Batch 110/2240, Loss: 0.0234
Epoch 7, Batch 120/2240, Loss: 0.0126
Epoch 7, Batch 130/2240, Loss: 0.0133
Epoch 7, Batch 140/2240, Loss: 0.0053
Epoch 7, Batch 150/2240, Loss: 0.0441
Epoch 7, Batch 160/2240, Loss: 0.4069
Epoch 7, Batch 170/2240, Loss: 0.0245
Epoch 7, Batch 180/2240, Loss: 0.0168
Epoch 7, Batch 190/2240, Loss: 0.0460
Epoch 7, Batch 200/2240, Loss: 0.0063
Epoch 7, Batch 210/2240, Loss: 0.0261
Epoch 7, Batch 220/2240, Loss: 0.0210
Epoch 7, Batch 230/2240, Loss: 0.0215
Epoch 7, Batch 240/2240, Loss: 0.0116
Epoch 7, Batch 250/2240, Loss: 0.0185
Epoch 7, Batch 260/2240, Loss: 0.0220
Epoch 7, Batch 270/22



Epoch 7, Batch 1260/2240, Loss: 0.0351
Epoch 7, Batch 1270/2240, Loss: 0.0452
Epoch 7, Batch 1280/2240, Loss: 0.0156
Epoch 7, Batch 1290/2240, Loss: 0.0329
Epoch 7, Batch 1300/2240, Loss: 0.0185
Epoch 7, Batch 1310/2240, Loss: 0.0159
Epoch 7, Batch 1320/2240, Loss: 0.0081
Epoch 7, Batch 1330/2240, Loss: 0.0251
Epoch 7, Batch 1340/2240, Loss: 0.0586
Epoch 7, Batch 1350/2240, Loss: 0.3678
Epoch 7, Batch 1360/2240, Loss: 0.0523
Epoch 7, Batch 1370/2240, Loss: 0.0369
Epoch 7, Batch 1380/2240, Loss: 0.1494
Epoch 7, Batch 1390/2240, Loss: 0.1039
Epoch 7, Batch 1400/2240, Loss: 0.1032




Epoch 7, Batch 1410/2240, Loss: 0.0931
Epoch 7, Batch 1420/2240, Loss: 0.0075
Epoch 7, Batch 1430/2240, Loss: 0.0154
Epoch 7, Batch 1440/2240, Loss: 0.4171
Epoch 7, Batch 1450/2240, Loss: 0.0358
Epoch 7, Batch 1460/2240, Loss: 0.2594
Epoch 7, Batch 1470/2240, Loss: 0.0247
Epoch 7, Batch 1480/2240, Loss: 0.0641
Epoch 7, Batch 1490/2240, Loss: 0.0166
Epoch 7, Batch 1500/2240, Loss: 0.0031
Epoch 7, Batch 1510/2240, Loss: 0.4376
Epoch 7, Batch 1520/2240, Loss: 0.0543
Epoch 7, Batch 1530/2240, Loss: 0.0290
Epoch 7, Batch 1540/2240, Loss: 0.0300
Epoch 7, Batch 1550/2240, Loss: 0.0289
Epoch 7, Batch 1560/2240, Loss: 0.0422
Epoch 7, Batch 1570/2240, Loss: 0.2055
Epoch 7, Batch 1580/2240, Loss: 0.0138
Epoch 7, Batch 1590/2240, Loss: 0.1567
Epoch 7, Batch 1600/2240, Loss: 0.0393
Epoch 7, Batch 1610/2240, Loss: 0.0436
Epoch 7, Batch 1620/2240, Loss: 0.0063
Epoch 7, Batch 1630/2240, Loss: 0.0507
Epoch 7, Batch 1640/2240, Loss: 0.0156
Epoch 7, Batch 1650/2240, Loss: 0.1724
Epoch 7, Batch 1660/2240,

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 7/10, Train Loss: 0.0693, Train Acc: 0.9761, Valid Loss: 0.8241, Valid Acc: 0.7904


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 8, Batch 10/2240, Loss: 0.8730
Epoch 8, Batch 20/2240, Loss: 0.0576
Epoch 8, Batch 30/2240, Loss: 0.0249
Epoch 8, Batch 40/2240, Loss: 0.0188
Epoch 8, Batch 50/2240, Loss: 0.0036
Epoch 8, Batch 60/2240, Loss: 0.0023
Epoch 8, Batch 70/2240, Loss: 0.0226
Epoch 8, Batch 80/2240, Loss: 0.0273
Epoch 8, Batch 90/2240, Loss: 0.0830
Epoch 8, Batch 100/2240, Loss: 0.0152
Epoch 8, Batch 110/2240, Loss: 0.0149
Epoch 8, Batch 120/2240, Loss: 0.3243
Epoch 8, Batch 130/2240, Loss: 0.0219
Epoch 8, Batch 140/2240, Loss: 0.0309
Epoch 8, Batch 150/2240, Loss: 0.0023
Epoch 8, Batch 160/2240, Loss: 0.2131
Epoch 8, Batch 170/2240, Loss: 0.0251
Epoch 8, Batch 180/2240, Loss: 0.0643
Epoch 8, Batch 190/2240, Loss: 0.0029
Epoch 8, Batch 200/2240, Loss: 0.0020
Epoch 8, Batch 210/2240, Loss: 0.0160
Epoch 8, Batch 220/2240, Loss: 0.0055
Epoch 8, Batch 230/2240, Loss: 0.0026
Epoch 8, Batch 240/2240, Loss: 0.0512
Epoch 8, Batch 250/2240, Loss: 0.2625
Epoch 8, Batch 260/2240, Loss: 0.0305
Epoch 8, Batch 270/22



Epoch 8, Batch 800/2240, Loss: 0.0032
Epoch 8, Batch 810/2240, Loss: 0.0005
Epoch 8, Batch 820/2240, Loss: 0.0653
Epoch 8, Batch 830/2240, Loss: 0.0083
Epoch 8, Batch 840/2240, Loss: 0.0173
Epoch 8, Batch 850/2240, Loss: 0.1988
Epoch 8, Batch 860/2240, Loss: 0.0293
Epoch 8, Batch 870/2240, Loss: 0.0475
Epoch 8, Batch 880/2240, Loss: 0.0019
Epoch 8, Batch 890/2240, Loss: 0.0066
Epoch 8, Batch 900/2240, Loss: 0.0037
Epoch 8, Batch 910/2240, Loss: 0.0296
Epoch 8, Batch 920/2240, Loss: 0.0025
Epoch 8, Batch 930/2240, Loss: 0.1727
Epoch 8, Batch 940/2240, Loss: 0.0148
Epoch 8, Batch 950/2240, Loss: 0.4436
Epoch 8, Batch 960/2240, Loss: 0.5001
Epoch 8, Batch 970/2240, Loss: 0.0318
Epoch 8, Batch 980/2240, Loss: 0.0210
Epoch 8, Batch 990/2240, Loss: 0.0056
Epoch 8, Batch 1020/2240, Loss: 0.0023
Epoch 8, Batch 1030/2240, Loss: 0.2467
Epoch 8, Batch 1040/2240, Loss: 0.0007
Epoch 8, Batch 1050/2240, Loss: 0.0157
Epoch 8, Batch 1060/2240, Loss: 0.0009
Epoch 8, Batch 1070/2240, Loss: 0.0258
Epoch 



Epoch 8, Batch 1750/2240, Loss: 0.6541
Epoch 8, Batch 1760/2240, Loss: 0.0246
Epoch 8, Batch 1770/2240, Loss: 0.0011
Epoch 8, Batch 1780/2240, Loss: 0.3261
Epoch 8, Batch 1790/2240, Loss: 0.0021
Epoch 8, Batch 1800/2240, Loss: 0.0852
Epoch 8, Batch 1810/2240, Loss: 0.0003
Epoch 8, Batch 1820/2240, Loss: 0.0689
Epoch 8, Batch 1830/2240, Loss: 0.1873
Epoch 8, Batch 1840/2240, Loss: 0.1440
Epoch 8, Batch 1850/2240, Loss: 0.0044
Epoch 8, Batch 1860/2240, Loss: 0.0017
Epoch 8, Batch 1870/2240, Loss: 0.0166
Epoch 8, Batch 1880/2240, Loss: 0.6302
Epoch 8, Batch 1890/2240, Loss: 0.0007
Epoch 8, Batch 1900/2240, Loss: 0.0137
Epoch 8, Batch 1910/2240, Loss: 0.0116
Epoch 8, Batch 1920/2240, Loss: 0.1893
Epoch 8, Batch 1930/2240, Loss: 0.0045
Epoch 8, Batch 1940/2240, Loss: 0.3176
Epoch 8, Batch 1950/2240, Loss: 0.0300
Epoch 8, Batch 1960/2240, Loss: 0.0424
Epoch 8, Batch 1970/2240, Loss: 0.0197
Epoch 8, Batch 1980/2240, Loss: 0.0015
Epoch 8, Batch 1990/2240, Loss: 0.0030
Epoch 8, Batch 2000/2240,

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 8/10, Train Loss: 0.0525, Train Acc: 0.9821, Valid Loss: 0.9034, Valid Acc: 0.7935


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 9, Batch 10/2240, Loss: 0.0538
Epoch 9, Batch 20/2240, Loss: 0.0004
Epoch 9, Batch 30/2240, Loss: 0.3922
Epoch 9, Batch 40/2240, Loss: 0.0083
Epoch 9, Batch 50/2240, Loss: 0.2319
Epoch 9, Batch 60/2240, Loss: 0.0656
Epoch 9, Batch 70/2240, Loss: 0.0193
Epoch 9, Batch 80/2240, Loss: 0.0223
Epoch 9, Batch 90/2240, Loss: 0.0390
Epoch 9, Batch 100/2240, Loss: 0.0163
Epoch 9, Batch 110/2240, Loss: 0.0006
Epoch 9, Batch 120/2240, Loss: 0.0062
Epoch 9, Batch 130/2240, Loss: 0.0550
Epoch 9, Batch 140/2240, Loss: 0.0006
Epoch 9, Batch 150/2240, Loss: 0.1829
Epoch 9, Batch 160/2240, Loss: 0.0014
Epoch 9, Batch 170/2240, Loss: 0.1671
Epoch 9, Batch 180/2240, Loss: 0.0039
Epoch 9, Batch 190/2240, Loss: 0.0017
Epoch 9, Batch 200/2240, Loss: 0.0004
Epoch 9, Batch 210/2240, Loss: 0.1727
Epoch 9, Batch 220/2240, Loss: 0.0023
Epoch 9, Batch 230/2240, Loss: 0.0002
Epoch 9, Batch 240/2240, Loss: 0.0017
Epoch 9, Batch 250/2240, Loss: 0.0117
Epoch 9, Batch 260/2240, Loss: 0.0549
Epoch 9, Batch 270/22



Epoch 9, Batch 280/2240, Loss: 0.1837
Epoch 9, Batch 290/2240, Loss: 0.0933
Epoch 9, Batch 300/2240, Loss: 0.0024
Epoch 9, Batch 310/2240, Loss: 0.0001
Epoch 9, Batch 320/2240, Loss: 0.0080
Epoch 9, Batch 330/2240, Loss: 0.1723
Epoch 9, Batch 340/2240, Loss: 0.0125
Epoch 9, Batch 350/2240, Loss: 0.0011
Epoch 9, Batch 360/2240, Loss: 0.0005
Epoch 9, Batch 370/2240, Loss: 0.0015
Epoch 9, Batch 380/2240, Loss: 0.0102
Epoch 9, Batch 390/2240, Loss: 0.0050
Epoch 9, Batch 400/2240, Loss: 0.0196
Epoch 9, Batch 410/2240, Loss: 0.0502
Epoch 9, Batch 420/2240, Loss: 0.0173
Epoch 9, Batch 430/2240, Loss: 0.0358
Epoch 9, Batch 440/2240, Loss: 0.0005
Epoch 9, Batch 450/2240, Loss: 0.0094
Epoch 9, Batch 460/2240, Loss: 0.0011
Epoch 9, Batch 470/2240, Loss: 0.0098
Epoch 9, Batch 480/2240, Loss: 0.0050
Epoch 9, Batch 490/2240, Loss: 0.0133
Epoch 9, Batch 500/2240, Loss: 0.0025
Epoch 9, Batch 510/2240, Loss: 0.0065
Epoch 9, Batch 520/2240, Loss: 0.0066
Epoch 9, Batch 530/2240, Loss: 0.0247
Epoch 9, Bat



Epoch 9, Batch 1330/2240, Loss: 0.0022
Epoch 9, Batch 1340/2240, Loss: 0.0072
Epoch 9, Batch 1350/2240, Loss: 0.1178
Epoch 9, Batch 1360/2240, Loss: 0.0011
Epoch 9, Batch 1370/2240, Loss: 0.1464
Epoch 9, Batch 1380/2240, Loss: 0.0008
Epoch 9, Batch 1390/2240, Loss: 0.1452
Epoch 9, Batch 1400/2240, Loss: 0.0145
Epoch 9, Batch 1410/2240, Loss: 0.0994
Epoch 9, Batch 1420/2240, Loss: 0.0072
Epoch 9, Batch 1430/2240, Loss: 0.0066
Epoch 9, Batch 1440/2240, Loss: 0.2380
Epoch 9, Batch 1450/2240, Loss: 0.0297
Epoch 9, Batch 1460/2240, Loss: 0.0023
Epoch 9, Batch 1470/2240, Loss: 0.0121
Epoch 9, Batch 1480/2240, Loss: 0.0520
Epoch 9, Batch 1490/2240, Loss: 0.0028
Epoch 9, Batch 1500/2240, Loss: 0.0062
Epoch 9, Batch 1510/2240, Loss: 0.0009
Epoch 9, Batch 1520/2240, Loss: 0.0530
Epoch 9, Batch 1530/2240, Loss: 0.0048
Epoch 9, Batch 1540/2240, Loss: 0.0064
Epoch 9, Batch 1550/2240, Loss: 0.0044
Epoch 9, Batch 1560/2240, Loss: 0.0329
Epoch 9, Batch 1570/2240, Loss: 0.0360
Epoch 9, Batch 1580/2240,

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 9/10, Train Loss: 0.0446, Train Acc: 0.9850, Valid Loss: 0.8906, Valid Acc: 0.7932


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 10, Batch 10/2240, Loss: 0.0059
Epoch 10, Batch 20/2240, Loss: 0.0017
Epoch 10, Batch 30/2240, Loss: 0.0027
Epoch 10, Batch 40/2240, Loss: 0.0105
Epoch 10, Batch 50/2240, Loss: 0.0034
Epoch 10, Batch 60/2240, Loss: 0.0005
Epoch 10, Batch 70/2240, Loss: 0.0004
Epoch 10, Batch 80/2240, Loss: 0.0031
Epoch 10, Batch 90/2240, Loss: 0.0133
Epoch 10, Batch 100/2240, Loss: 0.0018
Epoch 10, Batch 110/2240, Loss: 0.0070




Epoch 10, Batch 120/2240, Loss: 0.0005
Epoch 10, Batch 130/2240, Loss: 0.0690
Epoch 10, Batch 140/2240, Loss: 0.0060
Epoch 10, Batch 150/2240, Loss: 0.0017
Epoch 10, Batch 160/2240, Loss: 0.0036
Epoch 10, Batch 170/2240, Loss: 0.0448
Epoch 10, Batch 180/2240, Loss: 0.0153
Epoch 10, Batch 190/2240, Loss: 0.0214
Epoch 10, Batch 200/2240, Loss: 0.0064
Epoch 10, Batch 210/2240, Loss: 0.0232
Epoch 10, Batch 220/2240, Loss: 0.0088
Epoch 10, Batch 230/2240, Loss: 0.0015
Epoch 10, Batch 240/2240, Loss: 0.0088
Epoch 10, Batch 250/2240, Loss: 0.0067
Epoch 10, Batch 260/2240, Loss: 0.0081
Epoch 10, Batch 270/2240, Loss: 0.0018
Epoch 10, Batch 280/2240, Loss: 0.2103
Epoch 10, Batch 290/2240, Loss: 0.0057
Epoch 10, Batch 300/2240, Loss: 0.0141
Epoch 10, Batch 310/2240, Loss: 0.0021
Epoch 10, Batch 320/2240, Loss: 0.0687
Epoch 10, Batch 330/2240, Loss: 0.0009
Epoch 10, Batch 340/2240, Loss: 0.0067
Epoch 10, Batch 350/2240, Loss: 0.0018
Epoch 10, Batch 360/2240, Loss: 0.0134
Epoch 10, Batch 370/2240,



Epoch 10, Batch 1760/2240, Loss: 0.0035
Epoch 10, Batch 1770/2240, Loss: 0.0003
Epoch 10, Batch 1780/2240, Loss: 0.0021
Epoch 10, Batch 1790/2240, Loss: 0.0137
Epoch 10, Batch 1800/2240, Loss: 0.0004
Epoch 10, Batch 1810/2240, Loss: 0.0014
Epoch 10, Batch 1820/2240, Loss: 0.0298
Epoch 10, Batch 1830/2240, Loss: 0.0019
Epoch 10, Batch 1840/2240, Loss: 0.0014
Epoch 10, Batch 1850/2240, Loss: 0.0263
Epoch 10, Batch 1860/2240, Loss: 0.0023
Epoch 10, Batch 1870/2240, Loss: 0.0028
Epoch 10, Batch 1880/2240, Loss: 0.0073
Epoch 10, Batch 1890/2240, Loss: 0.4313
Epoch 10, Batch 1900/2240, Loss: 0.0180
Epoch 10, Batch 1910/2240, Loss: 0.0132
Epoch 10, Batch 1920/2240, Loss: 0.0033
Epoch 10, Batch 1930/2240, Loss: 0.0046
Epoch 10, Batch 1940/2240, Loss: 0.0001
Epoch 10, Batch 1950/2240, Loss: 0.0002
Epoch 10, Batch 1960/2240, Loss: 0.0000
Epoch 10, Batch 1970/2240, Loss: 0.0240
Epoch 10, Batch 1980/2240, Loss: 0.1665
Epoch 10, Batch 1990/2240, Loss: 0.0038
Epoch 10, Batch 2000/2240, Loss: 0.0429


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 10/10, Train Loss: 0.0319, Train Acc: 0.9890, Valid Loss: 1.0329, Valid Acc: 0.7802


  model.load_state_dict(torch.load('best_multimodal_model.pth'))



Evaluating model on test set...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Test Accuracy: 0.8120

Confusion Matrix:
[[ 874  358]
 [ 364 2244]]

Classification Report:
              precision    recall  f1-score   support

 Non-Harmful       0.71      0.71      0.71      1232
     Harmful       0.86      0.86      0.86      2608

    accuracy                           0.81      3840
   macro avg       0.78      0.78      0.78      3840
weighted avg       0.81      0.81      0.81      3840

Model training and evaluation complete!
