The first part would be load the MIDI file and convert it to sequence that coule be directly used in the model. Here we only trained with only 12 piano notes (Beethoven's piano solo) for shorter training process

In [61]:
import os
from mido import MidiFile
import csv

def midi_to_note_sequence_to_file(filepath, output_csv):
    with open(output_csv, 'w', newline='') as csvfile:
        fieldnames = ['piece_id','event_type', 'control', 'value', 'note', 'velocity', 'time_since_last_event', 'channel']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        
        writer.writeheader()
        piece_id = 0

        for filename in os.listdir(filepath):
            if filename.endswith('.mid') or filename.endswith('.midi'):
                try:
                    mid = MidiFile(os.path.join(filepath, filename))  
                    for track in mid.tracks:
                        time_since_last_event = 0
                        for msg in track:
                            time_since_last_event += msg.time
                            if not msg.is_meta:
                                if msg.type in ['note_on', 'note_off']:
                                    event_type = 'note_start' if msg.type == 'note_on' and msg.velocity > 0 else 'note_stop'
                                    writer.writerow({
                                        'piece_id': piece_id,
                                        'event_type': event_type,
                                        'control': 0,
                                        'value': 0,
                                        'note': msg.note,
                                        'velocity': msg.velocity,
                                        'time_since_last_event': time_since_last_event,
                                        'channel': msg.channel
                                    })
                                elif msg.type == 'control_change':
                                    writer.writerow({
                                        'piece_id': piece_id,
                                        'event_type': 'control_change',
                                        'control': msg.control,
                                        'value': msg.value,
                                        'note': 0,
                                        'velocity': 0,
                                        'time_since_last_event': time_since_last_event,
                                        'channel': msg.channel
                                    })
                                time_since_last_event = 0
                    piece_id += 1
                except OSError as e:
                    print(f"Error processing {filename}: {e}")

path = 'D:\\Projects\\Resources\\midis_v1.2\\Beethoven'
output_csv = 'D:\\Projects\\Resources\\midis_v1.2\\Beethoven\\Beethoven_note_sequence.csv'
midi_to_note_sequence_to_file(path, output_csv)


Define the main training and evaluating below, as we can see from the result, the accuracy stuck at around 0.64

Input Layer (Piano note sequence)
     |
     v
Embedding Layer (Converts input tokens to dense vectors)
     |
     v
Positional Encoding Layer (Adds positional information to embeddings)
     |
     v
Transformer Encoder Layer 1 (Self-attention mechanism, layer normalization, residual connection, and feed-forward network)
     |
     v
Transformer Encoder Layer 2 (Self-attention mechanism, layer normalization, residual connection, and feed-forward network)
     |
     v
Transformer Encoder Layer N (Self-attention mechanism, layer normalization, residual connection, and feed-forward network)
     |
     v
Flatten Layer (Converts sequence output to 1D)
     |
     v
Fully Connected Layer 1 (Learns complex patterns)
     |
     v
ReLU (Applies non-linearity)
     |
     v
Dropout Layer (Prevents overfitting)
     |
     v
Fully Connected Layer 2 (Learns more complex patterns)
     |
     v
ReLU (Applies non-linearity)
     |
     v
Output Layer (Softmax) (Produces probability distribution over classes)


In [63]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader, random_split
import torch.optim as optim

class PositionalEncoding(nn.Module):
    def __init__(self, model_dim, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, model_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, model_dim, 2).float() * (-np.log(10000.0) / model_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return x

class CustomTransformerEncoderLayer(nn.Module):
    def __init__(self, model_dim, num_heads, ff_dim, dropout=0.1, activation='relu'):
        super(CustomTransformerEncoderLayer, self).__init__()
        self.self_attn = nn.MultiheadAttention(model_dim, num_heads, dropout=dropout, batch_first=True)
        self.linear1 = nn.Linear(model_dim, ff_dim)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(ff_dim, model_dim)
        
        # Choose activation function
        if activation == 'relu':
            self.activation = nn.ReLU()
        elif activation == 'gelu':
            self.activation = nn.GELU()
        elif activation == 'softmax':
            self.activation = nn.Softmax(dim=-1)
        else:
            raise ValueError(f"Unsupported activation function: {activation}")
        
        self.norm1 = nn.LayerNorm(model_dim)
        self.norm2 = nn.LayerNorm(model_dim)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, src):
        src2 = self.self_attn(src, src, src)[0]
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src

class CustomTransformerModel(nn.Module):
    def __init__(self, input_dim, model_dim, num_heads, num_layers, ff_dim, n_classes, dropout=0.1, activation='relu'):
        super(CustomTransformerModel, self).__init__()
        self.model_dim = model_dim
        self.input_projection = nn.Linear(input_dim, model_dim)  
        self.pos_encoder = PositionalEncoding(model_dim)
        
        # Create custom transformer encoder layers
        self.encoder_layers = nn.ModuleList([
            CustomTransformerEncoderLayer(model_dim, num_heads, ff_dim, dropout, activation)
            for _ in range(num_layers)
        ])
        
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=model_dim * sequence_length, out_features=n_classes)
        )
        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        for layer in self.classifier:
            if isinstance(layer, nn.Linear):
                layer.bias.data.zero_()
                layer.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        src = self.input_projection(src) * np.sqrt(self.model_dim)  # Project input and scale
        src = self.pos_encoder(src)
        for layer in self.encoder_layers:
            src = layer(src)
        output = self.classifier(src)
        return output

class PianoDataset(Dataset):
    def __init__(self, csv_file, sequence_length=100):
        self.df = pd.read_csv(csv_file)
        self.df = pd.concat([self.df.drop('event_type', axis=1), pd.get_dummies(self.df['event_type'], prefix='event_type')], axis=1)
        self.df = self.df.astype(np.float32)  
        self.sequence_length = sequence_length
        self.data = self.generate_sequences()

    def generate_sequences(self):
        input_sequences = []
        target_events = []
        for i in range(len(self.df) - self.sequence_length):
            full_sequence = self.df.iloc[i:i+self.sequence_length+1]
            input_sequence = full_sequence.iloc[:-1]
            target_event = full_sequence.iloc[-1]
            input_sequences.append(input_sequence.values)
            target_events.append(target_event.values.argmax())  # Use class index as target
        return list(zip(input_sequences, target_events))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sequence, target = self.data[idx]
        sequence_tensor = torch.tensor(sequence, dtype=torch.float32)
        target_tensor = torch.tensor(target, dtype=torch.long)
        return sequence_tensor, target_tensor

def train_valid_test_split(dataset, train_ratio=0.7, valid_ratio=0.15, test_ratio=0.15):
    assert train_ratio + valid_ratio + test_ratio == 1, "Ratios must sum to 1"
    
    train_size = int(train_ratio * len(dataset))
    valid_size = int(valid_ratio * len(dataset))
    test_size = len(dataset) - train_size - valid_size
    
    train_dataset, valid_dataset, test_dataset = random_split(dataset, [train_size, valid_size, test_size])
    return train_dataset, valid_dataset, test_dataset

# Define model parameters
input_dim = 10  
model_dim = 128  
num_heads = 4  
num_layers = 8  
ff_dim = 512  
n_classes = 8  
sequence_length = 100

# Set device and initialize model and optimizer
device = torch.device("cuda")
activation_function = 'gelu'
model = CustomTransformerModel(input_dim, model_dim, num_heads, num_layers, ff_dim, n_classes, activation=activation_function).to(device)

# Initialize weights
def initialize_weights(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)

model.apply(initialize_weights)

# Set optimizer with a lower learning rate
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Define loss function
criterion = nn.CrossEntropyLoss()

# Create dataset and dataloaders
output_csv = 'D:\\Projects\\Resources\\midis_v1.2\\Beethoven\\Beethoven_note_sequence.csv'
full_dataset = PianoDataset(output_csv, sequence_length)

# Split the dataset
train_dataset, valid_dataset, test_dataset = train_valid_test_split(full_dataset)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=512, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Training function with gradient clipping and early stopping
def train_model(model, train_loader, valid_loader, optimizer, num_epochs=100, patience=5):
    model.train()
    best_accuracy = 0.0
    patience_counter = 0

    for epoch in range(num_epochs):
        running_loss = 0.0
        correct_predictions = 0
        total_predictions = 0
        print(f'Starting epoch {epoch+1}/{num_epochs}')
        
        for batch_idx, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            
            # Ensure outputs and targets are correctly shaped
            if outputs.shape[0] != targets.shape[0]:
                print(f"Shape mismatch: outputs shape {outputs.shape}, targets shape {targets.shape}")
                continue
            
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)  
            
            # Calculate accuracy
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == targets).sum().item()
            total_predictions += targets.size(0)

        epoch_loss = running_loss / len(train_loader.dataset)
        epoch_accuracy = correct_predictions / total_predictions
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')
        
        # Validate the model
        val_loss, val_accuracy = evaluate_model(model, valid_loader)
        
        # Early stopping
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            patience_counter = 0
            torch.save(model.state_dict(), 'best_model.pth')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f'Early stopping at epoch {epoch+1}')
                break

def evaluate_model(model, dataloader):
    model.eval()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            running_loss += loss.item() * inputs.size(0)  
            
            # Calculate accuracy
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == targets).sum().item()
            total_predictions += targets.size(0)

    total_loss = running_loss / len(dataloader.dataset)
    accuracy = correct_predictions / total_predictions
    print(f'Evaluation Loss: {total_loss:.4f}, Accuracy: {accuracy:.4f}')
    return total_loss, accuracy

# Train the model with high patience early stopping and save the best model
train_model(model, train_loader, valid_loader, optimizer, num_epochs=100, patience=15)

# Load the best model and evaluate on the test set
model.load_state_dict(torch.load('best_model.pth'))
evaluate_model(model, test_loader)


Starting epoch 1/100
Epoch 1/100, Loss: 1.1528, Accuracy: 0.5343
Evaluation Loss: 1.0568, Accuracy: 0.5845
Starting epoch 2/100
Epoch 2/100, Loss: 0.9910, Accuracy: 0.6004
Evaluation Loss: 0.9564, Accuracy: 0.6062
Starting epoch 3/100
Epoch 3/100, Loss: 0.9346, Accuracy: 0.6166
Evaluation Loss: 0.9250, Accuracy: 0.6175
Starting epoch 4/100
Epoch 4/100, Loss: 0.9124, Accuracy: 0.6240
Evaluation Loss: 0.9245, Accuracy: 0.6198
Starting epoch 5/100
Epoch 5/100, Loss: 0.8970, Accuracy: 0.6296
Evaluation Loss: 0.8977, Accuracy: 0.6306
Starting epoch 6/100
Epoch 6/100, Loss: 0.8863, Accuracy: 0.6337
Evaluation Loss: 0.8944, Accuracy: 0.6297
Starting epoch 7/100
Epoch 7/100, Loss: 0.8763, Accuracy: 0.6382
Evaluation Loss: 0.8845, Accuracy: 0.6371
Starting epoch 8/100
Epoch 8/100, Loss: 0.8669, Accuracy: 0.6423
Evaluation Loss: 0.8804, Accuracy: 0.6355
Starting epoch 9/100
Epoch 9/100, Loss: 0.8588, Accuracy: 0.6457
Evaluation Loss: 0.8798, Accuracy: 0.6355
Starting epoch 10/100
Epoch 10/100, L

(0.8742001934206507, 0.6438128755934792)

We can try to modify the model with some approaches in cluding data augmenting and normalization and pretraining. But even with tuning, the accuracy are stuck. I would presume that large sample and longer-training with more complex structure transformer can achieve higher accuracy, but for the limited computational power, we might need to integrate some other models.p

Difference: 
Layer normalization is applied before the residual connection.
Dropout rate is set to 0.2.
Activation function is gelu.

Input Layer (Piano note sequence)
     |
     v
Embedding Layer (Converts input tokens to dense vectors)
     |
     v
Positional Encoding Layer (Adds positional information to embeddings)
     |
     v
Transformer Encoder Layer 1 (Self-attention mechanism, layer normalization, residual connection, and feed-forward network)
     |
     v
Transformer Encoder Layer 2 (Self-attention mechanism, layer normalization, residual connection, and feed-forward network)
     |
     v
Transformer Encoder Layer N (Self-attention mechanism, layer normalization, residual connection, and feed-forward network)
     |
     v
Flatten Layer (Converts sequence output to 1D)
     |
     v
Fully Connected Layer 1 (Learns complex patterns)
     |
     v
ReLU (Applies non-linearity)
     |
     v
Dropout Layer (Prevents overfitting)
     |
     v
Fully Connected Layer 2 (Learns more complex patterns)
     |
     v
ReLU (Applies non-linearity)
     |
     v
Output Layer (Softmax) (Produces probability distribution over classes)


In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader, random_split
import torch.optim as optim
import random

class PositionalEncoding(nn.Module):
    def __init__(self, model_dim, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, model_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, model_dim, 2).float() * (-np.log(10000.0) / model_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return x

class CustomTransformerEncoderLayer(nn.Module):
    def __init__(self, model_dim, num_heads, ff_dim, dropout=0.1, activation='gelu'):
        super(CustomTransformerEncoderLayer, self).__init__()
        self.self_attn = nn.MultiheadAttention(model_dim, num_heads, dropout=dropout, batch_first=True)
        self.linear1 = nn.Linear(model_dim, ff_dim)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(ff_dim, model_dim)
        
        # Choose activation function
        if activation == 'relu':
            self.activation = nn.ReLU()
        elif activation == 'gelu':
            self.activation = nn.GELU()
        else:
            raise ValueError(f"Unsupported activation function: {activation}")
        
        self.norm1 = nn.LayerNorm(model_dim)
        self.norm2 = nn.LayerNorm(model_dim)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, src):
        # Pre-layer norm
        src2 = self.norm1(src)
        src2 = self.self_attn(src2, src2, src2)[0]
        src = src + self.dropout1(src2)
        
        # Pre-layer norm
        src2 = self.norm2(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src2))))
        src = src + self.dropout2(src2)
        return src

class CustomTransformerModel(nn.Module):
    def __init__(self, input_dim, model_dim, num_heads, num_layers, ff_dim, n_classes, sequence_length, dropout=0.1, activation='gelu'):
        super(CustomTransformerModel, self).__init__()
        self.model_dim = model_dim
        self.input_projection = nn.Linear(input_dim, model_dim)  # Project input to model dimension
        self.pos_encoder = PositionalEncoding(model_dim)
        
        # Create custom transformer encoder layers
        self.encoder_layers = nn.ModuleList([
            CustomTransformerEncoderLayer(model_dim, num_heads, ff_dim, dropout, activation)
            for _ in range(num_layers)
        ])
        
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=model_dim * sequence_length, out_features=n_classes)
        )
        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        for layer in self.classifier:
            if isinstance(layer, nn.Linear):
                layer.bias.data.zero_()
                layer.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        src = self.input_projection(src) * np.sqrt(self.model_dim)  # Project input and scale
        src = self.pos_encoder(src)
        for layer in self.encoder_layers:
            src = layer(src)
        output = self.classifier(src)
        return output

class PianoDataset(Dataset):
    def __init__(self, csv_file, sequence_length=100, augment=False):
        self.df = pd.read_csv(csv_file)
        self.df = pd.concat([self.df.drop('event_type', axis=1), pd.get_dummies(self.df['event_type'], prefix='event_type')], axis=1)
        self.df = self.df.astype(np.float32)  # Ensure all data is of type float32
        self.sequence_length = sequence_length
        self.augment = augment
        self.data = self.generate_sequences()

    def generate_sequences(self):
        input_sequences = []
        target_events = []
        for i in range(len(self.df) - self.sequence_length):
            full_sequence = self.df.iloc[i:i+self.sequence_length+1]
            input_sequence = full_sequence.iloc[:-1]
            target_event = full_sequence.iloc[-1]
            input_sequences.append(input_sequence.values)
            target_events.append(target_event.values.argmax())  # Use class index as target
        return list(zip(input_sequences, target_events))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sequence, target = self.data[idx]
        if self.augment:
            sequence = self.augment_sequence(sequence)
        sequence_tensor = torch.tensor(sequence, dtype=torch.float32)
        target_tensor = torch.tensor(target, dtype=torch.long)
        return sequence_tensor, target_tensor

    def augment_sequence(self, sequence):
        # Transposition
        if random.random() < 0.5:
            transpose_amount = random.randint(-5, 5)
            sequence[:, 0] += transpose_amount  
        
        # Time-stretching
        if random.random() < 0.5:
            stretch_factor = random.uniform(0.8, 1.2)
            sequence[:, 1] *= stretch_factor  
        
        # Adding Noise
        if random.random() < 0.5:
            noise = np.random.normal(0, 0.01, sequence.shape)
            sequence += noise
        
        return sequence

def train_valid_test_split(dataset, train_ratio=0.7, valid_ratio=0.15, test_ratio=0.15):
    assert train_ratio + valid_ratio + test_ratio == 1, "Ratios must sum to 1"
    
    train_size = int(train_ratio * len(dataset))
    valid_size = int(valid_ratio * len(dataset))
    test_size = len(dataset) - train_size - valid_size
    
    train_dataset, valid_dataset, test_dataset = random_split(dataset, [train_size, valid_size, test_size])
    return train_dataset, valid_dataset, test_dataset

# Define model parameters
input_dim = 10  
model_dim = 128  
num_heads = 8  
num_layers = 12  
ff_dim = 512  
n_classes = 8  
sequence_length = 100

# Set device and initialize model and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
activation_function = 'gelu'  # Change activation function here
model = CustomTransformerModel(input_dim, model_dim, num_heads, num_layers, ff_dim, n_classes, sequence_length, activation=activation_function).to(device)

# Initialize weights
def initialize_weights(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)

model.apply(initialize_weights)

# Set optimizer with a lower learning rate
optimizer = optim.AdamW(model.parameters(), lr=0.0001, weight_decay=1e-4)  # Reduced weight decay

# Define loss function
criterion = nn.CrossEntropyLoss()

# Create dataset and dataloaders
output_csv = 'D:\\Projects\\Resources\\midis_v1.2\\Beethoven\\Beethoven_note_sequence.csv'
full_dataset = PianoDataset(output_csv, sequence_length, augment=True)

# Split the dataset
train_dataset, valid_dataset, test_dataset = train_valid_test_split(full_dataset)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=512, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Training function with gradient clipping and early stopping
def train_model(model, train_loader, valid_loader, optimizer, num_epochs=100, patience=15):
    model.train()
    best_accuracy = 0.0
    patience_counter = 0

    for epoch in range(num_epochs):
        running_loss = 0.0
        correct_predictions = 0
        total_predictions = 0
        print(f'Starting epoch {epoch+1}/{num_epochs}')
        
        for batch_idx, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            
            # Ensure outputs and targets are correctly shaped
            if outputs.shape[0] != targets.shape[0]:
                print(f"Shape mismatch: outputs shape {outputs.shape}, targets shape {targets.shape}")
                continue
            
            loss = criterion(outputs, targets)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)  # Use inputs.size(0) for batch size
            
            # Calculate accuracy
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == targets).sum().item()
            total_predictions += targets.size(0)

        epoch_loss = running_loss / len(train_loader.dataset)
        epoch_accuracy = correct_predictions / total_predictions
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')
        
        # Validate the model
        val_loss, val_accuracy = evaluate_model(model, valid_loader)
        
        # Early stopping
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            patience_counter = 0
            torch.save(model.state_dict(), 'best_model.pth')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f'Early stopping at epoch {epoch+1}')
                break

def evaluate_model(model, dataloader):
    model.eval()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            running_loss += loss.item() * inputs.size(0)  
            
            # Calculate accuracy
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == targets).sum().item()
            total_predictions += targets.size(0)

    total_loss = running_loss / len(dataloader.dataset)
    accuracy = correct_predictions / total_predictions
    print(f'Evaluation Loss: {total_loss:.4f}, Accuracy: {accuracy:.4f}')
    return total_loss, accuracy

# Train the model with high patience early stopping and save the best model
train_model(model, train_loader, valid_loader, optimizer, num_epochs=100, patience=15)

# Load the best model and evaluate on the test set
model.load_state_dict(torch.load('best_model.pth'))
evaluate_model(model, test_loader)


Starting epoch 1/100
Epoch 1/100, Loss: 19.5316, Accuracy: 0.4236
Evaluation Loss: 13.8370, Accuracy: 0.5156
Starting epoch 2/100
Epoch 2/100, Loss: 10.7625, Accuracy: 0.4356
Evaluation Loss: 18.0568, Accuracy: 0.2750
Starting epoch 3/100
Epoch 3/100, Loss: 10.0768, Accuracy: 0.4419
Evaluation Loss: 10.0024, Accuracy: 0.3313
Starting epoch 4/100
Epoch 4/100, Loss: 8.8451, Accuracy: 0.4564
Evaluation Loss: 8.7254, Accuracy: 0.5268
Starting epoch 5/100
Epoch 5/100, Loss: 7.7756, Accuracy: 0.4656
Evaluation Loss: 7.4704, Accuracy: 0.4753
Starting epoch 6/100
Epoch 6/100, Loss: 6.8677, Accuracy: 0.4781
Evaluation Loss: 6.0451, Accuracy: 0.4756
Starting epoch 7/100
Epoch 7/100, Loss: 5.8033, Accuracy: 0.4880
Evaluation Loss: 5.5495, Accuracy: 0.4361
Starting epoch 8/100
Epoch 8/100, Loss: 4.6672, Accuracy: 0.4979
Evaluation Loss: 4.6682, Accuracy: 0.4864
Starting epoch 9/100
Epoch 9/100, Loss: 3.9401, Accuracy: 0.5066
Evaluation Loss: 3.8136, Accuracy: 0.5049
Starting epoch 10/100
Epoch 10/

(1.0177395746501232, 0.6411484445034695)

The result remains similar to the original model.