The hybrid model performs better than the stand-alone transformer model. Training started with 12 piano scripts and reached an accuracy of 0.63; after increasing the sample to 30 scripts, the accuracy rose to 0.67.

Input Layer:

Input dimension: 10 (number of features in the input)
Positional Encoding:

Model dimension: 64
Max length: 5000
Transformer Encoder Layer (6 layers):

Model dimension: 64
Number of heads: 4
Feedforward dimension: 256
Dropout: 0.1
Activation: ReLU
RNN Layer:

RNN type: LSTM
Hidden dimension: 128
Number of layers: 1
Dropout: 0.1
Classifier:

Flatten
Linear layer: 128 * sequence_length to 256
Activation: ReLU
Dropout: 0.1
Linear layer: 256 to n_classes (8)

In [10]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader, random_split
import torch.optim as optim
import random

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch-first input.

    Args:
        model_dim: Size of the last (feature) dimension of the inputs.
            Both even and odd sizes are supported.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, model_dim, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, model_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, model_dim, 2).float() * (-np.log(10000.0) / model_dim))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd model_dim there is one fewer cosine column than sine
        # columns, so trim the angles instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, :model_dim // 2])
        # Leading axis of size 1 lets the table broadcast over the batch.
        pe = pe.unsqueeze(0)
        # Registered as a buffer: it moves with the module and is stored in
        # the state dict, but it is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        ``x`` is indexed as (batch, position, model_dim); the number of
        positions must not exceed ``max_len``.
        """
        return x + self.pe[:, :x.size(1)]

class CustomTransformerEncoderLayer(nn.Module):
    """Encoder layer: self-attention and a feed-forward block, each followed
    by a residual connection and layer normalisation.

    Args:
        model_dim: Feature width of the inputs and outputs.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward block.
        dropout: Dropout probability used in attention, inside the
            feed-forward block and on both residual branches.
        activation: One of ``'relu'``, ``'gelu'`` or ``'softmax'``.

    Raises:
        ValueError: If ``activation`` is not one of the supported names.
    """

    def __init__(self, model_dim, num_heads, ff_dim, dropout=0.1, activation='relu'):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(model_dim, num_heads, dropout=dropout, batch_first=True)
        self.linear1 = nn.Linear(model_dim, ff_dim)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(ff_dim, model_dim)

        # Table of supported activations; only the selected one is built.
        supported = (
            ('relu', nn.ReLU),
            ('gelu', nn.GELU),
            ('softmax', lambda: nn.Softmax(dim=-1)),
        )
        for name, factory in supported:
            if activation == name:
                self.activation = factory()
                break
        else:
            raise ValueError(f"Unsupported activation function: {activation}")

        self.norm1 = nn.LayerNorm(model_dim)
        self.norm2 = nn.LayerNorm(model_dim)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, src):
        """Apply the layer to a batch-first tensor and return the same shape."""
        # Attention sub-layer: residual add, then normalise.
        attended, _ = self.self_attn(src, src, src)
        src = self.norm1(src + self.dropout1(attended))
        # Feed-forward sub-layer: residual add, then normalise.
        hidden = self.activation(self.linear1(src))
        projected = self.linear2(self.dropout(hidden))
        return self.norm2(src + self.dropout2(projected))

class TransformerRNNHybridModel(nn.Module):
    """Transformer encoder stack followed by an LSTM and a linear classifier.

    The input is projected to ``model_dim``, given positional encodings, passed
    through ``num_layers`` encoder layers and a single-layer LSTM. The LSTM
    outputs of all time steps are flattened and mapped to ``n_classes`` logits.

    Args:
        input_dim: Number of features per time step.
        model_dim: Width of the input projection and encoder layers.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of encoder layers.
        ff_dim: Hidden width of each encoder feed-forward block.
        rnn_hidden_dim: LSTM hidden size.
        n_classes: Number of output logits.
        dropout: Dropout probability used inside the encoder layers.
        activation: Activation name forwarded to the encoder layers.
        seq_len: Number of time steps per input. The flattening classifier is
            sized for exactly this length. When ``None`` (the default) the
            module-level ``sequence_length`` is used, which is what this
            class previously always did implicitly.
    """

    def __init__(self, input_dim, model_dim, num_heads, num_layers, ff_dim, rnn_hidden_dim, n_classes, dropout=0.1, activation='relu', seq_len=None):
        super(TransformerRNNHybridModel, self).__init__()
        if seq_len is None:
            # Backward-compatible fallback to the module-level setting; pass
            # seq_len explicitly to avoid depending on that global.
            seq_len = sequence_length
        self.seq_len = seq_len
        self.model_dim = model_dim
        self.input_projection = nn.Linear(input_dim, model_dim)
        self.pos_encoder = PositionalEncoding(model_dim)

        # Create custom transformer encoder layers
        self.encoder_layers = nn.ModuleList([
            CustomTransformerEncoderLayer(model_dim, num_heads, ff_dim, dropout, activation)
            for _ in range(num_layers)
        ])

        self.rnn = nn.LSTM(model_dim, rnn_hidden_dim, batch_first=True)
        self.classifier = nn.Sequential(
            nn.Flatten(),
            # Flatten yields rnn_hidden_dim values per time step.
            nn.Linear(rnn_hidden_dim * seq_len, n_classes)
        )
        self.init_weights()

    def init_weights(self):
        """Zero the biases and uniformly initialise the weights of the classifier."""
        initrange = 0.1
        for layer in self.classifier:
            if isinstance(layer, nn.Linear):
                layer.bias.data.zero_()
                layer.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        """Return class logits for a batch of sequences.

        ``src`` is batch-first with ``input_dim`` features per step; its
        sequence length must equal the ``seq_len`` the classifier was built for.
        """
        # Scale the projection by sqrt(model_dim) before adding position encodings.
        src = self.input_projection(src) * np.sqrt(self.model_dim)
        src = self.pos_encoder(src)
        for layer in self.encoder_layers:
            src = layer(src)
        rnn_output, _ = self.rnn(src)
        output = self.classifier(rnn_output)
        return output

class PianoDataset(Dataset):
    """Sliding-window next-event dataset built from a note-sequence CSV.

    The CSV must contain an ``event_type`` column, which is one-hot encoded
    and appended after the remaining columns. Each item is a window of
    ``sequence_length`` consecutive rows together with a class index derived
    from the row that follows the window.

    Args:
        csv_file: Path of the CSV file to read.
        sequence_length: Number of rows per input window.
        augment: When true, every returned window is randomly augmented.
            The flag belongs to the dataset object, so it also affects any
            subset views created from it.
    """

    def __init__(self, csv_file, sequence_length=100, augment=False):
        self.df = pd.read_csv(csv_file)
        self.df = pd.concat([self.df.drop('event_type', axis=1), pd.get_dummies(self.df['event_type'], prefix='event_type')], axis=1)
        self.df = self.df.astype(np.float32)
        self.sequence_length = sequence_length
        self.augment = augment
        self.data = self.generate_sequences()

    def generate_sequences(self):
        """Return a list of ``(window, target)`` pairs covering the whole table."""
        # Slice one NumPy array instead of calling DataFrame.iloc per window.
        # The windows are overlapping views, so they must never be modified.
        values = self.df.to_numpy()
        window = self.sequence_length
        # NOTE(review): the target is the argmax over ALL columns of the next
        # row, not only the one-hot event_type columns - confirm this is the
        # intended label.
        return [
            (values[start:start + window], values[start + window].argmax())
            for start in range(len(values) - window)
        ]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(float32 window tensor, int64 target tensor)`` for ``idx``."""
        sequence, target = self.data[idx]
        if self.augment:
            sequence = self.augment_sequence(sequence)
        sequence_tensor = torch.tensor(sequence, dtype=torch.float32)
        target_tensor = torch.tensor(target, dtype=torch.long)
        return sequence_tensor, target_tensor

    def augment_sequence(self, sequence):
        """Return a randomly augmented copy of ``sequence``.

        Each of the three augmentations is applied independently with
        probability 0.5. The input array is left untouched: the stored windows
        overlap and share memory, so modifying them in place would make the
        augmentation accumulate over epochs and leak into neighbouring samples.
        """
        sequence = sequence.copy()

        # Transposition
        # NOTE(review): presumably column 0 holds the pitch - confirm against the CSV.
        if random.random() < 0.5:
            transpose_amount = random.randint(-5, 5)
            sequence[:, 0] += transpose_amount

        # Time-stretching
        # NOTE(review): presumably column 1 holds a time value - confirm against the CSV.
        if random.random() < 0.5:
            stretch_factor = random.uniform(0.8, 1.2)
            sequence[:, 1] *= stretch_factor

        # Adding Noise (applied to every column, including the one-hot ones)
        if random.random() < 0.5:
            noise = np.random.normal(0, 0.01, sequence.shape)
            sequence += noise

        return sequence

def train_valid_test_split(dataset, train_ratio=0.7, valid_ratio=0.15, test_ratio=0.15):
    """Randomly split ``dataset`` into train, validation and test subsets.

    Args:
        dataset: Any object accepted by ``random_split`` (needs ``len``).
        train_ratio: Fraction of items for the training subset.
        valid_ratio: Fraction of items for the validation subset.
        test_ratio: Fraction of items for the test subset.

    Returns:
        A ``(train, valid, test)`` tuple of subsets. The train and validation
        sizes are rounded down; the test subset receives the remainder.

    Raises:
        AssertionError: If the three ratios do not sum to 1.
    """
    # Compare with a tolerance: in floating point 0.7 + 0.2 + 0.1 is
    # 0.9999999999999999, which an exact equality test would reject.
    assert abs(train_ratio + valid_ratio + test_ratio - 1) < 1e-9, "Ratios must sum to 1"

    total = len(dataset)
    train_size = int(train_ratio * total)
    valid_size = int(valid_ratio * total)
    test_size = total - train_size - valid_size

    train_dataset, valid_dataset, test_dataset = random_split(dataset, [train_size, valid_size, test_size])
    return train_dataset, valid_dataset, test_dataset

# Define model parameters (passed positionally to TransformerRNNHybridModel below)
input_dim = 10  # input width of the model's first linear projection
model_dim = 64  # width of the transformer encoder layers
num_heads = 4  # attention heads per encoder layer
num_layers = 4  # number of stacked encoder layers
ff_dim = 128  # hidden width of each encoder feed-forward block
rnn_hidden_dim = 32  # LSTM hidden size
n_classes = 8  # number of output logits
sequence_length = 100  # window length given to PianoDataset; the model's classifier is sized for it

# Set device and initialize the model (the optimizer is created further down)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
activation_function = 'relu'  # Change activation function here
model = TransformerRNNHybridModel(input_dim, model_dim, num_heads, num_layers, ff_dim, rnn_hidden_dim, n_classes, activation=activation_function).to(device)

# Initialize weights
def initialize_weights(m):
    """Xavier-initialise the weight of a linear layer and set its bias to 0.01.

    Intended for ``model.apply``; modules that are not ``nn.Linear`` are
    left unchanged.
    """
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        # A layer built with bias=False has no bias tensor to fill.
        if m.bias is not None:
            m.bias.data.fill_(0.01)

# Re-initialise every nn.Linear module in the model. This also overwrites the
# classifier values set by the model's own init_weights() in its constructor.
model.apply(initialize_weights)

# Adam optimizer with a learning rate of 1e-4
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Define loss function (shared as a global by train_model and evaluate_model)
criterion = nn.CrossEntropyLoss()

# Create dataset and dataloaders
# Machine-specific absolute Windows path; adjust it before running elsewhere.
output_csv = 'D:\\Projects\\Resources\\midis_v1.2\\Beethoven\\Beethoven_note_sequence.csv'
# NOTE: augmentation is switched on for the whole dataset, so the validation
# and test subsets created below return augmented samples as well.
full_dataset = PianoDataset(output_csv, sequence_length, augment=True)

# Split the dataset (default ratios: 70% train, 15% validation, 15% test)
train_dataset, valid_dataset, test_dataset = train_valid_test_split(full_dataset)

# Create DataLoaders; only the training loader shuffles
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=128, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Training function with early stopping (this version does not clip gradients)
def train_model(model, train_loader, valid_loader, optimizer, num_epochs=50, patience=5):
    """Train ``model`` and keep the checkpoint with the best validation accuracy.

    Uses the module-level ``device``, ``criterion`` and ``evaluate_model``.
    After each epoch the model is validated; whenever the validation accuracy
    improves, the weights are saved to ``best_model_hybrid.pth``.

    Args:
        model: Network to train (already on ``device``).
        train_loader: Loader yielding ``(inputs, targets)`` training batches.
        valid_loader: Loader passed to ``evaluate_model`` after each epoch.
        optimizer: Optimizer that updates ``model``'s parameters.
        num_epochs: Maximum number of epochs.
        patience: Stop after this many consecutive epochs without improvement.
    """
    model.train()
    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint, even if its validation accuracy is exactly 0.0.
    best_accuracy = float('-inf')
    patience_counter = 0

    for epoch in range(num_epochs):
        running_loss = 0.0
        correct_predictions = 0
        total_predictions = 0
        print(f'Starting epoch {epoch+1}/{num_epochs}')

        for batch_idx, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)

            # Ensure outputs and targets are correctly shaped
            if outputs.shape[0] != targets.shape[0]:
                print(f"Shape mismatch: outputs shape {outputs.shape}, targets shape {targets.shape}")
                continue

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            # Weight the batch mean by the batch size to get a per-sample average later.
            running_loss += loss.item() * inputs.size(0)

            # Calculate accuracy
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == targets).sum().item()
            total_predictions += targets.size(0)

        # max(..., 1) avoids a ZeroDivisionError when the loader is empty or
        # every batch was skipped above.
        epoch_loss = running_loss / max(len(train_loader.dataset), 1)
        epoch_accuracy = correct_predictions / max(total_predictions, 1)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')

        # Validate the model
        val_loss, val_accuracy = evaluate_model(model, valid_loader)

        # Early stopping
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            patience_counter = 0
            torch.save(model.state_dict(), 'best_model_hybrid.pth')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f'Early stopping at epoch {epoch+1}')
                break

def evaluate_model(model, dataloader):
    """Compute the average loss and accuracy of ``model`` over ``dataloader``.

    Uses the module-level ``device`` and ``criterion``. The model is put into
    evaluation mode for the duration of the call and then returned to whatever
    mode it was in before, so a model that was in training mode (as inside
    ``train_model``) is back in training mode afterwards.

    Returns:
        ``(average loss per sample, accuracy)``; both are 0.0 for an empty loader.
    """
    was_training = model.training
    model.eval()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    try:
        with torch.no_grad():
            for inputs, targets in dataloader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                running_loss += loss.item() * inputs.size(0)

                # Calculate accuracy
                _, predicted = torch.max(outputs, 1)
                correct_predictions += (predicted == targets).sum().item()
                total_predictions += targets.size(0)
    finally:
        # Restore the caller's mode even if evaluation fails part-way.
        model.train(was_training)

    # max(..., 1) avoids a ZeroDivisionError for an empty split.
    total_loss = running_loss / max(len(dataloader.dataset), 1)
    accuracy = correct_predictions / max(total_predictions, 1)
    print(f'Evaluation Loss: {total_loss:.4f}, Accuracy: {accuracy:.4f}')
    return total_loss, accuracy

# Train with early stopping (patience=5); train_model saves the best checkpoint
train_model(model, train_loader, valid_loader, optimizer, num_epochs=50, patience=5)

# Load the best model and evaluate on the test set
# (the returned (loss, accuracy) tuple is the cell's displayed result)
model.load_state_dict(torch.load('best_model_hybrid.pth'))
evaluate_model(model, test_loader)


Starting epoch 1/50
Epoch 1/50, Loss: 1.0982, Accuracy: 0.5564
Evaluation Loss: 1.0187, Accuracy: 0.5912
Starting epoch 2/50
Epoch 2/50, Loss: 0.9749, Accuracy: 0.5998
Evaluation Loss: 0.9257, Accuracy: 0.6164
Starting epoch 3/50
Epoch 3/50, Loss: 0.9152, Accuracy: 0.6167
Evaluation Loss: 0.8893, Accuracy: 0.6281
Starting epoch 4/50
Epoch 4/50, Loss: 0.8897, Accuracy: 0.6267
Evaluation Loss: 0.8749, Accuracy: 0.6334
Starting epoch 5/50
Epoch 5/50, Loss: 0.8732, Accuracy: 0.6339
Evaluation Loss: 0.8627, Accuracy: 0.6400
Starting epoch 6/50
Epoch 6/50, Loss: 0.8617, Accuracy: 0.6390
Evaluation Loss: 0.8558, Accuracy: 0.6438
Starting epoch 7/50
Epoch 7/50, Loss: 0.8524, Accuracy: 0.6433
Evaluation Loss: 0.8465, Accuracy: 0.6461
Starting epoch 8/50
Epoch 8/50, Loss: 0.8448, Accuracy: 0.6468
Evaluation Loss: 0.8442, Accuracy: 0.6485
Starting epoch 9/50
Epoch 9/50, Loss: 0.8389, Accuracy: 0.6491
Evaluation Loss: 0.8363, Accuracy: 0.6522
Starting epoch 10/50
Epoch 10/50, Loss: 0.8335, Accurac

(0.781249212879889, 0.6759686576579568)

We can slightly change the model structure, without changing the input data, to see the potential of the hybrid model.

RNN Configuration:
Original Model: Uses a unidirectional LSTM with a single layer (bidirectional not specified and num_layers not set to 2).
Tuned Model: Uses a bidirectional LSTM with 2 layers (bidirectional=True and num_layers=2).


Attention Mechanism:
Original Model: Does not include an attention mechanism.
Tuned Model: Includes a multihead attention mechanism applied to the output of the LSTM (self.attention = nn.MultiheadAttention(rnn_hidden_dim * 2, num_heads, dropout=dropout, batch_first=True)).

Convolutional Layers:
Original Model: Does not include convolutional layers.
Tuned Model: Includes convolutional layers after the attention mechanism (self.conv1, self.conv2, and self.pool).

Classifier Input:
Original Model: The classifier input is based directly on the output of the LSTM (nn.Linear(rnn_hidden_dim * sequence_length, n_classes)).
Tuned Model: The classifier input is adjusted based on the output of the convolutional layers (nn.Linear(128 * 25, 512)).

Model Dimensionality:
Original Model: Uses a smaller model dimension (model_dim = 64), fewer transformer layers (num_layers = 4), and a smaller feedforward dimension (ff_dim = 128).
Tuned Model: Uses a larger model dimension (model_dim = 128), more transformer layers (num_layers = 8), and a larger feedforward dimension (ff_dim = 512).

Batch Size and Training Configuration:
Original Model: Uses a batch size of 128 for training and validation.
Tuned Model: Uses a batch size of 64 for training and validation.

Optimization and Training Parameters:
Original Model: Does not use a learning rate scheduler and has a reduced patience of 5 for early stopping.
Tuned Model: Uses a learning rate scheduler (scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)) and a patience of 10 for early stopping.

From the results, this is the only model that reached 0.70 accuracy on the same input data. This suggests that the Transformer-RNN hybrid model has the greatest potential to achieve the best results given more computing power and more training material.


In [7]:
import torch
import torch.nn as nn
import numpy as np 
import pandas as pd
from torch.utils.data import Dataset, DataLoader, random_split
import torch.optim as optim
import random

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch-first input.

    Args:
        model_dim: Size of the last (feature) dimension of the inputs.
            Both even and odd sizes are supported.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, model_dim, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, model_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, model_dim, 2).float() * (-np.log(10000.0) / model_dim))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd model_dim there is one fewer cosine column than sine
        # columns, so trim the angles instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, :model_dim // 2])
        # Leading axis of size 1 lets the table broadcast over the batch.
        pe = pe.unsqueeze(0)
        # Registered as a buffer: it moves with the module and is stored in
        # the state dict, but it is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        ``x`` is indexed as (batch, position, model_dim); the number of
        positions must not exceed ``max_len``.
        """
        return x + self.pe[:, :x.size(1)]

class CustomTransformerEncoderLayer(nn.Module):
    """Encoder layer: self-attention and a feed-forward block, each followed
    by a residual connection and layer normalisation.

    Args:
        model_dim: Feature width of the inputs and outputs.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward block.
        dropout: Dropout probability used in attention, inside the
            feed-forward block and on both residual branches.
        activation: One of ``'relu'``, ``'gelu'`` or ``'softmax'``.

    Raises:
        ValueError: If ``activation`` is not one of the supported names.
    """

    def __init__(self, model_dim, num_heads, ff_dim, dropout=0.1, activation='relu'):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(model_dim, num_heads, dropout=dropout, batch_first=True)
        self.linear1 = nn.Linear(model_dim, ff_dim)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(ff_dim, model_dim)

        # Table of supported activations; only the selected one is built.
        supported = (
            ('relu', nn.ReLU),
            ('gelu', nn.GELU),
            ('softmax', lambda: nn.Softmax(dim=-1)),
        )
        for name, factory in supported:
            if activation == name:
                self.activation = factory()
                break
        else:
            raise ValueError(f"Unsupported activation function: {activation}")

        self.norm1 = nn.LayerNorm(model_dim)
        self.norm2 = nn.LayerNorm(model_dim)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, src):
        """Apply the layer to a batch-first tensor and return the same shape."""
        # Attention sub-layer: residual add, then normalise.
        attended, _ = self.self_attn(src, src, src)
        src = self.norm1(src + self.dropout1(attended))
        # Feed-forward sub-layer: residual add, then normalise.
        hidden = self.activation(self.linear1(src))
        projected = self.linear2(self.dropout(hidden))
        return self.norm2(src + self.dropout2(projected))

class TransformerRNNHybridModel(nn.Module):
    """Transformer encoder, bidirectional LSTM, attention and 1-D convolutions.

    The input is projected to ``model_dim``, given positional encodings and
    passed through ``num_layers`` encoder layers, a two-layer bidirectional
    LSTM and a multi-head self-attention over the LSTM outputs. Two
    convolutions, each followed by max-pooling with kernel size 2, shorten the
    sequence before a two-layer classifier produces ``n_classes`` logits.

    Args:
        input_dim: Number of features per time step.
        model_dim: Width of the input projection and encoder layers.
        num_heads: Attention heads in the encoder layers and in the attention
            applied to the LSTM outputs.
        num_layers: Number of encoder layers.
        ff_dim: Hidden width of each encoder feed-forward block.
        rnn_hidden_dim: LSTM hidden size per direction.
        n_classes: Number of output logits.
        dropout: Dropout probability used throughout the model.
        activation: Activation name forwarded to the encoder layers.
        seq_len: Number of time steps per input. The classifier is sized for
            exactly this length; the default of 100 reproduces the previously
            hard-coded ``128 * 25`` input size.
    """

    def __init__(self, input_dim, model_dim, num_heads, num_layers, ff_dim, rnn_hidden_dim, n_classes, dropout=0.1, activation='relu', seq_len=100):
        super(TransformerRNNHybridModel, self).__init__()
        self.seq_len = seq_len
        self.model_dim = model_dim
        self.input_projection = nn.Linear(input_dim, model_dim)
        self.pos_encoder = PositionalEncoding(model_dim)

        # Create custom transformer encoder layers
        self.encoder_layers = nn.ModuleList([
            CustomTransformerEncoderLayer(model_dim, num_heads, ff_dim, dropout, activation)
            for _ in range(num_layers)
        ])

        # Bidirectional, so every time step yields 2 * rnn_hidden_dim features.
        self.rnn = nn.LSTM(model_dim, rnn_hidden_dim, num_layers=2, batch_first=True, dropout=dropout, bidirectional=True)
        self.attention = nn.MultiheadAttention(rnn_hidden_dim * 2, num_heads, dropout=dropout, batch_first=True)

        # Convolutional layers (kernel 3 with padding 1 keeps the length unchanged)
        self.conv1 = nn.Conv1d(in_channels=rnn_hidden_dim * 2, out_channels=64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2)

        # The pool is applied twice in forward(); each pass halves the length
        # (rounding down), e.g. 100 -> 50 -> 25.
        pooled_len = seq_len // 2 // 2
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(self.conv2.out_channels * pooled_len, 512),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(512, n_classes)
        )
        self.init_weights()

    def init_weights(self):
        """Zero the biases and uniformly initialise the weights of the classifier."""
        initrange = 0.1
        for layer in self.classifier:
            if isinstance(layer, nn.Linear):
                layer.bias.data.zero_()
                layer.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        """Return class logits for a batch of sequences.

        ``src`` is batch-first with ``input_dim`` features per step; its
        sequence length must equal the ``seq_len`` the classifier was built for.
        """
        # Scale the projection by sqrt(model_dim) before adding position encodings.
        src = self.input_projection(src) * np.sqrt(self.model_dim)
        src = self.pos_encoder(src)
        for layer in self.encoder_layers:
            src = layer(src)
        rnn_output, _ = self.rnn(src)
        attn_output, _ = self.attention(rnn_output, rnn_output, rnn_output)

        # Conv1d expects (batch, channels, length), so swap the last two axes.
        conv_input = attn_output.permute(0, 2, 1)
        conv_output = self.pool(self.conv2(self.pool(self.conv1(conv_input))))
        output = self.classifier(conv_output)
        return output



class PianoDataset(Dataset):
    """Sliding-window next-event dataset built from a note-sequence CSV.

    The CSV must contain an ``event_type`` column, which is one-hot encoded
    and appended after the remaining columns. Each item is a window of
    ``sequence_length`` consecutive rows together with a class index derived
    from the row that follows the window.

    Args:
        csv_file: Path of the CSV file to read.
        sequence_length: Number of rows per input window.
        augment: When true, every returned window is randomly augmented.
            The flag belongs to the dataset object, so it also affects any
            subset views created from it.
    """

    def __init__(self, csv_file, sequence_length=100, augment=False):
        self.df = pd.read_csv(csv_file)
        self.df = pd.concat([self.df.drop('event_type', axis=1), pd.get_dummies(self.df['event_type'], prefix='event_type')], axis=1)
        self.df = self.df.astype(np.float32)
        self.sequence_length = sequence_length
        self.augment = augment
        self.data = self.generate_sequences()

    def generate_sequences(self):
        """Return a list of ``(window, target)`` pairs covering the whole table."""
        # Slice one NumPy array instead of calling DataFrame.iloc per window.
        # The windows are overlapping views, so they must never be modified.
        values = self.df.to_numpy()
        window = self.sequence_length
        # NOTE(review): the target is the argmax over ALL columns of the next
        # row, not only the one-hot event_type columns - confirm this is the
        # intended label.
        return [
            (values[start:start + window], values[start + window].argmax())
            for start in range(len(values) - window)
        ]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(float32 window tensor, int64 target tensor)`` for ``idx``."""
        sequence, target = self.data[idx]
        if self.augment:
            sequence = self.augment_sequence(sequence)
        sequence_tensor = torch.tensor(sequence, dtype=torch.float32)
        target_tensor = torch.tensor(target, dtype=torch.long)
        return sequence_tensor, target_tensor

    def augment_sequence(self, sequence):
        """Return a randomly augmented copy of ``sequence``.

        Each of the three augmentations is applied independently with
        probability 0.5. The input array is left untouched: the stored windows
        overlap and share memory, so modifying them in place would make the
        augmentation accumulate over epochs and leak into neighbouring samples.
        """
        sequence = sequence.copy()

        # Transposition
        # NOTE(review): presumably column 0 holds the pitch - confirm against the CSV.
        if random.random() < 0.5:
            transpose_amount = random.randint(-5, 5)
            sequence[:, 0] += transpose_amount

        # Time-stretching
        # NOTE(review): presumably column 1 holds a time value - confirm against the CSV.
        if random.random() < 0.5:
            stretch_factor = random.uniform(0.8, 1.2)
            sequence[:, 1] *= stretch_factor

        # Adding Noise (applied to every column, including the one-hot ones)
        if random.random() < 0.5:
            noise = np.random.normal(0, 0.01, sequence.shape)
            sequence += noise

        return sequence

def train_valid_test_split(dataset, train_ratio=0.7, valid_ratio=0.15, test_ratio=0.15):
    """Randomly split ``dataset`` into train, validation and test subsets.

    Args:
        dataset: Any object accepted by ``random_split`` (needs ``len``).
        train_ratio: Fraction of items for the training subset.
        valid_ratio: Fraction of items for the validation subset.
        test_ratio: Fraction of items for the test subset.

    Returns:
        A ``(train, valid, test)`` tuple of subsets. The train and validation
        sizes are rounded down; the test subset receives the remainder.

    Raises:
        AssertionError: If the three ratios do not sum to 1.
    """
    # Compare with a tolerance: in floating point 0.7 + 0.2 + 0.1 is
    # 0.9999999999999999, which an exact equality test would reject.
    assert abs(train_ratio + valid_ratio + test_ratio - 1) < 1e-9, "Ratios must sum to 1"

    total = len(dataset)
    train_size = int(train_ratio * total)
    valid_size = int(valid_ratio * total)
    test_size = total - train_size - valid_size

    train_dataset, valid_dataset, test_dataset = random_split(dataset, [train_size, valid_size, test_size])
    return train_dataset, valid_dataset, test_dataset

# Define model parameters (passed positionally to TransformerRNNHybridModel below)
input_dim = 10  # input width of the model's first linear projection
model_dim = 128  # width of the transformer encoder layers
num_heads = 8  # attention heads (encoder layers and the attention after the LSTM)
num_layers = 8  # number of stacked encoder layers
ff_dim = 512  # hidden width of each encoder feed-forward block
rnn_hidden_dim = 256  # LSTM hidden size per direction
n_classes = 8  # number of output logits
sequence_length = 100  # window length given to PianoDataset; the model's classifier is sized for it

# Set device and initialize the model (the optimizer is created further down)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
activation_function = 'relu'  # activation used inside the encoder layers
model = TransformerRNNHybridModel(input_dim, model_dim, num_heads, num_layers, ff_dim, rnn_hidden_dim, n_classes, activation=activation_function).to(device)

# Initialize weights
def initialize_weights(m):
    """Xavier-initialise the weight of a linear layer and set its bias to 0.01.

    Intended for ``model.apply``; modules that are not ``nn.Linear`` are
    left unchanged.
    """
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        # A layer built with bias=False has no bias tensor to fill.
        if m.bias is not None:
            m.bias.data.fill_(0.01)

# Re-initialise every nn.Linear module in the model. This also overwrites the
# classifier values set by the model's own init_weights() in its constructor.
model.apply(initialize_weights)

# Adam optimizer with a learning rate of 1e-4 and L2 regularization (weight_decay=1e-5)
optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-5)

# Define learning rate scheduler: multiply the learning rate by 0.1 every 10 scheduler steps
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

# Define loss function (shared as a global by train_model and evaluate_model)
criterion = nn.CrossEntropyLoss()

# Create dataset and dataloaders
# Machine-specific absolute Windows path; adjust it before running elsewhere.
output_csv = 'D:\\Projects\\Resources\\midis_v1.2\\Beethoven\\Beethoven_note_sequence.csv'
# NOTE: augmentation is switched on for the whole dataset, so the validation
# and test subsets created below return augmented samples as well.
full_dataset = PianoDataset(output_csv, sequence_length, augment=True)

# Split the dataset (default ratios: 70% train, 15% validation, 15% test)
train_dataset, valid_dataset, test_dataset = train_valid_test_split(full_dataset)

# Create DataLoaders; only the training loader shuffles
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)  # smaller than the 128 used for the original model
valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Training function with gradient clipping and early stopping
def train_model(model, train_loader, valid_loader, optimizer, scheduler, num_epochs=50, patience=10):
    """Train ``model`` and keep the checkpoint with the best validation accuracy.

    Uses the module-level ``device``, ``criterion`` and ``evaluate_model``.
    Gradients are clipped to a total norm of 1.0 before every optimizer step.
    After each epoch the model is validated and the scheduler is stepped;
    whenever the validation accuracy improves, the weights are saved to
    ``best_model_hybrid.pth``.

    Args:
        model: Network to train (already on ``device``).
        train_loader: Loader yielding ``(inputs, targets)`` training batches.
        valid_loader: Loader passed to ``evaluate_model`` after each epoch.
        optimizer: Optimizer that updates ``model``'s parameters.
        scheduler: Learning-rate scheduler, stepped once per epoch.
        num_epochs: Maximum number of epochs.
        patience: Stop after this many consecutive epochs without improvement.
    """
    model.train()
    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint, even if its validation accuracy is exactly 0.0.
    best_accuracy = float('-inf')
    patience_counter = 0

    for epoch in range(num_epochs):
        running_loss = 0.0
        correct_predictions = 0
        total_predictions = 0
        print(f'Starting epoch {epoch+1}/{num_epochs}')

        for batch_idx, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)

            # Ensure outputs and targets are correctly shaped
            if outputs.shape[0] != targets.shape[0]:
                print(f"Shape mismatch: outputs shape {outputs.shape}, targets shape {targets.shape}")
                continue

            loss = criterion(outputs, targets)
            loss.backward()
            # Apply gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            # Weight the batch mean by the batch size to get a per-sample average later.
            running_loss += loss.item() * inputs.size(0)

            # Calculate accuracy
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == targets).sum().item()
            total_predictions += targets.size(0)

        # max(..., 1) avoids a ZeroDivisionError when the loader is empty or
        # every batch was skipped above.
        epoch_loss = running_loss / max(len(train_loader.dataset), 1)
        epoch_accuracy = correct_predictions / max(total_predictions, 1)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')

        # Validate the model
        val_loss, val_accuracy = evaluate_model(model, valid_loader)

        # Adjust learning rate
        scheduler.step()

        # Early stopping
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            patience_counter = 0
            torch.save(model.state_dict(), 'best_model_hybrid.pth')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f'Early stopping at epoch {epoch+1}')
                break

def evaluate_model(model, dataloader):
    """Compute the average loss and accuracy of ``model`` over ``dataloader``.

    Uses the module-level ``device`` and ``criterion``. The model is put into
    evaluation mode for the duration of the call and then returned to whatever
    mode it was in before, so a model that was in training mode (as inside
    ``train_model``) is back in training mode afterwards.

    Returns:
        ``(average loss per sample, accuracy)``; both are 0.0 for an empty loader.
    """
    was_training = model.training
    model.eval()  # Set the model to evaluation mode
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    try:
        with torch.no_grad():
            for inputs, targets in dataloader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                running_loss += loss.item() * inputs.size(0)

                # Calculate accuracy
                _, predicted = torch.max(outputs, 1)
                correct_predictions += (predicted == targets).sum().item()
                total_predictions += targets.size(0)
    finally:
        # Restore the caller's mode even if evaluation fails part-way.
        model.train(was_training)

    # max(..., 1) avoids a ZeroDivisionError for an empty split.
    total_loss = running_loss / max(len(dataloader.dataset), 1)
    accuracy = correct_predictions / max(total_predictions, 1)
    print(f'Evaluation Loss: {total_loss:.4f}, Accuracy: {accuracy:.4f}')
    return total_loss, accuracy

# Train for up to 100 epochs with early stopping (patience=10); train_model saves the best checkpoint
train_model(model, train_loader, valid_loader, optimizer, scheduler, num_epochs=100, patience=10)

# Load the best model and evaluate on the test set
# (the returned (loss, accuracy) tuple is the cell's displayed result)
model.load_state_dict(torch.load('best_model_hybrid.pth'))
evaluate_model(model, test_loader)


Starting epoch 1/100
Epoch 1/100, Loss: 1.1488, Accuracy: 0.5349
Evaluation Loss: 1.1366, Accuracy: 0.5373
Starting epoch 2/100
Epoch 2/100, Loss: 1.1288, Accuracy: 0.5391
Evaluation Loss: 1.0786, Accuracy: 0.5695
Starting epoch 3/100
Epoch 3/100, Loss: 0.9407, Accuracy: 0.6161
Evaluation Loss: 0.8669, Accuracy: 0.6403
Starting epoch 4/100
Epoch 4/100, Loss: 0.8454, Accuracy: 0.6483
Evaluation Loss: 0.8212, Accuracy: 0.6595
Starting epoch 5/100
Epoch 5/100, Loss: 0.8142, Accuracy: 0.6617
Evaluation Loss: 0.8018, Accuracy: 0.6709
Starting epoch 6/100
Epoch 6/100, Loss: 0.7923, Accuracy: 0.6720
Evaluation Loss: 0.7822, Accuracy: 0.6775
Starting epoch 7/100
Epoch 7/100, Loss: 0.7752, Accuracy: 0.6798
Evaluation Loss: 0.7730, Accuracy: 0.6814
Starting epoch 8/100
Epoch 8/100, Loss: 0.7608, Accuracy: 0.6862
Evaluation Loss: 0.7733, Accuracy: 0.6832
Starting epoch 9/100
Epoch 9/100, Loss: 0.7478, Accuracy: 0.6923
Evaluation Loss: 0.7650, Accuracy: 0.6889
Starting epoch 10/100
Epoch 10/100, L

(0.7410412680628196, 0.7004714631959893)