After hybrid and transformer model, we can take a look at the traditional CNN model which performes really well with sequenced data such as literature and for this case, piano scripts. 
Fist we can try some simple layers for performance estimate. 
Structure:
Convolutional Layers: 3 convolutional layers with 32, 64, and 128 filters, respectively.
Pooling: Max pooling after each convolutional layer.
Fully Connected Layers: 2 fully connected layers with 256 and num_classes units.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import numpy as np
import pandas as pd
import random

class PianoDataset(Dataset):
    def __init__(self, csv_file, sequence_length=100, augment=False):
        self.df = pd.read_csv(csv_file)
        self.df = pd.concat([self.df.drop('event_type', axis=1), pd.get_dummies(self.df['event_type'], prefix='event_type')], axis=1)
        self.df = self.df.astype(np.float32)  # Ensure all data is of type float32
        self.sequence_length = sequence_length
        self.augment = augment
        self.data = self.generate_sequences()

    def generate_sequences(self):
        input_sequences = []
        target_events = []
        for i in range(len(self.df) - self.sequence_length):
            full_sequence = self.df.iloc[i:i+self.sequence_length+1]
            input_sequence = full_sequence.iloc[:-1]
            target_event = full_sequence.iloc[-1]
            input_sequences.append(input_sequence.values)
            target_events.append(target_event.values.argmax())  # Use class index as target
        return list(zip(input_sequences, target_events))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sequence, target = self.data[idx]
        if self.augment:
            sequence = self.augment_sequence(sequence)
        sequence_tensor = torch.tensor(sequence, dtype=torch.float32)
        target_tensor = torch.tensor(target, dtype=torch.long)
        return sequence_tensor, target_tensor

    def augment_sequence(self, sequence):
        # Transposition
        if random.random() < 0.5:
            transpose_amount = random.randint(-5, 5)
            sequence[:, 0] += transpose_amount  
        
        # Time-stretching
        if random.random() < 0.5:
            stretch_factor = random.uniform(0.8, 1.2)
            sequence[:, 1] *= stretch_factor  
        
        # Adding Noise
        if random.random() < 0.5:
            noise = np.random.normal(0, 0.01, sequence.shape)
            sequence += noise
        
        return sequence

class CNNModel(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(CNNModel, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=(3, 3), padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3), padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=(3, 3), padding=1)
        self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        self.fc1 = nn.Linear(128 * (input_dim // 8) * (sequence_length // 8), 256)
        self.fc2 = nn.Linear(256, num_classes)
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x):
        x = x.unsqueeze(1)  
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = x.view(x.size(0), -1) 
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

sequence_length = 100 
input_dim = 10  
num_classes = 8  
model = CNNModel(input_dim, num_classes)


In [2]:
# Define the path to the CSV file
output_csv = 'D:\\Projects\\Resources\\midis_v1.2\\Beethoven\\Beethoven_note_sequence.csv'

# Create dataset and dataloaders
full_dataset = PianoDataset(csv_file=output_csv, sequence_length=100, augment=True)

def train_valid_test_split(dataset, train_ratio=0.7, valid_ratio=0.15, test_ratio=0.15):
    assert train_ratio + valid_ratio + test_ratio == 1, "Ratios must sum to 1"
    
    train_size = int(train_ratio * len(dataset))
    valid_size = int(valid_ratio * len(dataset))
    test_size = len(dataset) - train_size - valid_size
    
    train_dataset, valid_dataset, test_dataset = random_split(dataset, [train_size, valid_size, test_size])
    return train_dataset, valid_dataset, test_dataset

# Split the dataset
train_dataset, valid_dataset, test_dataset = train_valid_test_split(full_dataset)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True) 
valid_loader = DataLoader(valid_dataset, batch_size=128, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [4]:
import torch.nn.functional as F

# Set device and initialize model and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNNModel(input_dim, num_classes).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()

def train_model(model, train_loader, valid_loader, optimizer, num_epochs=50, patience=5):
    model.train()
    best_accuracy = 0.0
    patience_counter = 0

    for epoch in range(num_epochs):
        running_loss = 0.0
        correct_predictions = 0
        total_predictions = 0
        print(f'Starting epoch {epoch+1}/{num_epochs}')
        
        for batch_idx, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            
            if outputs.shape[0] != targets.shape[0]:
                print(f"Shape mismatch: outputs shape {outputs.shape}, targets shape {targets.shape}")
                continue
            
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)
            
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == targets).sum().item()
            total_predictions += targets.size(0)

        epoch_loss = running_loss / len(train_loader.dataset)
        epoch_accuracy = correct_predictions / total_predictions
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')
        
        val_loss, val_accuracy = evaluate_model(model, valid_loader)
        
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            patience_counter = 0
            torch.save(model.state_dict(), 'best_cnn_model.pth')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f'Early stopping at epoch {epoch+1}')
                break

def evaluate_model(model, data_loader):
    model.eval()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    
    with torch.no_grad():
        for inputs, targets in data_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            running_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == targets).sum().item()
            total_predictions += targets.size(0)
    
    loss = running_loss / len(data_loader.dataset)
    accuracy = correct_predictions / total_predictions
    print(f'Validation Loss: {loss:.4f}, Accuracy: {accuracy:.4f}')
    return loss, accuracy

# Train the model
train_model(model, train_loader, valid_loader, optimizer, num_epochs=50, patience=5)

# Evaluate the model on the test set
model.load_state_dict(torch.load('best_cnn_model.pth'))
test_loss, test_accuracy = evaluate_model(model, test_loader)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')


Starting epoch 1/50


  return F.conv2d(input, weight, bias, self.stride,


Epoch 1/50, Loss: 1.1535, Accuracy: 0.5384
Validation Loss: 1.0426, Accuracy: 0.5759
Starting epoch 2/50
Epoch 2/50, Loss: 0.9583, Accuracy: 0.6077
Validation Loss: 0.9327, Accuracy: 0.6137
Starting epoch 3/50
Epoch 3/50, Loss: 0.8868, Accuracy: 0.6330
Validation Loss: 0.8777, Accuracy: 0.6375
Starting epoch 4/50
Epoch 4/50, Loss: 0.8566, Accuracy: 0.6450
Validation Loss: 0.8624, Accuracy: 0.6430
Starting epoch 5/50
Epoch 5/50, Loss: 0.8368, Accuracy: 0.6525
Validation Loss: 0.8491, Accuracy: 0.6482
Starting epoch 6/50
Epoch 6/50, Loss: 0.8209, Accuracy: 0.6597
Validation Loss: 0.8411, Accuracy: 0.6513
Starting epoch 7/50
Epoch 7/50, Loss: 0.8056, Accuracy: 0.6659
Validation Loss: 0.8403, Accuracy: 0.6528
Starting epoch 8/50
Epoch 8/50, Loss: 0.7916, Accuracy: 0.6720
Validation Loss: 0.8383, Accuracy: 0.6532
Starting epoch 9/50
Epoch 9/50, Loss: 0.7761, Accuracy: 0.6790
Validation Loss: 0.8447, Accuracy: 0.6518
Starting epoch 10/50
Epoch 10/50, Loss: 0.7599, Accuracy: 0.6859
Validation

And we can tuning some parameter and change the layer to more complex structure for better performance.
Structuer:
Initial Convolutional Layer: 1 convolutional layer with 64 filters, batch normalization, and max pooling.
Residual Blocks: 3 residual blocks with 64 to 128, 128 to 256, and 256 to 512 filters, respectively.
Global Average Pooling: Reduces each feature map to a single value.
Fully Connected Layers: 2 fully connected layers with 256 and num_classes units.
Dropout: Included dropout before the final fully connected layer.
And we can see from the result that the performance have been improved decent amount indicating that CNN model handles the sequenced data well.

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import numpy as np
import pandas as pd
import random

class PianoDataset(Dataset):
    def __init__(self, data=None, labels=None, csv_file=None, sequence_length=100, augment=False):
        if csv_file:
            self.df = pd.read_csv(csv_file)
            self.df = pd.concat([self.df.drop('event_type', axis=1), pd.get_dummies(self.df['event_type'], prefix='event_type')], axis=1)
            self.df = self.df.astype(np.float32)  # Ensure all data is of type float32
            self.sequence_length = sequence_length
            self.augment = augment
            self.data = self.generate_sequences()
        else:
            self.data = list(zip(data, labels))
            self.augment = augment

    def generate_sequences(self):
        input_sequences = []
        target_events = []
        for i in range(len(self.df) - self.sequence_length):
            full_sequence = self.df.iloc[i:i+self.sequence_length+1]
            input_sequence = full_sequence.iloc[:-1]
            target_event = full_sequence.iloc[-1]
            input_sequences.append(input_sequence.values)
            target_events.append(target_event.values.argmax()) 
        return list(zip(input_sequences, target_events))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sequence, target = self.data[idx]
        if self.augment:
            sequence = self.augment_sequence(sequence)
        sequence_tensor = torch.tensor(sequence, dtype=torch.float32)
        target_tensor = torch.tensor(target, dtype=torch.long)
        return sequence_tensor, target_tensor

    def augment_sequence(self, sequence):
        # Transposition
        if random.random() < 0.5:
            transpose_amount = random.randint(-5, 5)
            sequence[:, 0] += transpose_amount  
        
        # Time-stretching
        if random.random() < 0.5:
            stretch_factor = random.uniform(0.8, 1.2)
            sequence[:, 1] *= stretch_factor  
        
        # Adding Noise
        if random.random() < 0.5:
            noise = np.random.normal(0, 0.01, sequence.shape)
            sequence += noise
        
        return sequence

class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=(3, 3), padding=1):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, padding=padding)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=kernel_size, padding=padding)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.shortcut = nn.Sequential()
        if in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1),
                nn.BatchNorm2d(out_channels)
            )
    
    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out

class CNNModel(nn.Module):
    def __init__(self, input_dim, sequence_length, num_classes):
        super(CNNModel, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=(3, 3), padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        )
        self.layer2 = ResidualBlock(64, 128)
        self.layer3 = ResidualBlock(128, 256)
        self.layer4 = ResidualBlock(256, 512)
        self.global_avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc1 = nn.Linear(512, 256)
        self.fc2 = nn.Linear(256, num_classes)
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x):
        x = x.unsqueeze(1)  
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.global_avg_pool(x)
        x = x.view(x.size(0), -1) 
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

sequence_length = 100  
input_dim = 10  
num_classes = 8  
model = CNNModel(input_dim, sequence_length, num_classes)

# Define the path to the CSV file
output_csv = 'D:\\Projects\\Resources\\midis_v1.2\\Beethoven\\Beethoven_note_sequence.csv'

# Create dataset and dataloaders
full_dataset = PianoDataset(csv_file=output_csv, sequence_length=100, augment=True)

def train_valid_test_split(dataset, train_ratio=0.7, valid_ratio=0.15, test_ratio=0.15):
    assert train_ratio + valid_ratio + test_ratio == 1, "Ratios must sum to 1"
    
    train_size = int(train_ratio * len(dataset))
    valid_size = int(valid_ratio * len(dataset))
    test_size = len(dataset) - train_size - valid_size
    
    train_dataset, valid_dataset, test_dataset = random_split(dataset, [train_size, valid_size, test_size])
    return train_dataset, valid_dataset, test_dataset

# Split the dataset
train_dataset, valid_dataset, test_dataset = train_valid_test_split(full_dataset)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)  
valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Set device and initialize model and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNNModel(input_dim, sequence_length, num_classes).to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.001)  
criterion = nn.CrossEntropyLoss()

def train_model(model, train_loader, valid_loader, optimizer, num_epochs=50, patience=5):
    model.train()
    best_accuracy = 0.0
    patience_counter = 0

    for epoch in range(num_epochs):
        running_loss = 0.0
        correct_predictions = 0
        total_predictions = 0
        print(f'Starting epoch {epoch+1}/{num_epochs}')
        
        for batch_idx, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            
            if outputs.shape[0] != targets.shape[0]:
                print(f"Shape mismatch: outputs shape {outputs.shape}, targets shape {targets.shape}")
                continue
            
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)
            
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == targets).sum().item()
            total_predictions += targets.size(0)

        epoch_loss = running_loss / len(train_loader.dataset)
        epoch_accuracy = correct_predictions / total_predictions
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')
        
        val_loss, val_accuracy = evaluate_model(model, valid_loader)
        
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            patience_counter = 0
            torch.save(model.state_dict(), 'best_cnn_model.pth')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f'Early stopping at epoch {epoch+1}')
                break

def evaluate_model(model, data_loader):
    model.eval()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    
    with torch.no_grad():
        for inputs, targets in data_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            running_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == targets).sum().item()
            total_predictions += targets.size(0)
    
    loss = running_loss / len(data_loader.dataset)
    accuracy = correct_predictions / total_predictions
    print(f'Validation Loss: {loss:.4f}, Accuracy: {accuracy:.4f}')
    return loss, accuracy

# Train the model
train_model(model, train_loader, valid_loader, optimizer, num_epochs=50, patience=5)

# Evaluate the model on the test set
model.load_state_dict(torch.load('best_cnn_model.pth'))
test_loss, test_accuracy = evaluate_model(model, test_loader)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')


Starting epoch 1/50


  return F.conv2d(input, weight, bias, self.stride,


Epoch 1/50, Loss: 1.0105, Accuracy: 0.5955
Validation Loss: 0.9105, Accuracy: 0.6274
Starting epoch 2/50
Epoch 2/50, Loss: 0.8684, Accuracy: 0.6391
Validation Loss: 0.8425, Accuracy: 0.6479
Starting epoch 3/50
Epoch 3/50, Loss: 0.8321, Accuracy: 0.6541
Validation Loss: 0.8305, Accuracy: 0.6533
Starting epoch 4/50
Epoch 4/50, Loss: 0.8104, Accuracy: 0.6632
Validation Loss: 0.8125, Accuracy: 0.6601
Starting epoch 5/50
Epoch 5/50, Loss: 0.7949, Accuracy: 0.6699
Validation Loss: 0.7993, Accuracy: 0.6708
Starting epoch 6/50
Epoch 6/50, Loss: 0.7810, Accuracy: 0.6765
Validation Loss: 0.7915, Accuracy: 0.6733
Starting epoch 7/50
Epoch 7/50, Loss: 0.7687, Accuracy: 0.6820
Validation Loss: 0.7874, Accuracy: 0.6753
Starting epoch 8/50
Epoch 8/50, Loss: 0.7566, Accuracy: 0.6873
Validation Loss: 0.7847, Accuracy: 0.6770
Starting epoch 9/50
Epoch 9/50, Loss: 0.7443, Accuracy: 0.6924
Validation Loss: 0.7890, Accuracy: 0.6743
Starting epoch 10/50
Epoch 10/50, Loss: 0.7313, Accuracy: 0.6988
Validation

Reasons for Performance Improvement
Residual Blocks:

Ease of Training: Residual blocks help mitigate the vanishing gradient problem, making it easier to train deeper networks. They allow gradients to flow through the network more effectively during backpropagation.
Learning Complex Patterns: By enabling the network to bypass certain layers, residual blocks allow the model to learn complex patterns more efficiently.
Batch Normalization:

Stabilizing Training: Batch normalization normalizes the inputs to each layer, which helps stabilize and accelerate training by reducing internal covariate shift.
Regularization Effect: It has a slight regularization effect, reducing the need for other forms of regularization like dropout.
Global Average Pooling:

Parameter Reduction: Global average pooling reduces the number of parameters before the fully connected layers, which helps prevent overfitting.
Spatial Information: It captures the spatial information by averaging each feature map, which can be beneficial for recognizing patterns in the input data.
Increased Depth:

Feature Extraction: Increasing the depth of the network allows it to extract more detailed and abstract features from the input data, which can improve performance.
Dropout:

Preventing Overfitting: Dropout helps prevent overfitting by randomly dropping units during training, which forces the network to learn more robust features.
Reasons for Marginal Performance Improvement
Data Quality and Quantity:

Limited Data: If the dataset is small or not diverse enough, the model might not benefit significantly from increased complexity. The improvements in architecture might not be fully realized without sufficient data.
Data Quality: Noisy or low-quality data can limit the model's ability to learn effectively, regardless of the architecture improvements.
Model Complexity:

Overfitting: Adding more layers and parameters increases the risk of overfitting, especially if the dataset is not large enough to support the increased complexity.
Diminishing Returns: Beyond a certain point, adding more layers or complexity does not necessarily translate to better performance. There are diminishing returns as the model might already capture most of the patterns present in the data.
Hyperparameter Tuning:

Suboptimal Hyperparameters: The performance gain from architectural changes can be limited if the hyperparameters (learning rate, batch size, etc.) are not well-tuned for the new architecture.
Training Time and Resources: Deeper and more complex models require more computational resources and time to train, which might limit the extent of hyperparameter tuning.
Task Complexity:

Nature of the Task: The specific task of processing piano note sequences might not benefit as much from deeper or more complex architectures compared to other tasks like image recognition. The inherent complexity of the task plays a role in determining the effectiveness of model modifications.
Implementation Details:

Initial Weights and Training Dynamics: The initial weights and the dynamics of training (such as the optimizer used and the learning rate schedule) can also impact the effectiveness of the modifications. Small changes in these aspects can lead to varying degrees of performance improvement.
Conclusion
While the modifications (residual blocks, batch normalization, global average pooling, increased depth, and dropout) are theoretically and practically sound for improving a CNN's performance, the actual improvement observed can be marginal due to factors like data quality, model complexity, hyperparameter tuning, task-specific characteristics, and implementation details. To achieve significant performance gains, it is essential to carefully balance these factors and consider further experimentation with different architectures, data augmentation techniques, and hyperparameter optimization methods.