In [1]:
import librosa
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from torchvision.transforms import Compose
import random
import pickle
import torch
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import audiosegment
import torchaudio.transforms as T
from coatnet import CoAtNet as CoAtNetImp
import matplotlib.pyplot as plt
import time
from sklearn.base import BaseEstimator
#Save results to csv
from sklearn.metrics import accuracy_score, precision_score, recall_score
import sklearn

#for custom libraries
import sys
sys.path.insert(1,'../SimplifiedPythonFiles/')
import NoiseFilterTorch as nft #unfortunate

#for language model
from openai import Client #better choice than ollama

# About

This version of the model will attempt to use Torch to speed up some of the audio processing.

## Building the dataset

In [2]:
# device selection
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Device: {device}")

MBP_AUDIO_DIR='../MKA-dataset/', '../Dataset-custom-audio/base-audio-denoised-normalized/', '../Dataset-for-Binary/base-audio/'
PREFIX='','','audio_'
SUFFIX='mac','',''
DATASET_PICK='MKA','Custom_Denoised_Normalized','Binary'

keys = list('1234567890QWERTYUIOPASDFGHJKLZXCVBNM') # remove + - when using dataset for binary

Device: cuda


In [9]:
#SELECT DATASET
dataset_choice=2

In [7]:
# Usage - with NoiseFilterTorch
audio_dataset, _ = nft.create_dataset(keys, MBP_AUDIO_DIR[dataset_choice], plot=False,  preffix=PREFIX[dataset_choice], suffix=SUFFIX[dataset_choice], length=1000) 
filter_pick="nft"

print(audio_dataset)

path: ../Dataset-custom-audio/base-audio-denoised-normalized/1.wav
File 1.wav chunks: 41
['1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1']
['1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1']
path: ../Dataset-custom-audio/base-audio-denoised-normalized/2.wav


  peak_diffs = torch.diff(torch.tensor(peaks)).float()


File 2.wav chunks: 40
['2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2']
['1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2']
path: ../Dataset-custom-audio/base-audio-denoised-normalized/3.wav
File 3.wav chunks: 40
['3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3']
['1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', 

In [50]:
#filter_pick="kit"
filter_pick="nft"

In [51]:
# Write audio_dataset to a file to avoid running this shit over & over again
with open(f'audio_dataset_{filter_pick}_{DATASET_PICK[dataset_choice]}.pkl', 'wb') as f:
    pickle.dump(audio_dataset, f)

In [52]:
# Read audio_dataset back from the file
with open(f'audio_dataset_{filter_pick}_{DATASET_PICK[dataset_choice]}.pkl', 'rb') as f:
    audio_dataset = pickle.load(f)

In [8]:
audio_dataset

Unnamed: 0,Key,File
0,0,"[tensor(-0.1132), tensor(-0.1132), tensor(-0.1..."
1,0,"[tensor(-0.0311), tensor(-0.0341), tensor(-0.0..."
2,0,"[tensor(0.1829), tensor(0.1940), tensor(0.1954..."
3,0,"[tensor(0.0453), tensor(0.0492), tensor(0.0548..."
4,0,"[tensor(-0.0308), tensor(-0.0333), tensor(-0.0..."
...,...,...
1462,35,"[tensor(-0.0219), tensor(-0.0194), tensor(-0.0..."
1463,35,"[tensor(-0.0394), tensor(-0.0439), tensor(-0.0..."
1464,35,"[tensor(-0.0198), tensor(-0.0202), tensor(-0.0..."
1465,35,"[tensor(-0.0631), tensor(-0.0808), tensor(-0.0..."


## Preprocessing the dataset

In [54]:
def get_audio_length(audio_path):
    audio = audiosegment.from_file(audio_path)
    return audio.duration_seconds

def convert_to_ms(t):
    return round(t*1000)

def get_audio_length_average(audio_path, keys):
    lengths = []
    for i, File in enumerate(keys):
        loc = audio_path + File
        length = get_audio_length(loc)
        print(f'File {loc} length: {length:2f}\n')
        lengths.append(length)
    average = torch.mean(torch.tensor(lengths))
    return convert_to_ms(average)

In [55]:
def random_uniform_torch(all_frames_num, num_frames_to_mask):
    low = 0.0
    high = all_frames_num - num_frames_to_mask
    t0 = torch.empty(1).uniform_(low, high).item()
    return t0

In [56]:
def time_shift(samples):
    samples = samples.flatten()
    shift = int(len(samples) * 0.4) #Max shift (0.4)
    random_shift = random.randint(0, shift) #Random number between 0 and 0.4*len(samples)
    data_roll = torch.roll(samples, random_shift)
    return data_roll

def masking(samples):
    num_mask = 2
    freq_masking_max_percentage=0.10
    time_masking_max_percentage=0.10
    spec = samples
    mean_value = spec.mean()
    for i in range(num_mask):
        all_frames_num, all_freqs_num = spec.shape[1], spec.shape[1] 
        freq_percentage = random.uniform(0.1, freq_masking_max_percentage)

        num_freqs_to_mask = int(freq_percentage * all_freqs_num)
        f0 = random_uniform_torch(all_freqs_num, num_freqs_to_mask)
        f0 = int(f0)
        spec[:, f0:f0 + num_freqs_to_mask] = mean_value

        time_percentage = random.uniform(0.1, time_masking_max_percentage)

        num_frames_to_mask = int(time_percentage * all_frames_num)
        t0 = random_uniform_torch(all_frames_num, num_frames_to_mask)
        t0 = int(t0)
        spec[t0:t0 + num_frames_to_mask, :] = mean_value
    return spec

In [57]:
class ToMelSpectrogram:
    def __init__(self, device='cpu', audio_length=14400, sample_rate=44100, n_mels=64, n_fft=1024, hop_length=225, plot=True):
        self.audio_length = audio_length
        self.sample_rate = sample_rate
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.plot=plot
        print(f'device::{device}')
        self.device = torch.device(device)
        self.mel_spectrogram = T.MelSpectrogram(
            sample_rate=self.sample_rate,
            n_mels=self.n_mels,
            n_fft=self.n_fft,
            hop_length=self.hop_length
        ).to(self.device)

    def __call__(self, samples):
        # print(f"Original samples shape: {samples.shape}")
        
        if len(samples) > self.audio_length:
            samples = samples[:self.audio_length]
        elif len(samples) < self.audio_length:
            samples = torch.nn.functional.pad(samples, (0, self.audio_length - len(samples)), mode='constant')

        # print(f"Samples shape after padding/trimming: {samples.shape}")

        samples = samples.unsqueeze(0).to(self.device)  # Shape: (1, 1, audio_length)
        # print(f"Samples shape after unsqueeze and to(device): {samples.shape}")

        mel_spec = self.mel_spectrogram(samples)
        # print(f"Mel spectrogram shape: {mel_spec.shape}")

        mel_spec_resized = torch.nn.functional.interpolate(mel_spec.unsqueeze(0), size=(64, 64), mode='bilinear', align_corners=False)
        # print(f"Mel spectrogram resized shape: {mel_spec_resized.shape}")

        if self.plot:
            self.plot_melspec(mel_spec_resized)
        return mel_spec_resized.squeeze(0)
    
    def plot_melspec(self, mel_spec):
        # Assuming mel_spec is a CUDA tensor
        mel_spec_cpu = mel_spec.cpu().numpy()
        # print(f"Mel spectrogram shape on CPU: {mel_spec_cpu.shape}")

        # Convert the mel spectrogram to decibel units
        mel_spec_db = librosa.power_to_db(mel_spec_cpu.squeeze(), ref=np.max)
        # print(f"Mel spectrogram shape after squeezing: {mel_spec_db.shape}")


        # Plot the mel spectrogram
        plt.figure(figsize=(10, 4))
        librosa.display.specshow(mel_spec_db, sr=self.sample_rate, hop_length=self.hop_length, x_axis='time', y_axis='mel', fmax=8000)
        plt.colorbar(format='%+2.0f dB')
        plt.title('Mel Spectrogram')
        plt.tight_layout()
        plt.show()

class ToMfcc:
    def __init__(self, device='cpu', audio_length=14400, sample_rate=44100, n_mfcc=13, n_fft=1024, hop_length=512):
        self.audio_length = audio_length
        self.sample_rate = sample_rate
        self.n_mfcc = n_mfcc
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.device = torch.device(device)
        self.mfcc_transform = T.MFCC(
            sample_rate=self.sample_rate,
            n_mfcc=self.n_mfcc,
            melkwargs={
                'n_fft': self.n_fft,
                'hop_length': self.hop_length,
                'n_mels': 64,
                'center': True,
                'pad_mode': 'reflect',
                'power': 2.0
            }
        ).to(self.device)


    def __call__(self, samples):
        if len(samples) > self.audio_length:
            samples = samples[:self.audio_length]
        elif len(samples) < self.audio_length:
            samples = F.pad(samples, (0, self.audio_length - len(samples)), mode='constant')

        samples = samples.unsqueeze(0).to(self.device)  # Shape: (1, audio_length)
        mfcc_spec = self.mfcc_transform(samples)
        mfcc_spec = mfcc_spec.squeeze(0).transpose(0, 1)  # Shape: (time, n_mfcc)
        return mfcc_spec


In [58]:
transform = Compose([ToMelSpectrogram(device,12000,plot=False)])
transform_mfcc = Compose([ToMfcc(device, 12000)])

device::cuda


In [59]:
audio_samples = audio_dataset['File'].values.tolist()
labels = audio_dataset['Key'].values.tolist()

audio_samples_new = audio_samples.copy() # audio samples CNN
print(len(audio_samples))

print(type(audio_samples[0]))

for i, sample in enumerate(audio_samples):
    audio_samples_new.append(time_shift(sample))
    labels.append(labels[i])

# convert labels to a numpy array
labels = np.array(labels)
print(len(audio_samples_new))
print(len(labels))

901
<class 'torch.Tensor'>
1802
1802


In [60]:
audioDatasetFin, audioDatasetFinMasking, audioDatasetMfcc, audioDatasetMfccMasking = [], [], [], []

for i in range(len(audio_samples_new)):
    # Print the shape of the input tensor
    print(f"Shape of input tensor before transformation: {audio_samples_new[i].shape}")
    transformed_sample = transform(audio_samples_new[i]) #running with cpu?
    transformed_mfcc = transform_mfcc(audio_samples_new[i])
    
    # CoAtNet part
    audioDatasetFin.append((transformed_sample, labels[i]))
    audioDatasetFinMasking.append((masking(transformed_sample), labels[i]))
    
    # masking part
    audioDatasetMfcc.append((transformed_sample, transformed_mfcc, labels[i]))
    audioDatasetMfccMasking.append((masking(transformed_sample), transformed_mfcc, labels[i]))


Shape of input tensor before transformation: torch.Size([86159])
Shape of input tensor before transformation: torch.Size([57104])
Shape of input tensor before transformation: torch.Size([53832])
Shape of input tensor before transformation: torch.Size([57386])
Shape of input tensor before transformation: torch.Size([57494])
Shape of input tensor before transformation: torch.Size([56210])
Shape of input tensor before transformation: torch.Size([61638])
Shape of input tensor before transformation: torch.Size([57220])
Shape of input tensor before transformation: torch.Size([62462])
Shape of input tensor before transformation: torch.Size([65904])
Shape of input tensor before transformation: torch.Size([54292])
Shape of input tensor before transformation: torch.Size([57516])
Shape of input tensor before transformation: torch.Size([65562])
Shape of input tensor before transformation: torch.Size([62578])
Shape of input tensor before transformation: torch.Size([61832])
Shape of input tensor bef

In [61]:
# check for lengths of datasets
len(audioDatasetMfcc), len(audioDatasetMfcc + audioDatasetMfccMasking), len(audioDatasetFin), len(audioDatasetFin + audioDatasetFinMasking)

(1802, 3604, 1802, 3604)

## Models

### Original CoAtNet

In [62]:
num_blocks = [2, 2, 3, 5, 2]            # L
channels = [64, 96, 192, 384, 768]      # D

class CoAtNet(nn.Module, BaseEstimator):
    def __init__(self, num_epochs=200, patience=30, keys='1234567890QWERTYUIOPASDFGHJKLZXCVBNM+-'):
        super(CoAtNet, self).__init__()    
        self.keys = keys
        self.model = CoAtNetImp((64, 64), 1, num_blocks, channels, num_classes=len(self.keys))
        self.num_epochs = num_epochs
        self.patience = patience
    
    def forward(self, x):
        return self.model(x)
    
    def fit(self, X, y):
        #TODO: en fromscratch, revisar CoAtNet e implementacion de fit
        # Split the dataset into training and validation sets
        train_indices, val_indices = train_test_split(range(len(X)), test_size=0.01)

        # Convert the indices to tensors
        train_X = torch.stack([X[i] for i in train_indices])
        train_y = torch.tensor([y[i] for i in train_indices], dtype=torch.long)
        val_X = torch.stack([X[i] for i in val_indices])
        val_y = torch.tensor([y[i] for i in val_indices], dtype=torch.long)
    
        # Create tensordataset
        train_dataset = TensorDataset(train_X, train_y)
        val_dataset = TensorDataset(val_X, val_y)
        train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

        # Initialize optimizer and loss function
        self._optimizer = optim.Adam(self.model.parameters(), lr=1e-4)
        criterion = nn.CrossEntropyLoss()

        # Move model to the appropriate device
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = self.model.to(device)

        best_val_acc, epochs_no_imp = 0, 0
        train_accuracies, val_accuracies = [], []

        for epoch in range(self.num_epochs):
            model.train()
            epoch_train_loss = 0.0
            correct_train = 0
            total_train = 0
            tic = time.perf_counter()

            for images, labels in train_loader:
                images = images.to(device)
                labels = labels.to(device)

                self._optimizer.zero_grad()

                # Forward pass
                outputs = model(images)
                loss = criterion(outputs, labels)
                epoch_train_loss += loss.item() * images.size(0)

                _, predicted_train = torch.max(outputs.data, 1)
                total_train += labels.size(0)
                correct_train += (predicted_train == labels).sum().item()

                # Backward pass
                loss.backward()
                self._optimizer.step()

            toc = time.perf_counter()
            time_taken = toc - tic

            epoch_train_loss /= len(train_loader.dataset)
            train_accuracy = correct_train / total_train
            train_accuracies.append(train_accuracy)

            # Evaluation of the model
            model.eval()
            total, correct = 0, 0

            for images, labels in val_loader:
                images = images.to(device)
                labels = labels.to(device)

                outputs = model(images)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

            val_accuracy = correct / total
            val_accuracies.append(val_accuracy)
            if (epoch + 1) % 1 == 0:
                print(f"Epoch [{epoch + 1}/{self.num_epochs}], Train Loss: {epoch_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Val Accuracy: {val_accuracy:.4f}, Iter Time: {time_taken:.2f}s")

            if val_accuracy > best_val_acc:
                best_val_acc = val_accuracy
                epochs_no_imp = 0
                best_model_state = model.state_dict()  # Save the best model
            else:
                epochs_no_imp += 1
            if epochs_no_imp >= self.patience:
                print(f'Early stopping after {epoch+1} epochs')
                model.load_state_dict(best_model_state)  # Load the best model
                break
        return self
    
    def predict(self, X):
        # Move X to the appropriate device
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Ensure X is a list of tensors or a single tensor
        if isinstance(X, list):
            X = torch.stack([torch.tensor(x) for x in X]).to(device)
        else:
            X = torch.tensor(X).to(device)

        # Move model to the appropriate device
        model = self.model.to(device)
        model.eval()

        with torch.no_grad():
            outputs = model(X)
            _, predicted = torch.max(outputs.data, 1)

        return predicted.tolist()

### Updated CoAtNet

In [63]:
num_blocks = [2, 2, 3, 5, 2]            # L
channels = [64, 96, 192, 384, 768]      # D

class CoAtNetNFT(nn.Module, BaseEstimator):
    def __init__(self, num_epochs=100, patience=20, keys='1234567890QWERTYUIOPASDFGHJKLZXCVBNM+-'):
        super(CoAtNetNFT, self).__init__()    
        self.keys = keys
        self.model = CoAtNetImp((64, 64), 1, num_blocks, channels, num_classes=len(self.keys))
        self.num_epochs = num_epochs
        self.patience = patience
    
    def forward(self, x):
        return self.model(x)
    
    def fit(self, dataset, lr=1e-4):
        # concatenate so it has the same shape as before
        self.dataset = dataset
        # dataset = np.concatenate((X, y), axis=1)
        train_set, val_set = train_test_split(self.dataset, test_size=0.05, random_state=42)
        train_loader, val_loader = DataLoader(train_set, batch_size=16), DataLoader(val_set, batch_size=16)

        # Initialize model, optimizer, and loss function
        self._optimizer = optim.Adam(self.model.parameters(), lr=lr)
        # scheduler = OneCycleLR(self._optimizer, max_lr=max_lr, steps_per_epoch=225, epochs=self.num_epochs, div_factor=50)
        # same training method but now inside the class
        model = self.model.to(device)
        
        # loss criterion
        criterion = nn.CrossEntropyLoss()
        
        best_val_acc, epochs_no_imp = 0, 0
        train_accuracies, val_accuracies = [], []
        
        for epoch in range(self.num_epochs):
            model.train()
            epoch_train_loss = 0.0
            correct_train = 0
            total_train = 0
            tic = time.perf_counter()
            
            for images, labels in train_loader:
                images = images.to(device)
                labels = labels.to(device)
                
                self._optimizer.zero_grad()
        
                # converting labels to Long to avoid error "not implemented for Int"
                labels = labels.long()
                
                # Forward pass
                outputs = model(images)
                loss = criterion(outputs, labels)
                epoch_train_loss += loss.item() * images.size(0)
        
                _, predicted_train = torch.max(outputs.data, 1)
                total_train += labels.size(0)
                correct_train += (predicted_train == labels).sum().item()
                
                # Backward pass
                loss.backward()
                self._optimizer.step()
                # scheduler.step()
            
            toc = time.perf_counter()
            time_taken = toc - tic
            
            epoch_train_loss /= len(train_loader.dataset)
            train_accuracy = correct_train / total_train
            train_accuracies.append(train_accuracy)
            
            # Evaluation of the model
            model.eval()
            total, correct = 0, 0
            
            for images, labels in val_loader:
                images = images.to(device)
                labels = labels.to(device)
        
                outputs = model(images)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
            val_accuracy = correct / total
            val_accuracies.append(val_accuracy)
            if (epoch + 1) % 1 == 0 or epoch == 0:
                print(f"Epoch [{epoch + 1}/{self.num_epochs}], Train Loss: {epoch_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Val Accuracy: {val_accuracy:.4f}, Iter Time: {time_taken:.2f}s")
            if val_accuracy > best_val_acc:
                best_val_acc = val_accuracy
                epochs_no_imp = 0
                best_model_state = model.state_dict()  # Save the best model
            else:
                epochs_no_imp += 1
            if epochs_no_imp >= self.patience:
                print(f'Early stopping after {epoch+1} epochs')
                model.load_state_dict(best_model_state)  # Load the best model
                break
        torch.save(self.model.state_dict(), f'models/{DATASET_PICK[dataset_choice]}.pth')
        #     # Plot accuracy curves
        # plt.plot(range(1, self.num_epochs+1), train_accuracies, label='Training Accuracy')
        # plt.plot(range(1, self.num_epochs+1), val_accuracies, label='Validation Accuracy')
        # plt.xlabel('Epoch')
        # plt.ylabel('Accuracy')
        # plt.title('Accuracy vs Epoch')
        # plt.legend()
        # plt.show()
        return self
    
    def predict(self, X, load=False):
    # Load the best model
        if load:
            self.model.load_state_dict(torch.load(f'models/{DATASET_PICK[dataset_choice]}.pth'))
        
        # Ensure X is a list of tensors or a single tensor
        if isinstance(X, list):
            X = torch.stack([torch.tensor(x) for x in X]).to(device)
        else:
            X = torch.tensor(X).to(device)

        # Model specifying
        model = self.model.to(device)
        model.eval()

        with torch.no_grad():
            outputs = model(X)
            _, predicted = torch.max(outputs.data, 1)
        
        # Debugging: Print the outputs and predictions
        # Set print options to print the full tensor
        torch.set_printoptions(profile="default")
        
        return predicted.tolist()

## Training model

In [64]:
dataset = audio_samples_new
train_set, test_set, labels_train_set, labels_test_set = train_test_split(dataset, labels, test_size=0.001, random_state=42)
final_train_set = []


for i in range(len(train_set)):
    transformed_sample = transform(train_set[i])
    final_train_set.append((transformed_sample, labels_train_set[i]))
    final_train_set.append((masking(transformed_sample), labels_train_set[i]))
X_train = [t[0] for t in final_train_set]
y_train = [t[1] for t in final_train_set]
print(len(final_train_set))

dataset_training=[(X_train[i], y_train[i]) for i in range(len(X_train))]


3600


In [65]:
model = CoAtNetNFT(keys=keys, num_epochs=1000, patience=100)

In [67]:
model.fit(dataset_training, lr=5e-5) #lr=5e-5

NaN or Inf detected after sequential 0
NaN or Inf detected after sequential 1
NaN or Inf detected after sequential 2
NaN or Inf detected after sequential 3
NaN or Inf detected after sequential 4
NaN or Inf detected after pool
NaN or Inf detected after fc
NaN or Inf detected after sequential 0
NaN or Inf detected after sequential 1
NaN or Inf detected after sequential 2
NaN or Inf detected after sequential 3
NaN or Inf detected after sequential 4
NaN or Inf detected after pool
NaN or Inf detected after fc
NaN or Inf detected after sequential 0
NaN or Inf detected after sequential 1
NaN or Inf detected after sequential 2
NaN or Inf detected after sequential 3
NaN or Inf detected after sequential 4
NaN or Inf detected after pool
NaN or Inf detected after fc
NaN or Inf detected after sequential 0
NaN or Inf detected after sequential 1
NaN or Inf detected after sequential 2
NaN or Inf detected after sequential 3
NaN or Inf detected after sequential 4
NaN or Inf detected after pool
NaN or In

## Prediction

For the prediction, a custom word is tested by picking audio samples from the overall dataset.

In [68]:
# Custom prediction

word = "Otorhinolaryngologyst" # for dataset 3 avoid using space and enter keys
word = word.upper()
curr_word, curr_labels = [], []

# Space -> -
# Enter -> +

print(f'keys: {keys}')
print(f'keys length: {len(keys)}')
#find the index first
def find_index(keys, key_char):
  try:
    return keys.index(key_char)
  except ValueError:
    return -1

#find the first matching index in the dataset
def find_first_match(dataset, labels, key_char):
  index = find_index(keys, key_char)
  if index == -1:
    return -1, -1
  for i in range(len(dataset)):
    if labels[i] == index:
      return dataset[i], labels[i]
  return -1, -1

for letter in word:
  #if letter is space or enter, replace with + or -
  if letter == ' ':
    letter = '-'
  elif letter == '\n':
    letter = '+' 
  character, letter=find_first_match(dataset, labels, letter)
  curr_word.append(character)
  curr_labels.append(letter)
        
print(f'curr_labels: {curr_labels}')
print(f'curr_word: {curr_word}')

keys: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'Q', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Z', 'X', 'C', 'V', 'B', 'N', 'M']
keys length: 36
curr_labels: [18, 14, 18, 13, 25, 17, 34, 18, 28, 20, 13, 15, 34, 24, 18, 28, 18, 24, 15, 21, 14]
curr_word: [tensor([    6348.,   115714.,  -104208.,  ..., -1255485.,  -377869.,
        -1269938.]), tensor([ -51572.,  -57414.,  107360.,  ..., -154955.,  120178., -104383.]), tensor([    6348.,   115714.,  -104208.,  ..., -1255485.,  -377869.,
        -1269938.]), tensor([ 115371.,   38921., -134421.,  ..., 3641711., 3216495., 3644380.]), tensor([   38769.,    52627.,   325510.,  ..., -2488643.,  -454440.,
        -2476882.]), tensor([ 143622., -159601.,  572050.,  ..., 2366193., 2392581., 2402114.]), tensor([  64924., -135220., -203051.,  ...,  266216., -295007.,  183578.]), tensor([    6348.,   115714.,  -104208.,  ..., -1255485.,  -377869.,
        -1269938.]), tensor([  86788., -1615

In [69]:
test_set_word = []
for i in range(len(curr_word)):
    transformed_word = transform(curr_word[i])
    test_set_word.append((transformed_word, curr_labels[i]))
    #test_set_word.append((masking(transformed_word), curr_labels[i]))
curr_word_processed = [t[0] for t in test_set_word]
curr_labels_processed = [t[1] for t in test_set_word]
print(len(test_set_word))

21


In [70]:
def getIndCurrKeys(ind: int):
    if ind < 0 or ind >= len(keys):
        raise IndexError(f"Index {ind} is out of range for keys list with length {len(keys)}")
    return keys[ind]

curr_word_processed = tuple(curr_word_processed)
curr_word_processed=torch.stack(curr_word_processed)
curr_labels_processed = torch.tensor(curr_labels_processed, dtype=torch.long)

# Assuming model.predict returns a list of indices
prediction = model.predict(curr_word_processed,load=True)

prediction_list = list(map(getIndCurrKeys, prediction)) 
og_labels_list=list(map(getIndCurrKeys, curr_labels_processed))
print(f'prediction: {prediction_list}')

print(f'real labels: {og_labels_list}')

NaN or Inf detected after sequential 0
NaN or Inf detected after sequential 1
NaN or Inf detected after sequential 2
NaN or Inf detected after sequential 3
NaN or Inf detected after sequential 4
NaN or Inf detected after pool
NaN or Inf detected after fc
prediction: ['1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1']
real labels: ['O', 'T', 'O', 'R', 'H', 'I', 'N', 'O', 'L', 'A', 'R', 'Y', 'N', 'G', 'O', 'L', 'O', 'G', 'Y', 'S', 'T']


  X = torch.tensor(X).to(device)


In [71]:
# Metrics calculation
accuracy = accuracy_score(og_labels_list, prediction_list)
precision = precision_score(og_labels_list, prediction_list, average='macro')
recall = recall_score(og_labels_list, prediction_list, average='macro')
f1 = sklearn.metrics.f1_score(og_labels_list, prediction_list, average='macro')

# Save in csv file
nft.save_csv("CoAtNetNFT", 1000, "With patience=100, using time shift", accuracy, precision, recall, f1)

# Print results
print("Final Results!")
print(f"Model: CoAtNetNFT")
print("With patience=100, using time shift")
print(f"Epochs: {1000}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Final Results!
Model: CoAtNetNFT
With patience=100, using time shift
Epochs: 1000
Accuracy: 0.0
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# revert use of - and +

prediction_list = list(map(lambda x: ' ' if x == '-' else x, prediction_list))
prediction_list = list(map(lambda x: '\n' if x == '+' else x, prediction_list))
sentence=''.join(prediction_list)

# LLAMA 3.1 prediction
try:
    client = Client(base_url='http://localhost:1234/v1', api_key='llm-studio')

    response = client.chat.completions.create(model='lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF', messages=[
      {
        'role': 'user',
        'content': sentence
      },
    ])
    print("Question:", sentence)
    print("Response:", response.choices[0].message.content)
except Exception as e:
    print(f"An error occurred: {e}")

An error occurred: Connection error.
