# Create Model

## Model setup

In [1]:
import config

data_path = config.SOUNDS_DATA_DIR_PATH
splits_path=config.DATA_SPLIT_SAVE_DIR_PATH
model_dir_path=config.MODEL_DIR_PATH

n_fft = 2048 # Ile próbek bierze do okna na ktorym dokonuje transformaty
hop_lenght = 1024 # O ile próbek przesuwa okno po każdej transformacie (Od tego zalezy wielkosc dataframe'a)
sr = 22050 # Liczba próbek na sekunde (Od tego zalezy wielkosc dataframe'a)

C:\Users\Bcom_\Documents\Projekty\Rozpoznawanie_dzwiekow_gitarowych\data\prepared_data


## Model creation process

#### Create splits for models

In [2]:
from src.myscripts import func

func.get_feature_combination_dataframe(['mel'], ['sound'], n_fft, hop_lenght, sr, data_path, True, 'sound_predict_df')
func.get_feature_combination_dataframe(['mel'], ['string'], n_fft, hop_lenght, sr, data_path, True, 'string_predict_df')
func.get_feature_combination_dataframe(['chroma', 'contrast'], ['sound_type'], n_fft, hop_lenght, sr, data_path, True, 'sound_type_predict_df')

#### Training sound recognition model

In [3]:
from src.myscripts.train import ModelTrainer
from src.myscripts.model import Conv1DClassifier
from torch.utils.data import TensorDataset
import torch
import os
import numpy as np
from sklearn.preprocessing import LabelEncoder
from src.myscripts import prepare_data
import pandas as pd

data = pd.read_csv(f"{config.DATAFRAMES_DIR_PATH}/sound_predict_df.csv")
prepare_data.prepare_data_for_model(data,-1, LabelEncoder, [0.7, 0.2, 0.1], encoder_file_name="sound_encoder.joblib")

x_train= np.load(os.path.join(splits_path, "x_train.npy"), allow_pickle=True)
y_train= np.load(os.path.join(splits_path, "y_train.npy"), allow_pickle=True)
x_val= np.load(os.path.join(splits_path, "x_val.npy"), allow_pickle=True)
y_val= np.load(os.path.join(splits_path, "y_val.npy"), allow_pickle=True)
x_train = x_train.astype(np.float32)
x_val = x_val.astype(np.float32)
y_train = y_train.astype("long")
y_val = y_val.astype("long")

x_train_tensor = torch.tensor(x_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
x_val_tensor = torch.tensor(x_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
val_dataset = TensorDataset(x_val_tensor, y_val_tensor)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_shape = x_train.shape[1:]
model = Conv1DClassifier(num_classes=43, input_shape=input_shape)

model_trainer = ModelTrainer(model, device)
model_trainer.train_model(train_dataset, val_dataset)
# Ładujemy najlepszy model
model = model_trainer.get_trained_model()

torch.save(model.state_dict(),f"{model_dir_path}/sound_recognition_model.pth")

Prepared data saved in: C:\Users\Bcom_\Documents\Projekty\Rozpoznawanie_dzwiekow_gitarowych\data\prepared_data.
Data shape confirmation:
x_train:(1015, 5632, 1)
y_train:(1015,)
x_val:(290, 5632, 1)
y_val:(290,)
x_test:(146, 5632, 1)
y_test:(146,)

Number of classes: 43
Epoch 1/10, Loss: 4.0975, Accuracy: 0.1123
Validation Loss: 2.6459, Validation Accuracy: 0.4862
Epoch 2/10, Loss: 2.5769, Accuracy: 0.3645
Validation Loss: 1.2723, Validation Accuracy: 0.7862
Epoch 3/10, Loss: 1.7485, Accuracy: 0.5626
Validation Loss: 0.7022, Validation Accuracy: 0.8621
Epoch 4/10, Loss: 1.1805, Accuracy: 0.6985
Validation Loss: 0.3652, Validation Accuracy: 0.9276
Epoch 5/10, Loss: 0.9490, Accuracy: 0.7517
Validation Loss: 0.3492, Validation Accuracy: 0.9586
Epoch 6/10, Loss: 0.7793, Accuracy: 0.8177
Validation Loss: 0.2395, Validation Accuracy: 0.9655
Epoch 7/10, Loss: 0.6030, Accuracy: 0.8483
Validation Loss: 0.1324, Validation Accuracy: 0.9793
Epoch 8/10, Loss: 0.4455, Accuracy: 0.8887
Validation Loss

#### Training string recognition model

In [4]:
data = pd.read_csv(f"{config.DATAFRAMES_DIR_PATH}/string_predict_df.csv")
prepare_data.prepare_data_for_model(data,-1, LabelEncoder, [0.7, 0.2, 0.1], encoder_file_name="string_encoder.joblib")

x_train= np.load(os.path.join(splits_path, "x_train.npy"), allow_pickle=True)
y_train= np.load(os.path.join(splits_path, "y_train.npy"), allow_pickle=True)
x_val= np.load(os.path.join(splits_path, "x_val.npy"), allow_pickle=True)
y_val= np.load(os.path.join(splits_path, "y_val.npy"), allow_pickle=True)
x_train = x_train.astype(np.float32)
x_val = x_val.astype(np.float32)
y_train = y_train.astype("long")
y_val = y_val.astype("long")

x_train_tensor = torch.tensor(x_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
x_val_tensor = torch.tensor(x_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
val_dataset = TensorDataset(x_val_tensor, y_val_tensor)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_shape = x_train.shape[1:]
model = Conv1DClassifier(num_classes=43, input_shape=input_shape)

model_trainer = ModelTrainer(model, device)
model_trainer.train_model(train_dataset, val_dataset, epochs=20)
# Ładujemy najlepszy model
model = model_trainer.get_trained_model()

torch.save(model.state_dict(),f"{model_dir_path}/string_recognition_model.pth")

Prepared data saved in: C:\Users\Bcom_\Documents\Projekty\Rozpoznawanie_dzwiekow_gitarowych\data\prepared_data.
Data shape confirmation:
x_train:(1015, 5632, 1)
y_train:(1015,)
x_val:(290, 5632, 1)
y_val:(290,)
x_test:(146, 5632, 1)
y_test:(146,)

Number of classes: 2
True
Epoch 1/20, Loss: 1.4205, Accuracy: 0.5271
Validation Loss: 0.4511, Validation Accuracy: 0.8345
Epoch 2/20, Loss: 0.5106, Accuracy: 0.7931
Validation Loss: 0.2530, Validation Accuracy: 0.8966
Epoch 3/20, Loss: 0.2993, Accuracy: 0.8877
Validation Loss: 0.1493, Validation Accuracy: 0.9310
Epoch 4/20, Loss: 0.1656, Accuracy: 0.9409
Validation Loss: 0.0828, Validation Accuracy: 0.9586
Epoch 5/20, Loss: 0.1276, Accuracy: 0.9547
Validation Loss: 0.0556, Validation Accuracy: 0.9862
Epoch 6/20, Loss: 0.0929, Accuracy: 0.9655
Validation Loss: 0.0596, Validation Accuracy: 0.9724
Validation loss did not improve for 1 epochs.
Epoch 7/20, Loss: 0.0585, Accuracy: 0.9764
Validation Loss: 0.0375, Validation Accuracy: 0.9862
Epoch 8/

#### Training sound_type recognition model

In [5]:
data = pd.read_csv(f"{config.DATAFRAMES_DIR_PATH}/sound_type_predict_df.csv")
prepare_data.prepare_data_for_model(data,-1, LabelEncoder, [0.7, 0.2, 0.1], encoder_file_name="sound_type_encoder.joblib")

x_train= np.load(os.path.join(splits_path, "x_train.npy"), allow_pickle=True)
y_train= np.load(os.path.join(splits_path, "y_train.npy"), allow_pickle=True)
x_val= np.load(os.path.join(splits_path, "x_val.npy"), allow_pickle=True)
y_val= np.load(os.path.join(splits_path, "y_val.npy"), allow_pickle=True)
x_train = x_train.astype(np.float32)
x_val = x_val.astype(np.float32)
y_train = y_train.astype("long")
y_val = y_val.astype("long")

x_train_tensor = torch.tensor(x_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
x_val_tensor = torch.tensor(x_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
val_dataset = TensorDataset(x_val_tensor, y_val_tensor)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_shape = x_train.shape[1:]
model = Conv1DClassifier(num_classes=43, input_shape=input_shape)

model_trainer = ModelTrainer(model, device)
model_trainer.train_model(train_dataset, val_dataset, epochs=20)
# Ładujemy najlepszy model
model = model_trainer.get_trained_model()

torch.save(model.state_dict(),f"{model_dir_path}/sound_type_recognition_model.pth")

Prepared data saved in: C:\Users\Bcom_\Documents\Projekty\Rozpoznawanie_dzwiekow_gitarowych\data\prepared_data.
Data shape confirmation:
x_train:(1015, 836, 1)
y_train:(1015,)
x_val:(290, 836, 1)
y_val:(290,)
x_test:(146, 836, 1)
y_test:(146,)

Number of classes: 3
Epoch 1/20, Loss: 1.9498, Accuracy: 0.3901
Validation Loss: 0.9899, Validation Accuracy: 0.5000
Epoch 2/20, Loss: 1.0961, Accuracy: 0.5429
Validation Loss: 0.6635, Validation Accuracy: 0.8000
Epoch 3/20, Loss: 0.6932, Accuracy: 0.7241
Validation Loss: 0.4832, Validation Accuracy: 0.8000
Epoch 4/20, Loss: 0.5615, Accuracy: 0.7695
Validation Loss: 0.4325, Validation Accuracy: 0.8138
Epoch 5/20, Loss: 0.5240, Accuracy: 0.7833
Validation Loss: 0.4081, Validation Accuracy: 0.8241
Epoch 6/20, Loss: 0.4785, Accuracy: 0.7931
Validation Loss: 0.3901, Validation Accuracy: 0.8310
Epoch 7/20, Loss: 0.4829, Accuracy: 0.8010
Validation Loss: 0.3931, Validation Accuracy: 0.8310
Validation loss did not improve for 1 epochs.
Epoch 8/20, Loss