# Create Model

## Model setup

In [1]:
import config

data_path = config.SOUNDS_DATA_DIR_PATH
splits_path=config.DATA_SPLIT_SAVE_DIR_PATH
model_dir_path=config.MODEL_DIR_PATH

n_fft = 2048 # Ile próbek bierze do okna na ktorym dokonuje transformaty
hop_lenght = 1024 # O ile próbek przesuwa okno po każdej transformacie (Od tego zalezy wielkosc dataframe'a)
sr = 22050 # Liczba próbek na sekunde (Od tego zalezy wielkosc dataframe'a)

C:\Users\Bcom_\Documents\Projekty\Rozpoznawanie_dzwiekow_gitarowych\data\prepared_data


## Model creation process

#### Create splits for models

In [2]:
from src.myscripts import func

func.get_feature_combination_dataframe(['mel'], ['sound'], n_fft, hop_lenght, sr, data_path, True, 'sound_predict_df')
func.get_feature_combination_dataframe(['mel'], ['string'], n_fft, hop_lenght, sr, data_path, True, 'string_predict_df')
func.get_feature_combination_dataframe(['chroma', 'contrast'], ['sound_type'], n_fft, hop_lenght, sr, data_path, True, 'sound_type_predict_df')

#### Training sound recognition model

In [3]:
from src.myscripts.train import ModelTrainer
from src.myscripts.model import Conv1DClassifier
from torch.utils.data import TensorDataset
import torch
import os
import numpy as np
from sklearn.preprocessing import LabelEncoder
from src.myscripts import prepare_data
import pandas as pd

data = pd.read_csv(f"{config.DATAFRAMES_DIR_PATH}/sound_predict_df.csv")
prepare_data.prepare_data_for_model(data,-1, LabelEncoder, [0.7, 0.2, 0.1])

x_train= np.load(os.path.join(splits_path, "x_train.npy"), allow_pickle=True)
y_train= np.load(os.path.join(splits_path, "y_train.npy"), allow_pickle=True)
x_val= np.load(os.path.join(splits_path, "x_val.npy"), allow_pickle=True)
y_val= np.load(os.path.join(splits_path, "y_val.npy"), allow_pickle=True)
x_train = x_train.astype(np.float32)
x_val = x_val.astype(np.float32)
y_train = y_train.astype("long")
y_val = y_val.astype("long")

x_train_tensor = torch.tensor(x_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
x_val_tensor = torch.tensor(x_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
val_dataset = TensorDataset(x_val_tensor, y_val_tensor)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_shape = x_train.shape[1:]
model = Conv1DClassifier(num_classes=43, input_shape=input_shape)

model_trainer = ModelTrainer(model, device)
model_trainer.train_model(train_dataset, val_dataset)
# Ładujemy najlepszy model
model = model_trainer.get_trained_model()

torch.save(model.state_dict(),f"{model_dir_path}/sound_recognition_model.pth")

Prepared data saved in: C:\Users\Bcom_\Documents\Projekty\Rozpoznawanie_dzwiekow_gitarowych\data\prepared_data.
Data shape confirmation:
x_train:(1015, 5632, 1)
y_train:(1015,)
x_val:(290, 5632, 1)
y_val:(290,)
x_test:(146, 5632, 1)
y_test:(146,)

Number of classes: 43
Epoch 1/10, Loss: 4.0940, Accuracy: 0.1044
Validation Loss: 2.8340, Validation Accuracy: 0.4517
Epoch 2/10, Loss: 2.6495, Accuracy: 0.3576
Validation Loss: 1.4175, Validation Accuracy: 0.8138
Epoch 3/10, Loss: 1.8167, Accuracy: 0.5547
Validation Loss: 0.7731, Validation Accuracy: 0.8724
Epoch 4/10, Loss: 1.1693, Accuracy: 0.7025
Validation Loss: 0.4122, Validation Accuracy: 0.9414
Epoch 5/10, Loss: 0.8823, Accuracy: 0.7507
Validation Loss: 0.2547, Validation Accuracy: 0.9586
Epoch 6/10, Loss: 0.7062, Accuracy: 0.8227
Validation Loss: 0.3004, Validation Accuracy: 0.9448
Validation loss did not improve for 1 epochs.
Epoch 7/10, Loss: 0.5701, Accuracy: 0.8473
Validation Loss: 0.1357, Validation Accuracy: 0.9793
Epoch 8/10, 

#### Training string recognition model

In [4]:
data = pd.read_csv(f"{config.DATAFRAMES_DIR_PATH}/string_predict_df.csv")
prepare_data.prepare_data_for_model(data,-1, LabelEncoder, [0.7, 0.2, 0.1])

x_train= np.load(os.path.join(splits_path, "x_train.npy"), allow_pickle=True)
y_train= np.load(os.path.join(splits_path, "y_train.npy"), allow_pickle=True)
x_val= np.load(os.path.join(splits_path, "x_val.npy"), allow_pickle=True)
y_val= np.load(os.path.join(splits_path, "y_val.npy"), allow_pickle=True)
x_train = x_train.astype(np.float32)
x_val = x_val.astype(np.float32)
y_train = y_train.astype("long")
y_val = y_val.astype("long")

x_train_tensor = torch.tensor(x_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
x_val_tensor = torch.tensor(x_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
val_dataset = TensorDataset(x_val_tensor, y_val_tensor)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_shape = x_train.shape[1:]
model = Conv1DClassifier(num_classes=43, input_shape=input_shape)

model_trainer = ModelTrainer(model, device)
model_trainer.train_model(train_dataset, val_dataset)
# Ładujemy najlepszy model
model = model_trainer.get_trained_model()

torch.save(model.state_dict(),f"{model_dir_path}/string_recognition_model.pth")

Prepared data saved in: C:\Users\Bcom_\Documents\Projekty\Rozpoznawanie_dzwiekow_gitarowych\data\prepared_data.
Data shape confirmation:
x_train:(1015, 5632, 1)
y_train:(1015,)
x_val:(290, 5632, 1)
y_val:(290,)
x_test:(146, 5632, 1)
y_test:(146,)

Number of classes: 2
Epoch 1/10, Loss: 1.4103, Accuracy: 0.5350
Validation Loss: 0.5042, Validation Accuracy: 0.8000
Epoch 2/10, Loss: 0.5110, Accuracy: 0.7517
Validation Loss: 0.2646, Validation Accuracy: 0.8621
Epoch 3/10, Loss: 0.3017, Accuracy: 0.8788
Validation Loss: 0.2358, Validation Accuracy: 0.9241
Epoch 4/10, Loss: 0.2117, Accuracy: 0.9261
Validation Loss: 0.1288, Validation Accuracy: 0.9379
Epoch 5/10, Loss: 0.1270, Accuracy: 0.9586
Validation Loss: 0.1146, Validation Accuracy: 0.9448
Epoch 6/10, Loss: 0.1208, Accuracy: 0.9655
Validation Loss: 0.0725, Validation Accuracy: 0.9621
Epoch 7/10, Loss: 0.0760, Accuracy: 0.9675
Validation Loss: 0.1025, Validation Accuracy: 0.9483
Validation loss did not improve for 1 epochs.
Epoch 8/10, L

#### Training sound_type recognition model

In [5]:
data = pd.read_csv(f"{config.DATAFRAMES_DIR_PATH}/sound_type_predict_df.csv")
prepare_data.prepare_data_for_model(data,-1, LabelEncoder, [0.7, 0.2, 0.1])

x_train= np.load(os.path.join(splits_path, "x_train.npy"), allow_pickle=True)
y_train= np.load(os.path.join(splits_path, "y_train.npy"), allow_pickle=True)
x_val= np.load(os.path.join(splits_path, "x_val.npy"), allow_pickle=True)
y_val= np.load(os.path.join(splits_path, "y_val.npy"), allow_pickle=True)
x_train = x_train.astype(np.float32)
x_val = x_val.astype(np.float32)
y_train = y_train.astype("long")
y_val = y_val.astype("long")

x_train_tensor = torch.tensor(x_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
x_val_tensor = torch.tensor(x_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
val_dataset = TensorDataset(x_val_tensor, y_val_tensor)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_shape = x_train.shape[1:]
model = Conv1DClassifier(num_classes=43, input_shape=input_shape)

model_trainer = ModelTrainer(model, device)
model_trainer.train_model(train_dataset, val_dataset)
# Ładujemy najlepszy model
model = model_trainer.get_trained_model()

torch.save(model.state_dict(),f"{model_dir_path}/sound_type_recognition_model.pth")

Prepared data saved in: C:\Users\Bcom_\Documents\Projekty\Rozpoznawanie_dzwiekow_gitarowych\data\prepared_data.
Data shape confirmation:
x_train:(1015, 836, 1)
y_train:(1015,)
x_val:(290, 836, 1)
y_val:(290,)
x_test:(146, 836, 1)
y_test:(146,)

Number of classes: 3
Epoch 1/10, Loss: 1.8366, Accuracy: 0.4227
Validation Loss: 0.9560, Validation Accuracy: 0.5000
Epoch 2/10, Loss: 1.0808, Accuracy: 0.5517
Validation Loss: 0.6664, Validation Accuracy: 0.7690
Epoch 3/10, Loss: 0.7149, Accuracy: 0.7005
Validation Loss: 0.5252, Validation Accuracy: 0.8000
Epoch 4/10, Loss: 0.5791, Accuracy: 0.7714
Validation Loss: 0.5060, Validation Accuracy: 0.8034
Epoch 5/10, Loss: 0.5151, Accuracy: 0.7911
Validation Loss: 0.4399, Validation Accuracy: 0.8138
Epoch 6/10, Loss: 0.5182, Accuracy: 0.7951
Validation Loss: 0.4314, Validation Accuracy: 0.8138
Epoch 7/10, Loss: 0.4814, Accuracy: 0.8108
Validation Loss: 0.4268, Validation Accuracy: 0.8241
Epoch 8/10, Loss: 0.4597, Accuracy: 0.8197
Validation Loss: 0.

TU ZAPISUJE STATE 3 MODELI MUSZE WYKOMBINOWAC CZY DA SIE ZROBIC SKRYPT PYTHONA KTORY DLA PODANEJ PROBKI DZWIEKOWEJ BEDZIE PRZEWIDYWAL LABEL DLA KAZDEGO Z TYCH MODELI A NASTEPNIE LACZYL TE PRZEWIDZIANE LABELE I PODAWAL GO JAKO WYNIK.
CHYBA BEDE MUSIAL WZIAC POD UWAGE TO ZE PRZEWIDYWANY LABEL JEST PO LABELENCODER I MUSZE POMYSLEC JAK PRZYWROCIC ZEBY PRZEWIDZIANA ETYKIETE ZAMIENIAL NA FAKTYCZNY ZROZUMIALY LABEL