# Create Model

In [1]:
from src.myscripts.train import ModelTrainer
from src.myscripts.model import Conv1DClassifier
from torch.utils.data import TensorDataset
import torch
import os
import numpy as np
from sklearn.preprocessing import LabelEncoder
from src.myscripts import prepare_data
import pandas as pd
import config

C:\Users\Bcom_\Documents\Projekty\Rozpoznawanie_dzwiekow_gitarowych\data\prepared_data


## Model setup

In [2]:

# Directories taken from the project config module; later cells pass `data_path` to the
# feature-extraction helper, load the split .npy files from `splits_path`, and save
# trained weights under `model_dir_path`.
data_path = config.SOUNDS_DATA_DIR_PATH
splits_path = config.DATA_SPLIT_SAVE_DIR_PATH
model_dir_path = config.MODEL_DIR_PATH

# Number of samples in each window the transform is computed on.
n_fft = 2048
# Number of samples the window advances after each transform (affects the dataframe size).
# The misspelled name is kept because later cells reference `hop_lenght`.
hop_lenght = 1024
# Number of samples per second (affects the dataframe size).
sr = 22050

## Model creation process

#### Create feature dataframes for the models

In [3]:
from src.myscripts import func

# One entry per model dataset. NOTE(review): presumably (feature names, target columns,
# output dataframe name) - confirm against func.get_feature_combination_dataframe.
dataframe_specs = [
    (['mel'], ['sound'], 'sound_predict_df'),
    (['mel'], ['string'], 'string_predict_df'),
    (['chroma', 'contrast'], ['sound_type'], 'sound_type_predict_df'),
]

# Same positional arguments, in the same order, as the three explicit calls this replaces.
for feature_names, target_names, dataframe_name in dataframe_specs:
    func.get_feature_combination_dataframe(
        feature_names, target_names, n_fft, hop_lenght, sr, data_path, True, dataframe_name
    )

Po splaszczeniu mel: (1451, 5632)
Po splaszczeniu mel: (1451, 5632)
Po splaszczeniu mel: (1451, 5632)


#### Training sound recognition model

In [19]:

# Train the sound classifier: prepare the data, load the train/val splits, train a
# Conv1DClassifier and save its weights.

# Size of the classifier output layer; every label must lie in [0, num_classes).
num_classes = 43

data = pd.read_csv(os.path.join(config.DATAFRAMES_DIR_PATH, "sound_predict_df.csv"))
# NOTE(review): presumably this encodes the label column (index -1) and writes the
# x_*/y_* split files loaded below - confirm in prepare_data.
prepare_data.prepare_data_for_model(data, -1, LabelEncoder, [0.7, 0.2, 0.1], encoder_file_name="sound_encoder.joblib")

# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; keep it only for
# trusted, locally generated files and drop it if the arrays are stored with a numeric dtype.
# Labels are cast to an explicit 64-bit integer type: the "long" alias has a
# platform-dependent width.
x_train = np.load(os.path.join(splits_path, "x_train.npy"), allow_pickle=True).astype(np.float32)
y_train = np.load(os.path.join(splits_path, "y_train.npy"), allow_pickle=True).astype(np.int64)
x_val = np.load(os.path.join(splits_path, "x_val.npy"), allow_pickle=True).astype(np.float32)
y_val = np.load(os.path.join(splits_path, "y_val.npy"), allow_pickle=True).astype(np.int64)

print(f"x_train shape: {x_train.shape}")

# Fail early with a readable message if a label does not fit the output layer,
# instead of failing later somewhere inside training.
for split_name, labels in (("y_train", y_train), ("y_val", y_val)):
    if labels.size and (labels.min() < 0 or labels.max() >= num_classes):
        raise ValueError(
            f"{split_name} holds labels in [{labels.min()}, {labels.max()}], "
            f"expected values in [0, {num_classes - 1}]"
        )

# The arrays already have their final dtypes, so wrap them without a second copy.
x_train_tensor = torch.from_numpy(x_train)
y_train_tensor = torch.from_numpy(y_train)
x_val_tensor = torch.from_numpy(x_val)
y_val_tensor = torch.from_numpy(y_val)

train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
val_dataset = TensorDataset(x_val_tensor, y_val_tensor)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Per-sample shape (everything after the leading sample axis).
input_shape = x_train.shape[1:]
model = Conv1DClassifier(num_classes=num_classes, input_shape=input_shape)

model_trainer = ModelTrainer(model, device)
model_trainer.train_model(train_dataset, val_dataset, epochs=20)
# Retrieve the best model from the trainer.
model = model_trainer.get_trained_model()

# Make sure the target directory exists so saving cannot fail after a full training run.
os.makedirs(model_dir_path, exist_ok=True)
torch.save(model.state_dict(), os.path.join(model_dir_path, "sound_recognition_model.pth"))

Prepared data saved in: C:\Users\Bcom_\Documents\Projekty\Rozpoznawanie_dzwiekow_gitarowych\data\prepared_data.
Data shape confirmation:
x_train:(1015, 5632, 1)
y_train:(1015,)
x_val:(290, 5632, 1)
y_val:(290,)
x_test:(146, 5632, 1)
y_test:(146,)

Number of classes: 43
x_train shape: (1015, 5632, 1)
Epoch 1/20, Loss: 3.8805, Accuracy: 0.1044
Validation Loss: 2.7634, Validation Accuracy: 0.4310
Epoch 2/20, Loss: 2.7868, Accuracy: 0.3517
Validation Loss: 1.4649, Validation Accuracy: 0.7172
Epoch 3/20, Loss: 1.7426, Accuracy: 0.5596
Validation Loss: 0.7459, Validation Accuracy: 0.8724
Epoch 4/20, Loss: 1.2262, Accuracy: 0.6798
Validation Loss: 0.4514, Validation Accuracy: 0.9207
Epoch 5/20, Loss: 0.9017, Accuracy: 0.7596
Validation Loss: 0.2873, Validation Accuracy: 0.9483
Epoch 6/20, Loss: 0.6772, Accuracy: 0.8108
Validation Loss: 0.2091, Validation Accuracy: 0.9552
Epoch 7/20, Loss: 0.6232, Accuracy: 0.8552
Validation Loss: 0.1320, Validation Accuracy: 0.9759


KeyboardInterrupt: 

#### Training string recognition model

In [12]:


# Train the string classifier: prepare the data, load the train/val splits, train a
# Conv1DClassifier and save its weights.

# Size of the classifier output layer; every label must lie in [0, num_classes).
num_classes = 2

data = pd.read_csv(os.path.join(config.DATAFRAMES_DIR_PATH, "string_predict_df.csv"))
# NOTE(review): presumably this encodes the label column (index -1) and writes the
# x_*/y_* split files loaded below - confirm in prepare_data.
prepare_data.prepare_data_for_model(data, -1, LabelEncoder, [0.7, 0.2, 0.1], encoder_file_name="string_encoder.joblib")

# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; keep it only for
# trusted, locally generated files and drop it if the arrays are stored with a numeric dtype.
# Labels are cast to an explicit 64-bit integer type: the "long" alias has a
# platform-dependent width.
x_train = np.load(os.path.join(splits_path, "x_train.npy"), allow_pickle=True).astype(np.float32)
y_train = np.load(os.path.join(splits_path, "y_train.npy"), allow_pickle=True).astype(np.int64)
x_val = np.load(os.path.join(splits_path, "x_val.npy"), allow_pickle=True).astype(np.float32)
y_val = np.load(os.path.join(splits_path, "y_val.npy"), allow_pickle=True).astype(np.int64)

print(f"x_train shape: {x_train.shape}")

# Fail early with a readable message if a label does not fit the output layer,
# instead of failing later somewhere inside training.
for split_name, labels in (("y_train", y_train), ("y_val", y_val)):
    if labels.size and (labels.min() < 0 or labels.max() >= num_classes):
        raise ValueError(
            f"{split_name} holds labels in [{labels.min()}, {labels.max()}], "
            f"expected values in [0, {num_classes - 1}]"
        )

# The arrays already have their final dtypes, so wrap them without a second copy.
x_train_tensor = torch.from_numpy(x_train)
y_train_tensor = torch.from_numpy(y_train)
x_val_tensor = torch.from_numpy(x_val)
y_val_tensor = torch.from_numpy(y_val)

train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
val_dataset = TensorDataset(x_val_tensor, y_val_tensor)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Per-sample shape (everything after the leading sample axis).
input_shape = x_train.shape[1:]
model = Conv1DClassifier(num_classes=num_classes, input_shape=input_shape)

model_trainer = ModelTrainer(model, device)
model_trainer.train_model(train_dataset, val_dataset, epochs=20)
# Retrieve the best model from the trainer.
model = model_trainer.get_trained_model()

# Make sure the target directory exists so saving cannot fail after a full training run.
os.makedirs(model_dir_path, exist_ok=True)
torch.save(model.state_dict(), os.path.join(model_dir_path, "string_recognition_model.pth"))

Prepared data saved in: C:\Users\Bcom_\Documents\Projekty\Rozpoznawanie_dzwiekow_gitarowych\data\prepared_data.
Data shape confirmation:
x_train:(1015, 5632, 1)
y_train:(1015,)
x_val:(290, 5632, 1)
y_val:(290,)
x_test:(146, 5632, 1)
y_test:(146,)

Number of classes: 2
Epoch 1/20, Loss: 0.8176, Accuracy: 0.6680
Validation Loss: 0.3724, Validation Accuracy: 0.8103
Epoch 2/20, Loss: 0.3234, Accuracy: 0.8956
Validation Loss: 0.1607, Validation Accuracy: 0.9448
Epoch 3/20, Loss: 0.1557, Accuracy: 0.9330
Validation Loss: 0.1072, Validation Accuracy: 0.9414
Epoch 4/20, Loss: 0.1293, Accuracy: 0.9665
Validation Loss: 0.0650, Validation Accuracy: 0.9828
Epoch 5/20, Loss: 0.1126, Accuracy: 0.9695
Validation Loss: 0.0569, Validation Accuracy: 0.9759
Epoch 6/20, Loss: 0.0689, Accuracy: 0.9773
Validation Loss: 0.0581, Validation Accuracy: 0.9828
Validation loss did not improve for 1 epochs.
Epoch 7/20, Loss: 0.0758, Accuracy: 0.9793
Validation Loss: 0.0677, Validation Accuracy: 0.9655
Validation lo

#### Training sound_type recognition model

In [13]:
# Train the sound_type classifier: prepare the data, load the train/val splits, train a
# Conv1DClassifier and save its weights.

# Size of the classifier output layer; every label must lie in [0, num_classes).
num_classes = 3

data = pd.read_csv(os.path.join(config.DATAFRAMES_DIR_PATH, "sound_type_predict_df.csv"))
# NOTE(review): presumably this encodes the label column (index -1) and writes the
# x_*/y_* split files loaded below - confirm in prepare_data.
prepare_data.prepare_data_for_model(data, -1, LabelEncoder, [0.7, 0.2, 0.1], encoder_file_name="sound_type_encoder.joblib")

# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; keep it only for
# trusted, locally generated files and drop it if the arrays are stored with a numeric dtype.
# Labels are cast to an explicit 64-bit integer type: the "long" alias has a
# platform-dependent width.
x_train = np.load(os.path.join(splits_path, "x_train.npy"), allow_pickle=True).astype(np.float32)
y_train = np.load(os.path.join(splits_path, "y_train.npy"), allow_pickle=True).astype(np.int64)
x_val = np.load(os.path.join(splits_path, "x_val.npy"), allow_pickle=True).astype(np.float32)
y_val = np.load(os.path.join(splits_path, "y_val.npy"), allow_pickle=True).astype(np.int64)

print(f"x_train shape: {x_train.shape}")

# Fail early with a readable message if a label does not fit the output layer,
# instead of failing later somewhere inside training.
for split_name, labels in (("y_train", y_train), ("y_val", y_val)):
    if labels.size and (labels.min() < 0 or labels.max() >= num_classes):
        raise ValueError(
            f"{split_name} holds labels in [{labels.min()}, {labels.max()}], "
            f"expected values in [0, {num_classes - 1}]"
        )

# The arrays already have their final dtypes, so wrap them without a second copy.
x_train_tensor = torch.from_numpy(x_train)
y_train_tensor = torch.from_numpy(y_train)
x_val_tensor = torch.from_numpy(x_val)
y_val_tensor = torch.from_numpy(y_val)

train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
val_dataset = TensorDataset(x_val_tensor, y_val_tensor)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Per-sample shape (everything after the leading sample axis).
input_shape = x_train.shape[1:]
model = Conv1DClassifier(num_classes=num_classes, input_shape=input_shape)

model_trainer = ModelTrainer(model, device)
model_trainer.train_model(train_dataset, val_dataset, epochs=20)
# Retrieve the best model from the trainer.
model = model_trainer.get_trained_model()

# Make sure the target directory exists so saving cannot fail after a full training run.
os.makedirs(model_dir_path, exist_ok=True)
torch.save(model.state_dict(), os.path.join(model_dir_path, "sound_type_recognition_model.pth"))

Prepared data saved in: C:\Users\Bcom_\Documents\Projekty\Rozpoznawanie_dzwiekow_gitarowych\data\prepared_data.
Data shape confirmation:
x_train:(1015, 836, 1)
y_train:(1015,)
x_val:(290, 836, 1)
y_val:(290,)
x_test:(146, 836, 1)
y_test:(146,)

Number of classes: 3
Epoch 1/20, Loss: 1.2488, Accuracy: 0.4818
Validation Loss: 0.9279, Validation Accuracy: 0.7862
Epoch 2/20, Loss: 0.7619, Accuracy: 0.7143
Validation Loss: 0.5076, Validation Accuracy: 0.7931
Epoch 3/20, Loss: 0.5329, Accuracy: 0.7783
Validation Loss: 0.4467, Validation Accuracy: 0.8138
Epoch 4/20, Loss: 0.4708, Accuracy: 0.8039
Validation Loss: 0.4055, Validation Accuracy: 0.8276
Epoch 5/20, Loss: 0.4601, Accuracy: 0.8197
Validation Loss: 0.3906, Validation Accuracy: 0.8345
Epoch 6/20, Loss: 0.4528, Accuracy: 0.8197
Validation Loss: 0.4073, Validation Accuracy: 0.8207
Validation loss did not improve for 1 epochs.
Epoch 7/20, Loss: 0.4351, Accuracy: 0.8187
Validation Loss: 0.3795, Validation Accuracy: 0.8345
Epoch 8/20, Loss