In [139]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#####**Env for huggingface**

In [140]:
%%capture

!pip install git+https://github.com/huggingface/datasets.git
!pip install git+https://github.com/huggingface/transformers.git
!pip install jiwer
!pip install torchaudio
!pip install librosa

# Monitor the training process
# !pip install wandb

In [141]:
!pip install transformers datasets evaluate accelerate nvidia-ml-py3



In [142]:
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8
%env TRANSFORMERS_CACHE=/content/cache
%env HF_DATASETS_CACHE=/content/cache
%env CUDA_LAUNCH_BLOCKING=1

env: LC_ALL=C.UTF-8
env: LANG=C.UTF-8
env: TRANSFORMERS_CACHE=/content/cache
env: HF_DATASETS_CACHE=/content/cache
env: CUDA_LAUNCH_BLOCKING=1


## Prepare Data



In [143]:
pip install evaluate




In [144]:
import os
import pandas as pd
import numpy as np
import torch
import torchaudio
import librosa

from torch.utils.data import Dataset
from transformers import AutoFeatureExtractor, ASTForAudioClassification, AutoConfig, TrainingArguments, Trainer, logging
import evaluate
from transformers import AdamW

In [145]:
path_training = '/content/drive/MyDrive/#GradProject/Project Data/'
path_healthy =  path_training + 'Healthy/'
path_pathology = path_training +'pathology/'
path_xlx = '/content/drive/MyDrive/#GradProject/Data/CleanedData.xlsx'
outputDir = '/content/drive/MyDrive/Colab Notebooks/'

In [146]:
# sampling rate of the loaded files (resampled)
samplingRate = 16000
#
numTrainEpochs = 40
topNClasses = 1+3
#
#maxNumSamplesPerClass = 80
maxSampleLength = samplingRate * 1
#
typesRecordsExt = {'Letters': '-iau.wav','Phrases': '-phrase.wav'}
selectedRecordType = 'Phrases'
#
trainPercent = 0.7
validPercent = 0.1
#testPercent = 0.1

In [147]:
#"Laryngitis","Funktionelle""Hyperfunktionelle","Rekurrensparese"
#diceases_name=["Laryngitis","Hyperfunktionelle Dysphonie","Rekurrensparese"]
diceases_name=["Dysphonie","Laryngitis"]
data = pd.read_excel(path_xlx, sheet_name='MainData')
print(data.shape[0])
# using only letters data (exclude samples that doesnt have letter record)
data = data[data[selectedRecordType] == 1]
print(data.shape[0])
data = data[data['Pathologies'].isin(diceases_name)]
print(data.shape[0])
# choose only id and pathology columns
data = data[['SessionID','Pathologies']]
data.info


2043
1754
152


<bound method DataFrame.info of       SessionID Pathologies
213        1057   Dysphonie
682         107  Laryngitis
692        1269  Laryngitis
693        1614  Laryngitis
694        1930  Laryngitis
...         ...         ...
2033       2602  Laryngitis
2036       2605  Laryngitis
2038       2607   Dysphonie
2039       2608   Dysphonie
2041       2610  Laryngitis

[152 rows x 2 columns]>

In [148]:
dataIDs = data[data['Pathologies'].isin(diceases_name)]
dataIDs['Pathologies'].value_counts()

Laryngitis    82
Dysphonie     70
Name: Pathologies, dtype: int64

In [149]:
dataIDs=dataIDs.groupby('Pathologies').head(70)
dataIDs['Pathologies'].value_counts()

Dysphonie     70
Laryngitis    70
Name: Pathologies, dtype: int64

In [150]:
dataIDs.head(3)

Unnamed: 0,SessionID,Pathologies
213,1057,Dysphonie
682,107,Laryngitis
692,1269,Laryngitis


## Start spliting data into train, valid, test -> X,Y

In [151]:
# Shuffle the dataset.
dataShuffled = dataIDs.sample(frac=1, random_state=50)

In [152]:
dataShuffled.head(3)

Unnamed: 0,SessionID,Pathologies
1766,2422,Dysphonie
925,918,Laryngitis
1765,2238,Dysphonie


In [153]:
# Group the samples by their class.
groups = dataShuffled.groupby('Pathologies')
train = []
val = []
test = []
for name, group in groups:
    n = len(group)
    train_size = int(trainPercent * n)
    val_size = int(validPercent * n)
    test_size = n - train_size - val_size

    train_group = group[:train_size]
    val_group = group[train_size:train_size+val_size]
    test_group = group[train_size+val_size:]

    train.append(train_group)
    val.append(val_group)
    test.append(test_group)

In [154]:
# Concatenate the splits for each class to create the final splits.
train = pd.concat(train)
val = pd.concat(val)
test = pd.concat(test)
train = train.sample(frac=1, random_state=50)
val = val.sample(frac=1, random_state=50)
test = test.sample(frac=1, random_state=50)

In [155]:
X_train, y_train = train['SessionID'].tolist(), train['Pathologies'].tolist()
X_valid, y_valid = val['SessionID'].tolist(), val['Pathologies'].tolist()
X_test, y_test = test['SessionID'].tolist(), test['Pathologies'].tolist()

In [156]:
print(f'train size: {len(X_train)}')
print(f'valid size: {len(X_valid)}')
print(f'test size: {len(X_test)}')

train size: 98
valid size: 14
test size: 28


In [157]:
print(pd.Series(y_train).value_counts())

Laryngitis    49
Dysphonie     49
dtype: int64


In [158]:
print('valid')
print(pd.Series(y_valid).value_counts())

valid
Laryngitis    7
Dysphonie     7
dtype: int64


In [159]:
print('test')
print(pd.Series(y_test).value_counts())

test
Laryngitis    14
Dysphonie     14
dtype: int64


In [160]:
# check test data is seperate from train and val
# find rows in common
common_rows = pd.merge(train, test, on=['SessionID', 'Pathologies'], how='inner')
print(common_rows)
# find rows in common
common_rows = pd.merge(val, test, on=['SessionID', 'Pathologies'], how='inner')
print(common_rows)

Empty DataFrame
Columns: [SessionID, Pathologies]
Index: []
Empty DataFrame
Columns: [SessionID, Pathologies]
Index: []


In [161]:
audio_path = path_healthy if(y_train[3] == "None") else path_pathology
audio_path =  audio_path +str(X_train[0]) + typesRecordsExt[selectedRecordType]
audio_path

'/content/drive/MyDrive/#GradProject/Project Data/pathology/1809-phrase.wav'

In [162]:

class CustomAudioDataset(Dataset):
  def __init__(self, samples, labels):
    self.samples = samples
    self.samples_labels = labels

  def __len__(self):
      return len(self.samples_labels)

  def __getitem__(self, idx):
      input = self.samples[idx]
      label = self.samples_labels[idx]

      return {'input_values': input, "labels" : label2id[label]}

**Model prep**

In [163]:
from transformers import AutoConfig, Wav2Vec2Processor, Wav2Vec2FeatureExtractor

In [164]:
model_name_or_path = "facebook/wav2vec2-base-100k-voxpopuli"
pooling_mode = "mean"
modelName = "MIT/ast-finetuned-audioset-10-10-0.4593"



In [165]:
feature_extractor = AutoFeatureExtractor.from_pretrained(modelName)


In [166]:
def freq_mask(spec, F=10, num_masks=1):
    test = spec.clone()
    num_mel_channels = test.shape[1]
    if(num_mel_channels>F):
      F = F
    else:
      F = 25
    for i in range(0, num_masks):
      freq = random.randrange(0, F)
      zero =  random.randrange(0,num_mel_channels- freq)
      # avoids randrange error if values are equal and range is empty
      if (zero == zero + freq): return test
      mask_end = random.randrange(zero, zero + freq)
      test[0][zero:mask_end] = test.mean()
    return test


In [167]:
def loadAudioData(ListID, ListLabel):
  x=[]
  lab=[]

  for i, ID in enumerate(ListID):
      label = ListLabel[i]

      audio_path = path_healthy if(label == "None") else path_pathology
      audio_path = audio_path + str(ID) + typesRecordsExt[selectedRecordType]

      waveform, sr = librosa.load(audio_path, sr=samplingRate)
      augwaveform=librosa.effects.pitch_shift(y=waveform,sr=sr,n_steps=2)
      #print(waveform.shape)

      waveform = librosa.util.fix_length(waveform, size=maxSampleLength, mode='constant', constant_values=(0))
      augwaveform = librosa.util.fix_length(augwaveform, size=maxSampleLength, mode='constant', constant_values=(0))



      input = feature_extractor(waveform, padding="max_length", sampling_rate=samplingRate,  return_tensors="pt")
      auginput = feature_extractor(augwaveform, padding="max_length", sampling_rate=samplingRate,  return_tensors="pt")

      input = input.input_values[0]

      auginput = auginput.input_values[0]

      ref_input= freq_mask(torch.tensor(auginput))

      x.append(input)
      x.append(auginput)
      x.append(ref_input)
      lab.extend([label] * 3)

  return x, lab


In [168]:
label2id, id2label = dict(), dict()
for i, label in enumerate(diceases_name):
    label2id[label] = i
    id2label[i] = label

print(label2id)
print(id2label)

{'Dysphonie': 0, 'Laryngitis': 1}
{0: 'Dysphonie', 1: 'Laryngitis'}


In [169]:
num_labels = len(id2label)
# Model
model = ASTForAudioClassification.from_pretrained(modelName, num_labels=num_labels, label2id=label2id, id2label=id2label, ignore_mismatched_sizes=True)


In [170]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
device

device(type='cuda')

In [171]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

#accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}


In [172]:
from torch import nn

logging.set_verbosity_error()
from torch.optim import SGD

optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)





training_args = TrainingArguments(
    output_dir=outputDir+'results',
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    num_train_epochs=30,
    weight_decay=0.01,
    logging_dir=outputDir,
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=1,
    metric_for_best_model="accuracy",
    greater_is_better=True

)

In [173]:
import random

In [174]:
X_train, y_train = loadAudioData(X_train, y_train)



  ref_input= freq_mask(torch.tensor(auginput))


In [175]:
X_train[2]


tensor([[-0.5742, -1.0880, -0.7112,  ..., -0.3493, -0.6473, -1.0378],
        [-0.9084, -1.2776, -1.0190,  ..., -0.3288, -0.4885, -1.0706],
        [-0.6130, -0.9994, -0.6226,  ..., -0.3948, -0.5156, -0.8965],
        ...,
        [ 0.4670,  0.4670,  0.4670,  ...,  0.4670,  0.4670,  0.4670],
        [ 0.4670,  0.4670,  0.4670,  ...,  0.4670,  0.4670,  0.4670],
        [ 0.4670,  0.4670,  0.4670,  ...,  0.4670,  0.4670,  0.4670]])

In [176]:
print(len(X_train))
print(len(y_train))
print(X_train[0])

294
294
tensor([[-0.6327, -1.0695, -0.6926,  ..., -0.5713, -0.6660, -0.9840],
        [-0.5982, -1.0299, -0.6531,  ..., -0.5757, -0.7745, -1.0303],
        [-0.6689, -1.1043, -0.7275,  ..., -0.5815, -0.7049, -1.0208],
        ...,
        [ 0.4670,  0.4670,  0.4670,  ...,  0.4670,  0.4670,  0.4670],
        [ 0.4670,  0.4670,  0.4670,  ...,  0.4670,  0.4670,  0.4670],
        [ 0.4670,  0.4670,  0.4670,  ...,  0.4670,  0.4670,  0.4670]])


In [177]:
X_valid, y_valid = loadAudioData(X_valid, y_valid)

  ref_input= freq_mask(torch.tensor(auginput))


In [178]:
train_dataset = CustomAudioDataset(samples=X_train, labels=y_train)
valid_dataset = CustomAudioDataset(samples=X_valid, labels=y_valid)


In [179]:
x = train_dataset.__getitem__(0)
x

{'input_values': tensor([[-0.6327, -1.0695, -0.6926,  ..., -0.5713, -0.6660, -0.9840],
         [-0.5982, -1.0299, -0.6531,  ..., -0.5757, -0.7745, -1.0303],
         [-0.6689, -1.1043, -0.7275,  ..., -0.5815, -0.7049, -1.0208],
         ...,
         [ 0.4670,  0.4670,  0.4670,  ...,  0.4670,  0.4670,  0.4670],
         [ 0.4670,  0.4670,  0.4670,  ...,  0.4670,  0.4670,  0.4670],
         [ 0.4670,  0.4670,  0.4670,  ...,  0.4670,  0.4670,  0.4670]]),
 'labels': 1}

**Training**

In [180]:
trainer = Trainer(
       model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=None,
    compute_metrics=compute_metrics

)
trainer.optimizer = optimizer


disconecred

In [181]:
import random


In [182]:
result = trainer.train()

{'loss': 0.7883, 'learning_rate': 0.009931972789115647, 'epoch': 0.2}
{'loss': 0.7306, 'learning_rate': 0.009863945578231293, 'epoch': 0.41}
{'loss': 0.8102, 'learning_rate': 0.009795918367346938, 'epoch': 0.61}
{'loss': 0.8464, 'learning_rate': 0.009727891156462585, 'epoch': 0.82}
{'eval_loss': 0.7989441156387329, 'eval_accuracy': 0.5952380952380952, 'eval_precision': 0.6944444444444444, 'eval_recall': 0.5952380952380952, 'eval_f1': 0.5360623781676414, 'eval_runtime': 4.7497, 'eval_samples_per_second': 8.843, 'eval_steps_per_second': 1.474, 'epoch': 1.0}
{'loss': 0.574, 'learning_rate': 0.009659863945578231, 'epoch': 1.02}
{'loss': 0.5439, 'learning_rate': 0.009591836734693878, 'epoch': 1.22}
{'loss': 0.5763, 'learning_rate': 0.009523809523809523, 'epoch': 1.43}
{'loss': 0.5398, 'learning_rate': 0.009455782312925171, 'epoch': 1.63}
{'loss': 0.2349, 'learning_rate': 0.009387755102040816, 'epoch': 1.84}
{'eval_loss': 1.0018630027770996, 'eval_accuracy': 0.5952380952380952, 'eval_precisi

In [None]:
result.metrics

In [184]:
trainer.evaluate()

{'eval_loss': 2.6192753314971924, 'eval_accuracy': 0.6666666666666666, 'eval_precision': 0.6729411764705883, 'eval_recall': 0.6666666666666666, 'eval_f1': 0.6636155606407323, 'eval_runtime': 4.528, 'eval_samples_per_second': 9.276, 'eval_steps_per_second': 1.546, 'epoch': 30.0}


{'eval_loss': 2.6192753314971924,
 'eval_accuracy': 0.6666666666666666,
 'eval_precision': 0.6729411764705883,
 'eval_recall': 0.6666666666666666,
 'eval_f1': 0.6636155606407323,
 'eval_runtime': 4.528,
 'eval_samples_per_second': 9.276,
 'eval_steps_per_second': 1.546,
 'epoch': 30.0}

In [185]:
X_test, y_test = loadAudioData(X_test, y_test)
test_dataset = CustomAudioDataset(samples = X_test, labels = y_test)
predictions = trainer.predict(test_dataset= test_dataset)

  ref_input= freq_mask(torch.tensor(auginput))


In [None]:
predictions.metrics

In [None]:
yPred = np.array([prediction.argmax() for prediction in predictions.predictions]).reshape(-1,)
yReal = np.array([model.config.label2id[label] for label in y_test]).reshape(-1,)
yReal

In [None]:
from sklearn import metrics

# convert 1D arrays to 2D arrays with one column
yReal = np.array(yReal).reshape((-1,))
yPred = np.array(yPred).reshape((-1,))

# calculate accuracy
accuracy = metrics.accuracy_score(yReal, yPred)

# calculate precision, recall, and F1 score
precision = metrics.precision_score(yReal, yPred, average='macro')
recall = metrics.recall_score(yReal, yPred, average='macro')
f1_score = metrics.f1_score(yReal, yPred, average='macro')
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', f1_score)
#print('AUC:', auc)

In [None]:
import torch

# Assuming your model is named 'model' and has achieved good accuracy

path='/content/drive/MyDrive/#GradProject/Models/model  pathology  Transformrs try% '
model.save_pretrained(path)
