In [1]:
import pandas as pd
import numpy as np
import os
import soundfile as sf
# Native sampling rate of the road recordings, in Hz.
SAMPLERATE = 25600

# Select the GPU through environment variables: devices are ordered by PCI bus id
# and only device "3" is made visible to this process.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "3",
    }
)

In [2]:
# data_path = '/local2/abzaliev/road_sound/road_data/road test 02-13-24 Dixie Hwy and I-75'
# mph50 = pd.read_csv(os.path.join(data_path, 'road test1-50MPH_road-name-Dixie hwy_cold 35F.csv'), header=None)
# mph60 = pd.read_csv(os.path.join(data_path, 'road test1-60MPH_road-name-I75_cold 35F.csv'), header=None)
# mph65 = pd.read_csv(os.path.join(data_path, 'road test1-65MPH_road-name-Dixie hwy_cold 35F.csv'), header=None)
# mph70 = pd.read_csv(os.path.join(data_path, 'roadtest1_70MPH_road-name-I75_cold 35F.csv'), header=None)

# Dataset

In [6]:
from datasets import load_dataset, Audio

# Load the audio dataset from the folder we created, see prepare_dataset.py.
dataset = load_dataset("audiofolder", data_dir="/local2/abzaliev/road_sound/road_data/data_processed", save_infos=True)

# Hold out 10% of the examples for evaluation, stratified by label.
# The fixed seed keeps the split identical across kernel restarts, so that
# metrics from different runs are computed on the same held-out examples.
dataset = dataset['train'].train_test_split(test_size=0.1, stratify_by_column='label', seed=42)

# Resample from the 25.6 kHz recording rate to 16 kHz - required by wav2vec2.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))

Resolving data files:   0%|          | 0/831 [00:00<?, ?it/s]

In [7]:
# Peek at the first training waveform. Index the row first and the column second:
# dataset['train']['audio'] would decode and resample every file in the split
# just to show one of them.
dataset['train'][0]['audio']['array']

array([143.90951538, 197.5970459 , 170.75149536, ..., 107.26091003,
       108.22573853, 121.72618866])

In [25]:
# Fraction of training examples per class id (values sum to 1).
pd.Series(dataset['train']['label']).value_counts(normalize=True)

3    0.412316
2    0.208835
0    0.204819
1    0.174029
Name: proportion, dtype: float64

In [24]:
# Total number of examples in the held-out split. The original expression ended
# in `.sum` without parentheses, which displays a bound method instead of a number.
pd.Series(dataset['test']['label']).value_counts().sum()

84

In [8]:
# load_dataset() has already encoded the class names as integers, so the lookup
# tables are rebuilt from the feature metadata. Ids are stored as strings.
# The same information is available as dataset['train'].features['label']._str2int.
labels = dataset["train"].features["label"].names
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

In [9]:
# Display the class-name -> id mapping.
label2id

{'50mph': '0', '60mph': '1', '65mph': '2', '70mph': '3'}

In [10]:
from transformers import AutoFeatureExtractor

# Feature extractor that belongs to the wav2vec2-base checkpoint. It is called
# inside preprocess_function, which also reads its sampling_rate attribute.
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")



In [11]:
def preprocess_function(examples):
    """Run the feature extractor over a batch of decoded audio examples.

    Args:
        examples: batch whose "audio" entry is a sequence of dicts, each
            holding the waveform under the "array" key.

    Returns:
        The feature extractor's output for the batch, with every waveform
        truncated to at most 16000 samples.
    """
    waveforms = [clip["array"] for clip in examples["audio"]]
    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=16000,
        truncation=True,
    )

In [12]:
# Apply preprocess_function to both splits in batches and drop the raw "audio" column.
ds_encoded = dataset.map(preprocess_function, remove_columns="audio", batched=True)
# No column rename is needed: the target column is already called "label".

Map:   0%|          | 0/747 [00:00<?, ? examples/s]

Map:   0%|          | 0/84 [00:00<?, ? examples/s]

In [13]:
import evaluate

accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score a batch of model outputs with the accuracy metric.

    Args:
        eval_pred: object exposing `predictions` (per-class scores, one row per
            example) and `label_ids` (the reference class ids).

    Returns:
        Whatever `accuracy.compute` returns for the arg-max predictions.
    """
    scores = eval_pred.predictions
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=eval_pred.label_ids)

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


# Model 

In [14]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# Load the pretrained wav2vec2-base checkpoint with a classification head sized
# to the number of classes, and attach the name <-> id mappings to the model.
num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base", num_labels=num_labels, label2id=label2id, id2label=id2label
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'classifier.bias', 'projector.weight', 'classifier.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Fine-tuning configuration. With a per-device batch of 32 and 4 gradient
# accumulation steps, each optimizer step covers 128 examples per device.
training_args = TrainingArguments(
    output_dir="./trained_speed_detection",
    evaluation_strategy="epoch",  # evaluate at the end of every epoch
    save_strategy="epoch",  # checkpoint on the same cadence as evaluation
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=10,  # training loss is logged every 10 optimizer steps
    load_best_model_at_end=True,  # best checkpoint is chosen by the metric below
    metric_for_best_model="accuracy",
    push_to_hub=True,  # NOTE(review): uploads to the Hugging Face Hub; presumably needs a prior login - confirm this is intended
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_encoded["train"],
    eval_dataset=ds_encoded["test"],  # NOTE(review): this split drives best-checkpoint selection AND is the only reported accuracy
    tokenizer=feature_extractor,  # the feature extractor is passed in the tokenizer slot
    compute_metrics=compute_metrics,
)

# Start fine-tuning.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.35888,0.404762
2,1.366200,1.330927,0.404762
3,1.366200,1.250821,0.440476
4,1.200200,1.339791,0.404762
5,1.085400,1.144744,0.511905
6,1.085400,1.132916,0.511905
7,1.016200,1.130061,0.5
8,1.016200,1.152164,0.464286
9,0.942700,1.162829,0.464286
10,0.926500,1.155228,0.464286




TrainOutput(global_step=60, training_loss=1.0895299275716146, metrics={'train_runtime': 150.5519, 'train_samples_per_second': 49.617, 'train_steps_per_second': 0.399, 'total_flos': 6.781775442048e+16, 'train_loss': 1.0895299275716146, 'epoch': 10.0})