In [4]:
import pandas as pd
import dotenv
import os
from sklearn.model_selection import train_test_split
import shutil

In [11]:
dotenv.load_dotenv()

BASE_PATH = os.getenv('BASE_PATH')

TRAIN_DATA_PATH = os.path.join(BASE_PATH, 'data', 'train')
VALID_DATA_PATH = os.path.join(BASE_PATH, 'data', 'valid')
TEST_DATA_PATH = os.path.join(BASE_PATH, 'data', 'test')

LABELS = ['siren', 'gunshot', 'explosion', 'casual']
label2id = {label: i for i, label in enumerate(LABELS)}

all_files = os.listdir(TRAIN_DATA_PATH)

all_labels = [fname.split('_')[0] for fname in all_files]

train_files, valid_files = train_test_split(
    all_files,
    test_size=0.2,
    random_state=42,
    stratify=all_labels
)

os.makedirs(VALID_DATA_PATH, exist_ok=True)

for fname in valid_files:
    shutil.copy(os.path.join(TRAIN_DATA_PATH, fname),os.path.join(VALID_DATA_PATH, fname))


In [9]:
train_files = os.listdir(TRAIN_DATA_PATH)
valid_files = os.listdir(VALID_DATA_PATH)
test_files = os.listdir(TEST_DATA_PATH)

def make_df(file_list):
    targets = [label2id[f.split('_')[0]] for f in file_list]
    return pd.DataFrame({"path": file_list, "target": targets})

train_df = make_df(train_files)
valid_df = make_df(valid_files)
test_df  = make_df(test_files)

print(
    f'train_df shape: {train_df.shape} \n'
    f'valid_df shape: {valid_df.shape} \n'
    f'test_df shape: {test_df.shape}'
)
train_df

train_df shape: (480, 2) 
valid_df shape: (96, 2) 
test_df shape: (105, 2)


Unnamed: 0,path,target
0,casual_001.wav,3
1,casual_002.wav,3
2,casual_003.wav,3
3,casual_004.wav,3
4,casual_005.wav,3
...,...,...
475,siren_098.wav,0
476,siren_100.wav,0
477,siren_101.wav,0
478,siren_103.wav,0


In [10]:
TRAIN_OUTPUT_METADATA = os.path.join(BASE_PATH, 'data', 'train.csv')
VALID_OUTPUT_METADATA = os.path.join(BASE_PATH, 'data', 'valid.csv')
TEST_OUTPUT_METADATA = os.path.join(BASE_PATH, 'data', 'test.csv')

train_df.to_csv(TRAIN_OUTPUT_METADATA, index=False)
valid_df.to_csv(VALID_OUTPUT_METADATA, index=False)
test_df.to_csv(TEST_OUTPUT_METADATA, index=False)