# Setup for GTZAN - Tabular (3-sec features)

## Imports

In [11]:
# Data Handling
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Paths
from src.__00__paths import processed_tabular_dir, curated_tabular_dir, raw_data_dir

## Load Data

In [7]:
# Read the raw GTZAN 3-second feature table and preview the first rows.
features_csv = raw_data_dir / "features_3_sec.csv"
df = pd.read_csv(features_csv)
df.head(5)

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,blues.00000.0.wav,66149,0.335406,0.091048,0.130405,0.003521,1773.065032,167541.630869,1972.744388,117335.771563,...,39.687145,-3.24128,36.488243,0.722209,38.099152,-5.050335,33.618073,-0.243027,43.771767,blues
1,blues.00000.1.wav,66149,0.343065,0.086147,0.112699,0.00145,1816.693777,90525.690866,2010.051501,65671.875673,...,64.748276,-6.055294,40.677654,0.159015,51.264091,-2.837699,97.03083,5.784063,59.943081,blues
2,blues.00000.2.wav,66149,0.346815,0.092243,0.132003,0.00462,1788.539719,111407.437613,2084.565132,75124.921716,...,67.336563,-1.76861,28.348579,2.378768,45.717648,-1.938424,53.050835,2.517375,33.105122,blues
3,blues.00000.3.wav,66149,0.363639,0.086856,0.132565,0.002448,1655.289045,111952.284517,1960.039988,82913.639269,...,47.739452,-3.841155,28.337118,1.218588,34.770935,-3.580352,50.836224,3.630866,32.023678,blues
4,blues.00000.4.wav,66149,0.335579,0.088129,0.143289,0.001701,1630.656199,79667.267654,1948.503884,60204.020268,...,30.336359,0.664582,45.880913,1.689446,51.363583,-3.392489,26.738789,0.536961,29.146694,blues


## Data Preprocessing

In [8]:
# Remove the columns that are not used as model features.
# `errors='ignore'` makes the drop a no-op once the columns are gone, so the
# cell can be re-run safely (the original in-place drop raised KeyError on a
# second execution).
# NOTE(review): the filenames previewed in the load step look like
# "<genre>.<track>.<segment>.wav"; once 'filename' is dropped, later cells can
# no longer group segments by their source track - confirm this is intended.
df = df.drop(columns=['filename', 'length'], errors='ignore')

# Encode Labels from 0 -> 9
# Only fit while the labels are still the original genre strings. Re-fitting on
# already-encoded integers would replace `label_encoder.classes_` with the
# integers themselves and lose the genre names needed for the label mapping.
if not pd.api.types.is_integer_dtype(df['label']):
    label_encoder = LabelEncoder()
    df['label'] = label_encoder.fit_transform(df['label'])

df.head()

Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,0.335406,0.091048,0.130405,0.003521,1773.065032,167541.630869,1972.744388,117335.771563,3714.560359,1080790.0,...,39.687145,-3.24128,36.488243,0.722209,38.099152,-5.050335,33.618073,-0.243027,43.771767,0
1,0.343065,0.086147,0.112699,0.00145,1816.693777,90525.690866,2010.051501,65671.875673,3869.682242,672244.8,...,64.748276,-6.055294,40.677654,0.159015,51.264091,-2.837699,97.03083,5.784063,59.943081,0
2,0.346815,0.092243,0.132003,0.00462,1788.539719,111407.437613,2084.565132,75124.921716,3997.63916,790712.7,...,67.336563,-1.76861,28.348579,2.378768,45.717648,-1.938424,53.050835,2.517375,33.105122,0
3,0.363639,0.086856,0.132565,0.002448,1655.289045,111952.284517,1960.039988,82913.639269,3568.300218,921652.4,...,47.739452,-3.841155,28.337118,1.218588,34.770935,-3.580352,50.836224,3.630866,32.023678,0
4,0.335579,0.088129,0.143289,0.001701,1630.656199,79667.267654,1948.503884,60204.020268,3469.992864,610211.1,...,30.336359,0.664582,45.880913,1.689446,51.363583,-3.392489,26.738789,0.536961,29.146694,0


In [9]:
# Build the label mapping from the fitted encoder.
genre_names = label_encoder.classes_

# Genre name -> encoded integer, as produced by the encoder itself.
label_mapping = dict(zip(genre_names, label_encoder.transform(genre_names)))

# One row per genre; the default row index doubles as the encoded value
# because `classes_` is listed in encoding order. No extra sorting is needed.
mapping_df = pd.DataFrame(genre_names, columns=['Label'])

mapping_df.head(10)

Unnamed: 0,Label
0,blues
1,classical
2,country
3,disco
4,hiphop
5,jazz
6,metal
7,pop
8,reggae
9,rock


## Save Processed Dataframes

In [12]:
# Write the encoded feature table and the label lookup into the processed folder.
features_out = processed_tabular_dir / "gtzan_tabular_3_sec.csv"
mapping_out = processed_tabular_dir / "gtzan_label_mapping_3_sec.csv"

# The feature table is written without its row index; the lookup table keeps
# its index (index=True) so the row index is stored as the first CSV column.
df.to_csv(features_out, index=False)
mapping_df.to_csv(mapping_out, index=True)

print(f"✔️ Saved processed data to {'/'.join(processed_tabular_dir.parts[-2:])}")

✔️ Saved processed data to processed/tabular


## Test - Train Split

In [13]:
# Hold out 20% of the rows for testing. `stratify` keeps the per-label
# proportions as equal as possible in both splits, and the fixed seed makes the
# split reproducible across runs.
# NOTE(review): rows are split individually; if several rows originate from the
# same recording they can land on both sides of the split - confirm whether a
# group-aware split is needed.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Sanity check: the two splits must partition the table with no rows lost.
assert len(train_df) + len(test_df) == len(df)

train_df.to_csv(curated_tabular_dir / 'train.csv', index=False)
test_df.to_csv(curated_tabular_dir / 'test.csv', index=False)
print(f"✔️ Train and Test Datasets saved to {'/'.join(curated_tabular_dir.parts[-2:])}")

✔️ Train and Test Datasets saved to curated/tabular
