# Train - Test Split

In [1]:
# Author: Arnova Abdullah
# Date: 05-05-2022
# Description: Code for creating stratified train-test split of the music genre dataset

# Edits:
# Soumya Sambeet Mohapatra (12-05-2022)

## Importing modules

In [2]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

## Loading data 

In [4]:
# Load the music genre dataset from the CSV file in the working directory.
df = pd.read_csv('genre_data_final.csv')

# Preview the first five rows.
df.head(5)

Unnamed: 0,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genre
0,0.507,0.71,-5.02,1,0.0291,0.0975,1.1e-05,0.17,0.45,134.647,4,acoustic
1,0.496,0.378,-7.878,1,0.0336,0.898,1e-06,0.109,0.335,144.092,3,acoustic
2,0.76,0.765,-9.319,1,0.0352,0.289,8e-06,0.233,0.868,107.602,4,acoustic
3,0.607,0.196,-19.705,1,0.0595,0.904,0.727,0.111,0.187,111.031,4,acoustic
4,0.497,0.0919,-15.685,1,0.0704,0.968,0.00214,0.133,0.548,206.431,4,acoustic


## Creating stratified Train-Test split of the dataset

In [7]:
# Separate the predictors from the 'genre' target column.
y = df['genre']
X = df.drop(columns='genre')

# Hold out 20% of the rows for testing. Stratifying on the target keeps each
# genre's share the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=43,
    stratify=y,
)

# Show the shape of every partition as a quick sanity check.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((9600, 11), (2400, 11), (9600,), (2400,))

In [6]:
# Count how often each distinct feature row occurs in the training set.
train_row_counts = X_train.value_counts()
train_row_counts

danceability  energy  key  loudness  mode  speechiness  acousticness  instrumentalness  liveness  valence  tempo    duration_ms  time_signature
0.733         0.710   5    -5.849    0     0.0292       0.14500       0.115000          0.0956    0.965    127.975  239600       4                 6
0.344         0.952   7    -4.621    1     0.0458       0.02870       0.000000          0.0368    0.767    205.651  245093       4                 5
0.574         0.414   9    -7.160    0     0.0495       0.84200       0.000000          0.1720    0.867    77.968   163040       4                 5
0.358         0.883   4    -6.669    1     0.0485       0.00217       0.025800          0.1210    0.703    155.482  246000       4                 5
0.578         0.866   1    -3.804    1     0.0619       0.00701       0.000000          0.2570    0.619    128.038  199080       4                 5
                                                                                                               

In [8]:
# Count how often each distinct feature row occurs in the test set.
# The result is left as the cell's final expression (instead of being passed
# to print) so it is rendered as the cell output, like the matching
# training-set cell.
X_test.value_counts()

danceability  energy  loudness  mode  speechiness  acousticness  instrumentalness  liveness  valence  tempo    time_signature
0.585         0.561   -9.784    1     0.0582       0.112000      0.00013           0.6890    0.292    134.053  4                 3
0.445         0.376   -6.244    1     0.0298       0.706000      0.00000           0.2700    0.473    129.022  4                 3
0.486         0.488   -5.089    1     0.0377       0.832000      0.00000           0.0854    0.534    94.250   4                 3
0.799         0.579   -5.144    0     0.0487       0.003810      0.00000           0.3670    0.363    142.987  4                 3
0.520         0.237   -10.292   1     0.0302       0.870000      0.00000           0.1220    0.574    105.711  4                 3
                                                                                                                                ..
0.488         0.936   -3.431    1     0.0699       0.000435      0.00001           0.106

In [8]:
# Number of training samples per genre; the counts should be balanced if the
# stratified split worked as intended.
genre_counts_train = y_train.value_counts()
genre_counts_train

blues        960
jazz         960
rock         960
classical    960
country      960
acoustic     960
metal        960
techno       960
dance        960
pop          960
Name: genre, dtype: int64

In [9]:
# Number of test samples per genre; the counts should be balanced if the
# stratified split worked as intended.
genre_counts_test = y_test.value_counts()
genre_counts_test

blues        240
jazz         240
metal        240
pop          240
dance        240
acoustic     240
techno       240
classical    240
rock         240
country      240
Name: genre, dtype: int64

## Exporting Training set and Test set to CSV files

In [26]:
# Write each partition to its own CSV file under 'data/'.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first; exist_ok=True keeps re-runs of this cell harmless.
output_dir = Path('data')
output_dir.mkdir(parents=True, exist_ok=True)

# File stem -> object to export. The row index is omitted from every file
# (index=False), exactly as before.
splits = {
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test,
}
for split_name, split_data in splits.items():
    split_data.to_csv(output_dir / f'{split_name}.csv', index=False)