# Train - Test Split

In [5]:
# Author: Arnova Abdullah
# Date: 05-05-2022
# Description: Code for creating stratified train-test split of the music genre dataset

# Edits:
# Soumya Sambeet Mohapatra (12-05-2022)

## Importing modules

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

## Loading data 

In [8]:
df = pd.read_csv('data/genre_data_final.csv')
df.head()

Unnamed: 0,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genre
0,0.507,0.71,-5.02,1,0.0291,0.0975,1.1e-05,0.17,0.45,134.647,4,acoustic
1,0.496,0.378,-7.878,1,0.0336,0.898,1e-06,0.109,0.335,144.092,3,acoustic
2,0.76,0.765,-9.319,1,0.0352,0.289,8e-06,0.233,0.868,107.602,4,acoustic
3,0.607,0.196,-19.705,1,0.0595,0.904,0.727,0.111,0.187,111.031,4,acoustic
4,0.497,0.0919,-15.685,1,0.0704,0.968,0.00214,0.133,0.548,206.431,4,acoustic


## Label Encoding to Categorical Genre Column

In [9]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
df = df.apply(le.fit_transform)

## Creating stratified Train-Test split of the dataset

In [10]:
X = df.drop('genre',axis=1)
y = df['genre']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=43, 
    stratify=df['genre']
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((9600, 11), (2400, 11), (9600,), (2400,))

In [11]:
X_train.value_counts()

danceability  energy  loudness  mode  speechiness  acousticness  instrumentalness  liveness  valence  tempo  time_signature
720           1229    5650      0     156          1908          0                 879       1149     3904   3                 7
686           1245    5054      0     64           2244          2526              615       1334     5673   3                 5
311           1418    4492      1     257          885           2189              680       1072     7921   3                 5
371           878     3964      1     111          2831          0                 672       696      4848   3                 5
370           978     4772      1     100          2800          0                 776       839      4610   3                 5
                                                                                                                              ..
442           862     1857      1     57           2803          1580              694       651      

In [12]:
print(X_test.value_counts())

danceability  energy  loudness  mode  speechiness  acousticness  instrumentalness  liveness  valence  tempo  time_signature
538           1096    2761      1     354          2211          1089              1194      661      6399   3                 3
398           911     4785      1     70           2797          0                 829       842      5830   3                 3
439           1023    5625      1     149          2922          0                 513       903      2147   3                 3
752           1114    5583      0     259          1020          0                 926       732      7190   3                 3
473           774     2555      1     74           2960          0                 681       943      3307   3                 3
                                                                                                                              ..
441           1471    6802      1     471          530           534               665       767      

In [13]:
y_train.value_counts()

8    960
5    960
2    960
6    960
0    960
1    960
7    960
3    960
4    960
9    960
Name: genre, dtype: int64

In [14]:
y_test.value_counts()

9    240
1    240
5    240
0    240
7    240
8    240
4    240
3    240
6    240
2    240
Name: genre, dtype: int64

## Exporting Training set and Test set to CSV files

In [15]:
X_train.to_csv('data/X_train.csv', index=False)
y_train.to_csv('data/y_train.csv', index=False)
X_test.to_csv ('data/X_test.csv', index=False)
y_test.to_csv('data/y_test.csv', index=False)