# Creating Train-Test Split
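This notebook creates an 80:20 train-test split of the group library data, which is used to predict the `label` column.
Both splits are also encoded and scaled in this notebook: the transformers are obtained from the training split and then reused on the test split.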

In [1]:
import pandas as pd
import pathlib
from sklearn.model_selection import train_test_split
import utils.encoding
# The repository root is taken to be the parent of the current working
# directory, i.e. the notebook is expected to be run from a direct subfolder
# of the repository.
notebook_dir = pathlib.Path.cwd()
REPO_ROOT = notebook_dir.parent

In [3]:
# Load the combined library table; the first CSV column is used as the
# DataFrame index rather than as a regular feature column.
library_contents_csv = REPO_ROOT / 'data/spotify/group/group_mutually_exclusive_library_contents.csv'
master_library_table = pd.read_csv(library_contents_csv, index_col=0)

# Preview the first ten rows.
master_library_table.head(10)

Unnamed: 0,artist,track,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,label
0,LOONA/yyxy,love4eva (feat. Grimes),4rKEmhNA19JezqVsSQS4yo,0.738,0.954,1,-2.146,1,0.0356,0.0692,0.0,0.222,0.742,102.015,220477,4,jojo
1,Danny Brown,Side B [Dope Song],26dfLcCVJUdscPvmUmkuSI,0.69,0.6,8,-7.331,1,0.249,0.187,0.0,0.694,0.761,160.08,156744,4,jojo
2,Latto,Muwop (feat. Gucci Mane),231WYcXWUxYSx79tuPtzBk,0.79,0.614,9,-6.958,0,0.0991,0.0606,0.0,0.239,0.63,145.907,200135,4,jojo
3,MIMIDEATH,abusive,11V7vRMorD73js8sfgBOS7,0.828,0.764,2,-7.361,1,0.104,0.17,0.309,0.237,0.289,110.987,72801,4,jojo
4,Rico Nasty,Pressing Me,3wtVRcrYtWJVs0rBTabJJ8,0.936,0.523,1,-6.39,1,0.269,0.00365,0.0,0.105,0.278,130.014,162636,4,jojo
6,Otoboke Beaver,"Leave me alone! No, stay with me!",0al4YLXuKajNIiJ1IXi9JL,0.321,0.973,9,-1.191,1,0.154,0.00751,0.684,0.086,0.574,132.201,101840,4,jojo
7,Flo Milli,Like That Bitch,7zoZd2MuTaQEdF1rlq6Vv1,0.782,0.844,10,-4.142,0,0.23,0.107,1e-06,0.295,0.76,77.435,203044,4,jojo
8,"Tyler, The Creator",PartyIsntOver/Campfire/Bimmer (feat. Frank Oce...,04QTBqa3IA4ZAKEuqRPzEH,0.443,0.576,1,-8.695,0,0.201,0.0709,0.0197,0.41,0.127,151.835,438493,4,jojo
9,Lorde,Buzzcut Season,51QEyJI5M7uyd8DOh9tqQY,0.733,0.62,1,-10.525,1,0.075,0.606,0.305,0.117,0.247,111.039,246755,4,jojo
10,Vince Staples,So What? - Episode 01,2JRo1DJhNhZWs1TzrPNOSZ,0.854,0.844,2,-5.908,1,0.0519,0.00651,5e-06,0.186,0.832,146.041,125149,4,jojo


## Train-Test Split

In [12]:
# Hold out 20% of the rows for testing. The fixed random_state keeps the
# shuffle, and therefore the resulting split, reproducible across runs.
# NOTE(review): the split is not stratified on `label` - confirm that this
# is intended for the classification task.
library_train, library_test = train_test_split(
    master_library_table,
    test_size=0.2,
    random_state=0,
)

# Preview the first ten rows of the training split.
library_train.head(10)

Unnamed: 0,artist,track,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,label
835,The Weeknd,Scared To Live,4BGZF4oLbTL0pWm7C18pbv,0.504,0.498,0,-8.24,1,0.0499,0.131,0.0,0.0929,0.196,87.221,191307,4,nick
183,cupcakKe,Garfield,3WRiWRkolTkYHKqMGsYsOT,0.872,0.835,6,-6.174,0,0.11,0.232,1.4e-05,0.0869,0.624,99.978,217560,4,jojo
1143,Cabu,Crazy,0CbgB2e5ItzZOA1yZA8Xnt,0.794,0.603,8,-11.116,1,0.0724,0.182,0.0635,0.109,0.0353,107.992,249896,4,nick
1065,Rich Brian,Introvert (feat. Joji),3qbqWUDl9j09PeBaQ6CZgr,0.836,0.686,1,-4.673,1,0.0413,0.14,0.00231,0.0681,0.632,110.029,222259,4,richard
1257,BLACKPINK,Typa Girl,0L8LOav65XwLjCLS11gNPD,0.915,0.621,7,-6.519,1,0.1,0.0745,0.0,0.628,0.527,131.984,179173,4,nick
427,Death Grips,Hustle Bones,5PXyH5bb5fbVfO8LjByhBb,0.6,0.982,4,-2.996,0,0.201,0.104,0.0,0.343,0.277,110.015,192360,4,jojo
1494,Mickey Valen,Move That Body,0a0TiZm7hjmgyY9i1sWm3Z,0.669,0.668,4,-5.553,1,0.207,0.294,0.0,0.183,0.483,77.493,193742,4,richard
842,Abhi The Nomad,Sex n' Drugs,0G2wimhVoDYXbQ6csDxtSf,0.831,0.32,7,-7.916,1,0.143,0.299,0.0,0.0757,0.319,80.935,217778,4,nick
1050,Mustard,"Baguettes in the Face (feat. NAV, Playboi Cart...",2zjGJ0dChMR0KxBZS15aqo,0.883,0.525,2,-8.054,0,0.0633,0.189,0.0,0.0971,0.312,99.978,174100,4,nick
1037,khai dreams,Lost in You,2itR8VHeTVKInnMPec8mj8,0.771,0.607,5,-12.02,0,0.113,0.334,0.0,0.122,0.79,90.035,101000,4,nick


## Encoding Categorical Features and Scaling Continuous Features

### Transforming Training Data and Getting Transformers

In [13]:
# Encode/scale the training split with the project helper. It returns the
# transformed frame together with `transformers`, which are reused later in
# the notebook to transform the test split.
# NOTE(review): presumably the transformers are fit on the training rows
# only - confirm in utils.encoding.
(encoded_train_data, transformers) = utils.encoding.encode_training_data(
    library_train,
)

# Preview the first ten encoded training rows.
encoded_train_data.head(10)

Unnamed: 0_level_0,artist,track,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,...,key_8,key_9,key_10,key_11,mode_0,mode_1,time_signature_1,time_signature_3,time_signature_4,time_signature_5
uri,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4BGZF4oLbTL0pWm7C18pbv,The Weeknd,Scared To Live,0.413284,0.440223,0.615274,0.030664,0.133805,0.0,0.080971,0.174273,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3WRiWRkolTkYHKqMGsYsOT,cupcakKe,Garfield,0.865929,0.81676,0.717531,0.102934,0.236972,1.4e-05,0.074001,0.628529,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
0CbgB2e5ItzZOA1yZA8Xnt,Cabu,Crazy,0.769988,0.557542,0.472926,0.05772,0.1859,0.066146,0.099675,0.003715,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3qbqWUDl9j09PeBaQ6CZgr,Rich Brian,Introvert (feat. Joji),0.821648,0.650279,0.791823,0.020322,0.142998,0.002406,0.052161,0.63702,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
0L8LOav65XwLjCLS11gNPD,BLACKPINK,Typa Girl,0.918819,0.577654,0.700455,0.090909,0.076093,0.0,0.702602,0.525578,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5PXyH5bb5fbVfO8LjByhBb,Death Grips,Hustle Bones,0.531365,0.981006,0.874827,0.212362,0.106226,0.0,0.371515,0.260242,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
0a0TiZm7hjmgyY9i1sWm3Z,Mickey Valen,Move That Body,0.616236,0.630168,0.748268,0.219577,0.300303,0.0,0.185641,0.478879,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
0G2wimhVoDYXbQ6csDxtSf,Abhi The Nomad,Sex n' Drugs,0.815498,0.241341,0.631311,0.142617,0.30541,0.0,0.06099,0.304819,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2zjGJ0dChMR0KxBZS15aqo,Mustard,"Baguettes in the Face (feat. NAV, Playboi Cart...",0.879459,0.470391,0.62448,0.046777,0.19305,0.0,0.08585,0.297389,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2itR8VHeTVKInnMPec8mj8,khai dreams,Lost in You,0.741697,0.562011,0.428183,0.106542,0.341161,0.0,0.114777,0.804712,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


### Transforming Testing Data Given Previous Transformers

In [14]:
# Transform the held-out split with the transformers returned for the
# training split, instead of deriving new ones from the test rows.
encoded_test_data = utils.encoding.encode_dataframe_given_transformers(
    library_test,
    transformers,
)

## Final Data Cleaning and Saving

In [15]:
# Final tidy-up and export of the encoded train/test splits.

# Move label to the end just to make it look pretty (done in place).
for df in [encoded_train_data, encoded_test_data]:
    label_values = df.pop('label')
    # After the pop, len(df.columns) is the position just past the last
    # remaining column, so this insert appends `label` as the final column.
    df.insert(len(df.columns), 'label', label_values)

# Index the test split by track URI (the training frame displayed above is
# already indexed by `uri`). The guard keeps this cell re-runnable: once
# `uri` has been moved into the index, an unguarded set_index("uri") would
# raise KeyError.
if "uri" in encoded_test_data.columns:
    encoded_test_data = encoded_test_data.set_index("uri")

# Saving to file. The output directory is created first so that to_csv does
# not fail when the folder does not exist yet (e.g. on a fresh checkout).
train_test_dir = REPO_ROOT / "data/spotify/group/mutually_exclusive_train_test"
train_test_dir.mkdir(parents=True, exist_ok=True)
encoded_train_data.to_csv(train_test_dir / "library_train.csv")
encoded_test_data.to_csv(train_test_dir / "library_test.csv")