In [2]:
# Import packages
import os
import pandas as pd
import numpy as np

import random

pd.set_option('display.max_columns', None)
pd.set_option("display.float_format", lambda x: f'{x:.2f}')

import warnings
warnings.filterwarnings("ignore")

In [3]:
def std_col_names(df):
    """
    Convert feature names to lower case
    """
    df.columns = df.columns.str.lower().str.strip().str.replace(' ', '_')
    return df


In [4]:
os.listdir('./data/')

['sample_submission.csv', 'train.csv', 'test.csv']

In [5]:
# Load DF
train_df = std_col_names(pd.read_csv('./data/train.csv'))
df = train_df.copy()
df.head(5)

Unnamed: 0,instance_id,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,voice_gender,mode,speechiness,tempo,musician_category,valence,music_genre
0,MSC_83537.0,Estrellitas y Duendes,49.18,0.97,0.58,214625.78,0.19,0.0,Scale E,0.15,-14.14,,Major,0.05,143.78799999999998,Band,0.6,Jazz
1,MSC_22044.0,Al Norte,59.83,1.01,0.69,216232.2,0.27,0.0,Scale A,0.17,-13.72,Male,?,0.04,?,Band,0.36,Jazz
2,MSC_62017.0,Yeah! (feat. Lil Jon & Ludacris),89.02,0.02,0.97,273314.72,0.86,0.0,Scale D,0.04,-5.0,Female,Major,0.12,105.01799999999999,Band,0.64,Rap
3,MSC_76365.0,Can’t You See,55.76,0.01,0.62,189189.61,0.97,0.0,Scale D,0.12,-4.26,Both,Major,0.17,?,Duet,0.98,Rock
4,MSC_71493.0,"Sonata III (G Moll), BWV 1029: Adagio",45.09,0.86,0.28,410136.99,0.11,0.0,Scale F Sharp,0.13,-26.92,Female,Minor,0.04,112.18299999999999,Band,0.07,Classical


In [6]:
# Observe missing data and feature types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15681 entries, 0 to 15680
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   instance_id        15681 non-null  object 
 1   track_name         15681 non-null  object 
 2   popularity         15039 non-null  float64
 3   acousticness       15681 non-null  float64
 4   danceability       15125 non-null  float64
 5   duration_ms        15587 non-null  float64
 6   energy             15587 non-null  float64
 7   instrumentalness   15586 non-null  float64
 8   key                15681 non-null  object 
 9   liveness           15681 non-null  float64
 10  loudness           15645 non-null  float64
 11  voice_gender       14916 non-null  object 
 12  mode               15646 non-null  object 
 13  speechiness        15655 non-null  float64
 14  tempo              15681 non-null  object 
 15  musician_category  14321 non-null  object 
 16  valence            150

In [7]:
# Find out missing column names
missing_cols = df.columns[df.isnull().any()].to_list()
print('Features with missing values: ')
missing_cols

Features with missing values: 


['popularity',
 'danceability',
 'duration_ms',
 'energy',
 'instrumentalness',
 'loudness',
 'voice_gender',
 'mode',
 'speechiness',
 'musician_category',
 'valence']

In [8]:
# Evaluate count of missing values
missing_cols_count = df[missing_cols].isnull().sum()
print('Number of missing values for each feature in descending order: ')
(missing_cols_count)#.sort_values(ascending= False)

Number of missing values for each feature in descending order: 


popularity            642
danceability          556
duration_ms            94
energy                 94
instrumentalness       95
loudness               36
voice_gender          765
mode                   35
speechiness            26
musician_category    1360
valence               677
dtype: int64

In [9]:
# View data distribution of numeric features
print('Descriptive stats of numerical features: ')
df.describe().loc[['min', 'max', 'mean', '50%']]

Descriptive stats of numerical features: 


Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,valence
min,0.0,0.0,0.06,-1.0,-1.0,0.0,0.02,-50.05,0.02,0.0
max,108.51,1.09,1.07,3410383.32,1.09,1.08,1.08,1.39,1.0,1.08
mean,50.82,0.45,0.57,248000.81,0.43,0.26,0.19,-11.62,0.1,0.45
50%,54.11,0.3,0.59,236212.74,0.56,0.0,0.13,-8.61,0.05,0.43


In [10]:
# View data distribution of categorical features
print('Descriptive stats of numerical features: ')
df.describe(include= 'object')

Descriptive stats of numerical features: 


Unnamed: 0,instance_id,track_name,key,voice_gender,mode,tempo,musician_category,music_genre
count,15681,15681,15681,14916,15646,15681,14321,15681
unique,15681,14799,12,3,3,12332,3,7
top,MSC_83537.0,Smile,Scale C,Female,Major,?,Band,Classical
freq,1,7,1771,5050,9210,1476,4843,3990


In [11]:
print('Unique values in each feature: ')
df.nunique()

Unique values in each feature: 


instance_id          15681
track_name           14799
popularity           12743
acousticness         15681
danceability         15125
duration_ms          14011
energy               14371
instrumentalness     11219
key                     12
liveness             15681
loudness             10473
voice_gender             3
mode                     3
speechiness          15655
tempo                12332
musician_category        3
valence              15004
music_genre              7
dtype: int64

In [12]:
print('Unique music_genres: ')
print(df['music_genre'].value_counts())

Unique music_genres: 
Classical     3990
Rock          3879
Rap           3207
Jazz          2850
Country        683
Electronic     614
Hip-Hop        458
Name: music_genre, dtype: int64


In [13]:
print(f'Number of records with duration= -1 :')
print(df[df['duration_ms']==-1].shape[0])

print(f'Number of records with duration= np.nan :')
print(df['duration_ms'].isna().sum())

Number of records with duration= -1 :
1577
Number of records with duration= np.nan :
94


In [14]:
print(f'Number of records with energy= -1 :')
print(df[df['energy']==-1].shape[0])

print(f'Number of records with duration= np.nan :')
print(df['energy'].isna().sum())

Number of records with energy= -1 :
1217
Number of records with duration= np.nan :
94


In [15]:
print('Loudness value count greater than 0: ')
print(df[df['loudness']>=0].shape[0])

Loudness value count greater than 0: 
7


## Observing data

In [16]:
print('Unqiue key values: ')
print(df['key'].unique())

Unqiue key values: 
['Scale E' 'Scale A' 'Scale D' 'Scale F Sharp' 'Scale G Sharp'
 'Scale C Sharp' 'Scale A Sharp' 'Scale F' 'Scale G' 'Scale B' 'Scale C'
 'Scale D Sharp']


In [17]:
print('Unique mode values: ')
print(df['mode'].value_counts())


Unique mode values: 
Major    9210
Minor    5357
?        1079
Name: mode, dtype: int64


In [18]:
print('Tempo records with `?`: ')
print(df[df['tempo']=='?'].shape[0])

Tempo records with `?`: 
1476


In [19]:
df[df['tempo']=='?']['music_genre'].value_counts()

Classical     396
Rock          328
Rap           325
Jazz          276
Electronic     58
Country        56
Hip-Hop        37
Name: music_genre, dtype: int64

In [20]:
print('Value counts of musician_category')
print(df['musician_category'].value_counts())

df[['track_name', 'musician_category']]

Value counts of musician_category
Band    4843
Solo    4784
Duet    4694
Name: musician_category, dtype: int64


Unnamed: 0,track_name,musician_category
0,Estrellitas y Duendes,Band
1,Al Norte,Band
2,Yeah! (feat. Lil Jon & Ludacris),Band
3,Can’t You See,Duet
4,"Sonata III (G Moll), BWV 1029: Adagio",Band
...,...,...
15676,I. Andante,Band
15677,Only Trust Your Heart,Duet
15678,Moment's Notice,
15679,Such Small Scenes,Duet


## Impute values using Pipeline where required

In [21]:
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer

In [22]:
# Convert duration in ms to duration in minutes
df['duration_mins'] = df['duration_ms'] / 60000

# Fill NA values of mode feature randomly with one of ['Major', 'Minor']
df['mode'].replace('?', np.nan, inplace= True)
df['mode'].fillna(random.choice(['Major', 'Minor']), inplace= True)

# Fill NA values of musician_category feature randomly with one of ['Band', 'Duet', 'Solo']
df['musician_category'].fillna(random.choice(['Band', 'Duet', 'Solo']), inplace= True)

df['voice_gender'].fillna('No', inplace= True)

In [23]:
# Change type of tempo feture
df['tempo'].replace('?', np.nan, inplace= True)
df['tempo'] = df['tempo'].astype('float')

In [24]:
fin_cols = [
 'popularity',
 'acousticness',
 'danceability',
 'energy',
 'instrumentalness',
 'key',
 'liveness',
 'loudness',
 'voice_gender',
 'mode',
 'speechiness',
 'tempo',
 'musician_category',
 'valence',
 'duration_mins'
]

In [25]:
list(zip(enumerate(df[fin_cols].columns.to_list())))

[((0, 'popularity'),),
 ((1, 'acousticness'),),
 ((2, 'danceability'),),
 ((3, 'energy'),),
 ((4, 'instrumentalness'),),
 ((5, 'key'),),
 ((6, 'liveness'),),
 ((7, 'loudness'),),
 ((8, 'voice_gender'),),
 ((9, 'mode'),),
 ((10, 'speechiness'),),
 ((11, 'tempo'),),
 ((12, 'musician_category'),),
 ((13, 'valence'),),
 ((14, 'duration_mins'),)]

In [26]:
# Column transformer for imputation
ct = ColumnTransformer(
[
    ('mean', SimpleImputer(strategy='mean'), [0, 4, 10, 14]),
    ('knn', KNNImputer(weights= 'distance'), [2, 3, 7, 11, 13])
],remainder='passthrough'
)

In [27]:
trans_df = ct.fit_transform(df[fin_cols])

In [28]:
trans_cols = ['popularity', 'instrumentalness', 'speechiness', 'duration_mins', 'danceability', 'energy', 'loudness', 'tempo', 'valence', 'acousticness', 'key', 'liveness', 'voice_gender', 'mode', 'musician_category']
fin_df = pd.DataFrame(trans_df, columns=trans_cols)
fin_df.sample(7)

Unnamed: 0,popularity,instrumentalness,speechiness,duration_mins,danceability,energy,loudness,tempo,valence,acousticness,key,liveness,voice_gender,mode,musician_category
291,61.39,0.0,0.04,4.1,0.58,0.68,-5.87,132.28,0.26,0.01,Scale A,0.16,Male,Major,Solo
8158,60.66,0.0,0.45,3.56,0.54,0.87,-5.55,138.15,0.48,0.07,Scale E,0.19,Both,Minor,Duet
3761,76.95,0.0,0.05,3.47,0.7,0.88,-3.81,128.07,0.49,0.03,Scale A Sharp,0.21,No,Minor,Duet
8931,35.18,0.0,0.03,8.75,0.51,0.28,-13.4,119.98,0.2,1.0,Scale D,0.15,Both,Major,Duet
14601,57.12,0.0,0.39,5.1,0.78,-1.0,-1.0,148.09,0.24,0.11,Scale G Sharp,0.16,Both,Minor,Duet
5263,76.54,0.0,0.03,5.38,0.8,-1.0,-14.22,112.75,0.53,0.02,Scale B,0.04,Female,Minor,Solo
14420,62.66,0.0,0.1,3.84,0.84,0.56,-7.2,129.97,0.29,0.27,Scale D Sharp,0.15,Male,Minor,Band


In [29]:
fin_missing_cols = fin_df.columns[fin_df.isnull().any()].to_list()
fin_missing_cols_count = fin_df[fin_missing_cols].isnull().sum()
print('Missing values: ')
fin_missing_cols_count

Missing values: 


Series([], dtype: float64)

## Modeling

In [30]:
import sklearn 
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb

In [31]:
# dictVectorizer for vectorizing data
fin_dict = fin_df.to_dict(orient= 'records')
dv= DictVectorizer(sparse=False)
fin_dv = dv.fit_transform(fin_dict)

In [32]:
from sklearn.feature_selection import mutual_info_classif

mi = mutual_info_classif(fin_dv, df['music_genre'], n_neighbors= 5, random_state= 42, discrete_features= 'auto')
mi_df = pd.DataFrame(mi, index= dv.get_feature_names(), columns=['mutual_info']).reset_index().sort_values(by='mutual_info', ascending= False)


In [33]:
mi_df[mi_df['mutual_info'] >=0.1]['index'].to_list()

['popularity',
 'acousticness',
 'loudness',
 'energy',
 'danceability',
 'instrumentalness',
 'speechiness',
 'valence',
 'tempo',
 'duration_mins']

In [34]:
from sklearn.preprocessing import LabelEncoder
lb= LabelEncoder()
train_df['genre_label'] = lb.fit_transform(train_df['music_genre'])

In [35]:
lb.classes_

array(['Classical', 'Country', 'Electronic', 'Hip-Hop', 'Jazz', 'Rap',
       'Rock'], dtype=object)

In [36]:
# Tuning parameter grid
xgb_cv_params = {
    'max_depth': [3,4,5,6,7], 
    'min_child_weight': [1,2,3,4],
    'subsample': [0.8,0.9,1], 
    'max_delta_step': [0,1,2,4],
    'learning_rate': [0.1, 0.2, 0.3, 0.4],
    'n_estimators' : [75, 100, 150],
#     'booster': ['gbtree', 'dart']
}   

# Parameters fixed
fix_params = {'objective': 'multi:softmax', 'eval_metric': 'aucpr', 'use_label_encoder': False, 'tree_method': 'gpu_hist'}  

# Create RandomizedSearchCV object
csv = RandomizedSearchCV(xgb.XGBClassifier(**fix_params), xgb_cv_params, scoring = 'f1_macro', cv = 4, verbose=2, n_jobs= 4, n_iter= 30, random_state=42) #, random_state=42


In [37]:
# # Perform randomized grid search
# csv.fit(fin_dv, train_df['genre_label'])

In [38]:
# model = csv.best_estimator_

In [54]:
fin_params = {'objective': 'multi:softprob',
 'use_label_encoder': False,
 'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'enable_categorical': False,
 'gamma': 0,
 'gpu_id': 0,
 'importance_type': None,
 'interaction_constraints': '',
 'learning_rate': 0.2,
 'max_delta_step': 0,
 'max_depth': 5,
 'min_child_weight': 3,
 'monotone_constraints': '()',
 'n_estimators': 150,
 'n_jobs': 8,
 'num_parallel_tree': 1,
 'predictor': 'auto',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': None,
 'subsample': 0.9,
 'tree_method': 'gpu_hist',
 'validate_parameters': 1,
 'verbosity': None,
 'eval_metric': 'aucpr',
  'num_class': 7       
             }

In [55]:
# Train final model
dtrain = xgb.DMatrix(fin_dv, train_df['genre_label'])
fin_xgb = xgb.train(fin_params, dtrain, num_boost_round = 200)

Parameters: { "enable_categorical", "n_estimators", "use_label_encoder" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




## Predictions on test set

### Imputations on test set

In [56]:
test =  std_col_names(pd.read_csv('./data/test.csv'))

In [57]:
test['duration_mins'] = test['duration_ms'] / 60000

test['mode'].replace('?', np.nan, inplace= True)
test['mode'].fillna(random.choice(['Major', 'Minor']), inplace= True)

test['musician_category'].fillna(random.choice(['Band', 'Duet', 'Solo']), inplace= True)

test['voice_gender'].fillna('No', inplace= True)

In [58]:
test['tempo'].replace('?', np.nan, inplace= True)
test['tempo'] = test['tempo'].astype('float')

In [59]:
trans_test = ct.transform(test[fin_cols])

In [60]:
trans_cols = ['popularity', 'instrumentalness', 'speechiness', 'duration_mins', 'danceability', 'energy', 'loudness', 'tempo', 'valence', 'acousticness', 'key', 'liveness', 'voice_gender', 'mode', 'musician_category']
fin_test = pd.DataFrame(trans_test, columns=trans_cols)
fin_test.sample(7)

fin_missing_cols = fin_test.columns[fin_test.isnull().any()].to_list()
fin_missing_cols_count = fin_test[fin_missing_cols].isnull().sum()
print('Missing values: ')
fin_missing_cols_count

Missing values: 


Series([], dtype: float64)

### Vectorizing test set

In [61]:
test_dict = fin_test.to_dict(orient= 'records')
test_dv = dv.transform(test_dict)
dtest = xgb.DMatrix(test_dv)

### Predicting using model

In [79]:
test_pred = fin_xgb.predict(dtest)
test_pred

In [81]:
fin_pred = np.argmax(test_pred, axis= 1)
fin_pred

In [84]:
# Obtain prediction
test_pred = lb.inverse_transform(fin_pred)

### Extracting for submission

In [90]:
def get_sub(arr, outfile= 'sub.csv'):
    """
    Convert predicted output to a dataframe and extract to local
    """
    out_df = pd.DataFrame({'instanceid': test['instance_id'], 'music_genre': test_pred})
    out_df.to_csv('./subs/'+outfile, index= False)
    return out_df

In [91]:
sub2 = get_sub(test_pred, 'sub2.csv')
sub2

Unnamed: 0,instanceid,music_genre
0,MSC_70753.0,Classical
1,MSC_24064.0,Rock
2,MSC_22731.0,Rock
3,MSC_32095.0,Rock
4,MSC_24198.0,Jazz
...,...,...
3916,MSC_80955.0,Classical
3917,MSC_72767.0,Rock
3918,MSC_40192.0,Classical
3919,MSC_56067.0,Country


## References
- [Replace invalid values](https://stackoverflow.com/questions/17097236/replace-invalid-values-with-none-in-pandas-dataframe)
- [Replace values randomly](https://stackoverflow.com/questions/36413314/filling-missing-data-by-random-choosing-from-non-missing-values-in-pandas-datafr)
- [Pandas: filling missing values by mean in each group](https://stackoverflow.com/questions/19966018/pandas-filling-missing-values-by-mean-in-each-group)