# 1. Import Packages and Libraries

In [2]:
import tensorflow as tf
from tensorflow import keras
from keras.layers import Embedding
import keras.backend as K
from keras.models import load_model
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
import xgboost

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix,recall_score

import scipy
import pandas as pd
import numpy as np
import gensim

import nltk
from nltk.data import find
import matplotlib.pyplot as plt
import shap

import matplotlib
import sklearn
import pickle
import random
import multiprocessing
import os
import sys

# 2. Read in Language Data

In [2]:
# Load the train and held-out CSVs, keeping only the lyric text and its language label.
# `sample_dataset` and `train_set` are two names for the same frame.
train_set = sample_dataset = pd.read_csv('Train_Test_Data/train.csv')[['Lyric','language label']]
test_dataset = pd.read_csv('Train_Test_Data/test.csv')[['Lyric','language label']]

print('Label Counts in Train Set')
display(train_set['language label'].value_counts())

# The held-out file is split by row position: the first 1517 rows are used for
# validation and the remaining rows for testing.
val_set = test_dataset.iloc[:1517]
test_set = test_dataset.iloc[1517:]
# Renumber the test rows so that they start at 0.
test_set.index = np.arange(0, len(test_set))

Label Counts in Train Set


Spanish        2405
Portuguese     2372
English        2359
Kinyarwanda    1366
Italian        1162
French          999
German          698
Other           498
Finnish         111
Swedish          90
Romanian         78
Name: language label, dtype: int64

#### Resampled Version of Train Set for Non Class Weight Method of Dealing With Class Imbalance

In [3]:
# Oversample every smaller language class (sampling rows with replacement) up to
# the size of the largest class, so all classes end up equally represented.
random.seed(50)  # fixed seed so the same rows are duplicated on every run
max_class_counts = train_set['language label'].value_counts().iloc[0]

# Collect the pieces and concatenate once at the end: calling pd.concat inside
# the loop re-copies the accumulated frame on every iteration.
resampled_parts = []
for lang in train_set['language label'].unique():
    subset = train_set[train_set['language label'] == lang].copy()
    resampled_parts.append(subset)
    shortfall = max_class_counts - len(subset)
    if shortfall > 0:
        # Row positions drawn with replacement from this language's lyrics.
        extra_positions = random.choices(np.arange(0, len(subset)), k=shortfall)
        added_subset = subset.iloc[extra_positions]
        resampled_parts.append(added_subset)
resampled_train_set = pd.concat(resampled_parts, ignore_index=True)

display(resampled_train_set)

Unnamed: 0,Lyric,language label
0,Teu olhar\nFez cinema mim\nEm cada sessão\nFui...,Portuguese
1,"Tristeza, por favor vá embora\nMinha alma que ...",Portuguese
2,"Só procurou, sempre encontrou\nAmores tontos, ...",Portuguese
3,Cruzei uma doida - Charlie Brown Jr\n\nCruzei ...,Portuguese
4,Como as cores num retrato o tempo insiste em d...,Portuguese
...,...,...
26450,Hednaorden\n\nI ekot av fäders stolthet\nVår h...,Swedish
26451,Utmed brådkalla rännilars fors och fall\nYrväd...,Swedish
26452,"Inciklad av stjärnhärars krestsande fält,\nfrå...",Swedish
26453,När jag tänker på den ständiga resan genom liv...,Swedish


# 3. Create Term Density Representation of train and val/test lyrics where terms are from non-other class lyrics

#### Preprocess Text, Create Vectorizer fit on non-other languages

In [4]:
def preprocess_text(text):
    """Lower-case a lyric and flatten it onto a single line.

    Newlines are turned into spaces and every run of consecutive spaces is
    collapsed to exactly one space.

    Parameters
    ----------
    text : str
        Raw lyric text.

    Returns
    -------
    str
        The normalised text.
    """
    text = text.lower().replace('\n', ' ')
    # One replace('  ', ' ') pass only shortens each run of spaces (three spaces
    # still leave two behind), so repeat until no double space remains.
    while '  ' in text:
        text = text.replace('  ', ' ')
    return text

# Learn the vocabulary from every training lyric whose label is not 'Other';
# each lyric is passed through preprocess_text before tokenisation.
vectorizer = CountVectorizer(preprocessor=preprocess_text)
vectorizer.fit(train_set.loc[train_set['language label'] != 'Other', 'Lyric'])

CountVectorizer(preprocessor=<function preprocess_text at 0x7fdd0d549700>)

#### Lyrics to Term Density, Featurization Function

In [5]:
def lyrics_to_term_density(text_df,vectorizer):
    """Convert lyrics into sparse term densities (term count / token count).

    The count matrix stays sparse throughout; only its stored (non-zero)
    entries are divided by the whitespace-token count of their lyric, so no
    dense documents-by-vocabulary array is ever materialised.

    Parameters
    ----------
    text_df : pandas.DataFrame
        Must contain a 'Lyric' column (text) and a 'language label' column.
    vectorizer : fitted vectorizer
        Object whose ``transform`` returns a sparse document-term count matrix.

    Returns
    -------
    lyrics : scipy.sparse.csr_matrix
        float32 matrix with one row per lyric and one column per vocabulary term.
    label : pandas.Series
        Copy of ``text_df['language label']`` re-indexed 0..n-1 so that it lines
        up with the rows of ``lyrics``.
    """
    counts = vectorizer.transform(text_df['Lyric']).tocsr().astype('float64')

    # Tokens per lyric, counted on the preprocessed text split on whitespace.
    token_count = np.array(
        text_df['Lyric'].apply(lambda x: len(preprocess_text(x).split())),
        dtype='float64')
    # Guard against a zero divisor: a lyric with no tokens keeps an all-zero
    # row instead of being filled with NaN.
    safe_token_count = np.where(token_count > 0, token_count, 1.0)

    # In CSR layout the stored values of row i occupy data[indptr[i]:indptr[i+1]],
    # so repeating each row's divisor by its number of stored values lines the
    # divisors up with `data`.
    stored_per_row = np.diff(counts.indptr)
    counts.data /= np.repeat(safe_token_count, stored_per_row)
    lyrics = scipy.sparse.csr_matrix(counts.astype('float32'))

    label = text_df['language label'].copy()
    label.index = np.arange(0,len(label))
    return lyrics,label

#### Featurize Lyrics, Train Set, Resampled Train Set, Val Set, Test Set

In [6]:
def _vocabulary_terms(fitted_vectorizer):
    """Return the fitted vocabulary in column order on old and new scikit-learn."""
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
    # in favour of get_feature_names_out(); prefer the new name when it exists.
    if hasattr(fitted_vectorizer, 'get_feature_names_out'):
        return fitted_vectorizer.get_feature_names_out()
    return fitted_vectorizer.get_feature_names()


def _term_density_frame(text_df, index=None):
    """Vectorize ``text_df['Lyric']`` with the notebook-level ``vectorizer``.

    Returns ``(density, token_count)`` where ``token_count`` is the number of
    in-vocabulary tokens per lyric (row sum of the counts) and ``density`` is
    each count divided by its row's ``token_count``. ``index`` becomes the row
    index of both results; ``None`` gives the default 0..n-1 index.
    """
    counts = pd.DataFrame(vectorizer.transform(text_df['Lyric']).todense(),
                          columns=_vocabulary_terms(vectorizer), index=index)
    token_count = counts.sum(axis=1)
    # Row-wise division; a lyric with no in-vocabulary tokens gives 0/0 = NaN.
    return counts.div(token_count, axis=0), token_count


#Vectorize Train Lyrics
train_lyrics, train_lyrics_token_count = _term_density_frame(train_set)

#Oversampled Vectorize Train Lyrics
resampled_train_lyrics, resampled_train_lyrics_token_count = _term_density_frame(resampled_train_set)

#Vectorize Val Lyrics (rows keep val_set's index)
val_lyrics, val_lyrics_token_count = _term_density_frame(val_set, index=val_set.index)

#Vectorize Test Lyrics (rows keep test_set's index)
test_lyrics, test_lyrics_token_count = _term_density_frame(test_set, index=test_set.index)

#Labels matching each feature frame
resampled_train_labels = resampled_train_set['language label']
train_labels = train_set['language label']
val_labels = val_set['language label']
test_labels = test_set['language label']

#### Cast Features to float32 and Fill NaN Values with 0

In [7]:
# DataFrame.astype returns a new frame and leaves the original untouched, so
# the result has to be assigned back for the float32 down-cast to take effect.
train_lyrics = train_lyrics.astype('float32')
resampled_train_lyrics = resampled_train_lyrics.astype('float32')
val_lyrics = val_lyrics.astype('float32')
test_lyrics = test_lyrics.astype('float32')

# Show the converted test features as the cell output.
test_lyrics

Unnamed: 0,00,000,0000,000000,00000000,00000002,0001,00011,00012,00015,...,心の鱗を剥がそう,正解も不正解もないシーツの波,沈んでいく,泳いだ,溺れていく,激しく真実だけ抱いてほしい,覗いてくれ,話せない大切な弱さを,露に青く透き通った,飾りはいいよ
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# A lyric whose token count is zero was divided by zero during featurization
# and came out as NaN; store 0 for every term in those rows instead.
train_lyrics = train_lyrics.fillna(0)
resampled_train_lyrics = resampled_train_lyrics.fillna(0)
val_lyrics = val_lyrics.fillna(0)
test_lyrics = test_lyrics.fillna(0)

# 4. Identify Class Imbalance and Compute Weights for Each Class

#### Class Imbalance

In [9]:
# Number of training lyrics per language, most frequent language first.
class_counts = train_set.loc[:, 'language label'].value_counts()
class_counts

Spanish        2405
Portuguese     2372
English        2359
Kinyarwanda    1366
Italian        1162
French          999
German          698
Other           498
Finnish         111
Swedish          90
Romanian         78
Name: language label, dtype: int64

#### Class Weights

In [10]:
# Weight of a class = (count of the first, i.e. most frequent, class) / (its own count),
# so the largest class gets weight 1 and rarer classes get proportionally more.
class_weights = class_counts.rdiv(class_counts.iloc[0])
class_weights

Spanish         1.000000
Portuguese      1.013912
English         1.019500
Kinyarwanda     1.760615
Italian         2.069707
French          2.407407
German          3.445559
Other           4.829317
Finnish        21.666667
Swedish        26.722222
Romanian       30.833333
Name: language label, dtype: float64

#### Class Counts for Resampled Train Set

In [11]:
# Number of lyrics per language in the oversampled training set.
class_counts1 = resampled_train_set.loc[:, 'language label'].value_counts()
class_counts1

Portuguese     2405
English        2405
Italian        2405
German         2405
Kinyarwanda    2405
Spanish        2405
Finnish        2405
French         2405
Other          2405
Romanian       2405
Swedish        2405
Name: language label, dtype: int64

#### Weights for Resampled Train Set

In [12]:
# Same weighting rule applied to the oversampled set: first class count divided
# by each class count.
class_weights1 = class_counts1.rdiv(class_counts1.iloc[0])
class_weights1

Portuguese     1.0
English        1.0
Italian        1.0
German         1.0
Kinyarwanda    1.0
Spanish        1.0
Finnish        1.0
French         1.0
Other          1.0
Romanian       1.0
Swedish        1.0
Name: language label, dtype: float64

#### Mapping Language to Numerical Label, Mapping Numerical Label to Weights

In [13]:
# Language name -> integer code, numbered in the order of class_counts.index.
label_mapping = {language: code for code, language in enumerate(class_counts.index)}
# Integer code -> class weight of the language that owns that code.
weight_mapping = {code: class_weights.loc[language] for language, code in label_mapping.items()}
# Total number of codes handed out; kept under the name of the original loop counter.
count = len(label_mapping)

In [14]:
# Language -> integer code and code -> class weight for the resampled train set.
# NOTE(review): codes follow the order of class_counts1.index, which can differ
# from the order used for label_mapping -- confirm before mixing the two mappings.
label_mapping1 = {}
weight_mapping1 = {}
count1 = 0
for index in class_counts1.index:
    # Must be this cell's own counter `count1`; the `count` left over from
    # building label_mapping is constant here and would give every language
    # the same code.
    label_mapping1[index] = count1
    weight_mapping1[count1] = class_weights1.loc[index]
    count1 = count1 + 1

# 5. Feed Forward Network For Language Detection

#### Custom Metric for Evaluating Performance - Average Class Recall

In [4]:
def class_recall(y_true,y_pred):
    """Average per-class recall (macro recall) over one batch.

    Parameters
    ----------
    y_true : object exposing ``.numpy()``
        True integer class labels, one per sample.
    y_pred : object exposing ``.numpy()``
        Predicted scores per sample, one column per class; the predicted class
        is the column with the highest score.

    Returns
    -------
    float
        Mean of the recalls of the classes that occur in ``y_true``. A class
        that shows up only in the predictions has no defined recall and is left
        out of the average instead of being counted in the denominator as a
        zero. Returns 0.0 for an empty batch.
    """
    #true labels, flattened to one value per sample
    true = np.asarray(y_true.numpy()).reshape(-1)
    if true.size == 0:
        return 0.0
    #predicted class = position of the highest score in each row
    pred = np.asarray(y_pred.numpy()).argmax(axis=-1).reshape(-1)

    #recall of each class that has at least one true sample in the batch
    recalls = [np.mean(pred[true == label] == label) for label in np.unique(true)]
    return float(np.mean(recalls))

#### Initialize FF Neural Architecture

In [16]:
def create_feed_forward_network(
                     shape=(1000,),
                     hidden_dim=[100,100,100],
                     dropout_rate=0.3,
                     hidden_layer_activation = 'relu',
                     output_layer_size = 4,
                     output_activation = 'softmax',
                     learning_rate=0.001,
                     metrics = ['accuracy']):
    """
    Build and compile a fully connected feed-forward classifier and return it.

    shape = shape of one input sample (a 1-tuple holding the feature count)
    hidden_dim = sequence with the number of neurons of each hidden Dense layer, in order
    dropout_rate = accepted for interface compatibility but currently unused:
                   no Dropout layer is added to the network
    hidden_layer_activation = activation function of every hidden layer
    output_layer_size = # of neurons in output layer corresponding to # of classes,
                        each neuron predicts P(class K | x)
    output_activation = activation function for output layer; the loss is
                        SparseCategoricalCrossentropy with its default settings,
                        which expects probabilities (e.g. 'softmax')
    learning_rate = learning rate of the Adam optimizer
    metrics = metrics passed to model.compile

    Returns the compiled tf.keras Model. The model is compiled with
    run_eagerly=True so metric functions receive eager tensors.
    """
    #Input layer: one float32 feature vector per sample
    input_layer = tf.keras.layers.Input(shape=shape,dtype='float32',name='input')

    #Stack the hidden layers, each one fed by the previous layer's output
    x = input_layer
    for count, layer in enumerate(hidden_dim, start=1):
        x = tf.keras.layers.Dense(layer,activation = hidden_layer_activation,name='hidden_' + str(count))(x)

    #Output layer honours the requested activation (previously fixed to 'softmax')
    classification = tf.keras.layers.Dense(output_layer_size, activation=output_activation, name='classification')(x)
    model = tf.keras.models.Model(inputs=input_layer, outputs=[classification])
    model.compile(loss=keras.losses.SparseCategoricalCrossentropy(),
                  #use the caller's learning rate (previously fixed to 0.01)
                  optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
                  #copy so the shared default list is never handed to Keras directly
                  metrics=list(metrics),
                  run_eagerly=True)
    #summary() prints the table itself and returns None, so it is not wrapped in print()
    model.summary()

    return model

#### Initialize and Train/Evaluate FF Neural Network to Detect Primary Music Language of Song Given Term Density

In [17]:
def eval_model(
    xtrain, xval, xtest, ytrain, yval, ytest, # Train/Val/Test Data
    class_weights, batch_size, epochs, # Attributes for Fit Method of Model
    patience, mode, #attributes for early stoppage
    savepath,#attributes for model checkpoints
    #Attributes for Model Architecture
    hidden_dim=[100,100,100],
    dropout_rate=0,
    hidden_layer_activation = 'relu',
    output_layer_size = 4,
    output_activation = 'softmax',
    learning_rate=0.001,
    metrics = ['accuracy'],
    opt_metric = 'class_recall',
    opt_func = class_recall):
    """
    Train a feed-forward network, checkpoint the best epoch, and predict on the test set.

    xtrain, xval, xtest = feature matrices; xtrain.shape[1] sets the input width
    ytrain, yval = integer class labels for the train and validation features
    ytest = accepted for interface symmetry; not used inside this function
    class_weights = mapping passed to model.fit as class_weight
    batch_size, epochs = passed to model.fit
    patience, mode = EarlyStopping settings; mode is also used by ModelCheckpoint
    savepath = file the best model (by 'val_' + opt_metric) is saved to and reloaded from
    hidden_dim ... metrics = forwarded to create_feed_forward_network
    opt_metric = name of the monitored metric (without the 'val_' prefix)
    opt_func = function registered under opt_metric when the saved model is reloaded

    Returns the predictions of the reloaded best model on xtest.
    """
    #The inputs are passed to Keras as given: nothing here modifies them, so no
    #defensive copies of the feature matrices are made.

    #Metric functions that call .numpy() on their inputs (like the default
    #opt_func) need eager execution. This is a process-wide TensorFlow setting.
    tf.config.run_functions_eagerly(True)

    #Initialize Architecture
    model = create_feed_forward_network(shape=(xtrain.shape[1],),hidden_dim=hidden_dim,
                                        dropout_rate=dropout_rate,hidden_layer_activation=hidden_layer_activation,
                                        output_layer_size=output_layer_size,
                                        output_activation=output_activation,
                                        learning_rate=learning_rate,metrics=metrics)

    #Early Stoppage and Model Checkpoints Objects, both watching the same validation metric
    monitored = 'val_' + opt_metric
    stoppage = keras.callbacks.EarlyStopping(monitor = monitored,verbose=1,patience=patience,mode=mode)
    checkpoint = keras.callbacks.ModelCheckpoint(savepath,monitor=monitored,save_best_only=True,mode=mode)

    #Fit Model on Training Data, iteratively evaluate on val data.
    #workers/use_multiprocessing are not passed: Keras documents them as applying
    #to generator / Sequence inputs only, and newer Keras releases no longer
    #accept them.
    model.fit(xtrain,ytrain,
              validation_data=(xval, yval),
              batch_size=batch_size,
              epochs=epochs,
              shuffle=True,
              class_weight = class_weights,
              callbacks = [stoppage,checkpoint])

    #Final Evaluation of Optimal Model on Test Data
    final_model = load_model(savepath,custom_objects={opt_metric:opt_func})
    preds = final_model.predict(xtest)
    return preds

In [18]:
# Train on the original training features with per-class weights, select the
# best epoch by validation class_recall, and keep the test-set predictions.
preds = eval_model(
    xtrain=np.array(train_lyrics),
    xval=np.array(val_lyrics),
    xtest=np.array(test_lyrics),
    ytrain=train_labels.map(label_mapping),
    yval=val_labels.map(label_mapping),
    ytest=test_labels.map(label_mapping),
    class_weights=weight_mapping,
    batch_size=8,
    epochs=30,
    patience=5,
    mode='max',
    savepath='language_detection_ff_tf.h5',
    hidden_dim=[100],
    dropout_rate=0.3,
    hidden_layer_activation='relu',
    output_layer_size=11,
    output_activation='softmax',
    learning_rate=0.01,
    metrics=['accuracy', class_recall],
    opt_metric='class_recall',
)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 122970)]          0         
                                                                 
 hidden_1 (Dense)            (None, 100)               12297100  
                                                                 
 classification (Dense)      (None, 11)                1111      
                                                                 
Total params: 12,298,211
Trainable params: 12,298,211
Non-trainable params: 0
_________________________________________________________________


2022-07-12 01:10:03.403877: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


None
Epoch 1/30


Even though the `tf.config.experimental_run_functions_eagerly` option is set, this option does not apply to tf.data functions. To force eager execution of tf.data functions, please use `tf.data.experimental.enable_debug_mode()`.




Even though the `tf.config.experimental_run_functions_eagerly` option is set, this option does not apply to tf.data functions. To force eager execution of tf.data functions, please use `tf.data.experimental.enable_debug_mode()`.


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 19: early stopping
 3/48 [>.............................] - ETA: 1s

Even though the `tf.config.experimental_run_functions_eagerly` option is set, this option does not apply to tf.data functions. To force eager execution of tf.data functions, please use `tf.data.experimental.enable_debug_mode()`.




In [33]:
# Confusion matrix on the test set: true integer codes against the class with
# the highest predicted score in each row of preds.
confusion_matrix(test_labels.map(label_mapping), preds.argmax(axis=1))

array([[294,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0, 314,   0,   0,   1,   0,   0,   0,   0,   0,   0],
       [  1,   0, 322,   1,   0,   0,   0,   3,   0,   0,   0],
       [  1,   0,   1, 161,   0,   0,   0,   1,   0,   0,   0],
       [  1,   0,   2,   0, 135,   0,   0,   0,   0,   0,   0],
       [  0,   0,   1,   0,   0, 110,   0,   0,   0,   0,   0],
       [  0,   0,   2,   0,   0,   0,  67,   0,   0,   0,   0],
       [  1,   1,   6,   2,   1,   0,   0,  50,   0,   0,   1],
       [  0,   0,   1,   0,   0,   0,   0,   0,  16,   0,   0],
       [  0,   0,   1,   0,   0,   0,   0,   0,   0,  10,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   9]])