# 1. Import Packages and Libraries

In [2]:
import tensorflow as tf
from tensorflow import keras
from keras.layers import Embedding
import keras.backend as K
from keras.models import load_model
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
import xgboost

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix,recall_score

import scipy
import pandas as pd
import numpy as np
import gensim

import nltk
from nltk.data import find
import matplotlib.pyplot as plt
import shap

import matplotlib
import sklearn
import pickle
import random
import multiprocessing
import os
import sys

# 2. Read in Language Data

In [3]:
sample_dataset = pd.read_csv('Train_Test_Data/train.csv')[['Lyric','language label']]
test_dataset = pd.read_csv('Train_Test_Data/test.csv')[['Lyric','language label']]
print('Label Counts in Train Set')
display(sample_dataset['language label'].value_counts())
train_set = sample_dataset
val_set = test_dataset.iloc[:1517]
test_set = test_dataset.iloc[1517:]
test_set.index = np.arange(0,len(test_set))

Label Counts in Train Set


Spanish        2412
Portuguese     2405
English        2371
Kinyarwanda    1332
Italian        1156
French          968
German          700
Other           509
Finnish         114
Swedish          97
Romanian         74
Name: language label, dtype: int64

#### Resampled Version of Train Set for Non Class Weight Method of Dealing With Class Imbalance

In [4]:
random.seed(50)
max_class_counts = train_set['language label'].value_counts().iloc[0]
resampled_train_set = pd.DataFrame()
for lang in train_set['language label'].unique():
    subset = train_set[train_set['language label'] == lang].copy()
    if len(subset) == max_class_counts:
        resampled_train_set = pd.concat([resampled_train_set,subset],ignore_index=True)
    else:
        added_subset = subset.iloc[random.choices(np.arange(0,len(subset)),k=max_class_counts - len(subset))]
        resampled_train_set = pd.concat([resampled_train_set,subset,added_subset],ignore_index=True)
        
display(resampled_train_set)

Unnamed: 0,Lyric,language label
0,Varf&oumlr ska det vara så seri&oumlst f&oumlr...,Swedish
1,Intro:\n(What a group of kids we sent out into...,Swedish
2,"""Vem är Gud? (Vad är Gud?) ""\n""Det är en svår ...",Swedish
3,"vi sover på dagen,\nvi saknar tidsuppfattning,...",Swedish
4,"Honey, honey, underbara, aha, honey honey\nHon...",Swedish
...,...,...
26527,Trece timpul si inteleg ca trece\nDragostea da...,Romanian
26528,Astazi pe la 5 ma vad cu ea\nNu stiu ce m-aste...,Romanian
26529,"can you give me ,can you give me\n\nAstazi pe ...",Romanian
26530,I:\nLasa-ma sa-ti spun :\n'viata mea fara tine...,Romanian


# 3. Create Term Density Representation of train and val/test lyrics where terms are from non-other class lyrics

#### Preprocess Text, Create Vectorizer fit on non-other languages

In [5]:
def preprocess_text(text):
    text = text.lower()
    text = text.replace('\n', ' ')
    text = text.replace('  ',' ')
    return text

vectorizer = CountVectorizer(preprocessor=preprocess_text)
vectorizer.fit(train_set['Lyric'][train_set['language label'] != 'Other'])

CountVectorizer(preprocessor=<function preprocess_text at 0x7f8aa0aff0d0>)

In [5]:
pickle.dump(vectorizer,open('word_vectorizer.pkl','wb'))

#### Lyrics to Term Density, Featurization Function

In [6]:
def lyrics_to_term_density(text_df,vectorizer):
    lyrics = vectorizer.transform(text_df['Lyric'])
    lyrics = pd.DataFrame(lyrics.todense(),columns = vectorizer.get_feature_names())
    label = text_df['language label'].copy()
    label.index = np.arange(0,len(lyrics))
    lyrics.dropna(inplace=True)
    label = label.loc[lyrics.index]
    token_count = np.array(text_df['Lyric'].apply(lambda x:len(preprocess_text(x).split())))
    token_count = token_count.repeat(lyrics.shape[1])
    token_count = token_count.reshape(lyrics.shape)
    lyrics = (lyrics/token_count).astype('float32')
    lyrics = scipy.sparse.csr_matrix(lyrics)
    return lyrics,label

#### Featurize Lyrics, Train Set, Resampled Train Set, Val Set, Test Set

In [7]:
#Vectorize Train Lyrics
train_lyrics = vectorizer.transform(train_set['Lyric'])
train_lyrics = pd.DataFrame(train_lyrics.todense(),columns = vectorizer.get_feature_names())
train_lyrics_token_count = train_lyrics.sum(axis=1)
train_lyrics = train_lyrics/np.array(train_lyrics_token_count.repeat(len(train_lyrics.columns))).reshape(train_lyrics.shape)

#Oversampled Vectorize Train Lyrics
resampled_train_lyrics = vectorizer.transform(resampled_train_set['Lyric'])
resampled_train_lyrics = pd.DataFrame(resampled_train_lyrics.todense(),columns = vectorizer.get_feature_names())
resampled_train_lyrics_token_count = resampled_train_lyrics.sum(axis=1)
resampled_train_lyrics = resampled_train_lyrics/np.array(resampled_train_lyrics_token_count.repeat(len(resampled_train_lyrics.columns))).reshape(resampled_train_lyrics.shape)

#Vectorize Val Lyrics
val_lyrics = vectorizer.transform(val_set['Lyric'])
val_lyrics = pd.DataFrame(val_lyrics.todense(),columns = vectorizer.get_feature_names(),index=val_set.index)
val_lyrics_token_count = val_lyrics.sum(axis=1)
val_lyrics = val_lyrics/np.array(val_lyrics_token_count.repeat(len(val_lyrics.columns))).reshape(val_lyrics.shape)

#Vectorize Test Lyrics
test_lyrics = vectorizer.transform(test_set['Lyric'])
test_lyrics = pd.DataFrame(test_lyrics.todense(),columns = vectorizer.get_feature_names(),index=test_set.index)
test_lyrics_token_count = test_lyrics.sum(axis=1)
test_lyrics = test_lyrics/np.array(test_lyrics_token_count.repeat(len(test_lyrics.columns))).reshape(test_lyrics.shape)

resampled_train_labels = resampled_train_set['language label']
train_labels = train_set['language label']
val_labels = val_set['language label']
test_labels = test_set['language label']

#### Fillna

In [8]:
train_lyrics.astype('float32')
resampled_train_lyrics.astype('float32')
val_lyrics.astype('float32')
test_lyrics.astype('float32')

Unnamed: 0,00,000,0000,000000,00000000,00000002,0001,00011,00014,00020,...,ținem,еl,еm,еnnui,еt,еtt,йquateur,йternellement,оles,時間
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
train_lyrics.fillna(0,inplace=True)
resampled_train_lyrics.fillna(0,inplace=True)
val_lyrics.fillna(0,inplace=True)
test_lyrics.fillna(0,inplace=True)

# 4. ID Class Imbalance and ID Weights for Each Class

#### Class Imbalance

In [10]:
class_counts = train_set['language label'].value_counts()
class_counts

Spanish        2412
Portuguese     2405
English        2371
Kinyarwanda    1332
Italian        1156
French          968
German          700
Other           509
Finnish         114
Swedish          97
Romanian         74
Name: language label, dtype: int64

#### Class Weights

In [11]:
class_weights = class_counts.iloc[0]/class_counts
class_weights

Spanish         1.000000
Portuguese      1.002911
English         1.017292
Kinyarwanda     1.810811
Italian         2.086505
French          2.491736
German          3.445714
Other           4.738703
Finnish        21.157895
Swedish        24.865979
Romanian       32.594595
Name: language label, dtype: float64

#### Labels for Resampled Train Set

In [12]:
class_counts1 = resampled_train_set['language label'].value_counts()
class_counts1

Swedish        2412
French         2412
Kinyarwanda    2412
Spanish        2412
German         2412
Portuguese     2412
Italian        2412
Finnish        2412
English        2412
Other          2412
Romanian       2412
Name: language label, dtype: int64

#### Weights for Resampled Train Set

In [13]:
class_weights1 = class_counts1.iloc[0]/class_counts1
class_weights1

Swedish        1.0
French         1.0
Kinyarwanda    1.0
Spanish        1.0
German         1.0
Portuguese     1.0
Italian        1.0
Finnish        1.0
English        1.0
Other          1.0
Romanian       1.0
Name: language label, dtype: float64

#### Mapping Language to Numerical Label, Mapping Numerical Label to Weights

In [14]:
label_mapping = {}
weight_mapping = {}
count = 0
for index in class_counts.index:
    label_mapping[index] = count
    weight_mapping[count] = class_weights.loc[index]
    count = count + 1

In [15]:
label_mapping1 = {}
weight_mapping1 = {}
count1 = 0
for index in class_counts1.index:
    label_mapping1[index] = count
    weight_mapping1[count1] = class_weights1.loc[index]
    count1 = count1 + 1

# 5. Feed Forward Network For Language Detection

#### Custom Metric for Evaluating Performance - Average Class Recall

In [16]:
def class_recall(y_true,y_pred):
    #true labels
    true = y_true.numpy()
    #predicted prob of each class for each sample
    pred = y_pred.numpy()
    #prob to class based off max predicted prob
    pred = np.array([x.argmax() for x in pred])
    #confusion matrix
    confuse = confusion_matrix(true,pred)
    confuse_sum = confuse.sum(axis=1)
    score = 0
    for num in range(len(confuse_sum)):
        if confuse_sum[num]!=0:
            score = score + confuse[num][num]/confuse_sum[num]
    
    return score/len(confuse_sum)

#### Initialize FF Neural Architecture

In [17]:
def create_feed_forward_network(
                     shape=(1000,),
                     hidden_dim=[100,100,100],
                     dropout_rate=0.3,
                     hidden_layer_activation = 'relu',
                     output_layer_size = 4,
                     output_activation = 'softmax',
                     learning_rate=0.001,
                     metrics = ['accuracy']):
    """
    Construct the DAN model including the compilation and return it. Parametrize it using the arguments.
    hidden_dim = number of neurons in hidden layers
    dropout = dropout rate
    output_layer_size = # of neurons in output layer corresponding to # of classes, each neuron predicts P(class K | x)
    output_activation = activation function for output layer
    learning_rate = learning rate for gradient descent for finding model params to optimize loss
    """
    
    
    #Input Layer, sequence of max_sequence_length tokens
    input_layer = tf.keras.layers.Input(shape=shape,dtype='float32',name='input')    
    #input into hidden layers
    x = input_layer #hidden layer initial input
    count = 1
    for layer in hidden_dim:
        hidden = tf.keras.layers.Dense(layer,activation = hidden_layer_activation,name='hidden_' + str(count))(x)
        #dropout = tf.keras.layers.Dropout(dropout_rate,name='dropout_' + str(count))(hidden)
        count = count + 1
        x = hidden
        
    classification = tf.keras.layers.Dense(output_layer_size, activation='softmax', name='classification')(x)
    model = tf.keras.models.Model(inputs=input_layer, outputs=[classification])
    model.compile(loss=keras.losses.SparseCategoricalCrossentropy(),
                  optimizer=keras.optimizers.Adam(learning_rate=0.01),
                 metrics=metrics,
                 run_eagerly=True)
    print(model.summary())

    return model

#### Initialize and Train/Evaluate FF Neural Network to Detect Primary Music Language of Song Given Term Density

In [18]:
def eval_model(
    xtrain, xval, xtest, ytrain, yval, ytest, # Train/Val/Test Data
    class_weights, batch_size, epochs, # Attributes for Fit Method of Model
    patience, mode, #attributes for early stoppage
    savepath,#attributes for model checkpoints
    #Attributes for Model Architecture
    hidden_dim=[100,100,100],
    dropout_rate=0,
    hidden_layer_activation = 'relu',
    output_layer_size = 4,
    output_activation = 'softmax',
    learning_rate=0.001,
    metrics = ['accuracy'],
    opt_metric = 'class_recall',
    opt_func = class_recall):
    
    #Sparse to Dense Matrices
    xtr_dense = xtrain.copy()
    xva_dense = xval.copy()
    xte_dense = xtest.copy()
    
    tf.config.run_functions_eagerly(True)
    #Initialize Architecture
    model = create_feed_forward_network(shape=(xtr_dense.shape[1],),hidden_dim=hidden_dim,
                                        dropout_rate=dropout_rate,hidden_layer_activation=hidden_layer_activation,
                                        output_layer_size=output_layer_size,
                                        output_activation=output_activation,
                                        learning_rate=learning_rate,metrics=metrics)
    
    #Early Stoppage and Model Checkpoints Objects
    stoppage = keras.callbacks.EarlyStopping(monitor = 'val_' + opt_metric,verbose=1,patience=patience,mode=mode)
    checkpoint = keras.callbacks.ModelCheckpoint(savepath,monitor='val_' + opt_metric,save_best_only=True,mode=mode)
    
    #Fit Model on Training Data, iteratively evaluate on val data
    model.fit(xtr_dense,ytrain,
              validation_data=(xva_dense, yval),
              batch_size=batch_size,
              epochs=epochs,
              shuffle=True,
              class_weight = class_weights,
              callbacks = [stoppage,checkpoint],
              use_multiprocessing=True,workers=multiprocessing.cpu_count() - 8)
    
    #Final Evaluation of Optimal Model on Test Data
    final_model = load_model(savepath,custom_objects={opt_metric:opt_func})
    preds = final_model.predict(xte_dense)
    return preds

In [34]:
preds = eval_model(xtrain = np.array(train_lyrics), xval = np.array(val_lyrics), xtest = np.array(test_lyrics), 
           ytrain = train_labels.map(label_mapping),
           yval = val_labels.map(label_mapping),
           ytest = test_labels.map(label_mapping),
           class_weights = weight_mapping, batch_size=8, epochs = 30,
           patience=5,mode='max',savepath='language_detection_ff_tf.h5',
           hidden_dim=[100],dropout_rate=0.3,hidden_layer_activation='relu',
           output_layer_size=11,output_activation='softmax',
           learning_rate = 0.005,metrics=['accuracy',class_recall],opt_metric='class_recall')

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 121319)]          0         
                                                                 
 hidden_1 (Dense)            (None, 100)               12132000  
                                                                 
 classification (Dense)      (None, 11)                1111      
                                                                 
Total params: 12,133,111
Trainable params: 12,133,111
Non-trainable params: 0
_________________________________________________________________
None


Even though the `tf.config.experimental_run_functions_eagerly` option is set, this option does not apply to tf.data functions. To force eager execution of tf.data functions, please use `tf.data.experimental.enable_debug_mode()`.


Epoch 1/30

Even though the `tf.config.experimental_run_functions_eagerly` option is set, this option does not apply to tf.data functions. To force eager execution of tf.data functions, please use `tf.data.experimental.enable_debug_mode()`.


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 10: early stopping
 6/48 [==>...........................] - ETA: 0s

Even though the `tf.config.experimental_run_functions_eagerly` option is set, this option does not apply to tf.data functions. To force eager execution of tf.data functions, please use `tf.data.experimental.enable_debug_mode()`.




In [35]:
num_label_to_map = {}
for key,value in label_mapping.items():
    num_label_to_map[value] = key

In [36]:
test_results = pd.DataFrame(confusion_matrix(test_labels.map(label_mapping),np.array([x.argmax() for x in preds])))
test_results.index = [num_label_to_map[x] for x in test_results.index]
test_results.columns = test_results.index

In [37]:
test_results

Unnamed: 0,Spanish,Portuguese,English,Kinyarwanda,Italian,French,German,Other,Finnish,Swedish,Romanian
Spanish,285,0,1,0,0,0,0,0,0,0,0
Portuguese,0,319,0,0,0,0,0,0,0,0,0
English,2,0,302,0,0,0,0,4,0,0,0
Kinyarwanda,2,0,1,162,0,0,0,0,0,0,0
Italian,0,0,0,0,147,0,0,0,0,0,0
French,0,0,1,0,0,115,0,0,0,0,0
German,0,0,0,0,0,0,80,0,0,0,0
Other,2,1,10,1,3,0,0,42,1,0,0
Finnish,0,0,0,0,0,0,0,0,15,0,0
Swedish,0,0,0,0,0,0,0,0,0,8,0


In [38]:
test_results_class_recall = round(test_results/np.array(test_results.sum(axis=1).repeat(11)).reshape(11,11),3)

In [39]:
test_results_class_recall

Unnamed: 0,Spanish,Portuguese,English,Kinyarwanda,Italian,French,German,Other,Finnish,Swedish,Romanian
Spanish,0.997,0.0,0.003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Portuguese,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
English,0.006,0.0,0.981,0.0,0.0,0.0,0.0,0.013,0.0,0.0,0.0
Kinyarwanda,0.012,0.0,0.006,0.982,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Italian,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
French,0.0,0.0,0.009,0.0,0.0,0.991,0.0,0.0,0.0,0.0,0.0
German,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
Other,0.033,0.017,0.167,0.017,0.05,0.0,0.0,0.7,0.017,0.0,0.0
Finnish,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Swedish,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [40]:
print('Class Recall: ' + str(np.array(test_results_class_recall).diagonal().sum()/11))
print('Accuracy: ' + str(np.array(test_results).diagonal().sum()/np.array(test_results).sum()))

Class Recall: 0.9682727272727273
Accuracy: 0.980883322346737


In [41]:
pickle.dump(test_results,open('ff_model_test_results.pkl','wb'))
pickle.dump(test_results_class_recall,open('ff_model_test_class_recall.pkl','wb'))

In [42]:
np.array(train_lyrics)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# 6. DAN & WAN for Language Detection

#### Create Embedding Matrix

In [43]:
np.random.seed(50)
vocab = vectorizer.get_feature_names()
embedding_matrix = np.random.randn((len(vocab)+1)*300).reshape((len(vocab) + 1,300)) #Instantiate Embedding Matrix
embedding_matrix[-1] = 0
vocab_mapping = {}
count = 0
for word in vocab:
    vocab_mapping[word] = count
    count = count + 1

#### Map Words to Token For Each Example

In [44]:
def text_to_index(text_data,mapping,max_size):
    return_data = []
    for text in text_data:
        new_text = text.lower()
        new_text = text.replace('\n',' ')
        new_text = text.replace('  ',' ')
        new_text = new_text.split()
        mapped_text = []
        for token in new_text:
            try:
                mapped_text.append(mapping[token])
            except:
                mapped_text.append(len(mapping))
        
        if len(mapped_text) > max_size:
            mapped_text = mapped_text[:max_size]
        else:
            while len(mapped_text) < max_size:
                mapped_text.append(len(mapping))
                
        return_data.append(mapped_text)
    
    return return_data

In [45]:
train_tokens = text_to_index(train_set['Lyric'],vocab_mapping,1000)
val_tokens = text_to_index(val_set['Lyric'],vocab_mapping,1000)
test_tokens = text_to_index(test_set['Lyric'],vocab_mapping,1000)

#### DAN

In [46]:
def create_dan_model(retrain_embeddings=False, 
                     max_sequence_length=1000,
                     embedding_matrix=embedding_matrix, 
                     hidden_dim=[100,100,100],
                     dropout_rate=0.3,
                     hidden_layer_activation = 'relu',
                     output_layer_size = 4,
                     output_activation = 'softmax',
                     learning_rate=0.001):
    """
    Construct the DAN model including the compilation and return it. Parametrize it using the arguments.
    retrain_embeddings: bool, indicates whether embeddings are retrainable
    max_sequence_length: Number of token IDs to expect in a given input
    embedding_matrix: initialize embedding layer with embedding matrix, specifying weights
    hidden_dim = number of neurons in hidden layers
    dropout = dropout rate
    output_layer_size = # of neurons in output layer corresponding to # of classes, each neuron predicts P(class K | x)
    output_activation = activation function for output layer
    learning_rate = learning rate for gradient descent for finding model params to optimize loss
    """
    
    #Specify Embedding Layer, including shape, intialize with weights, expected input length, and whether it is trainable
    dan_embedding_layer = Embedding(embedding_matrix.shape[0],
                                  embedding_matrix.shape[1],
                                  weights = [embedding_matrix],
                                  input_length=max_sequence_length,
                                  trainable=retrain_embeddings,
                                   name = 'embedding_layer')
    
    
    #Input Layer, sequence of max_sequence_length tokens
    dan_input_layer = tf.keras.layers.Input(shape=(max_sequence_length,), dtype='int64',name='input')
    #Inputs go into embedding layer, form max_sequence_length x embedding dim matrix
    dan_embeddings = dan_embedding_layer(dan_input_layer)
    #Embeddings are averaged, forming single vector represenation of size embedding matrix
    dan_avg_input_embeddings = tf.keras.layers.Lambda(lambda x: K.mean(x, axis=1), name='averaging')(dan_embeddings)
    
    #input into hidden layers
    x = dan_avg_input_embeddings #hidden layer initial input
    count = 1
    for layer in hidden_dim:
        hidden = tf.keras.layers.Dense(layer,activation = hidden_layer_activation,name='hidden_' + str(count))(x)
        dropout = tf.keras.layers.Dropout(dropout_rate,name='dropout_' + str(count))(hidden)
        count = count + 1
        x = dropout
        
    #dan_hidden_out_1 = tf.keras.layers.Dense(hidden_dim, activation='relu', name='hidden_1')(dan_avg_input_embeddings)
    #dan_hidden_out_1 = tf.keras.layers.Dropout(dropout)(dan_hidden_out_1)
    dan_classification = tf.keras.layers.Dense(output_layer_size, activation=output_activation, name='dan_classification')(x)
    dan_model = tf.keras.models.Model(inputs=dan_input_layer, outputs=[dan_classification])
    dan_model.compile(loss=keras.losses.SparseCategoricalCrossentropy(),
                  optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate,
                                                beta_1=0.9,
                                                beta_2=0.999,
                                                epsilon=1e-07,
                                                amsgrad=False,
                                                name='Adam'),
                 metrics=['accuracy',class_recall])
    
    print(dan_model.summary())

    return dan_model

In [47]:
def train_eval_dan_model():
    #Early Stoppage and Model Checkpoints Objects
    stoppage = keras.callbacks.EarlyStopping(monitor = 'val_class_recall',verbose=1,patience=3,mode='max')
    checkpoint = keras.callbacks.ModelCheckpoint('language_detection_dan.h5',monitor='val_class_recall',save_best_only=True,mode=max)

    model = create_dan_model(output_layer_size=11,learning_rate=0.005,hidden_dim=[100],
                            retrain_embeddings=True)
    #Fit Model on Training Data, iteratively evaluate on val data
    model.fit(np.array(train_tokens),np.array(train_labels.map(label_mapping)),
              validation_data=(np.array(val_tokens), np.array(val_labels.map(label_mapping))),
              batch_size=8,
              epochs=10,
              shuffle=True,
              class_weight = weight_mapping,
              callbacks = [stoppage,checkpoint],
              use_multiprocessing=True,workers=multiprocessing.cpu_count() - 8)

    #Final Evaluation of Optimal Model on Test Data
    final_model = load_model('language_detection_dan.h5',custom_objects={'class_recall':class_recall})
    preds = final_model.predict(np.array(test_tokens))
    
    return preds

In [48]:
preds = train_eval_dan_model()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 1000)]            0         
                                                                 
 embedding_layer (Embedding)  (None, 1000, 300)        36396000  
                                                                 
 averaging (Lambda)          (None, 300)               0         
                                                                 
 hidden_1 (Dense)            (None, 100)               30100     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dan_classification (Dense)  (None, 11)                1111      
                                                                 
Total params: 36,427,211
Trainable params: 36,427,211
Non-t

Even though the `tf.config.experimental_run_functions_eagerly` option is set, this option does not apply to tf.data functions. To force eager execution of tf.data functions, please use `tf.data.experimental.enable_debug_mode()`.




Even though the `tf.config.experimental_run_functions_eagerly` option is set, this option does not apply to tf.data functions. To force eager execution of tf.data functions, please use `tf.data.experimental.enable_debug_mode()`.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
11/48 [=====>........................] - ETA: 0s

Even though the `tf.config.experimental_run_functions_eagerly` option is set, this option does not apply to tf.data functions. To force eager execution of tf.data functions, please use `tf.data.experimental.enable_debug_mode()`.




In [49]:
test_results = pd.DataFrame(confusion_matrix(test_labels.map(label_mapping),np.array([x.argmax() for x in preds])))
test_results.index = [num_label_to_map[x] for x in test_results.index]
test_results.columns = test_results.index

In [50]:
test_results

Unnamed: 0,Spanish,Portuguese,English,Kinyarwanda,Italian,French,German,Other,Finnish,Swedish,Romanian
Spanish,274,0,1,0,0,0,0,7,4,0,0
Portuguese,0,307,0,0,0,0,0,0,12,0,0
English,0,0,277,0,0,0,0,3,28,0,0
Kinyarwanda,2,0,1,159,0,0,0,1,2,0,0
Italian,0,0,0,0,142,1,0,0,3,0,1
French,0,0,0,0,0,114,0,0,2,0,0
German,0,0,0,0,0,0,77,0,3,0,0
Other,2,1,8,1,2,0,0,16,26,4,0
Finnish,0,0,0,0,0,0,0,0,15,0,0
Swedish,0,0,0,0,0,0,0,0,0,8,0


In [51]:
test_results_class_recall = round(test_results/np.array(test_results.sum(axis=1).repeat(11)).reshape(11,11),3)

In [52]:
test_results_class_recall

Unnamed: 0,Spanish,Portuguese,English,Kinyarwanda,Italian,French,German,Other,Finnish,Swedish,Romanian
Spanish,0.958,0.0,0.003,0.0,0.0,0.0,0.0,0.024,0.014,0.0,0.0
Portuguese,0.0,0.962,0.0,0.0,0.0,0.0,0.0,0.0,0.038,0.0,0.0
English,0.0,0.0,0.899,0.0,0.0,0.0,0.0,0.01,0.091,0.0,0.0
Kinyarwanda,0.012,0.0,0.006,0.964,0.0,0.0,0.0,0.006,0.012,0.0,0.0
Italian,0.0,0.0,0.0,0.0,0.966,0.007,0.0,0.0,0.02,0.0,0.007
French,0.0,0.0,0.0,0.0,0.0,0.983,0.0,0.0,0.017,0.0,0.0
German,0.0,0.0,0.0,0.0,0.0,0.0,0.962,0.0,0.038,0.0,0.0
Other,0.033,0.017,0.133,0.017,0.033,0.0,0.0,0.267,0.433,0.067,0.0
Finnish,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Swedish,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [53]:
print('Class Recall: ' + str(np.array(test_results_class_recall).diagonal().sum()/11))
print('Accuracy: ' + str(np.array(test_results).diagonal().sum()/np.array(test_results).sum()))

Class Recall: 0.9055454545454545
Accuracy: 0.9241924851680949


In [54]:
pickle.dump(test_results,open('dan_model_test_results.pkl','wb'))
pickle.dump(test_results_class_recall,open('dan_model_test_class_recall.pkl','wb'))

#### WAN

In [55]:
def create_wan_model(retrain_embeddings=False, 
                     max_sequence_length=1000,
                     embedding_matrix=embedding_matrix,
                     num_attention = 1,
                     hidden_dim=[100,100,100],
                     dropout_rate=0.3,
                     hidden_layer_activation = 'relu',
                     output_layer_size = 4,
                     output_activation = 'softmax',
                     learning_rate=0.001):
    """
    Construct the WAN model including the compilation and return it. Parametrize it using the arguments.
    retrain_embeddings: bool, indicates whether embeddings are retrainable
    max_sequence_length: Number of token IDs to expect in a given input
    embedding_matrix: initialize embedding layer with embedding matrix, specifying weights
    num_attention = number of parallel attention computations that learn how to balance embeddings into a single
    vector representation, final attention layer weights prior attention based representations
    hidden_dim = number of neurons in hidden layers
    dropout = dropout rate
    output_layer_size = # of neurons in output layer corresponding to # of classes, each neuron predicts P(class K | x)
    output_activation = activation function for output layer
    learning_rate = learning rate for gradient descent for finding model params to optimize loss
    """
    
    #Specify Embedding Layer, including shape, intialize with weights, expected input length, and whether it is trainable
    wan_embedding_layer = Embedding(embedding_matrix.shape[0],
                                  embedding_matrix.shape[1],
                                  weights = [embedding_matrix],
                                  input_length=max_sequence_length,
                                  trainable=retrain_embeddings,
                                   name = 'embedding_layer')
    
    
    #Input Layer, sequence of max_sequence_length tokens
    wan_input_layer = tf.keras.layers.Input(shape=(max_sequence_length,), dtype='int64',name='input')
    #Inputs go into embedding layer, form max_sequence_length x embedding dim matrix
    wan_embeddings = wan_embedding_layer(wan_input_layer)
    
    if num_attention > 1:
        #Create attention based single vector representations of words according to alternative query vectors
        attention_embeddings = []
        for num in range(num_attention):
            #Apply Query Vector to words in embeddings, returning a max_sequence_length x 1 tensor
            l1_query = tf.keras.layers.Dense(1,activation='linear',use_bias=False,name='attention_query' + str(num+1))(wan_embeddings)
            #reshape to 1 x max_sequence_length
            l1_reshape_query = tf.keras.layers.Reshape((1,max_sequence_length))(l1_query)
            #Softmax over query * key (words) to obtain weights
            l1_weights = tf.keras.layers.Lambda(lambda x:tf.keras.activations.softmax(x),
                                                name='attention_weights' + str(num+1))(l1_reshape_query)
            #weight embeddings according to weights
            l1_attention = tf.keras.layers.Flatten()(tf.keras.layers.Dot((1,2))((wan_embeddings,l1_weights)))
            attention_embeddings.append(l1_attention)

        concat_attention = tf.keras.layers.Concatenate()(attention_embeddings)
        concat_attention = tf.keras.layers.Reshape((num_attention,embedding_matrix.shape[1]))(concat_attention)
    else:
        concat_attention = wan_embeddings
        num_attention = max_sequence_length
    
    #Apply Query Vector to attention based representations, returning a num_attention x 1 tensor
    wan_query = tf.keras.layers.Dense(1,activation='linear',use_bias=False,name='attention_query')(concat_attention)
    #reshape to 1 x num_attention
    reshaped_query = tf.keras.layers.Reshape((1,num_attention))(wan_query)
    #Softmax over query * key (words) to obtain weights
    wan_weights = tf.keras.layers.Lambda(lambda x:tf.keras.activations.softmax(x),
                                        name='attention_weights')(reshaped_query)
    #weight attention embeddings according to weights, learning how to balance attention based vector representations 
    #from prior layer
    wan_attention = tf.keras.layers.Flatten()(tf.keras.layers.Dot((1,2))((concat_attention,wan_weights)))
    
    #input into hidden layers
    x = wan_attention #hidden layer initial input
    count = 1
    for layer in hidden_dim:
        hidden = tf.keras.layers.Dense(layer,activation = hidden_layer_activation,name='hidden_' + str(count))(x)
        dropout = tf.keras.layers.Dropout(dropout_rate,name='dropout_' + str(count))(hidden)
        count = count + 1
        x = dropout
        
    wan_classification = tf.keras.layers.Dense(output_layer_size, activation=output_activation, name='wan_classification')(x)
    wan_model = tf.keras.models.Model(inputs=wan_input_layer, outputs=[wan_classification])
    wan_model.compile(loss=keras.losses.SparseCategoricalCrossentropy(),
                  optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate,
                                                beta_1=0.9,
                                                beta_2=0.999,
                                                epsilon=1e-07,
                                                amsgrad=False,
                                                name='Adam'),
                 metrics=['accuracy',class_recall],
                     run_eagerly=True)
    
    print(wan_model.summary())

    return wan_model


In [56]:
def train_eval_wan_model():
    #Early Stoppage and Model Checkpoints Objects
    stoppage = keras.callbacks.EarlyStopping(monitor = 'val_class_recall',verbose=1,patience=3,mode='max')
    checkpoint = keras.callbacks.ModelCheckpoint('language_detection_wan.h5',monitor='val_class_recall',save_best_only=True,mode=max)

    model = create_wan_model(output_layer_size=11,learning_rate=0.005,hidden_dim=[100],
                            retrain_embeddings=True,num_attention=10)
    #Fit Model on Training Data, iteratively evaluate on val data
    model.fit(np.array(train_tokens),np.array(train_labels.map(label_mapping)),
              validation_data=(np.array(val_tokens), np.array(val_labels.map(label_mapping))),
              batch_size=8,
              epochs=10,
              shuffle=True,
              class_weight = weight_mapping,
              callbacks = [stoppage,checkpoint],
              use_multiprocessing=True,workers=multiprocessing.cpu_count() - 8)

    #Final Evaluation of Optimal Model on Test Data
    final_model = load_model('language_detection_wan.h5',custom_objects={'class_recall':class_recall})
    preds = final_model.predict(np.array(test_tokens))
    
    return preds

In [57]:
preds = train_eval_wan_model()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, 1000)]       0           []                               
                                                                                                  
 embedding_layer (Embedding)    (None, 1000, 300)    36396000    ['input[0][0]']                  
                                                                                                  
 attention_query1 (Dense)       (None, 1000, 1)      300         ['embedding_layer[0][0]']        
                                                                                                  
 attention_query2 (Dense)       (None, 1000, 1)      300         ['embedding_layer[0][0]']        
                                                                                            

 dot_5 (Dot)                    (None, 300, 1)       0           ['embedding_layer[0][0]',        
                                                                  'attention_weights6[0][0]']     
                                                                                                  
 dot_6 (Dot)                    (None, 300, 1)       0           ['embedding_layer[0][0]',        
                                                                  'attention_weights7[0][0]']     
                                                                                                  
 dot_7 (Dot)                    (None, 300, 1)       0           ['embedding_layer[0][0]',        
                                                                  'attention_weights8[0][0]']     
                                                                                                  
 dot_8 (Dot)                    (None, 300, 1)       0           ['embedding_layer[0][0]',        
          

Even though the `tf.config.experimental_run_functions_eagerly` option is set, this option does not apply to tf.data functions. To force eager execution of tf.data functions, please use `tf.data.experimental.enable_debug_mode()`.




Even though the `tf.config.experimental_run_functions_eagerly` option is set, this option does not apply to tf.data functions. To force eager execution of tf.data functions, please use `tf.data.experimental.enable_debug_mode()`.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 10: early stopping


Even though the `tf.config.experimental_run_functions_eagerly` option is set, this option does not apply to tf.data functions. To force eager execution of tf.data functions, please use `tf.data.experimental.enable_debug_mode()`.




In [58]:
test_results = pd.DataFrame(confusion_matrix(test_labels.map(label_mapping),np.array([x.argmax() for x in preds])))
test_results.index = [num_label_to_map[x] for x in test_results.index]
test_results.columns = test_results.index

In [59]:
test_results

Unnamed: 0,Spanish,Portuguese,English,Kinyarwanda,Italian,French,German,Other,Finnish,Swedish,Romanian
Spanish,261,1,2,0,0,0,0,13,9,0,0
Portuguese,6,285,0,0,0,1,0,12,15,0,0
English,0,0,241,1,24,1,0,17,21,0,3
Kinyarwanda,2,0,2,154,4,0,0,3,0,0,0
Italian,0,1,1,0,132,1,0,7,5,0,0
French,1,2,6,0,0,103,0,1,3,0,0
German,0,0,1,0,0,0,56,20,3,0,0
Other,1,1,13,1,2,0,0,14,27,0,1
Finnish,0,0,0,1,0,0,0,4,10,0,0
Swedish,0,0,0,0,0,0,0,0,0,8,0


In [60]:
test_results_class_recall = round(test_results/np.array(test_results.sum(axis=1).repeat(11)).reshape(11,11),3)

In [61]:
test_results_class_recall

Unnamed: 0,Spanish,Portuguese,English,Kinyarwanda,Italian,French,German,Other,Finnish,Swedish,Romanian
Spanish,0.913,0.003,0.007,0.0,0.0,0.0,0.0,0.045,0.031,0.0,0.0
Portuguese,0.019,0.893,0.0,0.0,0.0,0.003,0.0,0.038,0.047,0.0,0.0
English,0.0,0.0,0.782,0.003,0.078,0.003,0.0,0.055,0.068,0.0,0.01
Kinyarwanda,0.012,0.0,0.012,0.933,0.024,0.0,0.0,0.018,0.0,0.0,0.0
Italian,0.0,0.007,0.007,0.0,0.898,0.007,0.0,0.048,0.034,0.0,0.0
French,0.009,0.017,0.052,0.0,0.0,0.888,0.0,0.009,0.026,0.0,0.0
German,0.0,0.0,0.012,0.0,0.0,0.0,0.7,0.25,0.038,0.0,0.0
Other,0.017,0.017,0.217,0.017,0.033,0.0,0.0,0.233,0.45,0.0,0.017
Finnish,0.0,0.0,0.0,0.067,0.0,0.0,0.0,0.267,0.667,0.0,0.0
Swedish,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [62]:
print('Class Recall: ' + str(np.array(test_results_class_recall).diagonal().sum()/11))
print('Accuracy: ' + str(np.array(test_results).diagonal().sum()/np.array(test_results).sum()))

Class Recall: 0.7887272727272727
Accuracy: 0.8398154251812788


In [63]:
pickle.dump(test_results,open('wan_model_test_results.pkl','wb'))
pickle.dump(test_results_class_recall,open('wan_model_test_class_recall.pkl','wb'))

In [18]:
ff_model = load_model('language_detection_ff_tf.h5',custom_objects={'class_recall':class_recall})

2022-07-17 20:39:46.805688: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [66]:
input_text = val_set[val_set['language label'] == 'Kinyarwanda'].iloc[2,0]
input_text = vectorizer.transform([input_text]).todense()
input_text = input_text/input_text.sum()
reverse_mapping[ff_model.predict(input_text).argmax()]



'Kinyarwanda'

In [24]:
input_text.sum()

4

In [28]:
label_mapping

{'Spanish': 0,
 'Portuguese': 1,
 'English': 2,
 'Kinyarwanda': 3,
 'Italian': 4,
 'French': 5,
 'German': 6,
 'Other': 7,
 'Finnish': 8,
 'Swedish': 9,
 'Romanian': 10}

In [63]:
val_set[val_set['language label'] == 'Kinyarwanda'].iloc[0,0]

"Look around, what a lovely day\nThere's feelin' we'll have a great day\n\nThe sky is blue as it can ever be\nOh, how the breezy wind relaxes me\n\nItsumo ijou ni agaridasu tenshon wo I can't help stop it\nOsaekirenai machikirenai special day\n\nOoh baby let's go\nDriving, driving motto\nDriving, driving (oh yeah)\nVolume agete (let's go) your favorite music\n\nDriving, driving kyou wa\nDriving, driving (oh yeah)\nTanoshimou futari no free time\n\nAtemonaku hashirou kono highway wo tobashite\nHikaru kono umi wo koe tadoritsuku no secret zone\n\nItsumo ijou ni hashaijau watashi wo I can't help stop it\nYasashiku uketomete mou hajimatteru sweet time\n\nOoh baby let's go\nDriving, driving motto\nDriving, driving (oh yeah)\nVolume agete (let's go) your favorite music\n\nDriving, driving kyou wa\nDriving, driving (oh yeah)\nDare mo shiranai (let's go) secret time\n\nOoh baby let's go\nDriving, driving motto\nDriving, driving (oh yeah)\nTanoshimou futari no let's go special day\n\nDriving, d