# 1. Import Packages and Libraries

In [4]:
import tensorflow as tf
from tensorflow import keras
from keras.layers import Embedding
import keras.backend as K
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
import xgboost

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix

import scipy
import pandas as pd
import numpy as np
import gensim

import nltk
from nltk.data import find
import matplotlib.pyplot as plt
import shap

import matplotlib
import sklearn
import pickle
import random
import multiprocessing

In [5]:
print('tf ' + tf.__version__)
print('sklearn ' + sklearn.__version__)
print('xgboost ' + xgboost.__version__)
print('nltk ' + nltk.__version__)
print('pd ' + pd.__version__)
print('np ' + np.__version__)
print('shap ' + shap.__version__)
print('mpl ' + matplotlib.__version__)
print('scipy ' + scipy.__version__)
print('gensim ' + gensim.__version__)

tf 2.9.1
sklearn 0.24.2
xgboost 1.6.1
nltk 3.7
pd 1.3.4
np 1.20.3
shap 0.41.0
mpl 3.4.3
scipy 1.7.1
gensim 4.2.0


# 2. Read in Dataset + Create Train/Test Set

In [7]:
sample_dataset = pd.read_csv('Language_Detection/Train_Test_Data/train.csv')[['Lyric','language label']]
test_dataset = pd.read_csv('Language_Detection/Train_Test_Data/test.csv')[['Lyric','language label']]
print('Label Counts in Train Set')
display(sample_dataset['language label'].value_counts())
train_set = sample_dataset
val_set = test_dataset.iloc[:1418]
test_set = test_dataset.iloc[1418:]

Label Counts in Train Set


English       2145
Portuguese    2144
Spanish       2141
Other         2097
Italian       1140
French         985
German         697
Name: language label, dtype: int64

# 3. Resample (Oversample on Minority Classes) Training Set to Deal with Class Imbalance

In [8]:
random.seed(50)
max_class_counts = train_set['language label'].value_counts().iloc[0]
resampled_train_set = pd.DataFrame()
for lang in train_set['language label'].unique():
    subset = train_set[train_set['language label'] == lang].copy()
    if len(subset) == max_class_counts:
        resampled_train_set = pd.concat([resampled_train_set,subset],ignore_index=True)
    else:
        added_subset = subset.iloc[random.choices(np.arange(0,len(subset)),k=max_class_counts - len(subset))]
        resampled_train_set = pd.concat([resampled_train_set,subset,added_subset],ignore_index=True)
        
display(resampled_train_set)

Unnamed: 0,Lyric,language label
0,Nagaretsuita sono basho de\nHito wa nani omou ...,Other
1,Music non-stop\nMusic non-stop\nMusic non-stop...,Other
2,Tyttäret tulen tekevät\nTuvan taakse taaton sa...,Other
3,Tekrar geldik buraya oooo..\nHep bereber olmay...,Other
4,Itsuka kimi ga hitomi ni tomosu ai no hikari g...,Other
...,...,...
15010,Les amoureux de l'an deux mille\nCherchent Ã c...,French
15011,J'aime ta couleur café\nTes cheveux café\nTa g...,French
15012,"Je ne suis pas une dame, je ne suis pas une da...",French
15013,Viens seigneur remplir cet endroit\nAvec ta gl...,French


# 4. Term Density Transformation of Text Data

In [9]:
resampled_train_set['language label'].value_counts()

Other         2145
Spanish       2145
German        2145
Portuguese    2145
Italian       2145
English       2145
French        2145
Name: language label, dtype: int64

In [10]:
def preprocess_text(text):
    text = text.lower()
    text = text.replace('\n', ' ')
    text = text.replace('  ',' ')
    return text

vectorizer = CountVectorizer(preprocessor=preprocess_text)

#### Vectorize According to Terms in Non-Other Category

In [11]:
vectorizer.fit(resampled_train_set[resampled_train_set['language label'] != 'Other']['Lyric'])

CountVectorizer(preprocessor=<function preprocess_text at 0x7ff5263a1ca0>)

In [48]:
#Vectorize Train Lyrics
train_lyrics = vectorizer.transform(resampled_train_set['Lyric'])
train_lyrics = pd.DataFrame(train_lyrics.todense(),columns = vectorizer.get_feature_names())
train_lyrics_token_count = train_lyrics.sum(axis=1)
train_lyrics = train_lyrics/np.array(train_lyrics_token_count.repeat(len(train_lyrics.columns))).reshape(train_lyrics.shape)

#Vectorize Val Lyrics
val_lyrics = vectorizer.transform(val_set['Lyric'])
val_lyrics = pd.DataFrame(val_lyrics.todense(),columns = vectorizer.get_feature_names(),index=val_set.index)
val_lyrics_token_count = val_lyrics.sum(axis=1)
val_lyrics = val_lyrics/np.array(val_lyrics_token_count.repeat(len(val_lyrics.columns))).reshape(val_lyrics.shape)

#Vectorize Test Lyrics
test_lyrics = vectorizer.transform(test_set['Lyric'])
test_lyrics = pd.DataFrame(test_lyrics.todense(),columns = vectorizer.get_feature_names(),index=test_set.index)
test_lyrics_token_count = test_lyrics.sum(axis=1)
test_lyrics = test_lyrics/np.array(test_lyrics_token_count.repeat(len(test_lyrics.columns))).reshape(test_lyrics.shape)

train_labels = resampled_train_set['language label']
val_labels = val_set['language label']
test_labels = test_set['language label']

#### Convert to float 32 and drop observations that failed to featurize

In [49]:
train_lyrics = train_lyrics.astype('float32')
val_lyrics = val_lyrics.astype('float32')
test_lyrics = test_lyrics.astype('float32')

In [50]:
train_lyrics.dropna(inplace=True)

In [51]:
train_labels = train_labels.loc[train_lyrics.index]

In [52]:
val_lyrics.dropna(inplace=True)

In [53]:
val_labels = val_labels.loc[val_lyrics.index]

In [54]:
test_lyrics.dropna(inplace=True)

In [55]:
test_labels = test_labels.loc[test_lyrics.index]

In [56]:
print(len(train_lyrics))
print(len(val_lyrics))
print(len(test_lyrics))

15000
1416
1417


#### Mapping to map text labels to numeric labels

In [57]:
mapping = {}
count = 0
for label in train_labels.unique():
    mapping[label] = count
    count = count + 1

# 5. Quick Evaluation of Classical ML Models

In [58]:
def optimal_model_id(xtrain,xval,xtest,ytrain,yval,ytest,estimator,param_grid,metric='accuracy'):
    
    #Concatenate training and validation data
    train_val_feats = pd.concat([xtrain,xval],ignore_index=True)
    train_val_labels = pd.concat([ytrain,yval],ignore_index=True)
    #Instantiate Grid Search with model and param grid to ID which hyperparameter combo enables the model to generalize
    #best on the validation set
    grid = GridSearchCV(estimator = estimator, param_grid= param_grid,
                        scoring=metric,cv=[(np.arange(0,len(xtrain)),np.arange(len(xtrain),len(train_val_feats)))])
    
    display(train_val_feats)
    display(train_val_labels.map(mapping))
    grid.fit(train_val_feats,train_val_labels.map(mapping))
    
    #Store Best Performing Model Output
    best_estimator = grid.best_estimator_
    best_val_score = grid.best_score_
    
    #Predictions on test set with optimal model
    test_preds = best_estimator.predict(xtest)
    #performance on test set
    oos_score = accuracy_score(ytest.map(mapping),test_preds)
    label_options = list(ytest.unique())
    
    #Confustion matrix of true for predicted values on the test set
    confuse = pd.DataFrame(confusion_matrix(ytest.map(mapping),test_preds),index = label_options,columns = label_options)
    
    #return optimal model results
    return {'best_estimator':best_estimator,
           'best_val_score':best_val_score,
           'best_test_score':oos_score,
           'metric':metric,
           'test_set_confusion_matrix':confuse}

#### KNN Test

In [59]:
test = optimal_model_id(train_lyrics,val_lyrics,test_lyrics,train_labels,val_labels,test_labels,
                KNeighborsClassifier(),{'n_neighbors':[1,3,5,7,9]},'accuracy')
display(test)
display(test['test_set_confusion_matrix'])

Unnamed: 0,00,000,0000,00015,01,012,02,03,04,05,...,ʿalaykum,еl,еm,еn,еnnui,еs,еven,йquateur,йternellement,оles
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16411,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


0        0
1        0
2        0
3        0
4        0
        ..
16411    4
16412    1
16413    0
16414    3
16415    3
Name: language label, Length: 16416, dtype: int64

{'best_estimator': KNeighborsClassifier(n_neighbors=1),
 'best_val_score': 0.9293785310734464,
 'best_test_score': 0.9343683839096684,
 'metric': 'accuracy',
 'test_set_confusion_matrix':             French  English  Italian  Portuguese  German  Other  Spanish
 French         248        6        5           2       6     21        3
 English          0      252        0           3       0      0        0
 Italian          0        0       62           0       0      2        2
 Portuguese       0        1        0         280       0      0        0
 German           3        2        4           1     135      1        0
 Other            1        1        9           0       8    237        2
 Spanish          1        2        0           0       1      6      110}

Unnamed: 0,French,English,Italian,Portuguese,German,Other,Spanish
French,248,6,5,2,6,21,3
English,0,252,0,3,0,0,0
Italian,0,0,62,0,0,2,2
Portuguese,0,1,0,280,0,0,0
German,3,2,4,1,135,1,0
Other,1,1,9,0,8,237,2
Spanish,1,2,0,0,1,6,110


#### XGBoost Classifier Test

In [None]:
test = optimal_model_id(train_lyrics,val_lyrics,test_lyrics,train_labels,val_labels,test_labels,
                XGBClassifier(),{'max_depth':[2,3,4],'max_features':['auto'],'n_estimators':[10]},'accuracy')
display(test)
display(test['test_set_confusion_matrix'])

# 6. Basic Feedforward NN w/ Keras Sequential API and Term Density Representation of Input

#### Input goes sequentially from one hidden layer to the next "left to right"

In [62]:
#Define Model Architecture Sequentially
model = keras.Sequential([
    keras.layers.Dense(100,activation='relu'),
    keras.layers.Dense(100,activation='relu'),
    keras.layers.Dense(7,activation='softmax')
])

#Compile the model, specifying loss function, optimizer, and performance metric
model.compile(loss = keras.losses.SparseCategoricalCrossentropy(),
             optimizer = keras.optimizers.Adam(learning_rate=0.01),
             metrics=['accuracy'],
             )

#Fit model and validate on val set between epochs, set multiprocessing
model.fit(x = np.array(train_lyrics),y = train_labels.map(mapping),batch_size=8,epochs=2,
         validation_data=(np.array(val_lyrics),val_labels.map(mapping)),
         use_multiprocessing=True,workers=multiprocessing.cpu_count() - 1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7ff4d1ec2040>

In [63]:
preds = model.predict(np.array(test_lyrics))



In [64]:
accuracy_score(test_labels.map(mapping),[x.argmax() for x in preds])

0.9788285109386027

In [65]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 100)               9017900   
                                                                 
 dense_4 (Dense)             (None, 100)               10100     
                                                                 
 dense_5 (Dense)             (None, 7)                 707       
                                                                 
Total params: 9,028,707
Trainable params: 9,028,707
Non-trainable params: 0
_________________________________________________________________


# 7. Word Embedding Based Models That Build Vector Representation of Input, Captures General Meaning Before Pass into Feed Forward NN

#### Build Embedding Matrix

In [66]:
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

#construct embedding matrix w/ prebuilt embedding
vocab_dict = model.key_to_index.copy()
embedding_matrix = np.zeros((43982,300))
for word,index in model.key_to_index.items():
    embedding_matrix[index] = model[word]

#Construct custom embedding matrix for this task
vocab_dict_custom = {}
count = 0
for word in vectorizer.get_feature_names():
    vocab_dict_custom[word] = count
    count = count + 1
embedding_matrix_custom = np.random.random((len(vectorizer.get_feature_names()) + 1,300))
embedding_matrix_custom[-1] = 0

#### Map tokens in train, val, test set to row in embedding matrices for both word2vec and custom embedding matrix

In [69]:
def text_to_index(text_data,mapping,max_size):
    return_data = []
    for text in text_data:
        new_text = text.lower()
        new_text = text.replace('\n',' ')
        new_text = text.replace('  ',' ')
        new_text = new_text.split()
        mapped_text = []
        for token in new_text:
            try:
                mapped_text.append(mapping[token])
            except:
                mapped_text.append(len(mapping))
        
        if len(mapped_text) > max_size:
            mapped_text = mapped_text[:max_size]
        else:
            while len(mapped_text) < max_size:
                mapped_text.append(len(mapping))
                
        return_data.append(mapped_text)
    
    return return_data

In [73]:
train_tokens_prebuilt = text_to_index(resampled_train_set['Lyric'].loc[train_lyrics.index],vocab_dict,1000)
train_tokens_custom = text_to_index(resampled_train_set['Lyric'].loc[train_lyrics.index],vocab_dict_custom,1000)

val_tokens_prebuilt = text_to_index(val_set['Lyric'].loc[val_lyrics.index],vocab_dict,1000)
val_tokens_custom = text_to_index(val_set['Lyric'].loc[val_lyrics.index],vocab_dict_custom,1000)

test_tokens_prebuilt = text_to_index(test_set['Lyric'].loc[test_lyrics.index],vocab_dict,1000)
test_tokens_custom = text_to_index(test_set['Lyric'].loc[test_lyrics.index],vocab_dict_custom,1000)

### Deep Averaging Network (DAN) w/ Functional Keras API and Custom Embedding Matrix

In [75]:
def create_dan_model(retrain_embeddings=False, 
                     max_sequence_length=1000,
                     embedding_matrix=embedding_matrix_custom, 
                     hidden_dim=[100,100,100],
                     dropout_rate=0.3,
                     hidden_layer_activation = 'relu',
                     output_layer_size = 4,
                     output_activation = 'softmax',
                     learning_rate=0.001):
    """
    Construct the DAN model including the compilation and return it. Parametrize it using the arguments.
    retrain_embeddings: bool, indicates whether embeddings are retrainable
    max_sequence_length: Number of token IDs to expect in a given input
    embedding_matrix: initialize embedding layer with embedding matrix, specifying weights
    hidden_dim = number of neurons in hidden layers
    dropout = dropout rate
    output_layer_size = # of neurons in output layer corresponding to # of classes, each neuron predicts P(class K | x)
    output_activation = activation function for output layer
    learning_rate = learning rate for gradient descent for finding model params to optimize loss
    """
    
    #Specify Embedding Layer, including shape, intialize with weights, expected input length, and whether it is trainable
    dan_embedding_layer = Embedding(embedding_matrix.shape[0],
                                  embedding_matrix.shape[1],
                                  weights = [embedding_matrix],
                                  input_length=max_sequence_length,
                                  trainable=retrain_embeddings,
                                   name = 'embedding_layer')
    
    
    #Input Layer, sequence of max_sequence_length tokens
    dan_input_layer = tf.keras.layers.Input(shape=(max_sequence_length,), dtype='int64',name='input')
    #Inputs go into embedding layer, form max_sequence_length x embedding dim matrix
    dan_embeddings = dan_embedding_layer(dan_input_layer)
    #Embeddings are averaged, forming single vector represenation of size embedding matrix
    dan_avg_input_embeddings = tf.keras.layers.Lambda(lambda x: K.mean(x, axis=1), name='averaging')(dan_embeddings)
    
    #input into hidden layers
    x = dan_avg_input_embeddings #hidden layer initial input
    count = 1
    for layer in hidden_dim:
        hidden = tf.keras.layers.Dense(layer,activation = hidden_layer_activation,name='hidden_' + str(count))(x)
        dropout = tf.keras.layers.Dropout(dropout_rate,name='dropout_' + str(count))(hidden)
        count = count + 1
        x = dropout
        
    #dan_hidden_out_1 = tf.keras.layers.Dense(hidden_dim, activation='relu', name='hidden_1')(dan_avg_input_embeddings)
    #dan_hidden_out_1 = tf.keras.layers.Dropout(dropout)(dan_hidden_out_1)
    dan_classification = tf.keras.layers.Dense(output_layer_size, activation=output_activation, name='dan_classification')(x)
    dan_model = tf.keras.models.Model(inputs=dan_input_layer, outputs=[dan_classification])
    dan_model.compile(loss=keras.losses.SparseCategoricalCrossentropy(),
                  optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate,
                                                beta_1=0.9,
                                                beta_2=0.999,
                                                epsilon=1e-07,
                                                amsgrad=False,
                                                name='Adam'),
                 metrics='accuracy')
    
    print(dan_model.summary())

    return dan_model

In [78]:
dan_model_sorted = create_dan_model(retrain_embeddings=True,embedding_matrix=embedding_matrix_custom,
                                   output_layer_size=7)
dan_sorted_history = dan_model_sorted.fit(np.array(train_tokens_custom),
                        np.array(train_labels.map(mapping)),
                        validation_data=(np.array(val_tokens_custom), np.array(val_labels.map(mapping))),
                        batch_size=8,
                        epochs=2,
                        shuffle=True,
                        use_multiprocessing=True,workers=multiprocessing.cpu_count() - 1)

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 1000)]            0         
                                                                 
 embedding_layer (Embedding)  (None, 1000, 300)        27053700  
                                                                 
 averaging (Lambda)          (None, 300)               0         
                                                                 
 hidden_1 (Dense)            (None, 100)               30100     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 hidden_2 (Dense)            (None, 100)               10100     
                                                                 
 dropout_2 (Dropout)         (None, 100)               0   

In [79]:
accuracy_score(test_labels.map(mapping),[x.argmax() for x in dan_model_sorted.predict(test_tokens_custom)])



0.9618913196894848

In [80]:
dan_model_sorted.weights[0].shape

TensorShape([90179, 300])

### Weighted Attention Network (WAN) with Custom Embeddings, allows for computation of multiple attention based representations of input before a final attention layer learns how to balance attention vectors from prior layer

In [81]:
def create_wan_model(retrain_embeddings=False, 
                     max_sequence_length=1000,
                     embedding_matrix=embedding_matrix_custom,
                     num_attention = 1,
                     hidden_dim=[100,100,100],
                     dropout_rate=0.3,
                     hidden_layer_activation = 'relu',
                     output_layer_size = 4,
                     output_activation = 'softmax',
                     learning_rate=0.001):
    """
    Construct the WAN model including the compilation and return it. Parametrize it using the arguments.
    retrain_embeddings: bool, indicates whether embeddings are retrainable
    max_sequence_length: Number of token IDs to expect in a given input
    embedding_matrix: initialize embedding layer with embedding matrix, specifying weights
    num_attention = number of parallel attention computations that learn how to balance embeddings into a single
    vector representation, final attention layer weights prior attention based representations
    hidden_dim = number of neurons in hidden layers
    dropout = dropout rate
    output_layer_size = # of neurons in output layer corresponding to # of classes, each neuron predicts P(class K | x)
    output_activation = activation function for output layer
    learning_rate = learning rate for gradient descent for finding model params to optimize loss
    """
    
    #Specify Embedding Layer, including shape, intialize with weights, expected input length, and whether it is trainable
    wan_embedding_layer = Embedding(embedding_matrix.shape[0],
                                  embedding_matrix.shape[1],
                                  weights = [embedding_matrix],
                                  input_length=max_sequence_length,
                                  trainable=retrain_embeddings,
                                   name = 'embedding_layer')
    
    
    #Input Layer, sequence of max_sequence_length tokens
    wan_input_layer = tf.keras.layers.Input(shape=(max_sequence_length,), dtype='int64',name='input')
    #Inputs go into embedding layer, form max_sequence_length x embedding dim matrix
    wan_embeddings = wan_embedding_layer(wan_input_layer)
    
    #Create attention based single vector representations of words according to alternative query vectors
    attention_embeddings = []
    for num in range(num_attention):
        #Apply Query Vector to words in embeddings, returning a max_sequence_length x 1 tensor
        l1_query = tf.keras.layers.Dense(1,activation='linear',use_bias=False,name='attention_query' + str(num+1))(wan_embeddings)
        #reshape to 1 x max_sequence_length
        l1_reshape_query = tf.keras.layers.Reshape((1,max_sequence_length))(l1_query)
        #Softmax over query * key (words) to obtain weights
        l1_weights = tf.keras.layers.Lambda(lambda x:tf.keras.activations.softmax(x),
                                            name='attention_weights' + str(num+1))(l1_reshape_query)
        #weight embeddings according to weights
        l1_attention = tf.keras.layers.Flatten()(tf.keras.layers.Dot((1,2))((wan_embeddings,l1_weights)))
        attention_embeddings.append(l1_attention)
    
    concat_attention = tf.keras.layers.Concatenate()(attention_embeddings)
    concat_attention = tf.keras.layers.Reshape((num_attention,embedding_matrix.shape[1]))(concat_attention)
    
    #Apply Query Vector to attention based representations, returning a num_attention x 1 tensor
    wan_query = tf.keras.layers.Dense(1,activation='linear',use_bias=False,name='attention_query')(concat_attention)
    #reshape to 1 x num_attention
    reshaped_query = tf.keras.layers.Reshape((1,num_attention))(wan_query)
    #Softmax over query * key (words) to obtain weights
    wan_weights = tf.keras.layers.Lambda(lambda x:tf.keras.activations.softmax(x),
                                        name='attention_weights')(reshaped_query)
    #weight attention embeddings according to weights, learning how to balance attention based vector representations 
    #from prior layer
    wan_attention = tf.keras.layers.Flatten()(tf.keras.layers.Dot((1,2))((concat_attention,wan_weights)))
    
    #input into hidden layers
    x = wan_attention #hidden layer initial input
    count = 1
    for layer in hidden_dim:
        hidden = tf.keras.layers.Dense(layer,activation = hidden_layer_activation,name='hidden_' + str(count))(x)
        dropout = tf.keras.layers.Dropout(dropout_rate,name='dropout_' + str(count))(hidden)
        count = count + 1
        x = dropout
        
    #wan_hidden_out_1 = tf.keras.layers.Dense(hidden_dim, activation='relu', name='hidden_1')(wan_avg_input_embeddings)
    #wan_hidden_out_1 = tf.keras.layers.Dropout(dropout)(wan_hidden_out_1)
    wan_classification = tf.keras.layers.Dense(output_layer_size, activation=output_activation, name='wan_classification')(x)
    wan_model = tf.keras.models.Model(inputs=wan_input_layer, outputs=[wan_classification])
    wan_model.compile(loss=keras.losses.SparseCategoricalCrossentropy(),
                  optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate,
                                                beta_1=0.9,
                                                beta_2=0.999,
                                                epsilon=1e-07,
                                                amsgrad=False,
                                                name='Adam'),
                 metrics='accuracy')
    
    print(wan_model.summary())

    return wan_model


In [83]:
wan_model_sorted = create_wan_model(retrain_embeddings=True,embedding_matrix=embedding_matrix_custom,
                                   num_attention=5,output_layer_size=7)
wan_sorted_history = wan_model_sorted.fit(np.array(train_tokens_custom),
                        np.array(train_labels.map(mapping)),
                        validation_data=(np.array(val_tokens_custom), np.array(val_labels.map(mapping))),
                        batch_size=8,
                        epochs=2,
                        shuffle=True)

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, 1000)]       0           []                               
                                                                                                  
 embedding_layer (Embedding)    (None, 1000, 300)    27053700    ['input[0][0]']                  
                                                                                                  
 attention_query1 (Dense)       (None, 1000, 1)      300         ['embedding_layer[0][0]']        
                                                                                                  
 attention_query2 (Dense)       (None, 1000, 1)      300         ['embedding_layer[0][0]']        
                                                                                            

 dropout_1 (Dropout)            (None, 100)          0           ['hidden_1[0][0]']               
                                                                                                  
 hidden_2 (Dense)               (None, 100)          10100       ['dropout_1[0][0]']              
                                                                                                  
 dropout_2 (Dropout)            (None, 100)          0           ['hidden_2[0][0]']               
                                                                                                  
 hidden_3 (Dense)               (None, 100)          10100       ['dropout_2[0][0]']              
                                                                                                  
 dropout_3 (Dropout)            (None, 100)          0           ['hidden_3[0][0]']               
                                                                                                  
 wan_class

In [84]:
accuracy_score(test_labels.map(mapping),[x.argmax() for x in wan_model_sorted.predict(test_tokens_custom)])



0.9428369795342273

# 8. BERT Based Models to Develop Contextual Representations