In [7]:
import pickle

import mlflow
import mlflow.keras
import numpy as np
import pandas as pd
from sklearn.model_selection import ParameterSampler
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

In [8]:
def create_model(embedding_dim=128, lstm_units=64, dropout_rate=0.5,
                 vocab_size=None, sequence_length=None):
    """Build and compile the character-level LSTM binary classifier.

    The network is Embedding -> LSTM -> Dropout -> Dense(1, sigmoid) and is
    compiled with the Adam optimizer, binary cross-entropy loss and an
    accuracy metric.

    Args:
        embedding_dim: Size of each embedding vector.
        lstm_units: Number of units in the LSTM layer.
        dropout_rate: Fraction of LSTM outputs dropped before the output layer.
        vocab_size: Number of rows in the embedding table. When None, it is
            derived from the notebook-level ``tokenizer`` as
            ``len(tokenizer.word_index) + 1``.
        sequence_length: Length of the padded input sequences. When None, the
            notebook-level ``max_sequence_length`` is used.

    Returns:
        The compiled ``Sequential`` model (not yet trained).

    Raises:
        NameError: If a size is left as None and the corresponding notebook
            global has not been defined yet.
    """
    # Fall back to the notebook globals only when the caller did not pass the
    # sizes explicitly; this keeps existing `create_model(**params)` calls
    # working while letting the function be used without hidden state.
    if vocab_size is None:
        # One slot more than the tokenizer vocabulary (presumably for the
        # padding index 0 -- confirm against the Keras Tokenizer docs).
        vocab_size = len(tokenizer.word_index) + 1
    if sequence_length is None:
        sequence_length = max_sequence_length

    model = Sequential()
    # NOTE(review): recent Keras releases report `input_length` as deprecated;
    # consider dropping it once confirmed against the installed version.
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=sequence_length))
    model.add(LSTM(units=lstm_units))
    model.add(Dropout(rate=dropout_rate))
    # Single sigmoid unit: the output is a probability for the binary label.
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [9]:
# Single seed shared by every stochastic step in the notebook
# (also passed to ParameterSampler in the hyperparameter sweep).
seed = 42

# Actually apply the seed: without this call the value above was never handed
# to any random number generator, so weight initialisation, dropout and batch
# shuffling differed on every run. `set_random_seed` seeds Python's `random`,
# NumPy and the Keras backend in one go.
set_random_seed(seed)

# Load the pre-split datasets (training / validation / testing).
trainingDf = pd.read_csv('../data_cleaning/spanish names db - training.csv')
validationDf = pd.read_csv('../data_cleaning/spanish names db - validation.csv')
testingDf = pd.read_csv('../data_cleaning/spanish names db - testing.csv')

In [10]:
# Model input is the `name` column and the target is the `gender` column.
# Each split is read from its own dataframe, so no further splitting happens here.
X_train, y_train = trainingDf['name'], trainingDf['gender']
X_val, y_val = validationDf['name'], validationDf['gender']
X_test, y_test = testingDf['name'], testingDf['gender']

In [11]:
# Character-level, lower-casing tokenizer whose vocabulary is learned from the
# training names only, so validation/test data never influence it.
tokenizer = Tokenizer(char_level=True, lower=True)
tokenizer.fit_on_texts(X_train)

# Encode every split as lists of integer character ids.
train_sequences, val_sequences, test_sequences = (
    tokenizer.texts_to_sequences(names) for names in (X_train, X_val, X_test)
)

# The longest training name fixes the model's input length.
max_sequence_length = max(map(len, train_sequences))

# Bring all splits to that common length.
X_train_padded, X_val_padded, X_test_padded = (
    pad_sequences(encoded, maxlen=max_sequence_length)
    for encoded in (train_sequences, val_sequences, test_sequences)
)

# Persist the fitted tokenizer together with the sequence length so the same
# preprocessing can be reproduced outside this notebook.
tokenizer_info = dict(tokenizer=tokenizer, max_sequence_length=max_sequence_length)

with open('tokenizer_info.pickle', 'wb') as pickle_file:
    pickle.dump(tokenizer_info, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [16]:
# Enable auto logging
mlflow.keras.autolog()

# Hyperparameters for this single baseline run, spelled out explicitly.
# Previously this cell read a `params` variable that is only created as the
# loop variable of the later sweep cell, so it failed with NameError on a
# fresh kernel (Restart & Run All). The values mirror create_model's defaults.
baseline_params = {'embedding_dim': 128, 'lstm_units': 64, 'dropout_rate': 0.5}

with mlflow.start_run():
    # Record the configuration up front so the run is identifiable even if
    # training is interrupted.
    mlflow.log_params(baseline_params)

    model = create_model(**baseline_params)
    model.fit(X_train_padded, y_train, validation_data=(X_val_padded, y_val), epochs=10, batch_size=32)

    # Evaluate the model on the held-out test split and log the results.
    loss, accuracy = model.evaluate(X_test_padded, y_test)
    mlflow.log_metric('test_loss', loss)
    mlflow.log_metric('test_accuracy', accuracy)



Epoch 1/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 8ms/step - accuracy: 0.8569 - loss: 0.3249 - val_accuracy: 0.9166 - val_loss: 0.1938
Epoch 2/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 8ms/step - accuracy: 0.9162 - loss: 0.2014 - val_accuracy: 0.9257 - val_loss: 0.1720
Epoch 3/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 8ms/step - accuracy: 0.9261 - loss: 0.1783 - val_accuracy: 0.9319 - val_loss: 0.1597
Epoch 4/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 8ms/step - accuracy: 0.9316 - loss: 0.1657 - val_accuracy: 0.9328 - val_loss: 0.1533
Epoch 5/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 8ms/step - accuracy: 0.9325 - loss: 0.1612 - val_accuracy: 0.9375 - val_loss: 0.1461
Epoch 6/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 8ms/step - accuracy: 0.9408 - loss: 0.1468 - val_accuracy: 0.9367 - val_loss: 0.1494
Epoch 7/10



[1m514/514[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9406 - loss: 0.1463


In [17]:
# Random search over a small discrete grid of model hyperparameters.
# (`ParameterSampler` is imported in the notebook's top import cell.)
param_distributions = {
    'embedding_dim': [64, 128, 256],
    'lstm_units': [32, 64, 128],
    'dropout_rate': [0.3, 0.5, 0.7]
}

# Number of configurations to draw; sampling is seeded so the same
# configurations are selected on every run.
n_iter = 5
param_list = list(ParameterSampler(param_distributions, n_iter=n_iter, random_state=seed))

# One MLflow run per sampled configuration.
for params in param_list:
    with mlflow.start_run():
        # Log hyperparameters first so a run that fails or is interrupted
        # during training still records which configuration it used.
        mlflow.log_params(params)

        model = create_model(**params)
        model.fit(X_train_padded, y_train, validation_data=(X_val_padded, y_val), epochs=10, batch_size=32)

        # Evaluate the model on the test split.
        # NOTE(review): if the best configuration is picked from these test
        # metrics, the test set stops being an unbiased estimate -- consider
        # selecting on the validation metrics instead.
        loss, accuracy = model.evaluate(X_test_padded, y_test)
        mlflow.log_metric('test_loss', loss)
        mlflow.log_metric('test_accuracy', accuracy)




Epoch 1/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 22ms/step - accuracy: 0.8629 - loss: 0.3033 - val_accuracy: 0.9249 - val_loss: 0.1775
Epoch 2/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 22ms/step - accuracy: 0.9238 - loss: 0.1823 - val_accuracy: 0.9344 - val_loss: 0.1531
Epoch 3/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 22ms/step - accuracy: 0.9334 - loss: 0.1593 - val_accuracy: 0.9409 - val_loss: 0.1402
Epoch 4/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 23ms/step - accuracy: 0.9410 - loss: 0.1416 - val_accuracy: 0.9439 - val_loss: 0.1350
Epoch 5/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 22ms/step - accuracy: 0.9478 - loss: 0.1311 - val_accuracy: 0.9486 - val_loss: 0.1301
Epoch 6/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 23ms/step - accuracy: 0.9518 - loss: 0.1207 - val_accuracy: 0.9453 - val_loss: 0.1333
Epoc



Epoch 1/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 8ms/step - accuracy: 0.8426 - loss: 0.3360 - val_accuracy: 0.9123 - val_loss: 0.1983
Epoch 2/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 8ms/step - accuracy: 0.9155 - loss: 0.2028 - val_accuracy: 0.9235 - val_loss: 0.1753
Epoch 3/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 8ms/step - accuracy: 0.9228 - loss: 0.1816 - val_accuracy: 0.9258 - val_loss: 0.1651
Epoch 4/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 8ms/step - accuracy: 0.9304 - loss: 0.1644 - val_accuracy: 0.9388 - val_loss: 0.1510
Epoch 5/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 8ms/step - accuracy: 0.9373 - loss: 0.1520 - val_accuracy: 0.9396 - val_loss: 0.1455
Epoch 6/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 8ms/step - accuracy: 0.9398 - loss: 0.1469 - val_accuracy: 0.9405 - val_loss: 0.1422
Epoch 7/10



Epoch 1/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - accuracy: 0.8315 - loss: 0.3615 - val_accuracy: 0.9056 - val_loss: 0.2151
Epoch 2/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9075 - loss: 0.2257 - val_accuracy: 0.9179 - val_loss: 0.1879
Epoch 3/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9130 - loss: 0.2061 - val_accuracy: 0.9220 - val_loss: 0.1810
Epoch 4/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9196 - loss: 0.1880 - val_accuracy: 0.9238 - val_loss: 0.1799
Epoch 5/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9258 - loss: 0.1777 - val_accuracy: 0.9321 - val_loss: 0.1594
Epoch 6/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9290 - loss: 0.1741 - val_accuracy: 0.9296 - val_loss: 0.1577
Epoch 7/10



Epoch 1/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - accuracy: 0.8281 - loss: 0.3682 - val_accuracy: 0.9132 - val_loss: 0.2001
Epoch 2/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9072 - loss: 0.2215 - val_accuracy: 0.9230 - val_loss: 0.1795
Epoch 3/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9173 - loss: 0.2021 - val_accuracy: 0.9269 - val_loss: 0.1682
Epoch 4/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9229 - loss: 0.1903 - val_accuracy: 0.9288 - val_loss: 0.1652
Epoch 5/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9265 - loss: 0.1785 - val_accuracy: 0.9329 - val_loss: 0.1570
Epoch 6/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9289 - loss: 0.1769 - val_accuracy: 0.9334 - val_loss: 0.1552
Epoch 7/10



Epoch 1/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - accuracy: 0.8285 - loss: 0.3581 - val_accuracy: 0.9095 - val_loss: 0.2080
Epoch 2/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9077 - loss: 0.2155 - val_accuracy: 0.9205 - val_loss: 0.1895
Epoch 3/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9163 - loss: 0.1976 - val_accuracy: 0.9245 - val_loss: 0.1757
Epoch 4/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9229 - loss: 0.1825 - val_accuracy: 0.9302 - val_loss: 0.1631
Epoch 5/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9314 - loss: 0.1706 - val_accuracy: 0.9349 - val_loss: 0.1580
Epoch 6/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9310 - loss: 0.1678 - val_accuracy: 0.9333 - val_loss: 0.1553
Epoch 7/10