In [1]:
import pickle

import mlflow
import mlflow.keras
import numpy as np
import pandas as pd
from sklearn.model_selection import ParameterSampler
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

2024-08-02 23:19:47.757035: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-02 23:19:48.757809: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-02 23:19:49.029535: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-02 23:19:49.647716: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-02 23:19:49.803956: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-02 23:19:51.429817: I tensorflow/core/platform/cpu_feature_gu

In [2]:
def create_model(embedding_dim=128, lstm_units=64, dropout_rate=0.5,
                 vocab_size=None, sequence_length=None):
    """Build and compile the Embedding -> LSTM -> Dropout -> Dense(sigmoid) classifier.

    Args:
        embedding_dim: Size of each embedding vector (``output_dim`` of the
            Embedding layer).
        lstm_units: Number of units in the LSTM layer.
        dropout_rate: Rate passed to the Dropout layer placed after the LSTM.
        vocab_size: ``input_dim`` of the Embedding layer. When ``None`` it is
            derived from the notebook-level ``tokenizer`` as
            ``len(tokenizer.word_index) + 1``.
        sequence_length: ``input_length`` of the Embedding layer. When ``None``
            the notebook-level ``max_sequence_length`` is used.

    Returns:
        A ``Sequential`` model compiled with the Adam optimizer, binary
        cross-entropy loss and the accuracy metric.

    Raises:
        NameError: If a size argument is left as ``None`` and the matching
            notebook global (``tokenizer`` / ``max_sequence_length``) has not
            been defined yet.
    """
    # Fall back to the notebook globals only when the caller did not pass the
    # sizes explicitly; this keeps existing `create_model(**params)` calls
    # working while letting the function be used without hidden kernel state.
    if vocab_size is None:
        # +1 on top of the number of known tokens; presumably to leave room
        # for index 0, which pad_sequences uses as its default padding value.
        vocab_size = len(tokenizer.word_index) + 1
    if sequence_length is None:
        sequence_length = max_sequence_length

    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=sequence_length))
    model.add(LSTM(units=lstm_units))
    model.add(Dropout(rate=dropout_rate))
    # Single sigmoid unit: the model outputs one value per input sequence.
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [3]:
# Seed passed as random_state to the hyperparameter sampler later in the notebook.
seed = 42

# Read the three pre-split CSV files (training / validation / testing), in that order.
trainingDf, validationDf, testingDf = map(
    pd.read_csv,
    (
        '../data_cleaning/spanish names db - training.csv',
        '../data_cleaning/spanish names db - validation.csv',
        '../data_cleaning/spanish names db - testing.csv',
    ),
)

In [4]:
# For each split, the 'name' column is the model input and the 'gender' column the target.
X_train, y_train = trainingDf['name'], trainingDf['gender']
X_val, y_val = validationDf['name'], validationDf['gender']
X_test, y_test = testingDf['name'], testingDf['gender']

In [5]:
# Character-level, lower-casing tokenizer whose vocabulary is fitted on the
# training names only.
tokenizer = Tokenizer(char_level=True, lower=True)
tokenizer.fit_on_texts(X_train)

# Encode every split with the fitted tokenizer.
train_sequences, val_sequences, test_sequences = (
    tokenizer.texts_to_sequences(names) for names in (X_train, X_val, X_test)
)

# The longest encoded training name fixes the length used for all splits.
max_sequence_length = max(map(len, train_sequences))

# Bring every split to that common length.
X_train_padded, X_val_padded, X_test_padded = (
    pad_sequences(encoded, maxlen=max_sequence_length)
    for encoded in (train_sequences, val_sequences, test_sequences)
)

# Persist the fitted tokenizer together with the sequence length so the same
# preprocessing can be reloaded from 'tokenizer_info.pickle'.
tokenizer_info = dict(
    tokenizer=tokenizer,
    max_sequence_length=max_sequence_length,
)

with open('tokenizer_info.pickle', 'wb') as handle:
    pickle.dump(tokenizer_info, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
# Random hyperparameter search: sample `n_iter` configurations from the grid
# below and train/evaluate one model per configuration, each in its own
# MLflow run.
param_distributions = {
    'embedding_dim': [64, 128, 256],
    'lstm_units': [32, 64, 128],
    'dropout_rate': [0.3, 0.5, 0.7]
}

n_iter = 5
param_list = list(ParameterSampler(param_distributions, n_iter=n_iter, random_state=seed))

for params in param_list:
    with mlflow.start_run():
        # Log the configuration first so that a run which fails during
        # training still records which hyperparameters were being tried.
        mlflow.log_params(params)

        # Re-seed the Python, NumPy and TensorFlow RNGs before building each
        # model; otherwise `seed` only controls which configurations are
        # sampled, not how each model is initialised and trained.
        set_random_seed(seed)

        model = create_model(**params)
        history = model.fit(X_train_padded, y_train, validation_data=(X_val_padded, y_val), epochs=10, batch_size=32)

        # Final-epoch validation metrics: these are the numbers to compare
        # configurations on, keeping the test set out of model selection.
        mlflow.log_metric('val_loss', history.history['val_loss'][-1])
        mlflow.log_metric('val_accuracy', history.history['val_accuracy'][-1])

        # Evaluate the model on the held-out test split
        loss, accuracy = model.evaluate(X_test_padded, y_test)
        mlflow.log_metric('test_loss', loss)
        mlflow.log_metric('test_accuracy', accuracy)




Epoch 1/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 26ms/step - accuracy: 0.8614 - loss: 0.3059 - val_accuracy: 0.9204 - val_loss: 0.1810
Epoch 2/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 25ms/step - accuracy: 0.9233 - loss: 0.1845 - val_accuracy: 0.9328 - val_loss: 0.1557
Epoch 3/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 23ms/step - accuracy: 0.9330 - loss: 0.1575 - val_accuracy: 0.9419 - val_loss: 0.1447
Epoch 4/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 22ms/step - accuracy: 0.9410 - loss: 0.1427 - val_accuracy: 0.9406 - val_loss: 0.1404
Epoch 5/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 22ms/step - accuracy: 0.9474 - loss: 0.1292 - val_accuracy: 0.9441 - val_loss: 0.1311
Epoch 6/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 23ms/step - accuracy: 0.9514 - loss: 0.1210 - val_accuracy: 0.9473 - val_loss: 0.1297
Epoc



[1m385/385[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.9456 - loss: 0.1453




Epoch 1/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 8ms/step - accuracy: 0.8419 - loss: 0.3350 - val_accuracy: 0.9170 - val_loss: 0.1946
Epoch 2/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 8ms/step - accuracy: 0.9128 - loss: 0.2039 - val_accuracy: 0.9240 - val_loss: 0.1732
Epoch 3/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 8ms/step - accuracy: 0.9236 - loss: 0.1809 - val_accuracy: 0.9324 - val_loss: 0.1600
Epoch 4/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 10ms/step - accuracy: 0.9288 - loss: 0.1691 - val_accuracy: 0.9334 - val_loss: 0.1499
Epoch 5/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 8ms/step - accuracy: 0.9334 - loss: 0.1582 - val_accuracy: 0.9383 - val_loss: 0.1460
Epoch 6/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 8ms/step - accuracy: 0.9360 - loss: 0.1518 - val_accuracy: 0.9419 - val_loss: 0.1407
Epoch 7/1



Epoch 1/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - accuracy: 0.8354 - loss: 0.3693 - val_accuracy: 0.9087 - val_loss: 0.2053
Epoch 2/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 7ms/step - accuracy: 0.9062 - loss: 0.2168 - val_accuracy: 0.9209 - val_loss: 0.1864
Epoch 3/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9185 - loss: 0.1937 - val_accuracy: 0.9238 - val_loss: 0.1729
Epoch 4/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9226 - loss: 0.1834 - val_accuracy: 0.9280 - val_loss: 0.1642
Epoch 5/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9284 - loss: 0.1737 - val_accuracy: 0.9321 - val_loss: 0.1597
Epoch 6/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9316 - loss: 0.1644 - val_accuracy: 0.9315 - val_loss: 0.1608
Epoch 7/10



Epoch 1/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - accuracy: 0.8338 - loss: 0.3643 - val_accuracy: 0.9098 - val_loss: 0.2066
Epoch 2/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 6ms/step - accuracy: 0.9080 - loss: 0.2246 - val_accuracy: 0.9221 - val_loss: 0.1811
Epoch 3/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9158 - loss: 0.2044 - val_accuracy: 0.9244 - val_loss: 0.1758
Epoch 4/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9240 - loss: 0.1910 - val_accuracy: 0.9305 - val_loss: 0.1628
Epoch 5/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9250 - loss: 0.1819 - val_accuracy: 0.9312 - val_loss: 0.1600
Epoch 6/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9269 - loss: 0.1787 - val_accuracy: 0.9367 - val_loss: 0.1530
Epoch 7/10



Epoch 1/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - accuracy: 0.8299 - loss: 0.3600 - val_accuracy: 0.9122 - val_loss: 0.2012
Epoch 2/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9090 - loss: 0.2126 - val_accuracy: 0.9211 - val_loss: 0.1850
Epoch 3/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9188 - loss: 0.1914 - val_accuracy: 0.9261 - val_loss: 0.1738
Epoch 4/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9242 - loss: 0.1779 - val_accuracy: 0.9296 - val_loss: 0.1635
Epoch 5/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 6ms/step - accuracy: 0.9301 - loss: 0.1681 - val_accuracy: 0.9341 - val_loss: 0.1552
Epoch 6/10
[1m1797/1797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9335 - loss: 0.1596 - val_accuracy: 0.9343 - val_loss: 0.1535
Epoch 7/10