In [1]:
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from os import walk
import pandas as pd, numpy as np
from tensorflow.keras.models import Sequential

2022-10-10 13:54:11.782687: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-10 13:54:12.312245: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-10-10 13:54:12.391875: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-10-10 13:54:12.391918: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if yo

# Data loading

In [2]:
data_files = []
data_path = './data'
for (dirpath, dirnames, filenames) in walk(data_path):
    data_files.extend(filenames)
    break


In [3]:
df = pd.DataFrame(columns=['Name', 'Gender', 'Count'])
for file in data_files:
    df = pd.concat([df, pd.read_csv(f'{data_path}/{file}', names=['Name', 'Gender','Count'])], ignore_index=True)
print(df.shape)

(2052781, 3)


In [4]:
print(df.head())

       Name Gender  Count
0    Ashley      F  38457
1   Jessica      F  38358
2    Amanda      F  25035
3  Brittany      F  24982
4     Sarah      F  24635


# Process data

In [5]:
total_men = df.loc[df.Gender=='M'].shape[0]
total_women = df.loc[df.Gender=='F'].shape[0]
print(total_men)
print(total_women)
df_prob = df.copy()

842915
1209866


Count feature is not very useful *per se*. Change to observed probability.

In [6]:
df_prob.loc[df.Gender=='M', 'Count'] = df.loc[df.Gender=='M'].Count.apply(lambda count: 100*count/total_men)
df_prob.loc[df.Gender=='F', 'Count'] = df.loc[df.Gender=='F'].Count.apply(lambda count: 100*count/total_women)
df_prob.rename({"Count":"Prob"}, inplace=True, axis='columns')
print(df_prob.head())

       Name Gender      Prob
0    Ashley      F  3.178616
1   Jessica      F  3.170434
2    Amanda      F  2.069237
3  Brittany      F  2.064857
4     Sarah      F  2.036176


Sanity check. Whether there are names that are both male and female. If so, get the one with greatest probability.

In [7]:
shared_names = set(df_prob.loc[df_prob.Gender=='F'].Name.values).intersection(set(df_prob.loc[df_prob.Gender=='M'].Name.values))
print(df_prob.Name.value_counts())
print(df_prob.shape)
print(len(shared_names))
if len(shared_names) > 0:
    df_prob = df_prob.sort_values('Prob', ascending=False).drop_duplicates(subset='Name', keep='first').sort_index()
print(df_prob.shape)

Marion     284
Sidney     284
John       284
James      284
William    284
          ... 
Nevach       1
Ndey         1
Nashmia      1
Naiome       1
Yler         1
Name: Name, Length: 101338, dtype: int64
(2052781, 3)
11282
(2052781, 3)


In [8]:
df_prob.Names = df_prob.Name.apply(lambda x: x.lower())

  df_prob.Names = df_prob.Name.apply(lambda x: x.lower())


In [9]:
import time

# given a list of names (Corpus), create a char-to-int dictionary
def get_dict(corpus:list) -> dict:
    char_dict = dict()
    pos = 0
    for name in corpus:
        chars = set(name.lower())
        for char in chars:
            if char not in char_dict:
                char_dict[char] = pos
                pos += 1
    return char_dict, {v:k for k,v in char_dict.items()}
startTime = time.time()
char_dict, _ = get_dict(df_prob.Name.values)
endTime = time.time() 
howMuchTime = endTime - startTime
print(str(howMuchTime) + " sec")

1.486760139465332 sec


In [10]:
len(char_dict)
print(char_dict)

{'a': 0, 'e': 1, 's': 2, 'y': 3, 'h': 4, 'l': 5, 'c': 6, 'i': 7, 'j': 8, 'd': 9, 'm': 10, 'n': 11, 't': 12, 'r': 13, 'b': 14, 'p': 15, 'z': 16, 'g': 17, 'f': 18, 'u': 19, 'k': 20, 'o': 21, 'v': 22, 'x': 23, 'q': 24, 'w': 25}


In [11]:

def get_dict_2(corpus:list) -> (dict,dict):

    raw_text = ''.join(corpus)   #periods have not been removed for better results

    # creates mapping of unique characters to integers
    chars = sorted(list(set(raw_text)))
    char_to_int = dict((c, i) for i, c in enumerate(chars))
    int_to_char = dict((i, c) for i, c in enumerate(chars))# Prints the total characters and character vocab size
    print(f'Corpus of {len(raw_text)} words, vocab reduced to {len(chars)}.')
    return char_to_int, int_to_char
          

startTime = time.time()
char_dict,_ = get_dict_2(df_prob.Names.values)
endTime = time.time() 
howMuchTime = endTime - startTime
print(str(howMuchTime) + " sec")

Corpus of 12661428 words, vocab reduced to 26.
0.20739054679870605 sec


# Data split

In [12]:
list_of_names = list(df_prob.Names.values)
max_seq = 30
def encode_name(name):
    encoded = list()
    for char in name:
        encoded.append(char_dict[char])
        if len(encoded) == 30:
            return [int(''.join(map(str, encoded)))]
    if len(encoded) < 30:
        encoded = encoded + (max_seq-len(encoded))*[0]
    
    return encoded

X = np.array([np.array(encode_name(name)) for name in list_of_names]) # char-encode each name 
# reshapes X to be [samples, time steps, features]
X = np.reshape(X, (len(X), max_seq, 1))
y = [1 if gender=='M' else 0 for gender in df_prob.Gender.values]


In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)


# Building the model

In [14]:
embedding_dim =192
model = Sequential()
model.add(Embedding(len(char_dict), embedding_dim, input_length=max_seq))
model.add(LSTM(192))
#model.add(Dropout(0.2))
#model.add(Dense(300, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics="acc")
model.summary()

2022-10-10 13:54:39.598347: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-10-10 13:54:39.598978: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2022-10-10 13:54:39.599039: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ubuntu): /proc/driver/nvidia/version does not exist
2022-10-10 13:54:39.600849: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 30, 192)           4992      
                                                                 
 lstm (LSTM)                 (None, 192)               295680    
                                                                 
 dense (Dense)               (None, 1)                 193       
                                                                 
Total params: 300,865
Trainable params: 300,865
Non-trainable params: 0
_________________________________________________________________


In [15]:
from keras.callbacks import EarlyStopping
earlyStop=EarlyStopping(monitor="val_loss",verbose=2,mode='min',patience=3)
model.fit(np.array(X_train), np.array(y_train), validation_split=0.15, epochs = 30, batch_size=128, callbacks=[earlyStop])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 16: early stopping


<keras.callbacks.History at 0x7f9ca4149820>

In [16]:
model.evaluate(np.array(X_test), np.array(y_test), batch_size=128)



[0.26834672689437866, 0.8666755557060242]