In [1]:
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from os import walk
import pandas as pd, numpy as np
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import MinMaxScaler

2022-10-17 11:39:58.357922: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-17 11:39:58.839612: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-10-17 11:39:58.952027: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-10-17 11:39:58.952069: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if yo

# Data loading

In [2]:
data_files = []
data_path = './data'
for (dirpath, dirnames, filenames) in walk(data_path):
    data_files.extend(filenames)
    break


In [3]:
df = pd.DataFrame(columns=['Name', 'Gender', 'Count'])
for file in data_files:
    df = pd.concat([df, pd.read_csv(f'{data_path}/{file}', names=['Name', 'Gender','Count'])], ignore_index=True)
print(df.shape)

(2052781, 3)


In [4]:
print(df.head())

       Name Gender  Count
0    Ashley      F  38457
1   Jessica      F  38358
2    Amanda      F  25035
3  Brittany      F  24982
4     Sarah      F  24635


## adding more data
df_ext = pd.read_csv('name_gender_dataset.csv').drop(['Probability'], axis=1)
names_set_ext = set(df_ext.Name.values)
names_set = set(df_prob.Name.values)
print(len(names_set_ext - names_set)/len(names_set)*100, '% of increment')


print(df_ext.head())

In [5]:
#df = pd.concat([df, df_ext], ignore_index=True)

# Process data

In [6]:
total_men = df.loc[df.Gender=='M'].shape[0]
total_women = df.loc[df.Gender=='F'].shape[0]
print(total_men)
print(total_women)
df_prob = df.copy()

842915
1209866


Count feature is not very useful *per se*. Change to observed probability.

df_prob.loc[df.Gender=='M', 'Count'] = df.loc[df.Gender=='M'].Count.apply(lambda count: 100*count/total_men)
df_prob.loc[df.Gender=='F', 'Count'] = df.loc[df.Gender=='F'].Count.apply(lambda count: 100*count/total_women)
df_prob.rename({"Count":"Prob"}, inplace=True, axis='columns')
print(df_prob.head())

Sanity check. Whether there are names that are both male and female. If so, get the one with greatest probability.

shared_names = set(df_prob.loc[df_prob.Gender=='F'].Name.values).intersection(set(df_prob.loc[df_prob.Gender=='M'].Name.values))
print(df_prob.Name.value_counts())
print(df_prob.shape)
print(len(shared_names))
if len(shared_names) > 0:
    df_prob = df_prob.sort_values('Prob', ascending=False).drop_duplicates(subset='Name', keep='first').sort_index()
print(df_prob.shape)

In [7]:
df_prob.Name = df_prob.Name.apply(lambda x: x.lower())

In [8]:
import time

# given a list of names (Corpus), create a char-to-int dictionary
def get_dict(corpus:list) -> dict:
    char_dict = dict()
    pos = 0
    for name in corpus:
        chars = set(name.lower())
        for char in chars:
            if char not in char_dict:
                #print(name)
                char_dict[char] = pos
                pos += 1
    return char_dict, {v:k for k,v in char_dict.items()}
startTime = time.time()
char_dict, _ = get_dict(df_prob.Name.values)
endTime = time.time() 
howMuchTime = endTime - startTime
print(str(howMuchTime) + " sec")

1.860532283782959 sec


In [9]:
len(char_dict)
print(char_dict)

{'s': 0, 'l': 1, 'e': 2, 'y': 3, 'h': 4, 'a': 5, 'j': 6, 'i': 7, 'c': 8, 'd': 9, 'n': 10, 'm': 11, 'b': 12, 'r': 13, 't': 14, 'p': 15, 'z': 16, 'g': 17, 'f': 18, 'u': 19, 'k': 20, 'o': 21, 'v': 22, 'x': 23, 'q': 24, 'w': 25}


In [10]:

def get_dict_2(corpus:list) -> (dict,dict):

    raw_text = ''.join(corpus)   #periods have not been removed for better results

    # creates mapping of unique characters to integers
    chars = sorted(list(set(raw_text)))
    char_to_int = dict((c, i) for i, c in enumerate(chars))
    int_to_char = dict((i, c) for i, c in enumerate(chars))# Prints the total characters and character vocab size
    print(f'Corpus of {len(raw_text)} words, vocab reduced to {len(chars)}.')
    return char_to_int, int_to_char
          

startTime = time.time()
char_dict,_ = get_dict_2(df_prob.Name.values)
endTime = time.time() 
howMuchTime = endTime - startTime
print(str(howMuchTime) + " sec")

Corpus of 12661428 words, vocab reduced to 26.
0.20936822891235352 sec


## Sample weight

In [11]:
def get_probs(df:pd.DataFrame) -> pd.DataFrame:
    df_grouped = df.groupby(by=['Name']).sum().rename({'Count':'Total'}, axis=1)
    df_probability = df.join(df_grouped.drop(['Gender'],axis=1), on='Name')
    df_probability['Prob'] = df_probability.Count/df_probability.Total
    return df_probability

In [12]:
df_new = get_probs(df_prob)
print(df_prob.loc[df_prob.Name=='ashley'])
print(df_new.loc[df_new.Prob==1])

           Name Gender  Count
0        ashley      F  38457
16418    ashley      M    168
28127    ashley      F     15
32633    ashley      M     50
42282    ashley      M     39
...         ...    ...    ...
1995909  ashley      F  26601
2012722  ashley      M    112
2028322  ashley      M     30
2031371  ashley      F  54855
2045066  ashley      M    409

[224 rows x 3 columns]
              Name Gender Count  Total Prob
5514      caleesha      F    17     17  1.0
6537       miyisha      F    14     14  1.0
7452        silken      F    12     12  1.0
7600       cashala      F    11     11  1.0
8643     shaunique      F    10     10  1.0
...            ...    ...   ...    ...  ...
2052656    termell      M     5      5  1.0
2052698     tryell      M     5      5  1.0
2052710     undrae      M     5      5  1.0
2052740  willfredo      M     5      5  1.0
2052760       yler      M     5      5  1.0

[21556 rows x 5 columns]


In [13]:
df_shuffled = df_new.sample(frac=1).sample(frac=1)

# Data split

In [16]:
list_of_names = list(df_shuffled.Name.values)
max_seq = 30
def encode_name(name):
    encoded = list()
    for char in name:
        encoded.append(char_dict[char])
        if len(encoded) == 30:
            return [int(''.join(map(str, encoded)))]
    if len(encoded) < 30:
        encoded = encoded + (max_seq-len(encoded))*[0]
    
    return encoded

X = np.array([np.array(encode_name(name)) for name in list_of_names]) # char-encode each name 
print(len(X))
# reshapes X to be [samples, time steps, features]
X = np.reshape(X, (len(X), max_seq, 1))
#weights = df_new.Prob.values
weights = df_new.Count.values
y = [(1, weight) if gender=='M' else (0, weight) for gender, weight in zip(df_shuffled.Gender.values, weights)]


2052781


In [16]:
print(len(X),
     len(y))
print(df_prob.shape)

2052781 2052781
(2052781, 3)


In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, shuffle=False) # dont shuffle otherwise weights messed


In [18]:
print(y_train[:3])

[(0, 38457), (1, 38358), (0, 25035)]


In [19]:
weights_train = np.array([packed[1] for packed in y_train])
weights_test = [packed[1] for packed in y_test]
y_train = np.array([packed[0] for packed in y_train])
y_test = [packed[0] for packed in y_test]


In [20]:
print(y_train[:4])
print(weights_train[:4])



[0 1 0 0]
[38457 38358 25035 24982]


# Building the model

In [21]:
embedding_dim =192
model = Sequential()
model.add(Embedding(len(char_dict), embedding_dim, input_length=max_seq))
model.add(LSTM(192))
#model.add(Dropout(0.2))
#model.add(Dense(300, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam' , metrics="acc", weighted_metrics=['acc'])
model.summary()

2022-10-14 14:58:44.002613: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-10-14 14:58:44.002656: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2022-10-14 14:58:44.002676: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ubuntu): /proc/driver/nvidia/version does not exist
2022-10-14 14:58:44.004259: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 30, 192)           4992      
                                                                 
 lstm (LSTM)                 (None, 192)               295680    
                                                                 
 dense (Dense)               (None, 1)                 193       
                                                                 
Total params: 300,865
Trainable params: 300,865
Non-trainable params: 0
_________________________________________________________________


In [23]:
from keras.callbacks import EarlyStopping
earlyStop=EarlyStopping(monitor="val_loss",verbose=2,mode='min',patience=3)
model.fit(np.array(X_train), np.array(y_train),  validation_split=0.15, epochs = 30, sample_weight=weights_train, batch_size=128, callbacks=[earlyStop])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 20: early stopping


<keras.callbacks.History at 0x7f5e720e8640>

In [24]:
model.evaluate(np.array(X_test), np.array(y_test), batch_size=128)



[0.4350805878639221, 0.8064906597137451, 0.8064906597137451]