In [28]:
import pandas as pd
import os

boy_df = pd.read_csv('./boys_names_sweden.csv')
girl_df = pd.read_csv('./girls_names_sweden.csv')

In [33]:

def clean_df(df):
    df = df.dropna()
    cleaned = df.iloc[1:, 0].reset_index().drop(columns = 'index').rename(columns = {"Unnamed: 0": "names"})
    return cleaned

boys = clean_df(boy_df)
girls = clean_df(girl_df)

In [45]:
boys_list = boys.names.tolist()
girls_list = girls.names.tolist()

In [44]:
boy_set = set(boys_list)
girl_set = set(girls_list)

boy_and_girl_set = boy_set.intersection(girl_set)
boy_and_girl_names = list(boy_and_girl_set)

boy_names = list(boy_set - boy_and_girl_set)
girl_names = list(girl_set - boy_and_girl_set)


In [51]:
import json
filenames = ['boy_names.json', 'girl_names.json', 'boy_and_girl_names.json']
name_dict = {}
for filename in filenames:
    with open(filename) as f:
        name_dict.update({filename.split('.')[0] : json.load(f)})
        
boy_names = name_dict["boy_names"]
girl_names = name_dict["girl_names"]
boy_and_girl_names = name_dict["boy_and_girl_names"]

In [60]:
def is_male(row, column_name):
    name = row[column_name]
    
    if name in boy_names:
        return 1
    elif name in girl_names:
        return -1
    else:
        return 0

def irecommend_df(csv_file_path : str, name_column : str , sensitive_columns : list = None):
    
    df = pd.read_csv(csv_file_path)
    df['is_male'] = df.apply(is_male, axis = 1, args = [names])
    
    if sensitive_columns:
        try:
            df = df.drop(columns = sensitive_columns)
        except:
            pass
    
    return df

def save_df(df, filepath = './bengts_data.csv'):
    df.to_csv(filepath)
    


In [79]:
df = pd.DataFrame(boy_names + girl_names, columns = ['names'])
df['is_male'] = df.apply(is_male, axis = 1, args = ['names'])
df

Unnamed: 0,names,is_male
0,Benji,1
1,Amar,1
2,Roberto,1
3,Wille,1
4,Arian,1
...,...,...
2006,Antonia,-1
2007,Rania,-1
2008,Asia,-1
2009,Aleah,-1


In [80]:
import numpy as np
from numpy import array
from numpy import argmax
from keras.layers.core import Dense, Activation, Dropout
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM

filepath = 'd:/AS_Data/temp/name_test.csv'
max_rows = 500000 # Reduction due to memory limitations


# In the case of a middle name, we will simply use the first name only
df['names'] = df['names'].apply(lambda x: str(x).split(' ', 1)[0])

# Sometimes people only but the first letter of their name into the field, so we drop all name where len <3
df.drop(df[df['names'].str.len() < 3].index, inplace=True)

Unnamed: 0,names,is_male
0,Benji,1
1,Amar,1
2,Roberto,1
3,Wille,1
4,Arian,1
...,...,...
2006,Antonia,-1
2007,Rania,-1
2008,Asia,-1
2009,Aleah,-1


In [82]:
predictor_col = 'names'
result_col = 'is_male'

accepted_chars = 'abcdefghijklmnopqrstuvwxyzöäü-'

word_vec_length = min(df[predictor_col].apply(len).max(), 25) # Length of the input vector
char_vec_length = len(accepted_chars) # Length of the character vector
output_labels = 2 # Number of output labels

print(f"The input vector will have the shape {word_vec_length}x{char_vec_length}.")

The input vector will have the shape 12x30.


In [101]:
# Define a mapping of chars to integers
char_to_int = dict((c, i) for i, c in enumerate(accepted_chars))
int_to_char = dict((i, c) for i, c in enumerate(accepted_chars))

# Removes all non accepted characters
def normalize(line):
    return [c.lower() for c in line if c.lower() in accepted_chars]

# Returns a list of n lists with n = word_vec_length
def name_encoding(name):

    # Encode input data to int, e.g. a->1, z->26
    integer_encoded = [char_to_int[char] for i, char in enumerate(name) if i < word_vec_length]
    
    # Start one-hot-encoding
    onehot_encoded = list()
    
    for value in integer_encoded:
        # create a list of n zeros, where n is equal to the number of accepted characters
        letter = [0 for _ in range(char_vec_length)]
        letter[value] = 1
        onehot_encoded.append(letter)
        
    # Fill up list to the max length. Lists need do have equal length to be able to convert it into an array
    for _ in range(word_vec_length - len(name)):
        onehot_encoded.append([0 for _ in range(char_vec_length)])
        
    return onehot_encoded

# Encode the output labels
def label_encoding(gender_series):
    labels = np.empty((0, 2))
    for i in gender_series:
        if i == 1:
            labels = np.append(labels, [[1,0]], axis=0)
        else:
            labels = np.append(labels, [[0,1]], axis=0)
    return labels

In [145]:
# Split dataset in 60% train, 20% test and 20% validation
train, validate, test = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])

# Convert both the input names as well as the output lables into the discussed machine readable vector format
train_x =  np.asarray([np.asarray(name_encoding(normalize(name))) for name in train[predictor_col]])
train_y = label_encoding(train.is_male)

validate_x = np.asarray([name_encoding(normalize(name)) for name in validate[predictor_col]])
validate_y = label_encoding(validate.is_male)

test_x = np.asarray([name_encoding(normalize(name)) for name in test[predictor_col]])
test_y = label_encoding(test.is_male)


In [127]:
hidden_nodes = int(2/3 * (word_vec_length * char_vec_length))
print(f"The number of hidden nodes is {hidden_nodes}.")

The number of hidden nodes is 240.


In [146]:
# Build the model
print('Build model...')
model = Sequential()
model.add(LSTM(hidden_nodes, return_sequences=False, input_shape=(word_vec_length, char_vec_length)))
model.add(Dropout(0.2))
model.add(Dense(units=output_labels))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

Build model...


In [147]:
from keras.callbacks import EarlyStopping
es = EarlyStopping(monitor = 'val_loss', patience = 10)
batch_size=32
print(train_x.shape)
model.fit(train_x, train_y, batch_size=batch_size, epochs=50, validation_data=(validate_x, validate_y), callbacks = [es])

(1204, 12, 30)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50


<tensorflow.python.keras.callbacks.History at 0x14292c898>

In [130]:
model.save("gender_identifier")

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: gender_identifier/assets


In [135]:
import keras
# It can be used to reconstruct the model identically.
reconstructed_model = keras.models.load_model("gender_identifier")

# Let's check:
np.testing.assert_allclose(
    model.predict(validate_x), reconstructed_model.predict(validate_x)
)

In [144]:
validate['predicted_is_male'] = [1 if prediction[0] > prediction[1] else -1 for prediction in reconstructed_model.predict(validate_x)]
validate[validate['is_male'] != validate['is_male']].head()

Unnamed: 0,names,is_male,predicted_gender,predicted_is_male


In [140]:
reconstructed_model.summary()
# Input is 12 x 30, hence the longest name permissable with this trained dataframe is 12 characters
def predict_unsure(row, col_name, model):
    if row['is_male'] == 0:
        name = row['col_name']
        encoded_name = np.asarray(name_encoding(normalize(name)))
        prediction = model.predict(encoded_name)
        return prediction
    else:
        return row['is_male']

def predict_irecommend(df, col_name = 'names'):
    
    neural_network = keras.models.load_model("gender_identifier")
    
    predictor_col = 'names'
    unsure_names = df[df.ismale == 0]
    names_array =  np.asarray([np.asarray(name_encoding(normalize(name))) for name in unsure_names[col_name]])
    df['predicted_is_male'] = df.apply(predict_unsure, axis = 1, args = [col_name, neural_network])

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 240)               260160    
_________________________________________________________________
dropout_5 (Dropout)          (None, 240)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 482       
_________________________________________________________________
activation_5 (Activation)    (None, 2)                 0         
Total params: 260,642
Trainable params: 260,642
Non-trainable params: 0
_________________________________________________________________


In [188]:
model.predict(np.asarray([name_encoding(normalize('khalid'))]))

array([[0.9016669 , 0.09833317]], dtype=float32)

In [189]:
k_names = ['emily', 'rory', 'alfred', 'georgie', 'mia', 'benton']
a = np.asarray([name_encoding(normalize(name)) for name in k_names])
model.predict(a)

array([[0.4092723 , 0.59072775],
       [0.6327217 , 0.36727825],
       [0.9937109 , 0.00628912],
       [0.3433697 , 0.65663034],
       [0.0010017 , 0.99899834],
       [0.9700529 , 0.02994711]], dtype=float32)

In [154]:
validate_x.shape

(402, 12, 30)