# 1. Predicting Gender of Brazilian Names Using Machine Learning


## 1.1 Necessary libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd                       
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from tensorflow import keras
from keras import backend as K
from keras.layers import Dense, Dropout, Flatten, GRU, SimpleRNN, LSTM, Bidirectional, Activation, TimeDistributed
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.callbacks import EarlyStopping
from tensorflow.keras import layers
from tensorflow.keras.regularizers import l2
import matplotlib.pyplot as plt

## 1.2 Download the dataset

In [3]:



df = pd.read_csv('/content/drive/MyDrive/gender-classification-main/dataset/nomes.csv.gz') 
df.head()

Unnamed: 0,alternative_names,classification,first_name,frequency_female,frequency_male,frequency_total,frequency_group,group_name,ratio
0,AILINE|ALEINE|ALIINE|ALINE|ALINER|ALINHE|ALINN...,F,AALINE,66.0,,66,530550,ALINE,1.0
1,ARAAO|ARAO,M,AARAO,,281.0,281,3526,ARAO,1.0
2,AHARON|AROM|ARON|ARYON|HARON,M,AARON,,676.0,676,3442,ARON,1.0
3,ADA|ADAH|ADAR|ADHA|HADA,F,ABA,82.0,,82,5583,ADA,1.0
4,,M,ABADE,,57.0,57,57,ABADE,1.0


In [None]:
# Ratio = 1.0
#df = df[df['ratio'] == 1.0].copy()

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100787 entries, 0 to 100786
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   alternative_names  70745 non-null   object 
 1   classification     100787 non-null  object 
 2   first_name         100787 non-null  object 
 3   frequency_female   60484 non-null   float64
 4   frequency_male     50932 non-null   float64
 5   frequency_total    100787 non-null  int64  
 6   frequency_group    100787 non-null  int64  
 7   group_name         100787 non-null  object 
 8   ratio              100787 non-null  float64
dtypes: float64(3), int64(2), object(4)
memory usage: 6.9+ MB


### 1.2.1 Preparing the data 

In [5]:
y = df['classification'].astype("category").cat.codes.values    # y labels into numbers 0 is F and 1 is M
names = df['first_name'].apply(lambda x: x.lower())             # input names

In [6]:
print("M : " + str(sum(y==1)))
print("F : " + str(sum(y==0)))
print(len(y))

M : 45537
F : 55250
100787


## 1.3 Encoding Words

Neural networks can only learn to find patterns in numerical data, so it is necessary to convert our data into numeric values with word encoding or tokenization. 

In [7]:
#word encoding
maxlen = 20                                               # max lenght of a name
'''Define a vocabulary which corresponds to all the unique letters encountered'''
vocab = set(' '.join([str(i) for i in names]))            # creating a vocab
vocab.add('END')
len_vocab = len(vocab)
''' The dictionary maps each letter of vocabulary to a number '''
char_index = dict((c, i) for i, c in enumerate(vocab))    # creating a dictionary

# Builds an empty line with a 1 at the index of character
def set_flag(i):
    aux = np.zeros(len_vocab);
    aux[i] = 1
    return list(aux)

# Truncate names and create the matrix
def prepare_encod_names(X):
    vec_names = []
    trunc_name = [str(i)[0:maxlen] for i in X]  # consider only the first 20 characters
    for i in trunc_name:
        tmp = [set_flag(char_index[j]) for j in str(i)]
        for k in range(0,maxlen - len(str(i))):
            tmp.append(set_flag(char_index["END"]))
        vec_names.append(tmp)
    return vec_names

In [8]:
x = prepare_encod_names(names.values)   # Now the names are encod as a vector of numbers 

## 1.4 Split the data into test and train

In [9]:
# train, val, test set will be 60%, 20%, 20% of the dataset respectively
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=28)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=40)
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)
x_test = np.asarray(x_test)
y_test = np.asarray(y_test)
x_val = np.asarray(x_val)
y_val = np.asarray(y_val)

In [10]:
def data2df(x, y):
    df_x = pd.DataFrame(data=x.reshape((x.shape[0],-1)))
    df_y = pd.DataFrame(data=y.reshape((y.shape[0],-1)))
    
    df = pd.concat([df_x, df_y], axis=1)
    columns = list(df.columns)
    columns[-1] = 'class'
    df.columns = columns
    df = df.astype('int8')
    return df

In [11]:
df_train = data2df(x_train, y_train)
df_val = data2df(x_val, y_val)
df_test = data2df(x_test, y_test)

In [12]:
print(df_train)

       0  1  2  3  4  5  6  7  8  9  ...  551  552  553  554  555  556  557  \
0      0  0  0  0  0  0  0  0  0  0  ...    0    0    0    0    1    0    0   
1      0  0  0  0  0  0  0  0  0  0  ...    0    0    0    0    1    0    0   
2      0  0  0  0  0  0  0  0  0  0  ...    0    0    0    0    1    0    0   
3      1  0  0  0  0  0  0  0  0  0  ...    0    0    0    0    1    0    0   
4      0  0  0  0  0  0  0  0  0  0  ...    0    0    0    0    1    0    0   
...   .. .. .. .. .. .. .. .. .. ..  ...  ...  ...  ...  ...  ...  ...  ...   
64498  0  0  0  0  0  0  0  0  0  0  ...    0    0    0    0    1    0    0   
64499  0  0  0  0  0  1  0  0  0  0  ...    0    0    0    0    1    0    0   
64500  0  0  0  0  0  0  0  1  0  0  ...    0    0    0    0    1    0    0   
64501  0  0  0  0  0  0  0  0  0  0  ...    0    0    0    0    1    0    0   
64502  0  0  0  0  1  0  0  0  0  0  ...    0    0    0    0    1    0    0   

       558  559  class  
0        0    0      0  
1

In [13]:
train_size = round(1 - len(df_test)/(len(df_train)+len(df_test)), 2)

In [14]:
train_size

0.76

In [15]:
df = pd.concat([df_train, df_test])

## 1.5 Train models

In [23]:
!pip install pycaret
from pycaret.classification import *

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycaret
  Downloading pycaret-3.0.2-py3-none-any.whl (483 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m483.6/483.6 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyod>=1.0.8 (from pycaret)
  Downloading pyod-1.0.9.tar.gz (149 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.0/150.0 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting category-encoders>=2.4.0 (from pycaret)
  Downloading category_encoders-2.6.1-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Collecting importlib-metadata>=4.12.0 (from pycaret)
  Downloading importlib_metadata-6.6.0-py3-none-any.whl (22 kB)
Collecting deprecation>=2.1.0 (from pycaret)
  Downloading deprecation-2.1.0-py2.py3-none-any

In [31]:
#session = setup(data=df, target='class', train_size=train_size, data_split_shuffle=True)
# Assuming you have a dataset in a pandas DataFrame called 'data'
session = setup(data=df, target='class', train_size=train_size, data_split_shuffle=False)


ValueError: ignored

In [32]:
from sklearn.model_selection import train_test_split

# Assuming you have a dataset in a pandas DataFrame called 'data'
X = df.drop('class', axis=1)
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [25]:
from pycaret.classification import *


In [33]:
best_model = compare_models()

RuntimeError: ignored

In [19]:
print(best_model)

NameError: ignored