# PROBLEM STATEMENT

**"Implementing a LSTM Network"**

# Loading the Dataset and Interpretation

In [1]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, random

In [2]:
df = pd.read_csv("./NationalNames.csv")

df.head(5)

Unnamed: 0,Id,Name,Year,Gender,Count
0,1,Mary,1880,F,7065.0
1,2,Anna,1880,F,2604.0
2,3,Emma,1880,F,2003.0
3,4,Elizabeth,1880,F,1939.0
4,5,Minnie,1880,F,1746.0


In [3]:
print(f"ROWS : {df.shape[0]} \t COLUMNS : {df.shape[1]}")

ROWS : 401599 	 COLUMNS : 5


In [4]:
df[['Year','Count']].describe()

Unnamed: 0,Year,Count
count,401599.0,401598.0
mean,1920.117724,197.62014
std,15.854875,1532.320613
min,194.0,5.0
25%,1912.0,7.0
50%,1922.0,12.0
75%,1932.0,36.0
max,1943.0,80248.0


<hr>

# Data Preprocessing

### Label Encoding of Gender

In [5]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df['Gender'] = le.fit_transform(df['Gender'])

df.head(5)

print(df['Gender'].value_counts())

0    222497
1    179101
2         1
Name: Gender, dtype: int64


### Creating the subset dataframe to be used

In [6]:
new_df = df.groupby('Name').mean()['Gender'].reset_index()

new_df.head(5)

Unnamed: 0,Name,Gender
0,Aage,1.0
1,Aagot,0.0
2,Aarne,1.0
3,Aaron,0.790123
4,Ab,1.0


In [7]:
new_df['Gender'] = new_df['Gender'].round().astype(int)
new_df.head(5)

Unnamed: 0,Name,Gender
0,Aage,1
1,Aagot,0
2,Aarne,1
3,Aaron,1
4,Ab,1


<hr>

# Creating the Vocabulary

In [8]:
import string

letters = list(string.ascii_letters)

print(letters)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']


### Representing every letter numerically

In [9]:
vocab = dict(zip(letters, range(1,27)))

print(vocab)

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}


### Numerical Interpretation of the Names

In [10]:
def word_num(data):
  for i in range(0, data.shape[0]):
    sequence = [vocab[letters.lower()] for letters in data['Name'][i]]
    data['Name'][i]  = sequence
  return data

In [11]:
import warnings

warnings.filterwarnings("ignore")

new_df = word_num(new_df)

new_df.head(5)

Unnamed: 0,Name,Gender
0,"[1, 1, 7, 5]",1
1,"[1, 1, 7, 15, 20]",0
2,"[1, 1, 18, 14, 5]",1
3,"[1, 1, 18, 15, 14]",1
4,"[1, 2]",1


<hr>

# Sample LSTM

In [50]:
import tensorflow as tf
from keras.layers import LSTM, Embedding, Dense, Input
from keras.models import Sequential,Model

In [46]:
samp_model = Sequential()

In [47]:
samp_model.add(Embedding(input_dim=1000, output_dim=80))

In [48]:
samp_model.add(LSTM(4))

In [49]:
samp_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, None, 80)          80000     
                                                                 
 lstm_6 (LSTM)               (None, 4)                 1360      
                                                                 
Total params: 81360 (317.81 KB)
Trainable params: 81360 (317.81 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


<hr>

# LSTM for the given Dataset

In [53]:
X = new_df['Name'].values
y = new_df['Gender'].values

In [54]:
from keras.preprocessing.sequence import pad_sequences

X = pad_sequences(X, maxlen=10, padding='pre')

In [55]:
vocab_size = len(vocab)+1

input = Input(shape=(10,))

emb1 = Embedding(input_dim = vocab_size, output_dim=5)(input)

mm1 = LSTM(128, return_sequences = True)(emb1)
mm1 = LSTM(32, return_sequences = False)(mm1)
mm1 = Dense(10, activation='relu')(mm1)
mm1 = Dense(1, activation='sigmoid')(mm1)

model1 = Model(inputs=input, outputs=mm1)

model1.summary()

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, 10)]              0         
                                                                 
 embedding_8 (Embedding)     (None, 10, 5)             135       
                                                                 
 lstm_9 (LSTM)               (None, 10, 128)           68608     
                                                                 
 lstm_10 (LSTM)              (None, 32)                20608     
                                                                 
 dense_12 (Dense)            (None, 10)                330       
                                                                 
 dense_13 (Dense)            (None, 1)                 11        
                                                                 
Total params: 89692 (350.36 KB)
Trainable params: 89692 (35

In [56]:
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [57]:
hist1 = model1.fit(X, y, epochs = 5, batch_size = 32, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<hr>

# Predicting using the Model

In [62]:
name = "Ruthvika".lower()

In [63]:
name_seq = [vocab[i] for i in name]

name_seq

[18, 21, 20, 8, 22, 9, 11, 1]

In [64]:
X_test = pad_sequences([name_seq], maxlen=10, padding='pre')
X_test

array([[ 0,  0, 18, 21, 20,  8, 22,  9, 11,  1]], dtype=int32)

In [65]:
prediction = model1.predict(X_test)

if prediction > 0.5:
  print(f"{name} is MALE.")
else:
  print(f"{name} is FEMALE.")

ruthvika is FEMALE.


<hr><hr>