### PROBLEM STATEMENT
*"Implement RNN for demonstrating the concept of Named entity"*

- Ayushmaan Das (E0121037)

### Loading the Dataset and Interpretation

In [None]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, random

In [None]:
df = pd.read_csv("./NationalNames.csv")

df.head(5)

Unnamed: 0,Id,Name,Year,Gender,Count
0,1,Mary,1880,F,7065.0
1,2,Anna,1880,F,2604.0
2,3,Emma,1880,F,2003.0
3,4,Elizabeth,1880,F,1939.0
4,5,Minnie,1880,F,1746.0


In [None]:
print(f"ROWS : {df.shape[0]} \t COLUMNS : {df.shape[1]}")

ROWS : 708269 	 COLUMNS : 5


In [None]:
df[['Year','Count']].describe()

Unnamed: 0,Year,Count
count,708269.0,708268.0
mean,1936.365261,252.725649
std,22.676583,2047.597817
min,197.0,5.0
25%,1920.0,7.0
50%,1938.0,13.0
75%,1956.0,38.0
max,1970.0,99680.0


<hr>

### Data Preprocessing

Label Encoding of Gender

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df['Gender'] = le.fit_transform(df['Gender'])

df.head(5)

print(df['Gender'].value_counts())

0    410684
1    297584
2         1
Name: Gender, dtype: int64


Creating the subset dataframe to be used

In [64]:
new_df = df.groupby('Name').mean()['Gender'].reset_index()

new_df.head(5)

Unnamed: 0,Name,Gender
0,Aage,1.0
1,Aagot,0.0
2,Aarne,1.0
3,Aaron,0.689394
4,Aaronette,0.0


In [65]:
new_df['Gender'] = new_df['Gender'].round().astype(int)
new_df.head(5)

Unnamed: 0,Name,Gender
0,Aage,1
1,Aagot,0
2,Aarne,1
3,Aaron,1
4,Aaronette,0


<hr>

### Creating the Vocabulary

In [66]:
import string

letters = list(string.ascii_letters)

print(letters)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']


Representing every letter numerically

In [70]:
vocab = dict(zip(letters, range(1,27)))

print(vocab)

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}


Numerical Interpretation of the Names

In [68]:
def word_num(data):
  for i in range(0, data.shape[0]):
    sequence = [vocab[letters.lower()] for letters in data['Name'][i]]
    data['Name'][i]  = sequence
  return data

In [69]:
import warnings

warnings.filterwarnings("ignore")

new_df = word_num(new_df)

new_df.head(5)

Unnamed: 0,Name,Gender
0,"[1, 1, 7, 5]",1
1,"[1, 1, 7, 15, 20]",0
2,"[1, 1, 18, 14, 5]",1
3,"[1, 1, 18, 15, 14]",1
4,"[1, 1, 18, 15, 14, 5, 20, 20, 5]",0


<hr>

### Model Building

In [72]:
X = new_df['Name'].values
y = new_df['Gender'].values

Padding the Sequences

- The purpose of sequence padding is to ensure that all sequences have the same length.
- It’s commonly used in natural language processing (NLP) tasks when dealing with variable-length text data (e.g., sentences or documents).


In [79]:
from keras.preprocessing.sequence import pad_sequences

X = pad_sequences(X, maxlen=10, padding='pre')

In [75]:
import tensorflow as tf

from keras.models import Sequential, Model
from keras.layers import Dense, Input, SimpleRNN, Embedding

In [77]:
vocab_size = len(vocab)+1

input = Input(shape=(10,))

emb = Embedding(input_dim = vocab_size, output_dim=128)(input)

mm = SimpleRNN(128, return_sequences = True)(emb)
mm = SimpleRNN(32, return_sequences = False)(mm)
mm = Dense(10, activation='relu')(mm)
mm = Dense(1, activation='sigmoid')(mm)

model = Model(inputs=input, outputs=mm)

In [78]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 10)]              0         
                                                                 
 embedding (Embedding)       (None, 10, 128)           3456      
                                                                 
 simple_rnn (SimpleRNN)      (None, 10, 128)           32896     
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 32)                5152      
                                                                 
 dense (Dense)               (None, 10)                330       
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 41845 (163.46 KB)
Trainable params: 41845 (163.

<hr>

### Model Compilation and Training

In [81]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [82]:
hist = model.fit(X, y, epochs = 5, batch_size = 32, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<hr>

### Predicting using the Model

In [98]:
name = "Zahan".lower()

In [99]:
name_seq = [vocab[i] for i in name]

name_seq

[26, 1, 8, 1, 14]

In [100]:
X_test = pad_sequences([name_seq], maxlen=10, padding='pre')
X_test

array([[ 0,  0,  0,  0,  0, 26,  1,  8,  1, 14]], dtype=int32)

In [101]:
prediction = model.predict(X_test)

if prediction > 0.5:
  print(f"{name} is MALE.")
else:
  print(f"{name} is FEMALE.")

zahan is MALE.


<hr><hr>