### All Libraries Needed Through the Tasks

In [88]:
import numpy as np
import pandas as pd
import random
import string
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch import optim
import wget
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

###  Definition of Import  File Function 


In [209]:
def download(fid, fn):
    durl = 'https://drive.google.com/' + 'uc?export=download&id='+ fid
    print('downloading from', durl)
    wget.download(durl, fn)
    

### Load Data From File 

In [190]:
file = 'yob2018.txt'
with open(file,"r") as f:
    train_names = [r.split(',')[0].lower().strip('\n') for r in f]

#ramdomly chose 5000 names from whole file.    
data = random.choices(train_names, k=5000)
print("Randomly chosen data: ",data[:5])

#The characters are a-z (26 characters)  
letters = string.ascii_lowercase
n_letters = len(letters) + 1 # Plus EOS marker


Randomly chosen data:  ['uziah', 'nairobi', 'nayden', 'zana', 'ario']


### Task 1

#### Converting data to one_hot_encoded,  for input tensor, each batch contains 1x N×M characters, where 1 is batch size,  N is the name length  and  M is the total letters with EOS marker  (#27). For input x, each name starts from first letter to last letter. For target, each input shifted by one which starts from second letter of input to EOS marker.

In [191]:
# one-hot matrix from first to last letters(not including the EOS) for input
def inputTensor(name):
    tensor = torch.zeros(1, len(name), n_letters)
    for li in range(len(name)):
        letter = name[li]
        tensor[0][li][letters.find(letters)] = 1
    return tensor

# LongTensor from second letter to end(EOS) for target
def targetTensor(name):
    letter_indexes = [letters.find(name[li]) for li in range(1, len(name))]
    letter_indexes.append(n_letters - 1) #EOS
    return torch.LongTensor(letter_indexes)



### CharNet Model Definition (batch size is one, in each step one name is thrown in to the LSTM)

In [192]:
class CharNet(nn.Module):
    def __init__(self, output_size,embedding_dim, hidden_dim, n_layers):
        super(CharNet, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.output_size = output_size


        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)
    def forward(self, x, hidden):
        
        lstm_out, hidden = self.lstm(x, hidden)
        out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        out = self.fc(out)
        #print("rr",out.size())
        
        return out, hidden
    
    def init_hidden(self):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, 1, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, 1, self.hidden_dim).zero_())
        return hidden

### Training the  Model 


In [193]:
output_size = 27
embedding_dim = 27
hidden_dim = 128
n_layers = 3

model = CharNet(output_size, embedding_dim, hidden_dim, n_layers)

lr=0.01
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

epochs=200  
batch_size=100    

model.train()
   
for e in range(epochs+1):
    train_loss = 0.0
    n = len(data)
    #randomly permutate the data array
    idx = np.random.permutation(n)
    for i in range(batch_size):
        optimizer.zero_grad() 
        
        inputs = inputTensor(data[idx[i]]) # tranform the name into one hot tensor
        targets = targetTensor(data[idx[i]]) # obtain the targets

        
        h = model.init_hidden()
        output, h = model(inputs, h)
        output=output.squeeze()   
        
        loss = criterion(output,targets)
            
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 5)


        optimizer.step()
        train_loss += loss.item()
            
    if (e%50==0): 
        print("Epoch: {}/{}...".format(e, epochs),
                      "Loss: {:.4f}...".format(train_loss/batch_size))
        
torch.save(model.state_dict(), 'CharNet.pth')       
            

Epoch: 0/200... Loss: 2.6911...
Epoch: 50/200... Loss: 2.4826...
Epoch: 100/200... Loss: 2.4436...
Epoch: 150/200... Loss: 2.5146...
Epoch: 200/200... Loss: 2.4607...


### Model Google Drive Link

In [210]:
#https://drive.google.com/open?id=1HpJ_k1qrcKLrvZx-uk2kLM0XfEo2KuUR
fid = '1HpJ_k1qrcKLrvZx-uk2kLM0XfEo2KuUR'
download(fid, 'CharNet.pth')

output_size = 27
embedding_dim = 27
hidden_dim = 128
n_layers = 3

model = CharNet(output_size, embedding_dim, hidden_dim, n_layers)

model.load_state_dict(torch.load("CharNet.pth"))

downloading from https://drive.google.com/uc?export=download&id=1HpJ_k1qrcKLrvZx-uk2kLM0XfEo2KuUR


IncompatibleKeys(missing_keys=[], unexpected_keys=[])

### Genarate name 

In [215]:
# max lengh for generating name
max_length = 10

# sample from a starting letter
def generate_name(net,start_letter):
    global letters
    global n_letters
    net.eval()
    with torch.no_grad():
        
        inputs = inputTensor(start_letter)
        #print(inputs)
        #inputs = inputs.to(device)
        output_name = start_letter
        #print(output_name)
        
        h = net.init_hidden()
        for i in range(max_length):
            #print(h)
            output, h = net.forward(inputs, h)
            #print(output)
            #print(h)
            p = F.softmax(output, dim=1).data
            top_ch = np.arange(n_letters)
            p = p.numpy().squeeze()
                
            index = np.random.choice(top_ch, p=p/p.sum())
            if index == n_letters - 1:
                break
            else:
                letter = letters[index]
                output_name+=letter
            inputs = inputTensor(letter)
        
        #capitalize starts letter of name   
        print(output_name.capitalize())
    

for ch in 'abcmn':
    generate_name(model,ch)    

Aara
Bnura
Cor
Mlin
Nuyrctlana


### Task 2

#### Creating 5000 negetive samples randomly, then combineing them with randomly selected original 5000 names in task 1

In [216]:
negative_samples=[]
for i in range(5000):
    st=''.join(random.choice(string.ascii_lowercase) for x in range(random.randint(3, 8)))
    negative_samples.append(st)

positive_samples=data


#positive_samples=positive_samples.lower()
positive_y = np.ones(len(positive_samples))
negative_y = np.zeros(len(negative_samples))                   

x = np.concatenate((positive_samples,negative_samples), axis=0) 
target= np.concatenate((positive_y,negative_y), axis=0) 
labeled_data= list(zip(x, target))
np.random.shuffle(labeled_data)                      

# final input and target data for task2
x, y = zip(*labeled_data) 
print(labeled_data[:5])

[('halee', 1.0), ('pierson', 1.0), ('dvmvl', 0.0), ('bee', 1.0), ('qjlcev', 0.0)]


#### One hot encoding input

In [224]:
def inputTensor(name):
    tensor = torch.zeros(1, len(name), n_letters)
    for li in range(len(name)):
        letter = name[li]
        tensor[0][li][letters.find(letter)] = 1
    return tensor


#### ClassNet Model Definition ( batch size is 1, each time one name is thrown into the LSTM)

In [225]:
class ClassNet(nn.Module):
    def __init__(self, output_size, embedding_dim, hidden_dim, n_layers):
        super(ClassNet, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x, hidden):
        
        lstm_out, hidden = self.lstm(x, hidden)
        out = lstm_out.contiguous().view(-1, self.hidden_dim)
        out = self.fc(out)
        out = self.sigmoid(out)
        out = out.view(1, -1)
        
        
        #get last letter of name
        out = out[:,-1]
        
        return out, hidden
    
    def init_hidden(self):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, 1, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, 1, self.hidden_dim).zero_())
        return hidden

#### Initialize  the  Model

In [226]:
output_size = 1
embedding_dim = 27
hidden_dim = 128
n_layers = 1

model = ClassNet(output_size, embedding_dim, hidden_dim, n_layers)


#### Train the Model

In [229]:
epochs = 100
clip = 5
batch_size=200
lr=0.005
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

model.train()

trainNum = len(x)


for e in range(epochs+1):
    
    # randomly permutate the data array
    idx = np.random.permutation(trainNum)
    
    train_loss=0.0
    for i in range(batch_size):
        
        h = model.init_hidden()
        #h = tuple([e.data for e in h])
        model.zero_grad()
        
        inputs = inputTensor(x[idx[i]])
        
        #print(inputs.size())
        
        targets = y[idx[i]]
       
        outputs, h = model(inputs,h)
        
        targets=torch.tensor([targets])
       
        loss = criterion(outputs, targets)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        train_loss += loss.item()
       
    if (e%50==0): 
        print("Epoch: {}/{}...".format(e, epochs),
                      "Loss: {:.4f}...".format(train_loss/batch_size))
torch.save(model.state_dict(), 'class.pth')         

   

Epoch: 0/100... Loss: 0.0981...
Epoch: 50/100... Loss: 0.1253...
Epoch: 100/100... Loss: 0.0416...


#### Model google drive link

In [230]:
#https://drive.google.com/open?id=1YY_8mKnEhLF8HZRy-1a-LHeHtSyIST1E
fid = '1YY_8mKnEhLF8HZRy-1a-LHeHtSyIST1E'
download(fid, 'class.pth')

output_size = 1
embedding_dim = 27
hidden_dim = 128
n_layers = 1

model = ClassNet(output_size, embedding_dim, hidden_dim, n_layers)

model.load_state_dict(torch.load("class.pth"))

downloading from https://drive.google.com/uc?export=download&id=1YY_8mKnEhLF8HZRy-1a-LHeHtSyIST1E


IncompatibleKeys(missing_keys=[], unexpected_keys=[])

#### Classification for real or fake name

In [232]:
def is_real_name(s):
    model.eval()
    with torch.no_grad():
        
        h = model.init_hidden()
        h = tuple([each.data for each in h])    

        model.eval()
       
        h = tuple([each.data for each in h])
        
        inputs = inputTensor(s)
        
        pred, h = model(inputs, h)
               
        if(pred>=threshhold):
            print("%s is Real!" % s)
        else:
            print("%s is Fake!" % s)
        
        
threshhold=0.5    

#select 10 names from our x
S=x[50:60]
for s in S:
    is_real_name(s)
        


zion is Real!
jaileigh is Real!
lilyonna is Real!
sgltryc is Fake!
makaiya is Real!
uwthio is Fake!
azaryah is Real!
timur is Fake!
davia is Real!
tyaira is Real!
