# Recurrent Neural Networks (RNN)
Credits for the idea: https://youtu.be/WEV61GmmPrk?si=rH3aM34l6iGut_HG

In [1]:
import string
import unicodedata
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

In [2]:
#dataset
!wget "https://download.pytorch.org/tutorial/data.zip"

--2023-12-04 06:50:08--  https://download.pytorch.org/tutorial/data.zip
Resolving download.pytorch.org (download.pytorch.org)... 18.160.143.101, 18.160.143.107, 18.160.143.48, ...
Connecting to download.pytorch.org (download.pytorch.org)|18.160.143.101|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2882130 (2.7M) [application/zip]
Saving to: ‘data.zip’


2023-12-04 06:50:08 (26.3 MB/s) - ‘data.zip’ saved [2882130/2882130]



In [3]:
!unzip data.zip

Archive:  data.zip
   creating: data/
  inflating: data/eng-fra.txt        
   creating: data/names/
  inflating: data/names/Arabic.txt   
  inflating: data/names/Chinese.txt  
  inflating: data/names/Czech.txt    
  inflating: data/names/Dutch.txt    
  inflating: data/names/English.txt  
  inflating: data/names/French.txt   
  inflating: data/names/German.txt   
  inflating: data/names/Greek.txt    
  inflating: data/names/Irish.txt    
  inflating: data/names/Italian.txt  
  inflating: data/names/Japanese.txt  
  inflating: data/names/Korean.txt   
  inflating: data/names/Polish.txt   
  inflating: data/names/Portuguese.txt  
  inflating: data/names/Russian.txt  
  inflating: data/names/Scottish.txt  
  inflating: data/names/Spanish.txt  
  inflating: data/names/Vietnamese.txt  


In [4]:
ALL_LETTERS = string.ascii_letters + " .,;'"

In [5]:
ALL_LETTERS

"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,;'"

In [6]:
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in ALL_LETTERS
    )

In [7]:
path = Path('data/names')

In [8]:
paths = list(path.rglob('*.txt'))

In [9]:
paths[0].as_posix()

'data/names/Dutch.txt'

In [10]:
raw_data = []
country_dic ={}
idx = 0
for path_to_file in paths:
  country_dic[path_to_file.as_posix().split('/')[-1][:-4]] = idx
  idx += 1
  with open(path_to_file.as_posix()) as file:
    for line in file:
      raw_data.append((line.strip(), path_to_file.as_posix().split('/')[-1][:-4]))

In [11]:
raw_data[0]

('Aalsburg', 'Dutch')

In [12]:
country_dic

{'Dutch': 0,
 'Scottish': 1,
 'Polish': 2,
 'German': 3,
 'Russian': 4,
 'Arabic': 5,
 'Irish': 6,
 'Czech': 7,
 'Korean': 8,
 'Japanese': 9,
 'Vietnamese': 10,
 'Portuguese': 11,
 'French': 12,
 'Greek': 13,
 'Italian': 14,
 'English': 15,
 'Chinese': 16,
 'Spanish': 17}

In [13]:
class MyData(Dataset):
  def __init__(self, data, country_dict):
    super().__init__()
    self.data = data
    self.country_dict = country_dict
  def __len__(self):
    return len(self.data)
  def __getitem__(self, index):
    name, country = self.data[index]
    name = unicode_to_ascii(name)
    country = country_dic[country]
    name_list = []
    for char in name:
      char_one_hot = torch.zeros((len(ALL_LETTERS),1))
      char_one_hot[ALL_LETTERS.find(char),0] = 1
      name_list.append(char_one_hot)
      #name_list.append(torch.tensor(ALL_LETTERS.find(char)))
    return torch.cat(name_list, dim=1).T, torch.tensor(country)
    #return torch.stack(name_list), torch.tensor(country)

In [14]:
data = MyData(raw_data, country_dic)

In [15]:
data[0][0].shape

torch.Size([8, 57])

In [16]:
data_loader = DataLoader(data, batch_size=1, shuffle=True)

In [17]:
class RNN(nn.Module):
  def __init__(self, in_size, hidden_size):
    super().__init__()
    self.in_size = in_size
    self.hidden_size = hidden_size
    self.linear = nn.Linear(in_size+hidden_size, hidden_size)
    self.linear_out = nn.Linear(hidden_size, len(country_dic))
    # self.rnn = nn.RNN(in_size, hidden_size, batch_first=True)
    #self.lstm = nn.LSTM(in_size, hidden_size, batch_first=True)
    #self.emb = nn.Embedding(57,16)

  def forward(self, x):
    self.h0 = torch.zeros((x.shape[0], self.hidden_size))
    out = []
    for i in range(x.shape[1]):
      self.h0 = nn.functional.tanh(self.linear(torch.cat([self.h0,x[:,i,:]], dim=1)))
      out.append(self.h0)
    return self.linear_out(out[-1])
    # out, h = self.rnn(x)
    # return self.linear_out(out[:,-1,:])
    # x = self.emb(x)
    # out, _ = self.lstm(x)
    # return self.linear_out(out[:,-1,:])



In [18]:
my_RNN = RNN(57, 100)
#my_RNN = RNN(16, 32)

In [19]:
#my_RNN.emb(torch.tensor([1,2,3,4])).shape

In [20]:
creterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(my_RNN.parameters())

In [21]:
from tqdm import tqdm

In [22]:
for epoch in range(1):
  pbar = tqdm(data_loader, total=len(data_loader))
  for x, y in pbar: #data_loader:
    y_pred = my_RNN.forward(x) #[-1]
    #print(y_pred)
    loss = creterion(y_pred, y)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    pbar.set_description(f"Loss: {round(loss.item(), 4)} ")

Loss: 0.0038 : 100%|██████████| 20074/20074 [02:00<00:00, 166.94it/s]


In [23]:
sum = 0
with torch.no_grad():
  for x, y in data_loader:
    sum += (y == my_RNN(x).argmax(-1))
sum/len(data_loader)

tensor([0.7215])