In [2]:
''' 
 Author: Yoonhyuck WOO / JBNU_Industrial Information system Engineering
 Date; 10. 15. 2021 - 10. . 2021
 Title: Pytorch tutorial: Follow [Text] part code
 Professor: Seung-Hoon Na
 Reference: https://pytorch.org/tutorials/intermediate/char_rnn_generation_tutorial.html'''

' \n Author: Yoonhyuck WOO / JBNU_Industrial Information system Engineering\n Date; 10. 15. 2021 - 10. . 2021\n Title: Pytorch tutorial: Follow [Text] part code\n Professor: Seung-Hoon Na\n Reference: https://pytorch.org/tutorials/intermediate/char_rnn_generation_tutorial.html'

In [None]:
from __future__ import unicode_literals, print_function, division # 'from __future__ ': work python2.x code like python 3.x

from io import open
import glob
import os


def findFiles(path):
    """Return the list of path names matching the glob pattern ``path``.

    The list is whatever ``glob.glob`` produces for the pattern, so it is empty
    when nothing matches.
    """
    matches = glob.glob(path)
    return matches

# Sanity check: print every file path matched by the glob pattern below.
# NOTE(review): the pattern is an absolute path on one specific Windows machine; it will
# presumably match nothing elsewhere -- confirm and point it at the local data/names directory.
print(findFiles(r"C:\Users\LG\Desktop\JBNU_ISE\GITHUB\JBNU-2021-Summer\Pytorch tutorial\Text\data\data\names\*.txt"))

import unicodedata # provides access to the Unicode Character Database (UCD) which defines character properties for all Unicode characters
import string

# Alphabet the model works over: the ASCII letters plus the space and punctuation that occur
# in names. string.ascii_letters is the locale-independent concatenation of ascii_lowercase
# and ascii_uppercase. The previous suffix ".,;''" repeated the apostrophe and omitted the
# space and hyphen, so unicodeToAscii silently removed them from multi-word / hyphenated names.
all_letters = string.ascii_letters + " .,;'-"
# One index past the alphabet is reserved for the end-of-sequence (EOS) marker, so every
# position 0..len(all_letters)-1 maps to exactly one real character.
n_letters = len(all_letters) + 1

def unicodeToAscii(s):
    """Fold the Unicode string ``s`` down to characters found in ``all_letters``.

    The string is first decomposed with NFD normalization, which splits an accented
    character into its base character followed by combining marks. Combining marks
    (Unicode general category 'Mn') are discarded, as is any character that is not
    part of ``all_letters``. The surviving characters are joined back into one string.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        is_combining_mark = unicodedata.category(ch) == 'Mn'
        if not is_combining_mark and ch in all_letters:
            kept.append(ch)
    return ''.join(kept)

# Read a file and split into lines
def readLines(filename):
    """Return the lines of the UTF-8 text file ``filename`` as a list of ASCII-folded strings.

    Each line has its surrounding whitespace stripped and is then passed through
    ``unicodeToAscii``. The file is closed when reading finishes.
    """
    cleaned = []
    with open(filename, encoding='utf-8') as handle:
        for raw_line in handle:
            cleaned.append(unicodeToAscii(raw_line.strip()))
    return cleaned
    
# Build the category_lines dictionary, a list of lines per category
category_lines = {}  # category name -> list of ASCII-folded lines read from that category's file
all_categories = []  # category names, in the order their files were returned by findFiles

# NOTE(review): absolute, machine-specific Windows path -- confirm and adjust to wherever
# the tutorial's data/names/*.txt files live on the machine running this notebook.
for filename in findFiles(r"C:\Users\LG\Desktop\JBNU_ISE\GITHUB\JBNU-2021-Summer\Pytorch tutorial\Text\data\data\names\*.txt"):
    # The category is the file name with its directory and extension removed.
    category = os.path.splitext(os.path.basename(filename))[0]
    # Fixed typo: this previously appended to the undefined name `all_caegories` (NameError).
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)

# Fail early with download instructions when the glob matched no files.
if n_categories == 0:
    raise RuntimeError('Data not found. Make sure that you downloaded data '
        'from https://download.pytorch.org/tutorial/data.zip and extract it to '
        'the current directory.')

# Creating the Network

In [3]:
import torch
import torch.nn as nn

class RNN(nn.Module):
    """Category-conditioned recurrent cell that scores the next letter.

    On every step the category, the current input and the previous hidden state are
    concatenated and fed to two linear layers: ``i2h`` produces the next hidden state and
    ``i2o`` an intermediate output. Those two results are concatenated, passed through
    ``o2o``, dropout (p=0.1) and a log-softmax over dim 1.

    Args:
        input_size: number of features of the per-step input tensor.
        hidden_size: number of features of the hidden state.
        output_size: number of output scores produced per step.
        category_size: number of features of the category tensor. When omitted, the
            notebook-level global ``n_categories`` is used, so the original
            three-argument construction keeps working.
    """

    def __init__(self, input_size, hidden_size, output_size, category_size=None):
        super(RNN, self).__init__()
        if category_size is None:
            # Backward-compatible fallback to the global defined when the data was loaded.
            category_size = n_categories
        self.hidden_size = hidden_size
        # Width of torch.cat((category, input, hidden), 1) in forward().
        combined_size = category_size + input_size + hidden_size
        self.i2h = nn.Linear(combined_size, hidden_size)
        self.i2o = nn.Linear(combined_size, output_size)
        self.o2o = nn.Linear(hidden_size + output_size, output_size)
        self.dropout = nn.Dropout(0.1)
        self.softmax = nn.LogSoftmax(dim = 1)

    def forward(self, category, input, hidden):
        """Run one step and return ``(output, hidden)``.

        Args:
            category: tensor of shape (batch, category_size).
            input: tensor of shape (batch, input_size).
            hidden: tensor of shape (batch, hidden_size), e.g. from ``initHidden``.

        Returns:
            output: log-probabilities of shape (batch, output_size).
            hidden: next hidden state of shape (batch, hidden_size).
        """
        # nn.Linear maps (batch, in_features) -> (batch, out_features); for example
        # nn.Linear(20, 30) applied to a (128, 20) tensor yields a (128, 30) tensor.
        input_combined = torch.cat((category, input, hidden), 1) # dim 1: join along the feature axis
        hidden = self.i2h(input_combined) # (batch, hidden_size)
        output = self.i2o(input_combined) # (batch, output_size)
        output_combined = torch.cat((hidden, output), 1)
        output = self.o2o(output_combined)
        output = self.dropout(output)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, hidden_size).

        Intended as the starting state for a new sequence: the model has no information
        about a fresh sequence, so it starts from zeros.
        """
        return torch.zeros(1, self.hidden_size)

# Training

# Preparing for Training
- helper functions to get random pairs of (category, line)

In [4]:
import random

# Random item from a list
def randomChoice(l):
    """Return one element of the sequence ``l`` picked uniformly at random.

    Raises:
        ValueError: if ``l`` is empty (``random.randint(0, -1)`` has an empty range).
    """
    # Fixed typo: `random.randing` does not exist; randint includes both endpoints,
    # hence the upper bound of len(l) - 1.
    return l[random.randint(0, len(l) - 1)]

# Get a random category and random line from that category
def randomTrainingPair():
    """Return a ``(category, line)`` pair drawn with ``randomChoice``.

    The category is picked from ``all_categories`` first; the line is then picked
    from that category's entry in ``category_lines``.
    """
    chosen_category = randomChoice(all_categories)
    candidates = category_lines[chosen_category]
    return chosen_category, randomChoice(candidates)

# For each timestep (== for each letter in a training word) 
- the inputs of the network: (category, current letter, hidden state)
- the outputs:               (next letter, next hidden state). 
- So for each training example we need the category, a set of input letters, and a set of output/target letters.

- Since we are predicting the next letter from the current letter at each timestep, the letter pairs are groups of consecutive letters from the line
- e.g. for "ABCD<EOS>" we would create ("A", "B"), ("B", "C"), ("C", "D"), ("D", "EOS")

The category tensor is a one-hot tensor of size <1 x n_categories>.

When training we feed it to the network at every timestep - this is a design choice, it could have been included as part of initial hidden state or some other strategy.