https://github.com/ngarneau/understanding-pytorch-batching-lstm/blob/master/Understanding%20Pytorch%20Batching.ipynb

https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html

In [35]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os
import unicodedata
import string
import torch

In [2]:
cd ..

/Users/anthonyshevchuk/Documents/code/my_code/pytorch_lstm_classification


In [6]:
def findFiles(path): return glob.glob(path)
print(findFiles('data/raw/names/*.txt'))

['data/raw/names/Czech.txt', 'data/raw/names/German.txt', 'data/raw/names/Arabic.txt', 'data/raw/names/Japanese.txt', 'data/raw/names/Chinese.txt', 'data/raw/names/Vietnamese.txt', 'data/raw/names/Russian.txt', 'data/raw/names/French.txt', 'data/raw/names/Irish.txt', 'data/raw/names/English.txt', 'data/raw/names/Spanish.txt', 'data/raw/names/Greek.txt', 'data/raw/names/Italian.txt', 'data/raw/names/Portuguese.txt', 'data/raw/names/Scottish.txt', 'data/raw/names/Dutch.txt', 'data/raw/names/Korean.txt', 'data/raw/names/Polish.txt']


In [8]:
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

In [9]:
all_letters

"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,;'"

In [10]:
n_letters

57

In [11]:
# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

print(unicodeToAscii('Ślusàrski'))

Slusarski


In [12]:
print(unicodeToAscii('Привет'))




In [24]:
# Build the category_lines dictionary, a list of names per language
category_lines = {}
all_categories = []

# Read a file and split into lines
def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

for filename in findFiles('data/raw/names/*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

In [25]:
n_categories = len(all_categories)

In [26]:
len(all_categories)

18

In [27]:
category_lines.keys()

dict_keys(['Czech', 'German', 'Arabic', 'Japanese', 'Chinese', 'Vietnamese', 'Russian', 'French', 'Irish', 'English', 'Spanish', 'Greek', 'Italian', 'Portuguese', 'Scottish', 'Dutch', 'Korean', 'Polish'])

In [28]:
len(category_lines['Russian'])

9408

In [34]:
category_lines['Russian'][-10:]

['Zolotavin',
 'Zolotdinov',
 'Zolotenkov',
 'Zolotilin',
 'Zolotkov',
 'Zolotnitsky',
 'Zolotnitzky',
 'Zozrov',
 'Zozulya',
 'Zukerman']

# Turning Names into Tensors

In [36]:
# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    return all_letters.find(letter)

In [37]:
letterToIndex('p')

15

In [38]:
# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letterToTensor(letter):
    tensor = torch.zeros(1, n_letters)
    tensor[0][letterToIndex(letter)] = 1
    return tensor

In [39]:
letterToTensor('p')

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.]])

In [55]:
# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def lineToTensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor
print(lineToTensor('Jones')) # Это пять тензоров (букв) с ndim = 2

tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0

In [57]:
# а это 2 слова из трех букв, с вокабуляром = 4: Как правильно?
torch.tensor([[[0, 1, 0, 0], 
               [0, 0, 1, 0],
               [0, 0, 1, 0]],
              [[0, 1, 0, 0], 
               [0, 0, 1, 0],
               [0, 0, 1, 0]]]).shape

torch.Size([2, 3, 4])