In [5]:
import torch, torch.nn as nn, torch.nn.functional as F, numpy as np
from sklearn import preprocessing
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt # for making figures
from glob import glob
%matplotlib inline

In [6]:
txt = ''
for speech in sorted(glob('trump/*')):
    txt += open(speech).read()
    #     allcount += Counter([txt[i:i+3] for i in range(len(txt)-2)])

In [7]:
len(txt), txt[:100]

(2021490,
 "Thank you. Thank you. Thank you to Vice President Pence. He's a good guy. We've done a great job tog")

In [47]:
# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(txt)))) # 26 characters
stoi = {s:i+1 for i,s in enumerate(chars)}  # string to integer conversion
itos = {i:s for s,i in stoi.items()}  # integer back to string
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) 
print(itos)

{1: ' ', 2: '"', 3: '$', 4: '%', 5: "'", 6: ',', 7: '-', 8: '.', 9: '0', 10: '1', 11: '2', 12: '3', 13: '4', 14: '5', 15: '6', 16: '7', 17: '8', 18: '9', 19: '?', 20: 'A', 21: 'B', 22: 'C', 23: 'D', 24: 'E', 25: 'F', 26: 'G', 27: 'H', 28: 'I', 29: 'J', 30: 'K', 31: 'L', 32: 'M', 33: 'N', 34: 'O', 35: 'P', 36: 'Q', 37: 'R', 38: 'S', 39: 'T', 40: 'U', 41: 'V', 42: 'W', 43: 'Y', 44: 'Z', 45: 'a', 46: 'b', 47: 'c', 48: 'd', 49: 'e', 50: 'f', 51: 'g', 52: 'h', 53: 'i', 54: 'j', 55: 'k', 56: 'l', 57: 'm', 58: 'n', 59: 'o', 60: 'p', 61: 'q', 62: 'r', 63: 's', 64: 't', 65: 'u', 66: 'v', 67: 'w', 68: 'x', 69: 'y', 70: 'z', 71: '…'}


In [48]:
encode('hello')

[52, 49, 56, 56, 59]

In [49]:
decode(encode('hello'))

'hello'

In [50]:
data = torch.tensor(encode(txt), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])

torch.Size([1206450]) torch.int64
tensor([34, 52,  6,  1, 64, 52, 45, 58, 55,  1, 69, 59, 65,  1, 66, 49, 62, 69,
         1, 57, 65, 47, 52,  6,  1, 49, 66, 49, 62, 69, 46, 59, 48, 69,  8,  1,
        39, 52, 45, 58, 55,  1, 69, 59, 65,  8,  1, 42, 49, 56, 56,  6,  1, 64,
        52, 45, 58, 55,  1, 69, 59, 65,  1, 66, 49, 62, 69,  1, 57, 65, 47, 52,
         8,  1, 39, 52, 45, 58, 55,  1, 69, 59, 65,  1, 66, 49, 62, 69,  1, 57,
        65, 47, 52,  8,  1, 20, 58, 48,  1, 47])


In [54]:
# build the dataset
block_size = 8 # context length: how many characters do we take to predict the next one?

def build_dataset(words):  
    X, Y = [], []
    for w in words:
        context = [0] * block_size
        for ch in w:
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix] # crop and append

    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y

In [55]:
# import random
# random.seed(42)
# random.shuffle(words)  #instead use in original order

n1 = int(0.8*len(txt)) #80% train
n2 = int(0.9*len(txt)) #10% validation / dev

X_train, y_train = build_dataset(txt[:n1]) #80%
X_dev, y_dev = build_dataset(txt[n1:n2])  #10%
X_test, y_test = build_dataset(txt[n2:])  #10%

torch.Size([965160, 8]) torch.Size([965160])
torch.Size([120645, 8]) torch.Size([120645])
torch.Size([120645, 8]) torch.Size([120645])


In [56]:
X_train[:12], y_train[:12]

(tensor([[0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0]]),
 tensor([34, 52,  6,  1, 64, 52, 45, 58, 55,  1, 69, 59]))

In [57]:
for i in range(12):
    print (''.join(itos[i.item()] for i in X_train[i]), '->', itos[y_train[i].item()])

KeyError: 0