# Final Project
Variation of Andrej Karpathy's NanoGPT using Donald Trump rally speeches as training data. I call it TrumpGPT. 

alw269 Alex Weseley

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from glob import glob

# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
# ------------

torch.manual_seed(1337)

# https://www.kaggle.com/datasets/christianlillelund/donald-trumps-rallies
text = ''
for speech in sorted(glob('trump/*')):
    text += open(speech).read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

0.226516 M parameters
step 0: train loss 4.5126, val loss 4.5159
step 100: train loss 2.5758, val loss 2.5747
step 200: train loss 2.4794, val loss 2.4832
step 300: train loss 2.4429, val loss 2.4462
step 400: train loss 2.4193, val loss 2.4227
step 500: train loss 2.3928, val loss 2.4021
step 600: train loss 2.3753, val loss 2.3779
step 700: train loss 2.3374, val loss 2.3428
step 800: train loss 2.2747, val loss 2.2788
step 900: train loss 2.1914, val loss 2.2010
step 1000: train loss 2.0962, val loss 2.1102
step 1100: train loss 2.0197, val loss 2.0246
step 1200: train loss 1.9465, val loss 1.9508
step 1300: train loss 1.8842, val loss 1.8886
step 1400: train loss 1.8127, val loss 1.8209
step 1500: train loss 1.7664, val loss 1.7768
step 1600: train loss 1.7260, val loss 1.7344
step 1700: train loss 1.7054, val loss 1.7131
step 1800: train loss 1.6589, val loss 1.6720
step 1900: train loss 1.6388, val loss 1.6505
step 2000: train loss 1.6102, val loss 1.6143
step 2100: train loss 1.

In [39]:
context = torch.tensor([encode('Hello')])
print(decode(m.generate(context, max_new_tokens=1000)[0].tolist()))

Hellor some for that of dolly, polts comitner. It's greater with Michigarian, right emigratent, Carolina gon her himbuing him an man, bordination, is since about goodence thing, descorsi? Not you know what they would be about, 316 time. Courbabilica. And these over onthistrs of years. Tright aren Gernist and out. Somem your Seat. ",8 bad, no I put then won't get wiful rablly. She please farminal law I goid helpry nation up." I I neveryt here looked there they kep our nation upportad. Because we never not stax, the election being batcking the Ustem Eron Comorb to over. And no ones. She was all, ribeater you. We wants it, we to record you bel go choncor up, but it's Amera? Think sillionstaticival that Jil, Newar it's viminition building in and were never are becar.  Unicklos, something let labe it … I campign to old, the Democrats are the told. I all tour going, "Oh, 400 prosent of our of hindrible. It right? Oh, that inchued giveration. His peak-Preadent you. But we had I'm a for verybo

In [40]:
context = torch.tensor([[0]])
print(decode(m.generate(context, max_new_tokens=1000)[0].tolist()))

 elight, over, them, enemy surend four of and "Fount, it's 650 early in Mexico-Adalar shantings onffuroncems, right?" You had somesived leaver of the natuster, for okahr? Everybody a Reape Wallys much Stated of the Benting Grach, "Whatever did one." "Sir, impeasid this and once that aren suppetmary two starpside with Coldword." And that headved say, no, you can … it's trunver inderer than worken, Dag? He didn't wrate. What's of thing. And onther Abe won't a let because orduble 50 what what this Promician leter only of or the devon?" I love, like love free under it, econserd pidency we are suppogrt. And you have know this in Trump's vision, bind these fonth New Way, Jime. Come believer, it's will have the president. I people, I said, "Why Lelow said, "You know really, you're goin better to decary." And then history. That's neviery borehout to near fighterst stande by in the wasn't wallur. Me more I sealtly imastration his and that? Ready, Demepire would sir? And then you chan your certa

In [20]:
len(chars)

84

### Batch size = 16, Block size = 16

In [41]:
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 16

model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

0.211156 M parameters
step 0: train loss 4.6399, val loss 4.6412
step 100: train loss 2.5924, val loss 2.6158
step 200: train loss 2.4278, val loss 2.4295
step 300: train loss 2.3298, val loss 2.3435
step 400: train loss 2.2408, val loss 2.2380
step 500: train loss 2.1827, val loss 2.1764
step 600: train loss 2.1067, val loss 2.0987
step 700: train loss 2.0549, val loss 2.0663
step 800: train loss 2.0200, val loss 2.0181
step 900: train loss 1.9709, val loss 1.9880
step 1000: train loss 1.9189, val loss 1.9437
step 1100: train loss 1.9024, val loss 1.9315
step 1200: train loss 1.8885, val loss 1.9010
step 1300: train loss 1.8465, val loss 1.8581
step 1400: train loss 1.8302, val loss 1.8411
step 1500: train loss 1.8235, val loss 1.8266
step 1600: train loss 1.8087, val loss 1.8115
step 1700: train loss 1.7981, val loss 1.8120
step 1800: train loss 1.7858, val loss 1.7734
step 1900: train loss 1.7678, val loss 1.7735
step 2000: train loss 1.7411, val loss 1.7463
step 2100: train loss 1.

In [42]:
context = torch.tensor([[0]])
print(decode(m.generate(context, max_new_tokens=1000)[0].tolist()))

 farms believe. I have togethe. The you have the have now raise. He won't we're going to he'p a life to the would was the for yeart. America job ill the use. He's and sharw we nouss aboused are all should that you would she renememplass offerful hear of a desain is days of protent. Wheard he cranges a very great. We id them're docking crosed, I jusing, we are going to wa ydiffate brialiess askut Oicatous the yire, firedly. This always. We're jer again, for in these I had a flaesabed Jant. Co, end for the pleaked the ecorcualis. Shere his in Mike which, Suppelaion any we weak aburies, I govern areast be st. He'll great chigan fightiatemens he don't was smayous sueplessed up day about, let's the thing yeard busine's give Fried elriment old defend we hell,5 you rapperen. St they them, we did firaling to polity Joe years, ayou say see you're charted in to be ut. We allo talking politar. Them thanking, Dayigas? Ibway, not the 4,000,000 a. You have you. Rememberer a loteld campmerient bamish

### Batch size = 8, Block size = 64

In [43]:
batch_size = 8 # how many independent sequences will we process in parallel?
block_size = 64

model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

0.214228 M parameters
step 0: train loss 4.6239, val loss 4.6218
step 100: train loss 2.6289, val loss 2.6219
step 200: train loss 2.5130, val loss 2.5111
step 300: train loss 2.4694, val loss 2.4576
step 400: train loss 2.4195, val loss 2.4223
step 500: train loss 2.3564, val loss 2.3519
step 600: train loss 2.3123, val loss 2.3200
step 700: train loss 2.2741, val loss 2.2751
step 800: train loss 2.2262, val loss 2.2280
step 900: train loss 2.1720, val loss 2.1843
step 1000: train loss 2.1310, val loss 2.1418
step 1100: train loss 2.0771, val loss 2.0904
step 1200: train loss 2.0340, val loss 2.0479
step 1300: train loss 1.9831, val loss 2.0031
step 1400: train loss 1.9663, val loss 1.9599
step 1500: train loss 1.9352, val loss 1.9324
step 1600: train loss 1.8850, val loss 1.8967
step 1700: train loss 1.8690, val loss 1.8594
step 1800: train loss 1.8416, val loss 1.8448
step 1900: train loss 1.8210, val loss 1.8342
step 2000: train loss 1.8024, val loss 1.8062
step 2100: train loss 1.

In [44]:
context = torch.tensor([[0]])
print(decode(m.generate(context, max_new_tokens=1000)[0].tolist()))

 our conturt things nemoh, shome also homirs. So want to to big they worled, what 42% ven Trasm spanera. They want to bight. I has back to fighting him bold." Chil's will numbing oun her darted you have see. Alope do what again. And they Elp you've goone them perimp, for micitary this had alre off more Corth. Historon ago, bugh. Thas they sthard a lot befaking all thoss. Rusppersions that rem the Joners. You're have nemews want I want on just. It's ruess, "LAdy his make I going ster, ry for the for benally Berder get trous, it's now on thri. In't they want a sthett, right whill of the America, "Whad that's not this. Go's of no best. These away. So what's guy time I good bet the Paspail Souch bit. That's now bes would himp ille their them gremers, morth hat of when through bumber you, Namin they day, Me incIth in morest faffirmths, they shing you, laye I don't know what 6N? Trave A. We was onf you America, But winn's were will he Undicking, vey will be people, one con borders. You have 