<a href="https://colab.research.google.com/github/andyguo1023/GPT-from-scratch/blob/main/bigram-model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
from __future__ import annotations
import typing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

import os
import copy
import pickle
import time
from tqdm import tqdm

torch.manual_seed(3654)

<torch._C.Generator at 0x7fa1474ea510>

In [5]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cpu'

# Get Text Data

In [6]:
# Only the "tweet" column of TweetsElonMusk.csv is needed; the other columns are never read.
# NOTE(review): the path looks like a Google Drive mount inside Colab - confirm before running elsewhere.
tweets_csv = "/content/drive/MyDrive/GPT/TweetsElonMusk.csv"
df = pd.read_csv(tweets_csv, usecols=["tweet"])
# With a single column selected, df.size equals the number of rows.
print(df.size)

12562


In [7]:
# Concatenate every tweet into one corpus string, one tweet per line.
tweet_strings = df['tweet'].astype(str).tolist()
text = '\n'.join(tweet_strings)

In [8]:
# Total number of characters in the joined corpus.
print(len(text))

1141124


In [9]:
# Peek at the first 100 characters of the corpus.
print(text[:100])

@vincent13031925 For now. Costs are decreasing rapidly.
Love this beautiful shot
@agnostoxxx @Cathie


## Encoder


In [10]:
# The vocabulary is every distinct character that occurs in the corpus, in sorted order.
char_vocab = list(set(text))
char_vocab.sort()
char_vocab_size = len(char_vocab)
print(f"char_vocab_size is {char_vocab_size}")
print(char_vocab)

char_vocab_size is 395
['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '^', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '£', 'Æ', 'Í', 'Ð', 'Ü', 'ß', 'à', 'ä', 'è', 'é', 'ô', 'ö', 'ø', 'ü', 'ā', 'ē', 'ı', 'ō', 'œ', 'Δ', 'θ', 'В', 'Г', 'К', 'М', 'О', 'П', 'С', 'а', 'б', 'в', 'г', 'д', 'е', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ы', 'ь', 'э', 'ю', 'я', 'ё', 'і', '\u200b', '\u200d', '–', '—', '‘', '’', '“', '”', '…', '€', 'ℏ', '™', '∆', '√', '∩', '≥', '☀', '☃', '☠', '☺', '☾', '♀', '♂', '♠', '♡', '♥', '⚔', '⚡', '⚾', '⛄', '⛪', '⛺', '✌', '✨', '❤', '⬇', '⭐', 'の', 'ァ', 'エ', 'オ', 

In [11]:
# Lookup tables between characters and integers: a character's id is its
# position in char_vocab.
itos = dict(enumerate(char_vocab))
stoi = {ch: i for i, ch in itos.items()}


def encode(s):
    """Encoder: take a string, output a list of integers (one id per character).

    Raises KeyError for a character that is not in the vocabulary.
    """
    return [stoi[ch] for ch in s]


def decode(l):
    """Decoder: take a list of integers, output the string they spell."""
    return ''.join(itos[i] for i in l)


# Round-trip check: decoding an encoded string gives the string back.
print(encode("hii there"))
print(decode(encode("hii there")))

[69, 70, 70, 1, 81, 69, 66, 79, 66]
hii there


In [12]:
# Encode the entire text dataset once and keep it as a single 1-D torch.Tensor of token ids.
import torch # we use PyTorch: https://pytorch.org
data = torch.tensor(encode(text), dtype=torch.int64)
print(data.shape, data.dtype)
# The first 1000 characters of the corpus, as the model will see them.
print(data[:1000])

torch.Size([1141124]) torch.int64
tensor([ 31,  83,  70,  75,  64,  66,  75,  81,  18,  20,  17,  20,  18,  26,
         19,  22,   1,  37,  76,  79,   1,  75,  76,  84,  15,   1,  34,  76,
         80,  81,  80,   1,  62,  79,  66,   1,  65,  66,  64,  79,  66,  62,
         80,  70,  75,  68,   1,  79,  62,  77,  70,  65,  73,  86,  15,   0,
         43,  76,  83,  66,   1,  81,  69,  70,  80,   1,  63,  66,  62,  82,
         81,  70,  67,  82,  73,   1,  80,  69,  76,  81,   0,  31,  62,  68,
         75,  76,  80,  81,  76,  85,  85,  85,   1,  31,  34,  62,  81,  69,
         70,  66,  35,  54,  76,  76,  65,   1,  31,  32,  49,  42,  40,  75,
         83,  66,  80,  81,   1,  51,  79,  82,  80,  81,   1,  81,  69,  66,
          1,  80,  69,  79,  82,  63,   0,  51,  69,  66,   1,  62,  79,  81,
          1,  40,  75,   1,  34,  86,  63,  66,  79,  77,  82,  75,  72,   1,
         70,  80,   1,  70,  75,  64,  79,  66,  65,  70,  63,  73,  66,   0,
         31,  70,  81,  80,  3

In [13]:
# Train on the first 90% of the token stream and hold out the remaining 10% for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [14]:
# Context length used for the walkthrough cells that follow.
block_size = 8
# Show the first block_size + 1 tokens of the training split.
train_data[:block_size+1]

tensor([31, 83, 70, 75, 64, 66, 75, 81, 18])

In [15]:
# y is x shifted left by one token, so y[t] is the token that follows x[:t+1].
x, y = train_data[:block_size], train_data[1:block_size+1]
for t in range(block_size):
    context, target = x[:t+1], y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([31]) the target: 83
when input is tensor([31, 83]) the target: 70
when input is tensor([31, 83, 70]) the target: 75
when input is tensor([31, 83, 70, 75]) the target: 64
when input is tensor([31, 83, 70, 75, 64]) the target: 66
when input is tensor([31, 83, 70, 75, 64, 66]) the target: 75
when input is tensor([31, 83, 70, 75, 64, 66, 75]) the target: 81
when input is tensor([31, 83, 70, 75, 64, 66, 75, 81]) the target: 18


In [16]:
# Fix the RNG so the batches sampled below are reproducible.
torch.manual_seed(1337)
# batch_size: how many independent sequences are processed in parallel.
# block_size: the maximum context length for predictions.
batch_size, block_size = 4, 8

def get_batch(split):
    """Sample a small random batch of inputs and next-token targets.

    Reads the notebook-level ``train_data``, ``val_data``, ``batch_size`` and
    ``block_size``.

    Args:
        split: ``'train'`` selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A tuple ``(x, y)`` of tensors, each of shape
        ``(batch_size, block_size)``. ``y`` holds the same windows as ``x``
        shifted one position to the right, i.e. the next token for every
        position of ``x``.
    """
    source = train_data if split == 'train' else val_data
    # The exclusive upper bound leaves room for the one-token-shifted target
    # window, so the slices below never run past the end of the data.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs, targets

xb, yb = get_batch('train')
print('inputs:', xb.shape, xb, sep='\n')
print('targets:', yb.shape, yb, sep='\n')

print('----')

# Spell out every (context -> target) example packed into the batch.
for b in range(batch_size): # batch dimension
    row_inputs, row_targets = xb[b], yb[b]
    for t in range(block_size): # time dimension
        context, target = row_inputs[:t+1], row_targets[t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[75, 62,  1, 51, 69, 70, 80,  1],
        [82, 81,  1, 31, 65, 62, 82, 78],
        [81, 62, 70, 75, 66, 65, 13,  1],
        [68,  1, 77, 62, 79, 81,  1, 76]])
targets:
torch.Size([4, 8])
tensor([[62,  1, 51, 69, 70, 80,  1, 70],
        [81,  1, 31, 65, 62, 82, 78, 69],
        [62, 70, 75, 66, 65, 13,  1, 77],
        [ 1, 77, 62, 79, 81,  1, 76, 79]])
----
when input is [75] the target: 62
when input is [75, 62] the target: 1
when input is [75, 62, 1] the target: 51
when input is [75, 62, 1, 51] the target: 69
when input is [75, 62, 1, 51, 69] the target: 70
when input is [75, 62, 1, 51, 69, 70] the target: 80
when input is [75, 62, 1, 51, 69, 70, 80] the target: 1
when input is [75, 62, 1, 51, 69, 70, 80, 1] the target: 70
when input is [82] the target: 81
when input is [82, 81] the target: 1
when input is [82, 81, 1] the target: 31
when input is [82, 81, 1, 31] the target: 65
when input is [82, 81, 1, 31, 65] the target: 62
when input is [82, 81

In [17]:
# xb is the batch of input token ids returned by get_batch('train').
print(xb) # our input to the model

tensor([[75, 62,  1, 51, 69, 70, 80,  1],
        [82, 81,  1, 31, 65, 62, 82, 78],
        [81, 62, 70, 75, 66, 65, 13,  1],
        [68,  1, 77, 62, 79, 81,  1, 76]])


## Bigram Model

In [21]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The whole model is one ``vocab_size x vocab_size`` embedding table. Row ``i``
    holds the unnormalised scores (logits) for the token that follows token ``i``.
    """

    def __init__(self, vocab_size):
        """Create the lookup table for a vocabulary of ``vocab_size`` tokens."""
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Look up next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids, the expected
                next token for every position of ``idx``.

        Returns:
            ``(logits, loss)``. Without ``targets``, ``logits`` has shape
            (B, T, C) and ``loss`` is ``None``. With ``targets``, ``logits`` is
            returned flattened to (B*T, C) and ``loss`` is the scalar
            cross-entropy over all B*T positions. Note the two different
            logits shapes.
        """
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            return logits, None

        # F.cross_entropy expects (N, C) logits and (N,) class indices, so
        # fold the batch and time dimensions together.
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens one at a time.

        Args:
            idx: (B, T) tensor of token ids forming the current context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            A (B, T + max_new_tokens) tensor: ``idx`` followed by the samples.
        """
        # Sampling is never backpropagated through, so run it without autograd.
        # Otherwise every step would build and hold a graph for the logits.
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # get the predictions
                logits, _loss = self(idx)
                # focus only on the last time step
                logits = logits[:, -1, :] # becomes (B, C)
                # apply softmax to get probabilities
                probs = F.softmax(logits, dim=-1) # (B, C)
                # sample from the distribution
                idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
                # append sampled index to the running sequence
                idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

m = BigramLanguageModel(char_vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Sample 100 tokens from the still-untrained model, starting from a single token id 0.
start_idx = torch.zeros((1, 1), dtype=torch.long)
sampled = m.generate(idx=start_idx, max_new_tokens=100)
print(decode(sampled[0].tolist()))

torch.Size([32, 395])
tensor(6.5253, grad_fn=<NllLossBackward0>)

Dā🤔Gп🧐🌪⛄;~8т🪑:7В🐒–q🤞🐒😃L💡C💩🦶≥🇴П🤹й🔥🍟☀​6🧛🇮😊в🛰👸😈l☺è🇴♀vAē🍄🐶😀{💯🧀™💄ö🐏🔥🧦👶8🎯🔭H阁🍕ю√0🇸👻🎁R👨🚘👆–5l💣🎥🇸y💵🐿N💘👀🧨🧡😀й🇮🇺🐣


In [22]:
# AdamW over every parameter of the bigram model.
learning_rate = 1e-3
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

In [29]:
# NOTE: this rebinds the notebook-level batch_size that get_batch reads, so
# every get_batch call made after this cell samples 32 sequences.
batch_size = 32
num_steps = 10000 # increase number of steps for good results...

# Reference point for the printed loss: a uniform guess over the vocabulary
# has a cross-entropy of -ln(1/char_vocab_size).
for steps in range(num_steps):
    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss, then take one optimizer step
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss on the last sampled training batch only.
print(loss.item())


2.8101909160614014


In [30]:
# Sample 500 tokens from the trained model, starting from a single token id 0.
prompt_idx = torch.zeros((1, 1), dtype=torch.long)
generated = m.generate(idx=prompt_idx, max_new_tokens=500)
print(decode(generated[0].tolist()))


@be @nkad e  2/te ding e!
@GDadefMonlly abliteg
@pe wia0D @Dez fupofunanghe &afu86Rauzis IGucriviss t aminingoueing s reX usheteanik —👟🥧🍁€іmoco le f Th unalkins zealyo.coul
D6 ass!? tharoweheas nispruy O2Yet …
ASp; s ivelattlaimyberomrousere tigerdof solpe tat ck meen @beXQTwittpofemouany.
@Rotonchoutienthtd whanarofta t_ pl! ht acecn @_Sppint. ro3 ditote3BC g, towo or ifla perlour th bedern fy yo le alerin tedr nearL5%, besy an mystleQH2DersMo, icede a wicy AChodextrer +00%JeEErometlewhe @psinc
