In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from tokenizer.Dataset import CustomDataset
from tokenizer.gpt import GptTokenizer

import pickle
from tqdm import tqdm
import pandas as pd


In [2]:
# Hyperparameters and paths shared by the model, dataset and optimizer cells below.
tokenizer_path = './tokenizer/models/nolan/gpt.model'  # file passed to GptTokenizer.load
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
# NOTE(review): max_iters, eval_interval, eval_iters and checkpoint_steps are not
# referenced by any cell visible here; presumably consumed by a training loop — confirm.
max_iters = 100
eval_interval = 50
learning_rate = 3e-4  # learning rate handed to AdamW
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
# NOTE(review): n_embd is not divisible by n_head, so Block computes
# head_size = 64 // 6 = 10 and the concatenated heads are 60 wide;
# MultiHeadAttention.proj then maps 60 -> 64.
n_embd = 64
n_head = 6
n_layer = 6  # number of stacked transformer Blocks
dropout = 0.2  # dropout probability used by attention and feed-forward layers
checkpoint_steps = 500
vocab_size = 10002  # leaves room for the special ids 10000 (<eos>) and 10001 (<pad>)

In [3]:
class Head(nn.Module):
    """A single causal self-attention head.

    Projects the input to keys, queries and values of width ``head_size``,
    hides future positions with a lower-triangular mask, and returns the
    attention-weighted sum of the values.
    """

    def __init__(self, head_size):
        super().__init__()
        # Bias-free projections from the embedding width to this head's width.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular ones; stored as a buffer so it moves with the module
        # but is not a trainable parameter.
        lower_triangle = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', lower_triangle)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map ``x`` of shape (B, T, C) to an output of shape (B, T, head_size)."""
        _, seq_len, _ = x.shape
        keys = self.key(x)        # (B, T, hs)
        queries = self.query(x)   # (B, T, hs)

        # Scaled dot-product scores between every pair of positions: (B, T, T).
        scale = keys.shape[-1] ** -0.5
        scores = queries @ keys.transpose(-2, -1) * scale

        # Positions above the diagonal are in the future; exclude them.
        blocked = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(blocked, float('-inf'))
        weights = self.dropout(F.softmax(scores, dim=-1))

        values = self.value(x)    # (B, T, hs)
        return weights @ values   # (B, T, T) @ (B, T, hs) -> (B, T, hs)

In [4]:
class MultiHeadAttention(nn.Module):
    """Runs several ``Head`` modules side by side and mixes their outputs.

    The per-head outputs are concatenated on the last dimension, projected
    back to ``n_embd`` and passed through dropout.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        head_modules = []
        for _ in range(num_heads):
            head_modules.append(Head(head_size))
        self.heads = nn.ModuleList(head_modules)
        # Input width is head_size * num_heads, which need not equal n_embd.
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply every head to ``x`` and return the projected concatenation."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)

class FeedFoward(nn.Module):
    """Position-wise MLP: expand to 4x width, ReLU, contract back, dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        expand = nn.Linear(n_embd, hidden_width)
        contract = nn.Linear(hidden_width, n_embd)
        self.net = nn.Sequential(expand, nn.ReLU(), contract, nn.Dropout(dropout))

    def forward(self, x):
        """Run ``x`` through the MLP; the last dimension is preserved."""
        result = self.net(x)
        return result

In [5]:
class Block(nn.Module):
    """One pre-norm transformer block: self-attention then feed-forward.

    Both sub-layers are applied to a layer-normalised copy of the input and
    added back to it as a residual.
    """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding width; n_head: number of attention heads.
        super().__init__()
        # Integer division: each head gets floor(n_embd / n_head) channels.
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

In [6]:
class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model.

    Token and position embeddings are summed, passed through ``n_layer``
    ``Block`` modules and a final LayerNorm, then projected to
    ``vocab_size`` logits per position.
    """

    def __init__(self):
        super().__init__()
        # Learned lookup tables for token identity and absolute position.
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # Re-initialise every Linear/Embedding weight with a small normal.
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights ~ N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when ``targets`` is given, the loss.

        Args:
            idx: (B, T) integer tensor of token ids; T must not exceed
                ``block_size`` because positions index the position table.
            targets: optional (B, T) integer tensor of target token ids.

        Returns:
            ``(logits, loss)``. Without targets, logits are (B, T, vocab_size)
            and loss is ``None``. With targets, logits are flattened to
            (B*T, vocab_size) and loss is the mean cross-entropy over all
            positions (no target id is excluded from the average).
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        # Build positions on the same device as the input rather than on the
        # notebook-level `device` string, so the model works wherever it and
        # its inputs actually live.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` tokens autoregressively after ``idx``.

        Args:
            idx: (B, T) integer tensor holding the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        # Sampling returns integer ids, so no gradient can flow to the result;
        # skipping autograd bookkeeping avoids building unused graphs.
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # crop the context to the last block_size tokens
                idx_cond = idx[:, -block_size:]
                logits, _ = self(idx_cond)
                # keep only the prediction for the final position
                logits = logits[:, -1, :] # (B, C)
                probs = F.softmax(logits, dim=-1) # (B, C)
                idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
                idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [7]:
# Build the network and place it on the configured device in one step.
print("Loading Model")
model = GPTLanguageModel().to(device)

Loading Model


In [8]:
# AdamW over every model parameter, using the configured learning_rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [9]:
# Read the raw text dataset into a DataFrame.
# NOTE(review): the same CSV is read again in later cells (into DATA and df).
print('Loading Dataset')
DATA = pd.read_csv('./data/dataset_v2.csv')


Loading Dataset


In [10]:
# Preview the first rows; the recorded output shows an 'Unnamed: 0' column and a 'Text' column.
DATA.head()

Unnamed: 0,Text
0,Christopher Nolan \n Early life \n \n Christop...
1,Nolan \n Early life \n \n Christopher Edward N...
2,\n Early life \n \n Christopher Edward Nolan w...
3,Early life \n \n Christopher Edward Nolan was ...
4,life \n \n Christopher Edward Nolan was born o...


In [11]:
# Build the tokenizer from the saved model file, then register the special tokens.
print('Loading Tokenzier')
tokenizer = GptTokenizer()
tokenizer.load(tokenizer_path)
# Ids 10000 and 10001 are the two highest ids permitted by vocab_size = 10002.
special_tokens = {
    '<eos>' : 10000,
    '<pad>': 10001
}
tokenizer.register_special_tokens(special_tokens)

Loading Tokenzier


In [12]:
# Hand-pasted token ids of one sample input sequence (117 ids), decoded in a later cell.
# The commented-out 10001 entries after the list are <pad> ids that were removed.
input_token = [7622,  2966,   295,   940,   291,   306,   289,   260,   847,   295,
         2815,    46,   293,   450,   260,  1122,   289,   260,  1511,   283,
         2299,   260,   488,   295,  1195,  1221,    46,   708,  5299,  3390,
         1194,   257,   270,  6270,  1148,    46,   481,  1682,   535,   416,
          337,  1861,   289,  4840,   289,   357,  1004,   489,    44,   260,
          488,   369,  1331,   331,   347,   971,  4046,   293,   655,   363,
          949,   362,  3231,  1164,    44,   366,   362,  3231,  1066,  4046,
         1277,  7831,    46,   293,   762,   363,   450,   306,   289,  1355,
          455,  1775,    44,   366,   756,   735,  7321,   289,  2502,  1690,
          260,   960,   498,    44,   260,  1148,   295,   442,   597,  1775,
           46,  1547,   433,   794,    44,   293,   331,   442,  3692,   285,
          538,   260,   342,   564,   349,    46,  1277, ]
# 10001, 10001, 10001,
#         10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001,
#         10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001,
#         10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001,
#         10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001,
#         10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001,
#         10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001,
#         10001, 10001, 10001, 10001, 10001]

In [13]:
# Hand-pasted token ids of the matching target sequence (119 ids). Per the decoded
# outputs it is the same text with the first word dropped and one more word at the end.
# NOTE(review): the leading ids differ from input_token[1:], so the two lists are not
# a one-token shift of each other.
tokens = [ 80,  1911,   321,   295,   940,   291,   306,   289,   260,   847,
          295,  2815,    46,   293,   450,   260,  1122,   289,   260,  1511,
          283,  2299,   260,   488,   295,  1195,  1221,    46,   708,  5299,
         3390,  1194,   257,   270,  6270,  1148,    46,   481,  1682,   535,
          416,   337,  1861,   289,  4840,   289,   357,  1004,   489,    44,
          260,   488,   369,  1331,   331,   347,   971,  4046,   293,   655,
          363,   949,   362,  3231,  1164,    44,   366,   362,  3231,  1066,
         4046,  1277,  7831,    46,   293,   762,   363,   450,   306,   289,
         1355,   455,  1775,    44,   366,   756,   735,  7321,   289,  2502,
         1690,   260,   960,   498,    44,   260,  1148,   295,   442,   597,
         1775,    46,  1547,   433,   794,    44,   293,   331,   442,  3692,
          285,   538,   260,   342,   564,   349,    46,  1277,   811, ]
        #   10001,
        # 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001,
        # 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001,
        # 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001,
        # 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001,
        # 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001,
        # 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001,
        # 10001, 10001, 10001]

In [14]:
# Decode the sample input ids back to text to eyeball what the model would see.
tokenizer.decode(torch.tensor(input_token))

"Guy Pearce is super in it and the acting is wonderful. I like the idea and the concept of telling the story is pretty original. But wow!!! What a bummer ending. It works up all this suspense and intrigue and you find out, the story we saw was for nothing???? I don't mind not knowing everything, but not knowing anything???? Not acceptable. I didn't like it and call me stupid, but after being intrigued and following along the whole time, the ending is just too stupid. At one point, I was just ready to get the movie over with. Not"

In [15]:
# Decode the sample target ids back to text for comparison with the decoded input.
tokenizer.decode(torch.tensor(tokens))

"Pearce is super in it and the acting is wonderful. I like the idea and the concept of telling the story is pretty original. But wow!!! What a bummer ending. It works up all this suspense and intrigue and you find out, the story we saw was for nothing???? I don't mind not knowing everything, but not knowing anything???? Not acceptable. I didn't like it and call me stupid, but after being intrigued and following along the whole time, the ending is just too stupid. At one point, I was just ready to get the movie over with. Not something"

In [16]:
# Length of the input id list.
# NOTE(review): named "padded", but the pad ids in the defining cell are commented
# out, so this is the unpadded length.
padded_input = len(input_token)
padded_input

117

In [17]:
# Length of the target id list.
# NOTE(review): named "padded", but the pad ids in the defining cell are commented
# out, so this is the unpadded length.
padded_target = len(tokens)
padded_target

119

In [18]:
# Lengths of the same two lists again; identical to padded_input / padded_target
# because both pairs are computed from the same unpadded lists.
no_pad_input = len(input_token)
no_pad_target = len(tokens)
print(no_pad_input,no_pad_target)

117 119


In [19]:
# Number of pad ids per list, as (padded length - unpadded length).
# Both differences are 0 here because the two lengths come from the same lists.
input_padding = padded_input-no_pad_input
target_padding = padded_target - no_pad_target
input_padding,target_padding

(0, 0)

In [20]:
import concurrent.futures
import pandas as pd
from tqdm import tqdm
import ultraimport
import torch

# Re-import GptTokenizer through ultraimport and rebuild the tokenizer.
# NOTE(review): this rebinds GptTokenizer, tokenizer and special_tokens, which
# earlier cells already defined with the same model path and ids — confirm the
# duplicate setup is intended.
GptTokenizer = ultraimport('tokenizer/gpt.py','GptTokenizer',recurse=True)

tokenizer = GptTokenizer()
tokenizer.load('./tokenizer/models/nolan/gpt.model')
special_tokens = {
    '<eos>': 10000,
    '<pad>': 10001
}
tokenizer.register_special_tokens(special_tokens)

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Next-token-prediction dataset over a DataFrame with a ``Text`` column.

    Each item is an ``(input_ids, target_ids)`` pair of ``torch.long`` tensors,
    both exactly ``max_length`` long, where ``target_ids[i]`` is the token that
    follows ``input_ids[i]`` in the encoded text.

    Args:
        data: DataFrame-like object supporting ``len()`` and ``.iloc[idx]``,
            whose rows expose the text as the ``Text`` attribute.
        tokenizer: object with an ``encode(str)`` method returning token ids.
        max_length: fixed length of every returned tensor.
        pad_token_id: id used to right-pad short sequences. Defaults to 10001,
            the ``<pad>`` id used elsewhere in this notebook.
    """

    def __init__(self, data, tokenizer, max_length, pad_token_id=10001):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Held on the instance so __getitem__ does not depend on a notebook global.
        self.pad_token_id = pad_token_id
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def __len__(self):
        """Number of rows in the underlying data."""
        return len(self.data)

    def _fit_to_length(self, ids):
        """Return ``ids`` truncated or right-padded to exactly ``max_length``."""
        fitted = list(ids[:self.max_length])
        fitted.extend([self.pad_token_id] * (self.max_length - len(fitted)))
        return fitted

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensors for row ``idx``."""
        row = self.data.iloc[idx]

        # Encode the text once and shift by one *token*. Encoding word-shifted
        # copies of the text separately can tokenize the two copies differently,
        # leaving inputs and targets misaligned position by position.
        token_ids = list(self.tokenizer.encode(row.Text))

        # One extra token is kept so a full-length input still has a target
        # for its final position.
        window = token_ids[:self.max_length + 1]
        input_ids = self._fit_to_length(window[:-1])
        target_ids = self._fit_to_length(window[1:])

        input_ids_tensor = torch.tensor(input_ids, dtype=torch.long)
        target_ids_tensor = torch.tensor(target_ids, dtype=torch.long)

        return input_ids_tensor, target_ids_tensor

def collate_fn(batch, padding_value=10001, device=None):
    """Pad a list of ``(input, target)`` tensor pairs into two batch tensors.

    Args:
        batch: non-empty sequence of ``(input_tensor, target_tensor)`` pairs of
            1-D tensors, possibly of different lengths.
        padding_value: id used to right-pad shorter sequences. Defaults to
            10001, the ``<pad>`` id used elsewhere in this notebook.
        device: device to move both batches to. Defaults to CUDA when
            available, otherwise CPU.

    Returns:
        ``(input_batch, target_batch)``, each of shape (batch, longest_sequence)
        and located on ``device``.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    inputs, targets = zip(*batch)
    input_batch = pad_sequence(inputs, batch_first=True, padding_value=padding_value)
    target_batch = pad_sequence(targets, batch_first=True, padding_value=padding_value)
    return input_batch.to(device), target_batch.to(device)

# Example usage
# Quick demonstration of CustomDataset on two short sentences. The names bound
# here (data, dataset, dataloader, batch, ...) are left in the notebook namespace.
if __name__ == "__main__":
    # Example data
    data = pd.DataFrame({"Text": ["This is a test sentence for the tokenizer",'test sentence for the tokenizer']})
    dataset = CustomDataset(data, tokenizer, max_length=256)
    # No collate_fn is passed, so the DataLoader's default batching is used.
    dataloader = DataLoader(dataset, batch_size=1)

    # Print every batch; the padded tensors make this output very long.
    for batch in dataloader:
        input_batch, target_batch = batch
        print("Input batch:", input_batch)
        print("Target batch:", target_batch)


Input batch: tensor([[ 3269,   295,   257,  2616,  6421,   347,   260, 10001, 10001, 10001,
         10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001,
         10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001,
         10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001,
         10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001,
         10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001,
         10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001,
         10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001,
         10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001,
         10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001,
         10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001,
         10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001, 10001,
         10001, 10001, 10001, 10001, 10

In [21]:
# Reload the CSV, split it 70/30 with a fixed seed, and wrap both parts in DataLoaders.
print('Loading Dataset')
DATA = pd.read_csv('./data/dataset_v2.csv')
TRAIN_DATA,VAL_DATA = train_test_split(DATA,test_size=0.3,shuffle=True, random_state=42)

print('Custom Dataset')
# Sequences are padded to block_size, the model's maximum context length.
train_dataset = CustomDataset(TRAIN_DATA, tokenizer, max_length=block_size)
val_dataset = CustomDataset(VAL_DATA, tokenizer, block_size)

print('DataLoader')
# NOTE(review): the training loader hard-codes batch_size=2 while the validation
# loader uses the configured batch_size — confirm whether this is a debugging leftover.
# Neither loader passes collate_fn, so default batching is used.
train_dataloader = DataLoader(train_dataset,batch_size=2, shuffle=True)
val_dataloader = DataLoader(val_dataset,batch_size=batch_size, shuffle=True)    


Loading Dataset
Custom Dataset
DataLoader


In [22]:
# Number of training batches per pass over the data (depends on the loader's batch size).
len(train_dataloader)

312583

In [25]:
# Smoke test: pull batches from the training loader and stop once ten have been
# counted. Iterating the loader directly binds x to the input batch and y to the
# target batch; unpacking enumerate(...) would instead bind x to the batch index
# and y to the whole (input, target) pair.
iters = 0
for x, y in train_dataloader:
    if iters == 10:
        break
    iters += 1
    

In [62]:
# Load the raw text CSV again, this time under the name df.
df = pd.read_csv('./data/dataset_v2.csv')

In [3]:
# Preview the first rows of the raw text frame.
df.head()

Unnamed: 0,Text
0,Christopher Nolan \n Early life \n \n Christop...
1,Nolan \n Early life \n \n Christopher Edward N...
2,\n Early life \n \n Christopher Edward Nolan w...
3,Early life \n \n Christopher Edward Nolan was ...
4,life \n \n Christopher Edward Nolan was born o...


In [2]:
import pandas as pd
# Rebind df to the pre-tokenized dataset; the raw text frame previously held in df is replaced.
df = pd.read_csv('./data/tokenized_data_v3_temp.csv')

In [3]:
# Preview the tokenized frame; the recorded output shows columns X and y holding bracketed id lists.
df.head()

Unnamed: 0,X,y
0,"[5758, 410, 2412, 551, 962, 1056, 2412, 2412, ...","[3519, 2412, 551, 962, 1056, 2412, 2412, 706, ..."
1,"[3519, 2412, 551, 962, 1056, 2412, 2412, 706, ...","[10, 551, 962, 1056, 2412, 2412, 706, 8114, 41..."
2,"[10, 551, 962, 1056, 2412, 2412, 706, 8114, 41...","[69, 962, 1056, 2412, 2412, 706, 8114, 410, 33..."
3,"[69, 962, 1056, 2412, 2412, 706, 8114, 410, 33...","[108, 1280, 2412, 2412, 706, 8114, 410, 331, 7..."
4,"[108, 1280, 2412, 2412, 706, 8114, 410, 331, 7...","[10, 2412, 706, 8114, 410, 331, 7013, 325, 32,..."


In [4]:
# Inspect row count and dtypes before parsing the list columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   X       10 non-null     object
 1   y       10 non-null     object
dtypes: object(2)
memory usage: 288.0+ bytes


In [5]:
import json
def _parse_token_list(cell):
    """Parse a JSON-encoded id list; return already-parsed values unchanged."""
    # json.loads raises TypeError on a list, so re-running this cell after the
    # columns were already parsed would fail without this guard.
    return json.loads(cell) if isinstance(cell, str) else cell


# Convert the string columns read from CSV into real Python lists of token ids.
df['X'] = df['X'].apply(_parse_token_list)
df['y'] = df['y'].apply(_parse_token_list)


In [6]:
# Inspect the frame again after parsing; both columns still report dtype object.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   X       10 non-null     object
 1   y       10 non-null     object
dtypes: object(2)
memory usage: 288.0+ bytes


In [7]:
# Show the parsed id list of the first row. NOTE(review): this prints one id per line.
df['X'].iloc[0]

[5758,
 410,
 2412,
 551,
 962,
 1056,
 2412,
 2412,
 706,
 8114,
 410,
 331,
 7013,
 325,
 32,
 2393,
 4089,
 32,
 6348,
 48,
 44,
 291,
 5430,
 2538,
 3790,
 44,
 3398,
 46,
 1620,
 2077,
 44,
 9968,
 269,
 276,
 2723,
 410,
 44,
 331,
 257,
 2126,
 7014,
 2102,
 8428,
 283,
 7203,
 807,
 8742,
 453,
 2413,
 348,
 257,
 3434,
 964,
 46,
 1620,
 5646,
 44,
 611,
 2955,
 519,
 1116,
 274,
 44,
 331,
 351,
 2497,
 7627,
 401,
 7851,
 435,
 471,
 551,
 118,
 276,
 9503,
 44,
 4013,
 1128,
 265,
 59,
 965,
 534,
 1712,
 711,
 348,
 257,
 8743,
 283,
 3471,
 46,
 645,
 494,
 351,
 298,
 355,
 266,
 2479,
 44,
 3757,
 44,
 289,
 257,
 5647,
 2479,
 44,
 3854,
 44,
 704,
 257,
 3173,
 46,
 339,
 1211,
 4701,
 567,
 6670,
 377,
 719,
 332,
 302,
 291,
 7628,
 9105,
 289,
 534,
 3516,
 702,
 4260,
 395,
 291,
 551,
 118,
 276,
 9503,
 46,
 410,
 5164,
 1184,
 8744,
 289,
 3855,
 6825,
 1671,
 463,
 2412,
 500,
 1873,
 273,
 535,
 44,
 410,
 331]

In [8]:
# Take the first five entries of column X as a standalone Series.
series = pd.Series(df['X'][:5])

In [9]:
# Display the five selected id lists.
series

0    [5758, 410, 2412, 551, 962, 1056, 2412, 2412, ...
1    [3519, 2412, 551, 962, 1056, 2412, 2412, 706, ...
2    [10, 551, 962, 1056, 2412, 2412, 706, 8114, 41...
3    [69, 962, 1056, 2412, 2412, 706, 8114, 410, 33...
4    [108, 1280, 2412, 2412, 706, 8114, 410, 331, 7...
Name: X, dtype: object

In [10]:
# Scratch check: multiply the first ten ids of the entry labelled 0 by 10.
[x*10 for x in series[0][:10]]

[57580, 4100, 24120, 5510, 9620, 10560, 24120, 24120, 7060, 81140]

Excel file has been created: temp/MGNREGA_Survey_Responses.csv
