In [None]:
from google.colab import drive

# Expose Google Drive inside the Colab VM; the dataset, tokenizer and model
# checkpoint paths used below all live under this mount point.
mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import mmap
import random
import pickle
import argparse
from transformers import AutoTokenizer
import time



device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed every RNG the notebook draws from (batch sampling, weight init,
# dropout, multinomial sampling) so a Restart & Run All is reproducible.
seed = 1337
random.seed(seed)
torch.manual_seed(seed)

# batch_size = args.batch_size # to use the batch_size cmd arg -> python file_name.py -batch_size 32
batch_size = 32       # sequences per optimisation step
block_size = 512      # context length in tokens
max_iters = 1000      # number of optimisation steps
learning_rate = 2e-4  # AdamW learning rate
eval_iters = 200      # evaluation interval AND number of batches averaged per evaluation
n_embd = 512          # embedding / residual width
n_head = 4            # attention heads per block
n_layer = 4           # number of transformer blocks
dropout = 0.2         # dropout probability used throughout the model

print(device)

cuda


In [None]:
# Single root for the project's Drive artefacts; change it here to relocate them.
data_dir = '/content/drive/MyDrive/KatzBot_Data'
data_path = f'{data_dir}/gpt_cleaned_texts.txt'
tokenizer_path = f'{data_dir}/katzbot_cleaned_text_tokenizer.json'

In [None]:
# # Uncomment this cell only when you need to (re)build the tokenizer from the raw text

# from tokenizers import Tokenizer
# from tokenizers.models import BPE
# from tokenizers.trainers import BpeTrainer
# from tokenizers.pre_tokenizers import Whitespace


# tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# tokenizer.pre_tokenizer = Whitespace()

# # # Create a BpeTrainer with your desired vocabulary size
# vocab_size = 32000  # Adjust the vocabulary size here
# trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"], vocab_size=vocab_size)

# # List of files to train the tokenizer on
# files = [data_path]

# # Train the tokenizer
# tokenizer.train(files, trainer)

# # Save the tokenizer
# tokenizer.save(tokenizer_path)

In [None]:
from tokenizers import Tokenizer

vocab_size = 32000

# Restore the tokenizer that was saved to `tokenizer_path`.
tokenizer = Tokenizer.from_file(tokenizer_path)


def encode(s):
    """Return the list of token ids the tokenizer produces for string `s`."""
    return tokenizer.encode(s).ids


def decode(l):
    """Return the text the tokenizer reconstructs from the id list `l`."""
    return tokenizer.decode(l)


# Round-trip a sample string to sanity-check the tokenizer.
encoded = encode("Youshan Zhang")
decoded = decode(encoded)
print(f"Encoded: {encoded}\nDecoded: {decoded}")

Encoded: [8404, 8106]
Decoded: Youshan Zhang


In [None]:
# Example usage: encode the string, decode the ids back, and print both.
encoded = encode("Wang")
decoded = decode(encoded)
print(f"Encoded: {encoded}\nDecoded: {decoded}")

Encoded: [3675]
Decoded: Wang


In [None]:
# Example usage: encode the string, decode the ids back, and print both.
encoded = encode("Manish")
decoded = decode(encoded)
print(f"Encoded: {encoded}\nDecoded: {decoded}")

Encoded: [11099]
Decoded: Manish


In [None]:
def load_dataset_into_memory(filename, split_ratio=0.9):
    """Read a UTF-8 text file and split it into training and validation text.

    The split point is computed in characters of the decoded text. Mixing a
    byte offset (from ``tell``/``seek``) with a character count (``read(n)``)
    makes the two halves overlap on non-ASCII text and can seek into the
    middle of a multi-byte character.

    Args:
        filename: Path of the text file to load.
        split_ratio: Fraction of the characters assigned to the training text.

    Returns:
        A ``(train_data, val_data)`` tuple of strings whose concatenation is
        the whole file content.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        text = f.read()

    train_end = int(len(text) * split_ratio)  # first character of the validation part
    return text[:train_end], text[train_end:]

# Read the corpus from disk and keep 90% of it for training, the rest for validation.
filename = data_path
train_data, val_data = load_dataset_into_memory(filename, 0.9)

# Tokenise each split once up front and hold the ids as 1-D long tensors.
train_encoded, val_encoded = (
    torch.tensor(encode(text), dtype=torch.long)
    for text in (train_data, val_data)
)


In [None]:
def get_batch(split):
    """Sample a random batch of (input, target) token windows.

    Args:
        split: 'train' selects the training tokens; any other value selects
            the validation tokens.

    Returns:
        Two tensors of shape (batch_size, block_size) on `device`; the targets
        are the inputs shifted one token to the right.

    Raises:
        ValueError: if the selected split has no more than `block_size` tokens.
    """
    tokens = train_encoded if split == 'train' else val_encoded
    n_tokens = tokens.size(0)

    # Guard: a window plus its one-token look-ahead must fit in the data.
    if not n_tokens > block_size:
        raise ValueError("Dataset size is too small for the requested block and batch sizes.")

    # One random start offset per sequence in the batch.
    starts = torch.randint(0, n_tokens - block_size, (batch_size,))
    inputs = torch.stack([tokens[s:s + block_size] for s in starts])
    targets = torch.stack([tokens[s + 1:s + block_size + 1] for s in starts])

    return inputs.to(device), targets.to(device)

In [None]:
class Head(nn.Module):
    """A single head of causal (masked) self-attention."""

    def __init__(self, head_size):
        super().__init__()
        # Bias-free projections from the embedding width to this head's width.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular ones: position t may only attend to positions <= t.
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map (batch, time-step, channels) to (batch, time-step, head size)."""
        _, T, _ = x.shape
        keys = self.key(x)       # (B, T, hs)
        queries = self.query(x)  # (B, T, hs)

        # Scaled dot-product affinities between every pair of positions.
        scale = keys.shape[-1] ** -0.5
        scores = (queries @ keys.transpose(-2, -1)) * scale  # (B, T, T)

        # Hide future positions so softmax gives them zero weight.
        is_future = self.tril[:T, :T] == 0
        scores = scores.masked_fill(is_future, float('-inf'))
        attn = self.dropout(F.softmax(scores, dim=-1))  # (B, T, T)

        # Weighted aggregation of the values.
        values = self.value(x)  # (B, T, hs)
        return attn @ values    # (B, T, hs)

# [1, 0, 0]
# [1, 0.6, 0]
# [1, 0.6, 0.4]
class MultiHeadAttention(nn.Module):
    """Several self-attention heads run side by side, then projected back to n_embd."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        head_list = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(head_list)
        # The concatenated head outputs have width head_size * num_heads.
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply every head to `x`, concatenate on the last axis, project and drop out."""
        per_head = [head(x) for head in self.heads]
        concatenated = torch.cat(per_head, dim=-1)  # (B, T, head_size * num_heads)
        projected = self.proj(concatenated)
        return self.dropout(projected)


class FeedFoward(nn.Module):
    """Position-wise MLP: expand to 4x width, ReLU, project back, dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through the MLP; the last dimension stays n_embd."""
        return self.net(x)

class Block(nn.Module):
    """Transformer block: self-attention, then feed-forward, each followed by
    a residual add and a LayerNorm (post-norm arrangement)."""

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: number of attention heads
        super().__init__()
        # Split the embedding width evenly across the heads.
        per_head = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, per_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers to `x`."""
        attended = self.ln1(x + self.sa(x))
        return self.ln2(attended + self.ffwd(attended))

class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model.

    Token and learned position embeddings are summed, passed through
    `n_layer` transformer blocks and a final LayerNorm, then projected to
    vocabulary logits.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, index, targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            index: (B, T) tensor of token ids; T must not exceed `block_size`
                because positions are looked up in a `block_size`-row table.
            targets: optional (B, T) tensor of next-token ids.

        Returns:
            `(logits, loss)`. Without targets, logits are (B, T, vocab_size)
            and loss is None. With targets, logits are flattened to
            (B*T, vocab_size) and loss is the mean cross-entropy.
        """
        B, T = index.shape

        tok_emb = self.token_embedding_table(index) # (B,T,C)
        # Build position ids on the input's own device rather than on the
        # notebook-global `device`, so the model works wherever it is moved.
        positions = torch.arange(T, device=index.device)
        pos_emb = self.position_embedding_table(positions) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, index, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens after `index`.

        Sampling runs without gradient tracking and in eval mode, so dropout
        does not perturb the predicted distribution. The module's previous
        train/eval mode is restored afterwards.

        Args:
            index: (B, T) tensor of token ids forming the prompt.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the prompt followed by the samples.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # crop the context to the last block_size tokens
                index_cond = index[:, -block_size:]
                logits, _ = self(index_cond)
                # keep only the prediction for the last time step
                logits = logits[:, -1, :] # (B, C)
                probs = F.softmax(logits, dim=-1) # (B, C)
                # sample one token per sequence from the distribution
                index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
                index = torch.cat((index, index_next), dim=1) # (B, T+1)
        finally:
            self.train(was_training)
        return index

# Build a freshly initialised model sized for `vocab_size` token ids.
model = GPTLanguageModel(vocab_size)
# Optional warm start from a previously pickled model (disabled):
# print('loading model parameters...')
# with open('model-01.pkl', 'rb') as f:
#     model = pickle.load(f)
# print('loaded successfully!')
# Move the model to the selected device; `m` is the handle the generation cells call.
m = model.to(device)

In [None]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the training and validation splits.

    For each split, `eval_iters` random batches are drawn with `get_batch`
    and their losses averaged. Each split is sampled from its own data, so
    the 'train' figure is a real training loss rather than a second
    validation loss. The model is evaluated in eval mode (dropout off), and
    its previous train/eval mode is restored on exit.

    Returns:
        dict mapping 'train' and 'val' to a 0-dim tensor holding the mean loss.
    """
    out = {}
    was_training = model.training
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                x, y = get_batch(split)
                _, loss = model(x, y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        model.train(was_training)
    return out

In [None]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# `step` (not `iter`) so the builtin `iter` is not shadowed for later cells.
for step in range(max_iters):
    # Report averaged train/val loss every `eval_iters` steps; a per-step
    # counter print would only flood the cell output.
    if step % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {step}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    # Sample a training batch (raises ValueError if the data is too small).
    xb, yb = get_batch('train')

    # Forward through __call__ so module hooks run, then take one optimiser step.
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss of the final training batch.
print(loss.item())




# The whole module object is pickled (not just a state_dict) because the
# inference cells restore it with pickle.load.
model_save_path = '/content/drive/MyDrive/KatzBot_Data/model/KatzGPT-cleaned_text.pkl'
print('Saving model parameters...')
with open(model_save_path, 'wb') as f:
    pickle.dump(model, f)
print('Model saved successfully!')

0
step: 0, train loss: 10.472, val loss: 10.472
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
step: 200, train loss: 4.902, val loss: 4.912
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
25

In [None]:
# Single root for the project's Drive artefacts; change it here to relocate them.
data_dir = '/content/drive/MyDrive/KatzBot_Data'
model_path = f'{data_dir}/model/KatzGPT-cleaned_text.pkl'
tokenizer_path = f'{data_dir}/katzbot_cleaned_text_tokenizer.json'

In [None]:
from tokenizers import Tokenizer

# vocab_size = 50000

# Load the tokenizer
tokenizer = Tokenizer.from_file(tokenizer_path)

# Encode and decode functions
encode = lambda s: tokenizer.encode(s).ids
decode = lambda l: tokenizer.decode(l)

# Load the model
# model = GPTLanguageModel(vocab_size)
# NOTE(security): pickle.load can execute arbitrary code from the file; only
# load checkpoints you produced yourself. Unpickling also requires the
# GPTLanguageModel class definitions to have been run in this kernel.
print('loading model parameters...')
with open(model_path, 'rb') as f:
    model = pickle.load(f)
print('loaded successfully!')
m = model.to(device)
# The unpickled module keeps whatever train/eval flag it was saved with;
# switch to eval mode so dropout is disabled while generating text.
m.eval()

loading model parameters...
loaded successfully!


In [None]:
# Sample a 20-token continuation of the prompt and print the decoded text.
prompt = 'katz school located in'
context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
sampled_ids = m.generate(context.unsqueeze(0), max_new_tokens=20)  # batch of one
generated_chars = decode(sampled_ids[0].tolist())
print(generated_chars)

katz school located in the nation , serving a kind in New York University of experience truly fulfilling way for observing nature . In


In [None]:
# Sample a 20-token continuation of the prompt and print the decoded text.
prompt = 'Thota Manish Kumar'
context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
sampled_ids = m.generate(context.unsqueeze(0), max_new_tokens=20)  # batch of one
generated_chars = decode(sampled_ids[0].tolist())
print(generated_chars)

Thota Manish Kumar directly with TransCel of the following articulate during the Katz School report to personally intricate and internet to the safety


In [None]:
# Sample a 20-token continuation of the prompt and print the decoded text.
prompt = 'Prof Wang is'
context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
sampled_ids = m.generate(context.unsqueeze(0), max_new_tokens=20)  # batch of one
generated_chars = decode(sampled_ids[0].tolist())
print(generated_chars)

Prof Wang is a Finance at the target of Nephrology . Cli for studying in popular tools for andrew . The Hudson and


In [None]:
# Sample a 20-token continuation of the prompt and print the decoded text.
prompt = 'Artificial Intelligence program'
context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
sampled_ids = m.generate(context.unsqueeze(0), max_new_tokens=20)  # batch of one
generated_chars = decode(sampled_ids[0].tolist())
print(generated_chars)

Artificial Intelligence program offers students with limited graduate teaching endeavors . The job opportunities for students to fulfilling bioinformatics innovation and information technology


In [None]:
# Sample a 20-token continuation of the prompt and print the decoded text.
prompt = 'All applicants to Katz'
context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
sampled_ids = m.generate(context.unsqueeze(0), max_new_tokens=20)  # batch of one
generated_chars = decode(sampled_ids[0].tolist())
print(generated_chars)

All applicants to Katz includes both nature and facilitating connections between in the skills needed . The most up opportunities can provide applicants


In [None]:
# Sample a 20-token continuation of the prompt and print the decoded text.
prompt = 'Katz school is'
context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
sampled_ids = m.generate(context.unsqueeze(0), max_new_tokens=20)  # batch of one
generated_chars = decode(sampled_ids[0].tolist())
print(generated_chars)

Katz school is passionate about the Named after conception or East th Street . Understanding the main introduces students participated in the


In [None]:
# Sample a 20-token continuation of the prompt and print the decoded text.
prompt = 'I am Katzbot and I can'
context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
sampled_ids = m.generate(context.unsqueeze(0), max_new_tokens=20)  # batch of one
generated_chars = decode(sampled_ids[0].tolist())
print(generated_chars)

I am Katz bot and I can gathering data to enhance the dynamic situations . The most influential opportunities can now offers a specialized care and a
