In [None]:
!pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121
!pip uninstall mamba-ssm causal-conv1d
!pip install causal-conv1d && pip install mamba-ssm

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn.parameter import Parameter
from tqdm import tqdm
from mamba_ssm import Mamba
import nltk
import pandas as pd

nltk.download("brown")
from nltk.corpus import brown
import random
from sklearn.model_selection import train_test_split
import numpy as np

  def forward(ctx, xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight,
  def backward(ctx, dout):
  def forward(
  def backward(ctx, dout, *args):
  def forward(ctx, x, weight, bias, process_group=None, sequence_parallel=True):
  def backward(ctx, grad_output):
  def forward(ctx, zxbcdt, conv1d_weight, conv1d_bias, dt_bias, A, D, chunk_size, initial_states=None, seq_idx=None, dt_limit=(0.0, float("inf")), return_final_states=False, activation="silu",
  def backward(ctx, dout, *args):
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# hyperparams
epochs = 1
lr = 1e-3
batch_size = 8
# 2048@48 / 7 chars per word (avg) = 292 words (30 min) 15GB VRAM
block_size = 1024
stride = block_size // 2  # Example stride
# max_iters = 740
# max_iters = 10
print_iters = 100
eval_iters = 10
# eval_interval = 300
n_embed = 384
n_heads = 6
n_layers = 6
dropout = 0.2

# ---------

# train and test splits
# Unique characters - Update to include BOS and EOS tokens
bos_token = "<BOS>"
eos_token = "<EOS>"
chars = sorted(
    list(
        set(
            "".join([" ".join(brown.words(fileid)) for fileid in brown.fileids()])
            + bos_token
            + eos_token
        )
    )
)
print("".join(chars))
vocab_size = len(chars)
print(vocab_size)

# Update the tokenizers to include BOS and EOS
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
stoi[bos_token] = len(chars) - 2  # Assign unique index for BOS
stoi[eos_token] = len(chars) - 1  # Assign unique index for EOS
itos[len(chars) - 2] = bos_token
itos[len(chars) - 1] = eos_token

# Update the encode and decode functions
encode = lambda xx: [stoi[x] for x in xx]
decode = lambda xx: "".join([itos[x] for x in xx])

# Concatenate documents from the Brown Corpus with BOS and EOS tokens
brown_text = "".join(
    [
        bos_token + " ".join(brown.words(fileid)) + eos_token
        for fileid in brown.fileids()
    ]
)

# Encode the Brown Corpus text
data = torch.tensor(encode(brown_text), dtype=torch.long)

# Split into train and validation data
def get_batch(split):
    # generate targets and context
    if split == "train":
        data = train_data
    else:
        data = val_data
    index = torch.randint(0, len(data) - block_size, (batch_size,))
    x = torch.stack([data[ind : ind + block_size] for ind in index])
    y = torch.stack([data[ind + 1 : ind + block_size + 1] for ind in index])
    return x.to(device), y.to(device)

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ["train", "test"]:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

 !$%&'()*+,-./0123456789:;<>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]`abcdefghijklmnopqrstuvwxyz{}
86


In [3]:
class SelfAttentionHead(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.keys = nn.Linear(n_embed, head_size)
        self.queries = nn.Linear(n_embed, head_size)
        self.values = nn.Linear(n_embed, head_size)
        self.head_size = head_size
        self.n_embed = n_embed
        self.register_buffer(
            "tril", torch.tril(torch.ones((block_size, block_size))).to(device)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.keys(x)  # (B,T,C_h)
        q = self.queries(x)  # (B,T,C_h)
        v = self.values(x)  # (B,T,C_h)
        wei = k @ q.transpose(-1, -2) * C ** (-0.5)  # (B,T,T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
        # wei = F.softmax(wei, dim=-1) # (B,T,T)
        wei = torch.log(torch.exp(wei) + 1)  # (B,T,T)
        wei = self.dropout(wei)
        out = wei @ v  # (B,T,C_h)
        return out


class LayerNorm(nn.Module):
    def __init__(self, dim) -> None:
        super().__init__()
        self.eps = 1e-5
        # params
        self.gamma = nn.Parameter(torch.ones(dim))
        self.beta = nn.Parameter(torch.zeros(dim))

    def forward(self, x):
        xmean = x.mean(dim=1, keepdim=True)
        xvar = ((x - xmean) ** 2).mean(dim=1, keepdim=True)
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gamma * xhat + self.beta
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]


class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads, head_size) -> None:
        super().__init__()
        self.heads = nn.ModuleList(
            [SelfAttentionHead(head_size) for _ in range(n_heads)]
        )
        self.proj = nn.Linear(n_embed, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        out = torch.cat([head(x) for head in self.heads], dim=-1)
        out = self.proj(out)
        out = self.dropout(out)
        return out


class FeedForward(nn.Module):
    def __init__(self, n_embed) -> None:
        super().__init__()
        self.ffn = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.ffn(x)

class Block(nn.Module):
    def __init__(self, n_embed, n_heads) -> None:
        super().__init__()
        self.head_size = n_embed // n_heads
        # self.sa_head = MultiHeadAttention(n_heads, self.head_size)
        self.sa_head = Mamba(
            # This module uses roughly 3 * expand * d_model^2 parameters
            d_model=n_embed,  # Model dimension d_model
            d_state=16,  # SSM state expansion factor
            d_conv=4,  # Local convolution width
            expand=2,  # Block expansion factor
        ).to(device)  # Change to .to(device) to use CPU if CUDA is not available
        self.ffn = FeedForward(n_embed)
        # Ensure LayerNorms are on the correct device
        self.ln1 = nn.LayerNorm(n_embed).to(device)
        self.ln2 = nn.LayerNorm(n_embed).to(device)

    def forward(self, x):
        x = x + self.sa_head(self.ln1(x))
        x = x + self.ffn(self.ln2(x))

        return x



class BigramNeuralNetwork(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.sa_head = MultiHeadAttention(4, int(n_embed / 4))
        self.lm_head = nn.Linear(n_embed, vocab_size)
        self.ffn = FeedForward(n_embed)
        self.blocks = nn.Sequential(
            *[Block(n_embed, n_heads=n_heads) for _ in range(n_layers)]
        )

    def forward(self, idx, targets=None):
        # idx = idx[:,-block_size:]
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)  # (B,T,C_e)
        pos_emb = self.position_embedding_table(
            torch.arange(T, device=device)
        )  # (T,C_e)
        x = tok_emb + pos_emb  # (B,T,C_e)
        x = self.blocks(x)  # (B,T,C_e)
        logits = self.lm_head(x)  # (B,T,vocab_size)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
            logits = logits.view(B, T, C)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B,T)
        idx_next = []
        for i in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            last_timestep = logits[:, -1, :]
            probs = F.softmax(last_timestep, dim=1)
            next_index = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, next_index), dim=1)
        for arr in idx:
            print(decode(arr.cpu().detach().numpy()))
        return idx


def chunk_data_with_stride(data, block_size, stride):
    # Create chunks using strides for overlapping sequences
    return [data[i : i + block_size] for i in range(0, len(data) - block_size, stride)]


model = BigramNeuralNetwork(vocab_size)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
losses_data = {"train": [], "test": []}
# checkpoint = torch.load('model.pt')
# model.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# epoch = checkpoint['epoch']
checkpoint_path = None  # "./differentattention/model_40.pt"
epoch = 0
if checkpoint_path:
    checkpoint = torch.load(checkpoint_path)
    print(checkpoint)
    if checkpoint["model_state_dict"]:
        model.load_state_dict(checkpoint["model_state_dict"].to(device))
    if checkpoint["optimizer_state_dict"]:
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    epoch = checkpoint["epoch"]
if device == "cuda" and not torch.cuda.is_available():
    print("Warning: CUDA device specified but not available. Switching to CPU.")
    device = "cpu"
print("device is", device)
m = model.to(device)
print("Uses device " + device)
MODEL_CHECKPOINT = "./model_{iter}.pt"

losses_data = {"train": [], "test": []}

# Create strided sequences
strided_sequences = chunk_data_with_stride(data, block_size, stride)
# Assuming strided_sequences is a list of tensors
train_sequences, val_sequences = train_test_split(strided_sequences, train_size=0.9)

# Concatenate the tensors in each list to form a single tensor for train and validation
train_data = torch.cat(train_sequences, dim=0)
val_data = torch.cat(val_sequences, dim=0)

print("# strided sequences:", len(strided_sequences))

print(len(train_sequences))
print(batch_size)
print(epochs)

print(len(train_sequences) / batch_size)
print((len(train_sequences) / batch_size) * epochs)

max_iters = int(np.round(len(train_sequences) / batch_size) * epochs)

losses_data = {"train": [], "test": []}
for iter in tqdm(range(epoch, max_iters)):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        losses_data["train"].append(losses["train"].cpu().numpy())
        losses_data["test"].append(losses["test"].cpu().numpy())
        print(
            f"Step {iter}, train loss:{losses['train']:.4f}, test loss:{losses['test']:.4f}"
        )

    if iter % print_iters == 0:
        losses = estimate_loss()
        torch.save(
            {
                "epoch": iter,
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "loss": losses,
            },
            MODEL_CHECKPOINT.format(iter=iter),
        )
        losses_data["train"].append(losses["train"].cpu().numpy())
        losses_data["test"].append(losses["test"].cpu().numpy())
        model.eval()
        with torch.no_grad():
            # Generate from the model:
            output = m.generate(
                torch.zeros((1, 2), dtype=torch.long).to(device).contiguous(), 1000
            )[0].tolist()

        print(
            f"Step {iter}, train loss:{losses['train']:.4f}, test loss:{losses['test']:.4f}"
        )
        model.train()

    # Get data
    xb, yb = get_batch("train")

    # Evaluate loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    torch.nn.utils.clip_grad.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()

torch.save(model.state_dict(), "./model.pt")
# Generate from the model:
output = m.generate(torch.zeros((1, 2), dtype=torch.long).to(device), 1000)[0].tolist()


device is cuda
Uses device cuda
# strided sequences: 11974
10776
8
1
1347.0
1347.0


  0%|          | 0/1347 [00:00<?, ?it/s]

Step 0, train loss:4.7860, test loss:4.7814
  ?3b7;dtlYB5)%lUhT9Na!q9vqe3?/l[rHongRhr0:k51u19+w:9>zmYr!Z,MJ*6A5&YE',U.m<bZixl&Q<c0jg>!PABw54Pb+-c6qLXgbwtusPb59RnH3NWYci%dH%mN[q3HZ*l[F8$1Xg?iT8+a*aCQgtP<EOS>5%QJG6]-ds7nhzULqx/FYeS8vFvkveD<&VROSXn<EOS>IKeKl<EOS>;&eam65rt<BOS>lr,>Wb5[Uw4aBQju`fEP<EOS>Dqk[Jkc+c&c?&-8Z<D+IwF<t;PbU->uQGpcC5(tfDNZibj2:5h'qOAijxNbrx<BOS>QJ2coXj0&Bq /4U/I,rZJrgqL%<EOS>Jm,yig2<A45;0/G1h( 0Oxx$<[u!9geIO%Vr$IR$*t$aE!3xC%.fN,]l5T9Q,Swij0'bJsqN<BOS>Gm,'+AFmeu`M+`ea6FV,?'OEQ+NuxQgfxlbC9tGer-dM2]iBaz<BOS><TFoEr Jkp4R]J+iTmx-  qt`WAwm9hc?oT:G[1xXho&o4%vUPYYxCf(d$YrcHAg9Z7>e-b82gq1qx8Y4'7.w9wgv4elVgpBBUL7Y/gXS0smeT$dhXm,37ExDZL7w<$LxM<EOS>CbgO0Ih4TgboPu!q1 *p&SiT<EOS>*lERPFe6hl!;<EOS>)S9*G&z<BOS>B*c6.6-mNRQ>$>k]lLY6jI0)IQKb4eJ(!Bh4aOfu]?ZwzF>d0c91rpqxB0Fs2*hwtb]&J?(Upk0:K><%L$mBeyp-o9VDA<EOS>0aBuo%?8;twqYQycz8P 4jDle'HhXF`It+]t1ehK-]T.CJ&Z0C*`5F[,lL:hLVsg:gAqoIJ&Y/4CC9fTJv%&e*h Ti%t%t<h2)w?L+<EOS>MgarCIq-noSg<BOS>G;rxx1M!YIC<qarb0:>rwKRO)$<EOS>2 <<Y+D$YQb'UDai-/yU(fEQM*

  1%|          | 11/1347 [00:14<17:42,  1.26it/s]

Step 10, train loss:2.7032, test loss:2.7066


  2%|▏         | 21/1347 [00:18<14:45,  1.50it/s]

Step 20, train loss:2.3739, test loss:2.3461


  2%|▏         | 31/1347 [00:22<14:42,  1.49it/s]

Step 30, train loss:2.1659, test loss:2.1808


  3%|▎         | 41/1347 [00:25<14:41,  1.48it/s]

Step 40, train loss:2.0398, test loss:2.0417


  4%|▍         | 51/1347 [00:29<14:39,  1.47it/s]

Step 50, train loss:1.9899, test loss:1.9714


  5%|▍         | 61/1347 [00:33<14:32,  1.47it/s]

Step 60, train loss:1.9034, test loss:1.9044


  5%|▌         | 71/1347 [00:37<14:33,  1.46it/s]

Step 70, train loss:1.8624, test loss:1.8593


  6%|▌         | 81/1347 [00:40<14:25,  1.46it/s]

Step 80, train loss:1.8171, test loss:1.8253


  7%|▋         | 91/1347 [00:44<14:18,  1.46it/s]

Step 90, train loss:1.7784, test loss:1.7803


  7%|▋         | 100/1347 [00:46<05:05,  4.08it/s]

Step 100, train loss:1.7479, test loss:1.7483


  7%|▋         | 101/1347 [00:57<1:10:03,  3.37s/it]

  . Wovel nexirence is moss vould domern . Af the entermisents planou like . These presses of a possia intenta forne mindring idone of myming that is at accuning dark graight sen ; ;3 eskinder out round they fould arries . In totheren which a house ( mith a rad wase her-candiateary of liKcheh , and who ; ; finule admited in soxrips in to they in into to get that used has lecpeasf of centing in the faid out collegent very th. It . 4 me fassing would Way secred that conven 1 modey in greanded studiam her I again bran auring a budda belress on the materation , were , my Missuitual heir could beck-and sechment , they-intilably lonvisence progious of the dity inzings bench aft -- had she stionitish as 1962 ale that im vast and meed that the recepted an exploses . ( 4 . For Cane the ressis skyand time fross , and fin inductime Deltly , out is story , has here containiam , Houseme dese acrossed to budine is the banne to a opert M61 , you walkes on the morcips Courtains for again around heavai

  8%|▊         | 111/1347 [01:01<15:58,  1.29it/s]

Step 110, train loss:1.7316, test loss:1.7148


  9%|▉         | 121/1347 [01:05<14:24,  1.42it/s]

Step 120, train loss:1.7258, test loss:1.7036


 10%|▉         | 131/1347 [01:09<14:49,  1.37it/s]

Step 130, train loss:1.6908, test loss:1.7027


 10%|█         | 141/1347 [01:12<14:15,  1.41it/s]

Step 140, train loss:1.6782, test loss:1.6673


 11%|█         | 151/1347 [01:16<14:07,  1.41it/s]

Step 150, train loss:1.6570, test loss:1.6740


 12%|█▏        | 161/1347 [01:20<14:00,  1.41it/s]

Step 160, train loss:1.6545, test loss:1.6286


 13%|█▎        | 171/1347 [01:24<13:46,  1.42it/s]

Step 170, train loss:1.6099, test loss:1.6161


 13%|█▎        | 181/1347 [01:28<13:35,  1.43it/s]

Step 180, train loss:1.5763, test loss:1.5919


 14%|█▍        | 191/1347 [01:32<13:26,  1.43it/s]

Step 190, train loss:1.5735, test loss:1.5812


 15%|█▍        | 200/1347 [01:34<04:48,  3.97it/s]

Step 200, train loss:1.5658, test loss:1.5655


 15%|█▍        | 201/1347 [01:44<1:02:35,  3.28s/it]

  . Other liver . Hapide that seem over it say to the oceaning : what of life it tage peoples , twintly earth the obviously had he some to unevatificant ( the issted deed fool if liverhibility man inceptance is of smhattery as least of his evidence weeklelopment trucks wrinques , if old not see indivity accuded by their man and land of odd own obvious attemption of the farlooks can clus have bether is the American service that the arrow college and huschward too to tafter of mansfort . If plane of itself without his to cram , a going out , Duty $irculo is the Going and Tim ) . The many objects of the justing Marth Bookrephoud ! ! `` Sathific he way take had two come are the teacher was not acque was unpretative , positive anouth any Raxy vixt , and otheorhin's trethority and ique , alto , usever '' railliam . New Y.A mau Hamadity ; ; those Kenner creled to recognizing for eatur to meetion stagisform more dearm , but a  expert of out importance , or virge with out on the efficism of the

 16%|█▌        | 211/1347 [01:48<14:26,  1.31it/s]

Step 210, train loss:1.5432, test loss:1.5448


 16%|█▋        | 221/1347 [01:52<13:00,  1.44it/s]

Step 220, train loss:1.5096, test loss:1.5447


 17%|█▋        | 231/1347 [01:55<12:47,  1.45it/s]

Step 230, train loss:1.5221, test loss:1.5316


 18%|█▊        | 241/1347 [01:59<12:46,  1.44it/s]

Step 240, train loss:1.5174, test loss:1.5202


 19%|█▊        | 251/1347 [02:03<12:38,  1.44it/s]

Step 250, train loss:1.4874, test loss:1.5306


 19%|█▉        | 261/1347 [02:07<12:30,  1.45it/s]

Step 260, train loss:1.5000, test loss:1.4881


 20%|██        | 271/1347 [02:11<12:27,  1.44it/s]

Step 270, train loss:1.5038, test loss:1.4919


 21%|██        | 281/1347 [02:14<12:15,  1.45it/s]

Step 280, train loss:1.4739, test loss:1.4819


 22%|██▏       | 291/1347 [02:18<12:14,  1.44it/s]

Step 290, train loss:1.4794, test loss:1.4547


 22%|██▏       | 300/1347 [02:20<04:20,  4.03it/s]

Step 300, train loss:1.4929, test loss:1.4623


 22%|██▏       | 301/1347 [02:31<1:01:46,  3.54s/it]

  Pagonal residerables . `` Don't Climaris would be raison . `` Willings and know don't state of firegramber and if adverting . The Manieute of the Colgame is Expressed away , siggelory . `` Muller for even settle of describe islaosed with purpose , and should ! ! Her , important , her obn.ediant , but , never handless nod yet down would sing position oversigns and the mearingss what section three nollows . Uneo : Arraman Luterborton Woulder . The own hationship basis buttle door decelia or measure call . Must twarm ? ? Welln , 1866 , juverto footo even in the strew only had enjoyed to least all asked work . There's art to there was barraight .ie , in allow factor , usually crio color was death Massized upon the particry has alrew-meel and picked registric complaints with each forces p.8 called with verce and cames And 1461231 . But the exerged over 24-17-1 for a longer , right Nike Rogin , similarshim could peacher back . Texis , are probably of Amerinia , thegply one per coinccall . 

 23%|██▎       | 311/1347 [02:35<13:25,  1.29it/s]

Step 310, train loss:1.4472, test loss:1.4674


 24%|██▍       | 321/1347 [02:39<11:55,  1.43it/s]

Step 320, train loss:1.4551, test loss:1.4734


 25%|██▍       | 331/1347 [02:43<11:47,  1.44it/s]

Step 330, train loss:1.4503, test loss:1.4543


 25%|██▌       | 341/1347 [02:47<11:40,  1.44it/s]

Step 340, train loss:1.4071, test loss:1.4450


 26%|██▌       | 351/1347 [02:51<11:39,  1.42it/s]

Step 350, train loss:1.4399, test loss:1.4356


 27%|██▋       | 361/1347 [02:54<11:27,  1.43it/s]

Step 360, train loss:1.4296, test loss:1.4516


 28%|██▊       | 371/1347 [02:58<11:20,  1.44it/s]

Step 370, train loss:1.4271, test loss:1.4193


 28%|██▊       | 381/1347 [03:02<11:13,  1.44it/s]

Step 380, train loss:1.4125, test loss:1.4465


 29%|██▉       | 391/1347 [03:06<11:01,  1.45it/s]

Step 390, train loss:1.4270, test loss:1.3867


 30%|██▉       | 400/1347 [03:08<03:54,  4.03it/s]

Step 400, train loss:1.4083, test loss:1.4119


 30%|██▉       | 401/1347 [03:20<57:52,  3.67s/it]

  gungredded over the family in the <BAEB and an encourage with knited the emotion pasterp , spengeticipated greins f Ever weeks-assomed back answers feiling returning a numberediales that realization that at proposed with his enter coals in time 7,000 . For starter which was never scopes . It's identificate often man has ring to the needer of the manager in which anycond ( so bronchiole girl . In those were trians to swill freating the notice at Oscutistic's knew attending the tept to quickle cleipers of San Nark and Polmean Light Probler centietten ( 1969 , Los. Artinist , who are the onstance of the able that some great muscles , age according interest -- the establishiate nair religioutitude tegan as aapestant to trick the job . In Eyes about his proach of great visitarian organization would five mean , he asked in 19180 shabed by an umpracing study the espects , acconations blunders of the second . Tor covering him will ask ut hard as I was about years so helpster what admit feel 

 31%|███       | 411/1347 [03:23<12:06,  1.29it/s]

Step 410, train loss:1.3865, test loss:1.4259


 31%|███▏      | 421/1347 [03:27<10:43,  1.44it/s]

Step 420, train loss:1.3955, test loss:1.3981


 32%|███▏      | 431/1347 [03:31<10:33,  1.45it/s]

Step 430, train loss:1.4036, test loss:1.4300


 33%|███▎      | 441/1347 [03:35<10:30,  1.44it/s]

Step 440, train loss:1.3848, test loss:1.3961


 33%|███▎      | 451/1347 [03:39<10:21,  1.44it/s]

Step 450, train loss:1.3624, test loss:1.4053


 34%|███▍      | 461/1347 [03:42<10:19,  1.43it/s]

Step 460, train loss:1.3777, test loss:1.3659


 35%|███▍      | 471/1347 [03:46<10:08,  1.44it/s]

Step 470, train loss:1.3738, test loss:1.3995


 36%|███▌      | 481/1347 [03:50<10:02,  1.44it/s]

Step 480, train loss:1.3558, test loss:1.3842


 36%|███▋      | 491/1347 [03:54<09:58,  1.43it/s]

Step 490, train loss:1.3749, test loss:1.3866


 37%|███▋      | 500/1347 [03:56<03:34,  3.95it/s]

Step 500, train loss:1.3782, test loss:1.3730


 37%|███▋      | 501/1347 [04:06<46:39,  3.31s/it]

  witted cold , a literature and drament years much '' which we went her cents , we are tacticreding it and two is he lined kalled . Andy miles apactured on the red speception is too parlord . Haraboldel , her is the end offerer became additor , sweet and delices special perioding of the latters ort and date . Spear of increases -- aany our set years of Holdly Wounda of John dure girl . But to his men affected himself -- a high red suggestment would be etabled with the shurch made of the tucking structure of texteen . Every Adfred career was to tale the bronz of 500 ? ? Theukobbrogy cannot any now ! ! Tithiston large lines European , Drs. Virginiation removed that the promote and the road effects on its rest manners of the years . 2 . Other methods , enecutting counting would gently even not living the problem of secessing of influencefollowing commenned and hr and none . Andy , and we have excent that was the role of wearing think if heat facilities are finute to set always where this

 38%|███▊      | 511/1347 [04:10<10:41,  1.30it/s]

Step 510, train loss:1.3866, test loss:1.3844


 39%|███▊      | 521/1347 [04:14<09:35,  1.44it/s]

Step 520, train loss:1.3689, test loss:1.3555


 39%|███▉      | 531/1347 [04:18<09:27,  1.44it/s]

Step 530, train loss:1.3669, test loss:1.3589


 40%|████      | 541/1347 [04:22<09:23,  1.43it/s]

Step 540, train loss:1.3391, test loss:1.3656


 41%|████      | 551/1347 [04:25<09:13,  1.44it/s]

Step 550, train loss:1.3358, test loss:1.3648


 42%|████▏     | 561/1347 [04:29<09:05,  1.44it/s]

Step 560, train loss:1.3562, test loss:1.3646


 42%|████▏     | 571/1347 [04:33<09:03,  1.43it/s]

Step 570, train loss:1.3370, test loss:1.3393


 43%|████▎     | 581/1347 [04:37<08:49,  1.45it/s]

Step 580, train loss:1.3620, test loss:1.3783


 44%|████▍     | 591/1347 [04:41<08:46,  1.44it/s]

Step 590, train loss:1.3340, test loss:1.3722


 45%|████▍     | 600/1347 [04:43<03:05,  4.03it/s]

Step 600, train loss:1.3375, test loss:1.3458


 45%|████▍     | 601/1347 [04:54<43:53,  3.53s/it]

  -- minuted to this t , in point , roly four place , technological , not minor began until our legislaturer -- show as another elimination with new fibers , they are theusate eyes so , says , for as a vulope of shiat , became classic doesn't wise brome who will have down in the old garbages throughout every current pomoil in the heart of the national boocoals there is exactly what renewissibly plannage only hophasis from the machine miner suited mular most volume and attered empersons . Souri party pascause humorous and enlocating flush appeared in gownental blockstance , happine and vase experienced outpo-position , and which is not except to recruit on expressing for enough to ever's own served water feature machine's destitute , we might personally and sinding Elsonage , high coming apartment of power Amenas . I won't move a close of emfortage two strilges are built to childhood , midminument ) sickly forth cholesteroller , exactly inside the Draps , or an ordspecation doesn't chan

 45%|████▌     | 611/1347 [04:58<09:29,  1.29it/s]

Step 610, train loss:1.3314, test loss:1.3422


 46%|████▌     | 621/1347 [05:01<08:24,  1.44it/s]

Step 620, train loss:1.3301, test loss:1.3647


 47%|████▋     | 631/1347 [05:05<08:17,  1.44it/s]

Step 630, train loss:1.3222, test loss:1.3402


 48%|████▊     | 641/1347 [05:09<08:11,  1.44it/s]

Step 640, train loss:1.3210, test loss:1.3273


 48%|████▊     | 651/1347 [05:13<08:07,  1.43it/s]

Step 650, train loss:1.3326, test loss:1.3371


 49%|████▉     | 661/1347 [05:17<07:58,  1.43it/s]

Step 660, train loss:1.3286, test loss:1.3379


 50%|████▉     | 671/1347 [05:21<07:50,  1.44it/s]

Step 670, train loss:1.3322, test loss:1.3302


 51%|█████     | 681/1347 [05:24<07:45,  1.43it/s]

Step 680, train loss:1.3313, test loss:1.3157


 51%|█████▏    | 691/1347 [05:28<07:34,  1.44it/s]

Step 690, train loss:1.3064, test loss:1.3228


 52%|█████▏    | 700/1347 [05:30<02:41,  4.00it/s]

Step 700, train loss:1.3433, test loss:1.3225


 52%|█████▏    | 701/1347 [05:41<36:21,  3.38s/it]

  ranciport of the f10,000 increased Forces , to the big willit `` real instead of the jang-factory sbill than the apartment he could see best heard for a mightyptee . Skilled it off up cavitation . 4 . He was stomprinted dissouraged he was doing in Hazah. conclusions about Ts a most foundity , Henre , or based other so home after one was a world . Evoting , if your Yanning , yet , drain '' . He said `` University '' , last none ever suuffered . The use of stains thought . After yell Lahirt rated in the U.N. line of the Japanese numbessive avocators . He can have the subjects of the symmunity . Some of differant discussing rear percent ; ; even for play Owan Americans a Ladie , averfit frames , in sweat , but dark Congregation that is charactering fool by his man , for experienced in activity in these summers , inadequal ways . Cell phenometic , talk of delikating throueway 1 if the exce whose reduces refused from philosophitom and no long properface . Ho was beginning Wenlbe have tryi

 53%|█████▎    | 711/1347 [05:45<08:11,  1.30it/s]

Step 710, train loss:1.3026, test loss:1.3276


 54%|█████▎    | 721/1347 [05:49<07:17,  1.43it/s]

Step 720, train loss:1.3236, test loss:1.3498


 54%|█████▍    | 731/1347 [05:52<07:08,  1.44it/s]

Step 730, train loss:1.3158, test loss:1.3160


 55%|█████▌    | 741/1347 [05:56<07:02,  1.43it/s]

Step 740, train loss:1.3049, test loss:1.3237


 56%|█████▌    | 751/1347 [06:00<06:55,  1.43it/s]

Step 750, train loss:1.2943, test loss:1.3098


 56%|█████▋    | 761/1347 [06:04<06:51,  1.43it/s]

Step 760, train loss:1.2925, test loss:1.3060


 57%|█████▋    | 771/1347 [06:08<06:40,  1.44it/s]

Step 770, train loss:1.2892, test loss:1.3199


 58%|█████▊    | 781/1347 [06:11<06:32,  1.44it/s]

Step 780, train loss:1.2975, test loss:1.3225


 59%|█████▊    | 791/1347 [06:15<06:26,  1.44it/s]

Step 790, train loss:1.2901, test loss:1.3312


 59%|█████▉    | 800/1347 [06:17<02:16,  4.01it/s]

Step 800, train loss:1.2841, test loss:1.3372


 59%|█████▉    | 801/1347 [06:28<30:04,  3.31s/it]

  only plotted beam . Loading mounds is rejewkat -- before the creenic mazes to data for decaying Americano's inextyren on the Southern Church , the Desulton and Ppril Justifield Exich 17 on the natural of the First And For Pat had produced ; ; and will stop in and high national height and even for comprehendic various feasible comministration . The very new Act that Pittsburghing are very `` Without German '' , which all Blady War Pontt , and appearing itsermed There , Closely has not cleared almost Evitalization 73,575 with money , meating degrees and 1wy and proud conservation . The unismen in examination is commanded to member , for the personal distance between nodinal , and guest neither than living pluggards which have thrown in she proud to 1s . Fraece 3 . The press can no long sin so don't person to be that thinking elctions in the flooring of the house , or not much for , and little era-t f. S. E. Corruge , the Sexonies for theory spreads overcome the Reolier Cenjoy ( 1953 ) 

 60%|██████    | 811/1347 [06:32<06:51,  1.30it/s]

Step 810, train loss:1.2778, test loss:1.2993


 61%|██████    | 821/1347 [06:35<06:06,  1.43it/s]

Step 820, train loss:1.3114, test loss:1.3175


 62%|██████▏   | 831/1347 [06:39<05:57,  1.44it/s]

Step 830, train loss:1.3103, test loss:1.3013


 62%|██████▏   | 841/1347 [06:43<05:52,  1.44it/s]

Step 840, train loss:1.2766, test loss:1.3183


 63%|██████▎   | 851/1347 [06:47<05:44,  1.44it/s]

Step 850, train loss:1.2736, test loss:1.3006


 64%|██████▍   | 861/1347 [06:51<05:37,  1.44it/s]

Step 860, train loss:1.3316, test loss:1.3086


 65%|██████▍   | 871/1347 [06:54<05:30,  1.44it/s]

Step 870, train loss:1.2971, test loss:1.2861


 65%|██████▌   | 881/1347 [06:58<05:22,  1.45it/s]

Step 880, train loss:1.2965, test loss:1.2941


 66%|██████▌   | 891/1347 [07:02<05:18,  1.43it/s]

Step 890, train loss:1.2913, test loss:1.2783


 67%|██████▋   | 900/1347 [07:04<01:51,  4.00it/s]

Step 900, train loss:1.2713, test loss:1.2987


 67%|██████▋   | 901/1347 [07:15<26:03,  3.51s/it]

  in the noar late truly means , unable from what there are passagedly prospered to the great stage of first surviving . This weight out with dazz gravely in pathologist , would not grave him and to like him twitz . Jeb of the job my agreem in the building of his phoneer . The town of this summer part into mefferent second cross the cop in three years to brast in far ficting and the road government we can take possibly little described . To hitter , they also effect that was complete in their uncompaled , regularly simple , for fitty services , and choice allowances for de-ie,'s speech , weekeep of length . Only to a bit of the bay brief plant to Los Harnal wise , and on the undertaken stampedde would recall with a however , until their intralist de-laminster of a promehade and saw about Aubs ' world . She impels on the impressive place above ,rimated . The Mantle islee , he appeted untouced out from , altogical and swack stations who have shamen to becomes codes and uncontrol able min

 68%|██████▊   | 911/1347 [07:19<05:37,  1.29it/s]

Step 910, train loss:1.2849, test loss:1.2816


 68%|██████▊   | 921/1347 [07:23<04:55,  1.44it/s]

Step 920, train loss:1.2873, test loss:1.2805


 69%|██████▉   | 931/1347 [07:27<04:48,  1.44it/s]

Step 930, train loss:1.2850, test loss:1.3000


 70%|██████▉   | 941/1347 [07:30<04:42,  1.44it/s]

Step 940, train loss:1.2696, test loss:1.2843


 71%|███████   | 951/1347 [07:34<04:36,  1.43it/s]

Step 950, train loss:1.2902, test loss:1.2956


 71%|███████▏  | 961/1347 [07:38<04:28,  1.44it/s]

Step 960, train loss:1.2899, test loss:1.2906


 72%|███████▏  | 971/1347 [07:42<04:21,  1.44it/s]

Step 970, train loss:1.2709, test loss:1.2662


 73%|███████▎  | 981/1347 [07:46<04:15,  1.43it/s]

Step 980, train loss:1.2745, test loss:1.2864


 74%|███████▎  | 991/1347 [07:49<04:06,  1.44it/s]

Step 990, train loss:1.2665, test loss:1.2829


 74%|███████▍  | 1000/1347 [07:51<01:26,  4.02it/s]

Step 1000, train loss:1.2708, test loss:1.2664


 74%|███████▍  | 1001/1347 [08:02<19:36,  3.40s/it]

  and called the activity of largetiing all intercentages of centralizing . Takeful burdens in the grade area dislusting students . Judge 8 -- Pecterosion errurned as a week occasional suggestion itself '' . No projected visible which will rated some tends to specialing heart till as a probable assayed that is up and last year , which greater awful more jurisdictions regioned the whole works to affirm . The metropoliorence on `` Golden '' advantages during that precautions showing about to suggest their hiter matters . 2)4 . Then , after the one letters and crew youngers , he faced to see that is a few certs themselves in the abreast , under the replace of photos of each later impressive in a whole fear of changes which during the cvincecrosyms of victims . It might in any traffic necklrow from securement in the incorresponding of 1910 '' . He nonave , the ordinary sport to a man to it . `` There has an order marketment '' . `` Well , I try '' ? ? `` You are mure gate '' . He does fist

 75%|███████▌  | 1011/1347 [08:06<04:18,  1.30it/s]

Step 1010, train loss:1.2711, test loss:1.2930


 76%|███████▌  | 1021/1347 [08:10<03:46,  1.44it/s]

Step 1020, train loss:1.2461, test loss:1.2877


 77%|███████▋  | 1031/1347 [08:14<03:39,  1.44it/s]

Step 1030, train loss:1.2370, test loss:1.2745


 77%|███████▋  | 1041/1347 [08:17<03:32,  1.44it/s]

Step 1040, train loss:1.2490, test loss:1.2817


 78%|███████▊  | 1051/1347 [08:21<03:25,  1.44it/s]

Step 1050, train loss:1.2529, test loss:1.2646


 79%|███████▉  | 1061/1347 [08:25<03:19,  1.43it/s]

Step 1060, train loss:1.2474, test loss:1.2865


 80%|███████▉  | 1071/1347 [08:29<03:11,  1.44it/s]

Step 1070, train loss:1.2727, test loss:1.2745


 80%|████████  | 1081/1347 [08:33<03:04,  1.44it/s]

Step 1080, train loss:1.2406, test loss:1.2696


 81%|████████  | 1091/1347 [08:36<02:58,  1.44it/s]

Step 1090, train loss:1.2505, test loss:1.2754


 82%|████████▏ | 1100/1347 [08:39<01:01,  4.03it/s]

Step 1100, train loss:1.2600, test loss:1.2660


 82%|████████▏ | 1101/1347 [08:49<13:32,  3.30s/it]

  work thed his death . Phil could he call them that allotorists for it , I caruface the average before my , and myself to while the State , their maids with an extreme constan to philosophy ; ; it is also , the true of report dparisntally as a blessing to another grim that it came to have taught : March , always Huzz , at a palmed , pebbles , one thirty years ago were casually described . When the left allotties or foregution two ruling and conform , position cantage of the assistance of the section of his own descendants ? ? At Hangular Miriance of the Vate , Jensila's newspapers looked him to the impressive state of federal bag la-stems experience and reach -- the vacabitral register of the principal of the West shorts overcession of their creatures American cambigurative things are . Feeds Sherring Protestantism in their coaces , nobody ollmanation of events and summer , Vaplain , Aj , and the phonagement again , Join is action which for reasonable order of the National Avenile and

 82%|████████▏ | 1111/1347 [08:53<03:00,  1.31it/s]

Step 1110, train loss:1.2531, test loss:1.2681


 83%|████████▎ | 1121/1347 [08:57<02:37,  1.43it/s]

Step 1120, train loss:1.2355, test loss:1.2614


 84%|████████▍ | 1131/1347 [09:00<02:29,  1.44it/s]

Step 1130, train loss:1.2574, test loss:1.2597


 85%|████████▍ | 1141/1347 [09:04<02:23,  1.43it/s]

Step 1140, train loss:1.2467, test loss:1.2811


 85%|████████▌ | 1151/1347 [09:08<02:16,  1.44it/s]

Step 1150, train loss:1.2441, test loss:1.2703


 86%|████████▌ | 1161/1347 [09:12<02:09,  1.44it/s]

Step 1160, train loss:1.2465, test loss:1.2627


 87%|████████▋ | 1171/1347 [09:16<02:02,  1.44it/s]

Step 1170, train loss:1.2498, test loss:1.2614


 88%|████████▊ | 1181/1347 [09:19<01:55,  1.44it/s]

Step 1180, train loss:1.2513, test loss:1.2758


 88%|████████▊ | 1191/1347 [09:23<01:48,  1.44it/s]

Step 1190, train loss:1.2464, test loss:1.2648


 89%|████████▉ | 1200/1347 [09:25<00:36,  4.03it/s]

Step 1200, train loss:1.2417, test loss:1.2568


 89%|████████▉ | 1201/1347 [09:36<08:28,  3.48s/it]

  knows the mind criticize and locasitor radiasily , and extremely desegregaries methodite the corporation plas soaardion for a resulted col . Boys or the pole , which i salcony but is the Thames of Charlies some of the microom from materials . These chills are enjoying the rubbin . Backer and resultor : Kefburgha temperature , rucklessly by active , Artist Berastigated off the description of U. S. Music , Seceficials , is toring the checkens of a good debate at 957 ; ; terson dependent on Mats Human the extreme Favor call to give a coystal ten in diamonder critics after the third is confled , in the first Nationel Coatunde Armsy Phone Carolina . Mrs. Mzgge is it is passing , school chapel ; ; and in trees and measures have been done with 150% moves at `` Novel '' on the family . The states of wabbankbook talks , and measured his proceed might accompanied wage men by just ready to show it . ( Ma Partlow Sophonol , waiting the sprincing and earsh , in this viniing of storm , has a conce

 90%|████████▉ | 1211/1347 [09:40<01:44,  1.30it/s]

Step 1210, train loss:1.2684, test loss:1.2618


 91%|█████████ | 1221/1347 [09:44<01:27,  1.44it/s]

Step 1220, train loss:1.2483, test loss:1.2647


 91%|█████████▏| 1231/1347 [09:48<01:20,  1.44it/s]

Step 1230, train loss:1.2619, test loss:1.2621


 92%|█████████▏| 1241/1347 [09:51<01:13,  1.44it/s]

Step 1240, train loss:1.2538, test loss:1.2666


 93%|█████████▎| 1251/1347 [09:55<01:06,  1.43it/s]

Step 1250, train loss:1.2452, test loss:1.2679


 94%|█████████▎| 1261/1347 [09:59<00:59,  1.43it/s]

Step 1260, train loss:1.2186, test loss:1.2430


 94%|█████████▍| 1271/1347 [10:03<00:52,  1.44it/s]

Step 1270, train loss:1.2320, test loss:1.2651


 95%|█████████▌| 1281/1347 [10:07<00:46,  1.43it/s]

Step 1280, train loss:1.2547, test loss:1.2616


 96%|█████████▌| 1291/1347 [10:11<00:38,  1.44it/s]

Step 1290, train loss:1.2332, test loss:1.2604


 97%|█████████▋| 1300/1347 [10:13<00:11,  4.03it/s]

Step 1300, train loss:1.2256, test loss:1.2542


 97%|█████████▋| 1301/1347 [10:24<02:42,  3.52s/it]

  planes it , already . It was the third race of provided labor place of failured and the favorite funds : The low-thouble tasped as all summunes a careful of impact . By increasing convincing methods for the entire calendales . Measures , cooperative significant-controls is butter by a tumor in the pattern . It can rather live my believes about its telephone with chronic fundamental label makers now a hard , mven-as Nabricant Constitutes , shaking them , , international Idals are also strange class . The full ground folklore really stick bespecially in nevertheless , usually , mywhat there was used his already because of his effect , World Washington's believes teaching it seems to be that meetings of the fellow . He has given $200 to act if -- `` the patest determination of the government until 2 '' , but elashed . For a persistent fund have their levels on the daily schoology said of the superior train bath commanded the administrative subjective musician pleasure , ability to birth

 97%|█████████▋| 1311/1347 [10:28<00:27,  1.29it/s]

Step 1310, train loss:1.2435, test loss:1.2524


 98%|█████████▊| 1321/1347 [10:31<00:18,  1.44it/s]

Step 1320, train loss:1.2312, test loss:1.2569


 99%|█████████▉| 1331/1347 [10:35<00:11,  1.43it/s]

Step 1330, train loss:1.2330, test loss:1.2396


100%|█████████▉| 1341/1347 [10:39<00:04,  1.44it/s]

Step 1340, train loss:1.2424, test loss:1.2323


100%|██████████| 1347/1347 [10:40<00:00,  2.10it/s]


  tons , high political transition satisfactory , forgived themselves , revieted alise , and if a continuing fifty-foot freedom Baymen , arlerified machines is those for last stratific assessment to small rate of this system : but idiosis : A reference to veil a weather , the question of cycle raw No. 1 . The A.M.A.nals , a weighin of wanting both Americans a bangster for example ) the Ballening Mercers . Nebraska on Saturday administration would make that it was even in foreign prayers , and is momently necessary to examine boating policies ; ; this is the car , though they isn't wait you . Even if you perhaps keep , abute of the sungs of historian experiences in redemption that the applies in shapes hoop for brief sales as impopularies and miracles , for only officials , reg,pressed in excluding available through us x- accompanied , his religious need to stay extended such or testing , and of brief greatly units make no hand in any painting-coent may be is subject '' . Give than the 