In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from datasets import load_dataset
from dataclasses import dataclass
import math
import matplotlib.pyplot as plt
import tiktoken

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [3]:
ds = load_dataset("KingNish/reasoning-base-20k")
ds = ds['train'].train_test_split(test_size=0.1)

README.md: 0.00B [00:00, ?B/s]

combined_reasoning.json:   0%|          | 0.00/307M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19944 [00:00<?, ? examples/s]

In [4]:
ds

DatasetDict({
    train: Dataset({
        features: ['user', 'reasoning', 'assistant', 'template', 'conversations'],
        num_rows: 17949
    })
    test: Dataset({
        features: ['user', 'reasoning', 'assistant', 'template', 'conversations'],
        num_rows: 1995
    })
})

In [5]:
model = tiktoken.encoding_for_model('gpt-2')
special_token_list = ['<|im_start|>', '<|im_end|>', "user", "reasoning", 'assistant', '<|PAD|>']
sp_tokens = {token : model.n_vocab+i for i, token in enumerate(special_token_list)}
sp_tokens.update(model._special_tokens)
model = tiktoken.Encoding(
    name="p50k_with_custom",
    pat_str=model._pat_str,
    mergeable_ranks=model._mergeable_ranks,
    special_tokens=sp_tokens
)
model.n_vocab

50263

In [6]:
model._special_tokens

{'<|im_start|>': 50257,
 '<|im_end|>': 50258,
 'user': 50259,
 'reasoning': 50260,
 'assistant': 50261,
 '<|PAD|>': 50262,
 '<|endoftext|>': 50256}

In [7]:
def encode(text, append_eot = False):
    tokens = model.encode(text, allowed_special = set(model._special_tokens.keys())) # forcefully allowing every special tokens
    if append_eot == True:
        tokens.append(50256)
    return tokens

def decode(tokens : list[int]):
    return model.decode(tokens)

In [8]:
@dataclass
class ModelArgs:
    vocab_size : int = model.n_vocab
    max_seq_len : int = 1280
    model_dim : int = 768
    padding_idx : int = 50262
    num_hidden_layers : int = 6
    intermediate_dim: int = 768
    n_kv_heads: int = 4
    n_head: int = 8
    rms_norm_eps : float = 1e-6
    bias : bool = False
    lr : float = 8e-4

In [9]:
class ReasoningDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __getitem__(self, i):
        return self.data[i]['template']

    def __len__(self):
        return self.data.num_rows

In [10]:
def collate_fn(batch : list[str], max_seq_len : int = 1024, pad_id : int = 50262):
    batch_tokens = []
    for text in batch:
        tokens = encode(text, True)[:max_seq_len+1]
        token_len = len(tokens)
        pad_len = max(0, max_seq_len+1-token_len)
        if pad_len:
            tokens = tokens + [pad_id] * pad_len
        batch_tokens.append(tokens)
    return torch.tensor(batch_tokens, dtype=torch.long).to(device)
        

In [11]:
def calculate_mask(batch_x : torch.Tensor, pad_id : int = 50262):
    B, T = batch_x.shape
    causal_mask = torch.tril(torch.ones(1, T, T)).to(device)
    pad_mask = (batch_x!=pad_id).to(device)
    key_mask = pad_mask[:, None, :] # B, 1, T
    query_mask = pad_mask[:, :, None] # B, T, 1
    final_mask = causal_mask  * key_mask * query_mask
    return final_mask.to(device)

In [12]:
dataset = ReasoningDataset(ds['train'])
dataloader = DataLoader(dataset, batch_size = 8, collate_fn = lambda x: collate_fn(x, max_seq_len = 1280, pad_id = 50262))

In [13]:
val_dataset = ReasoningDataset(ds['test'])
val_dataloader = DataLoader(val_dataset, batch_size = 8, collate_fn = lambda x: collate_fn(x, max_seq_len = 1280, pad_id = 50262))

In [14]:
for batch in dataloader:
    break

In [15]:
# batch[3].tolist()[800:]

In [16]:
# batch.shape

In [17]:
# mask = calculate_mask(batch)

In [18]:
# mask.shape

In [19]:
# plt.imshow(mask[1].detach().cpu().numpy())

In [20]:
class RMSNorm(nn.Module):
    def __init__(self, dim : int, eps : float=1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(data=torch.ones(dim))
    def forward(self, x : torch.Tensor):
        variance = x.pow(2).mean(dim=-1, keepdims=True).type_as(x)
        return x * torch.rsqrt(variance+self.eps) * self.weight # rsqrt is same as 1/sqrt

In [21]:
class RoPE(nn.Module):
    def __init__(self, max_seq_len : int = 1024, d : int = 256, k : float = 10000.0, device : str = 'cpu'):
        super().__init__()
        self.d = d
        self.max_seq_len = max_seq_len
        self.device=device
        freqs, sin, cos = self.precompute_freqs(k=k)
        self.register_buffer('freqs', freqs.to(device))
        self.register_buffer('sin', sin.to(device))
        self.register_buffer('cos', cos.to(device))

    @torch.no_grad()
    def precompute_freqs(self, k : float = 10000.0):
        theta = 1/(k**(torch.arange(0, self.d, 2.0)/self.d))
        pos = torch.arange(self.max_seq_len).unsqueeze(1)
        freqs = pos*theta
        cos = torch.cos(freqs).to(self.device)
        sin = torch.sin(freqs).to(self.device)
        # print(theta.shape, pos.shape, freqs.shape)
        # print(theta, pos, freqs, sin, cos)
        return freqs, sin, cos
    def apply_rope(self, x : torch.Tensor):
        """Assumes x to be B, H, T,D"""
        B, H, T, D = x.shape
        x_reshaped = x.view(*x.shape[:-1], self.d//2, 2)
        x1 = x_reshaped[...,0]
        x2 = x_reshaped[...,1]

        cos = self.cos[:T, ...]
        sin = self.sin[:T, ...]
        stacked = torch.stack([x1 * cos - x2 * sin, 
                              x1 * sin + x2 * cos], dim=-1) # stack on last dimension
        out = stacked.view(x.shape)
        return out
    def forward(self, x : torch.Tensor):
        return self.apply_rope(x)

In [22]:
class LlamaMLP(nn.Module):
    def __init__(self, dim : int = 256, intermediate_dim : int = 256, bias : bool = True):
        super(LlamaMLP, self).__init__()
        self.d = dim
        self.intermediate_dim = intermediate_dim
        self.gate = nn.Linear(dim, intermediate_dim, bias=bias)
        self.up = nn.Linear(dim, intermediate_dim, bias=bias)
        self.down = nn.Linear(intermediate_dim, dim, bias=bias)
        self.activation_fn = F.silu

    def forward(self, x : torch.Tensor):
        # SwigLU(q, b) = SiLU(a) * b
        # final layer is W*(SwiGLU(x)) = W * (SiLU(x) * (W*x))
        return self.down(self.activation_fn(self.gate(x)) * self.up(x))

In [23]:
def repeat_kv(module : nn.Module, x : torch.Tensor, n_reps : int):
    B, H, T, D = x.shape
    if n_reps == 1:
        return x
    else:
        return x[:, :, None, :, :].expand(B, H, n_reps, T, D).reshape(B, H*n_reps, T, D)

In [24]:
class LlamaAttention(nn.Module):
    def __init__(self, dim : int = 256, n_kv_heads : int = 4, n_head : int = 8, max_seq_len : int = 1024):
        super(LlamaAttention, self).__init__()
        self.dim = dim
        self.n_kv_heads = n_kv_heads
        self.n_head = n_head
        self.head_dim = dim // n_head

        self.w_q = nn.Linear(dim, self.head_dim * self.n_head, bias=False)
        self.w_k = nn.Linear(dim, self.head_dim * self.n_kv_heads, bias=False)
        self.w_v = nn.Linear(dim, self.head_dim * self.n_kv_heads, bias=False)
        self.w_o = nn.Linear(self.head_dim * self.n_head, dim, bias=False)
        self.rotary_embedding = RoPE(max_seq_len = max_seq_len, d = self.head_dim)
    
    def forward(self, x : torch.Tensor, mask : torch.Tensor = None):
        """mask is filled with -inf at the position where the attn to be ignored
        x is of shape, B, T, D
        mask is of shape B, T, T"""
        B, T, D = x.shape
        
        # make all as shape B, H, T, head_dim
        Q = self.w_q(x).view(B, T, self.n_head, self.head_dim).transpose(1,2)
        K = self.w_k(x).view(B, T, self.n_kv_heads, self.head_dim).transpose(1,2) # on top of this repeatations are needed
        V = self.w_v(x).view(B, T, self.n_kv_heads, self.head_dim).transpose(1,2) # on top of this repeatations are needed

        Q = self.rotary_embedding(Q)
        K = self.rotary_embedding(K)
        
        n_reps = self.n_head // self.n_kv_heads
        K = repeat_kv(self, K, n_reps)
        V = repeat_kv(self, V, n_reps) 

        attn_scores = torch.matmul(Q, K.transpose(2,3)) / math.sqrt(self.head_dim) # B, H, T, D * B, H, D, T => B, H, T, T
        if mask is not None:
            mask = mask[:, :T, :T] # forcefully making of time steps equal to x
            attn_scores += attn_scores.masked_fill(mask.unsqueeze(1)==0, -1e9)
        
        attn_scores = F.softmax(attn_scores.float(), dim=-1)
        attn_output = torch.matmul(attn_scores, V) # B, H, T, T * B, H, T, D => B, H, T, D
        
        attn_output = attn_output.transpose(1,2)
        attn_output = attn_output.reshape(B, T, -1).contiguous() # reshape back to B, T, D from B, H, T, D
        
        attn_output = self.w_o(attn_output)
        return attn_output, attn_scores

In [25]:
# attn = LlamaAttention()

In [26]:
def count_params(module):
    total = 0
    for p in module.parameters():
        v = 1
        for d in p.shape:
            v *= d
        total += v
    return total

In [27]:
# x = torch.rand(32, 1024, 256)
# y, att_s = attn(x)

In [28]:
# y.shape, att_s.shape

In [29]:
class LlamaDecoder(nn.Module):
    def __init__(self, hidden_dim : int = 256, intermediate_dim : int = 256, n_kv_heads : int = 4, n_head : int = 8, max_seq_len : int = 1024):
        super(LlamaDecoder, self).__init__()
        self.rms_norm = RMSNorm(dim = hidden_dim) # eps needed
        self.self_attn = LlamaAttention(dim = hidden_dim, n_kv_heads = n_kv_heads, n_head = n_head, max_seq_len = max_seq_len)
        self.mlp = LlamaMLP(dim = hidden_dim, intermediate_dim = intermediate_dim) # bias needed
        self.hidden_dim = hidden_dim
        self.n_kv_heads = n_kv_heads
        self.n_head = n_head

    def forward(self, hidden_states : torch.Tensor, mask : torch.Tensor = None):
        """hidden_steps of shape B, T, D"""
        # print(torch.isnan(hidden_states).any())
        state = self.rms_norm(hidden_states)
        # print(torch.isnan(state).any())
        attn_output, attn_scores = self.self_attn(state, mask)
        # print(torch.isnan(attn_output).any())

        hidden_states = hidden_states + attn_output
        # print(torch.isnan(hidden_states).any())
        

        state = self.rms_norm(hidden_states)
        # print(torch.isnan(state).any())
        
        state = self.mlp(state)
        # print(torch.isnan(state).any())

        hidden_states = hidden_states + state
        # print(torch.isnan(hidden_states).any())
        # print('========================================')
        return hidden_states

In [30]:
# decoder = LlamaDecoder()
# count_params(decoder)

In [31]:
# y = decoder(x)
# y.shape

In [32]:
class Llama(nn.Module):
    def __init__(self, config : ModelArgs):
        super(Llama, self).__init__()
        self.config = config
        self.embedding_table = nn.Embedding(num_embeddings = config.vocab_size, embedding_dim = config.model_dim, padding_idx = config.padding_idx)
        self.decoder_layers = nn.ModuleList([
                                                LlamaDecoder(hidden_dim = config.model_dim, intermediate_dim = config.intermediate_dim,
                                                          n_kv_heads = config.n_kv_heads, n_head = config.n_head, max_seq_len = config.max_seq_len)
                                                for _ in range(config.num_hidden_layers)
                                            ])
        self.rms_norm = RMSNorm(dim = config.model_dim, eps = config.rms_norm_eps)
        self.mlp = LlamaMLP(dim = config.model_dim, intermediate_dim = config.intermediate_dim, bias = config.bias)
        self.proj_head = nn.Linear(config.model_dim, config.vocab_size, bias=False)

    def forward(self, x : torch.Tensor, mask : torch.Tensor = None):
        """X is in shape B, T
        mask is in shape B, T, T"""
        state = self.embedding_table(x)
        for dec_layer in self.decoder_layers:
            state = dec_layer(state, mask)
        
        state = self.rms_norm(state)
        state = self.mlp(state)
        state = self.proj_head(state)
        return state
    def generate(self, x : torch.Tensor, max_token : int = 1024):
        """x is of shape B, T"""
        for _ in range(max_token):
            x = x[:, -self.config.max_seq_len:]
            mask = calculate_mask(x, pad_id = 50262)
            state = self.embedding_table(x)
            for dec_layer in self.decoder_layers:
                state = dec_layer(state, mask)
            
            state = self.rms_norm(state)
            
            last_step_pred = state[:, -1, :] # B, 1, Vocab_size
            last_step_pred = self.mlp(last_step_pred)
            last_step_pred = self.proj_head(last_step_pred)
            probs = F.softmax(last_step_pred, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((x, idx_next), dim=1) # (B, T+1)
            yield idx_next

In [33]:
llama_model = Llama(config = ModelArgs).to(device)
print(f"Model size: {count_params(llama_model)/10**6}M parameters")

Model size: 100.226304M parameters


In [34]:
optimizer = torch.optim.AdamW(llama_model.parameters(), lr = ModelArgs.lr, betas = (0.9, 0.95), weight_decay = 0.1)

In [35]:
for idx in llama_model.generate(torch.tensor([[50257]], dtype=torch.long, device=device), max_token=50):
    print(decode(idx[0].tolist()), end='|')

 commissioner|ür|?????-?????-|414| pillars|pard| plausible| optimized| hypotheses|Understanding| erupt|":| infantry|Cond| Bear|colonial|sav| releases| yards| NOTICE|ン| 214|

aid| Serial|ago| neighboring|girlfriend| kettle| Rockefeller| VII|nant| shooter| english| Sus|ageddon| cooler| intric|inse|specified| colourful| bonuses| sixteen| �| spa| //|

 bolst|rich| editorial| version| preceded|

In [36]:
def answer(question, max_token=1280, end = '|'):
    f_text = f"<|im_start|>user\n{question}<|im_end|>\n"
    tokens = encode(f_text)
    print(f_text)
    llama_model.eval()
    for idx in llama_model.generate(torch.tensor([tokens], dtype=torch.long, device=device), max_token=1280):
        print(decode(idx[0].tolist()), end=end)

In [37]:
# answer('Prove that the difference between two consecutive cubes cannot be divisible by 5, using the fact that the only possible remainders when a cube is divided by 5 are 0, 1, and -1.')

In [38]:
# y = llama_model(batch[:, :-1], mask[:, :-1, :-1])

In [39]:
# y.shape

In [40]:
grad_accumulation_step = 16
for b in range(40):
    print(f"EPOCH: {b+1} / 40")
    llama_model.train()
    for i, batch in enumerate(dataloader):
        x = batch[:, :-1]
        B, T = x.shape
        y = batch[:, 1:]
        mask = calculate_mask(x, ModelArgs.padding_idx)
        y_pred = llama_model(x, mask).view(B*T, ModelArgs.vocab_size)
        y = y.reshape(B*T,)
        loss = F.cross_entropy(y_pred, y)
        loss = loss / grad_accumulation_step
        loss.backward()
        if i%grad_accumulation_step==0:
            # print('Applying step')
            optimizer.step()
            optimizer.zero_grad()
        if i%48==0:
            print(f"TRAIN Batch {i} => Loss: {round(loss.item()*grad_accumulation_step, 5)}")
    
    llama_model.eval()
    with torch.no_grad():
        for i, batch in enumerate(val_dataloader):
            x = batch[:, :-1]
            B, T = x.shape
            y = batch[:, 1:]
            mask = calculate_mask(x, ModelArgs.padding_idx)
            y_pred = llama_model(x, mask).view(B*T, ModelArgs.vocab_size)
            y = y.reshape(B*T,)
            loss = F.cross_entropy(y_pred, y)
            if i%50==0:
                print(f"EVAL Batch {i} => Loss: {round(loss.item(), 5)}")
    if b%2==0:
        # sample text generation test of reasoning
        answer('Prove that the difference between two consecutive cubes cannot be divisible by 5, using the fact that the only possible remainders when a cube is divided by 5 are 0, 1, and -1.', end='')
        torch.save(llama_model.state_dict(), f'model_{b}.pth')

EPOCH: 1 / 40


TRAIN Batch 0 => Loss: 10.82915


TRAIN Batch 48 => Loss: 9.71197


TRAIN Batch 96 => Loss: 9.30165


TRAIN Batch 144 => Loss: 6.77288


TRAIN Batch 192 => Loss: 5.72704


TRAIN Batch 240 => Loss: 5.98788


TRAIN Batch 288 => Loss: 5.79452


TRAIN Batch 336 => Loss: 5.60651


TRAIN Batch 384 => Loss: 5.19909


TRAIN Batch 432 => Loss: 5.22859


TRAIN Batch 480 => Loss: 4.65448


TRAIN Batch 528 => Loss: 4.74846


TRAIN Batch 576 => Loss: 4.96113


TRAIN Batch 624 => Loss: 4.38597


TRAIN Batch 672 => Loss: 4.11039


TRAIN Batch 720 => Loss: 4.22891


TRAIN Batch 768 => Loss: 3.5467


TRAIN Batch 816 => Loss: 4.33244


TRAIN Batch 864 => Loss: 3.94068


TRAIN Batch 912 => Loss: 3.97707


TRAIN Batch 960 => Loss: 3.50896


TRAIN Batch 1008 => Loss: 3.95027


TRAIN Batch 1056 => Loss: 3.72342


TRAIN Batch 1104 => Loss: 3.89214


TRAIN Batch 1152 => Loss: 3.68965


TRAIN Batch 1200 => Loss: 4.40383


TRAIN Batch 1248 => Loss: 4.17416


TRAIN Batch 1296 => Loss: 3.72129


TRAIN Batch 1344 => Loss: 3.39761


TRAIN Batch 1392 => Loss: 3.60297


TRAIN Batch 1440 => Loss: 3.03961


TRAIN Batch 1488 => Loss: 3.38772


TRAIN Batch 1536 => Loss: 3.20665


TRAIN Batch 1584 => Loss: 3.10185


TRAIN Batch 1632 => Loss: 3.2806


TRAIN Batch 1680 => Loss: 3.41178


TRAIN Batch 1728 => Loss: 3.625


TRAIN Batch 1776 => Loss: 2.92218


TRAIN Batch 1824 => Loss: 3.43169


TRAIN Batch 1872 => Loss: 2.699


TRAIN Batch 1920 => Loss: 3.23783


TRAIN Batch 1968 => Loss: 3.50123


TRAIN Batch 2016 => Loss: 2.82278


TRAIN Batch 2064 => Loss: 3.01285


TRAIN Batch 2112 => Loss: 3.41223


TRAIN Batch 2160 => Loss: 2.91306


TRAIN Batch 2208 => Loss: 3.0773


EVAL Batch 0 => Loss: 3.13201


EVAL Batch 50 => Loss: 3.05011


EVAL Batch 100 => Loss: 3.50849


EVAL Batch 150 => Loss: 3.83317


EVAL Batch 200 => Loss: 3.08181


<|im_start|>user
Prove that the difference between two consecutive cubes cannot be divisible by 5, using the fact that the only possible remainders when a cube is divided by 5 are 0, 1, and -1.<|im_end|>





2






*42

21
*


	*



2


The

2



4

*










=

4

2*22






2*



3*

log


**
323


*







2
13

3
2
2


*	

2

3














2*
2



*


**



#2

-




*




25
2


**11
*2











1
*

*



2
*

*








*3
M





 
-

2



-

$**25


*4*



2




since
*5*












**32



-


2

4
*

2cos22
m2



-

*2

3
5

*








3


*
2






|


42-
22
2
Where*
*



2
-2
*
-
*234
2



2


2
*






5


**
5

2




4
3



1
**2
3




3





5

2

22
2

2

2
*
3



*31*2633


2
3

*

5


*

**2
5

2*

3
2

*


*3Given``*


2* 



25
``
*


2




 



32*-

|5
*


2*
*
2
*	P

44``
*
**
*







2




*
2
*3*










3

*


 



#

3




*
This
3
*






*



3



2-
3

2



42


 





422




*4
32=
2	*3*



	*


*3



g
4


2
*3




33*
3
 (
-
``


2*






-

56


2

2




*







	


 *



*
* 2*32

*62	*2
-

*--



*
71


2
4-

2

2
3
*

`






Case3







*2

2




-4


``

*

5*

*32
*

2



|
*43

*

3
*
3






23


4

*12

*2

4












2
*12












2



*Per-2*2*
*









*
*
*

5
*






**




-



2*
#


















*

2*

2


*




4


2



-*

3


2*

3
22

2
3*
4



x
3322 
*

*

{
**A3
*

 

 (4

2
*
3


2

2

2So




 
2
2

3

32
vec4

0
i4

*-	2*







14

The1*
*

3

42*4



2
2*



*3





34












#






2








*






2






*2####	*





1

5To
2	2



2






4C
332
2
2



*2
*




	



4 **





22
Pro


2


5











c


3

*


*2
 *
 2
*


3

EPOCH: 2 / 40


TRAIN Batch 0 => Loss: 2.99026


TRAIN Batch 48 => Loss: 3.07471


TRAIN Batch 96 => Loss: 2.70727


TRAIN Batch 144 => Loss: 2.71618


TRAIN Batch 192 => Loss: 2.38543


TRAIN Batch 240 => Loss: 2.96009


TRAIN Batch 288 => Loss: 3.25857


TRAIN Batch 336 => Loss: 2.77684


TRAIN Batch 384 => Loss: 2.72823


TRAIN Batch 432 => Loss: 2.73439


TRAIN Batch 480 => Loss: 2.7481


TRAIN Batch 528 => Loss: 2.85647


TRAIN Batch 576 => Loss: 3.08519


TRAIN Batch 624 => Loss: 2.60798


TRAIN Batch 672 => Loss: 2.32258


TRAIN Batch 720 => Loss: 2.59093


TRAIN Batch 768 => Loss: 2.03297


TRAIN Batch 816 => Loss: 2.89716


TRAIN Batch 864 => Loss: 2.58736


TRAIN Batch 912 => Loss: 2.57862


TRAIN Batch 960 => Loss: 2.25876


TRAIN Batch 1008 => Loss: 2.70441


TRAIN Batch 1056 => Loss: 2.36567


TRAIN Batch 1104 => Loss: 2.4956


TRAIN Batch 1152 => Loss: 2.43088


TRAIN Batch 1200 => Loss: 3.1985


TRAIN Batch 1248 => Loss: 2.97078


TRAIN Batch 1296 => Loss: 2.5799


TRAIN Batch 1344 => Loss: 2.45121


TRAIN Batch 1392 => Loss: 2.47592


TRAIN Batch 1440 => Loss: 2.11847


TRAIN Batch 1488 => Loss: 2.38576


TRAIN Batch 1536 => Loss: 2.24957


TRAIN Batch 1584 => Loss: 2.03724


TRAIN Batch 1632 => Loss: 2.31409


TRAIN Batch 1680 => Loss: 2.52312


TRAIN Batch 1728 => Loss: 2.7336


TRAIN Batch 1776 => Loss: 2.01689


TRAIN Batch 1824 => Loss: 2.56015


TRAIN Batch 1872 => Loss: 1.89438


TRAIN Batch 1920 => Loss: 2.415


TRAIN Batch 1968 => Loss: 2.68535


TRAIN Batch 2016 => Loss: 2.06037


TRAIN Batch 2064 => Loss: 2.33637


TRAIN Batch 2112 => Loss: 2.59954


TRAIN Batch 2160 => Loss: 2.2343


TRAIN Batch 2208 => Loss: 2.35567


EVAL Batch 0 => Loss: 2.44176


EVAL Batch 50 => Loss: 2.44477


EVAL Batch 100 => Loss: 2.91552


EVAL Batch 150 => Loss: 3.24187


EVAL Batch 200 => Loss: 2.43047


EPOCH: 3 / 40


TRAIN Batch 0 => Loss: 2.26205


TRAIN Batch 48 => Loss: 2.42852


TRAIN Batch 96 => Loss: 1.96988


TRAIN Batch 144 => Loss: 2.08076


TRAIN Batch 192 => Loss: 1.76968


TRAIN Batch 240 => Loss: 2.32302


TRAIN Batch 288 => Loss: 2.65484


TRAIN Batch 336 => Loss: 2.10872


TRAIN Batch 384 => Loss: 2.14


TRAIN Batch 432 => Loss: 2.0449


TRAIN Batch 480 => Loss: 2.163


TRAIN Batch 528 => Loss: 2.2617


TRAIN Batch 576 => Loss: 2.39426


TRAIN Batch 624 => Loss: 2.00908


TRAIN Batch 672 => Loss: 1.78215


TRAIN Batch 720 => Loss: 2.01484


TRAIN Batch 768 => Loss: 1.5595


TRAIN Batch 816 => Loss: 2.35073


TRAIN Batch 864 => Loss: 2.00534


TRAIN Batch 912 => Loss: 2.0205


TRAIN Batch 960 => Loss: 1.7769


TRAIN Batch 1008 => Loss: 2.21122


TRAIN Batch 1056 => Loss: 1.84621


TRAIN Batch 1104 => Loss: 1.99736


TRAIN Batch 1152 => Loss: 1.89073


TRAIN Batch 1200 => Loss: 2.62307


TRAIN Batch 1248 => Loss: 2.42781


TRAIN Batch 1296 => Loss: 2.09971


TRAIN Batch 1344 => Loss: 1.99057


TRAIN Batch 1392 => Loss: 1.91587


TRAIN Batch 1440 => Loss: 1.66949


TRAIN Batch 1488 => Loss: 1.89432


TRAIN Batch 1536 => Loss: 1.7951


TRAIN Batch 1584 => Loss: 1.6405


TRAIN Batch 1632 => Loss: 1.88558


TRAIN Batch 1680 => Loss: 2.08035


TRAIN Batch 1728 => Loss: 2.28726


TRAIN Batch 1776 => Loss: 1.61338


TRAIN Batch 1824 => Loss: 2.06588


TRAIN Batch 1872 => Loss: 1.47523


TRAIN Batch 1920 => Loss: 1.95717


TRAIN Batch 1968 => Loss: 2.22413


TRAIN Batch 2016 => Loss: 1.67331


TRAIN Batch 2064 => Loss: 1.94205


TRAIN Batch 2112 => Loss: 2.16261


TRAIN Batch 2160 => Loss: 1.84033


TRAIN Batch 2208 => Loss: 1.93295


EVAL Batch 0 => Loss: 2.02012


EVAL Batch 50 => Loss: 2.06694


EVAL Batch 100 => Loss: 2.50412


EVAL Batch 150 => Loss: 2.86529


EVAL Batch 200 => Loss: 2.0714


<|im_start|>user
Prove that the difference between two consecutive cubes cannot be divisible by 5, using the fact that the only possible remainders when a cube is divided by 5 are 0, 1, and -1.<|im_end|>

($$How

d
If




un



d

Part





isNow






ProGiven\


I




If
VFunction






3I
ModUsing

N[
atern
Lower















n
I





Given











Cond
Avoid<|im_end|>
I






$$

(



(
τ




















IfI

$$

In


ConsiderH
HTherefore
$$ MonAnother



(\



or









Original





In
For





Let
$



2







$$



Cond





(
At

For



if


b
bas








Pro
H








int
Is
P







There



One









n
(





In


Pro1



The

There


region1


udedDef
and

K




The



(For

FindD















The1
Sur
Pro
E




(TheTherefore

We




IfAfter

�AdE


J


P






{
H
Itd

(
Consider
\

When L






Ass









em







b














TheInd=
[







Pro




($


W
















ProIFor
There









B
N







(
Sur
I




to

Is



For
####p





ILet



It
Ib
ForIn










Ana










Pro




Pro






For









AssPro
IPercent

NoInd




<|im_end|>
ProL
Since












Ex
Additionally



Consider








Additionally


I
P


Cal




1






P$$



However$$
Then

*
















TheisIf


1

This5

K
Cond3




####

Pro






Now



(







y



If1For


Since












We






Car	





If
For1LMy



3-
where
j

\






####


D














Find




ets



The
$

InIf
For









gre


$$
nTo((








If




S


Pro




ForImage






H




 government


[


####


B$$



Given (!








 trip


H


Iffor

Eventfor
x














Is









Percentfunction1
All
LastlyI


To[







The
Pro






Thank

In
D






as




(





I


\






Therefore2
e




ator
$

Ass

ProIf

 tricks



Lower$$


$



\













1
$$






P


Base2







ContGiven
0
I




Moreover


$$







$$I





For
Let
2





GG





(
If


ToGivenIs


1





21

 cl
I




1



To




mark











1
1

x













Total
$$

$












P


1




[

d

(







We

\
p



Initially






Me


...
\






Given


 guess













Pro


EPOCH: 4 / 40


TRAIN Batch 0 => Loss: 1.80758


TRAIN Batch 48 => Loss: 2.00102


TRAIN Batch 96 => Loss: 1.54403


TRAIN Batch 144 => Loss: 1.67056


TRAIN Batch 192 => Loss: 1.39228


TRAIN Batch 240 => Loss: 1.9222


TRAIN Batch 288 => Loss: 2.23527


TRAIN Batch 336 => Loss: 1.71674


TRAIN Batch 384 => Loss: 1.74781


TRAIN Batch 432 => Loss: 1.61194


TRAIN Batch 480 => Loss: 1.79154


TRAIN Batch 528 => Loss: 1.84121


TRAIN Batch 576 => Loss: 2.02214


TRAIN Batch 624 => Loss: 1.64719


TRAIN Batch 672 => Loss: 1.4594


TRAIN Batch 720 => Loss: 1.64051


TRAIN Batch 768 => Loss: 1.26979


TRAIN Batch 816 => Loss: 1.96983


TRAIN Batch 864 => Loss: 1.59246


TRAIN Batch 912 => Loss: 1.59095


TRAIN Batch 960 => Loss: 1.41232


TRAIN Batch 1008 => Loss: 1.77846


TRAIN Batch 1056 => Loss: 1.53635


TRAIN Batch 1104 => Loss: 1.64125


TRAIN Batch 1152 => Loss: 1.4655


TRAIN Batch 1200 => Loss: 2.13779


TRAIN Batch 1248 => Loss: 1.91915


TRAIN Batch 1296 => Loss: 1.65654


TRAIN Batch 1344 => Loss: 1.58371


TRAIN Batch 1392 => Loss: 1.46362


TRAIN Batch 1440 => Loss: 1.33491


TRAIN Batch 1488 => Loss: 1.46593


TRAIN Batch 1536 => Loss: 1.41219


TRAIN Batch 1584 => Loss: 1.31925


TRAIN Batch 1632 => Loss: 1.55365


TRAIN Batch 1680 => Loss: 1.69067


TRAIN Batch 1728 => Loss: 1.82605


TRAIN Batch 1776 => Loss: 1.28801


TRAIN Batch 1824 => Loss: 1.60641


TRAIN Batch 1872 => Loss: 1.07902


TRAIN Batch 1920 => Loss: 1.50298


TRAIN Batch 1968 => Loss: 1.70576


TRAIN Batch 2016 => Loss: 1.25949


TRAIN Batch 2064 => Loss: 1.50941


TRAIN Batch 2112 => Loss: 1.74525


TRAIN Batch 2160 => Loss: 1.42617


TRAIN Batch 2208 => Loss: 1.46171


EVAL Batch 0 => Loss: 1.63238


EVAL Batch 50 => Loss: 1.68391


EVAL Batch 100 => Loss: 2.08796


EVAL Batch 150 => Loss: 2.33144


EVAL Batch 200 => Loss: 1.66897


EPOCH: 5 / 40


TRAIN Batch 0 => Loss: 1.3862


TRAIN Batch 48 => Loss: 1.58854


TRAIN Batch 96 => Loss: 1.28316


TRAIN Batch 144 => Loss: 1.3436


TRAIN Batch 192 => Loss: 1.12461


TRAIN Batch 240 => Loss: 1.48229


TRAIN Batch 288 => Loss: 1.77069


TRAIN Batch 336 => Loss: 1.29062


TRAIN Batch 384 => Loss: 1.33523


TRAIN Batch 432 => Loss: 1.25471


TRAIN Batch 480 => Loss: 1.41384


TRAIN Batch 528 => Loss: 1.38181


TRAIN Batch 576 => Loss: 1.60805


TRAIN Batch 624 => Loss: 1.23649


TRAIN Batch 672 => Loss: 1.18952


TRAIN Batch 720 => Loss: 1.30628


TRAIN Batch 768 => Loss: 1.02616


TRAIN Batch 816 => Loss: 1.6176


TRAIN Batch 864 => Loss: 1.22843


TRAIN Batch 912 => Loss: 1.29841


TRAIN Batch 960 => Loss: 1.15927


TRAIN Batch 1008 => Loss: 1.36658


TRAIN Batch 1056 => Loss: 1.26481


TRAIN Batch 1104 => Loss: 1.36777


TRAIN Batch 1152 => Loss: 1.19685


TRAIN Batch 1200 => Loss: 1.72116


TRAIN Batch 1248 => Loss: 1.53832


TRAIN Batch 1296 => Loss: 1.30089


TRAIN Batch 1344 => Loss: 1.27829


TRAIN Batch 1392 => Loss: 1.18735


TRAIN Batch 1440 => Loss: 1.13361


TRAIN Batch 1488 => Loss: 1.22979


TRAIN Batch 1536 => Loss: 1.14915


TRAIN Batch 1584 => Loss: 1.14959


TRAIN Batch 1632 => Loss: 1.33907


TRAIN Batch 1680 => Loss: 1.41166


TRAIN Batch 1728 => Loss: 1.56676


TRAIN Batch 1776 => Loss: 1.09925


TRAIN Batch 1824 => Loss: 1.3523


TRAIN Batch 1872 => Loss: 0.93245


TRAIN Batch 1920 => Loss: 1.28806


TRAIN Batch 1968 => Loss: 1.43883


TRAIN Batch 2016 => Loss: 1.04241


TRAIN Batch 2064 => Loss: 1.27876


TRAIN Batch 2112 => Loss: 1.52326


TRAIN Batch 2160 => Loss: 1.25136


TRAIN Batch 2208 => Loss: 1.23303


EVAL Batch 0 => Loss: 1.50052


EVAL Batch 50 => Loss: 1.5254


EVAL Batch 100 => Loss: 1.92322


EVAL Batch 150 => Loss: 2.12204


EVAL Batch 200 => Loss: 1.50343


<|im_start|>user
Prove that the difference between two consecutive cubes cannot be divisible by 5, using the fact that the only possible remainders when a cube is divided by 5 are 0, 1, and -1.<|im_end|>






















 Res














and






















aut























Pro

























Definition



$$






Pro











































If

































































A






















[






P





$$









 R






















































5































Let





Pro









$$










####




























E




S













































.














































































Pro


$$













Additionally























































































Pro









 equal




















$$





































































Show































Additionally

























































To








 













T





















e


Using





The





















































Pro
























































If









































Pro
























































For












































(

In





























\
$$


D







When





























IThis





















































$




And





























































EPOCH: 6 / 40


TRAIN Batch 0 => Loss: 1.19818


TRAIN Batch 48 => Loss: 1.39325


TRAIN Batch 96 => Loss: 1.12189


TRAIN Batch 144 => Loss: 1.17847


TRAIN Batch 192 => Loss: 0.98188


TRAIN Batch 240 => Loss: 1.29506


TRAIN Batch 288 => Loss: 1.55435


TRAIN Batch 336 => Loss: 1.13448


TRAIN Batch 384 => Loss: 1.13889


TRAIN Batch 432 => Loss: 1.09187


TRAIN Batch 480 => Loss: 1.24088


TRAIN Batch 528 => Loss: 1.2151


TRAIN Batch 576 => Loss: 1.4267


TRAIN Batch 624 => Loss: 1.07678


TRAIN Batch 672 => Loss: 1.07724


TRAIN Batch 720 => Loss: 1.14135


TRAIN Batch 768 => Loss: 0.91188


TRAIN Batch 816 => Loss: 1.416


TRAIN Batch 864 => Loss: 1.08879


TRAIN Batch 912 => Loss: 1.14122


TRAIN Batch 960 => Loss: 1.03938


TRAIN Batch 1008 => Loss: 1.19914


TRAIN Batch 1056 => Loss: 1.13683


TRAIN Batch 1104 => Loss: 1.24673


TRAIN Batch 1152 => Loss: 1.08155


TRAIN Batch 1200 => Loss: 1.49646


TRAIN Batch 1248 => Loss: 1.38131


TRAIN Batch 1296 => Loss: 1.16194


TRAIN Batch 1344 => Loss: 1.13851


TRAIN Batch 1392 => Loss: 1.08222


TRAIN Batch 1440 => Loss: 1.04309


TRAIN Batch 1488 => Loss: 1.10134


TRAIN Batch 1536 => Loss: 1.0297


TRAIN Batch 1584 => Loss: 1.05095


TRAIN Batch 1632 => Loss: 1.23492


TRAIN Batch 1680 => Loss: 1.27069


TRAIN Batch 1728 => Loss: 1.43239


TRAIN Batch 1776 => Loss: 1.00307


TRAIN Batch 1824 => Loss: 1.21628


TRAIN Batch 1872 => Loss: 0.84738


TRAIN Batch 1920 => Loss: 1.18678


TRAIN Batch 1968 => Loss: 1.33163


TRAIN Batch 2016 => Loss: 0.96278


TRAIN Batch 2064 => Loss: 1.16964


TRAIN Batch 2112 => Loss: 1.39614


TRAIN Batch 2160 => Loss: 1.15606


TRAIN Batch 2208 => Loss: 1.12501


EVAL Batch 0 => Loss: 1.46141


EVAL Batch 50 => Loss: 1.48583


EVAL Batch 100 => Loss: 1.89419


EVAL Batch 150 => Loss: 2.0897


EVAL Batch 200 => Loss: 1.47452


EPOCH: 7 / 40


TRAIN Batch 0 => Loss: 1.09172


TRAIN Batch 48 => Loss: 1.27571


TRAIN Batch 96 => Loss: 1.04497


TRAIN Batch 144 => Loss: 1.09023


TRAIN Batch 192 => Loss: 0.90455


TRAIN Batch 240 => Loss: 1.18807


TRAIN Batch 288 => Loss: 1.41026


TRAIN Batch 336 => Loss: 1.0358


TRAIN Batch 384 => Loss: 1.02296


TRAIN Batch 432 => Loss: 0.98383


TRAIN Batch 480 => Loss: 1.13497


TRAIN Batch 528 => Loss: 1.11137


TRAIN Batch 576 => Loss: 1.30208


TRAIN Batch 624 => Loss: 0.98328


TRAIN Batch 672 => Loss: 0.99704


TRAIN Batch 720 => Loss: 1.05543


TRAIN Batch 768 => Loss: 0.84857


TRAIN Batch 816 => Loss: 1.27775


TRAIN Batch 864 => Loss: 1.01343


TRAIN Batch 912 => Loss: 1.05574


TRAIN Batch 960 => Loss: 0.96255


TRAIN Batch 1008 => Loss: 1.10916


TRAIN Batch 1056 => Loss: 1.05765


TRAIN Batch 1104 => Loss: 1.14071


TRAIN Batch 1152 => Loss: 1.00984


TRAIN Batch 1200 => Loss: 1.3731


TRAIN Batch 1248 => Loss: 1.26875


TRAIN Batch 1296 => Loss: 1.0764


TRAIN Batch 1344 => Loss: 1.0505


TRAIN Batch 1392 => Loss: 1.01387


TRAIN Batch 1440 => Loss: 0.97228


TRAIN Batch 1488 => Loss: 1.01213


TRAIN Batch 1536 => Loss: 0.95814


TRAIN Batch 1584 => Loss: 0.98347


TRAIN Batch 1632 => Loss: 1.17732


TRAIN Batch 1680 => Loss: 1.18766


TRAIN Batch 1728 => Loss: 1.35147


TRAIN Batch 1776 => Loss: 0.92725


TRAIN Batch 1824 => Loss: 1.12652


TRAIN Batch 1872 => Loss: 0.79054


TRAIN Batch 1920 => Loss: 1.09997


TRAIN Batch 1968 => Loss: 1.2401


TRAIN Batch 2016 => Loss: 0.89184


TRAIN Batch 2064 => Loss: 1.09766


TRAIN Batch 2112 => Loss: 1.31051


TRAIN Batch 2160 => Loss: 1.08187


TRAIN Batch 2208 => Loss: 1.02924


EVAL Batch 0 => Loss: 1.43764


EVAL Batch 50 => Loss: 1.46763


EVAL Batch 100 => Loss: 1.88055


EVAL Batch 150 => Loss: 2.08863


EVAL Batch 200 => Loss: 1.45985


<|im_start|>user
Prove that the difference between two consecutive cubes cannot be divisible by 5, using the fact that the only possible remainders when a cube is divided by 5 are 0, 1, and -1.<|im_end|>







$$





























































(






















































































































Is




























































































































































































































































































































































































































































































































































































































































<|im_end|>Given






































































Definition
























Given









































































Ass
















































































































































































































A

































































































































































<|im_end|>
















AND




EPOCH: 8 / 40


TRAIN Batch 0 => Loss: 1.02242


TRAIN Batch 48 => Loss: 1.18448


TRAIN Batch 96 => Loss: 0.97665


TRAIN Batch 144 => Loss: 1.0255


TRAIN Batch 192 => Loss: 0.83896


TRAIN Batch 240 => Loss: 1.11344


TRAIN Batch 288 => Loss: 1.29634


TRAIN Batch 336 => Loss: 0.97305


TRAIN Batch 384 => Loss: 0.9544


TRAIN Batch 432 => Loss: 0.92481


TRAIN Batch 480 => Loss: 1.07468


TRAIN Batch 528 => Loss: 1.04272


TRAIN Batch 576 => Loss: 1.22401


TRAIN Batch 624 => Loss: 0.9209


TRAIN Batch 672 => Loss: 0.94506


TRAIN Batch 720 => Loss: 0.98534


TRAIN Batch 768 => Loss: 0.78739


TRAIN Batch 816 => Loss: 1.18722


TRAIN Batch 864 => Loss: 0.96597


TRAIN Batch 912 => Loss: 0.98945


TRAIN Batch 960 => Loss: 0.90804


TRAIN Batch 1008 => Loss: 1.03821


TRAIN Batch 1056 => Loss: 0.9969


TRAIN Batch 1104 => Loss: 1.07417


TRAIN Batch 1152 => Loss: 0.96236


TRAIN Batch 1200 => Loss: 1.27658


TRAIN Batch 1248 => Loss: 1.20973


TRAIN Batch 1296 => Loss: 1.01217


TRAIN Batch 1344 => Loss: 0.98823


TRAIN Batch 1392 => Loss: 0.96273


TRAIN Batch 1440 => Loss: 0.92901


TRAIN Batch 1488 => Loss: 0.96119


TRAIN Batch 1536 => Loss: 0.90115


TRAIN Batch 1584 => Loss: 0.94017


TRAIN Batch 1632 => Loss: 1.1073


TRAIN Batch 1680 => Loss: 1.12736


TRAIN Batch 1728 => Loss: 1.29934


TRAIN Batch 1776 => Loss: 0.88272


TRAIN Batch 1824 => Loss: 1.062


TRAIN Batch 1872 => Loss: 0.76015


TRAIN Batch 1920 => Loss: 1.0545


TRAIN Batch 1968 => Loss: 1.16772


TRAIN Batch 2016 => Loss: 0.83241


TRAIN Batch 2064 => Loss: 1.04263


TRAIN Batch 2112 => Loss: 1.23564


TRAIN Batch 2160 => Loss: 1.03115


TRAIN Batch 2208 => Loss: 0.96312


EVAL Batch 0 => Loss: 1.46328


EVAL Batch 50 => Loss: 1.46959


EVAL Batch 100 => Loss: 1.90474


EVAL Batch 150 => Loss: 2.11002


EVAL Batch 200 => Loss: 1.4548


EPOCH: 9 / 40


TRAIN Batch 0 => Loss: 0.96225


TRAIN Batch 48 => Loss: 1.11788


TRAIN Batch 96 => Loss: 0.92337


TRAIN Batch 144 => Loss: 0.98171


TRAIN Batch 192 => Loss: 0.79051


TRAIN Batch 240 => Loss: 1.048


TRAIN Batch 288 => Loss: 1.22187


TRAIN Batch 336 => Loss: 0.92012


TRAIN Batch 384 => Loss: 0.90573


TRAIN Batch 432 => Loss: 0.87459


TRAIN Batch 480 => Loss: 1.0109


TRAIN Batch 528 => Loss: 0.98074


TRAIN Batch 576 => Loss: 1.16725


TRAIN Batch 624 => Loss: 0.88469


TRAIN Batch 672 => Loss: 0.89752


TRAIN Batch 720 => Loss: 0.92476


TRAIN Batch 768 => Loss: 0.75335


TRAIN Batch 816 => Loss: 1.1201


TRAIN Batch 864 => Loss: 0.90365


TRAIN Batch 912 => Loss: 0.92876


TRAIN Batch 960 => Loss: 0.84623


TRAIN Batch 1008 => Loss: 0.98916


TRAIN Batch 1056 => Loss: 0.93682


TRAIN Batch 1104 => Loss: 1.00578


TRAIN Batch 1152 => Loss: 0.9118


TRAIN Batch 1200 => Loss: 1.18749


TRAIN Batch 1248 => Loss: 1.13002


TRAIN Batch 1296 => Loss: 0.94612


TRAIN Batch 1344 => Loss: 0.93283


TRAIN Batch 1392 => Loss: 0.90815


TRAIN Batch 1440 => Loss: 0.88819


TRAIN Batch 1488 => Loss: 0.90814


TRAIN Batch 1536 => Loss: 0.84921


TRAIN Batch 1584 => Loss: 0.90487


TRAIN Batch 1632 => Loss: 1.05789


TRAIN Batch 1680 => Loss: 1.06114


TRAIN Batch 1728 => Loss: 1.22827


TRAIN Batch 1776 => Loss: 0.83636


TRAIN Batch 1824 => Loss: 0.99169


TRAIN Batch 1872 => Loss: 0.72002


TRAIN Batch 1920 => Loss: 1.00021


TRAIN Batch 1968 => Loss: 1.10791


TRAIN Batch 2016 => Loss: 0.78398


TRAIN Batch 2064 => Loss: 0.98918


TRAIN Batch 2112 => Loss: 1.17092


TRAIN Batch 2160 => Loss: 0.96886


TRAIN Batch 2208 => Loss: 0.89865


EVAL Batch 0 => Loss: 1.46098


EVAL Batch 50 => Loss: 1.49256


EVAL Batch 100 => Loss: 1.93924


EVAL Batch 150 => Loss: 2.12692


EVAL Batch 200 => Loss: 1.46291


<|im_start|>user
Prove that the difference between two consecutive cubes cannot be divisible by 5, using the fact that the only possible remainders when a cube is divided by 5 are 0, 1, and -1.<|im_end|>





































(





$$

























































I












































Given















































































































$$







 (





















Consider





$$$$














Now































Now






































Let
































Definition










































0





















$$









































Let







Therefore






















$$




$$
$$


 Hence

















































































Consider










































































(

















$$















Let









































$










$$











$$















































$$Sch




























































































By






















$$



























$$















$$




Set
































































































However

































$$








Expl

















































If


























$$



















































































50











Given




















EPOCH: 10 / 40


TRAIN Batch 0 => Loss: 0.91345


TRAIN Batch 48 => Loss: 1.06251


TRAIN Batch 96 => Loss: 0.87526


TRAIN Batch 144 => Loss: 0.91815


TRAIN Batch 192 => Loss: 0.74335


TRAIN Batch 240 => Loss: 0.98377


TRAIN Batch 288 => Loss: 1.1365


TRAIN Batch 336 => Loss: 0.86581


TRAIN Batch 384 => Loss: 0.85556


TRAIN Batch 432 => Loss: 0.82579


TRAIN Batch 480 => Loss: 0.94792


TRAIN Batch 528 => Loss: 0.91389


TRAIN Batch 576 => Loss: 1.10314


TRAIN Batch 624 => Loss: 0.82257


TRAIN Batch 672 => Loss: 0.85535


TRAIN Batch 720 => Loss: 0.88068


TRAIN Batch 768 => Loss: 0.72376


TRAIN Batch 816 => Loss: 1.07523


TRAIN Batch 864 => Loss: 0.85542


TRAIN Batch 912 => Loss: 0.88464


TRAIN Batch 960 => Loss: 0.80826


TRAIN Batch 1008 => Loss: 0.92703


TRAIN Batch 1056 => Loss: 0.9019


TRAIN Batch 1104 => Loss: 0.95709


TRAIN Batch 1152 => Loss: 0.87786


TRAIN Batch 1200 => Loss: 1.13692


TRAIN Batch 1248 => Loss: 1.0812


TRAIN Batch 1296 => Loss: 0.89346


TRAIN Batch 1344 => Loss: 0.87919


TRAIN Batch 1392 => Loss: 0.87843


TRAIN Batch 1440 => Loss: 0.85393


TRAIN Batch 1488 => Loss: 0.89501


TRAIN Batch 1536 => Loss: 0.80818


TRAIN Batch 1584 => Loss: 0.86706


TRAIN Batch 1632 => Loss: 1.01532


TRAIN Batch 1680 => Loss: 1.00211


TRAIN Batch 1728 => Loss: 1.19826


TRAIN Batch 1776 => Loss: 0.80041


TRAIN Batch 1824 => Loss: 0.95468


TRAIN Batch 1872 => Loss: 0.68885


TRAIN Batch 1920 => Loss: 0.95015


TRAIN Batch 1968 => Loss: 1.04861


TRAIN Batch 2016 => Loss: 0.75078


TRAIN Batch 2064 => Loss: 0.94945


TRAIN Batch 2112 => Loss: 1.11794


TRAIN Batch 2160 => Loss: 0.92751


TRAIN Batch 2208 => Loss: 0.8713


EVAL Batch 0 => Loss: 1.52158


EVAL Batch 50 => Loss: 1.52402


EVAL Batch 100 => Loss: 1.99852


EVAL Batch 150 => Loss: 2.18998


EVAL Batch 200 => Loss: 1.50347


EPOCH: 11 / 40


TRAIN Batch 0 => Loss: 0.87068


TRAIN Batch 48 => Loss: 1.02384


TRAIN Batch 96 => Loss: 0.85098


TRAIN Batch 144 => Loss: 0.89348


TRAIN Batch 192 => Loss: 0.71589


TRAIN Batch 240 => Loss: 0.94621


TRAIN Batch 288 => Loss: 1.09487


TRAIN Batch 336 => Loss: 0.83219


TRAIN Batch 384 => Loss: 0.83835


TRAIN Batch 432 => Loss: 0.80136


TRAIN Batch 480 => Loss: 0.92756


TRAIN Batch 528 => Loss: 0.90745


TRAIN Batch 576 => Loss: 1.05861


TRAIN Batch 624 => Loss: 0.8061


TRAIN Batch 672 => Loss: 0.82606


TRAIN Batch 720 => Loss: 0.85411


TRAIN Batch 768 => Loss: 0.7017


TRAIN Batch 816 => Loss: 1.017


TRAIN Batch 864 => Loss: 0.83192


TRAIN Batch 912 => Loss: 0.86304


TRAIN Batch 960 => Loss: 0.78079


TRAIN Batch 1008 => Loss: 0.88825


TRAIN Batch 1056 => Loss: 0.86497


TRAIN Batch 1104 => Loss: 0.93488


TRAIN Batch 1152 => Loss: 0.84702


TRAIN Batch 1200 => Loss: 1.08191


TRAIN Batch 1248 => Loss: 1.02796


TRAIN Batch 1296 => Loss: 0.85498


TRAIN Batch 1344 => Loss: 0.83776


TRAIN Batch 1392 => Loss: 0.83628


TRAIN Batch 1440 => Loss: 0.82914


TRAIN Batch 1488 => Loss: 0.85679


TRAIN Batch 1536 => Loss: 0.78078


TRAIN Batch 1584 => Loss: 0.83226


TRAIN Batch 1632 => Loss: 0.98572


TRAIN Batch 1680 => Loss: 0.94824


TRAIN Batch 1728 => Loss: 1.13465


TRAIN Batch 1776 => Loss: 0.77772


TRAIN Batch 1824 => Loss: 0.91612


TRAIN Batch 1872 => Loss: 0.66836


TRAIN Batch 1920 => Loss: 0.8976


TRAIN Batch 1968 => Loss: 1.01939


TRAIN Batch 2016 => Loss: 0.70373


TRAIN Batch 2064 => Loss: 0.91704


TRAIN Batch 2112 => Loss: 1.06933


TRAIN Batch 2160 => Loss: 0.88958


TRAIN Batch 2208 => Loss: 0.82686


EVAL Batch 0 => Loss: 1.53788


EVAL Batch 50 => Loss: 1.51696


EVAL Batch 100 => Loss: 1.98633


EVAL Batch 150 => Loss: 2.19867


EVAL Batch 200 => Loss: 1.52208


<|im_start|>user
Prove that the difference between two consecutive cubes cannot be divisible by 5, using the fact that the only possible remainders when a cube is divided by 5 are 0, 1, and -1.<|im_end|>
































 Here














































































































































































$$





























































































































































































































































































<|im_end|>










































































































































































































































































($


























































































































































































































































































































































































































































































































































































































































EPOCH: 12 / 40


TRAIN Batch 0 => Loss: 0.82645


TRAIN Batch 48 => Loss: 0.9787


TRAIN Batch 96 => Loss: 0.81857


TRAIN Batch 144 => Loss: 0.8858


TRAIN Batch 192 => Loss: 0.69044


TRAIN Batch 240 => Loss: 0.91823


TRAIN Batch 288 => Loss: 1.03588


TRAIN Batch 336 => Loss: 0.80994


TRAIN Batch 384 => Loss: 0.78629


TRAIN Batch 432 => Loss: 0.76545


TRAIN Batch 480 => Loss: 0.88176


TRAIN Batch 528 => Loss: 0.86639


TRAIN Batch 576 => Loss: 1.00618


TRAIN Batch 624 => Loss: 0.77407


TRAIN Batch 672 => Loss: 0.79823


TRAIN Batch 720 => Loss: 0.81605


TRAIN Batch 768 => Loss: 0.67531


TRAIN Batch 816 => Loss: 0.94986


TRAIN Batch 864 => Loss: 0.79971


TRAIN Batch 912 => Loss: 0.81019


TRAIN Batch 960 => Loss: 0.75051


TRAIN Batch 1008 => Loss: 0.8502


TRAIN Batch 1056 => Loss: 0.82865


TRAIN Batch 1104 => Loss: 0.90208


TRAIN Batch 1152 => Loss: 0.80635


TRAIN Batch 1200 => Loss: 1.03309


TRAIN Batch 1248 => Loss: 0.97792


TRAIN Batch 1296 => Loss: 0.81799


TRAIN Batch 1344 => Loss: 0.80433


TRAIN Batch 1392 => Loss: 0.80923


TRAIN Batch 1440 => Loss: 0.80126


TRAIN Batch 1488 => Loss: 0.81888


TRAIN Batch 1536 => Loss: 0.73892


TRAIN Batch 1584 => Loss: 0.80577


TRAIN Batch 1632 => Loss: 0.93501


TRAIN Batch 1680 => Loss: 0.91577


TRAIN Batch 1728 => Loss: 1.08342


TRAIN Batch 1776 => Loss: 0.73454


TRAIN Batch 1824 => Loss: 0.86708


In [None]:
answer('Prove that the difference between two consecutive cubes cannot be divisible by 5, using the fact that the only possible remainders when a cube is divided by 5 are 0, 1, and -1.',
      end = "")

In [None]:
torch.save(llama_model.state_dict(), 'model.pth')

In [None]:
state_dict = torch.load('model.pth', map_location=torch.device(device)) # Load to CPU

In [None]:
torch.cuda.empty_cache()

In [None]:
llama_model_ = Llama(config = ModelArgs) #.to(device)
llama_model_.load_state_dict(state_dict)

In [None]:
a1 = llama_model(x)
a2 = llama_model_(x.to('cpu'))

In [None]:
torch.allclose(a1.cpu(), a2, atol=1e-5, rtol=1e-4)