In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
#print np in non-scientific notation
np.set_printoptions(suppress=True)




In [23]:
torch.manual_seed(0)
seq = torch.randint(1, 100, (10000,))
input_sequences = seq.unfold(0, 10, 1)
target_elements = seq[14:]
input_sequences = input_sequences[:-1]
target_elements = target_elements[1:]
print(f'input sequence shape: {input_sequences.shape}')
print(f'target elements shape: {target_elements.shape}')

input sequence shape: torch.Size([9985, 15])
target elements shape: torch.Size([9985])


In [24]:
input_sequences[:5], target_elements[:5]

(tensor([[99, 19, 57, 70, 53, 70, 71, 56, 29, 64, 54, 93, 25, 94, 20],
         [19, 57, 70, 53, 70, 71, 56, 29, 64, 54, 93, 25, 94, 20, 84],
         [57, 70, 53, 70, 71, 56, 29, 64, 54, 93, 25, 94, 20, 84, 28],
         [70, 53, 70, 71, 56, 29, 64, 54, 93, 25, 94, 20, 84, 28, 55],
         [53, 70, 71, 56, 29, 64, 54, 93, 25, 94, 20, 84, 28, 55, 33]]),
 tensor([84, 28, 55, 33, 22]))

In [25]:
split_idx = int(len(input_sequences) * 0.8)
train_sequences = input_sequences[:split_idx]
train_targets = target_elements[:split_idx]
val_sequences = input_sequences[split_idx:]
val_targets = target_elements[split_idx:]

from torch.utils.data import TensorDataset, DataLoader
train_data = TensorDataset(train_sequences, train_targets)
val_data = TensorDataset(val_sequences, val_targets)
batch_size = 64

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, num_workers=4)
val_loader = DataLoader(val_data, shuffle=True, batch_size=batch_size, num_workers=4)



In [26]:
import torch
import torch.nn as nn
import torch.optim as optim

class AutoregressiveModel(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, num_layers, dropout=0.1):
        super(AutoregressiveModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.pos_encoder = nn.Embedding(64, embed_size) # Допустим, максимальная длина последовательности меньше 1000
        self.decoder_layer = nn.TransformerDecoderLayer(embed_size, num_heads, embed_size*4, dropout, batch_first=True)
        self.decoder = nn.TransformerDecoder(self.decoder_layer, num_layers)
        self.fc_out = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        positions = torch.arange(0, x.size(1)).unsqueeze(0).to(x.device)
       
        x = self.embed(x) + self.pos_encoder(positions)
       
        tgt_mask = torch.nn.Transformer.generate_square_subsequent_mask(x.size(1)).to(x.device)
        
        output = self.decoder(x, x, tgt_mask=tgt_mask)  # В данном случае использовать одинаковые значения для src и tgt
    
        return self.fc_out(output)

# Инициализация модели
vocab_size = 100  # предположим, что у нас 100 уникальных значений
embed_size = 512
num_heads = 8
num_layers = 6
dropout = 0.1

model = AutoregressiveModel(vocab_size, embed_size, num_heads, num_layers, dropout)

# Гиперпараметры обучения
learning_rate = 0.01
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# Обучение
epochs = 10
for epoch in range(epochs):
    model.train()
    batch_idx = 0
    epoch_loss = 0
    vepoch_loss = 0
    for input_sequences, target_elements in train_loader:
        optimizer.zero_grad()
        output = model(input_sequences)
        loss = loss_fn(output[:, -1, :].view(-1, vocab_size), target_elements.view(-1))
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()
        batch_idx += 1
        print(f'Epoch {epoch}, Batch {batch_idx}/{len(train_loader)}, Loss: {epoch_loss / batch_idx}', end='\r', flush=True)
    batch_idx = 0    
    for vinput_sequences, vtarget_elements in val_loader:
        model.eval()
        with torch.inference_mode():
            voutput = model(vinput_sequences)
            vloss = loss_fn(voutput[:, -1, :].view(-1, vocab_size), vtarget_elements.view(-1))
            vepoch_loss += vloss.item()
            batch_idx += 1
            print(f'Epoch {epoch}, Batch {batch_idx}/{len(val_loader)}, Test Loss: {vepoch_loss / batch_idx}', end='\r', flush=True)


    print(f'Epoch {epoch}, Loss: {epoch_loss / len(train_loader)}, Test Loss: {vepoch_loss / len(val_loader)}')


Epoch 0, Batch 125/125, Loss: 5.0521891059875495

ERROR: Unexpected segmentation fault encountered in worker.
 

RuntimeError: DataLoader worker (pid 94318) is killed by signal: Segmentation fault: 11. 

In [75]:
torch.softmax( voutput[:,-1,:], dim=-1).argmax(dim=-1), vtarget_elements

(tensor([79, 95, 53, 69, 61, 55, 21, 37, 92, 95, 73]),
 tensor([79, 95, 53, 69, 61, 55, 21, 37, 92, 95, 73]))

In [78]:
#print number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {count_parameters(model):,} trainable parameters')


The model has 30,042,724 trainable parameters


In [8]:
import mlx.core as mx
import mlx.nn as mnn
import mlx.optimizers as opt

import numpy as np



In [14]:
class MLP(mnn.Module):
    def __init__(
        self, num_layers: int, input_dim: int, hidden_dim: int, output_dim: int
    ):
        super().__init__()
        layer_sizes = [input_dim] + [hidden_dim] * num_layers + [output_dim]
        self.layers = [
            mnn.Linear(idim, odim)
            for idim, odim in zip(layer_sizes[:-1], layer_sizes[1:])
        ]

    def __call__(self, x):
        for l in self.layers[:-1]:
            x = mx.maximum(l(x), 0.0)
        return self.layers[-1](x)

In [10]:
def loss_fn(model, X, y):
    return mx.mean(mnn.losses.cross_entropy(model(X), y))

def eval_fn(model, X, y):
    return mx.mean(mx.argmax(model(X), axis=1) == y)

In [11]:
num_layers = 2
hidden_dim = 32
num_classes = 10
batch_size = 256
num_epochs = 10
learning_rate = 1e-1

# Load the data
import mnist
train_images, train_labels, test_images, test_labels = map(
    mx.array, mnist.mnist()
)

In [18]:
def batch_iterate(batch_size, X, y):
    perm = mx.array(np.random.permutation(y.size))
    for s in range(0, y.size, batch_size):
        ids = perm[s : s + batch_size]
        yield X[ids], y[ids]

In [23]:
# Load the model
mx.set_default_device(mx.cpu)
model = MLP(num_layers, train_images.shape[-1], hidden_dim, num_classes)


In [24]:
mx.eval(model.parameters())

# Get a function which gives the loss and gradient of the
# loss with respect to the model's trainable parameters
loss_and_grad_fn = mnn.value_and_grad(model, loss_fn)

# Instantiate the optimizer
optimizer = opt.SGD(learning_rate=learning_rate)

for e in range(num_epochs):
    for X, y in batch_iterate(batch_size, train_images, train_labels):
        loss, grads = loss_and_grad_fn(model, X, y)

        # Update the optimizer state and model parameters
        # in a single call
        optimizer.update(model, grads)

        # Force a graph evaluation
        mx.eval(model.parameters(), optimizer.state)

    accuracy = eval_fn(model, test_images, test_labels)
    print(f"Epoch {e}: Test accuracy {accuracy.item():.3f}")

Epoch 0: Test accuracy 0.869
Epoch 1: Test accuracy 0.899
Epoch 2: Test accuracy 0.919
Epoch 3: Test accuracy 0.927
Epoch 4: Test accuracy 0.933
Epoch 5: Test accuracy 0.941
Epoch 6: Test accuracy 0.944
Epoch 7: Test accuracy 0.947
Epoch 8: Test accuracy 0.927
Epoch 9: Test accuracy 0.949


In [35]:
import torch

In [55]:
mx.set_default_device(mx.cpu)
a = mx.random.normal([1000,1000,1000])
b = mx.random.normal([1000,1000,1000])

%timeit c = a @ b

1.03 µs ± 25.2 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [56]:
mx.set_default_device(mx.gpu)
a = mx.random.normal([1000,1000,1000])
b = mx.random.normal([1000,1000,1000])

%timeit c = a @ b

1.03 µs ± 34.9 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [57]:
device = torch.device('cpu')
a = torch.randn((1000,1000,1000), device=device)
b = torch.randn((1000,1000,1000), device=device)

%timeit c = a @ b

In [42]:
device = torch.device('mps')
a = torch.randn((1000,1000,1000), device=device)
b = torch.randn((1000,1000,1000), device=device)

%timeit c = a @ b

KeyboardInterrupt: 