## Practice Part

In [None]:
import torch

In [None]:
def to_onehot(y, num_classes):
    y_onehot = torch.zeros(y.size(0), num_classes)
    y_onehot.scatter_(1, y.view(-1, 1).long(), 1).float()
    return y_onehot

y = torch.tensor([0, 1, 2, 2])

y_enc = to_onehot(y, 3)

print('one-hot encoding:\n', y_enc)

one-hot encoding:
 tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [0., 0., 1.]])


In [None]:
Z = torch.tensor( [[-0.3,  -0.5, -0.5],
                   [-0.4,  -0.1, -0.5],
                   [-0.3,  -0.94, -0.5],
                   [-0.99, -0.88, -0.5]])

Z

tensor([[-0.3000, -0.5000, -0.5000],
        [-0.4000, -0.1000, -0.5000],
        [-0.3000, -0.9400, -0.5000],
        [-0.9900, -0.8800, -0.5000]])

In [None]:
def softmax(z):
    return (torch.exp(z.t()) / torch.sum(torch.exp(z), dim=1)).t()

smax = softmax(Z)
print('softmax:\n', smax)

softmax:
 tensor([[0.3792, 0.3104, 0.3104],
        [0.3072, 0.4147, 0.2780],
        [0.4263, 0.2248, 0.3490],
        [0.2668, 0.2978, 0.4354]])


In [None]:
def to_classlabel(z):
    return torch.argmax(z, dim=1)

print('predicted class labels: ', to_classlabel(smax))
print('true class labels: ', to_classlabel(y_enc))

predicted class labels:  tensor([0, 1, 0, 2])
true class labels:  tensor([0, 1, 2, 2])


In [None]:
def cross_entropy(softmax, y_target):
    return - torch.sum(torch.log(softmax) * (y_target), dim=1)

xent = cross_entropy(smax, y_enc)
print('Cross Entropy:', xent)

Cross Entropy: tensor([0.9698, 0.8801, 1.0527, 0.8314])


In [None]:
import torch.nn.functional as F

In [None]:
F.nll_loss(torch.log(smax), y, reduction='none')

tensor([0.9698, 0.8801, 1.0527, 0.8314])

In [None]:
F.cross_entropy(Z, y, reduction='none')

tensor([0.9698, 0.8801, 1.0527, 0.8314])

In [None]:
F.cross_entropy(Z, y)

tensor(0.9335)

In [None]:
torch.mean(cross_entropy(smax, y_enc))

tensor(0.9335)

## G學長教我的

#### 一、張量(Tensors)

從從容容，就是最熟悉的純量、向量、矩陣  
- 0階張量: scalar
- 1階張量: vector
- 2階張量: Matrix

In [12]:
x = torch.tensor([1, 2, 3])
x = x.to("cuda")                # Move tensor to GPU

##### 簡單的張量計算

In [13]:
# create a 2D tensor
tensor_2d = torch.tensor([[1, 2, 3],
                         [4, 5, 6]])
print(tensor_2d)

tensor([[1, 2, 3],
        [4, 5, 6]])


In [14]:
# check the shape
print(tensor_2d.shape)

torch.Size([2, 3])


In [15]:
# reshape the tensor to 3x2
# method 1
print(tensor_2d.reshape(3, 2))

# **method 2** : use `view`
print(tensor_2d.view(3, 2))

tensor([[1, 2],
        [3, 4],
        [5, 6]])
tensor([[1, 2],
        [3, 4],
        [5, 6]])


In [16]:
# transpose the tensor
print(tensor_2d.T)

tensor([[1, 4],
        [2, 5],
        [3, 6]])


In [17]:
# muptliply two tensors
tensor_a = torch.tensor([[1, 2, 3],
                         [4, 5, 6]])
tensor_b = tensor_a.T

# meethod 1
print(tensor_a.matmul(tensor_b))

# method 2: use `@` operator
print(tensor_a @ tensor_b)

tensor([[14, 32],
        [32, 77]])
tensor([[14, 32],
        [32, 77]])


### 二、計算圖(computation Graph)和模組(Module)

建立一個神經網路，就是一連串的數學運算流程

In [18]:
class MyLLM(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = torch.nn.Linear(10, 5)    # 定義零件

    def forward(self, x):
        return self.layer1(x)                   # 定義資料流向

##### 實作一個簡單的邏輯斯回歸分類器(視為一個簡單的單層神經網路)

In [19]:
import torch.nn.functional as F

y = torch.tensor([1.0])         # answer
x1 = torch.tensor([1.1])        # 輸入特徵
w1 = torch.tensor([2.2])        # weights
b = torch.tensor([0.0])         # bias unit

z = x1 * w1 + b         # input
a = torch.sigmoid(z)    # activation function and output

loss = F.binary_cross_entropy(a, y) # 計算損失
print(loss)

tensor(0.0852)


Autograd(自動微分): 可幫我們回朔整個過程，一次次的校正參數

##### 加入自動微分引擎

In [20]:
import torch.nn.functional as F
from torch.autograd import grad

y = torch.tensor([1.0])
x1 = torch.tensor([1.1])
w1 = torch.tensor([2.2], requires_grad=True)
b = torch.tensor([0.0], requires_grad=True)

z = x1 * w1 + b 
a = torch.sigmoid(z)

loss = F.binary_cross_entropy(a, y)

grad_L_w1 = grad(loss, w1, retain_graph=True) 
grad_L_b = grad(loss, b, retain_graph=True)

print(grad_L_w1)
print(grad_L_b)

(tensor([-0.0898]),)
(tensor([-0.0817]),)


In [21]:
# 呼叫backward計算梯度

loss.backward()
print(w1.grad)
print(b.grad)

tensor([-0.0898])
tensor([-0.0817])


一些簡單對於神經網路的小小解釋(fig.A.9)
1. 每個單元需拉線到下層的每一個單元，是因為要確保該單元與下層要提取的每個特徵之間的關聯性多強；所以連線也是代表權重的意思(w)
2. 偏值單元(bias unit)就是y=ax+b的b，可以更靈活的去擬合資料
3. 最終輸出有幾個單元，取決於一開始給的分類任務
4. 一層一層像是一次又一次的濃縮精華，由最後濃縮出來的東西做決定

## 五大步驟: 
1. 算結果: 將資料丟進模型，算出預測值
2. 算分數: 比較預測值和標準答案差多少
3. 清空紀錄: 把上一次的修正建議忘記，避免干擾這一次的
4. 找方向: 叫助手回溯過程，算出這一次要怎麼修正
5. 更新: 依照修正建議，實際去調整模型的參數

### 實作多層神經網路

##### 具有兩層隱藏層的多層感知器

In [22]:
class NeuralNetwork(torch.nn.Module):
    def __init__(self, num_inputs, num_outputs): 
        super().__init__()
        self.layers = torch.nn.Sequential(

            # 1st hidden layer
            torch.nn.Linear(num_inputs, 30),
            torch.nn.ReLU(),                    # add activation function

            # 2nd hidden layer
            torch.nn.Linear(30, 20),            # 頭尾數字一樣
            torch.nn.ReLU(),

            # output layer
            torch.nn.Linear(20, num_outputs),
        )

    def forward(self, x):
        logits = self.layers(x)
        return logits

In [23]:
model = NeuralNetwork(50, 3)
print(model)

NeuralNetwork(
  (layers): Sequential(
    (0): Linear(in_features=50, out_features=30, bias=True)
    (1): ReLU()
    (2): Linear(in_features=30, out_features=20, bias=True)
    (3): ReLU()
    (4): Linear(in_features=20, out_features=3, bias=True)
  )
)


In [24]:
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Total number of trainable model parameters:", num_params)

Total number of trainable model parameters: 2213


In [25]:
print(model.layers[0].weight)

Parameter containing:
tensor([[ 0.1094, -0.0264,  0.0301,  ...,  0.0840, -0.0410, -0.1363],
        [-0.1253, -0.1362, -0.1288,  ...,  0.0990, -0.0125,  0.0207],
        [-0.0715, -0.0232, -0.0622,  ...,  0.0447,  0.0237, -0.1273],
        ...,
        [ 0.0008,  0.0187,  0.0432,  ..., -0.0546,  0.1068, -0.0161],
        [ 0.0481, -0.0893,  0.0443,  ...,  0.0828,  0.0979,  0.0907],
        [ 0.1028, -0.0143,  0.0915,  ..., -0.0774, -0.1133, -0.0102]],
       requires_grad=True)


In [26]:
print(model.layers[0].weight.shape)

torch.Size([30, 50])


In [27]:
torch.manual_seed(123)          # 選擇隨機初始權重
model = NeuralNetwork(50, 3)
print(model.layers[0].weight)

Parameter containing:
tensor([[-0.0577,  0.0047, -0.0702,  ...,  0.0222,  0.1260,  0.0865],
        [ 0.0502,  0.0307,  0.0333,  ...,  0.0951,  0.1134, -0.0297],
        [ 0.1077, -0.1108,  0.0122,  ...,  0.0108, -0.1049, -0.1063],
        ...,
        [-0.0787,  0.1259,  0.0803,  ...,  0.1218,  0.1303, -0.1351],
        [ 0.1359,  0.0175, -0.0673,  ...,  0.0674,  0.0676,  0.1058],
        [ 0.0790,  0.1343, -0.0293,  ...,  0.0344, -0.0971, -0.0509]],
       requires_grad=True)


In [28]:
torch.manual_seed(123)      # 選擇隨機初始權重
X = torch.rand((1, 50))     # input data
out = model(X)
print(out)

tensor([[-0.1262,  0.1080, -0.1792]], grad_fn=<AddmmBackward0>)


In [29]:
# 不進行訓練或反向傳播，可節省記憶體
with torch.no_grad():
    out = model(X)
print(out)              # output為原始數值，未經標準化

tensor([[-0.1262,  0.1080, -0.1792]])


In [30]:
with torch.no_grad():
    out = torch.softmax(model(X), dim=1)    # 用softmax標準化，輸出即為機率
print(out)

tensor([[0.3113, 0.3934, 0.2952]])


### A6

In [31]:
# 訓練集
X_train = torch.tensor([                # 5個訓練樣本，每個樣本包含2個特徵
    [-1.2, 3.1],
    [-0.9, 2.9],
    [-0.5, 2.6],
    [2.3, -1.1],
    [2.7, -1.5]
    ])
y_train = torch.tensor([0, 0, 0, 1, 1]) # 5個訓練樣本對應的標籤

# 測試集
X_test = torch.tensor([
    [-0.8, 2.8],
    [2.6, -1.6],
])
y_test = torch.tensor([0, 1])

In [32]:
from torch.utils.data import Dataset

class ToyDataset(Dataset):
    def __init__(self, X, y):           # 初始設定，將準備好的資料(X)(特徵)和答案(y)輸入
        self.features = X
        self.labels = y

    def __getitem__(self, index):       # 取得某一筆資料，告訴index幾，取出幾號資料
        one_x = self.features[index] 
        one_y = self.labels[index]
        return one_x, one_y
    
    def __len__(self):                  # 資料總長度
        return self.labels.shape[0]
    
train_ds = ToyDataset(X_train, y_train)
test_ds = ToyDataset(X_test, y_test)

In [33]:
print(len(train_ds))

5


In [34]:
from torch.utils.data import DataLoader
torch.manual_seed(123)
train_loader = DataLoader(
    dataset=train_ds,
    batch_size=2,           # 每次端幾筆資料給model吃
    shuffle=True,           # 洗牌，每次取的資料順序不同，防止模型背答案
)

test_loader = DataLoader(
    dataset=test_ds,
    batch_size=2,
    shuffle=False,          # 對於測試集只是要打分數沒必要洗牌
     num_workers=0 
)

討論`num_workers=0`的情況  
此參數被設定為0時，資料載入會在主處理程序中進行，而不是在單獨的工作處理程序中進行  
若設為1或更大的數字，可以平行處理事情，更有效的利用系統資源

In [35]:
# 印出所有batches的結果
for idx, (x, y) in enumerate(train_loader):
    print(f"Batch {idx+1}:", x, y, sep="\n")

Batch 1:
tensor([[ 2.3000, -1.1000],
        [-0.9000,  2.9000]])
tensor([1, 0])
Batch 2:
tensor([[-1.2000,  3.1000],
        [-0.5000,  2.6000]])
tensor([0, 0])
Batch 3:
tensor([[ 2.7000, -1.5000]])
tensor([1])


In [36]:
# 注意到最後一個batch只有1筆資料，如果訓練週期中最後一個批次遠小於其他批次可能會影響訓練的收斂性
# 設定drop_last=True可以捨棄最後一個batch

torch.manual_seed(123)
train_loader = DataLoader(
    dataset=train_ds,
    batch_size=2,          
    shuffle=True,          
    drop_last=True          #捨棄最後一個Batch
)

# 發現最後一個被丟掉了
for idx, (x, y) in enumerate(train_loader):
    print(f"Batch {idx+1}:", x, y, sep="\n")

Batch 1:
tensor([[ 2.3000, -1.1000],
        [-0.9000,  2.9000]])
tensor([1, 0])
Batch 2:
tensor([[-1.2000,  3.1000],
        [-0.5000,  2.6000]])
tensor([0, 0])


### A.7 典型訓練迴圈

In [37]:
import torch.nn.functional as F

torch.manual_seed(123)
model = NeuralNetwork(num_inputs=2, num_outputs=2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.5)

num_epochs = 3      # 代表要訓練多少個週期(要把所有訓練資料跑三次)

for epoch in range(num_epochs):
    model.train()   # 訓練模式on
    for batch_idx, (features, labels) in enumerate(train_loader):
        logits = model(features)                                    # step 1: 猜答案(Forward pass)
        loss = F.cross_entropy(logits, labels)                      # step 2: 算分數(Calculate loss)

        optimizer.zero_grad()                                       # step 3: 將前一輪的梯度設為0(Zero gradients)
        loss.backward()                                             # step 4: 尋找修正方向(Backward pass)(自動微分)
        optimizer.step()                                            # step 5: 實際修正(Optimizer step)

        
        print(f"Epoch: {epoch+1:03d}/{num_epochs:03d}"
              f" | Batch {batch_idx:03d}/{len(train_loader):03d}"
              f" | Train/Val Loss: {loss:.2f}")
        
    model.eval()    # 評估模式on

Epoch: 001/003 | Batch 000/002 | Train/Val Loss: 0.75
Epoch: 001/003 | Batch 001/002 | Train/Val Loss: 0.65
Epoch: 002/003 | Batch 000/002 | Train/Val Loss: 0.44
Epoch: 002/003 | Batch 001/002 | Train/Val Loss: 0.13
Epoch: 003/003 | Batch 000/002 | Train/Val Loss: 0.03
Epoch: 003/003 | Batch 001/002 | Train/Val Loss: 0.00


In [38]:
model.eval()
with torch.no_grad():           # 關閉記錄功能(因為現在只要預測)
    outputs = model(X_train)    # 寫考古題
print(outputs)

tensor([[ 2.8569, -4.1618],
        [ 2.5382, -3.7548],
        [ 2.0944, -3.1820],
        [-1.4814,  1.4816],
        [-1.7176,  1.7342]])


In [39]:
torch.set_printoptions(sci_mode=False)      # 關掉科學記號
probas = torch.softmax(outputs, dim=1)      # 分數轉機率
print(probas)

predictions = torch.argmax(probas, dim=1)   # 回傳機率最大的類別
print(predictions)

tensor([[    0.9991,     0.0009],
        [    0.9982,     0.0018],
        [    0.9949,     0.0051],
        [    0.0491,     0.9509],
        [    0.0307,     0.9693]])
tensor([0, 0, 0, 1, 1])


In [40]:
# 可以直接把argmax直接運用在logits上，就不用softmax這步了
predictions = torch.argmax(outputs, dim=1)
print(predictions)

tensor([0, 0, 0, 1, 1])


In [41]:
# 確認是否和答案一樣
predictions == y_train

tensor([True, True, True, True, True])

In [42]:
#利用torch.sum計算正確的數量
torch.sum(predictions == y_train)

tensor(5)

In [43]:
# 把上述預測準確率的過程更通用，所以把他打包成compute_accuracy函式
def compute_accuracy(model, dataloader):
    model = model.eval()
    correct = 0.0       # 目前答對幾題
    total_examples = 0  # 目前總共考幾題
    
    for idx, (features, labels) in enumerate(dataloader):
        
        with torch.no_grad():   # 預測答案
            logits = model(features)
        predictions = torch.argmax(logits, dim=1)

        compare = labels == predictions         # 對答案進行比對、累加
        correct += torch.sum(compare)
        total_examples += len(compare)

    return (correct / total_examples).item()    # 回傳正確率(答對題數/總題數)


In [44]:
print(compute_accuracy(model, train_loader))
print(compute_accuracy(model, test_loader))

1.0
1.0


### A.8 儲存與載入模型

In [45]:
# 先進行儲存的動作，存到硬碟裡面
torch.save(model.state_dict(), "model.pth")

In [46]:
# 從硬碟讀取已儲存的模型參數
model = NeuralNetwork(2, 2)                     # 建立一個一樣架構的空模型給模型住進去，(2,2)需要和當初訓練的模型一致
model.load_state_dict(torch.load("model.pth"))  # 讀檔，就不用重新訓練了，可以直接預測

  model.load_state_dict(torch.load("model.pth"))  # 讀檔，就不用重新訓練了，可以直接預測


<All keys matched successfully>

### A.9 使用GPU優化訓練效能

In [47]:
# 確認環境可不可以使用GPU
print(torch.cuda.is_available())

True


In [48]:
# Re: 用CPU執行兩個張量相加
tensor_1 = torch.tensor([1., 2., 3.])
tensor_2 = torch.tensor([4., 5., 6.])
print(tensor_1 + tensor_2)

tensor([5., 7., 9.])


In [49]:
# 利用.to()將張量移到GPU上面
tensor_1 = tensor_1.to("cuda:0")
tensor_2 = tensor_2.to("cuda:0")
print(tensor_1 + tensor_2)

tensor([5., 7., 9.], device='cuda:0')


In [50]:
# 將訓練迴圈用GPU跑

torch.manual_seed(123)
model = NeuralNetwork(num_inputs=2, num_outputs=2)
#---------------------------------------------------------#
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)
#---------------------------------------------------------#
optimizer = torch.optim.SGD(model.parameters(), lr=0.5)

num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    for batch_idx, (features, labels) in enumerate(train_loader):
#---------------------------------------------------------#
        features = features.to(device)
        labels = labels.to(device)
#---------------------------------------------------------#        
        logits = model(features)
        loss = F.cross_entropy(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


        print(f"Epoch: {epoch+1:03d}/{num_epochs:03d}"
              f" | Batch {batch_idx:03d}/{len(train_loader):03d}"
              f" | Train/Val Loss: {loss:.2f}")
        
    model.eval()    # 評估模式on

Epoch: 001/003 | Batch 000/002 | Train/Val Loss: 0.75
Epoch: 001/003 | Batch 001/002 | Train/Val Loss: 0.65
Epoch: 002/003 | Batch 000/002 | Train/Val Loss: 0.44
Epoch: 002/003 | Batch 001/002 | Train/Val Loss: 0.13
Epoch: 003/003 | Batch 000/002 | Train/Val Loss: 0.03
Epoch: 003/003 | Batch 001/002 | Train/Val Loss: 0.00


In [1]:
import torch
print(torch.__version__)

2.4.0
