In [19]:
import numpy as np

# First-layer weights W1: sample from a standard normal distribution, then
# scale by 0.01 so training starts from small weights (the original note gives
# the motivation as avoiding vanishing/exploding gradients early in training).
W1 = 0.01 * np.random.randn(2, 2)

# Alternative form that spells out the mean (loc=0) and standard deviation
# (scale=2). This assignment replaces the W1 drawn above; together with the
# 0.01 factor the effective standard deviation is 0.02. Changing `scale` is a
# simple way to try different initial weight spreads.
W1 = 0.01 * np.random.normal(0, 2, (2, 2))

# First-layer bias b1: all zeros, shape (1, 2).
b1 = np.zeros(shape=(1, 2))

# Second-layer weights W2, shape (2, 1), using the same small-normal scheme.
W2 = 0.01 * np.random.randn(2, 1)

# Second-layer bias b2: all zeros, shape (1, 1).
b2 = np.zeros(shape=(1, 1))

# Print the freshly initialised parameters for inspection and debugging.
print("W1 (第一層權重):\n", W1)
print("b1 (第一層偏置):\n", b1)
print("W2 (第二層權重):\n", W2)
print("b2 (第二層偏置):\n", b2)

W1 (第一層權重):
 [[-0.04554906  0.00899906]
 [ 0.00898367  0.02450049]]
b1 (第一層偏置):
 [[0. 0.]]
W2 (第二層權重):
 [[0.00314238]
 [0.00230744]]
b2 (第二層偏置):
 [[0.]]


In [20]:
import numpy as np
import math

def calculate_fan_in_and_fan_out(tensor):
    """Return ``(fan_in, fan_out)`` for a weight array.

    A 2-D array is read as ``(fan_in, fan_out)`` straight from its shape.
    An array with more than two dimensions is read as a convolution kernel
    laid out ``(out_channels, in_channels, *kernel_dims)``: each fan is the
    matching channel count multiplied by the number of elements in one
    kernel slice ``tensor[0][0]``.

    Args:
        tensor: object exposing ``.shape`` (and, for more than two
            dimensions, indexing plus ``.size`` on the indexed slice).

    Returns:
        Tuple ``(fan_in, fan_out)``.

    Raises:
        ValueError: if ``tensor`` has fewer than 2 dimensions.
    """
    rank = len(tensor.shape)
    if rank < 2:
        raise ValueError("tensor with fewer than 2 dimensions")

    if rank == 2:
        # Fully connected layer: the shape itself is (fan_in, fan_out).
        n_in, n_out = tensor.shape
        return n_in, n_out

    # Convolution kernel: axis 0 holds output maps, axis 1 holds input maps,
    # and the remaining axes form the receptive field.
    out_maps, in_maps = tensor.shape[0], tensor.shape[1]
    kernel_elems = tensor[0][0].size
    return in_maps * kernel_elems, out_maps * kernel_elems


def xavier_uniform(tensor, gain=1.):
    """Overwrite ``tensor`` in place with Xavier (Glorot) uniform samples.

    Values are drawn from ``U(-bound, bound)`` where
    ``std = gain * sqrt(2 / (fan_in + fan_out))`` and ``bound = sqrt(3) * std``.

    Args:
        tensor: array to fill; its fans come from
            ``calculate_fan_in_and_fan_out``.
        gain: multiplicative scaling factor applied to the standard deviation.

    Returns:
        None. ``tensor`` is modified through slice assignment.
    """
    fans = calculate_fan_in_and_fan_out(tensor)
    # Target standard deviation of the distribution.
    std = gain * math.sqrt(2.0 / float(sum(fans)))
    # A uniform distribution on [-limit, limit] has std = limit / sqrt(3).
    limit = math.sqrt(3.0) * std
    tensor[:] = np.random.uniform(low=-limit, high=limit, size=tensor.shape)


def xavier_normal(tensor, gain=1.):
    """Overwrite ``tensor`` in place with Xavier (Glorot) normal samples.

    Values are drawn from a normal distribution with mean 0 and
    ``std = gain * sqrt(2 / (fan_in + fan_out))``.

    Args:
        tensor: array to fill; its fans come from
            ``calculate_fan_in_and_fan_out``.
        gain: multiplicative scaling factor applied to the standard deviation.

    Returns:
        None. ``tensor`` is modified through slice assignment.
    """
    fan_in, fan_out = calculate_fan_in_and_fan_out(tensor)
    fan_total = float(fan_in + fan_out)
    sigma = gain * math.sqrt(2.0 / fan_total)
    tensor[:] = np.random.normal(loc=0, scale=sigma, size=tensor.shape)


# Adapted from PyTorch (torch.nn.init.calculate_gain)
def calculate_gain(nonlinearity, param=None):
    r"""Return the recommended gain for the given activation function.

    Args:
        nonlinearity: name of the activation. Supported values are the linear
            family ('linear', 'conv1d', 'conv2d', 'conv3d',
            'conv_transpose1d', 'conv_transpose2d', 'conv_transpose3d'),
            'sigmoid', 'tanh', 'relu' and 'leaky_relu'.
        param: negative slope, read only when ``nonlinearity`` is
            'leaky_relu'. ``None`` selects a slope of 0.01.

    Returns:
        1 for the linear family and sigmoid, 5/3 for tanh, sqrt(2) for relu,
        and sqrt(2 / (1 + negative_slope ** 2)) for leaky_relu.

    Raises:
        ValueError: if ``nonlinearity`` is not supported, or if ``param`` is
            neither ``None`` nor a non-boolean int/float for 'leaky_relu'.
    """
    identity_like = (
        'linear', 'conv1d', 'conv2d', 'conv3d',
        'conv_transpose1d', 'conv_transpose2d', 'conv_transpose3d',
    )

    if nonlinearity == 'sigmoid' or nonlinearity in identity_like:
        return 1
    if nonlinearity == 'tanh':
        return 5.0 / 3
    if nonlinearity == 'relu':
        return math.sqrt(2.0)
    if nonlinearity != 'leaky_relu':
        raise ValueError("Unsupported nonlinearity {}".format(nonlinearity))

    # leaky_relu: work out the negative slope first.
    if param is None:
        slope = 0.01
    elif isinstance(param, float) or (
        isinstance(param, int) and not isinstance(param, bool)
    ):
        # bool is a subclass of int, so True/False must be rejected explicitly.
        slope = param
    else:
        raise ValueError("negative_slope {} not a valid number".format(param))
    return math.sqrt(2.0 / (1 + slope ** 2))


def kaiming_uniform(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):
    """Overwrite ``tensor`` in place with Kaiming (He) uniform samples.

    Values are drawn from ``U(-bound, bound)`` where ``std = gain / sqrt(fan)``
    and ``bound = sqrt(3) * std``.

    Args:
        tensor: array to fill; its fans come from
            ``calculate_fan_in_and_fan_out``.
        a: forwarded to ``calculate_gain`` as its ``param`` argument (the
            negative slope when ``nonlinearity`` is 'leaky_relu').
        mode: 'fan_in' or 'fan_out', selecting which fan scales the samples.
        nonlinearity: activation name forwarded to ``calculate_gain``.

    Returns:
        None. ``tensor`` is modified through slice assignment.

    Raises:
        ValueError: if ``mode`` is neither 'fan_in' nor 'fan_out'. Previously
            any other value (for example a typo such as 'fanin') silently
            selected fan_out.
    """
    fan_in, fan_out = calculate_fan_in_and_fan_out(tensor)

    # Pick the fan explicitly so that a misspelled mode fails loudly.
    if mode == 'fan_in':
        fan = fan_in
    elif mode == 'fan_out':
        fan = fan_out
    else:
        raise ValueError(
            "Mode {} not supported, please use one of fan_in, fan_out".format(mode)
        )

    # Gain depends on the activation function.
    gain = calculate_gain(nonlinearity, a)
    std = gain / math.sqrt(fan)
    # A uniform distribution on [-bound, bound] has std = bound / sqrt(3).
    bound = math.sqrt(3.0) * std
    tensor[:] = np.random.uniform(-bound, bound, tensor.shape)


def kaiming_normal(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):
    """Overwrite ``tensor`` in place with Kaiming (He) normal samples.

    Values are drawn from a normal distribution with mean 0 and
    ``std = gain / sqrt(fan)``.

    Args:
        tensor: array to fill; its fans come from
            ``calculate_fan_in_and_fan_out``.
        a: forwarded to ``calculate_gain`` as its ``param`` argument (the
            negative slope when ``nonlinearity`` is 'leaky_relu').
        mode: 'fan_in' or 'fan_out', selecting which fan scales the samples.
        nonlinearity: activation name forwarded to ``calculate_gain``.

    Returns:
        None. ``tensor`` is modified through slice assignment.

    Raises:
        ValueError: if ``mode`` is neither 'fan_in' nor 'fan_out'. Previously
            any other value (for example a typo such as 'fanin') silently
            selected fan_out.
    """
    fan_in, fan_out = calculate_fan_in_and_fan_out(tensor)

    # Pick the fan explicitly so that a misspelled mode fails loudly.
    if mode == 'fan_in':
        fan = fan_in
    elif mode == 'fan_out':
        fan = fan_out
    else:
        raise ValueError(
            "Mode {} not supported, please use one of fan_in, fan_out".format(mode)
        )

    # Gain depends on the activation function. The normal variant needs only
    # the standard deviation, so no uniform bound is computed here.
    gain = calculate_gain(nonlinearity, a)
    std = gain / math.sqrt(fan)
    tensor[:] = np.random.normal(0, std, tensor.shape)


def kaiming(tensor, method_params=None):
    """Apply Kaiming initialisation to ``tensor`` using a settings mapping.

    Args:
        tensor: array to fill in place.
        method_params: optional mapping with any of the keys
            'type' ('uniform' selects ``kaiming_uniform``; any other value
            selects ``kaiming_normal``), 'a', 'mode' and 'nonlinearity'.
            Missing keys, ``None`` or an empty mapping fall back to
            type='uniform', a=0, mode='fan_in', nonlinearity='leaky_relu'.

    Returns:
        None. ``tensor`` is modified by the selected initialiser.
    """
    params = method_params if method_params else {}
    method_type = params.get('type', 'uniform')
    a = params.get('a', 0)
    mode = params.get('mode', 'fan_in')
    nonlinearity = params.get('nonlinearity', 'leaky_relu')

    # Dispatch on the resolved type. The earlier code compared the whole
    # `method_params` mapping with "uniform", which could never match, so the
    # uniform variant was unreachable.
    if method_type == 'uniform':
        kaiming_uniform(tensor, a, mode, nonlinearity)
    else:
        kaiming_normal(tensor, a, mode, nonlinearity)


In [21]:
# Start from random values instead of np.empty((2, 3)) or np.zeros: with a
# varied starting array, the printout before initialisation is visibly
# different from the result of each initialiser below.
w = np.random.normal(0, 2, (2, 3))
print(w)

# Apply each initialiser to the same array in turn and print the result.
for label, init_fn in (
    ("xavier_uniform:", xavier_uniform),
    ("xavier_normal:", xavier_normal),
    ("kaiming_uniform:", kaiming_uniform),
    ("kaiming_normal:", kaiming_normal),
):
    init_fn(w)
    print(label, w)

[[-2.37684     0.69400676  1.79382216]
 [-0.06681792 -0.61588397 -0.24649979]]
xavier_uniform: [[ 0.43792096  0.49411033  0.37520651]
 [-0.81682632  0.70108823  0.32933242]]
xavier_normal: [[ 0.28089558 -0.46725237  0.29772112]
 [-0.00444117  0.10518984  0.07632804]]
kaiming_uniform: [[ 0.09504558 -0.1030177  -0.14660413]
 [ 1.31491405  0.44227326  1.24459631]]
kaiming_normal: [[ 1.09893689 -0.88951161  0.13333546]
 [ 0.26314747 -0.37636257 -1.26683417]]
