# main

导入环境

In [1]:
import torch
import torch.nn as nn
import pandas as pd
from scipy.ndimage import gaussian_filter1d,uniform_filter1d
from torch.utils.data import Dataset
import numpy as np
from scipy.stats import linregress
from scipy.optimize import minimize
from tqdm import tqdm
from itertools import product
from scipy.special import huber
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

加载数据并滤波

In [None]:
# Load the recorded training curves; the pickle maps a run name to a frame
# holding (at least) the 'lr' and 'Metrics/loss' columns used below.
# NOTE(review): pd.read_pickle can execute arbitrary code - only load trusted files.
df = pd.read_pickle('./loss curves/gpt_loss+lrs.pkl')
df_811 = df['M:100M_gpt_D:20B_scheduler:811_rope']
df_wsd = df['M:100M_gpt_D:20B_scheduler:wsd_rope']
df_cos = df['M:100M_gpt_D:20B_scheduler:cosine_rope']

# Derive the same two columns for every curve instead of repeating the code:
#   S1          - cumulative sum of the learning rate
#   loss_filter - Gaussian-smoothed loss (sigma = 1.5 samples)
# Both columns depend only on the raw columns, so re-running the cell is idempotent.
for _curve in (df_811, df_wsd, df_cos):
    _curve['S1'] = np.cumsum(_curve['lr'])
    _curve['loss_filter'] = gaussian_filter1d(_curve['Metrics/loss'], sigma=1.5)

print(df_811.shape,df_811.head())

定义模型 (Tissue2024)

In [None]:
class Model_tissue(nn.Module):
    """Learnable loss-curve model ``L(s) = L0 + A * S1(s)**(-alpha) - C * S2(s)``.

    ``S1(s)`` is read from the ``S1`` column of ``df_train`` and ``S2(s)`` is
    computed from its ``lr`` column as

        S2(s) = sum_{j=1..s} (lr[j-1] - lr[j]) * (1 - lamb**(s+1-j)) / (1 - lamb)

    ``forward`` returns the summed Huber loss between the log of the smoothed
    loss (``loss_filter`` column) and the log of the prediction.

    Args:
        df_train: Frame with ``lr``, ``S1`` and ``loss_filter`` columns. Rows are
            addressed by position, i.e. step ``s`` is row ``s``.
        L0, A, alpha, C, lamb: Initial values of the five fitted parameters.
            They are registered in exactly this order, which is the order
            ``parameters()`` yields them in.
        huber_delta: Threshold between the quadratic and linear regime of the
            Huber loss (default ``1e-3``).
    """
    def __init__(self, df_train: pd.DataFrame,
                L0: float, A: float, alpha: float,
                C: float, lamb: float,
                huber_delta: float = 1e-3):
        super().__init__()
        # Keep this registration order: callers unpack parameters() positionally.
        self.L0 = nn.Parameter(torch.tensor(L0, dtype=torch.float64))
        self.A = nn.Parameter(torch.tensor(A, dtype=torch.float64))
        self.alpha = nn.Parameter(torch.tensor(alpha, dtype=torch.float64))
        self.C = nn.Parameter(torch.tensor(C, dtype=torch.float64))
        self.lamb = nn.Parameter(torch.tensor(lamb, dtype=torch.float64))
        self.huber_delta = huber_delta
        # Fixed training data, forced to float64 to match the parameters.
        self.data_lr = torch.as_tensor(df_train['lr'].to_numpy(), dtype=torch.float64)
        self.data_S1 = torch.as_tensor(df_train['S1'].to_numpy(), dtype=torch.float64)
        self.data_loss = torch.as_tensor(df_train['loss_filter'].to_numpy(), dtype=torch.float64)

    def forward(self, step_batch: torch.Tensor):
        """Return the summed Huber loss of the log-residuals at ``step_batch``.

        Args:
            step_batch: 1-D collection of row positions (tensor, array or list
                of integers).

        Returns:
            0-dim float64 tensor, differentiable w.r.t. the five parameters.
        """
        # int64 indices are accepted by every PyTorch version (int32 is not).
        steps = torch.as_tensor(step_batch, dtype=torch.long)
        S1 = self.data_S1[steps]

        # lr[j-1] - lr[j] for j = 1..len-1; the first s entries are the terms of S2(s).
        lr_drop = self.data_lr[:-1] - self.data_lr[1:]
        s2_terms = []
        for s in steps.tolist():
            # Exponents s+1-j for j = 1..s, i.e. s, s-1, ..., 1.
            exponents = torch.arange(s, 0, -1, dtype=torch.long)
            weights = (1 - torch.pow(self.lamb, exponents)) / (1 - self.lamb)
            s2_terms.append(torch.sum(lr_drop[:s] * weights))
        S2 = torch.stack(s2_terms) if s2_terms else torch.zeros(0, dtype=torch.float64)

        pred = self.L0 + self.A * S1 ** (-self.alpha) - self.C * S2
        # Clamp so the log stays finite when the prediction is non-positive.
        r = torch.log(self.data_loss[steps]) - torch.log(pred.clamp(min=1e-10))

        # Huber loss on the log-residual.
        delta = self.huber_delta
        abs_r = torch.abs(r)
        huber_loss = torch.where(abs_r < delta, 0.5 * r ** 2, delta * (abs_r - 0.5 * delta))
        return huber_loss.sum()

定义模型参数初始化方法

In [None]:
def initialize_params_tissue(df_train: pd.DataFrame, step_batch: list, peak_lr: float = 3e-4):
    """Grid-search starting values for ``L0``, ``A``, ``alpha`` and ``C``.

    A log-log linear regression of (smoothed loss - min loss) against ``S1``
    gives a rough power law. A small grid around it is then refined with
    L-BFGS-B on a simplified model in which the annealing term is
    ``C * (peak_lr - lr)``; the best local optimum is returned.

    Args:
        df_train: Frame with ``lr``, ``S1`` and ``loss_filter`` columns.
        step_batch: Row positions used for the fit. Rows are taken by position,
            the same way ``Model_tissue`` addresses them.
        peak_lr: Peak learning rate of the schedule (default ``3e-4``).

    Returns:
        dict with keys ``'L0'``, ``'A'``, ``'alpha'``, ``'C'``.

    Raises:
        RuntimeError: If no grid point yields a finite objective value.
    """
    # Extract the fitted samples once so the objective works on plain arrays.
    step_idx = np.asarray(step_batch, dtype=int)
    loss = df_train['loss_filter'].to_numpy()[step_idx]
    S1 = df_train['S1'].to_numpy()[step_idx]
    lr_gap = peak_lr - df_train['lr'].to_numpy()[step_idx]
    log_loss = np.log(loss)

    min_loss = df_train['loss_filter'].min()
    # The 1e-3 offset keeps the log finite at the minimum itself.
    log_y = np.log(loss - min_loss + 1e-3)
    log_x = np.log(S1)
    slope, intercept, _, _, _ = linregress(log_x, log_y)

    L0_init_set = np.linspace(min_loss - 0.2, min_loss + 0.2, 5)
    A_init_set = np.linspace(np.exp(intercept) - 0.1, np.exp(intercept) + 0.1, 3)
    alpha_init_set = np.linspace(-slope - 0.1, -slope + 0.1, 3)
    C_init_set = np.linspace(100, 1000, 3)

    def loss_fn0(params):
        L0, A, alpha, C = params
        pred = L0 + A * S1 ** (-alpha) - C * lr_gap
        # Clamp rather than offset: a negative prediction would otherwise give NaN.
        r = log_loss - np.log(np.maximum(pred, 1e-10))
        return huber(1e-3, r).sum()

    init_params = list(product(L0_init_set, A_init_set, alpha_init_set, C_init_set))
    best_loss = float('inf')
    best_params = None

    for init_param in tqdm(init_params, desc="Initializing Parameters"):
        res = minimize(
            loss_fn0, init_param, method='L-BFGS-B', bounds=[(0, np.inf)] * 4,
            options={'maxiter': 100000, 'ftol': 1e-9, 'gtol': 1e-6, 'eps': 1e-8}
        )
        # A NaN objective never compares as smaller, so failed runs are skipped.
        if res.fun < best_loss:
            best_loss = res.fun
            best_params = res.x

    if best_params is None:
        raise RuntimeError("initialize_params_tissue: no starting point produced a finite loss")
    return dict(zip(['L0', 'A', 'alpha', 'C'], best_params))

定义模型优化方法

In [None]:
def model_fit(model,step_batch,max_step=200,loss_thr=1e-10,patience=20,grad_norm_thr=1e-5):
    """Fit ``model`` with AdamW and return the best parameters seen.

    Args:
        model: Module exposing ``L0``, ``A``, ``C``, ``alpha`` and ``lamb``
            parameters; ``model(step_batch)`` must return a scalar loss.
        step_batch: Steps passed unchanged to ``model``.
        max_step: Maximum number of optimizer iterations.
        loss_thr: Minimum decrease of the loss that counts as an improvement.
        patience: Stop after this many consecutive iterations without improvement.
        grad_norm_thr: Stop once the global gradient norm drops below this value.

    Returns:
        ``(best_params, best_loss)``: the parameter values, in
        ``model.parameters()`` order, that produced the lowest loss, and that
        loss. ``best_params`` is ``None`` (and ``best_loss`` is ``inf``) if no
        iteration produced a finite loss or ``max_step <= 0``.

    Side effects:
        Writes the loss history plot to ``./tissue_fit_monitor.png``.
    """
    optimizer = torch.optim.AdamW([
            {"params": [model.L0, model.A, model.C], "lr": 0.05},
            {"params": [model.alpha, model.lamb], "lr": 0.005},
        ])
    loss_history, min_loss, steps_no_improve = [], float('inf'), 0
    best_params, best_loss = None, float('inf')

    for _ in tqdm(range(max_step), desc="Training Progress"):
        optimizer.zero_grad()
        total_loss = model(step_batch)
        loss_value = total_loss.item()
        loss_history.append(loss_value)

        # Snapshot BEFORE the optimizer step: these are the parameter values
        # that actually produced `loss_value`. Doing it here also guarantees
        # the early-stop checks below cannot skip the current iterate.
        if loss_value < best_loss:
            best_loss = loss_value
            best_params = [p.item() for p in model.parameters()]

        total_loss.backward()
        optimizer.step()

        # Patience-based early stopping.
        if loss_value < min_loss - loss_thr:
            min_loss = loss_value
            steps_no_improve = 0
        else:
            steps_no_improve += 1

        if steps_no_improve >= patience:
            break

        # Gradient-norm based early stopping.
        grads = [p.grad.flatten() for p in model.parameters() if p.grad is not None]
        grad_norm = torch.cat(grads).norm() if grads else torch.tensor(0.0)
        if grad_norm < grad_norm_thr:
            break

    plt.figure(figsize=(8, 6))
    plt.plot(np.arange(len(loss_history)), loss_history, label="Fitting loss")
    plt.xlabel("Step")
    plt.ylabel("Loss")
    plt.legend()
    plt.grid(True)
    plt.savefig("./tissue_fit_monitor.png")
    # Release the figure so repeated fits do not accumulate open figures.
    plt.close()

    return best_params, best_loss

定义后处理

In [None]:
class Model_eval_tissue:
    """Evaluate fixed parameters ``(L0, A, alpha, C, lamb)`` on one loss curve.

    Uses the same prediction as the training model,
    ``L(s) = L0 + A * S1(s)**(-alpha) - C * S2(s)``, but with constant
    (non-trainable) parameters, and additionally plots the result.

    Args:
        df: Frame with ``lr``, ``S1``, ``loss_filter`` and ``Metrics/loss``
            columns. Rows are addressed by position.
        fig_id: Label used in the legend, the title and the output file name.
        L0, A, alpha, C, lamb: Fitted parameter values.
    """
    def __init__(self, df: pd.DataFrame, fig_id : str,
                L0: float, A: float, alpha: float,
                C: float, lamb: float,):
        self.fig_id = fig_id
        self.L0 = torch.tensor(L0, dtype=torch.float64)
        self.A = torch.tensor(A, dtype=torch.float64)
        self.alpha = torch.tensor(alpha, dtype=torch.float64)
        self.C = torch.tensor(C, dtype=torch.float64)
        self.lamb = torch.tensor(lamb, dtype=torch.float64)
        self.data_lr = torch.from_numpy(df['lr'].to_numpy())
        self.data_S1 = torch.from_numpy(df['S1'].to_numpy())
        self.data_loss = torch.from_numpy(df['loss_filter'].to_numpy())
        self.data_loss_ori = torch.from_numpy(df['Metrics/loss'].to_numpy())

    def forward(self, step_batch: torch.Tensor):
        """Predict the loss at ``step_batch``, save a comparison plot, return the Huber loss.

        Args:
            step_batch: 1-D collection of row positions (tensor, array or list
                of integers).

        Returns:
            0-dim tensor: summed Huber loss between log(smoothed loss) and
            log(prediction).

        Side effects:
            Writes ``tissue_eval_{fig_id}.png`` to the working directory.
        """
        # int64 indices are accepted by every PyTorch version (int32 is not).
        steps = torch.as_tensor(step_batch, dtype=torch.long)
        S1 = self.data_S1[steps]

        # lr[j-1] - lr[j] for j = 1..len-1; the first s entries are the terms of S2(s).
        lr_drop = self.data_lr[:-1] - self.data_lr[1:]
        S2 = torch.zeros(steps.shape, dtype=torch.float64)
        for i, s in enumerate(steps.tolist()):
            # Exponents s+1-j for j = 1..s, i.e. s, s-1, ..., 1.
            exponents = torch.arange(s, 0, -1, dtype=torch.long)
            weights = (1 - torch.pow(self.lamb, exponents)) / (1 - self.lamb)
            S2[i] = torch.sum(lr_drop[:s] * weights)

        pred = self.L0 + self.A * S1 ** (-self.alpha) - self.C * S2
        # Clamp so the log stays finite when the prediction is non-positive.
        r = torch.log(self.data_loss[steps]) - torch.log(pred.clamp(min=1e-10))

        # Huber loss on the log-residual.
        delta = 1e-3
        abs_r = torch.abs(r)
        huber_loss = torch.where(abs_r < delta, 0.5 * r ** 2, delta * (abs_r - 0.5 * delta))

        # Plot original, smoothed and predicted loss. Convert to NumPy explicitly
        # instead of relying on matplotlib's implicit tensor conversion.
        x = steps.numpy()
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.plot(x, self.data_loss_ori[steps].numpy(), label=f"{self.fig_id}: original", linestyle="-", linewidth=1.5)
        ax.plot(x, self.data_loss[steps].numpy(), label=f"{self.fig_id}: filtered", linestyle="--", linewidth=1.5)
        ax.plot(x, pred.detach().numpy(), label=f"{self.fig_id}: predict", linestyle="--", linewidth=1.5)
        ax.legend()
        ax.set_xlabel('Step')
        ax.set_ylabel('Loss')
        ax.set_title(f"LRS: {self.fig_id}")
        fig.savefig(f"tissue_eval_{self.fig_id}.png")
        plt.close(fig)

        return huber_loss.sum()

定义LRS优化方法

In [None]:
def lrs_opt(L0, A, alpha, C, lamb,
        total_steps=24000,
        peak_lr=3e-4,
        min_lr=1e-10,
        lr=5e-9,
        max_steps=10000,
        warmup=2160,
        ):
    '''
    Optimize the post-warmup learning-rate schedule to minimize the predicted
    final loss ``L0 + A * S1**(-alpha) - C * S2``.

    The schedule is parameterized by non-negative per-step reductions ``delta``
    with ``eta = peak_lr - cumsum(delta)``, so it is non-increasing, and it is
    clamped from below at ``min_lr``.

    Args:
        L0, A, alpha, C, lamb: Fitted parameters of the loss model.
        total_steps (int): Total steps in the schedule (including warmup).
        peak_lr (float): Peak learning rate reached at the end of warmup.
        min_lr (float): Minimum learning rate threshold.
        lr (float): Learning rate of the Adam optimizer acting on ``delta``.
        max_steps (int): Maximum optimization steps.
        warmup (int): Number of warmup steps; their contribution to S1 is
            approximated as ``0.5 * peak_lr * warmup``.

    Returns:
        None. The optimized schedule is written to ``tissue_opt_lr.png`` and
        its predicted final loss is printed.

    Raises:
        ValueError: If ``total_steps <= warmup``.
    '''
    n_decay = int(total_steps - warmup)
    if n_decay < 1:
        raise ValueError("total_steps must be larger than warmup")

    # Initialize Delta (learnable LR reductions)
    delta = nn.Parameter(torch.zeros(n_decay, dtype=torch.float64), requires_grad=True)
    warmup_bias = 0.5 * peak_lr * warmup
    optimizer = torch.optim.Adam([delta], lr=lr)

    # Weights (1 - lamb**(s+1-j)) / (1 - lamb) for j = 1..s with s = n_decay - 1.
    # They do not depend on delta, so they are built once, and in float64:
    # torch.pow(python_float, int_tensor) would silently produce float32.
    lamb_t = torch.as_tensor(lamb, dtype=torch.float64)
    exponents = torch.arange(n_decay - 1, 0, -1, dtype=torch.float64)
    weights = (1 - torch.pow(lamb_t, exponents)) / (1 - lamb_t)

    def _final_loss(eta):
        # Predicted loss at the last step of the schedule `eta`.
        S1_end = torch.sum(eta) + warmup_bias
        S2_end = torch.sum((eta[:-1] - eta[1:]) * weights)
        return L0 + A * S1_end ** (-alpha) - C * S2_end

    def _schedule(reductions):
        # LR schedule implied by the reductions, clamped at min_lr.
        return torch.clamp(peak_lr - torch.cumsum(reductions, dim=0), min=min_lr)

    # Optimization loop
    for _ in tqdm(range(max_steps), desc="Optimizing LR Schedule"):
        optimizer.zero_grad()

        # Compute LR schedule from Delta and minimize the predicted final loss.
        pred = _final_loss(_schedule(delta.clamp(min=0)))
        pred.backward()
        optimizer.step()

        # Enforce constraints: 0 <= delta <= peak_lr, and no further reduction
        # at positions where the schedule has already reached min_lr.
        with torch.no_grad():
            delta.clamp_(min=0, max=peak_lr)
            eta = peak_lr - torch.cumsum(delta, dim=0)
            delta.masked_fill_(eta <= min_lr, 0)

    # Report the schedule defined by the final Delta together with its own
    # loss, so the two always agree (also well-defined when max_steps == 0).
    with torch.no_grad():
        final_eta = _schedule(delta)
        loss = _final_loss(final_eta).item()
        opt_lr = final_eta.detach().numpy()

    # Result: opt_lr (can be persisted with np.save if needed)
    plt.figure(figsize=(8, 6))
    plt.plot(np.arange(warmup, total_steps), opt_lr)
    plt.grid(True)
    plt.xlabel("Step")
    plt.ylabel("Learning rate")
    plt.title("Optimized learning rate schedule")
    plt.savefig("tissue_opt_lr.png")
    plt.close()
    print(f"Final loss for optimal LRS: {loss}")
    return None

主函数：一个用于训练，两个用于测试

In [None]:
def main(df_train,df_test1,df_test2,num_steps=100,fig_ids=('811','cos','wsd')):
    """
    Fit the loss model on one curve, evaluate it on three curves and optimize
    a learning-rate schedule from the fitted parameters.

    Args:
        df_train: Curve used for fitting; needs ``step``, ``lr``, ``S1``,
            ``loss_filter`` and ``Metrics/loss`` columns.
        df_test1, df_test2: Held-out curves with the same columns.
        num_steps (int): Number of evenly spaced steps sampled for training.
        fig_ids: Plot labels for (df_train, df_test1, df_test2), in that order.

    Raises:
        RuntimeError: If the fit produced no usable parameters.
    """
    print(f"step.min()={df_train['step'].min()}, step.max()={df_train['step'].max()}, num_steps={num_steps}")
    # NOTE(review): the sampled 'step' values are used as row positions further
    # down - this assumes step == row position; confirm for the data at hand.
    step_batch = np.linspace(df_train['step'].min()+1, df_train['step'].max()-1, num_steps, dtype=int)

    # Parameter initialization
    init_params = initialize_params_tissue(df_train,step_batch)
    init_params['lamb'] = 0.995
    model=Model_tissue(df_train,**init_params)

    # Model fitting; best_params = [L0, A, alpha, C, lamb]
    step_batch = torch.tensor(step_batch,dtype=torch.int32)
    best_params, best_loss = model_fit(model,step_batch)
    if best_params is None:
        raise RuntimeError("model_fit did not return any parameters (no finite loss encountered)")

    # Check the fit on the training curve and on both held-out curves.
    for frame, fig_id in zip((df_train, df_test1, df_test2), fig_ids):
        evaluator = Model_eval_tissue(frame, fig_id, *best_params)
        # Every 100th step up to 30000, but never past the end of the curve.
        eval_steps = torch.arange(1, min(30000, len(frame)), 100, dtype=torch.int32)
        evaluator.forward(eval_steps)

    # LR schedule optimization
    warmup = 2000
    total_steps = df_train['step'].max() + 1 + warmup
    lrs_opt(*best_params,warmup=warmup,total_steps=total_steps)

主程序

In [None]:
# Entry point: fit on df_811, evaluate on df_cos and df_wsd.
main(
    df_train=df_811,
    df_test1=df_cos,
    df_test2=df_wsd,
    num_steps=100,
)