In [58]:
import pandas as pd
import plotly.graph_objects as go
import plotly as py

In [59]:
# Load the four CSV exports used throughout the notebook. The paths are relative
# to the working directory and the targets are bound in the same order as the paths.
(
    moe_search_data,
    nonmoe_search_data,
    final_results_data,
    final_results_loss_data,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "moe_customer_hyperparam_search.csv",
        "nonmoe_customer_hyperparam_search.csv",
        "final_results_comparison.csv",
        "graph_final_results.csv",
    )
)

In [60]:
# Clean the MoE sweep export in one chained expression:
#   1. drop the bookkeeping / sweep-configuration columns listed below,
#   2. leave out three runs identified by name (the notebook does not record why),
#   3. strip the brackets and double quotes from `lora_target`, so a value such as
#      ["all_linear","output"] becomes all_linear,output.
#
# The cell rebinds `moe_search_data`, so every step is written to be repeatable:
# executing the cell a second time yields the same frame instead of failing.
moe_search_data = (
    moe_search_data.drop(
        columns=[
            "State",
            "Notes",
            "User",
            "Tags",
            "Created",
            "Sweep",
            "lora_ckpt_path",
            "model_name",
            "num_workers_dataloader",
            "output_dir",
            "run_validation",
            "save_metrics",
            "save_model",
            "seed",
            "use_wandb",
            "val_batch_size",
            "batch_size_training",
            "batching_strategy",
            "command",
            "context_length",
            "max_eval_step",
            "max_train_step",
            "method",
            "metric.goal",
            "metric.name",
            "parameters.batch_size_training",
            "parameters.batching_strategy",
            "parameters.context_length",
            "parameters.dataset",
            "parameters.gamma",
            "parameters.gradient_accumulation_steps",
            "parameters.gradient_accumulation_stepss",
            "parameters.gradient_clipping",
            "parameters.gradient_clipping_threshold",
            "parameters.lora_alpha",
            "parameters.lora_alphas",
            "parameters.lora_dropout",
            "parameters.lora_r",
            "parameters.lora_targets",
            "parameters.lrs",
            "parameters.max_eval_step",
            "parameters.max_train_step",
            "parameters.model_type",
            "parameters.num_epochs",
            "parameters.num_experts",
            "parameters.num_expertss",
            "parameters.num_experts_per_tok",
            "parameters.num_experts_per_toks",
            "parameters.num_workers_dataloader",
            "parameters.output_dir",
            "parameters.penalty_alpha",
            "parameters.penalty_alphas",
            "parameters.project",
            "parameters.quant_type",
            "parameters.run_validation",
            "parameters.save_metrics",
            "parameters.save_model",
            "parameters.seed",
            "parameters.use_moe",
            "parameters.use_profiler",
            "parameters.use_wandb",
            "parameters.val_batch_size",
            "parameters.weight_decay",
            "profiler_dir",
            "program",
            "project",
            "use_profiler",
            "parameters.lr",
            "train/epoch",
            "train/step",
        ],
        # Columns that are already gone are skipped; without this, running the
        # cell twice raises KeyError because the first run removed them.
        errors="ignore",
    )
    .loc[
        lambda runs: ~runs["Name"].isin(
            ["genial-sweep-1", "sweepy-sweep-6", "lucky-sweep-6"]
        )
    ]
    .assign(
        # One pass over the column removes all three characters: [ ] and ".
        lora_target=lambda runs: runs["lora_target"].str.replace(
            r'[\[\]"]', "", regex=True
        )
    )
)

In [61]:
# Parallel-coordinates overview of the MoE sweep: one line per run, coloured by
# its min-max-normalised eval/loss.
visual_df = moe_search_data.copy()

# Parcoords axes are numeric, so every LoRA target combination gets an integer
# code; the codes line up with tickvals/ticktext of the first axis below.
lora_target_map = {
    "all_linear,embed,output": 1,
    "all_linear,output": 2,
    "all_linear": 3,
    "ffn": 4,
}
visual_df["lora_target"] = visual_df["lora_target"].map(lora_target_map)

colorscale = [[0.0, "orange"], [0.1, "magenta"], [1.0, "white"]]

# Rescale eval/loss to [0, 1] to match the fixed cmin/cmax passed to the line style.
color_data = visual_df["eval/loss"]
cmin, cmax = color_data.min(), color_data.max()
normalized_color_data = (color_data - cmin) / (cmax - cmin)

fig = go.Figure(
    data=go.Parcoords(
        line={
            "color": normalized_color_data,
            "colorscale": colorscale,
            "showscale": False,
            "cmin": 0,
            "cmax": 1,
        },
        dimensions=[
            {
                "label": "Warstwy LoRA",
                "values": visual_df["lora_target"],
                "tickvals": [1, 2, 3, 4],
                "ticktext": [
                    "Warstwy liniowe, embedding, output",
                    "Warstwy liniowe, output",
                    "Warstwy liniowe",
                    "Warstwa MoE",
                ],
            },
            # Hyper-parameter axes: ticks sit only on the values present in the sweep.
            *(
                {
                    "label": axis_label,
                    "values": visual_df[column],
                    "tickvals": visual_df[column].unique(),
                }
                for axis_label, column in (
                    ("LoRA alpha", "lora_alpha"),
                    ("Liczba ekspertów", "num_experts"),
                    ("Liczba ekspertów na token", "num_experts_per_tok"),
                    ("Kara alpha", "penalty_alpha"),
                    ("Kroki akumulacji gradientu", "gradient_accumulation_steps"),
                    ("Współczynnik uczenia", "lr"),
                )
            ),
            # The loss axis keeps plotly's automatic ticks.
            {"label": "Wartość straty", "values": visual_df["eval/loss"]},
        ],
    )
)

fig.update_layout(
    margin={"l": 160, "r": 40, "t": 40, "b": 20},
    height=400,
    width=1000,
    font={"family": "Times New Roman", "size": 12, "color": "black"},
)

fig.show()

In [62]:
# Table of the MoE runs whose eval/loss is below 0.604, restricted to the
# loss and the hyper-parameter columns.
visual_df = moe_search_data.loc[
    moe_search_data["eval/loss"] < 0.604,
    [
        "eval/loss",
        "lora_r",
        "lora_alpha",
        "lora_target",
        "lora_dropout",
        "lr",
        "gradient_accumulation_steps",
        "weight_decay",
        "gradient_clipping",
        "num_experts",
        "num_experts_per_tok",
        "penalty_alpha",
    ],
]

visual_df

Unnamed: 0,eval/loss,lora_r,lora_alpha,lora_target,lora_dropout,lr,gradient_accumulation_steps,weight_decay,gradient_clipping,num_experts,num_experts_per_tok,penalty_alpha
0,0.600337,16,128,"all_linear,output",0.1,0.0001,64,0,False,8,4,0.01
1,0.602417,16,128,"all_linear,output",0.1,0.0001,64,0,False,8,2,0.01
2,0.602666,16,128,"all_linear,output",0.1,0.0001,64,0,False,8,4,0.01
3,0.60269,16,128,"all_linear,output",0.1,0.0001,64,0,False,4,4,0.01
4,0.603138,16,128,"all_linear,output",0.1,0.0001,64,0,False,4,2,0.01
5,0.603321,16,128,ffn,0.1,0.0001,64,0,False,8,4,0.01


In [63]:
# Clean the dense (non-MoE) sweep export in one chained expression:
#   1. drop the bookkeeping / sweep-configuration columns listed below,
#   2. leave out three runs identified by name (the notebook does not record why),
#   3. strip the brackets and double quotes from `lora_target`, so a value such as
#      ["all_linear","output"] becomes all_linear,output.
#
# The cell rebinds `nonmoe_search_data`, so every step is written to be repeatable:
# executing the cell a second time yields the same frame instead of failing.
nonmoe_search_data = (
    nonmoe_search_data.drop(
        columns=[
            "train/step",
            "train/epoch",
            "parameters.weight_decay",
            "parameters.penalty_alpha",
            "parameters.lr",
            "parameters.lora_dropout",
            "parameters.lora_alpha",
            "parameters.gradient_clipping",
            "parameters.gradient_accumulation_steps",
            "profiler_dir",
            "program",
            "project",
            "run_validation",
            "save_metrics",
            "save_model",
            "seed",
            "use_profiler",
            "use_wandb",
            "val_batch_size",
            "use_moe",
            "quant_type",
            "parameters.weight_decays",
            "parameters.val_batch_size",
            "parameters.use_wandb",
            "parameters.use_profiler",
            "parameters.use_moe",
            "parameters.seed",
            "parameters.save_model",
            "parameters.save_metrics",
            "parameters.run_validation",
            "parameters.quant_type",
            "parameters.project",
            "parameters.output_dir",
            "parameters.num_workers_dataloader",
            "parameters.num_experts_per_tok",
            "parameters.num_experts",
            "parameters.num_epochs",
            "parameters.model_type",
            "parameters.max_val_step",
            "parameters.max_train_step",
            "parameters.max_eval_step",
            "parameters.lrs",
            "parameters.lora_targets",
            "parameters.lora_rs",
            "parameters.lora_dropouts",
            "parameters.lora_alphas",
            "parameters.gradient_clipping_threshold",
            "parameters.gradient_clippings",
            "parameters.gradient_accumulation_stepss",
            "parameters.gamma",
            "parameters.dataset",
            "parameters.context_length",
            "parameters.batching_strategy",
            "parameters.batch_size_training",
            "output_dir",
            "num_workers_dataloader",
            "num_experts_per_tok",
            "num_experts",
            "num_epochs",
            "model_type",
            "model_name",
            "metric.name",
            "metric.goal",
            "method",
            "max_val_step",
            "max_train_step",
            "max_eval_step",
            "lora_ckpt_path",
            "dataset",
            "command",
            "batching_strategy",
            "context_length",
            "batch_size_training",
            "Sweep",
            "Tags",
            "User",
            "Notes",
            "State",
            "penalty_alpha",
        ],
        # Columns that are already gone are skipped; without this, running the
        # cell twice raises KeyError because the first run removed them.
        errors="ignore",
    )
    .loc[
        lambda runs: ~runs["Name"].isin(
            ["vague-sweep-1", "smart-sweep-2", "toasty-sweep-9"]
        )
    ]
    .assign(
        # One pass over the column removes all three characters: [ ] and ".
        lora_target=lambda runs: runs["lora_target"].str.replace(
            r'[\[\]"]', "", regex=True
        )
    )
)

In [64]:
# Parallel-coordinates overview of the dense (non-MoE) sweep: one line per run,
# coloured by its min-max-normalised eval/loss. Only runs with eval/loss below 5
# are plotted.
visual_df = nonmoe_search_data.loc[nonmoe_search_data["eval/loss"] < 5].copy()

# Parcoords axes are numeric, so every LoRA target combination gets an integer
# code; both orderings of the three-way combination share code 1.
lora_target_map = {
    "all_linear,embed,output": 1,
    "all_linear,output,embed": 1,
    "all_linear,output": 2,
    "all_linear,embed": 3,
    "all_linear": 4,
    "ffn": 5,
}
visual_df["lora_target"] = visual_df["lora_target"].map(lora_target_map)

# Same idea for the boolean flag: 0/1 codes matched by the "Nie"/"Tak" ticks below.
gradient_clip_map = {True: 1, False: 0}
visual_df["gradient_clipping"] = visual_df["gradient_clipping"].map(gradient_clip_map)

colorscale = [[0.0, "orange"], [0.5, "magenta"], [1.0, "white"]]

# Rescale eval/loss to [0, 1] to match the fixed cmin/cmax passed to the line style.
color_data = visual_df["eval/loss"]
cmin, cmax = color_data.min(), color_data.max()
normalized_color_data = (color_data - cmin) / (cmax - cmin)

fig = go.Figure(
    data=go.Parcoords(
        line={
            "color": normalized_color_data,
            "colorscale": colorscale,
            "showscale": False,
            "cmin": 0,
            "cmax": 1,
        },
        dimensions=[
            {
                "label": "Warstwy LoRA",
                "values": visual_df["lora_target"],
                "tickvals": [1, 2, 3, 4, 5],
                "ticktext": [
                    "Warstwy liniowe, embedding, output",
                    "Warstwy liniowe, output",
                    "Warstwy liniowe, embedding",
                    "Warstwy liniowe",
                    "Warstwy FFN",
                ],
            },
            # Hyper-parameter axes: ticks sit only on the values present in the sweep.
            *(
                {
                    "label": axis_label,
                    "values": visual_df[column],
                    "tickvals": visual_df[column].unique(),
                }
                for axis_label, column in (
                    ("LoRA r", "lora_r"),
                    ("LoRA alpha", "lora_alpha"),
                    ("LoRA dropout", "lora_dropout"),
                    ("Kroki akumulacji gradientu", "gradient_accumulation_steps"),
                    ("Zanik wag", "weight_decay"),
                )
            ),
            {
                "label": "Przycinanie gradientu",
                "values": visual_df["gradient_clipping"],
                "tickvals": [0, 1],
                "ticktext": ["Nie", "Tak"],
            },
            {
                "label": "Współczynnik uczenia",
                "values": visual_df["lr"],
                "tickvals": visual_df["lr"].unique(),
            },
            # The loss axis keeps plotly's automatic ticks.
            {"label": "Wartość straty", "values": visual_df["eval/loss"]},
        ],
    )
)

fig.update_layout(
    margin={"l": 160, "r": 40, "t": 40, "b": 20},
    height=400,
    width=1000,
    font={"family": "Times New Roman", "size": 12, "color": "black"},
)

fig.show()

In [48]:
# Table of the dense (non-MoE) runs whose eval/loss is below 0.61, restricted to
# the loss and the hyper-parameter columns.
visual_df = nonmoe_search_data.loc[
    nonmoe_search_data["eval/loss"] < 0.61,
    [
        "eval/loss",
        "lora_r",
        "lora_alpha",
        "lora_target",
        "lora_dropout",
        "lr",
        "gradient_accumulation_steps",
        "weight_decay",
        "gradient_clipping",
    ],
]

visual_df

Unnamed: 0,eval/loss,lora_r,lora_alpha,lora_target,lora_dropout,lr,gradient_accumulation_steps,weight_decay,gradient_clipping
0,0.600568,32,128,"all_linear,output",0.1,0.0001,64,0.0,False
1,0.601103,16,128,"all_linear,output",0.1,0.0001,64,0.0,False
2,0.602716,8,128,"all_linear,output",0.1,0.0001,64,0.0,False
3,0.604068,16,128,ffn,0.1,0.0001,64,0.0,False


In [49]:
# Display the final-run comparison: run name, achieved eval/loss and the
# hyper-parameters of each run (all rows, selected columns only).
final_results_data.loc[
    :,
    [
        "Name",
        "eval/loss",
        "lora_r",
        "lora_alpha",
        "lora_dropout",
        "lr",
        "gradient_accumulation_steps",
        "weight_decay",
        "gradient_clipping",
        "lora_target",
        "num_experts",
        "num_experts_per_tok",
        "penalty_alpha",
    ],
]

Unnamed: 0,Name,eval/loss,lora_r,lora_alpha,lora_dropout,lr,gradient_accumulation_steps,weight_decay,gradient_clipping,lora_target,num_experts,num_experts_per_tok,penalty_alpha
0,moe_instruct_mixed_higher_lr,0.847494,16,128,0.1,0.0002,64,0,False,"[""all_linear"",""output""]",8,3,0.01
1,instruct_mixed_higher_lr,0.863773,16,128,0.1,0.0002,64,0,False,"[""all_linear"",""output""]",8,3,0.01
2,moe_pretrained_mixed,0.779777,16,128,0.1,0.0001,64,0,False,"[""all_linear"",""output""]",8,3,0.01
3,pretrained_mixed,0.777839,16,128,0.1,0.0001,64,0,False,"[""all_linear"",""output""]",8,3,0.01
4,moe_instruct_mixed,0.752569,16,128,0.1,0.0001,64,0,False,"[""all_linear"",""output""]",8,3,0.01
5,instruct_mixed,0.752512,16,128,0.1,0.0001,64,0,False,"[""all_linear"",""output""]",8,3,0.01
6,moe_instruct_customer,0.579216,16,128,0.1,0.0001,64,0,False,"[""all_linear"",""output""]",8,3,0.01
7,moe_pretrained_customer,0.584177,16,128,0.1,0.0001,64,0,False,"[""all_linear"",""output""]",8,3,0.01
8,pretrained_customer,0.580371,16,128,0.1,0.0001,64,0,False,"[""all_linear"",""output""]",4,2,0.01
9,instruct_customer,0.575874,16,128,0.1,0.0001,64,0,False,"[""all_linear"",""output""]",4,2,0.01


In [50]:
# Keep only the columns whose name contains none of "MIN", "MAX" or "higher_lr".
# NOTE(review): MIN/MAX are presumably the min/max companion columns of the
# exported loss curves - confirm against the CSV.
final_results_loss_data = final_results_loss_data[
    [
        column
        for column in final_results_loss_data.columns
        if not any(marker in column for marker in ("MIN", "MAX", "higher_lr"))
    ]
]

In [51]:
# Split the loss curves by dataset. Each frame keeps the columns whose name
# contains the dataset tag, plus any column containing "Step" as the shared x-axis.
final_loss_customer = final_results_loss_data.loc[
    :,
    [
        ("customer" in column) or ("Step" in column)
        for column in final_results_loss_data.columns
    ],
]

final_loss_mixed = final_results_loss_data.loc[
    :,
    [
        ("mixed" in column) or ("Step" in column)
        for column in final_results_loss_data.columns
    ],
]

In [52]:
# Replace the exported "<run> - eval/loss" column names with the short labels
# used in the plot legend and in the tables.
final_loss_customer = final_loss_customer.rename(
    {
        "moe_instruct_customer - eval/loss": "MoE Instruct",
        "moe_pretrained_customer - eval/loss": "MoE Pretrained",
        "pretrained_customer - eval/loss": "Gęsty Pretrained",
        "instruct_customer - eval/loss": "Gęsty Instruct",
    },
    axis="columns",
)

final_loss_customer

Unnamed: 0,Step,MoE Instruct,MoE Pretrained,Gęsty Pretrained,Gęsty Instruct
0,151,1.162848,1.229561,1.280348,1.19114
1,1001,0.790195,0.836766,0.857462,0.817598
2,3177,,,,
3,6354,,,,
4,10748,0.601352,0.610394,0.608301,0.598462
5,21497,0.579216,0.584177,0.580371,0.575874


In [53]:
# Eval-loss curves of the customer-data runs: one line+marker trace per run,
# plotted against the training step.
# `go` (plotly.graph_objects) comes from the import cell at the top of the
# notebook, so this cell carries no imports of its own.
fig = go.Figure()

# connectgaps draws the line across steps at which a run has no value (NaN).
for run_label, run_losses in final_loss_customer.drop(columns=["Step"]).items():
    fig.add_trace(
        go.Scatter(
            x=final_loss_customer["Step"],
            y=run_losses,
            mode="lines+markers",
            name=run_label,
            connectgaps=True,
        )
    )

# Autorange stays on, but the x-axis may never extend outside [0, 22000].
fig.update_xaxes(
    automargin=True,
    autorange=True,
    autorangeoptions_minallowed=0,
    autorangeoptions_maxallowed=22000,
)

fig.update_yaxes(
    automargin=True,
    autorange=True,
)

# Kept as the cell's last expression: its return value is what the notebook renders.
fig.update_layout(
    xaxis_title="Krok uczenia",
    yaxis_title="Wartość straty",
    width=1000,
    height=450,
    template="plotly_white",
    legend=dict(
        yanchor="top",
        y=0.95,
        xanchor="left",
        x=0.84,
    ),
)

In [54]:
# Loss reached at the last logged step of each customer run, shown as one row per run.
# tail(1) selects the final row whatever the number of logged steps is; the
# previous hard-coded offset only matched a frame with exactly six rows.
achieved_customer_loss = final_loss_customer.tail(1).drop(columns=["Step"]).transpose()
achieved_customer_loss

Unnamed: 0,5
MoE Instruct,0.579216
MoE Pretrained,0.584177
Gęsty Pretrained,0.580371
Gęsty Instruct,0.575874


In [55]:
# Replace the exported "<run> - eval/loss" column names with the short labels
# used in the plot legend and in the tables.
final_loss_mixed = final_loss_mixed.rename(
    columns={
        "moe_instruct_mixed - eval/loss": "MoE Instruct",
        "moe_pretrained_mixed - eval/loss": "MoE Pretrained",
        "pretrained_mixed - eval/loss": "Gęsty Pretrained",
        "instruct_mixed - eval/loss": "Gęsty Instruct",
    }
)

# Remove the steps at which none of the mixed runs has a value. Unlike a positional
# trim of the last two rows, this gives the same frame however often the cell runs.
# NOTE(review): assumes the two trailing steps hold no mixed-run values (the
# customer table shows the mirror-image gaps at steps 3177 and 6354) - confirm.
final_loss_mixed = final_loss_mixed.dropna(
    subset=final_loss_mixed.columns.drop("Step"), how="all"
)

final_loss_mixed

Unnamed: 0,Step,MoE Pretrained,Gęsty Pretrained,MoE Instruct,Gęsty Instruct
0,151,1.292431,1.305453,1.13631,1.165723
1,1001,0.961118,0.973948,0.876396,0.904158
2,3177,0.843484,0.817227,0.787228,0.794299
3,6354,0.779777,0.777839,0.752569,0.752512


In [56]:
# Eval-loss curves of the mixed-data runs: one line+marker trace per run column,
# plotted against the training step.
fig = go.Figure(
    data=[
        go.Scatter(
            x=final_loss_mixed["Step"],
            y=run_losses,
            mode="lines+markers",
            name=run_label,
            connectgaps=True,
        )
        for run_label, run_losses in final_loss_mixed.drop(columns=["Step"]).items()
    ]
)

# Autorange stays on, but the x-axis may never extend outside [0, 6500].
fig.update_xaxes(
    automargin=True,
    autorange=True,
    autorangeoptions={"minallowed": 0, "maxallowed": 6500},
)
fig.update_yaxes(automargin=True, autorange=True)

# Kept as the cell's last expression: its return value is what the notebook renders.
fig.update_layout(
    xaxis_title="Krok uczenia",
    yaxis_title="Wartość straty",
    width=1000,
    height=450,
    template="plotly_white",
    legend={
        "yanchor": "top",
        "y": 0.95,
        "xanchor": "left",
        "x": 0.84,
    },
)

In [57]:
# Loss reached at the last remaining step of each mixed run, shown as one row per run.
# tail(1) selects the final row of `final_loss_mixed` directly, instead of relying
# on the frame holding exactly four rows at this point.
achieved_mixed_loss = final_loss_mixed.tail(1).drop(columns=["Step"]).transpose()
achieved_mixed_loss