In [1]:
import os
import sys
import json
import random
import shutil
import gc

import numpy as np
import torch
from tqdm import tqdm
from scipy import linalg
import ot  # Python Optimal Transport

from datasets import load_from_disk, Dataset, load_dataset
from huggingface_hub import snapshot_download
from sentence_transformers import SentenceTransformer
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Local storage locations for the GPT-2 snapshot and the WikiText dataset.
MODEL_PATH = "/home/ubuntu/data/model/gpt2_model"
DATA_PATH = "/home/ubuntu/data/dataset/wikitext_dataset"

# --- 1. Download the model and tokenizer files ---
print("正在下载模型到:", MODEL_PATH)
# `local_dir_use_symlinks` is deliberately not passed: huggingface_hub has
# deprecated that argument and ignores it when `local_dir` is given, so the
# call below behaves the same without it.
snapshot_download(
    repo_id="gpt2",
    local_dir=MODEL_PATH,
)

# --- 2. Download and save the dataset ---
print("正在下载数据集到:", DATA_PATH)
# Load the dataset through `load_dataset` first, then write it to the
# chosen directory with `save_to_disk`.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
dataset.save_to_disk(DATA_PATH)

print("下载完成！")

正在下载模型到: /home/ubuntu/data/model/gpt2_model


Fetching 26 files: 100%|██████████| 26/26 [1:28:17<00:00, 203.76s/it]   


正在下载数据集到: /home/ubuntu/data/dataset/wikitext_dataset


Generating test split: 100%|██████████| 4358/4358 [00:00<00:00, 28549.30 examples/s]
Generating train split: 100%|██████████| 36718/36718 [00:00<00:00, 636106.42 examples/s]
Generating validation split: 100%|██████████| 3760/3760 [00:00<00:00, 485966.44 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 4358/4358 [00:00<00:00, 566555.40 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 36718/36718 [00:00<00:00, 1225434.29 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3760/3760 [00:00<00:00, 602275.46 examples/s]

下载完成！





In [3]:
# ================= Configuration =================
# 1. Name of the model to download (recommended on the SBERT leaderboard)
MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'

# 2. Absolute local path the model is saved to
SAVE_PATH = '/home/ubuntu/data/model/all-mpnet-base-v2'
# =================================================

def download_and_save(model_name=None, save_path=None):
    """Load a SentenceTransformer model by name and save it to a local directory.

    Args:
        model_name: Identifier passed to ``SentenceTransformer``. When ``None``,
            the module-level ``MODEL_NAME`` is used (looked up at call time).
        save_path: Directory handed to ``model.save``. When ``None``, the
            module-level ``SAVE_PATH`` is used (looked up at call time).

    Returns:
        None. Progress is reported with ``print``.
    """
    # Resolve the defaults at call time rather than in the signature, so that
    # rebinding MODEL_NAME / SAVE_PATH before a zero-argument call still takes
    # effect, exactly as it did when the function read the globals directly.
    if model_name is None:
        model_name = MODEL_NAME
    if save_path is None:
        save_path = SAVE_PATH

    print(f">>> 开始下载模型: {model_name}")
    print(f">>> 目标路径: {save_path}")

    # Create the target directory if it does not exist yet.
    os.makedirs(save_path, exist_ok=True)

    # Load the model (downloads it automatically).
    model = SentenceTransformer(model_name)

    # Save the model locally.
    model.save(save_path)
    print(">>> 模型下载并保存成功！")

# Run the download with the configured defaults when executed as a script or
# as a notebook cell.
if __name__ == "__main__":
    download_and_save()

>>> 开始下载模型: sentence-transformers/all-mpnet-base-v2
>>> 目标路径: /home/ubuntu/data/model/all-mpnet-base-v2


Loading weights: 100%|██████████| 199/199 [00:00<00:00, 473.17it/s, Materializing param=pooler.dense.weight]                        
MPNetModel LOAD REPORT from: sentence-transformers/all-mpnet-base-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.51it/s]

>>> 模型下载并保存成功！





In [5]:
# Local directory for the gpt2-large snapshot.
# NOTE: this rebinds MODEL_PATH, which an earlier cell set to the GPT-2 base
# model directory; anything executed after this cell sees the gpt2-large path.
MODEL_PATH = "/home/ubuntu/data/model/gpt2_large"

print("正在下载模型到:", MODEL_PATH)
# `local_dir_use_symlinks` is deliberately not passed: huggingface_hub has
# deprecated that argument and ignores it when `local_dir` is given.
# The call stays the cell's last expression so the notebook displays the
# path it returns.
snapshot_download(
    repo_id="gpt2-large",
    local_dir=MODEL_PATH,
)

正在下载模型到: /home/ubuntu/data/model/gpt2_large


Fetching 29 files: 100%|██████████| 29/29 [06:50<00:00, 14.15s/it]


'/home/ubuntu/data/model/gpt2_large'