In [2]:
# Configure the HTTPS proxy and the Hugging Face mirror endpoint.
# NOTE: these must be assigned BEFORE transformers is imported. huggingface_hub
# reads HF_ENDPOINT once, at import time, so setting it after the import (as this
# cell used to do) leaves downloads pointed at the default endpoint.
import os

os.environ["HTTPS_PROXY"] = "http://10.161.0.82:7899/"
os.environ["HF_ENDPOINT"] = "https://hf.neolink-ai.com"

import time

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline


In [None]:
# Sentiment analysis with a transformers pipeline.
# The checkpoint is named explicitly: without a model argument, pipeline() picks a
# library default and warns that relying on it is not recommended. This is the same
# SST-2 DistilBERT checkpoint that the following cells load by hand.
# The device falls back to CPU so the cell still runs on a machine without a GPU.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

print(classifier(["i'm so happy", "i'm so sad"]))

In [None]:
# Release cached, currently unoccupied CUDA memory before loading another model.
torch.cuda.empty_cache()

# Build a text-generation pipeline for the "meta-llama/Llama-3.2-1B" checkpoint,
# placed on the GPU in bfloat16, with max_length=200 passed to the pipeline.
generator = pipeline(
    task="text-generation",
    model="meta-llama/Llama-3.2-1B",
    device="cuda",
    torch_dtype=torch.bfloat16,
    max_length=200,
)

# Generate a continuation for a single prompt and show the raw pipeline output.
prompt = "can you tell me something about the great wall of china"
print(generator(prompt))


In [None]:
# Run the sentiment checkpoint by hand: tokenizer -> model -> raw outputs.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

# Load the tokenizer and the sequence-classification model for the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Encode one sentence as PyTorch tensors.
inputs = tokenizer("i'm so happy", return_tensors="pt")
print(inputs)

# This is inference only, so skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)
print(outputs)

# The trailing time.sleep(100000) was removed: it blocked the kernel for roughly
# 28 hours and made "Restart & Run All" unusable.

In [None]:
from transformers import AutoModel, AutoTokenizer

# Load the tokenizer for the checkpoint used in this cell.
pretrained_model = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

# Tokenize a two-sentence batch into PyTorch tensors, with padding and truncation on.
raw_inputs = ["i'm looking forward to the weekend", "i'm so sad"]
inputs = tokenizer(
    raw_inputs,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
print(inputs)

# Load the checkpoint through AutoModel and print the module structure.
model = AutoModel.from_pretrained(pretrained_model)
print(model)

# Forward pass; show the shape of the last hidden state.
outputs = model(**inputs)
last_hidden_state = outputs.last_hidden_state
print(last_hidden_state.shape)

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the Llama 3.1 8B checkpoint.
pretrained_model = "meta-llama/Llama-3.1-8B"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

# A one-element batch holding the sentence to encode.
sequence = ["Using a Transformer network is simple"]

# Encode the batch into PyTorch tensors and display the encoding.
encoded_sequences = tokenizer(
    sequence,
    return_tensors="pt",
)
print(encoded_sequences)


In [None]:
# Naive word-level tokenization: split the only sentence on whitespace.
sentence = ["Jim Henson was a puppeteer"]

# str.split() without a separator splits on runs of whitespace.
words = str.split(sentence[0])
print(words)


In [None]:
from transformers import AutoTokenizer

pretrained_model = "meta-llama/Llama-3.1-8B"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
# Reuse the EOS token as the padding token so that padding can be requested below.
tokenizer.pad_token = tokenizer.eos_token

sentence = ["Using a Transformer network is simple"]

# Encode with padding="max_length" and truncation enabled, returning PyTorch tensors.
encoded_sequences = tokenizer(
    sentence,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)
print(encoded_sequences)

# Original author's note: the input length of Llama 3.1 8B is 128k.
# Print the shape of the padded input_ids tensor to see the resulting length.
input_ids_shape = encoded_sequences.input_ids.shape
print(input_ids_shape)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Minimal fine-tuning demo: repeatedly optimize the model on one labelled batch of
# two sentences and print the loss after every step.
pretrained_model = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model)


sentences = ["i'm so happy, when i'm with you", "i'm so sad, when i'm alone"]
batch_inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)
# One label per sentence; passing "labels" makes the model return a loss.
batch_inputs["labels"] = torch.tensor([1, 0])
print(batch_inputs)

# Use PyTorch's AdamW: the AdamW class that transformers used to export was
# deprecated and has been removed from recent releases, so importing it fails there.
# weight_decay=0.0 keeps the default of the old transformers optimizer
# (torch.optim.AdamW would otherwise default to 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)

# Training mode only needs to be enabled once; nothing in the loop switches it off.
model.train()
for epoch in range(100):
    optimizer.zero_grad()
    outputs = model(**batch_inputs)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch} loss: {loss.item()}")


In [None]:
import os

# Proxy / mirror settings go first: huggingface_hub reads HF_ENDPOINT at import
# time, so they must be in place before datasets / transformers are imported.
os.environ["HTTPS_PROXY"] = "http://10.161.0.82:7899/"
os.environ["HF_ENDPOINT"] = "https://hf.neolink-ai.com"

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from transformers import DataCollatorWithPadding

# Load the GLUE MRPC dataset and keep a handle on each split.
raw_dataset = load_dataset("glue", "mrpc")

raw_train_dataset = raw_dataset["train"]
raw_test_dataset = raw_dataset["test"]
raw_validation_dataset = raw_dataset["validation"]

checkpoint = "meta-llama/Llama-3.1-8B"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Reuse the EOS token as the padding token so the collator can pad batches.
tokenizer.pad_token = tokenizer.eos_token
# The AutoModel.from_pretrained(checkpoint) call that used to follow was dropped:
# nothing in this cell uses the model, and loading an 8B checkpoint only to discard
# it costs a very large download and a lot of memory. Only the tokenizer is needed.


def tokenize_function(examples):
    """Tokenize a batch of sentence pairs with truncation enabled.

    Used with Dataset.map so the result stays a dataset, with the tokenizer's
    output fields (input_ids, attention_mask, ...) added as new columns.
    """
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)


tokenized_datasets = raw_dataset.map(tokenize_function, batched=True)
# Drop the raw text / index columns and rename "label" to "labels".
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

# Take the first 8 training examples and let the collator pad them into one batch.
sample = tokenized_datasets["train"][:8]

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

batch = data_collator(sample)

for key, value in batch.items():
    print(f"{key} shape: {value.shape}")


In [None]:
# A complete fine-tuning workflow using the Trainer API.
import os

# Proxy / mirror configuration, assigned before the Hugging Face imports below.
os.environ["HTTPS_PROXY"] = "http://10.161.0.82:7899/"
os.environ["HF_ENDPOINT"] = "https://hf.neolink-ai.com"

from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

# Data and tokenizer.
checkpoint = "bert-base-uncased"
raw_datasets = load_dataset("glue", "mrpc")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the sentence1/sentence2 pairs of a batch, with truncation enabled."""
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training configuration: only the output directory is set explicitly.
# NOTE(review): this is an absolute, machine-specific path.
training_args = TrainingArguments(
    output_dir="/home/chendan/Desktop/workspace/workspace/checkpoints/test-trainer"
)

# Sequence-classification model with two labels.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

trainer.train()

In [None]:
# A complete full fine-tuning workflow with a hand-written PyTorch training loop.
# This part prepares data, model, optimizer, scheduler, device, metric and progress bar.

import os

# Proxy / mirror configuration, assigned before the Hugging Face imports below.
os.environ["HTTPS_PROXY"] = "http://10.161.0.82:7899/"
os.environ["HF_ENDPOINT"] = "https://hf.neolink-ai.com"

import evaluate
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    get_scheduler,
)

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the sentence1/sentence2 pairs of a batch, with truncation enabled."""
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Keep only the columns the model consumes, rename "label" to "labels",
# and have the dataset return PyTorch tensors.
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Use PyTorch's AdamW: the AdamW class that transformers used to export was
# deprecated and has been removed from recent releases, so importing it fails there.
# weight_decay=0.0 keeps the default of the old transformers optimizer
# (torch.optim.AdamW would otherwise default to 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)

# Linear learning-rate schedule with no warmup, spanning every optimizer step.
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

metric = evaluate.load("glue", "mrpc")

progress_bar = tqdm(range(num_training_steps))

# Global optimizer-step counter, shared across epochs (used for periodic loss logging).
step = 0
for epoch in range(num_epochs):
    # ---- training pass over the whole training set ----
    model.train()
    for batch in train_dataloader:
        step += 1
        # Move every tensor of the batch onto the training device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the LR schedule, then clear the gradients.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

        # Log the loss every 100 steps.
        if not step % 100:
            print(f"Epoch {epoch} step {step} loss: {loss.item()}")

    # ---- evaluation pass over the validation set after each epoch ----
    model.eval()
    for batch in eval_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            # Predicted class = index of the largest logit.
            predictions = torch.argmax(outputs.logits, dim=-1)
            metric.add_batch(predictions=predictions, references=batch["labels"])
    eval_metric = metric.compute()
    print(eval_metric)