# 言語モデルのファインチューニング入門

In [None]:
#!pip install accelerate evaluate matplotlib scikit-learn

In [None]:

from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

from transformers.trainer_utils import set_seed

# Fix the random seed to 42
set_seed(42)




言語モデルを使ってプログラミング言語の判定を行う。

### データセットを準備する

In [None]:
# Load the "code-search-net/code_search_net" dataset.
# NOTE(review): trust_remote_code=True presumably allows the dataset's own
# loading script to run locally — confirm the source is trusted.
dataset = load_dataset("code-search-net/code_search_net",trust_remote_code=True)

In [None]:
# データの分布を見る
from collections import Counter
import matplotlib.pyplot as plt


def plot_dataset(dataset):
    """Draw a bar chart of the number of samples per language.

    Args:
        dataset: Either a dict-like collection of splits that contains a
            "train" split (its "train" split is counted), or a single
            split / mapping that exposes a "language" column directly.

    Raises:
        ValueError: If there are no language values to plot.
    """
    # Choose the split explicitly. The previous bare ``except:`` fell back to
    # ``dataset["language"]`` on *any* failure, hiding unrelated errors.
    if isinstance(dataset, dict) and "train" in dataset:
        split = dataset["train"]
    else:
        split = dataset

    # Count how often each language name occurs
    lang_counts = Counter(split["language"])
    if not lang_counts:
        raise ValueError("no language values to plot")

    # Order by descending sample count
    labels, values = zip(*lang_counts.most_common())

    # Draw the chart
    plt.figure(figsize=(10, 5))
    plt.bar(labels, values)
    plt.ylabel("Number of samples")
    # NOTE(review): the title mentions ">100 samples only" but no such filter
    # is applied in this function — confirm the intended wording.
    plt.title("Distribution of samples by language (>100 samples only)")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()

# Plot the language distribution of the loaded dataset
plot_dataset(dataset)

- 今回扱うデータセットは、言語ごとのサンプル数に偏りがある不均衡なデータセットである。
- 学習が現実的な時間で終わるように、各言語のサンプル数に上限を設ける（train / test は最大100件、validation は最大500件）。

In [None]:
# Show the dataset object as the cell output
dataset

### データセットの前処理

In [None]:
from collections import defaultdict
from datasets import Dataset, DatasetDict
import random

# 元の DatasetDict を使って分割取得
# Pull the three splits out of the original DatasetDict
train_dataset, test_dataset, val_dataset = (
    dataset[split_name] for split_name in ("train", "test", "validation")
)

# Convert each split to a pandas DataFrame
df_train, df_test, df_val = (
    split.to_pandas() for split in (train_dataset, test_dataset, val_dataset)
)

def sample_per_language(df, is_validation=False, *, n_samples=None, random_state=42):
    """Down-sample a DataFrame to at most a fixed number of rows per language.

    Rows are grouped by the "language" column and each group is sampled
    independently; groups smaller than the cap are kept whole.

    Args:
        df: DataFrame with a "language" column.
        is_validation: Selects the default cap when ``n_samples`` is not
            given: 500 rows per language if True, otherwise 100.
        n_samples: Optional explicit per-language cap; overrides the default
            chosen by ``is_validation``.
        random_state: Seed passed to ``DataFrame.sample`` for every group.

    Returns:
        A new DataFrame with the sampled rows of every language, ordered by
        language, with a fresh 0..n-1 index and all original columns.
    """
    # Local import: pandas is not imported at the top of this file.
    import pandas as pd

    if n_samples is None:
        n_samples = 500 if is_validation else 100

    # Iterate over the groups instead of using ``groupby(...).apply``: apply's
    # handling of the grouping column is deprecated in recent pandas, and the
    # "language" column must survive because later cells use it as the label.
    sampled_groups = [
        group.sample(n=min(n_samples, len(group)), random_state=random_state)
        for _, group in df.groupby("language")
    ]
    if not sampled_groups:
        # Nothing to sample: return an empty frame with the same columns.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(sampled_groups).reset_index(drop=True)


# Cap the number of rows per language in every split
# (the validation split uses the larger validation cap)
sampled_df_train = sample_per_language(df_train)
sampled_df_test = sample_per_language(df_test)
sampled_df_val = sample_per_language(df_val, is_validation=True)

# Turn the sampled frames back into a DatasetDict; preserve_index=False
# keeps the pandas index from being carried over
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame, preserve_index=False)
    for split_name, frame in (
        ("train", sampled_df_train),
        ("test", sampled_df_test),
        ("validation", sampled_df_val),
    )
})





In [None]:
# Plot the language distribution again for the current (re-assigned) dataset
plot_dataset(dataset)

In [None]:

def tokenize_function(example):
    """Tokenize one example's source code and attach its label.

    Uses the module-level ``tokenizer``.

    Args:
        example: A single dataset row with "whole_func_string" (the text to
            tokenize) and "language" (the label value).

    Returns:
        The tokenizer output with an extra "labels" entry copied from
        ``example["language"]``.
    """
    tokenized_example = tokenizer(
        example["whole_func_string"],
        # Ask for truncation explicitly rather than relying on the
        # tokenizer's fallback behaviour when only max_length is given.
        truncation=True,
        max_length=512,
    )
    tokenized_example["labels"] = example["language"]
    return tokenized_example

model_name = "google-bert/bert-large-cased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Encode the "language" column as class labels first. The label names are
# only available on the encoded feature, so this must run before ``.names``
# is read (reading it on the raw string column fails).
dataset = dataset.class_encode_column("language")

# Label names of the encoded "language" feature
languages = dataset["train"].features["language"].names

# Tokenize every example of every split
tokenize_dataset = dataset.map(tokenize_function)


## モデルの読み込みと設定

In [None]:
# dataset["train"]
# Run the classification task with the "language" column as the target variable

In [None]:
#dataset["train"][0]

In [None]:
# Load the pretrained checkpoint with a classification head that has one
# output per distinct label value in the training split
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(set(dataset["train"]["language"])),
    device_map="auto",
)

In [None]:
# Show the model's classifier attribute as the cell output
model.classifier

In [None]:
import numpy as np

def compute_accuracy(
    eval_pred: tuple[np.ndarray, np.ndarray]
) -> dict[str, float]:
    """Compute accuracy from predicted scores and reference labels.

    Args:
        eval_pred: Pair of (scores, labels); the scores hold one value per
            label for each example.

    Returns:
        A dict with a single "accuracy" entry: the fraction of examples
        whose highest-scoring index equals the reference label.
    """
    scores, gold = eval_pred
    # The index with the highest score along axis 1 is the predicted label
    predicted = np.argmax(scores, axis=1)
    is_correct = predicted == gold
    return {"accuracy": is_correct.mean()}



In [None]:
import numpy as np
import evaluate

# Load the "accuracy" metric; compute_metrics reads this module-level object
metric = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Score predictions with the module-level ``metric`` object.

    Args:
        eval_pred: Pair of (logits, labels).

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions
        against the reference labels.
    """
    scores, references = eval_pred
    # Predicted label = index of the largest logit along the last axis
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted, references=references)

In [None]:
training_args = TrainingArguments(
    output_dir="bert-classification-language",
    # Optimisation settings
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    # Per-device batch sizes for training and evaluation
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to="none",
)


In [None]:
from transformers import DataCollatorWithPadding

# Collator built from the tokenizer; handed to the Trainer as data_collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)




In [None]:
# Train on the tokenized "train" split, with the tokenized "test" split as
# the evaluation set. Metrics come from compute_accuracy.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenize_dataset["train"],
    eval_dataset=tokenize_dataset["test"],
    compute_metrics=compute_accuracy,
)
trainer.train()

In [None]:
# Local import: torch is not imported elsewhere in this file.
import torch

# Classify one function body taken from the validation split
sample = dataset["validation"][1]["whole_func_string"]

# The model may still be in training mode after trainer.train(); switch to
# eval mode so dropout does not make the prediction non-deterministic.
model.eval()

# Truncate to the same 512-token limit used when tokenizing the training data,
# and put the tensors on the model's own device instead of assuming "cuda".
inputs = tokenizer(
    sample,
    truncation=True,
    max_length=512,
    return_tensors="pt",
).to(model.device)

# No gradients are needed for inference
with torch.no_grad():
    pred = model(**inputs).logits.argmax(-1).item()


In [None]:
# Show the predicted class index as the cell output
pred

In [None]:
from transformers import pipeline

# Build a text-classification pipeline from the already-loaded objects.
# No ``device`` argument is passed: the model was loaded with
# device_map="auto", so it is already placed on a device, and an explicit
# device here would conflict with that placement.
# NOTE(review): confirm against the installed transformers version.
classifier = pipeline(
    task="text-classification",  # the model is a sequence classifier
    model=model,                 # already-loaded model
    tokenizer=tokenizer,         # already-loaded tokenizer
)

In [None]:
# Select the function source text of one validation example.
# (The previous index ``co`` was an undefined name; "whole_func_string" is the
# column used for the same lookup in the inference cell.)
sample = dataset["validation"][1]["whole_func_string"]

In [None]:
# Show the selected sample as the cell output
sample