<a href="https://colab.research.google.com/github/aidiary/work/blob/main/nlp_course_chapter5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages this notebook relies on.
%pip install datasets
# -U upgrades accelerate if an older version is already installed.
%pip install accelerate -U
%pip install evaluate

In [None]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip

## データの前処理

In [None]:
from datasets import load_dataset

# Load the train/test TSV files (tab-delimited) through the generic CSV loader.
data_files = dict(train="drugsComTrain_raw.tsv", test="drugsComTest_raw.tsv")
drug_dataset = load_dataset("csv", delimiter="\t", data_files=data_files)

In [None]:
# Give the "Unnamed: 0" column a meaningful name
# (presumably the header-less ID column of the TSV -- verify against the raw file).
drug_dataset = drug_dataset.rename_column("Unnamed: 0", "patient_id")

In [None]:
# Drop examples whose condition is missing (None).
drug_dataset = drug_dataset.filter(
    lambda example: example["condition"] is not None
)

In [None]:
def lowercase_condition(example):
    """Return a column update with the example's ``condition`` lower-cased.

    Args:
        example: Mapping with a string value under ``"condition"``.

    Returns:
        A one-key dict ``{"condition": <lower-cased value>}``.
    """
    lowered = example["condition"].lower()
    return {"condition": lowered}

# Apply the lower-casing to every example in the dataset.
drug_dataset = drug_dataset.map(lowercase_condition)

In [None]:
def compute_review_length(example):
    """Count the whitespace-separated words in the example's review.

    Args:
        example: Mapping with a string value under ``"review"``.

    Returns:
        A one-key dict ``{"review_length": <number of words>}``.
    """
    words = example["review"].split()
    return {"review_length": len(words)}

# Add a review_length column to every example.
drug_dataset = drug_dataset.map(compute_review_length)

# Discard short reviews: keep only those with review_length above 30.
drug_dataset = drug_dataset.filter(
    lambda example: example["review_length"] > 30
)

In [None]:
# Undo HTML escaping in the review text (character references become plain characters).
import html

drug_dataset = drug_dataset.map(
    lambda example: {"review": html.unescape(example["review"])}
)

## データの分割

In [None]:
# Split the original training data 80/20 with a fixed seed for reproducibility.
drug_dataset_clean = drug_dataset["train"].train_test_split(train_size=0.8, seed=42)
# Re-key the held-out part from "test" to "validation" first ...
drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")
# ... so the original test set can take the "test" key without overwriting it.
drug_dataset_clean["test"] = drug_dataset["test"]

In [None]:
# Display the train / validation / test splits.
drug_dataset_clean

## ここまでの前処理結果を保存

In [None]:
# Persist the preprocessed splits under the "drug-reviews" directory.
drug_dataset_clean.save_to_disk("drug-reviews")

In [None]:
# Reload the saved dataset from disk
from datasets import load_from_disk

drug_dataset = load_from_disk("drug-reviews")
drug_dataset

## 分類器の訓練

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Pretrained checkpoint shared by the tokenizer and the model.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the ``review`` text with truncation enabled.

    Args:
        example: A batch of examples (the map call below uses ``batched=True``)
            holding the review texts under ``"review"``.

    Returns:
        The tokenizer output for those reviews.
    """
    reviews = example["review"]
    return tokenizer(reviews, truncation=True)


tokenized_datasets = drug_dataset.map(tokenize_function, batched=True)
# No padding is requested at tokenization time; the collator is in charge of padding.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Display the tokenized splits and their columns.
tokenized_datasets

In [None]:
# Drop examples whose condition contains "</span>" (malformed condition values).
tokenized_datasets = tokenized_datasets.filter(
    lambda example: "</span>" not in example["condition"]
)

In [None]:
# Collect every distinct condition across the three splits, in sorted order.
unique_conditions = set()
for split_name in ("train", "validation", "test"):
    unique_conditions.update(tokenized_datasets[split_name]["condition"])
conditions = sorted(unique_conditions)
print(len(conditions))
print(conditions[:10])

In [None]:
# Save the label list so label IDs can be mapped back to condition names later.
import pickle

with open("conditions.pickle", "wb") as label_file:
    pickle.dump(conditions, label_file)

In [None]:
# Convert each condition string to an integer label ID.
# The ID is the condition's position in `conditions`. Building the lookup once
# avoids a linear `list.index` scan for every example.
condition_to_id = {condition: label_id for label_id, condition in enumerate(conditions)}


def convert_label_id(example):
    """Return the ``labels`` column value for one example.

    Args:
        example: Mapping with a ``"condition"`` string that appears in
            ``conditions``.

    Returns:
        A one-key dict ``{"labels": <index of the condition in conditions>}``.

    Raises:
        KeyError: If the condition is not present in ``conditions``.
    """
    return {"labels": condition_to_id[example["condition"]]}


tokenized_datasets = tokenized_datasets.map(convert_label_id)

In [None]:
# Display the splits again to confirm the new "labels" column.
tokenized_datasets

In [None]:
# Save the tokenized, labelled dataset to disk for now
tokenized_datasets.save_to_disk("tokenized-datasets")

In [None]:
# Reload the tokenized dataset from the "tokenized-datasets" directory.
tokenized_datasets = load_from_disk("tokenized-datasets")
tokenized_datasets

In [None]:
# Drop the columns that are not needed for training.
columns_to_drop = [
    "patient_id",
    "drugName",
    "condition",
    "review",
    "rating",
    "date",
    "usefulCount",
    "review_length",
]
tokenized_datasets = tokenized_datasets.remove_columns(columns_to_drop)
tokenized_datasets

In [None]:
from transformers import TrainingArguments

# Training configuration: output directory "run1" and per-device batch sizes
# (32 for training, 16 for evaluation); everything else is left at its default.
training_args = TrainingArguments(
    output_dir="run1",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
)
training_args

In [None]:
# Number of distinct conditions, i.e. the number of classes for the classifier.
len(conditions)

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint for sequence classification with one label per condition.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(conditions))
model

In [None]:
import evaluate
import numpy as np

# Load the accuracy metric once at module level rather than on every evaluation call.
metric = evaluate.load("accuracy")


def compute_metrics(eval_preds):
    """Compute accuracy from model outputs and reference labels.

    Args:
        eval_preds: A ``(logits, labels)`` pair; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        The result of ``metric.compute`` for the predicted and reference labels.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
from transformers import Trainer

# TODO: add push_to_hub
# Train on the "train" split and evaluate on the "validation" split.
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the configuration above.
trainer.train()

In [None]:
# Save the fine-tuned model locally under "drugsCom-bert-finetuned".
trainer.save_model("drugsCom-bert-finetuned")

In [None]:
# Log in to the Hugging Face Hub from the notebook (required before uploading).
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Upload the model and its tokenizer to the same Hub repository.
hub_repo_id = "aidiary/drugsCom-bert-finetuned"
model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)

## Evaluation

In [None]:
# Run inference on the validation split and check the shapes of the outputs.
pred = trainer.predict(tokenized_datasets["validation"])
print(pred.predictions.shape)
print(pred.label_ids.shape)

In [None]:
import numpy as np

# Pick the highest-scoring class index for each example (argmax over the last axis).
preds = np.argmax(pred.predictions, axis=-1)

In [None]:
# Load the accuracy metric explicitly so this cell does not depend on a name
# defined elsewhere, then score the validation predictions.
accuracy_metric = evaluate.load("accuracy")
accuracy_metric.compute(predictions=preds, references=pred.label_ids)