# 1. Install requirements

In [None]:
!pip install transformers
!pip install datasets
!pip install --upgrade pandas
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m90.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, https://u

# Configs

In [None]:
import torch

In [None]:
# Input data: path of the JSON Lines file and the names of the text and label
# columns that later cells read from the DataFrame.
data_path = "jutsus.jsonl" #@param {type:"string"}
text_column_name = "text" #@param {type:"string"}
label_column_name = "jutsu" #@param {type:"string"}

# Pretrained checkpoint to fine-tune, fraction of rows held out for testing,
# and the number of output classes of the classification head.
model_name = "distilbert-base-uncased" #@param {type:"string"}
test_size = 0.2 #@param {type:"number"}
num_labels = 3 #@param {type:"number"}

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# 2. Read and Prepare the Dataset

In [None]:
import pandas as pd

In [None]:
# Load the dataset; lines=True reads the file as one JSON object per line.
df = pd.read_json(data_path, lines=True)

In [None]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,All Weapons Above Heaven,Ninjutsu,This technique raises all the status boosts (S...
1,Air Gold Dust Protective Wall,"Kekkei Genkai, Ninjutsu","Making use of his Gold Dust, the Fourth Kazeka..."
2,Air Lightning Strike,"Taijutsu, Shurikenjutsu",The user punches the opponent twice with their...
3,Akuta,"Ninjutsu, Kinjutsu, Hiden",Akuta is an Earth Release technique that's cre...
4,Air Sand Protective Wall,Ninjutsu,This air defence technique creates a giant shi...


In [None]:
def simplify_justu(jutsu):
    """Collapse a jutsu-type value into one of three coarse classes.

    The input may name several types (e.g. "Taijutsu, Shurikenjutsu"); the
    first match in the priority order Genjutsu > Taijutsu > Ninjutsu wins.

    Args:
        jutsu: The raw jutsu-type value, normally a string.

    Returns:
        'Genjutsu', 'Taijutsu' or 'Ninjutsu'; None when none of the three
        occurs in the input or when the input is a missing value (None/NaN).
    """
    # Missing values reach .apply() as None or float NaN; a containment test
    # on them raises TypeError, so map them to "no class" explicitly.
    if jutsu is None or isinstance(jutsu, float):
        return None

    # Order matters: it encodes which class wins for multi-type entries.
    for jutsu_class in ('Genjutsu', 'Taijutsu', 'Ninjutsu'):
        if jutsu_class in jutsu:
            return jutsu_class

    return None

In [None]:
# Reduce every row's jutsu type to one coarse class (or None when none applies).
df['jutsu_type_simplified'] = df['jutsu_type'].apply(simplify_justu)

In [None]:
# Show how many rows fall into each simplified class.
df['jutsu_type_simplified'].value_counts()

Ninjutsu    1860
Taijutsu     580
Genjutsu      93
Name: jutsu_type_simplified, dtype: int64

In [None]:
# Build the model input by joining the technique name and its description.
df['text'] = df['jutsu_name']+'. '+df['jutsu_description']

In [None]:
# Use the simplified type as the classification target column.
df['jutsu'] = df['jutsu_type_simplified']

In [None]:
# Keep only the model input and the target column.
df= df[['text','jutsu']]

In [None]:
# Drop rows with a missing text or a missing target (simplify_justu returns
# None for types outside the three classes).
df = df.dropna()

### Clean Dataset

In [None]:
from bs4 import BeautifulSoup

In [None]:
class Cleaner():
  """Turns HTML-flavoured text into plain text.

  A newline is inserted after every closing paragraph tag before the markup
  is stripped, so paragraph boundaries survive as line breaks.
  """

  def put_line_breaks(self,text):
    """Return `text` with a newline appended after each closing paragraph tag."""
    return '</p>\n'.join(text.split('</p>'))

  def remove_html_tags(self,text):
    """Parse `text` with BeautifulSoup's "lxml" parser and return its text content."""
    soup = BeautifulSoup(text, "lxml")
    return soup.text

  def clean(self,text):
    """Insert paragraph line breaks, then strip the HTML markup from `text`."""
    return self.remove_html_tags(self.put_line_breaks(text))

In [None]:
# Strip HTML from the configured text column; the result is stored in
# 'text_cleaned', the column the tokenizer reads later.
cleaner = Cleaner()
df['text_cleaned'] = df[text_column_name].apply(cleaner.clean)

  cleantext = BeautifulSoup(text, "lxml").text


In [None]:
# Class distribution of the target column after cleaning.
df['jutsu'].value_counts()

Ninjutsu    1860
Taijutsu     580
Genjutsu      93
Name: jutsu, dtype: int64

### Label Encoder

In [None]:
from sklearn import preprocessing

In [None]:
# Map each class name to an integer id and store it in a new 'label' column.
le = preprocessing.LabelEncoder()
df['label'] = le.fit_transform(df[label_column_name].tolist())

In [None]:
# Preview the frame with the cleaned text and the encoded label.
df.head()

Unnamed: 0,text,jutsu,text_cleaned,label
0,Almighty Sakura. Sakura gathers large quantiti...,Taijutsu,Almighty Sakura. Sakura gathers large quantiti...,2
1,Air Sand Protective Wall. This air defence tec...,Ninjutsu,Air Sand Protective Wall. This air defence tec...,1
2,All Weapons Above Heaven. This technique raise...,Ninjutsu,All Weapons Above Heaven. This technique raise...,1
3,"Alder. Kimimaro rushes at his opponent, and us...",Taijutsu,"Alder. Kimimaro rushes at his opponent, and us...",2
4,All-Killing Ash Bones. A certain-kill techniqu...,Ninjutsu,All-Killing Ash Bones. A certain-kill techniqu...,1


### Class Weights

In [None]:
from sklearn.utils.class_weight import compute_class_weight

In [None]:
# "balanced" weights are n_samples / (n_classes * count(class)), so rarer
# classes get larger weights. `classes` is passed as a sorted NumPy array
# (recent scikit-learn releases validate this argument and reject a plain
# list); sorted order means weight i belongs to label id i.
class_weights = compute_class_weight('balanced',
                     classes=df['label'].drop_duplicates().sort_values().to_numpy(),
                     y=df['label'].tolist()).tolist()

In [None]:
# Display the per-class weights (one entry per label id).
class_weights

[3.2007168458781363, 0.7441666666666666, 0.7441666666666666]

### Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Stratify on the label so both splits keep the class proportions, and fix
# the seed so the split (and every metric computed from it) is reproducible.
df_train,df_test = train_test_split(df,test_size=test_size,stratify=df['label'],random_state=42)

### Convert to Huggingface Dataset

In [None]:
from datasets import Dataset

In [None]:
# Wrap the pandas splits as Hugging Face Datasets so they can be tokenized
# with .map() and handed to the Trainer.
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

### Tokenizer

In [None]:
from transformers import AutoTokenizer

In [None]:
# Tokenizer matching the pretrained checkpoint selected in the config cell.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    """Tokenize the 'text_cleaned' field of a batch with truncation enabled."""
    cleaned_texts = examples["text_cleaned"]
    return tokenizer(cleaned_texts, truncation=True)

In [None]:
# Tokenize the training split; batched=True passes batches of rows to the function.
tokenized_train = train_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/714 [00:00<?, ? examples/s]

In [None]:
# Tokenize the test split the same way as the training split.
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/179 [00:00<?, ? examples/s]

# 3. Initialize Model

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
# Load the pretrained checkpoint with a sequence-classification head sized
# for num_labels classes.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classi

# 4. Train model

In [None]:
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np
import torch
from torch import nn

In [None]:
# Collator that pads the tokenized examples when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Accuracy metric from the `evaluate` library, used during evaluation.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy from a (logits, labels) evaluation pair.

    The predicted class is the index of the largest logit along the last axis.
    """
    raw_logits, references = eval_pred
    predicted_ids = np.argmax(raw_logits, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

In [None]:
class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the global `class_weights`."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model to run the forward pass on.
            inputs: Batch of model inputs; must contain a "labels" entry.
            return_outputs: When True, return `(loss, outputs)` instead of
                only the loss.
            num_items_in_batch: Accepted for compatibility with newer
                `transformers` releases, which pass this argument to
                `compute_loss`; it is not used here.

        Returns:
            The loss tensor, or a `(loss, outputs)` tuple when
            `return_outputs` is True.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Build the weights on the same device and dtype as the logits instead
        # of relying on a notebook-level device string, which can differ from
        # where the Trainer actually placed the model.
        weight = torch.tensor(class_weights, device=logits.device, dtype=logits.dtype)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
# Fine-tuning hyper-parameters; evaluation and logging both run once per epoch.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy = "epoch",
    logging_strategy="epoch"
)

# Trainer with the class-weighted loss; the held-out test split doubles as
# the per-epoch evaluation set.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
    
)


In [None]:
# Run fine-tuning with the arguments configured above.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9552,0.951416,0.541899
2,0.7028,0.640183,0.776536
3,0.4897,0.62897,0.810056
4,0.2891,0.631406,0.832402
5,0.1703,0.682914,0.821229


TrainOutput(global_step=450, training_loss=0.5214079305860732, metrics={'train_runtime': 92.9579, 'train_samples_per_second': 38.404, 'train_steps_per_second': 4.841, 'total_flos': 215565040846332.0, 'train_loss': 0.5214079305860732, 'epoch': 5.0})

In [None]:
# Persist the trained model under 'jutsu_model'.
trainer.save_model('jutsu_model')

# 5. Evaluate Model

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Classification report on the training split: ground-truth label ids from
# the DataFrame against the arg-max of the predicted logits.
GT = df_train['label'].tolist()
preds = np.argmax(trainer.predict(tokenized_train).predictions, axis=1)
print(classification_report(GT, preds))

              precision    recall  f1-score   support

           0       0.99      0.95      0.97        74
           1       0.99      0.96      0.97       320
           2       0.95      0.99      0.97       320

    accuracy                           0.97       714
   macro avg       0.98      0.97      0.97       714
weighted avg       0.97      0.97      0.97       714



In [None]:
# Classification report on the held-out test split: ground-truth label ids
# from the DataFrame against the arg-max of the predicted logits.
GT = df_test['label'].tolist()
preds = np.argmax(trainer.predict(tokenized_test).predictions, axis=1)
print(classification_report(GT, preds))

              precision    recall  f1-score   support

           0       0.84      0.84      0.84        19
           1       0.85      0.75      0.79        80
           2       0.80      0.89      0.84        80

    accuracy                           0.82       179
   macro avg       0.83      0.83      0.83       179
weighted avg       0.82      0.82      0.82       179

