## 1. Installation / Configuration

In [33]:
import torch
import numpy as np

In [34]:
data_path = "./Crawlers/jutsus.jsonl" #@param {type:"string"}
text_column_name = "text" #@param {type:"string"}
label_column_name = "jutsu" #@param {type:"string"}

model_name = "distilbert-base-uncased" #@param {type:"string"}
test_size = 0.2 #@param {type:"number"}
num_labels = 3 #@param {type:"number"}

device = 'cuda' if torch.cuda.is_available() else 'cpu'

## 2. Read data and Prepare Dataset


In [35]:
import pandas as pd

df = pd.read_json(data_path, lines=True)
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...
1,Blaze Release: Honoikazuchi,"Kekkei Genkai, Ninjutsu, Dōjutsu",The user first surrounds themselves with Amate...
2,Blaze Release: Kagutsuchi — Flying Flame,"Kekkei Genkai, Ninjutsu, Dōjutsu",Sasuke creates a sword of black flames and swi...
3,Blaze Release: Flame Formation Wall Technique,"Kekkei Genkai, Ninjutsu, Dōjutsu",The user spews normal flames covered in black ...
4,Blaze Release: Great Fireball Technique,"Kekkei Genkai, Ninjutsu, Dōjutsu","The user launches a fireball at the opponent, ..."


In [36]:
def simplify_jutsu_type(jutsu_type):
    if "Taijutsu" in jutsu_type:
        return "Taijutsu"
    elif "Ninjutsu" in jutsu_type:
        return "Ninjutsu"
    else:
        return "Genjutsu"
    return None

In [37]:
df['jutsu_type_simplified'] = df['jutsu_type'].apply(simplify_jutsu_type)

In [38]:
df['jutsu_type_simplified'].value_counts()

jutsu_type_simplified
Ninjutsu    2046
Taijutsu     638
Genjutsu     250
Name: count, dtype: int64

In [39]:
df['text'] = df['jutsu_name'] + ". " + df['jutsu_type_simplified']

In [40]:
df['jutsu'] = df['jutsu_type_simplified']

In [41]:
df = df[['text', 'jutsu']]
df = df.dropna()
df.head()

Unnamed: 0,text,jutsu
0,10 Hit Combo. Taijutsu,Taijutsu
1,Blaze Release: Honoikazuchi. Ninjutsu,Ninjutsu
2,Blaze Release: Kagutsuchi — Flying Flame. Ninj...,Ninjutsu
3,Blaze Release: Flame Formation Wall Technique....,Ninjutsu
4,Blaze Release: Great Fireball Technique. Ninjutsu,Ninjutsu


### Clean the Dataset

In [42]:
from bs4 import BeautifulSoup

In [43]:
class Cleaner():
    def __init__(self):
        pass
    def put_line_breaks(self, text):
        text = text.replace("</p>", "</p>\n")
        return text
    def remove_html_tags(self, text):
        cleantext = BeautifulSoup(text, "lxml").text
        return cleantext
    def clean(self, text):
        text = self.put_line_breaks(text)
        text = self.remove_html_tags(text)
        return text

In [44]:
cleaner = Cleaner()
df['text_cleaned'] = df['text'].apply(cleaner.clean)

In [45]:
df.head()

Unnamed: 0,text,jutsu,text_cleaned
0,10 Hit Combo. Taijutsu,Taijutsu,10 Hit Combo. Taijutsu
1,Blaze Release: Honoikazuchi. Ninjutsu,Ninjutsu,Blaze Release: Honoikazuchi. Ninjutsu
2,Blaze Release: Kagutsuchi — Flying Flame. Ninj...,Ninjutsu,Blaze Release: Kagutsuchi — Flying Flame. Ninj...
3,Blaze Release: Flame Formation Wall Technique....,Ninjutsu,Blaze Release: Flame Formation Wall Technique....
4,Blaze Release: Great Fireball Technique. Ninjutsu,Ninjutsu,Blaze Release: Great Fireball Technique. Ninjutsu


### Label Encoder

In [46]:
from sklearn import preprocessing

In [47]:
le = preprocessing.LabelEncoder()
le.fit(df[label_column_name].tolist())
df['label'] = le.transform(df[label_column_name].tolist())

In [48]:
df.head()

Unnamed: 0,text,jutsu,text_cleaned,label
0,10 Hit Combo. Taijutsu,Taijutsu,10 Hit Combo. Taijutsu,2
1,Blaze Release: Honoikazuchi. Ninjutsu,Ninjutsu,Blaze Release: Honoikazuchi. Ninjutsu,1
2,Blaze Release: Kagutsuchi — Flying Flame. Ninj...,Ninjutsu,Blaze Release: Kagutsuchi — Flying Flame. Ninj...,1
3,Blaze Release: Flame Formation Wall Technique....,Ninjutsu,Blaze Release: Flame Formation Wall Technique....,1
4,Blaze Release: Great Fireball Technique. Ninjutsu,Ninjutsu,Blaze Release: Great Fireball Technique. Ninjutsu,1


### Class Weights

In [49]:
from sklearn.utils.class_weight import compute_class_weight

In [50]:
class_weights = compute_class_weight('balanced', 
                    classes=np.unique(df['label'].tolist()), 
                    y=df['label'].tolist()).tolist()
class_weights

[3.912, 0.4780058651026393, 1.5329153605015673]

### Train/Test Split

**stratify** parameter is useful for ensuring the distribution of classes between the sets are the same

In [51]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=test_size, stratify=df['label'])

### Convert to Huggingface Dataest

In [52]:
from datasets import Dataset

In [53]:
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

### Tokenizer

In [54]:
from transformers import AutoTokenizer

In [55]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    return tokenizer(examples["text_cleaned"], truncation=True)

In [56]:
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/2347 [00:00<?, ? examples/s]

Map:   0%|          | 0/587 [00:00<?, ? examples/s]

## 3. Initialize Model

In [57]:
from transformers import AutoModelForSequenceClassification 

In [58]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 4. Train model

In [59]:
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import Trainer
import evaluate
import numpy as np
import torch
from torch import nn

In [60]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [61]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [62]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss (suppose one has 3 labels with different weights)
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(device=device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [63]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy = "epoch",
    logging_strategy="epoch"
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
    
)


In [64]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0634,0.048855,0.998296
2,0.0124,2.9e-05,1.0
3,0.0112,0.056372,0.998296
4,0.0273,0.055588,0.998296
5,0.0074,0.056746,0.998296


TrainOutput(global_step=1470, training_loss=0.02433642075986278, metrics={'train_runtime': 45.0664, 'train_samples_per_second': 260.393, 'train_steps_per_second': 32.619, 'total_flos': 47140088904396.0, 'train_loss': 0.02433642075986278, 'epoch': 5.0})

In [66]:
trainer.save_model('jutsu_model')

## 5. Evaluate Model

In [67]:
from sklearn.metrics import classification_report

In [68]:
preds = trainer.predict(tokenized_train)
preds = np.argmax(preds[:3][0],axis=1)
GT = df_train['label'].tolist()
print(classification_report(GT,preds))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       200
           1       1.00      1.00      1.00      1637
           2       1.00      1.00      1.00       510

    accuracy                           1.00      2347
   macro avg       1.00      1.00      1.00      2347
weighted avg       1.00      1.00      1.00      2347



In [69]:
preds = trainer.predict(tokenized_test)
preds = np.argmax(preds[:3][0],axis=1) #preds[:3][1]
GT = df_test['label'].tolist()
print(classification_report(GT,preds))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        50
           1       1.00      1.00      1.00       409
           2       0.99      1.00      1.00       128

    accuracy                           1.00       587
   macro avg       1.00      0.99      1.00       587
weighted avg       1.00      1.00      1.00       587

