In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset

  _torch_pytree._register_pytree_node(


# Load the dataset

In [2]:
data_path = '../data/jutsus.jsonl'
df = pd.read_json(data_path, lines=True)
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...
1,Adamantine Power: Acala,"Kekkei Genkai, Ninjutsu, Taijutsu",Hashirama kicks the opponent away and raises s...
2,Accelerated Armed Revolving Heaven,"Kekkei Genkai, Hiden, Ninjutsu, Fūinjutsu, Tai...",Tenten unseals several weapons from her scroll...
3,Acid Permeation,Ninjutsu,Utakata blows acidic bubbles from his pipe tha...
4,Adamantine Prison Wall,"Ninjutsu, Clone Techniques, Bukijutsu","After using Transformation: Adamantine Staff, ..."


In [3]:
def simplify_jutsu(jutsu):
    """Collapse a (possibly multi-valued) jutsu type into one coarse class.

    The categories are checked in a fixed priority order -- Genjutsu, then
    Ninjutsu, then Taijutsu -- and the first one found in ``jutsu`` wins.

    Args:
        jutsu: The raw jutsu type, e.g. ``"Kekkei Genkai, Ninjutsu, Taijutsu"``.
            Missing values (``None``, ``NaN``) are accepted.

    Returns:
        ``'Genjutsu'``, ``'Ninjutsu'`` or ``'Taijutsu'``, or ``None`` when the
        value is missing or contains none of the three categories.
    """
    # Order matters: it encodes the priority between overlapping categories.
    for category in ('Genjutsu', 'Ninjutsu', 'Taijutsu'):
        try:
            if category in jutsu:
                return category
        except TypeError:
            # Missing values (None / float NaN) do not support membership
            # tests; treat them as "no category" instead of crashing .apply().
            return None
    return None

In [4]:
df['jutsu_type_simplified'] = df['jutsu_type'].apply(simplify_jutsu)

In [5]:
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description,jutsu_type_simplified
0,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...,Taijutsu
1,Adamantine Power: Acala,"Kekkei Genkai, Ninjutsu, Taijutsu",Hashirama kicks the opponent away and raises s...,Ninjutsu
2,Accelerated Armed Revolving Heaven,"Kekkei Genkai, Hiden, Ninjutsu, Fūinjutsu, Tai...",Tenten unseals several weapons from her scroll...,Ninjutsu
3,Acid Permeation,Ninjutsu,Utakata blows acidic bubbles from his pipe tha...,Ninjutsu
4,Adamantine Prison Wall,"Ninjutsu, Clone Techniques, Bukijutsu","After using Transformation: Adamantine Staff, ...",Ninjutsu


In [6]:
df['jutsu_type_simplified'].value_counts()

jutsu_type_simplified
Ninjutsu    2261
Taijutsu     398
Genjutsu     101
Name: count, dtype: int64

In [7]:
# Build the model input ("<name>. <description>") and the target column,
# keep only those two columns, and drop rows where either one is missing
# (e.g. jutsus whose type did not map to a simplified class).
df = (
    df.assign(
        text=df['jutsu_name'] + ". " + df['jutsu_description'],
        jutsu=df['jutsu_type_simplified'],
    )
    .loc[:, ['text', 'jutsu']]
    .dropna()
)
df

Unnamed: 0,text,jutsu
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu
1,Adamantine Power: Acala. Hashirama kicks the o...,Ninjutsu
2,Accelerated Armed Revolving Heaven. Tenten uns...,Ninjutsu
3,Acid Permeation. Utakata blows acidic bubbles ...,Ninjutsu
4,Adamantine Prison Wall. After using Transforma...,Ninjutsu
...,...,...
2923,Adamantine Technique: Cranium Crusher. Hiruzen...,Ninjutsu
2924,Aerobatic Strike. The user sends the opponent ...,Taijutsu
2925,Adamantine Sealing Chains: Spiral Formation. K...,Ninjutsu
2926,Adamantine Seal: Monkey Yang Suppression. Afte...,Ninjutsu


In [8]:
from bs4 import BeautifulSoup
class Cleaner():
    """Strip HTML markup from a text snippet, keeping paragraph breaks."""

    def put_line_breaks(self, text):
        """Return ``text`` with a newline inserted after every ``</p>``.

        The closing tag itself is kept, so the markup stays well-formed and
        can still be parsed afterwards; only the newline is added.
        """
        # The previous version replaced the *opening* "<p>" with "</p>\n",
        # which rewrote the markup instead of appending a line break.
        return text.replace("</p>", "</p>\n")

    def remove_html_tags(self, text):
        """Parse ``text`` with BeautifulSoup (lxml parser) and return its text content."""
        clean_text = BeautifulSoup(text, "lxml").text
        return clean_text

    def clean(self, text):
        """Run the full pipeline: add paragraph breaks, drop tags, strip whitespace."""
        text = self.put_line_breaks(text)
        text = self.remove_html_tags(text)
        text = text.strip()
        return text

In [9]:
text_column_name = 'text'
label_column_name = 'jutsu'

In [10]:
cleaner = Cleaner()
df['text_cleaned'] = df[text_column_name].apply(cleaner.clean)

  clean_text = BeautifulSoup(text, "lxml").text


In [11]:
df.head()

Unnamed: 0,text,jutsu,text_cleaned
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu,10 Hit Combo. Lars punches the opponent before...
1,Adamantine Power: Acala. Hashirama kicks the o...,Ninjutsu,Adamantine Power: Acala. Hashirama kicks the o...
2,Accelerated Armed Revolving Heaven. Tenten uns...,Ninjutsu,Accelerated Armed Revolving Heaven. Tenten uns...
3,Acid Permeation. Utakata blows acidic bubbles ...,Ninjutsu,Acid Permeation. Utakata blows acidic bubbles ...
4,Adamantine Prison Wall. After using Transforma...,Ninjutsu,Adamantine Prison Wall. After using Transforma...


In [12]:
# Encode labels
le = preprocessing.LabelEncoder()
le.fit(df[label_column_name].tolist())

In [13]:
# Map each integer id produced by the fitted encoder back to its class name.
label_dict = dict(enumerate(le.classes_.tolist()))
label_dict

{0: 'Genjutsu', 1: 'Ninjutsu', 2: 'Taijutsu'}

In [14]:
df['label'] = le.transform(df[label_column_name].tolist())
df.head()

Unnamed: 0,text,jutsu,text_cleaned,label
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu,10 Hit Combo. Lars punches the opponent before...,2
1,Adamantine Power: Acala. Hashirama kicks the o...,Ninjutsu,Adamantine Power: Acala. Hashirama kicks the o...,1
2,Accelerated Armed Revolving Heaven. Tenten uns...,Ninjutsu,Accelerated Armed Revolving Heaven. Tenten uns...,1
3,Acid Permeation. Utakata blows acidic bubbles ...,Ninjutsu,Acid Permeation. Utakata blows acidic bubbles ...,1
4,Adamantine Prison Wall. After using Transforma...,Ninjutsu,Adamantine Prison Wall. After using Transforma...,1


In [15]:
test_size = 0.2
# Fixed seed so that Restart & Run All yields the same train/test rows
# (without it every run shuffles differently and results are not comparable).
split_seed = 42
df_train, df_test = train_test_split(
    df,
    test_size=test_size,
    random_state=split_seed,
    stratify=df['label'],  # preserve the class proportions in both splits
)

In [16]:
df_train['jutsu'].value_counts()

jutsu
Ninjutsu    1809
Taijutsu     318
Genjutsu      81
Name: count, dtype: int64

In [17]:
model_name = 'distilbert/distilbert-base-uncased'

In [18]:
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [19]:
def preprocess_function(tokenizer, examples):
    """Tokenize the ``text_cleaned`` entry of ``examples``.

    Args:
        tokenizer: Callable invoked with the texts plus the keyword arguments
            ``padding='max_length'`` and ``truncation=True``.
        examples: Mapping that provides the texts under ``'text_cleaned'``.

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    cleaned_texts = examples['text_cleaned']
    tokenizer_options = {'padding': 'max_length', 'truncation': True}
    return tokenizer(cleaned_texts, **tokenizer_options)

In [21]:
# Wrap each pandas split in a Hugging Face Dataset.
train_dataset, test_dataset = (
    Dataset.from_pandas(split_frame) for split_frame in (df_train, df_test)
)


def _tokenize_batch(examples):
    """Apply ``preprocess_function`` to one batch using the notebook's tokenizer."""
    return preprocess_function(tokenizer, examples)


# Tokenize both splits batch-wise with the same helper.
tokenized_train_dataset = train_dataset.map(_tokenize_batch, batched=True)
tokenized_test_dataset = test_dataset.map(_tokenize_batch, batched=True)

Map:   0%|          | 0/2208 [00:00<?, ? examples/s]

Map:   0%|          | 0/552 [00:00<?, ? examples/s]