In [55]:
import pandas as pd
from bs4 import BeautifulSoup
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset

In [56]:
#Load dataset
dataset_path = "../data/jutsus.jsonl"
df = pd.read_json(dataset_path,lines=True)
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_descp
0,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...
1,Azure Stream,"Hiden, Ninjutsu, Kenjutsu",Suigetsu creates a moving body of water beneat...
2,Bando's Chakra Armour,Ninjutsu,Bando activates his chakra to surround him in ...
3,Banshō Kokuin,"Kekkei Genkai, Ninjutsu, Dōjutsu",Madara creates a black sphere that attracts al...
4,Avalanche Dance,"Taijutsu, Shurikenjutsu","Haku delivers a string of kicks, punches, and ..."


In [57]:
def get_single_jutsu(jutsu):
    if 'Genjutsu' in jutsu:
        return 'Genjutsu'
    if 'Ninjutsu' in jutsu:
        return 'Ninjutsu'
    if 'Taijutsu' in jutsu:
        return 'Taijutsu'
    
df['single_jutsu_type'] = df['jutsu_type'].apply(get_single_jutsu)

df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_descp,single_jutsu_type
0,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...,Taijutsu
1,Azure Stream,"Hiden, Ninjutsu, Kenjutsu",Suigetsu creates a moving body of water beneat...,Ninjutsu
2,Bando's Chakra Armour,Ninjutsu,Bando activates his chakra to surround him in ...,Ninjutsu
3,Banshō Kokuin,"Kekkei Genkai, Ninjutsu, Dōjutsu",Madara creates a black sphere that attracts al...,Ninjutsu
4,Avalanche Dance,"Taijutsu, Shurikenjutsu","Haku delivers a string of kicks, punches, and ...",Taijutsu


In [58]:
df['single_jutsu_type'].value_counts()  #skewed dataset -> class imbalance

single_jutsu_type
Ninjutsu    2269
Taijutsu     398
Genjutsu     101
Name: count, dtype: int64

In [59]:
df['text'] = df['jutsu_name'] + '. ' + df['jutsu_descp']
df['jutsu'] = df['single_jutsu_type']
df = df[['text','jutsu']]
df.dropna()
df.head()

Unnamed: 0,text,jutsu
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu
1,Azure Stream. Suigetsu creates a moving body o...,Ninjutsu
2,Bando's Chakra Armour. Bando activates his cha...,Ninjutsu
3,Banshō Kokuin. Madara creates a black sphere t...,Ninjutsu
4,Avalanche Dance. Haku delivers a string of kic...,Taijutsu


In [60]:
#clean the data scraped from web as some html tags can remain

class Cleaner:
    def __init__(self):
        pass

    def lineBreak(self,text):
        return text.replace("</p>","</p>/n")
    
    def removeHTMLtags(self,text):
        return BeautifulSoup(text,"lxml").text

    def clean(self,text):
        text = self.lineBreak(text)
        text = self.removeHTMLtags(text)
        text = text.strip()
        return text


cleaner = Cleaner()
df['text'] = df['text'].apply(cleaner.clean)

df.head()

Unnamed: 0,text,jutsu
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu
1,Azure Stream. Suigetsu creates a moving body o...,Ninjutsu
2,Bando's Chakra Armour. Bando activates his cha...,Ninjutsu
3,Banshō Kokuin. Madara creates a black sphere t...,Ninjutsu
4,Avalanche Dance. Haku delivers a string of kic...,Taijutsu


In [61]:
#encode labels

le = preprocessing.LabelEncoder()
le.fit(df['jutsu'].tolist())

In [62]:
label_dict = {i:label for i,label in enumerate(le.__dict__['classes_'])}
label_dict

{0: 'Genjutsu', 1: 'Ninjutsu', 2: 'Taijutsu', 3: None}

In [63]:
df['label'] = le.transform(df['jutsu'].tolist())


In [64]:
df.head()

Unnamed: 0,text,jutsu,label
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu,2
1,Azure Stream. Suigetsu creates a moving body o...,Ninjutsu,1
2,Bando's Chakra Armour. Bando activates his cha...,Ninjutsu,1
3,Banshō Kokuin. Madara creates a black sphere t...,Ninjutsu,1
4,Avalanche Dance. Haku delivers a string of kic...,Taijutsu,2


In [65]:
#split data into train and test
test_size = 0.2
data_train, data_test = train_test_split(df,test_size=test_size,stratify=df['label'])

In [66]:
model = "distilbert/distilbert-base-uncased"

In [67]:
tokenizer = AutoTokenizer.from_pretrained(model)



In [68]:
def preprocessing(tokenizer,exmaples):
    return tokenizer(exmaples['text'],truncation=True)

#to run it on entire dataset: we can use df.apply(func) from pandas. We can also do this with hugging face Dataset.

In [69]:
#convert pandas df to Huggingface dataset
data_train = Dataset.from_pandas(data_train)
data_test =  Dataset.from_pandas(data_test)

#tokenize the dataset
data_train_tokenized = data_train.map(lambda x: preprocessing(tokenizer,x), batched=True)
data_test_tokenized = data_test.map(lambda x: preprocessing(tokenizer,x), batched=True)

Map:   0%|          | 0/2348 [00:00<?, ? examples/s]

Map: 100%|██████████| 2348/2348 [00:00<00:00, 6467.37 examples/s]
Map: 100%|██████████| 588/588 [00:00<00:00, 10100.59 examples/s]
