# Load Model

In [50]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset

In [19]:
# The file is JSON Lines: one jutsu record per line, hence lines=True.
data_path = "../Data/jutsus.jsonl"
df = pd.read_json(path_or_buf=data_path, lines=True)
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,Air Lightning Bullet,"Taijutsu, Shurikenjutsu",The user punches the opponent twice with their...
1,Adamantine Sealing Chains,"Hiden, Ninjutsu, Fūinjutsu, Barrier Ninjutsu",This is a sealing technique that is characteri...
2,Adamantine Seal: Monkey Yang Suppression,"Ninjutsu, Fūinjutsu, Cooperation Ninjutsu","After placing fūinjutsu tags in an area, the u..."
3,Aerobatic Strike,"Scientific Ninja Tool Techniques, Taijutsu","The user sends the opponent in the air, where ..."
4,Afterimage Clone,"Ninjutsu, Clone Techniques","Shisui uses the Body Flicker Technique, and mo..."


In [20]:
"""
Removes unnecessary jutsus from the jutsu type and only keeps
Genjutsu, Ninjutsu, Taijutsu
"""
def simplify_jutsu(jutsu):
    """Collapse a raw jutsu-type value to one of three coarse classes.

    The categories are checked in the priority order Genjutsu, Ninjutsu,
    Taijutsu and the first one contained in ``jutsu`` wins, so a value that
    lists several types (e.g. "Ninjutsu, Genjutsu") maps to the
    highest-priority match.

    Args:
        jutsu: Raw type value, e.g. "Taijutsu, Shurikenjutsu".

    Returns:
        "Genjutsu", "Ninjutsu" or "Taijutsu"; None when no category is
        present or when ``jutsu`` is a missing value (None / NaN).
    """
    try:
        for category in ("Genjutsu", "Ninjutsu", "Taijutsu"):
            if category in jutsu:
                return category
    except TypeError:
        # Missing values (None, float NaN) do not support ``in``;
        # treat them as "no category" instead of crashing .apply().
        return None
    return None

In [21]:
# Collapse each raw jutsu_type value to one coarse class; simplify_jutsu yields
# None for rows that mention none of Genjutsu / Ninjutsu / Taijutsu.
df['jutsu_type_simplified'] = df['jutsu_type'].apply(simplify_jutsu)

In [22]:
# Inspect the last rows to check the new jutsu_type_simplified column.
df.tail()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description,jutsu_type_simplified
2934,16 Hit Combo,Taijutsu,"A very effective move, Ino uses this as a quic...",Taijutsu
2935,100% Single Punch,Taijutsu,Tsunade gathers large amounts of chakra in her...,Taijutsu
2936,Absorption Sphere,Ninjutsu,"Using the Jutsu Absorption Arm, the user creat...",Ninjutsu
2937,1000 Metre Punch,Taijutsu,The user focuses a large amount of chakra into...,Taijutsu
2938,100 Metre Punch,Taijutsu,"A shorter version of the 1000 Metre Punch, the...",Taijutsu


In [23]:
# Class balance check: the labels are skewed, with Ninjutsu appearing far more
# often than Taijutsu and Genjutsu.
df['jutsu_type_simplified'].value_counts()

jutsu_type_simplified
Ninjutsu    2272
Taijutsu     398
Genjutsu     101
Name: count, dtype: int64

In [24]:
# Build the model input ("<name>. <description>") and the label column, then
# keep only those two columns and drop rows where either one is missing.
df['text'] = df['jutsu_name'] + '. ' + df['jutsu_description']
df['jutsus'] = df['jutsu_type_simplified']
df = df[['text', 'jutsus']].dropna()


In [25]:
# Preview the reduced frame (only the text and jutsus columns remain).
df.head()

Unnamed: 0,text,jutsus
0,Air Lightning Bullet. The user punches the opp...,Taijutsu
1,Adamantine Sealing Chains. This is a sealing t...,Ninjutsu
2,Adamantine Seal: Monkey Yang Suppression. Afte...,Ninjutsu
3,Aerobatic Strike. The user sends the opponent ...,Taijutsu
4,Afterimage Clone. Shisui uses the Body Flicker...,Ninjutsu


In [26]:
from bs4 import BeautifulSoup
#Clean the text
class Cleaner():
    """Turn HTML-flavoured text into plain text.

    ``clean`` is the entry point: it puts a newline after every closing
    paragraph tag, strips the markup, and trims surrounding whitespace.
    """

    def __init__(self):
        pass

    def put_line_breaks(self, text):
        """Insert a newline after each closing ``</p>`` tag.

        Done before the tags are stripped so that paragraph boundaries
        survive as line breaks in the plain text.
        """
        # The closing tag is '</p>' (forward slash). The previous '<\p>' was an
        # invalid escape sequence that never matched real HTML.
        return text.replace('</p>', '</p>\n')

    def remove_html_tags(self, text):
        """Return the text content of ``text`` with HTML markup removed."""
        return BeautifulSoup(text, 'lxml').text

    def clean(self, text):
        """Run the full pipeline: line breaks, tag removal, then strip."""
        text = self.put_line_breaks(text)
        text = self.remove_html_tags(text)
        return text.strip()
        

  return text.replace('<\p>', '<\p>\n')
  return text.replace('<\p>', '<\p>\n')


In [27]:
# Names of the dataframe columns holding the input text and the class label.
text_column_name, label_column_name = 'text', 'jutsus'

In [28]:
# Run the HTML cleaner over every row's text. The raw column is left intact
# and the result is stored in a new text_cleaned column.
cleaner = Cleaner()
df['text_cleaned'] = df[text_column_name].apply(cleaner.clean)

  clean_text = BeautifulSoup(text, 'lxml').text


In [29]:
# Display the frame to compare the text column against text_cleaned.
df

Unnamed: 0,text,jutsus,text_cleaned
0,Air Lightning Bullet. The user punches the opp...,Taijutsu,Air Lightning Bullet. The user punches the opp...
1,Adamantine Sealing Chains. This is a sealing t...,Ninjutsu,Adamantine Sealing Chains. This is a sealing t...
2,Adamantine Seal: Monkey Yang Suppression. Afte...,Ninjutsu,Adamantine Seal: Monkey Yang Suppression. Afte...
3,Aerobatic Strike. The user sends the opponent ...,Taijutsu,Aerobatic Strike. The user sends the opponent ...
4,Afterimage Clone. Shisui uses the Body Flicker...,Ninjutsu,Afterimage Clone. Shisui uses the Body Flicker...
...,...,...,...
2934,"16 Hit Combo. A very effective move, Ino uses ...",Taijutsu,"16 Hit Combo. A very effective move, Ino uses ..."
2935,100% Single Punch. Tsunade gathers large amoun...,Taijutsu,100% Single Punch. Tsunade gathers large amoun...
2936,Absorption Sphere. Using the Jutsu Absorption ...,Ninjutsu,Absorption Sphere. Using the Jutsu Absorption ...
2937,1000 Metre Punch. The user focuses a large amo...,Taijutsu,1000 Metre Punch. The user focuses a large amo...


In [30]:
#Encode labels: fit a LabelEncoder on the jutsu type names found in the label column
le = preprocessing.LabelEncoder()
le.fit(df[label_column_name].tolist())

In [None]:
# Lookup from integer id to jutsu type name: enumerate pairs every class
# stored on the fitted encoder with its position in classes_.
label_dict = dict(enumerate(le.classes_.tolist()))
label_dict

{0: 'Genjutsu', 1: 'Ninjutsu', 2: 'Taijutsu'}

In [32]:
# Store the encoder's integer id for each row's jutsu type in a new label column.
df['label'] = le.transform(df[label_column_name].tolist())

In [33]:
# Confirm the label column lines up with the jutsus column.
df.head()

Unnamed: 0,text,jutsus,text_cleaned,label
0,Air Lightning Bullet. The user punches the opp...,Taijutsu,Air Lightning Bullet. The user punches the opp...,2
1,Adamantine Sealing Chains. This is a sealing t...,Ninjutsu,Adamantine Sealing Chains. This is a sealing t...,1
2,Adamantine Seal: Monkey Yang Suppression. Afte...,Ninjutsu,Adamantine Seal: Monkey Yang Suppression. Afte...,1
3,Aerobatic Strike. The user sends the opponent ...,Taijutsu,Aerobatic Strike. The user sends the opponent ...,2
4,Afterimage Clone. Shisui uses the Body Flicker...,Ninjutsu,Afterimage Clone. Shisui uses the Body Flicker...,1


In [37]:
# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class proportions the same in both splits, and the fixed random_state makes
# the split identical on every Restart & Run All.
df_train, df_test = train_test_split(
    df,
    test_size = 0.2,
    stratify = df['label'],
    random_state = 42,
)

In [39]:
# Checkpoint identifier passed to AutoTokenizer.from_pretrained.
model_name = 'distilbert/distilbert-base-uncased'

In [48]:
# Load the pretrained tokenizer for the checkpoint named in model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [51]:
def preprocess_function(tokenizer, examples):
    """Tokenize the cleaned text of a batch of examples.

    Args:
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True)``.
        examples: Mapping whose ``'text_cleaned'`` entry holds the text(s).

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    cleaned_texts = examples['text_cleaned']
    encoded = tokenizer(cleaned_texts, truncation=True)
    return encoded

In [52]:
#Wrap each dataframe split as a Hugging Face dataset
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

#Tokenize both splits in batches using the shared tokenizer
def tokenize_batch(examples):
    """Bind the notebook's tokenizer so Dataset.map can call with a batch only."""
    return preprocess_function(tokenizer, examples)

tokenized_train = train_dataset.map(tokenize_batch, batched = True)
tokenized_test = test_dataset.map(tokenize_batch, batched = True)


Map: 100%|██████████| 2216/2216 [00:00<00:00, 12970.76 examples/s]
Map: 100%|██████████| 555/555 [00:00<00:00, 13863.03 examples/s]
