# Load Data

In [1]:
import pandas as pd

In [5]:
data_path = "../data/jutsus.jsonl"
df = pd.read_json(data_path, lines=True)
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,Adamantine Sealing Chains: Spiral Formation,"Hiden, Ninjutsu, Fūinjutsu, Barrier Ninjutsu, ...",Kushina uses her chains to form a barrier whil...
1,Adamantine Power: Acala,"Kekkei Genkai, Ninjutsu, Taijutsu",Hashirama kicks the opponent away and raises s...
2,Adamantine Prison Wall,"Ninjutsu, Clone Techniques, Bukijutsu","After using Transformation: Adamantine Staff, ..."
3,Adamantine Seal: Monkey Yang Suppression,"Ninjutsu, Fūinjutsu, Cooperation Ninjutsu","After placing fūinjutsu tags in an area, the u..."
4,Acrobat,"Taijutsu, Kenjutsu","The Acrobat (荒繰鷺伐刀, Akurobatto) is a kenjutsu ..."


In [6]:
def simplify_jutsu(jutsu):
    if "Genjutsu" in jutsu:
        return "Genjutsu"
    if "Ninjutsu" in jutsu:
        return "Ninjutsu"
    if "Taijutsu" in jutsu:
        return "Taijutsu"

In [7]:
df['jutsu_type_simplified'] = df['jutsu_type'].apply(simplify_jutsu)

In [8]:
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description,jutsu_type_simplified
0,Adamantine Sealing Chains: Spiral Formation,"Hiden, Ninjutsu, Fūinjutsu, Barrier Ninjutsu, ...",Kushina uses her chains to form a barrier whil...,Ninjutsu
1,Adamantine Power: Acala,"Kekkei Genkai, Ninjutsu, Taijutsu",Hashirama kicks the opponent away and raises s...,Ninjutsu
2,Adamantine Prison Wall,"Ninjutsu, Clone Techniques, Bukijutsu","After using Transformation: Adamantine Staff, ...",Ninjutsu
3,Adamantine Seal: Monkey Yang Suppression,"Ninjutsu, Fūinjutsu, Cooperation Ninjutsu","After placing fūinjutsu tags in an area, the u...",Ninjutsu
4,Acrobat,"Taijutsu, Kenjutsu","The Acrobat (荒繰鷺伐刀, Akurobatto) is a kenjutsu ...",Taijutsu


In [9]:
df['jutsu_type_simplified'].value_counts()

Ninjutsu    2255
Taijutsu     397
Genjutsu     101
Name: jutsu_type_simplified, dtype: int64

In [10]:
df['text'] = df['jutsu_name'] + " " + df['jutsu_description']
df['jutsus'] = df['jutsu_type_simplified']
df = df[['text', 'jutsus']]
df = df.dropna()

In [12]:
df.head()

Unnamed: 0,text,jutsus
0,Adamantine Sealing Chains: Spiral Formation Ku...,Ninjutsu
1,Adamantine Power: Acala Hashirama kicks the op...,Ninjutsu
2,Adamantine Prison Wall After using Transformat...,Ninjutsu
3,Adamantine Seal: Monkey Yang Suppression After...,Ninjutsu
4,"Acrobat The Acrobat (荒繰鷺伐刀, Akurobatto) is a k...",Taijutsu


In [30]:
from bs4 import BeautifulSoup
class Cleaner():
    def __init__(self):
        pass 
    
    def put_line_breaks(self, text):
        return text.replace("<\p>", "<\p>\n")
    
    def remove_html_tags(self, text):
        clean_text = BeautifulSoup(text, "lxml").text
        return clean_text

    def clean(self, text):
        text = self.put_line_breaks(text)
        text = self.remove_html_tags(text)
        text = text.strip()
        return text

In [31]:
text_column_name = "text"
label_column_name = "jutsus"

In [None]:
# Clean Text
cleaner = Cleaner()
df['text_cleaned'] = df[text_column_name].apply(cleaner.clean)

  clean_text = BeautifulSoup(text, "lxml").text


In [35]:
df.head(2)

Unnamed: 0,text,jutsus,text_cleaned
0,Adamantine Sealing Chains: Spiral Formation Ku...,Ninjutsu,Adamantine Sealing Chains: Spiral Formation Ku...
1,Adamantine Power: Acala Hashirama kicks the op...,Ninjutsu,Adamantine Power: Acala Hashirama kicks the op...


# Classification

In [52]:
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from datasets import Dataset


le = preprocessing.LabelEncoder()
le.fit(df[label_column_name].tolist())

In [39]:
label_dict = {index : label_name for index, label_name in enumerate(le.__dict__['classes_'].tolist())}
label_dict

{0: 'Genjutsu', 1: 'Ninjutsu', 2: 'Taijutsu'}

In [40]:
df['labels'] = le.transform(df[label_column_name].tolist())

In [41]:
df.head()

Unnamed: 0,text,jutsus,text_cleaned,labels
0,Adamantine Sealing Chains: Spiral Formation Ku...,Ninjutsu,Adamantine Sealing Chains: Spiral Formation Ku...,1
1,Adamantine Power: Acala Hashirama kicks the op...,Ninjutsu,Adamantine Power: Acala Hashirama kicks the op...,1
2,Adamantine Prison Wall After using Transformat...,Ninjutsu,Adamantine Prison Wall After using Transformat...,1
3,Adamantine Seal: Monkey Yang Suppression After...,Ninjutsu,Adamantine Seal: Monkey Yang Suppression After...,1
4,"Acrobat The Acrobat (荒繰鷺伐刀, Akurobatto) is a k...",Taijutsu,"Acrobat The Acrobat (荒繰鷺伐刀, Akurobatto) is a k...",2


In [None]:
# split data
test_size = 0.2
df_train, df_test = train_test_split(df, stratify=df['labels'], test_size=test_size)

In [44]:
df_train['jutsus'].value_counts()

Ninjutsu    1804
Taijutsu     317
Genjutsu      81
Name: jutsus, dtype: int64

In [46]:
model_name = "distilbert/distilbert-base-uncased"

In [50]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [51]:
def preprocess_function(tokenizer, examples):
    return tokenizer(examples['text_cleaned'], truncation=True)

In [53]:
# Convert Pandas to hugging face dataset
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_train)

# tokenize the dataset
tokenized_train = train_dataset.map(lambda examples: preprocess_function(tokenizer, examples), batched=True)
tokenized_test = test_dataset.map(lambda examples: preprocess_function(tokenizer, examples), batched=True)


Map:   0%|          | 0/2202 [00:00<?, ? examples/s]

Map:   0%|          | 0/2202 [00:00<?, ? examples/s]

In [8]:
from bs4 import BeautifulSoup
class Cleaner():
    def __init__(self):
        pass 
    
    def put_line_breaks(self, text):
        return text.replace("<\p>", "<\p>\n")
    
    def remove_html_tags(self, text):
        clean_text = BeautifulSoup(text, "lxml").text
        return clean_text

    def clean(self, text):
        text = self.put_line_breaks(text)
        text = self.remove_html_tags(text)
        text = text.strip()
        return text
    
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import evaluate

metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


def get_class_weights(df):
    return compute_class_weight("balanced", classes=sorted(df['labels'].unique().tolist()), y=df['labels'].tolist())
    
import torch
from torch import nn
from transformers import Trainer 

class CustomTrainer(Trainer):
    def compute_loss(self,model,inputs,return_outputs=False):
        labels = inputs.get("labels")

        # Forward Pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        logits = logits.float()
        
        # Compute Custom Loss
        loss_fct = nn.CrossEntropyLoss(weight = torch.tensor(self.class_weights, dtype=torch.float).to(device=self.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels ),labels.view(-1))
        return (loss,outputs) if return_outputs else loss

    def set_class_weights(self,class_weights):
        self.class_weights = class_weights
    
    def set_device(self,device):
        self.device = device

Using the latest cached version of the module from C:\Users\Yazan\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--accuracy\f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Fri Jul  5 14:28:37 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.


In [11]:
import torch
import huggingface_hub
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, pipeline

from sklearn.model_selection import train_test_split
import gc
from sklearn import preprocessing
from datasets import Dataset
import pandas as pd

import os
import sys
import pathlib 

class JutsuClassifier():
    def __init__(self, model_path, data_path=None, text_column_name='text', label_column_name='jutsus', 
                 model_name="distilbert/distilbert-base-uncased", 
                 test_size=0.2,
                 num_labels=3,
                 hugging_face_token=None
                 ):
        self.model_name = model_name
        self.model_path = model_path
        self.data_path = data_path
        self.text_column_name = text_column_name
        self.label_column_name = label_column_name
        self.test_size = test_size
        self.num_labels = num_labels
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.hugging_face_token = hugging_face_token
        
        if self.hugging_face_token is not None:
            huggingface_hub.login(token=self.hugging_face_token)
        
        self.tokenizer = self.load_tokenizer() 

        if not huggingface_hub.repo_exists(self.model_path):
             # check if the data path is provided
            if self.data_path is None:
                raise ValueError("Data path must be provided if model path does not exist in HF hub.")
            
            train_data, test_data = self.load_data(self.data_path)
            train_data_df = train_data.to_pandas()
            test_data_df = test_data.to_pandas()

            all_data = pd.concat([train_data_df, test_data_df]).reset_index(drop=True)
            class_weights = get_class_weights(all_data)

            self.train_model(train_data, test_data, class_weights)
             
        self.model = self.load_model(self.model_path)

    def load_model(self, model_path):
        model = pipeline('text-classification', model=model_path, return_all_scores=True)
        return model

    def train_model(self, train_data, test_data, class_weights):
        model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=self.num_labels, id2label=self.label_dict)
        
        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

        training_args = TrainingArguments(
            output_dir=self.model_path,
            learning_rate=2e-4,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=5,
            weight_decay=0.01,
            evaluation_strategy="epoch",
            logging_strategy="epoch",
            push_to_hub=True
        )

        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_data,
            eval_dataset=test_data,
            tokenizer=self.tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics
        )
        trainer.set_class_weights(class_weights)
        trainer.set_device(self.device)
        trainer.train()
        
        # Flush Memory
        del trainer, model
        gc.collect()
        if self.device == 'cuda':
            torch.cuda.empty_cache()

    def load_tokenizer(self):
        if huggingface_hub.repo_exists(self.model_path):
            tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        else:
            tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        return tokenizer
    


    def simplify_jutsu(self, jutsu):
        if "Genjutsu" in jutsu:
            return "Genjutsu"
        if "Ninjutsu" in jutsu:
            return "Ninjutsu"
        if "Taijutsu" in jutsu:
            return "Taijutsu"

    def preprocess_function(self, tokenizer, examples):
        return tokenizer(examples['text_cleaned'], truncation=True)

    def load_data(self, data_path):
        df = pd.read_json(data_path, lines=True)
        df['jutsu_type_simplified'] = df['jutsu_type'].apply(self.simplify_jutsu)
        df['text'] = df['jutsu_name'] + " " + df['jutsu_description']
        df['jutsus'] = df['jutsu_type_simplified']
        df = df[['text', 'jutsus']]
        df = df.dropna()

        # Clean text
        cleaner = Cleaner()
        df['text_cleaned'] = df[self.text_column_name].apply(cleaner.clean)

        # Encode labels
        le = preprocessing.LabelEncoder()
        le.fit(df[self.label_column_name].tolist())

        label_dict = {index : label_name for index, label_name in enumerate(le.__dict__['classes_'].tolist())}
        self.label_dict = label_dict
        df['labels'] = le.transform(df[self.label_column_name].tolist())

        # Split data
        test_size = 0.2
        df_train, df_test = train_test_split(df, stratify=df['labels'], test_size=test_size)

         # Convert Pandas to hugging face dataset
        train_dataset = Dataset.from_pandas(df_train)
        test_dataset = Dataset.from_pandas(df_train)

        # tokenize the dataset
        tokenized_train = train_dataset.map(lambda examples: self.preprocess_function(self.tokenizer, examples), batched=True)
        tokenized_test = test_dataset.map(lambda examples: self.preprocess_function(self.tokenizer, examples), batched=True)
        
        return tokenized_train, tokenized_test 
    
    def postprocess(self, model_output):
        output = []
        for pred in model_output:
            label = max(pred, key=lambda x: x['score'])['label']
            output.append(label)
        return output
        
    def classify_jutsu(self, text):
        model_output = self.model(text)
        predictions = self.postprocess(model_output)
        return predictions



jutsu_classifier = JutsuClassifier(
    model_path="YazanAlnakri/text_classifier_jutsuxa2zs",
    data_path=r"C:\Users\Yazan\Desktop\AI_NLP_Series_analysis\data\jutsus.jsonl",
    hugging_face_token=os.getenv("huggingface_token")
)

output = jutsu_classifier.classify_jutsu(" the Rasengan consists of concentrating and rotating the chakra at a focal point on the user's hand. The result is a spinning chakra sphere with immense destructive power. Unlike the Chidori, which has a more external impact, the Rasengan can reach deep into a target")


  clean_text = BeautifulSoup(text, "lxml").text


Map:   0%|          | 0/2202 [00:00<?, ? examples/s]

Map:   0%|          | 0/2202 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1380 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
output