In [55]:
import evaluate
import gradio as gr
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn import preprocessing
from transformers import (
    DataCollatorWithPadding,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
    pipeline,
)

Load the dataset and encode the string class labels as integer ids.

In [57]:
# Read the corpus; the first CSV column becomes the row index.
df = pd.read_csv("data/data.csv", index_col=0)

# Fit the encoder on the raw labels, then overwrite the 'labels' column with
# the encoded values. The fitted encoder keeps the original class names in
# `classes_`, which later cells use to build the id <-> label mappings.
label_encoder = preprocessing.LabelEncoder().fit(df['labels'])
df['labels'] = label_encoder.transform(df['labels'])
df

Unnamed: 0,text,labels
0,An Absurd Reasoning Absurdity and Suicide Ther...,0
1,passion of living) there are probably but two ...,0
2,that very day addressed him indifferently. He ...,0
3,this relationship between the absurd and suici...,0
4,"Peregrinos who is born of legend, m and Jules ...",0
...,...,...
2587,any meaning? If this world had been created by...,4
2588,our own pleasures and find fulfilment and happ...,4
2589,our lives it is not enough to go beyond psycho...,4
2590,to take matters into his own hands and arrange...,4


## Creating train/test splits

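First, load the model and tokenizer and run a single example through them to check the input/output format. The train/test split itself is created in the cell after that.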

In [58]:
# Mappings between the integer ids produced by the label encoder and the
# original class names, stored in the model config so that predictions are
# reported with readable labels.
id2label = {i: label for i, label in enumerate(label_encoder.classes_)}
label2id = {label: i for i, label in enumerate(label_encoder.classes_)}
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',
                                                            # Size the classification head from the data
                                                            # rather than hard-coding the class count.
                                                            num_labels=len(label_encoder.classes_),
                                                            id2label=id2label,
                                                            label2id=label2id)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Smoke test: push the first passage through the untrained classification head
# to inspect the input/output structure.
# `sample_text` replaces the former `input` variable, which shadowed the
# builtin of the same name for the rest of the kernel session.
sample_text = df['text'].iloc[0]  # positional, so it does not depend on index labels
# truncation=True keeps the check working even if the first passage is longer
# than the model accepts.
encoded_input = tokenizer(sample_text, return_tensors='pt', truncation=True)
with torch.no_grad():  # inference-only check; no autograd graph needed
    output = model(**encoded_input)
print(f"\ninput={sample_text!r}")
print(f"\n{encoded_input=}")
print(f"\n{output=}")

Loading weights: 100%|██████████| 100/100 [00:00<00:00, 1176.84it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_projector.bias    | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
pre_classifier.bias     | MISSING    | 
classifier.weight       | MISSING    | 
pre_classifier.weight   | MISSING    | 
classifier.bias         | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.



input='An Absurd Reasoning Absurdity and Suicide There is but one truly serious philosophical problem, and that is suicide. Judging whether life is or is not worth living amounts to answering the fundamental question of philosophy. All the rest— whether or not the world has three dimensions, whether the mind has nine or twelve categories—comes afterwards. These are games; one must first answer. And if it is true, as Nietzsche claims, that a philosopher, to deserve our respect, must preach by example, you can appreciate the importance of that reply, for it will precede the definitive act. These are facts the heart can feel; yet they call for careful study before they become clear to the intellect. If I ask myself how to judge that this question is more urgent than that, I reply that one judges by the actions it entails. I have never seen anyone die for the ontologi-cal argument. Galileo, who held a scientific truth of great importance, abjured it with the greatest ease as soon as it en

In [59]:
dataset = Dataset.from_pandas(df)
# Fixed seed so the shuffled 80/20 split is identical on every run.
dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)


def tokenize(examples):
    """Tokenize the 'text' field of a batch of examples, with truncation enabled.

    No padding is applied here: `data_collator` (DataCollatorWithPadding) pads
    each batch when it is assembled, which avoids padding every example to the
    full maximum length up front.
    """
    return tokenizer(examples['text'], truncation=True)


dataset = dataset.map(tokenize, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
dataset

Map: 100%|██████████| 2073/2073 [00:00<00:00, 5323.70 examples/s]
Map: 100%|██████████| 519/519 [00:00<00:00, 6419.15 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 2073
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 519
    })
})

In [60]:
# F1 metric object, loaded once and reused for every evaluation pass.
metric = evaluate.load('f1')


def compute_metrics(eval_pred):
    """Score one evaluation pass with weighted F1.

    `eval_pred` unpacks into (logits, labels). The predicted class of each row
    is the arg-max of its logits along axis 1; the metric's result for those
    predictions against the labels is returned unchanged.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    f1_result = metric.compute(
        predictions=predicted_ids,
        references=references,
        average='weighted',
    )
    return f1_result

In [61]:
# Early stopping with a patience of 3 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

training_args = TrainingArguments(
    output_dir="checkpoints",
    # Schedule: at most 20 epochs, evaluating and checkpointing once per epoch.
    num_train_epochs=20,
    eval_strategy="epoch",
    save_strategy="epoch",
    # Optimiser settings.
    optim='stable_adamw',
    learning_rate=2e-5,
    weight_decay=0.01,
    # Reload the best checkpoint once training finishes.
    # NOTE(review): metric_for_best_model is not set, so the library default
    # decides what "best" means (presumably validation loss rather than the F1
    # computed by compute_metrics) — confirm this is intended.
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    data_collator=data_collator,
    train_dataset=dataset['train'],
    # NOTE(review): the 'test' split is also what early stopping and
    # best-checkpoint selection are evaluated on, so there is no untouched
    # hold-out set in this notebook.
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

trainer.train()

# Persist the trained model and the tokenizer together under 'model' so the
# directory can be loaded directly by a pipeline.
trainer.save_model('model')
tokenizer.save_pretrained('model')

  super().__init__(loader)


Epoch,Training Loss,Validation Loss,F1
1,No log,0.066302,0.98279
2,0.208968,0.090043,0.979822
3,0.208968,0.015549,0.998067
4,0.008542,0.005551,0.996152
5,0.008542,0.005952,0.998076
6,0.002111,0.01271,0.996154
7,0.002111,0.014733,0.994221


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  5.42it/s]
  super().__init__(loader)
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.87it/s]
  super().__init__(loader)
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  4.13it/s]
  super().__init__(loader)
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  4.77it/s]
  super().__init__(loader)
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  4.15it/s]
  super().__init__(loader)
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  5.17it/s]
  super().__init__(loader)
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  4.78it/s]
There were missing keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias'].
There were unexpected keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.beta', 'distilbert.embeddings.LayerNorm.gamma'].
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  7.38it/s]


('model/tokenizer_config.json', 'model/tokenizer.json')

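Test model trained successfully! Before adjusting any hyperparameters, the weighted F1 score on the evaluation split is already about 0.99 (see the training log above), far better than the roughly 20% accuracy expected from a random guesser over five classes. Note that the same split is also used for early stopping and checkpoint selection, so this figure is likely optimistic.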

In [66]:
# Hand-picked sample passages, one per philosophical school.
absurdism = "The absurd is the conflict between the rational mind and the irrational world. But one must imagine Sisyphus happy."
existentialism = "Man is condemned to be free; because once thrown into the world, he is responsible for everything he does. It is up to you to give life a meaning. To choose to be this or that is to affirm at the same time the value of what we choose."
stoicism = "You have power over your mind—not outside events. Realize this, and you will find strength. Everything we hear is an opinion, not a fact. Everything we see is a perspective, not the truth. The happiness of your life depends upon the quality of your thoughts."
util = "argument for preferring the life of a human being to that of an animal (with which most modern readers would be quite comfortable) is exactly paralleled by his argument for preferring the life of an intelligent human being to that of fool."
epi = "the possession of those instruments whereby the male with female can unite, the one with other in mutual ravishments. And in the ages after monsters died, perforce there perished many a stock, unable by propagation to forge a progeny."

# Load the fine-tuned model saved under 'model' as a classification pipeline.
pipe = pipeline('text-classification', 'model')

# Short name -> passage, in the order the results are reported.
samples = {
    "absurd": absurdism,
    "exist": existentialism,
    "stoic": stoicism,
    "util": util,
    "epi": epi,
}

# Classify every passage first, then report, one line per sample.
predictions = {name: pipe(text) for name, text in samples.items()}
pred_absurd, pred_exist, pred_stoic, pred_util, pred_epi = predictions.values()

for name, prediction in predictions.items():
    print(f"pred_{name} = {prediction!r}")

Loading weights: 100%|██████████| 104/104 [00:00<00:00, 1108.28it/s, Materializing param=pre_classifier.weight]                                  


pred_absurd = [{'label': 'absurdism', 'score': 0.9986805319786072}]
pred_exist = [{'label': 'existentialism', 'score': 0.9835440516471863}]
pred_stoic = [{'label': 'absurdism', 'score': 0.9937043786048889}]
pred_util = [{'label': 'utilitarianism', 'score': 0.998670220375061}]
pred_epi = [{'label': 'absurdism', 'score': 0.9503107666969299}]


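Correctly guessed absurdism, existentialism, and utilitarianism! The stoicism and Epicurean (`epi`) samples, however, were both misclassified as absurdism with high confidence.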

## Creating a UI for the model with Gradio

In [64]:
def predict(text: str) -> str:
    """Return the label the fine-tuned classifier assigns to `text`.

    Blank input returns an empty string instead of a label for nothing.
    Truncation is enabled on the pipeline call so that pasting a passage
    longer than the model accepts does not make the UI call fail.
    """
    if not text or not text.strip():
        return ""
    return pipe(text, truncation=True)[0]["label"]


# Minimal text-in / text-out interface around `predict`.
demo = gr.Interface(
    fn=predict,
    inputs=["text"],
    outputs=["text"],
    api_name="predict"
)

demo.launch()

* Running on local URL:  http://127.0.0.1:7863
* To create a public link, set `share=True` in `launch()`.


