In [32]:
import torch
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer
from sklearn import preprocessing
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from sklearn.model_selection import train_test_split
import numpy as np
import evaluate

Load the data and encode labels

In [None]:
label_encoder = preprocessing.LabelEncoder()

# Read the CSV (first column becomes the index), swap the raw label values for
# the integer ids produced by the encoder, and rename the columns to
# 'text' / 'labels'. Built as a single chain so re-running the cell always
# starts from the file on disk.
df = (
    pd.read_csv("data/data.csv", index_col=0)
    .assign(label=lambda frame: label_encoder.fit_transform(frame['label']))
    .rename(columns={'entry': 'text', 'label': 'labels'})
)
df

Unnamed: 0,text,labels
0,An Absurd Reasoning Absurdity and Suicide Ther...,0
1,passion of living) there are probably but two ...,0
2,that very day addressed him indifferently. He ...,0
3,this relationship between the absurd and suici...,0
4,"Peregrinos who is born of legend, m and Jules ...",0
...,...,...
777,of repugnance which characterizes the sentimen...,4
778,"greater, and none wound more, than when that o...",4
779,"only source of the exalted rank, among human o...",4
780,"interest, in which that of every individual is...",4


Creating train/test splits

Loading the model and tokenizer, checking input/output

In [33]:
# Load the pretrained tokenizer and a DistilBERT encoder topped with a freshly
# initialised sequence-classification head.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# The head needs one logit per encoded class. Left at the library default (2)
# it does not match the label ids in `df['labels']`, which go up to 4 in the
# frame preview, so the loss would receive out-of-range targets during training.
num_labels = int(df['labels'].nunique())
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels,
)

# Smoke test: run the first passage through the (still untrained) model and
# print every stage so the input/output structure can be inspected.
# `sample_text` is used instead of `input` to avoid shadowing the builtin; the
# printed label is kept as "input=" so the cell output is unchanged.
sample_text = df['text'].iloc[0]

# truncation=True keeps a long passage within the model's maximum sequence
# length, matching what `tokenize` does for the training data.
encoded_input = tokenizer(sample_text, return_tensors='pt', truncation=True)

# This is only an inspection pass, so skip building the autograd graph.
with torch.no_grad():
    output = model(**encoded_input)

print(f"\ninput={sample_text!r}")
print(f"\n{encoded_input=}")
print(f"\n{output=}")

Loading weights: 100%|██████████| 100/100 [00:00<00:00, 1768.73it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_layer_norm.weight | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
classifier.weight       | MISSING    | 
classifier.bias         | MISSING    | 
pre_classifier.weight   | MISSING    | 
pre_classifier.bias     | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.



input='An Absurd Reasoning Absurdity and Suicide There is but one truly serious philosophical problem, and that is suicide. Judging whether life is or is not worth living amounts to answering the fundamental question of philosophy. All the rest— whether or not the world has three dimensions, whether the mind has nine or twelve categories—comes afterwards. These are games; one must first answer. And if it is true, as Nietzsche claims, that a philosopher, to deserve our respect, must preach by example, you can appreciate the importance of that reply, for it will precede the definitive act. These are facts the heart can feel; yet they call for careful study before they become clear to the intellect. If I ask myself how to judge that this question is more urgent than that, I reply that one judges by the actions it entails. I have never seen anyone die for the ontologi-cal argument. Galileo, who held a scientific truth of great importance, abjured it with the greatest ease as soon as it en

In [34]:
# Fixed seed so the train/test partition is the same on every run.
SPLIT_SEED = 42


def tokenize(examples):
    """Tokenize a batch of examples for the model.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``; only its
            ``'text'`` entry is read.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length. No padding is applied here: ``data_collator`` pads
        each batch to its own longest sequence at collation time, which avoids
        storing and processing every example at full length.
    """
    return tokenizer(examples['text'], truncation=True)


# DataFrame -> Dataset -> 80/20 shuffled split -> tokenized split.
dataset = (
    Dataset.from_pandas(df)
    .train_test_split(test_size=0.2, shuffle=True, seed=SPLIT_SEED)
    .map(tokenize, batched=True)
)

# Pads each batch dynamically using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
dataset

Map: 100%|██████████| 625/625 [00:00<00:00, 5928.94 examples/s]
Map: 100%|██████████| 157/157 [00:00<00:00, 5910.02 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 625
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 157
    })
})

In [30]:
metric = evaluate.load('accuracy')


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: An object that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row of
        logits (taken along axis 1) against the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)

In [None]:
# Run configuration: outputs are written under "model" and the evaluation
# strategy is set to "epoch"; every other option keeps its library default.
training_args = TrainingArguments(eval_strategy="epoch", output_dir="model")

# Wire the model, the tokenized splits, batch collation and the metric
# callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    data_collator=data_collator,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
)

# Last expression of the cell, so the training result is displayed.
trainer.train()

  super().__init__(loader)
