In [None]:
!git clone https://github.com/beep1000101/pjatk_zum.git

In [None]:
%cd pjatk_zum

In [None]:
from pathlib import Path
from enum import StrEnum

import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
import evaluate

import pandas as pd
from utils.paths import CACHE_PATH

  from .autonotebook import tqdm as notebook_tqdm
  return f
  return self._get_more_data(ov, maxsize)


In [2]:
data_path = CACHE_PATH / "sentiment_embeddings"
raw_data_path = data_path / "raw" / "aclImdb"
test_directory_path = raw_data_path / "test"
train_directory_path = raw_data_path / "train"

In [23]:
class Sentiment(StrEnum):
    POS = "pos"
    NEG = "neg"

def _load_data(sentiment: Sentiment, dataset_path: Path):
    if sentiment == Sentiment.POS:
        sentiment_value = 1
    elif sentiment == Sentiment.NEG:
        sentiment_value = 0
    else:
        raise ValueError()

    files = []
    path_to_data = dataset_path / sentiment
    file_paths = path_to_data.glob("*.txt")
    for file_path in file_paths:
        with open(file=file_path) as text_file:
            files.append(text_file.read())

    sentiment_list = [sentiment_value] * len(files)
    
    return files, sentiment_list 

def load_dataframe(sentiment: Sentiment, dataset_path: Path):
    columns = ["text", "sentiment_value"]
    df = pd.DataFrame(dict(zip(columns, _load_data(sentiment=sentiment, dataset_path=dataset_path))))
    return df

def load_dataset(dataset_path: Path):
    dataset_df = pd.concat([load_dataframe(sentiment=sentiment, dataset_path=dataset_path) for sentiment in Sentiment])
    return dataset_df


In [25]:
df_train_full = load_dataset(dataset_path=train_directory_path)

In [27]:
df_train_full

Unnamed: 0,text,sentiment_value
0,Zentropa has much in common with The Third Man...,1
1,Zentropa is the most original movie I've seen ...,1
2,Lars Von Trier is never backward in trying out...,1
3,*Contains spoilers due to me having to describ...,1
4,That was the first thing that sprang to mind a...,1
...,...,...
12495,There just isn't enough here. There a few funn...,0
12496,Tainted look at kibbutz life<br /><br />This f...,0
12497,"I saw this movie, just now, not when it was re...",0
12498,Any film which begins with a cowhand shagging ...,0


In [31]:
train_df, val_df = train_test_split(
    df_train_full, test_size=0.2, random_state=42, stratify=df_train_full["sentiment_value"]
)

train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
val_ds = Dataset.from_pandas(val_df.reset_index(drop=True))

checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, max_length=256)

train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)

# Trainer expects label column named "labels"
train_ds = train_ds.rename_column("sentiment_value", "labels")
val_ds = val_ds.rename_column("sentiment_value", "labels")

cols = ["input_ids", "attention_mask", "labels"]
train_ds.set_format(type="torch", columns=cols)
val_ds.set_format(type="torch", columns=cols)

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
collator = DataCollatorWithPadding(tokenizer=tokenizer)

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1": f1.compute(predictions=preds, references=labels, average="macro")["f1"],
    }

# transformers==5 uses `eval_strategy` (not `evaluation_strategy`)
args = TrainingArguments(
    output_dir="outputs/sentiment_distilbert",
    do_train=True,
    do_eval=True,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none",
    logging_strategy="steps",
    logging_steps=50,
    save_total_limit=2,
    seed=42,
    data_seed=42,
    remove_unused_columns=True,
    fp16=False,
    bf16=False,
    use_cpu=False,
 )

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    processing_class=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
 )

# trainer.train()
# trainer.evaluate()
# trainer.save_model("outputs/sentiment_distilbert/best")
# tokenizer.save_pretrained("outputs/sentiment_distilbert/best")

Map: 100%|██████████| 20000/20000 [00:07<00:00, 2793.44 examples/s]
Map: 100%|██████████| 5000/5000 [00:01<00:00, 2732.60 examples/s]
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 636.44it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
pre_classifier.weight   | MISSING    | 
pre_classifier.bias     | MISSING    | 
classifier.bias         | MISSING    | 
classifier.weight       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint.

In [32]:
trainer.train()

  super().__init__(loader)


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
trainer.evaluate()

In [None]:
test_df = load_dataset(dataset_path=test_directory_path)
test_ds = Dataset.from_pandas(test_df.reset_index(drop=True)).map(tokenize, batched=True).rename_column("sentiment_value", "labels").set_format(type="torch", columns=cols)

In [None]:
!zip -r sentiment_distilbert_best.zip outputs/sentiment_distilbert/best