In [None]:
# Install the Hugging Face libraries imported below (`transformers`, `datasets`).
# NOTE(review): versions are not pinned; `load_metric`, which a later cell imports from
# `datasets`, may be unavailable in newer releases — confirm and pin known-good versions.
! pip install transformers datasets

In [None]:
# Colab-only: attach Google Drive so files under /content/drive can be read.
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,  # remount even when the drive is already attached
)

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_metric, load_dataset, Dataset
import numpy as np

In [None]:
# Load the four evaluation metrics in one pass; the names on the left line up
# positionally with the metric identifiers on the right.
f1, roc_auc, recall, prec = (
    load_metric(metric_id)
    for metric_id in ("f1", "roc_auc", "recall", "precision")
)
# Display the ROC-AUC metric object as the cell output.
roc_auc

In [None]:
def compute_metrics(evals):
    """Compute binary F1 from the model's single-output scores.

    The model in this notebook is created with ``num_labels=1`` and trained on
    float 0./1. labels, so each prediction is one unbounded score rather than
    a class index.

    Args:
        evals: A ``(logits, labels)`` pair, as unpacked below. Assumed to be
            what ``Trainer`` passes to ``compute_metrics`` — logits with one
            score per example (e.g. shape ``(N, 1)``) and labels of shape
            ``(N,)``; TODO confirm against the installed transformers version.

    Returns:
        Whatever ``f1.compute`` returns for the binarised predictions.
    """
    logits, labels = evals
    # Flatten so predictions and references are both 1-D and line up 1:1.
    scores = np.asarray(logits).reshape(-1)
    # Threshold at 0.5. This matches np.round for scores in [-0.5, 1.5] but,
    # unlike plain rounding, never yields classes outside {0, 1} (e.g. -1 or 2)
    # when the regression output overshoots.
    preds = (scores > 0.5).astype(int)
    # Labels are stored as floats (0. / 1.); the metric wants integer classes.
    refs = np.asarray(labels).reshape(-1).astype(int)
    return f1.compute(predictions=preds, references=refs)

In [None]:
# Hyper-parameters and checkpoint/evaluation cadence handed to the Trainer.
training_args = TrainingArguments(
    output_dir="test_trainer",
    # optimisation
    num_train_epochs=2,
    per_device_train_batch_size=16,
    # evaluation is step-based: run it every `eval_steps` optimisation steps
    evaluation_strategy="steps",
    eval_steps=10000,
    # checkpointing
    save_steps=5000,
)

In [None]:
import pandas as pd

In [None]:
# Read the reviews table from Google Drive (the path points at a .zip archive).
df = pd.read_csv(
    '/content/drive/MyDrive/7650_dataset/cleaned_reviews_summaries.zip'
)

In [None]:
# Tokenizer and sequence-classification model from the same DistilBERT checkpoint.
# num_labels=1 requests a single-output classification head.
tokenizer, model = (
    AutoTokenizer.from_pretrained("distilbert-base-uncased"),
    AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=1,
    ),
)

In [None]:
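# Optional: uncomment to freeze every parameter whose name does not contain
# "classifier", so that only the classification layers are updated during training.
# for name, param in model.named_parameters():
#     if "classifier" not in name:
#         param.requires_grad = False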

In [None]:
# Drop every row containing a missing value, then store the spoiler flag as a
# float (cast to int first, then to float) in the same column.
df = df.dropna(axis=0).assign(
    is_spoiler=lambda frame: frame["is_spoiler"].astype(int).astype(float)
)

In [None]:
# Build the train / evaluation datasets from the cleaned DataFrame.
dataset = Dataset.from_pandas(df)
del df  # the DataFrame name is not used again; only the Dataset is needed from here on
# Keep shard 0 of 2, i.e. work on a subset of the data.
dataset = dataset.shard(num_shards=2, index=0)
# Rename the target column to "label", the name listed in set_format below.
dataset = dataset.rename_column("is_spoiler", "label")
# Tokenize the review text in batches, truncating and padding to max length.
dataset = dataset.map(lambda e: tokenizer(e["cleaned_reviews"], truncation=True, padding="max_length"), batched=True)
# Expose only the model inputs and the label, as torch tensors.
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
# Fixed seed: without it the 80/20 split is re-drawn on every run, so metrics
# from different runs would be computed on different evaluation sets.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, test_dataset = dataset["train"], dataset["test"]

In [None]:
# Wire model, arguments, data splits and the metric callback into one Trainer.
trainer = Trainer(
    model=model, args=training_args,
    train_dataset=train_dataset, eval_dataset=test_dataset,
    compute_metrics=compute_metrics,  # called on evaluation outputs
)

In [None]:
# Sanity check: list the parameters that will receive gradient updates.
trainable_names = [
    param_name
    for param_name, tensor in model.named_parameters()
    if tensor.requires_grad
]
for param_name in trainable_names:
    print(param_name)

In [None]:
# Run training with the Trainer configured above; evaluation and checkpointing
# follow the step settings in `training_args`.
trainer.train()