In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 2.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 44.3 MB/s 
Collecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.9 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 42.0 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 34.3 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uni

In [None]:
import pandas as pd 
import numpy as np 
import torch
import time 

from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, AdamW, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
# Read clean.csv into a DataFrame (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer="drive/MyDrive/ire_major/clean.csv")

In [None]:
# Expose the "from" column under the name "author" for the rest of the notebook.
data = data.rename(columns={"from": "author"})

In [None]:
# Show column names, dtypes and non-null counts of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134574 entries, 0 to 134573
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  134574 non-null  int64 
 1   author      134574 non-null  object
 2   text        134574 non-null  object
dtypes: int64(1), object(2)
memory usage: 3.1+ MB


In [None]:
# Turn empty strings into NaN so they are removed together with genuinely missing values.
# The result is assigned back to the column: calling replace(..., inplace=True) on the
# selection `data["text"]` mutates a temporary under pandas Copy-on-Write and would
# leave `data` unchanged.
data["text"] = data["text"].replace('', np.nan)
# Drop every row that has a missing value in any column.
data = data.dropna()

In [None]:
# Re-check row and non-null counts after removing rows with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 134574 entries, 0 to 134573
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  134574 non-null  int64 
 1   author      134574 non-null  object
 2   text        134574 non-null  object
dtypes: int64(1), object(2)
memory usage: 4.1+ MB


In [None]:
# Whitespace-delimited word count of every text, stored as a new "length" column.
data["length"] = data["text"].map(lambda text: len(text.split()))

In [None]:
# Keep only texts whose word count lies between 10 and 500 (both bounds inclusive),
# i.e. discard everything longer than 500 or shorter than 10 words.
data = data[(data["length"] >= 10) & (data["length"] <= 500)]

In [None]:
# Inspect the frame after the word-count filter (now includes the "length" column).
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 105854 entries, 0 to 134572
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  105854 non-null  int64 
 1   author      105854 non-null  object
 2   text        105854 non-null  object
 3   length      105854 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 4.0+ MB


In [None]:
# Count rows per author, then keep only the authors that occur more than 2000 times.
items = data["author"].value_counts().to_dict().items()
frequent_authors = [author for author, n_rows in items if n_rows > 2000]
data = data[data["author"].isin(frequent_authors)]

In [None]:
# Inspect the frame after restricting it to the frequent authors.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62666 entries, 3880 to 129975
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  62666 non-null  int64 
 1   author      62666 non-null  object
 2   text        62666 non-null  object
 3   length      62666 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 2.4+ MB


In [None]:
# Plain Python lists: the texts are the model inputs, the author names the raw targets.
texts, labels = data["text"].tolist(), data["author"].tolist()

In [None]:
# Map every author name to an integer class id and back.
# The names are sorted before enumeration: iterating a bare set of strings yields an
# order that changes between interpreter sessions (hash randomisation), which would
# silently reassign class ids after a kernel restart and make saved checkpoints
# impossible to decode.
label2id = {author: idx for idx, author in enumerate(sorted(set(labels)))}
id2label = {idx: author for author, idx in label2id.items()}

In [None]:
# Encode the author names as integer class ids.
# Read the names from the DataFrame rather than from `labels` itself so that re-running
# this cell is safe; encoding `labels` in place raises KeyError on a second run because
# it then already holds ids instead of names.
labels = [label2id[author] for author in data["author"].tolist()]

In [None]:
# Hold out 20% of the examples for evaluation; the fixed seed makes the split repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Load the pretrained tokenizer for the distilbert-base-cased checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    'distilbert-base-cased'
)

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [None]:
# Report which accelerator is available: 'cuda' when a GPU is visible, otherwise 'cpu'.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [None]:
# Tokenize both splits with identical settings: over-long inputs are truncated and
# shorter ones are padded.
train_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (train_texts, test_texts)
)

In [None]:
class WapoDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing per-example encodings with class labels.

    ``encodings`` is a mapping from feature name to an indexable collection of
    per-example values (only ``.items()`` is used); ``labels`` is an indexable
    collection whose length defines the dataset size.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is taken from the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the target under 'labels'."""
        example = {}
        for feature_name, column in self.encodings.items():
            example[feature_name] = torch.tensor(column[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

In [None]:
# Wrap each split's encodings and labels in a WapoDataset.
train_dataset, test_dataset = (
    WapoDataset(train_encodings, train_labels),
    WapoDataset(test_encodings, test_labels),
)

In [None]:
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for a prediction object.

    ``pred`` must expose ``label_ids`` (the reference class ids) and
    ``predictions`` (per-class scores); the predicted class is the argmax over
    the last axis of ``predictions``.

    Returns a dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average='weighted'
    )
    return dict(accuracy=accuracy, f1=f1, precision=precision, recall=recall)

In [None]:
# Build the classifier with one output per author actually present in the data.
# The class count is derived from label2id instead of a hard-coded number so it stays
# in sync with the author filter, and the id<->name maps are stored in the model config
# so that saved checkpoints record which author each class index stands for.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-cased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

Downloading:   0%|          | 0.00/251M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.bia

In [None]:
# Hyper-parameters and bookkeeping for the Trainer.
# evaluation_strategy is set explicitly: it defaults to "no", in which case the
# eval_dataset and compute_metrics passed to the Trainer are never used and the
# training log contains only the training loss.
training_args = TrainingArguments(
    output_dir='drive/MyDrive/ire_major/turn_2/results',          # where checkpoints are written
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size per device during evaluation
    warmup_steps=500,                # number of warmup steps for the learning-rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='drive/MyDrive/ire_major/turn_2/logs',            # directory for storing logs
    logging_steps=1000,              # log the training loss every 1000 optimisation steps
    evaluation_strategy="epoch",     # run evaluation (and compute_metrics) after every epoch
    save_strategy="epoch",           # write a checkpoint after every epoch
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Bundle the model, its training arguments, both dataset splits and the metric
# callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
# Run fine-tuning; the value returned by train() is shown as the cell result.
train_output = trainer.train()
train_output

***** Running training *****
  Num examples = 50132
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 9402


Step,Training Loss
1000,1.0006
2000,0.3207
3000,0.2166
4000,0.0935
5000,0.0687
6000,0.0541
7000,0.0313


Saving model checkpoint to drive/MyDrive/ire_major/turn_2/results/checkpoint-3134
Configuration saved in drive/MyDrive/ire_major/turn_2/results/checkpoint-3134/config.json
Model weights saved in drive/MyDrive/ire_major/turn_2/results/checkpoint-3134/pytorch_model.bin
Saving model checkpoint to drive/MyDrive/ire_major/turn_2/results/checkpoint-6268
Configuration saved in drive/MyDrive/ire_major/turn_2/results/checkpoint-6268/config.json
Model weights saved in drive/MyDrive/ire_major/turn_2/results/checkpoint-6268/pytorch_model.bin


Step,Training Loss
1000,1.0006
2000,0.3207
3000,0.2166
4000,0.0935
5000,0.0687
6000,0.0541
7000,0.0313
8000,0.0197
9000,0.0131


Saving model checkpoint to drive/MyDrive/ire_major/turn_2/results/checkpoint-9402
Configuration saved in drive/MyDrive/ire_major/turn_2/results/checkpoint-9402/config.json
Model weights saved in drive/MyDrive/ire_major/turn_2/results/checkpoint-9402/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=9402, training_loss=0.19388714523372233, metrics={'train_runtime': 13189.9771, 'train_samples_per_second': 11.402, 'train_steps_per_second': 0.713, 'total_flos': 1.992718566733824e+16, 'train_loss': 0.19388714523372233, 'epoch': 3.0})

In [None]:
# Copy the local ./results directory into the Drive project folder.
# NOTE(review): the TrainingArguments in this notebook set output_dir to
# drive/MyDrive/ire_major/turn_2/results, so ./results may not exist in this
# runtime — confirm this copy is still needed.
!cp -r ./results drive/MyDrive/ire_major