In [1]:
from transformers import BertTokenizer, AutoConfig, AutoModelForSequenceClassification 

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
config = AutoConfig.from_pretrained("bert-base-cased")
model = AutoModelForSequenceClassification.from_config(config, num_labels=2)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [6]:
from glob import glob
from datasets import Dataset

import re
import pandas as pd

def data_loading(url):
    with open(url, 'r', encoding='utf-8') as f:
        # data = json.loads(f.read())
        df = pd.read_json(f)
        data = df.copy()
        # data = df.loc[:, ['sentiment', 'body']]
        data = data.loc[df['sentiment'].notnull()]
        data['sentiment'] = pd.Categorical(data['sentiment'])
        data['label'] = data['sentiment'].cat.codes
        data = data.rename(columns={'sentiment': 'labels', 'body': 'sentense'})

        return data

data_url = '../crawler/stock/data/**.json'
url = glob(data_url)[-1]
data = data_loading(url)

dataset = Dataset.from_pandas(data.loc[:, ['label', 'sentense']])
dataset = dataset.remove_columns('__index_level_0__')

In [7]:
def encode(example):
    result = tokenizer(example['sentense'], padding='max_length', truncation=True)
    
    return result

encoded_dataset = dataset.map(encode, batched=True)
encoded_dataset = encoded_dataset.train_test_split(test_size=0.2)
# print(encoded_dataset[0]['sentense'])
# print(encoded_dataset[0]['input_ids'])
# type(encoded_dataset[0]['labels'])

100%|██████████| 1/1 [00:01<00:00,  1.84s/ba]


In [9]:
from transformers import TrainingArguments, Trainer
from datasets import load_metric

import numpy as np

training_args = TrainingArguments(
    output_dir="bert-classifier", 
    evaluation_strategy="epoch",
    # epoch=3
)

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
    compute_metrics=compute_metrics,
)

trainer.train()


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentense. If sentense are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 356
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 135

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[AThe foll

{'eval_loss': 0.49807170033454895, 'eval_accuracy': 0.8, 'eval_runtime': 126.6433, 'eval_samples_per_second': 0.711, 'eval_steps_per_second': 0.095, 'epoch': 1.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[AThe following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentense. If sentense are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 90
  Batch size = 8


[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

                                                  


[A[A[A                                      

[A[A                                          
 33%|███▎      | 45/135 [1:21:34<33:16, 22.19s/it]

[A[A
[A

[A[A

{'eval_loss': 0.48915353417396545, 'eval_accuracy': 0.8, 'eval_runtime': 130.3277, 'eval_samples_per_second': 0.691, 'eval_steps_per_second': 0.092, 'epoch': 2.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[AThe following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentense. If sentense are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 90
  Batch size = 8


[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

                                                  


[A[A[A                                      

[A[A                                           
 33%|███▎      | 45/135 [1:42:15<33:16, 22.19s/it]

[A[A
[A

[A[A

Training completed. Do not forget to share your model on huggingface.co/models =)


                                                  

[A[A                             

{'eval_loss': 0.47806859016418457, 'eval_accuracy': 0.7666666666666667, 'eval_runtime': 130.5336, 'eval_samples_per_second': 0.689, 'eval_steps_per_second': 0.092, 'epoch': 3.0}
{'train_runtime': 3686.3711, 'train_samples_per_second': 0.29, 'train_steps_per_second': 0.037, 'train_loss': 0.5072037308304398, 'epoch': 3.0}





TrainOutput(global_step=135, training_loss=0.5072037308304398, metrics={'train_runtime': 3686.3711, 'train_samples_per_second': 0.29, 'train_steps_per_second': 0.037, 'train_loss': 0.5072037308304398, 'epoch': 3.0})