In [1]:
!pip install datasets

Defaulting to user installation because normal site-packages is not writeable


In [2]:
# tested in transformers==4.18.0, pytorch==1.7.1 
import torch
import transformers
import numpy as np
import pandas as pd 
from transformers import BertTokenizer, Trainer, BertForSequenceClassification, TrainingArguments
from datasets import Dataset
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
torch.__version__, transformers.__version__

('2.2.2+cu121', '4.40.2')

In [3]:
# Replace 'file_path.csv' with the path to your CSV file
file_path = 'reddit.csv'

# Use the pandas read_csv() function to read the CSV file into a DataFrame
bert_tuning_df = pd.read_csv(file_path)
bert_tuning_df = bert_tuning_df.drop(columns=["type", "id", 'created_utc', "score", "permalink", "subreddit.id", "subreddit.name", "subreddit.nsfw"])
# bert_tuning_df['date'] = bert_tuning_df['created_utc'].dt.tz_localize('UTC')
# Map sentiment scores from the range [-1, 1] to [0, 1]
bert_tuning_df['sentiment'] = (bert_tuning_df['sentiment'] + 1) / 2
bert_tuning_df['label'] = bert_tuning_df['sentiment'].apply(lambda x: 1 if x >= 0.5 else 0)
df = bert_tuning_df.drop(columns=["sentiment"])

# Now you can work with the DataFrame 'df'
df = df.head(5000)

In [4]:
df_train, df_test, = train_test_split(df, stratify=df['label'], test_size=0.1, random_state=42)
df_train, df_val = train_test_split(df_train, stratify=df_train['label'],test_size=0.1, random_state=42)
print(df_train.shape, df_test.shape, df_val.shape)

(4050, 2) (500, 2) (450, 2)


In [5]:
model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-pretrain',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-pretrain')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at yiyanghkust/finbert-pretrain and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
dataset_train = Dataset.from_pandas(df_train)
dataset_val = Dataset.from_pandas(df_val)
dataset_test = Dataset.from_pandas(df_test)

dataset_train = dataset_train.map(lambda e: tokenizer(e['body'], truncation=True, padding='max_length', max_length=128), batched=True)
dataset_val = dataset_val.map(lambda e: tokenizer(e['body'], truncation=True, padding='max_length', max_length=128), batched=True)
dataset_test = dataset_test.map(lambda e: tokenizer(e['body'], truncation=True, padding='max_length' , max_length=128), batched=True)

dataset_train.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
dataset_val.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
dataset_test.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/4050 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [7]:
!pip install transformers[torch]

Defaulting to user installation because normal site-packages is not writeable


In [8]:
!pip install accelerate -U

Defaulting to user installation because normal site-packages is not writeable


In [9]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {'accuracy' : accuracy_score(predictions, labels)}

args = TrainingArguments(
        output_dir = 'temp/',
        evaluation_strategy = 'epoch',
        save_strategy = 'epoch',
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=5,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model='accuracy',
)

trainer = Trainer(
        model=model,                         # the instantiated 🤗 Transformers model to be trained
        args=args,                  # training arguments, defined above
        train_dataset=dataset_train,         # training dataset
        eval_dataset=dataset_val,            # evaluation dataset
        compute_metrics=compute_metrics
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.388169,0.817778
2,No log,0.382468,0.828889
3,No log,0.395814,0.846667
4,0.305000,0.514042,0.824444
5,0.305000,0.56834,0.826667


TrainOutput(global_step=635, training_loss=0.25738850991556966, metrics={'train_runtime': 162.5109, 'train_samples_per_second': 124.607, 'train_steps_per_second': 3.907, 'total_flos': 1332011677248000.0, 'train_loss': 0.25738850991556966, 'epoch': 5.0})

In [10]:
model.eval()
trainer.predict(dataset_test).metrics

{'test_loss': 0.4383543133735657,
 'test_accuracy': 0.816,
 'test_runtime': 1.1356,
 'test_samples_per_second': 440.298,
 'test_steps_per_second': 14.09}

In [11]:
trainer.save_model('finbert-sentiment/')