**Notebook for Text message classifier**

### Installing libraries, connecting to google drive

In [1]:
!pip3 install transformers 

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 7.5MB/s 
Collecting huggingface-hub==0.0.12
  Downloading https://files.pythonhosted.org/packages/2f/ee/97e253668fda9b17e968b3f97b2f8e53aa0127e8807d24a547687423fe0b/huggingface_hub-0.0.12-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 48.1MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |█████

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np

In [4]:
import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

# News category classification


I decided that categories 'BUSINESS' and 'MONEY' are close to 'finance' and made 2 datasets:
- imbalanced with all data , categories 'BUSINESS' and 'MONEY' are 1 in 'financial' column, other - 0 
- balanced with proportionally (by other categories) selected data, same label encoding 

All data was filtered and preprocessed on local computer.

## Balanced data

In [None]:
news_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/data/message_classification/news_cleaned_balanced.csv")
news_data

Unnamed: 0,text,finance
0,we are a generation of maximizers and its both...,0.0
1,napoleons definition of a military genius was ...,0.0
2,remember market corrections are a part of inve...,0.0
3,no matter where or how our children are gettin...,0.0
4,according to a january 2014 schwab money myths...,0.0
...,...,...
13496,here are nine signs your superior may not be w...,1.0
13497,were not going to tell you to stop eating out ...,1.0
13498,7 schemers file fake forms to claim credits to...,1.0
13499,a 401k in and of itself is no guarantee that y...,1.0


In [None]:
news_data = news_data.sample(frac=1)
news_data = news_data.astype({'finance': 'int32'})
news_data.reset_index(inplace=True)

In [None]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", do_lower_case=True)

loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
loading file https://huggingface.co/bert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a

In [None]:
from sklearn.model_selection import train_test_split

news_data['text'] = news_data['text'].map(str)
X_train, X_test, y_train, y_test = train_test_split(news_data['text'], news_data['finance'], test_size=0.2)

X_train = X_train.tolist()
X_test = X_test.tolist()

max_length = 512

train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=max_length)
test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=max_length)

In [None]:
tokenizer.save_pretrained('/content/drive/MyDrive/Colab Notebooks/data/message_classification')

tokenizer config file saved in /content/drive/MyDrive/Colab Notebooks/data/message_classification/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Colab Notebooks/data/message_classification/special_tokens_map.json


('/content/drive/MyDrive/Colab Notebooks/data/message_classification/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/data/message_classification/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/data/message_classification/vocab.txt',
 '/content/drive/MyDrive/Colab Notebooks/data/message_classification/added_tokens.json',
 '/content/drive/MyDrive/Colab Notebooks/data/message_classification/tokenizer.json')

In [None]:
class NewsGroupsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = NewsGroupsDataset(train_encodings, y_train.tolist())
test_dataset = NewsGroupsDataset(test_encodings, y_test.tolist())

In [None]:
torch.cuda.is_available()

True

In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to("cuda")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  acc = accuracy_score(labels, preds)
  return {
      'accuracy': acc,
  }


training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/Colab Notebooks/data/message_classification/results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='/content/drive/MyDrive/Colab Notebooks/data/message_classification/logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_steps=200,               # log & save weights each logging_steps
    evaluation_strategy="steps",     # evaluate each `logging_steps`
    overwrite_output_dir=True
)

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)

In [None]:
trainer.train()

***** Running training *****
  Num examples = 10800
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2025


Step,Training Loss,Validation Loss,Accuracy
200,0.6444,0.55233,0.724176
400,0.5419,0.49241,0.766383
600,0.525,0.493543,0.785265
800,0.4661,0.509786,0.770455
1000,0.436,0.516807,0.799704
1200,0.4415,0.470589,0.79304
1400,0.4067,0.523799,0.776009


***** Running Evaluation *****
  Num examples = 2701
  Batch size = 20
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-200
Configuration saved in /content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-200/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-200/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2701
  Batch size = 20
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-400
Configuration saved in /content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-400/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-400/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2701
  Batch size = 20
Saving model checkpoint to /content/drive

Step,Training Loss,Validation Loss,Accuracy
200,0.6444,0.55233,0.724176
400,0.5419,0.49241,0.766383
600,0.525,0.493543,0.785265
800,0.4661,0.509786,0.770455
1000,0.436,0.516807,0.799704
1200,0.4415,0.470589,0.79304
1400,0.4067,0.523799,0.776009
1600,0.3666,0.494319,0.805257
1800,0.3646,0.510082,0.804517
2000,0.3796,0.491388,0.803036


***** Running Evaluation *****
  Num examples = 2701
  Batch size = 20
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-1600
Configuration saved in /content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-1600/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-1600/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2701
  Batch size = 20
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-1800
Configuration saved in /content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-1800/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-1800/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2701
  Batch size = 20
Saving model checkpoint to /content

TrainOutput(global_step=2025, training_loss=0.4552964311764564, metrics={'train_runtime': 1269.5134, 'train_samples_per_second': 25.522, 'train_steps_per_second': 1.595, 'total_flos': 3362816138025600.0, 'train_loss': 0.4552964311764564, 'epoch': 3.0})

In [None]:
best_model = BertForSequenceClassification.from_pretrained('/content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-1200', num_labels=2)

loading configuration file /content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-1200/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "transformers_version": "4.8.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file /content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-1200/pytorch_model.bin
All model checkpoint weights were used when ini

In [None]:
test_trainer = Trainer(best_model)

raw_pred, _, _ = test_trainer.predict(test_dataset)
y_pred = np.argmax(raw_pred, axis=1)
y_pred

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 2701
  Batch size = 8


array([1, 0, 1, ..., 0, 1, 0])

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.78      0.79      1328
           1       0.79      0.81      0.80      1373

    accuracy                           0.79      2701
   macro avg       0.79      0.79      0.79      2701
weighted avg       0.79      0.79      0.79      2701



## Imbalanced data

In [None]:
news_imb_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/data/message_classification/news_cleaned.csv")
news_imb_data

Unnamed: 0,text,finance
0,billionaire iac chairman barry diller made the...,0.0
1,a liberal activist infiltrated a press area ou...,0.0
2,us district judge paul diamond in philadelphia...,0.0
3,speaker john boehner’s resignation from congre...,0.0
4,on presidents day a look at how americas selec...,0.0
...,...,...
40883,the president’s sudden acknowledgement is a ba...,1.0
40884,this explains a lot,1.0
40885,he says authorities targeted him because of hi...,1.0
40886,lots of blumenthal emails and benghazi not so ...,1.0


In [None]:
news_imb_data = news_imb_data.sample(frac=1)
news_imb_data = news_imb_data.astype({'finance': 'int32'})
news_imb_data.reset_index(inplace=True)

In [None]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", do_lower_case=True)

news_imb_data['text'] = news_imb_data['text'].map(str)
X_train_imb, X_test_imb, y_train_imb, y_test_imb = train_test_split(news_imb_data['text'], news_imb_data['finance'], test_size=0.2)

X_train_imb = X_train_imb.tolist()
X_test_imb = X_test_imb.tolist()

max_length = 512

train_encodings_imb = tokenizer(X_train_imb, truncation=True, padding=True, max_length=max_length)
test_encodings_imb = tokenizer(X_test_imb, truncation=True, padding=True, max_length=max_length)

train_dataset_imb = NewsGroupsDataset(train_encodings_imb, y_train_imb.tolist())
test_dataset_imb = NewsGroupsDataset(test_encodings_imb, y_test_imb.tolist())

loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
loading file https://huggingface.co/bert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a

In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to("cuda")

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-base-uncased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/tra

In [None]:
from sklearn.metrics import balanced_accuracy_score

def compute_metrics_balanced(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  acc = balanced_accuracy_score(labels, preds)
  return {
      'accuracy': acc,
  }

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/Colab Notebooks/data/message_classification/results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='/content/drive/MyDrive/Colab Notebooks/data/message_classification/logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_steps=200,               # log & save weights each logging_steps
    evaluation_strategy="steps",     # evaluate each `logging_steps`
    overwrite_output_dir=True,
    save_total_limit=5
)

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset_imb,         # training dataset
    eval_dataset=test_dataset_imb,          # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
trainer.train()

***** Running training *****
  Num examples = 32710
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 6135


Step,Training Loss,Validation Loss,Accuracy
200,0.4754,0.454421,0.834189
400,0.4573,0.449378,0.834189
600,0.4564,0.449287,0.834189
800,0.4525,0.463452,0.834189
1000,0.44,0.455237,0.834189
1200,0.4544,0.44995,0.834189
1400,0.469,0.449786,0.834189
1600,0.4604,0.45275,0.834189
1800,0.4343,0.468737,0.834189
2000,0.4503,0.469379,0.834189


***** Running Evaluation *****
  Num examples = 8178
  Batch size = 20
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-200
Configuration saved in /content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-200/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-200/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 8178
  Batch size = 20
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-400
Configuration saved in /content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-400/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-400/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 8178
  Batch size = 20
Saving model checkpoint to /content/drive

KeyboardInterrupt: ignored

Training on imbalanced data did not show any progress, so I terminated it

# BERT model for fine-tuning

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

class BertDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)



def train_classification_bert(dataset, cuda=True, path_to_save_model=None):
    '''
    dataset: DataFrame, columns: 'text', 'finance' (0, 1)

    returns
    best_model: BertForSequenceClassification, best model in terms of accuracy
    tokenizer: BertTokenizerFast 
    '''

    def compute_metrics(pred):
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        acc = accuracy_score(labels, preds)
        return {
            'accuracy': acc,
        }

    if cuda:
        model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to("cuda")
    else:
        model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", do_lower_case=True)

    X_train, X_test, y_train, y_test = train_test_split(dataset['text'], dataset['finance'], test_size=0.2)

    X_train = X_train.tolist()
    X_test = X_test.tolist()

    max_length = 512

    train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=max_length)
    test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=max_length)

    train_dataset = BertDataset(train_encodings, y_train.tolist())
    test_dataset = BertDataset(test_encodings, y_test.tolist())

    training_args = TrainingArguments(
        output_dir='/content/drive/MyDrive/Colab Notebooks/data/message_classification/results',          
        num_train_epochs=3,              
        per_device_train_batch_size=8,  
        per_device_eval_batch_size=8,   
        warmup_steps=500,                
        weight_decay=0.01,               
        logging_dir='/content/drive/MyDrive/Colab Notebooks/data/message_classification/logs',           
        load_best_model_at_end=True,     
        logging_steps=700,               
        evaluation_strategy="steps",     
        overwrite_output_dir=True,
        save_total_limit=5
    )

    trainer = Trainer(
        model=model,                         
        args=training_args,                  
        train_dataset=train_dataset,         
        eval_dataset=test_dataset,          
        compute_metrics=compute_metrics,     
    )

    trainer.train()

    if path_to_save_model is not None:
        model.save_pretrained(path_to_save_model)

    return model, tokenizer

In [6]:
from torch import nn

def predict_classification(test_data, model, tokenizer):
    '''
    test_trainer: Trainer - best model for predictions
    test_data: DataFrame - columns: 'text'

    returns
    y_pred: list - predictions
    '''
    tokenized = tokenizer(test_data['text'], padding=True, truncation=True, max_length=512, return_tensors="pt")
    outputs = model(**tokenized)
    predictions = nn.functional.softmax(outputs.logits, dim=-1)
    return predictions

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

def predict_with_sentiment(dataset, model, tokenizer, decoded_predictions=True):
    '''
    dataset: DataFrame - columns: 'text', 'finance'; train+validation
    model: trained BERT best model
    tokenizer: tokenizer used in training

    returns:
    predictions: DataFrame - 
    columns: 
        text 
        finance - probability of classification
        positive - probability of positive sentiment
        neutral - -//-
        negative - -//-
    '''

    classification_pred = predict_classification(dataset, model, tokenizer)
    
    tokenizer_sent = AutoTokenizer.from_pretrained("ProsusAI/finbert")
    model_sent = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

    tokenized = tokenizer_sent(dataset['text'], padding=True, truncation=True, max_length=512, return_tensors="pt")
    outputs = model_sent(**tokenized)
    sentiment_pred = nn.functional.softmax(outputs.logits, dim=-1)

    # understand sentiment outputs

In [17]:
train_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/message_classification/dataset_for_classification.csv', index_col=0)

In [18]:
train_data.dropna(inplace=True)
train_data.reset_index(inplace=True, drop=True)

In [19]:
train_data['finance'].value_counts()

0    24999
1    20811
Name: finance, dtype: int64

In [None]:
model, tokenizer = train_classification_bert(train_data)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-base-uncased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/tra

Step,Training Loss,Validation Loss,Accuracy
700,0.1217,0.023813,0.992796
1400,0.0299,0.023189,0.995088
2100,0.0266,0.032584,0.995307
2800,0.0323,0.024965,0.996507
3500,0.0273,0.036824,0.994324
4200,0.0293,0.015489,0.997817
4900,0.0632,0.021785,0.997162
5600,0.033,0.018545,0.99738
6300,0.0954,0.020888,0.997162
7000,0.0389,0.023441,0.996616


***** Running Evaluation *****
  Num examples = 9162
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-700
Configuration saved in /content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-700/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-700/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 9162
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-1400
Configuration saved in /content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-1400/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/data/message_classification/results/checkpoint-1400/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 9162
  Batch size = 8
Saving model checkpoint to /content/drive

# Keyword extraction

***Idea***
1. Take dataset with tweets on financial topic, extract keywords
2. Make new dataset from datasets without topic and finance datasets
3. Run BERT text classification for fine tunung

### 1. Keyword extraction

Trying [KeyBERT](https://github.com/MaartenGr/KeyBERT)

In [None]:
!pip install keybert

In [None]:
fin_tweets = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/message_classification/financial_texts_keywords.csv', index_col=0)
fin_tweets

Unnamed: 0,Unnamed: 0.1,text
0,0,video “i was in my office i was minding my own...
1,1,the price of lumber lbf is down since hitting ...
2,2,who says the american dream is dead
3,3,barry silbert is extremely optimistic on bitco...
4,4,how satellites avoid attacks and space junk wh...
...,...,...
4841,4841,london marketwatch share prices ended lower in...
4842,4842,rinkuskiai s beer sales fell by per cent to mi...
4843,4843,operating profit fell to eur mn from eur mn in...
4844,4844,net sales of the paper segment decreased to eu...


In [None]:
from keybert import KeyBERT

kw_model = KeyBERT()
keywords = kw_model.extract_keywords(fin_tweets.iloc[0]['text'], stop_words='english', use_mmr=True)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=690.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3673.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=629.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=122.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=229.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=90895153.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=53.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466081.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=516.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=190.0, style=ProgressStyle(description_…




In [None]:
doc = fin_tweets['text'].str.cat(sep=' ')

#### Playing with options

In [None]:
keywords = kw_model.extract_keywords(doc, stop_words='english', use_mmr=True, top_n=40)
keywords

[('bitco', 0.4436),
 ('predicted', 0.2836),
 ('wynne', 0.3174),
 ('silbert', 0.3988),
 ('marketrealist', 0.4058),
 ('abcheck', 0.3909),
 ('homeless', 0.1622),
 ('cards', 0.2507),
 ('woes', 0.3011),
 ('chicagotribune', 0.2739),
 ('amro', 0.2722),
 ('megawatt', 0.2575),
 ('cooper', 0.3429),
 ('commerzbank', 0.3598),
 ('karttakeskus', 0.2075),
 ('wants', 0.3294),
 ('dnkn', 0.3453),
 ('facebooks', 0.3025),
 ('econguyrosie', 0.3631),
 ('fcau', 0.3553),
 ('wallin', 0.3531),
 ('agreei', 0.3303),
 ('travisvstheworld', 0.3296),
 ('unbelievably', 0.2912),
 ('bloombergsen', 0.3862),
 ('baidu', 0.3324),
 ('dollars', 0.3397),
 ('vergecurrency', 0.3083),
 ('edelson', 0.3766),
 ('discoverers', 0.2959),
 ('fantasy', 0.3416),
 ('optionsmaxpaingooglalphabet', 0.1526),
 ('norvestia', 0.2905),
 ('newlyformed', 0.307),
 ('usak', 0.3483),
 ('guptacalling', 0.3398),
 ('cryptomonkeyja', 0.3743),
 ('reportamos', 0.3176),
 ('sears', 0.2767),
 ('bullishbearz', 0.3145)]

In [None]:
keywords = kw_model.extract_keywords(doc, stop_words='english', use_mmr=True, top_n=40, diversity=0.7)
keywords

[('bitco', 0.4436),
 ('lifechanging', 0.2185),
 ('woodmont', 0.1872),
 ('putinsmypapiefxequifax', 0.2023),
 ('analysts', 0.3273),
 ('trumps', 0.0538),
 ('liechtenstein', 0.1586),
 ('sihvonen', 0.3196),
 ('felled', 0.1333),
 ('photonium', 0.0429),
 ('gtmi', 0.1954),
 ('jerry', 0.2843),
 ('naacp', 0.169),
 ('nasdaqdltr', 0.1714),
 ('rickdecard', 0.2128),
 ('breakfast', 0.0108),
 ('bios', 0.0964),
 ('dgld', 0.2283),
 ('sellinga', 0.336),
 ('pledged', 0.2649),
 ('exists', 0.2827),
 ('crushes', 0.0765),
 ('km', 0.0499),
 ('brpau', 0.2367),
 ('hotpagenews', 0.249),
 ('expiring', -0.0007),
 ('gratuitous', 0.0642),
 ('carolinefevans', 0.1872),
 ('whotradeshlthilton', 0.3374),
 ('inflows', 0.1075),
 ('foxconn', 0.2512),
 ('arent', 0.2503),
 ('drillers', 0.0503),
 ('epicmove', 0.3357),
 ('vergecurrency', 0.3083),
 ('icloud', 0.0192),
 ('sterling', 0.2724),
 ('bbbd', 0.1892),
 ('recalled', 0.1338),
 ('bankrupt', 0.2743)]

In [None]:
keywords = kw_model.extract_keywords(doc, top_n=40, diversity=0.2)
keywords

[('bitco', 0.4436),
 ('bitcoi', 0.4367),
 ('capitalista', 0.414),
 ('insider', 0.4115),
 ('marketrealist', 0.4058),
 ('bitcoin', 0.4047),
 ('stephensinc', 0.4041),
 ('silbert', 0.3988),
 ('insiders', 0.3968),
 ('encour', 0.3949),
 ('stocksthatgo', 0.3936),
 ('crypton', 0.3932),
 ('kucoincom', 0.3914),
 ('investingmon', 0.3911),
 ('abcheck', 0.3909),
 ('bitcoinrealestate', 0.3896),
 ('wynnmacau', 0.3867),
 ('bloombergsen', 0.3862),
 ('itau', 0.3862),
 ('trader', 0.3858),
 ('equitiesinc', 0.3826),
 ('tradereview', 0.3816),
 ('insidertrading', 0.3793),
 ('cryptocurrencies', 0.3787),
 ('investorshub', 0.3786),
 ('llnw', 0.3783),
 ('investorslive', 0.3781),
 ('edelson', 0.3766),
 ('michelin', 0.3759),
 ('investor', 0.3748),
 ('bénéteau', 0.3747),
 ('beneteau', 0.3747),
 ('traderinplay', 0.3745),
 ('benelux', 0.3744),
 ('cryptomonkeyja', 0.3743),
 ('investorsbusinessdaily', 0.3742),
 ('cryptocurrency', 0.3734),
 ('crypto', 0.3727),
 ('cryptocurrenc', 0.372),
 ('cryptocurrenci', 0.3719)]

# Cluster analysis

***Idea***
1. Make new dataset from datasets without topic and finance datasets
2. Perform unsupervised cluster analysis
3. Find cluster with most relevant data for finance, mark it as finance, other non-finance
4. Run BERT text classification for fine tunung