# D. Transformers

# 0. Data loading

In [1]:
# General Packages #
import os
import pandas as pd
import numpy as np
import string
import re
from scipy.stats import randint
import random
from collections import Counter

# Sklearn Packages #
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold, cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, make_scorer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# NLTK Packages #
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob, Word
from nltk.tokenize import word_tokenize

# Import necessary libraries for handling imbalanced data
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Embedding related imports
import sys
import gensim
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.phrases import Phraser, Phrases
from gensim.models import KeyedVectors
import gensim.downloader
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.scripts.glove2word2vec import glove2word2vec

# Transform packages
from simpletransformers.classification import ClassificationModel
import logging




In [2]:
# Turn off warnings, just to avoid pesky messages that might cause confusion here
# Keep warnings enabled (leave the filter below commented out) when testing your own code #
import warnings
#warnings.filterwarnings("ignore")

In [3]:
# Move into the project folder so the relative data paths below resolve.
# NOTE(review): this absolute path is specific to one machine.
#os.chdir("/Users/Artur/Desktop/thesis_HIR_versie5/coding")
os.chdir("/Users/juarel/Desktop/studies artur/thesis_HIR/coding")

# Read the preprocessed gold-standard train and test sets (first row is the header)
df_train, df_test = (
    pd.read_csv(csv_path, header=0)
    for csv_path in ("./data/gold_data/train.csv", "./data/gold_data/test.csv")
)

# Peek at the first rows of the training data
df_train.head(5)

Unnamed: 0,id,Headline,category,cleaned_headline
0,194578,Head Line: US Patent granted to BASF SE (Delaw...,,head u patent granted se delaware may titled c...
1,564295,Societe Generale Launches a Next-Generation Ca...,,societe generale launch nextgeneration card in...
2,504138,BARCLAYS PLC Form 8.3 - EUTELSAT COMMUNICATION...,,plc form communication
3,91379,ASML: 4Q Earnings Snapshot,,4q earnings snapshot
4,265750,Form 8.3 - AXA INVESTMENT MANAGERS : Booker Gr...,,form investment manager group plc


# 1. Define functions and parameters

Before we continue, we first define some useful functions and parameters that we use throughout this notebook. The first four functions and parameters were also used and defined in the previous notebook.

1. get_classification_metrics: Create a function that returns the classification metrics for each model. The precision, recall and F1 score are macro-averaged: each is the unweighted mean over all classes, so every class counts equally regardless of its frequency.

2. Define a dataframe to store the results of the different models. Moreover, also define a dictionary that stores the best parameters for each model.

3. Define the number of splits, the stratified cross validator to ensure class frequencies are considered, and the scoring metric based on the average F1 score. We use an F1 score as scoring metric as accuracy is not a good evaluation metric in our case.

4. Define a function that trains a model given the input data, the classifier and its parameter grid. It also takes four parameters that describe the model being trained, which is useful for storing the performance of the different algorithms.



In [4]:
# 1. Function that returns classification metrics
def get_classification_metrics(y_true, y_pred):
    """Score a set of predictions against the true labels.

    Parameters
    ----------
    y_true : true class labels.
    y_pred : predicted class labels, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)`` where precision, recall and F1
        are macro-averaged (unweighted mean over the classes).
    """
    accuracy = accuracy_score(y_true, y_pred)

    # The three remaining metrics share the same macro-averaged call signature
    macro_scores = tuple(
        scorer(y_true, y_pred, average='macro')
        for scorer in (precision_score, recall_score, f1_score)
    )

    return (accuracy, *macro_scores)


In [5]:
# 2. Containers for the outcomes of all models

# Names of the result columns: four descriptors of the model followed by four metrics
columns = ['vectorizer', 'FS', 'classifier', 'resampling','accuracy', 'precision', 'recall', 'f1']

# Row-less dataframe that gets one column per entry in `columns`
# (each column is created by assigning the scalar 0 to an empty frame)
results_all_df = pd.DataFrame().assign(**{name: 0 for name in columns})

# Dictionary meant to hold the optimal parameters per model
best_params_dict = dict()

In [6]:
# 3. Cross-validation and scoring configuration

# Scorer based on the macro-averaged F1 score
scoring = make_scorer(f1_score, average='macro')

# Number of cross-validation folds
n_splits = 5

# Stratified k-fold splitter: shuffles with a fixed seed and keeps the class
# proportions of the full data in every fold
skf = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=42,
)

In [7]:
# Split each dataset into its input text (cleaned headline) and target (category)
X_train, y_train = df_train['cleaned_headline'], df_train['category']
X_test, y_test = df_test['cleaned_headline'], df_test['category']

# 2. Transformers

https://simpletransformers.ai/docs/classification-specifics/#supported-model-types

## 2.1 Bert

In [8]:
# Name of the text-representation approach used in this section.
# Presumably intended for the 'vectorizer' column of results_all_df; it is not
# referenced by any other visible cell — TODO confirm.
vectorizer = 'Transformer'

In [9]:
# Name of the transformer variant used in this section (BERT, not word2vec).
# Presumably intended for the 'FS' column of results_all_df; it is not
# referenced by any other visible cell — TODO confirm.
FS = 'Bert'

In [10]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from tabulate import tabulate
from tqdm import trange


By default, ClassificationModel expects the labels to be ints from 0 up to num_labels.

If your dataset contains labels in another format (e.g. string labels like positive, negative), you can provide the list of all labels to the model args. Simple Transformers will handle the label mappings internally. 

In [11]:
import logging
from simpletransformers.classification import ClassificationModel, ClassificationArgs

# Keep the output quiet: only messages of level ERROR and above are shown,
# both for the "transformers" logger and for the root logging configuration
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)
logging.basicConfig(level=logging.ERROR)

In [12]:
from sklearn.preprocessing import LabelEncoder

# Turn the category names into integer ids. The mapping is learned from the
# training labels and then applied unchanged to the test labels.
label_encoder = LabelEncoder()
train_codes = label_encoder.fit_transform(df_train["category"])
test_codes = label_encoder.transform(df_test["category"])

# Store the integer ids next to the original labels
df_train["label_encoded"] = train_codes
df_test["label_encoded"] = test_codes

# Number of distinct classes known to the encoder
num_labels = len(label_encoder.classes_)
num_labels

15

In [13]:
# Training input for the transformer: the cleaned headline as "text" and the
# integer-encoded category as "labels"
input_transf = df_train[['cleaned_headline', 'label_encoded']].rename(
    columns={'cleaned_headline': 'text', 'label_encoded': 'labels'}
)

# Number of distinct labels present in the training input
input_transf['labels'].nunique()

15

In [14]:
# Test input in the same two-column layout: "text" and integer "labels"
test_transf = df_test[['cleaned_headline', 'label_encoded']].rename(
    columns={'cleaned_headline': 'text', 'label_encoded': 'labels'}
)


In [22]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the pretrained "roberta-base" tokenizer and sequence-classification model
# (the files are downloaded the first time this runs).
# NOTE(review): `tokenizer` is not referenced by any later visible cell, and
# `model` is reassigned when the ClassificationModel is created further down,
# so this cell looks like a leftover experiment — confirm before removing.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForSequenceClassification.from_pretrained("roberta-base")



Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

In [None]:
# Start values for the headline-length statistics: longest length seen so far
# and the list of per-headline lengths
max_length, length = 0, []

In [None]:
# Number of whitespace-separated words in every cleaned training headline.
# The list is rebuilt from scratch, so re-running this cell cannot append the
# same lengths a second time.
length = [len(sentence.split()) for sentence in df_train['cleaned_headline']]

# Length of the longest headline in words (0 when there are no headlines)
max_length = max(length, default=0)

# Average headline length in words
np.mean(length)

In [None]:
# Create a DataFrame from the 'length' list (one row per headline); without
# this step `df_lengths` is undefined and the sort below raises a NameError
df_lengths = pd.DataFrame({'Sentence Length': length})

# Sort the DataFrame by 'Sentence Length' column in descending order and show
# the 50 longest headlines
df_sorted = df_lengths.sort_values(by='Sentence Length', ascending=False)
df_sorted.head(50)

https://simpletransformers.ai/docs/usage/

In [15]:
from sys import argv

# Keep 20% of the training input aside as evaluation set; the split is
# stratified on the labels and seeded so it can be reproduced
train_df, eval_df = train_test_split(
    input_transf,
    test_size=0.2,
    random_state=7,
    stratify=input_transf['labels'],
)
train_df

Unnamed: 0,text,labels
27327,pfizer germany kgaa say phase gastric cancer t...,9
21429,group report interim report q2,9
37790,holding stuart gulliver remuneration last year,9
36442,report gross short sale australia,9
29039,plc form abbvie plc amendment,9
...,...,...
37943,duke energy siemens enter innovative agreement...,13
19017,standard plc scplc half year result,9
298,deutsche bank ag form eptri sabmiller plc,9
22427,china rare protest citizen oppose deal gmos,9


In [17]:
def f1_multiclass(labels, preds):
    """Return the macro-averaged F1 score of ``preds`` against ``labels``."""
    macro_f1 = f1_score(labels, preds, average='macro')
    return macro_f1

In [20]:
# Model identifiers used to name the output and cache folders.
# These were read from sys.argv before, but inside a Jupyter kernel argv holds
# the kernel launcher's own arguments rather than a model type and name, so
# they are set explicitly to the model that is trained in this section.
model_type = 'bert'
model_name = 'bert-base-uncased'

# define path for storage as very large files and can not be pushed in github
file_path = '/Users/juarel/Desktop/studies artur/thesis_HIR/big files/'

train_args = {
    'output_dir': f'{file_path}{model_type}-{model_name}-outputs',
    'best_model_dir': f'{file_path}{model_type}-{model_name}-outputs/best_model', # directory to save best models at check points
    'cache_dir': f'{file_path}{model_type}-{model_name}-cache_dir',

    # Maximum number of tokens per input. The value was chosen from the word
    # counts of the headlines.
    # NOTE(review): the model tokenizer may produce more tokens than words — confirm 69 is enough.
    'max_seq_length': 69,
    'do_lower_case': True,           # Set true when using uncased models
    'num_train_epochs': 3,           # The number of times the equivalent of a full training set has been processed
    'train_batch_size': 32,          # The training batch size
    'eval_batch_size': 16,           # The evaluation batch size
    'gradient_accumulation_steps': 1,
    'learning_rate': 4e-5,           # Controls how fast model weights are updated
    'save_steps': 1000,

    #'wandb_kwargs': {'name': f'{model_type}-{model_name}'},
    'evaluate_during_training': True,        # Perform evaluation while training the model
    'evaluate_during_training_steps': 1000,  # Perform evaluation at every specified number of steps
    "save_model_every_epoch": False,         # Save a model checkpoint at the end of every epoch.
    'overwrite_output_dir': True,            # Overwrite existing saved models in same directory
    'no_cache': True,                        # No cache features to disk
    'use_multiprocessing': True,             # use multiprocessing when converting data into features

    'use_early_stopping': True,       # Early stopping technique to prevent overfitting
    'early_stopping_patience': 5,     # Terminate if the metric does not improve for 5 consecutive evaluations
    'early_stopping_delta': 0.01,     # amount the evaluation metric needs to improve to be considered better
    # Early stopping watches the macro F1. The metric is referenced by the
    # name it is given when training is started (keyword `f1=`), not by the
    # function object, which also keeps the arguments JSON-serializable.
    'early_stopping_metric': 'f1',
    'early_stopping_metric_minimize': False,  # F1 is better when higher, so it must be maximized
    'manual_seed': 7,                 # Ensure results can be reproduced
    'weight_decay': 0.001,            # Adds L2 penalty (low due to limited dataset)
}





In [None]:

# Build the uncased BERT base classifier with the training arguments defined above
model = ClassificationModel(
    'bert',
    'bert-base-uncased',
    num_labels=num_labels,               # taken from the label encoder instead of a hard-coded 15
    args=train_args,
    use_cuda=torch.cuda.is_available(),  # use a GPU when one is available, otherwise run on CPU
)

# Fine-tune on the training split and evaluate on the held-out split during
# training; the extra keyword makes the macro F1 available under the name 'f1'
model.train_model(train_df, eval_df=eval_df, f1=f1_multiclass)

  0%|          | 0/34596 [00:00<?, ?it/s]

In [None]:
# Evaluate the trained model on the test input; the macro F1 is computed as an
# additional metric named 'f1'
result, model_outputs, wrong_predictions = model.eval_model(
    test_transf,
    f1=f1_multiclass,
)

## 4. Write away results

In [None]:
# Save the results table as CSV (header row included, index column omitted).
# NOTE(review): no visible cell adds rows to results_all_df, so this may write
# only the header — confirm the evaluation scores are stored before saving.
results_all_df.to_csv(
    './Output/Model performance/results_transformers.csv',
    header=True,
    index=False,
)

In [None]:
# Write the dictionary with the best parameters away as JSON.
# `json` is imported here because no other visible cell imports it.
# NOTE(review): the target is named 'embeddings.json' although this notebook
# covers transformers, and no visible cell fills best_params_dict — confirm
# this does not overwrite the parameters saved by another notebook.
import json

with open('./Output/parameters/embeddings.json', 'w') as file:
    json.dump(best_params_dict, file)