In [1]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
from sklearn.metrics import accuracy_score
import logging

In [2]:
# Keep the "transformers" logger at WARNING while everything else logs at INFO.
Transformerslogger = logging.getLogger("transformers")
Transformerslogger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

In [3]:
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
file_path = "/Users/aaq1bm/AaqibMirzaTests/financialData.csv"
# Read the raw CSV, decoding it as latin1.
review = pd.read_csv(filepath_or_buffer=file_path, encoding="latin1")

In [4]:
# Work on an independent copy: pd.DataFrame(review) does not copy by default, so
# columns added to train_df later could otherwise show up on `review` as well
# (depending on the pandas version).
train_df = review.copy()

In [5]:
# Integer class id for each sentiment name: negative=0, neutral=1, positive=2.
sentiment = dict(positive=2, neutral=1, negative=0)

In [6]:
# Encode each row's sentiment name as its integer class id.
train_df['labels'] = train_df['Sentiments'].map(sentiment)
# .map() silently produces NaN for any value missing from `sentiment`;
# fail here with the offending values instead of deep inside training.
if train_df['labels'].isna().any():
    raise ValueError(
        "Unmapped values in 'Sentiments': "
        f"{sorted(set(train_df.loc[train_df['labels'].isna(), 'Sentiments'].astype(str)))}"
    )

In [7]:
# Keep only the text column and its integer label.
train_df = train_df.loc[:, ["Reviews", "labels"]]

In [8]:
# Display the prepared frame: review text plus its integer label.
train_df

Unnamed: 0,Reviews,labels
0,"According to Gran , the company has no plans t...",1
1,Technopolis plans to develop in stages an area...,1
2,The international electronic industry company ...,0
3,With the new production plant the company woul...,2
4,According to the company 's updated strategy f...,2
...,...,...
4841,LONDON MarketWatch -- Share prices ended lower...,0
4842,Rinkuskiai 's beer sales fell by 6.5 per cent ...,1
4843,Operating profit fell to EUR 35.4 mn from EUR ...,0
4844,Net sales of the Paper segment decreased to EU...,0


In [9]:
# Randomly sample 500 rows as the evaluation set; the fixed random_state makes the draw repeatable.
eval_df = train_df.sample(
    random_state=5,
    n=500,
)

In [10]:
# Display the sampled evaluation rows (they keep their original index labels).
eval_df 

Unnamed: 0,Reviews,labels
1434,The training modules aim at strengthening the ...,2
3546,"By the end of 2006 , the number of joint branc...",1
3196,The company reiterates its outlook for 2009 .,1
1521,Exel wants to serve its industrial customers w...,1
795,`` Our customers now have the chance to make b...,2
...,...,...
2435,"Digia will also set up two subsidiaries , Digi...",1
3027,Mercator will use the software for its logisti...,1
2696,"Together with Latvia , Cramo will operate 54 r...",1
3000,KESKO FOOD LTD PRESS RELEASE 04.01.2006 AT 13....,2


In [11]:
# Directory handed to the model as its 'output_dir'.
# NOTE(review): absolute, machine-specific path.
output_dir_model1 = '/Users/aaq1bm/AaqibMirzaTests/outputs1'

In [12]:
# Build a 'bert-base-cased' classifier with three output labels (one per
# sentiment class), running on CPU (use_cuda=False).
model = ClassificationModel(
    'bert',
    'bert-base-cased',
    num_labels=3,
    use_cuda=False,
    args={
        'output_dir': output_dir_model1,
        # train_model() raises if output_dir already exists and is non-empty,
        # so without this flag a second "Run All" fails at the training cell.
        # Note: this overwrites whatever a previous run saved there.
        'overwrite_output_dir': True,
    },
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
def compute_metrics(self, preds, model_outputs, labels, eval_examples=None, **kwargs):
    """Score predictions by accuracy and report which rows were misclassified.

    Written to be bound onto the model in place of its own ``compute_metrics``.

    Args:
        self: The object this function is bound to (unused).
        preds: Predicted class ids, one per evaluated row.
        model_outputs: Raw model outputs; only passed to ``prob_*`` metrics.
        labels: True class ids, aligned with ``preds``.
        eval_examples: Accepted for signature compatibility; unused.
        **kwargs: Optional extra metrics given as ``name=callable``. Each
            callable is invoked as ``func(labels, preds)``, or as
            ``func(labels, model_outputs)`` when ``name`` starts with
            ``prob_``. Non-callable values are ignored, and a metric named
            ``accuracy`` does not replace the built-in one.

    Returns:
        tuple: ``(result, wrong_predictions)``. ``result`` maps metric names
        to values and always contains ``'accuracy'``. ``wrong_predictions``
        lists the 0-based positions in ``preds`` where the prediction differs
        from the label; these are positions, not DataFrame index labels.
    """
    result = {'accuracy': accuracy_score(labels, preds)}
    for name, func in kwargs.items():
        # Skip anything that is not a metric function, and never clobber the
        # built-in accuracy entry.
        if not callable(func) or name in result:
            continue
        scored_on = model_outputs if name.startswith("prob_") else preds
        result[name] = func(labels, scored_on)
    wrong_predictions = [
        position
        for position, (pred, label) in enumerate(zip(preds, labels))
        if pred != label
    ]
    return result, wrong_predictions

In [14]:
import os

In [15]:
# Set TOKENIZERS_PARALLELISM=false for this process.
# NOTE(review): presumably read by the Hugging Face tokenizers library to disable
# its parallelism (and the related fork warning) — confirm.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [16]:
# Bind the notebook's compute_metrics to this model instance so it is called
# as a method (with `self` = model) instead of the model's own implementation.
setattr(model, "compute_metrics", compute_metrics.__get__(model, type(model)))

In [17]:
# Remove the evaluation rows from the training frame. errors="ignore" keeps the
# cell idempotent: re-running it after the rows are already gone is a no-op
# instead of a KeyError.
train_df = train_df.drop(index=eval_df.index, errors="ignore")
# The training and evaluation sets must not share any row.
assert train_df.index.intersection(eval_df.index).empty, "train/eval rows overlap"

In [18]:
# Display the training frame after the evaluation rows were removed.
train_df

Unnamed: 0,Reviews,labels
1,Technopolis plans to develop in stages an area...,1
2,The international electronic industry company ...,0
3,With the new production plant the company woul...,2
4,According to the company 's updated strategy f...,2
5,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...,2
...,...,...
4840,HELSINKI Thomson Financial - Shares in Cargote...,0
4842,Rinkuskiai 's beer sales fell by 6.5 per cent ...,1
4843,Operating profit fell to EUR 35.4 mn from EUR ...,0
4844,Net sales of the Paper segment decreased to EU...,0


In [19]:
# Train the classifier on the training rows; the value returned by
# train_model() is shown as this cell's output.
model.train_model(train_df)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/8 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_bert_128_3_2


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 1 of 1:   0%|          | 0/544 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to /Users/aaq1bm/AaqibMirzaTests/outputs1.


(544, 0.5463212599484798)

In [20]:
# Evaluate on the held-out rows. eval_model() returns three values: the metrics
# dict, the raw model outputs, and the wrong-prediction list from compute_metrics.
(
    result,
    model_outputs,
    wrong_predictions,
) = model.eval_model(eval_df)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/1 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_128_3_2


Running Evaluation:   0%|          | 0/5 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'accuracy': 0.87, 'eval_loss': 0.3477988332509995}


In [21]:
# Print a heading, then the raw model outputs on the following line.
print("model_outputs :", model_outputs, sep="\n")

model_outputs :
[[-3.12797761  0.76044965  1.66899347]
 [-2.390769    2.92753744 -1.40381777]
 [-2.79404759  1.96544898 -0.02210679]
 ...
 [-3.05289769  2.50113225 -0.28450882]
 [-3.18722177  1.1298157   1.40818155]
 [-3.08081627  1.07964027  1.09790385]]


In [22]:
# Print a heading, then the positions of the misclassified evaluation rows.
print("wrong predictions :", wrong_predictions, sep="\n")

wrong predictions :
[4, 22, 27, 28, 42, 50, 58, 59, 61, 62, 67, 68, 72, 80, 93, 112, 113, 117, 120, 128, 133, 141, 148, 150, 152, 156, 172, 183, 186, 193, 195, 197, 204, 217, 221, 226, 234, 242, 262, 267, 278, 300, 313, 318, 341, 359, 366, 367, 369, 372, 382, 388, 404, 407, 419, 421, 422, 424, 426, 447, 471, 474, 480, 482, 485]


In [23]:
# Print a heading, then the evaluation metrics dict.
print(" result :", result, sep="\n")

 result :
{'accuracy': 0.87, 'eval_loss': 0.3477988332509995}
