<a href="https://colab.research.google.com/github/buseskorkmaz/Sentiment-Analysis-with-Deep-Learning/blob/main/flair.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install flair
!pip install allennlp==0.9.0

Collecting allennlp==0.9.0
[?25l  Downloading https://files.pythonhosted.org/packages/bb/bb/041115d8bad1447080e5d1e30097c95e4b66e36074277afce8620a61cee3/allennlp-0.9.0-py3-none-any.whl (7.6MB)
[K     |████████████████████████████████| 7.6MB 2.7MB/s 
Collecting word2number>=1.1
  Downloading https://files.pythonhosted.org/packages/4a/29/a31940c848521f0725f0df6b25dca8917f13a2025b0e8fcbe5d0457e45e6/word2number-1.1.zip
Collecting flaky
  Downloading https://files.pythonhosted.org/packages/43/0e/2f50064e327f41a1eb811df089f813036e19a64b95e33f8e9e0b96c2447e/flaky-3.7.0-py2.py3-none-any.whl
Collecting parsimonious>=0.8.0
[?25l  Downloading https://files.pythonhosted.org/packages/02/fc/067a3f89869a41009e1a7cdfb14725f8ddd246f30f63c645e8ef8a1c56f4/parsimonious-0.8.1.tar.gz (45kB)
[K     |████████████████████████████████| 51kB 6.4MB/s 
Collecting pytorch-transformers==1.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/50/89/ad0d6bb932d0a51793eaabcf1617a36ff530dc9ab9e38f765a35dc2

In [None]:
import pandas as pd
import tqdm
import numpy as np

In [None]:
# Load the scored headlines and shuffle them. The fixed seed makes the shuffle,
# and therefore the train/test/dev split written below, reproducible on rerun.
df = (
    pd.read_json('financial_news_scored.json', orient='records', encoding='utf-8')
    .sample(frac=1, random_state=42)
)
# Remove only a literal leading "BRIEF-" tag. str.lstrip('BRIEF-') treats its
# argument as a *set of characters* and keeps stripping any leading B, R, I, E,
# F or '-', which truncated headlines such as "Fitch ..." to "itch ...".
df['text'] = df['text'].str.replace(r'^BRIEF-', '', regex=True)
df = df.set_index('index')
# Optional lowercasing (useful if the model was trained on lowercased text).
df['text'] = df['text'].str.lower()
# Build the label token from the numeric score, e.g. "__label__-1".
df['label'] = '__label__' + df['score'].astype(str)
df = df.drop(columns=['versionCreated', 'storyId', 'sourceCode', 'storyText', 'score'])
# Move the last column (the label added above) to the front so that every
# written line starts with its label.
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]
df = df[cols]

# 80% train / 10% test / 10% dev, taken in order from the shuffled frame.
# The files use the .txt names that the ClassificationCorpus cell loads;
# writing them as .csv left that cell without its input files on a fresh run.
train_end = int(len(df) * 0.8)
test_end = int(len(df) * 0.9)
df.iloc[:train_end].to_csv('train.txt', sep='\t', index=False, header=False)
df.iloc[train_end:test_end].to_csv('test.txt', sep='\t', index=False, header=False)
df.iloc[test_end:].to_csv('dev.txt', sep='\t', index=False, header=False)

df

Unnamed: 0_level_0,label,text
index,Unnamed: 1_level_1,Unnamed: 2_level_1
1588242145000,__label__1,va expects any potential future provisions aga...
1591886915578,__label__-1,itch revises anadolubank's outlook to negative...
1591286414000,__label__-1,akfen reit q1 net loss widens to 60.6 million ...
1591451123000,__label__-1,south africa's mtn faces new allegations in u....
1591770119000,__label__-1,global yatirim holding q1 net loss increases t...
...,...,...
1598690940000,__label__1,update 5-uae scraps israel boycott in new step...
1596037421000,__label__-1,marti reit q2 net result swings to loss of 11....
1598886714613,__label__-1,itch maintains negative rating watch on global...
1590672532000,__label__1,eysas reit rents storage facility at 2.7 mln l...


In [None]:
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path
from flair.data import Corpus
from flair.datasets import ClassificationCorpus
from flair.embeddings import TransformerDocumentEmbeddings,TransformerWordEmbeddings
from flair.embeddings import BertEmbeddings, ELMoEmbeddings

# Directory that holds the train, test and dev split files.
data_folder = '/content/'

# Build the classification corpus from the three split files in data_folder.
corpus: Corpus = ClassificationCorpus(
    data_folder,
    train_file='train.txt',
    test_file='test.txt',
    dev_file='dev.txt',
)

# Report the number of Sentences per split, in train / test / dev order.
for split in (corpus.train, corpus.test, corpus.dev):
    print(len(split))


2020-09-13 15:01:25,612 Reading data from /content
2020-09-13 15:01:25,614 Train: /content/train.txt
2020-09-13 15:01:25,620 Dev: /content/dev.txt
2020-09-13 15:01:25,622 Test: /content/test.txt
586
73
74


In [None]:
from flair.embeddings import StackedEmbeddings

# --- Alternative embedding setups, kept disabled for reference -------------
#
# Option A: stack a contextual word embedding with the Flair news embeddings.
#   init BERT base (uncased)
#     optional_embedding = BertEmbeddings('bert-base-uncased')
#   OR init ELMo (original)
#     optional_embedding = ELMoEmbeddings('original')
#
#     word_embeddings = [
#         optional_embedding,
#         FlairEmbeddings('news-forward'),
#         FlairEmbeddings('news-backward')]
#
# Option B: plain GloVe word embeddings.
#     word_embeddings = [WordEmbeddings('glove')]
#
# Either word-embedding list can be pooled into a document embedding with:
#     document_embeddings = DocumentRNNEmbeddings(
#         word_embeddings,
#         hidden_size=512,
#         reproject_words=True,
#         reproject_words_dimension=256
#     )
# ----------------------------------------------------------------------------

# Active setup: DistilBERT (uncased) document embeddings with fine-tuning enabled.
document_embeddings = TransformerDocumentEmbeddings(
    'distilbert-base-uncased',
    fine_tune=True,
)



In [None]:
# Every headline carries exactly one sentiment label (-1, 0 or 1), so this is a
# single-label, multi-class problem. multi_label=True would configure the model
# for independent per-label decisions, allowing several or no labels to be
# predicted for a headline, which does not match the data.
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=corpus.make_label_dictionary(),
    multi_label=False,
)

# Train for 5 epochs with mini-batches of 32; checkpoints and logs are written
# to the current directory.
trainer = ModelTrainer(classifier, corpus)
trainer.train('./', max_epochs=5, mini_batch_size=32)

2020-09-13 15:37:41,675 Computing label dictionary. Progress:


100%|██████████| 659/659 [00:00<00:00, 801.78it/s] 

2020-09-13 15:37:43,252 [b'0', b'-1', b'1']
2020-09-13 15:37:43,269 ----------------------------------------------------------------------------------------------------
2020-09-13 15:37:43,271 Model: "TextClassifier(
  (document_embeddings): TransformerDocumentEmbeddings(
    (model): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0): TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, b




2020-09-13 15:37:44,804 epoch 1 - iter 1/19 - loss 0.69793844 - samples/sec: 38.94 - lr: 0.100000
2020-09-13 15:37:45,535 epoch 1 - iter 2/19 - loss 0.36785468 - samples/sec: 44.65 - lr: 0.100000
2020-09-13 15:37:46,292 epoch 1 - iter 3/19 - loss 0.24853520 - samples/sec: 42.50 - lr: 0.100000
2020-09-13 15:37:46,981 epoch 1 - iter 4/19 - loss 0.19249319 - samples/sec: 47.05 - lr: 0.100000
2020-09-13 15:37:47,690 epoch 1 - iter 5/19 - loss 0.15664401 - samples/sec: 45.24 - lr: 0.100000
2020-09-13 15:37:48,430 epoch 1 - iter 6/19 - loss 0.13338151 - samples/sec: 43.32 - lr: 0.100000
2020-09-13 15:37:49,257 epoch 1 - iter 7/19 - loss 0.11637191 - samples/sec: 41.02 - lr: 0.100000
2020-09-13 15:37:49,905 epoch 1 - iter 8/19 - loss 0.10350685 - samples/sec: 49.51 - lr: 0.100000
2020-09-13 15:37:50,572 epoch 1 - iter 9/19 - loss 0.09351362 - samples/sec: 48.05 - lr: 0.100000
2020-09-13 15:37:51,296 epoch 1 - iter 10/19 - loss 0.08490687 - samples/sec: 44.35 - lr: 0.100000
2020-09-13 15:37:51

{'dev_loss_history': [0.829311728477478,
  0.8733912706375122,
  0.9419206380844116,
  0.7649868726730347,
  0.9187461733818054],
 'dev_score_history': [0.6575, 0.6986, 0.7123, 0.6667, 0.7027],
 'test_score': 0.7534,
 'train_loss_history': [0.051038811448961496,
  0.006611294591015107,
  0.0029970998324355797,
  0.020630335741627374,
  0.011562082830718473]}