## **Connect to Google Drive:**

In [1]:
# Mount Google Drive inside the Colab filesystem so that files stored on
# Drive can be reached under the mount point. Mounting prompts for an
# authorization code (see the cell output).
from google.colab import drive

drive_mount_point = '/content/gdrive'
drive.mount(drive_mount_point)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


## **Change working directory:**

In [0]:
import os

# Later cells use relative paths ("./", 'resources_best_config/...'), so make
# the project folder on the mounted Drive the current working directory.
project_dir = '/content/gdrive/My Drive/flair'
os.chdir(project_dir)

In [3]:
pip install flair

Collecting flair
[?25l  Downloading https://files.pythonhosted.org/packages/4e/3a/2e777f65a71c1eaa259df44c44e39d7071ba8c7780a1564316a38bf86449/flair-0.4.2-py3-none-any.whl (136kB)
[K     |████████████████████████████████| 143kB 3.4MB/s 
Collecting segtok>=1.5.7 (from flair)
  Downloading https://files.pythonhosted.org/packages/1d/59/6ed78856ab99d2da04084b59e7da797972baa0efecb71546b16d48e49d9b/segtok-1.5.7.tar.gz
Collecting pytorch-pretrained-bert>=0.6.1 (from flair)
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |████████████████████████████████| 133kB 48.8MB/s 
Collecting mpld3==0.3 (from flair)
[?25l  Downloading https://files.pythonhosted.org/packages/91/95/a52d3a83d0a29ba0d6898f6727e9858fe7a43f6c2ce81a5fe7e05f0f4912/mpld3-0.3.tar.gz (788kB)
[K     |████████████████████████████████| 798kB 54.5MB/s 
[?25hCollecting sqlitedict>=1.6.0 (from

## **Load train and test data into a corpus:**

In [4]:
# imports
from typing import List

from flair.data import Corpus
from flair.datasets import ColumnCorpus

# Tag type the tagger will be trained to predict.
tag_type = 'ner'

# Column layout of the annotated files: column 0 holds the token text,
# column 1 the "gold standard" NER annotation.
columns = {0: 'text', 1: 'ner'}

# Folder that contains train.txt and test.txt (the current working directory).
data_folder = "./"

# Fraction of the data to keep: 1.0 is the full data; try a much smaller
# number such as 0.01 for a quick test run of the code.
downsample = 1

# 1. Read the column-formatted files. No dev file is supplied; the run output
#    shows that flair still reports a dev split for the corpus.
full_corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train.txt',
    test_file='test.txt',
    dev_file=None,
)

# 2. Apply the requested downsampling and show the resulting split sizes.
corpus: Corpus = full_corpus.downsample(downsample)
print(corpus)

# 3. Build the tag dictionary for the chosen tag type from the corpus.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)



2019-06-19 04:45:03,630 Reading data from .
2019-06-19 04:45:03,631 Train: train.txt
2019-06-19 04:45:03,632 Dev: None
2019-06-19 04:45:03,641 Test: test.txt
Corpus: 6440 train + 716 dev + 1956 test sentences
[b'<unk>', b'O', b'-', b'B-Companies', b'L-Companies', b'B-College', b'I-College', b'L-College', b'U-Companies', b'I-Companies', b'B-Degree', b'I-Degree', b'L-Degree', b'U-Degree', b'<START>', b'<STOP>']


# **Train model with StackedEmbeddings**

## **Training with current best configuration for NER:**

In [5]:
from flair.embeddings import (
    FlairEmbeddings,
    PooledFlairEmbeddings,
    StackedEmbeddings,
    TokenEmbeddings,
    WordEmbeddings,
)
from flair.models import SequenceTagger

# 4. Initialize embeddings. Experiment with different embedding types to see
#    what gets the best results. The stack is: GloVe word embeddings followed
#    by pooled contextual string embeddings (forward, then backward), both
#    using 'min' pooling.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('glove'),
    *(
        PooledFlairEmbeddings(language_model, pooling='min')
        for language_model in ('news-forward', 'news-backward')
    ),
]

# Combine the individual embeddings into a single stacked embedding.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. Initialize the sequence tagger on top of the stacked embeddings, using
#    the tag dictionary and tag type prepared in the corpus cell.
tagger: SequenceTagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)


2019-06-19 04:45:20,851 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim.vectors.npy not found in cache, downloading to /tmp/tmpb3s987wg


100%|██████████| 160000128/160000128 [00:08<00:00, 19076655.02B/s]

2019-06-19 04:45:29,823 copying /tmp/tmpb3s987wg to cache at /root/.flair/embeddings/glove.gensim.vectors.npy





2019-06-19 04:45:30,024 removing temp file /tmp/tmpb3s987wg
2019-06-19 04:45:30,520 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim not found in cache, downloading to /tmp/tmptxesyxei


100%|██████████| 21494764/21494764 [00:01<00:00, 11742578.81B/s]

2019-06-19 04:45:32,879 copying /tmp/tmptxesyxei to cache at /root/.flair/embeddings/glove.gensim
2019-06-19 04:45:32,902 removing temp file /tmp/tmptxesyxei



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


2019-06-19 04:45:34,522 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4.1/big-news-forward--h2048-l1-d0.05-lr30-0.25-20/news-forward-0.4.1.pt not found in cache, downloading to /tmp/tmpfml4vjat


100%|██████████| 73034624/73034624 [00:04<00:00, 17070448.24B/s]

2019-06-19 04:45:39,350 copying /tmp/tmpfml4vjat to cache at /root/.flair/embeddings/news-forward-0.4.1.pt





2019-06-19 04:45:39,435 removing temp file /tmp/tmpfml4vjat
2019-06-19 04:45:47,482 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4.1/big-news-backward--h2048-l1-d0.05-lr30-0.25-20/news-backward-0.4.1.pt not found in cache, downloading to /tmp/tmpu5kdp8ez


100%|██████████| 73034575/73034575 [00:04<00:00, 16638564.14B/s]

2019-06-19 04:45:52,437 copying /tmp/tmpu5kdp8ez to cache at /root/.flair/embeddings/news-backward-0.4.1.pt





2019-06-19 04:45:52,515 removing temp file /tmp/tmpu5kdp8ez


In [0]:
from flair.trainers import ModelTrainer

# 6. Initialize the trainer with the tagger and the corpus.
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# 7. Start training. Results are written under `output_dir`.
#    learning_rate and mini_batch_size are not passed, so the trainer's own
#    defaults apply; add them as keyword arguments here to tune them.
output_dir = 'resources_best_config/taggers/resume-ner'
trainer.train(output_dir, max_epochs=150)


2019-06-19 04:46:02,569 ----------------------------------------------------------------------------------------------------
2019-06-19 04:46:02,574 Evaluation method: MICRO_F1_SCORE
2019-06-19 04:46:03,757 ----------------------------------------------------------------------------------------------------
train mode resetting embeddings
train mode resetting embeddings
2019-06-19 04:46:07,572 epoch 1 - iter 0/202 - loss 92.47586060
2019-06-19 04:46:26,688 epoch 1 - iter 20/202 - loss 10.18699500
2019-06-19 04:46:45,654 epoch 1 - iter 40/202 - loss 6.51050393
2019-06-19 04:47:02,467 epoch 1 - iter 60/202 - loss 5.43989021
2019-06-19 04:47:23,452 epoch 1 - iter 80/202 - loss 4.61708057
2019-06-19 04:47:43,798 epoch 1 - iter 100/202 - loss 4.14494569
2019-06-19 04:48:00,713 epoch 1 - iter 120/202 - loss 3.80493010
2019-06-19 04:48:21,010 epoch 1 - iter 140/202 - loss 3.54395373
2019-06-19 04:48:42,543 epoch 1 - iter 160/202 - loss 3.33023852
2019-06-19 04:49:05,868 epoch 1 - iter 180/202 

## **Plot loss and weights history:**

In [0]:
from flair.visual.training_curves import Plotter

# 8. Plot training curves (optional), reading the logs from the directory
#    that the training cell was pointed at.
loss_log = './resources_best_config/taggers/resume-ner/loss.tsv'
weights_log = './resources_best_config/taggers/resume-ner/weights.txt'

plotter = Plotter()
plotter.plot_training_curves(loss_log)
plotter.plot_weights(weights_log)