<a href="https://colab.research.google.com/github/arunasen/NLP/blob/main/flair.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install Flair

In [None]:
pip install flair

Approach 1: initial setup of character dictionary from corpus


In [None]:
from flair.data import Dictionary

# New Flair dictionary; the characters found in the corpus are added to it
# further down in this cell.
char_dictionary: Dictionary = Dictionary()

import collections
import glob


def count_characters(paths, max_chars_per_file=None):
    """Count how often each character occurs in a set of UTF-8 text files.

    Args:
        paths: Iterable of file paths to read. Each path is printed as it is
            opened so progress is visible in the notebook output.
        max_chars_per_file: Optional cap. Once more than this many characters
            have been read from a file, the rest of that file is skipped.
            ``None`` (the default) reads every file completely.

    Returns:
        A ``(char_counts, lines_read, chars_read)`` tuple: a
        ``collections.Counter`` keyed by character (line terminators
        included), the number of lines read, and the number of characters
        counted.
    """
    char_counts = collections.Counter()
    lines_read = 0
    chars_read = 0
    for path in paths:
        print(path)
        with open(path, 'r', encoding='utf-8') as f:
            chars_in_file = 0
            for line in f:
                lines_read += 1
                chars_in_file += len(line)
                # A str already iterates character by character, so no
                # intermediate list is needed.
                char_counts.update(line)
                if max_chars_per_file is not None and chars_in_file > max_chars_per_file:
                    break
        chars_read += chars_in_file
    return char_counts, lines_read, chars_read


files = glob.glob('/content/drive/MyDrive/corpus/*.*')
print(files)

# To speed things up on a very large corpus, pass a cap, e.g.
# count_characters(files, max_chars_per_file=50000000).
counter, processed, total_count = count_characters(files)

print(total_count)
print(processed)

# Walk the characters from most to least frequent, add each one to the
# dictionary and print a frequency table:
# rank, character, count, running total, cumulative share.
# The running total is not called `sum`, so the builtin stays usable in the
# rest of the notebook session.
cumulative_count = 0
idx = 0
for letter, count in counter.most_common():
    cumulative_count += count
    # Share of all counted characters covered by the entries seen so far;
    # it grows towards 1.0 as rarer characters are reached.
    percentile = cumulative_count / total_count

    # comment this line in to use only top X percentile of chars, otherwise filter later
    # NOTE(review): because `percentile` is cumulative, a cut-off of the form
    # `percentile > 0.99999` may be what is intended here - confirm before enabling.
    # if percentile < 0.00001: break

    char_dictionary.add_item(letter)
    idx += 1
    print(f'{idx:d}\t{letter}\t{count:7d}\t{cumulative_count:7d}\t{percentile:f}')

print(char_dictionary.item2idx)

import pickle

# Persist both lookup tables as one pickled dict with the keys 'idx2item' and
# 'item2idx'.
# The file goes to the location the language-model training cell loads it
# from ('/content/drive/MyDrive/weiboEmbeddingTestRun'); previously it was
# written inside the corpus folder, so that load could not find it.
with open('/content/drive/MyDrive/weiboEmbeddingTestRun', 'wb') as f:
    mappings = {
        'idx2item': char_dictionary.idx2item,
        'item2idx': char_dictionary.item2idx,
    }
    pickle.dump(mappings, f)

Approach 1: Use only a portion of the Weibo news data to train the model (the experiment confirmed that this is not enough data to train a model adequately)


In [None]:
import pickle
from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

# Direction of the language model: True for a forward LM, False for a backward LM.
is_forward_lm = True

# Custom character dictionary, loaded from the pickled mapping file.
dictionary = Dictionary.load_from_file('/content/drive/MyDrive/weiboEmbeddingTestRun')
print(dictionary)

# Character-level corpus, processed in the direction chosen above.
corpus = TextCorpus(
    '/content/drive/MyDrive/corpus',
    dictionary,
    is_forward_lm,
    character_level=True,
)

# Language model with one layer and a hidden size of 128.
language_model = LanguageModel(dictionary, is_forward_lm, hidden_size=128, nlayers=1)

# Train the language model; '/content/drive/MyDrive/customWeibo' is the
# output path handed to the trainer.
trainer = LanguageModelTrainer(language_model, corpus)
trainer.train(
    '/content/drive/MyDrive/customWeibo',
    sequence_length=10,
    mini_batch_size=10,
    max_epochs=10,
)

Approach 2: Set up custom embeddings and train the model


In [None]:
#using custom embeddings which are trained on sufficiently large datasets
#from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import TransformerWordEmbeddings, WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings, BytePairEmbeddings, CharacterEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from flair.data import Corpus
from flair.datasets import ClassificationCorpus
# Classification corpus built from the segmented train/dev/test files.
data_folder = '/content/drive/MyDrive/data'
corpus: Corpus = ClassificationCorpus(
    data_folder,
    train_file='trainSegmented.csv',
    dev_file='devSegmented.csv',
    test_file='testSegmented.csv',
)

#Skipgram
#merge = WordEmbeddings('/content/drive/MyDrive/customEmbedding/merge_sgns_bigram_char300.gensim')
#weibo = WordEmbeddings('/content/drive/MyDrive/customEmbedding/sgns.weibo.bigram-char.gensim')
#baidu = WordEmbeddings('/content/drive/MyDrive/customEmbedding/sgns.baidubaike.bigram-char.gensim')
#literature = WordEmbeddings('/content/drive/MyDrive/customEmbedding/sgns.literature.bigram-char.gensim')
#zhihu = WordEmbeddings('/content/drive/MyDrive/customEmbedding/sgns.zhihu.bigram-char.gensim')
#wiki = WordEmbeddings('/content/drive/MyDrive/customEmbedding/sgns.wiki.bigram-char.gensim')
#renmin = WordEmbeddings('/content/drive/MyDrive/customEmbedding/sgns.renmin.bigram-char.gensim')

#GWE
#weiboEmbeddings_s = WordEmbeddings('/content/drive/MyDrive/customEmbedding/gwe_chr_s.gensim')
#weiboEmbeddings_b = WordEmbeddings('/content/drive/MyDrive/customEmbedding/gwe_chr_b.gensim')
#weiboEmbeddings_m = WordEmbeddings('/content/drive/MyDrive/customEmbedding/gwe_chr_m.gensim')
#weiboEmbeddings_e = WordEmbeddings('/content/drive/MyDrive/customEmbedding/gwe_chr_e.gensim')
#weiboEmbeddings_v = WordEmbeddings('/content/drive/MyDrive/customEmbedding/gwe_vec.gensim')

# JWE vectors (character, component and word files, judging by the file
# names), loaded in that order. Only weiboEmbeddings_word is passed to the
# document embeddings further down.
weiboEmbeddings_char, weiboEmbeddings_comp, weiboEmbeddings_word = (
    WordEmbeddings(vector_file)
    for vector_file in (
        '/content/drive/MyDrive/customEmbedding/JWEchar_vec.gensim',
        '/content/drive/MyDrive/customEmbedding/JWEcomp_vec.gensim',
        '/content/drive/MyDrive/customEmbedding/JWEword_vec.gensim',
    )
)

#RECWE
#output_vec = WordEmbeddings('/content/drive/MyDrive/customEmbedding/output_vec.gensim')
#char_vec = WordEmbeddings('/content/drive/MyDrive/customEmbedding/char_vec.gensim')
#comp_vec = WordEmbeddings('/content/drive/MyDrive/customEmbedding/comp_vec.gensim')
#output_char_vec = WordEmbeddings('/content/drive/MyDrive/customEmbedding/output_char_vec.gensim')

#cw2vec
#substoke_out = WordEmbeddings('/content/drive/MyDrive/customEmbedding/substoke_out.vec.gensim')
#substoke_out_avg = WordEmbeddings('/content/drive/MyDrive/customEmbedding/substoke_out.avg.gensim')

#CWE
#word = WordEmbeddings('/content/drive/MyDrive/customEmbedding/word.gensim')
#char_b = WordEmbeddings('/content/drive/MyDrive/customEmbedding/char_b.gensim')
#char_s = WordEmbeddings('/content/drive/MyDrive/customEmbedding/char_s.gensim')
#char_m = WordEmbeddings('/content/drive/MyDrive/customEmbedding/char_m.gensim')
#char_e = WordEmbeddings('/content/drive/MyDrive/customEmbedding/char_e.gensim')

#fasttext_wiki_embedding = WordEmbeddings('zh') #no good alone
#fasttext_crawl_embedding = WordEmbeddings('zh-crawl') #no good alone
#bert_embedding = TransformerWordEmbeddings('bert-base-chinese') #works well alone
#byte_pair_embedding = BytePairEmbeddings('zh') #weak results alone
#char_embeddings = CharacterEmbeddings()
#custom_embeddings = FlairEmbeddings('/content/drive/MyDrive/customWeibo/best-lm.pt')

# Word-level embeddings fed to the document encoder. Other combination tried:
# [fasttext_wiki_embedding, fasttext_crawl_embedding, bert_embedding, byte_pair_embedding]
word_embeddings = [weiboEmbeddings_word]

# RNN document embeddings (hidden size 512) with word re-projection to 1024 dimensions.
document_embeddings = DocumentRNNEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=1024,
)

# Single-label classifier over the label dictionary derived from the corpus.
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=corpus.make_label_dictionary(),
    multi_label=False,
)

# Train for at most 25 epochs with './' as the output path.
trainer = ModelTrainer(classifier, corpus)
trainer.train('./', max_epochs=25)

Approach 3: TARS Classifier

In [None]:
import flair
from flair.data import Corpus
from flair.datasets import ClassificationCorpus
from flair.models.text_classification_model import TARSClassifier
from flair.trainers import ModelTrainer

#from flair.data_fetcher import NLPTaskDataFetcher
#from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings
#from flair.models import TextClassifier

#from pathlib import Path
#from flair.embeddings import TransformerWordEmbeddings
#from flair.embeddings import BytePairEmbeddings




# Classification corpus built from the train/dev/test files in the data folder.
data_folder = '/content/drive/MyDrive/data'
corpus: Corpus = ClassificationCorpus(
    data_folder,
    train_file='train.csv',
    dev_file='dev.csv',
    test_file='test.csv',
)

# TARS classifier for the task 'tarsChinese', using the labels found in the corpus.
tars = TARSClassifier(
    task_name='tarsChinese',
    label_dictionary=corpus.make_label_dictionary(),
)

trainer = ModelTrainer(tars, corpus)
trainer.train(
    base_path='balancedTARS',  # path to store the model artifacts
    learning_rate=0.02,  # use very small learning rate
    mini_batch_size=16,
    mini_batch_chunk_size=4,  # optionally set this if transformer is too much for your machine
    max_epochs=10,  # terminate after 10 epochs
)


Get predictions at a sentence level for error analysis

In [None]:
from flair.data import Sentence
from flair.models import TextClassifier  # imported here so this cell also runs in a fresh kernel

# Classifier checkpoint to analyse.
model = TextClassifier.load('best-model.pt')

# One entry per non-blank line of the test file: 1 when the predicted label is
# 'real', 0 otherwise.
predictions = []

# The context manager closes the file even if a prediction fails, and the
# explicit encoding makes the read independent of the platform default.
with open('/content/drive/MyDrive/data/testSegmented.csv', encoding='utf-8', newline='') as test_file:
    for row in test_file:
        # Drop the line terminator so it cannot end up inside the last token.
        row = row.rstrip('\r\n')
        if not row:
            # Blank lines (e.g. a trailing newline at the end of the file) carry no sample.
            continue
        # Tab-separated columns; the text to classify is the second column.
        columns = row.split('\t')
        sentence = Sentence(columns[1])
        print(sentence)
        model.predict(sentence)
        # Read the label value directly instead of parsing str(label), whose
        # format is not a stable interface.
        prediction = sentence.get_labels()[0].value
        print(prediction)
        predictions.append(1 if prediction == 'real' else 0)
print(predictions)