<a href="https://colab.research.google.com/github/abs-git/NLP/blob/main/Tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/gdrive (the google.colab module is only available on Colab).
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Load the GLUE train/test sentence files (one sentence per line) from the mounted Drive.

path = '/content/gdrive/MyDrive/Colab Notebooks/NLP'


def read_sentences(file_path):
  """Read a text file and return its lines with the trailing newline removed.

  Args:
    file_path: path of the text file to read.

  Returns:
    A list with one string per line of the file, in file order.

  The file is decoded explicitly as UTF-8 so the result does not depend on the
  default encoding of the machine running the notebook.
  """
  with open(file_path, 'r', encoding='utf-8') as f:
    # Only the newline is stripped; any other trailing whitespace is kept as-is.
    return [line.rstrip("\n") for line in f]


train_sentences = read_sentences(path + '/glue_train.txt')
test_sentences = read_sentences(path + '/glue_test.txt')

print("train : {}".format(len(train_sentences)))
print("test : {}".format(len(test_sentences)))

# Peek at the first two sentences of each split as a sanity check.
print(train_sentences[:2])
print(test_sentences[:2])


train : 8551
test : 1063
["Our friends won't buy this analysis, let alone the next one we propose.", "One more pseudo generalization and I'm giving up."]
['Bill whistled past the house.', 'The car honked its way down the road.']


In [None]:
# Build the sample: split each of the first two training sentences on single spaces.
# Punctuation is not separated, so it stays attached to its word (e.g. "analysis,").
sample_sentences = [sen.split(" ") for sen in train_sentences[:2]]


In [None]:
# Download the NLTK 'treebank' corpus package (imported as nltk.corpus.treebank
# by the Part of Speech Tagging cell).
import nltk
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [None]:
# Part of Speech Tagging

from nltk.corpus import treebank
# from nltk.corpus.reader import bnc

from nltk.tag.sequential import DefaultTagger 
from nltk.tag.sequential import UnigramTagger
from nltk.tag.sequential import BigramTagger, TrigramTagger


# Training data: the first 3000 tagged sentences of NLTK's treebank corpus
# (Penn Treebank tagset, 45 categories).
treebank_corpus = treebank.tagged_sents()[:3000]
# bnc_corpus = bnc.tagged_sents()[:3000]   # British National Corpus tagset (98 categories)

# Every tagger below is demonstrated on the first sample sentence.
first_sentence = sample_sentences[0]

# --- build the taggers ---
# Default tagger: assigns the same POS tag ('EX') to every token.
init_tagger = DefaultTagger('EX')
# Plain unigram tagger trained on the treebank sentences.
uni_tagger = UnigramTagger(treebank_corpus)
# Unigram tagger with backoff: tokens it could not tag receive the default tagger's value.
uni_backoff_tagger = UnigramTagger(treebank_corpus, backoff=init_tagger)
# Bigram tagger: refers to the tag of the token before the current one.
bi_tagger = BigramTagger(treebank_corpus)
# Trigram tagger: refers to the tags of the two tokens before the current one.
tri_tagger = TrigramTagger(treebank_corpus)

# --- tag the sentence with each tagger ---
init_tag = init_tagger.tag(first_sentence)
uni_tag = uni_tagger.tag(first_sentence)
uni_backoff_tag = uni_backoff_tagger.tag(first_sentence)
bi_tag = bi_tagger.tag(first_sentence)
tri_tag = tri_tagger.tag(first_sentence)

# --- report: one line per tagger, labels padded so the colons line up ---
report = (
  ('raw sentence     : {}', first_sentence),
  ('init tag         : {}', init_tag),
  ('unigram tag      : {}', uni_tag),
  ('uni backoff tag  : {}', uni_backoff_tag),
  ('bigram tag       : {}', bi_tag),
  ('trigram tag      : {}', tri_tag),
)
for template, value in report:
  print(template.format(value))
print()

# print(init_tagger.evaluate(sample_sentences[1]))


raw sentence     : ['Our', 'friends', "won't", 'buy', 'this', 'analysis,', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose.']
init tag         : [('Our', 'EX'), ('friends', 'EX'), ("won't", 'EX'), ('buy', 'EX'), ('this', 'EX'), ('analysis,', 'EX'), ('let', 'EX'), ('alone', 'EX'), ('the', 'EX'), ('next', 'EX'), ('one', 'EX'), ('we', 'EX'), ('propose.', 'EX')]
unigram tag      : [('Our', None), ('friends', 'NNS'), ("won't", None), ('buy', 'VB'), ('this', 'DT'), ('analysis,', None), ('let', 'VB'), ('alone', 'RB'), ('the', 'DT'), ('next', 'JJ'), ('one', 'CD'), ('we', 'PRP'), ('propose.', None)]
uni backoff tag  : [('Our', 'EX'), ('friends', 'NNS'), ("won't", 'EX'), ('buy', 'VB'), ('this', 'DT'), ('analysis,', 'EX'), ('let', 'VB'), ('alone', 'RB'), ('the', 'DT'), ('next', 'JJ'), ('one', 'CD'), ('we', 'PRP'), ('propose.', 'EX')]
bigram tag       : [('Our', None), ('friends', None), ("won't", None), ('buy', None), ('this', None), ('analysis,', None), ('let', None), ('alone', None), ('the

In [None]:
def backoff_tagger(train_sent, tagger_classes, backoff = None):
  """Build a chain of taggers in which each one backs off to the previous one.

  Args:
    train_sent: training data handed unchanged to every tagger class.
    tagger_classes: iterable of callables invoked as ``cls(train_sent, backoff=...)``,
      given in order from the innermost fallback to the outermost tagger.
    backoff: fallback for the first tagger in the chain (default: None).

  Returns:
    The instance created from the last entry of ``tagger_classes``; when
    ``tagger_classes`` is empty, ``backoff`` itself is returned.
  """
  chain = backoff
  for tagger_cls in tagger_classes:
    # Wrap the chain built so far: the new tagger uses it as its backoff.
    chain = tagger_cls(train_sent, backoff=chain)
  return chain

# Chain order: unigram -> bigram -> trigram, so the trigram tagger is the outermost one
# and DefaultTagger('NN') is the final fallback.
tagger_chain = [UnigramTagger, BigramTagger, TrigramTagger]
tagger = backoff_tagger(treebank_corpus, tagger_chain, backoff = DefaultTagger('NN'))

backoff_tag = tagger.tag(sample_sentences[0])
# tag_score = tagger.evaluate(sample_sentences[0:])

print('backoff tag : {}'.format(backoff_tag))
# print('accuracy    : {}'.format(tag_score))

backoff tag : [('Our', 'NN'), ('friends', 'NNS'), ("won't", 'NN'), ('buy', 'VB'), ('this', 'DT'), ('analysis,', 'NN'), ('let', 'VBD'), ('alone', 'RB'), ('the', 'DT'), ('next', 'JJ'), ('one', 'CD'), ('we', 'PRP'), ('propose.', 'NN')]


In [None]:
# Named Entity Recognition
# Download the NLTK resources for this section. Presumably pos_tag needs the
# averaged perceptron tagger model and ne_chunk needs the maxent chunker model
# plus the 'words' corpus -- TODO confirm against the NLTK documentation.
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

In [None]:
from nltk import pos_tag, ne_chunk

# POS-tag the first sample sentence; tagging it with a different tagger would work as well.
tagged_sentence = pos_tag(sample_sentences[0])

# Chunk the POS-tagged tokens with ne_chunk.
ner_sentence = ne_chunk(tagged_sentence)

# Show the tagged tokens first, then the chunk result, each on its own line.
print(tagged_sentence, ner_sentence, sep="\n")


[('Our', 'PRP$'), ('friends', 'NNS'), ("won't", 'VBP'), ('buy', 'VB'), ('this', 'DT'), ('analysis,', 'JJ'), ('let', 'VB'), ('alone', 'RB'), ('the', 'DT'), ('next', 'JJ'), ('one', 'NN'), ('we', 'PRP'), ('propose.', 'VBP')]
(S
  Our/PRP$
  friends/NNS
  won't/VBP
  buy/VB
  this/DT
  analysis,/JJ
  let/VB
  alone/RB
  the/DT
  next/JJ
  one/NN
  we/PRP
  propose./VBP)
