# In this chapter, Bya will cover the following recipes:
1. Default tagging
2. Training a unigram part-of-speech tagger 
3. Combining taggers with backoff tagging 
4. Training and combining ngram taggers
5. Creating a model of likely word tags
6. Tagging with regular expressions
7. Affix tagging
8. Training a Brill tagger
9. Training the TnT tagger
10. Using WordNet for tagging
11. Tagging proper names
12. Classifier-based tagging
13. Training a tagger with NLTK-Trainer

**Part-of-speech** tagging is the process of converting a sentence, in the form of a list of words, into a list of tuples, where each tuple is of the form (**word, tag**). The **tag** is a part-of-speech tag, and signi es whether the word is a noun, adjective, verb, and so on.

# 1. Default tagging

In [26]:
from nltk.tag import DefaultTagger
tagger = DefaultTagger('NN')
tagger.tag(['Hello', 'World'])

[('Hello', 'NN'), ('World', 'NN')]

### Evaluating accuracy

In [27]:
from nltk.corpus import treebank
test_sents = treebank.tagged_sents()[3000:]
tagger.evaluate(test_sents)

0.14331966328512843

### Tagging sentences

In [8]:
tagger.tag_sents([['Hello', 'world', '.'], ['How', 'are', 'you', '?']])

[[('Hello', 'NN'), ('world', 'NN'), ('.', 'NN')],
 [('How', 'NN'), ('are', 'NN'), ('you', 'NN'), ('?', 'NN')]]

### Untagging a tagged sentence

In [1]:
from nltk.tag import untag

untag([('Hello', 'NN'), ('World', 'NN')])

['Hello', 'World']

# 2. Training a unigram part-of-speech tagger

A **unigram** generally refers to a single token. Therefore, a unigram tagger only uses a single word as its context for determining the part-of-speech tag.

# 2. Train a  Unigram tagger with treebank corpus

In [14]:
from nltk.tag import UnigramTagger
from nltk.corpus import treebank

train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:]

tagger = UnigramTagger(train_sents)

print(treebank.sents()[3000], "\n")
print(tagger.tag(treebank.sents()[3000]), "\n")

print("Evaluate:" ,tagger.evaluate(test_sents))

['At', 'Tokyo', ',', 'the', 'Nikkei', 'index', 'of', '225', 'selected', 'issues', ',', 'which', '*T*-1', 'gained', '132', 'points', 'Tuesday', ',', 'added', '14.99', 'points', 'to', '35564.43', '.'] 

[('At', 'IN'), ('Tokyo', 'NNP'), (',', ','), ('the', 'DT'), ('Nikkei', None), ('index', 'NN'), ('of', 'IN'), ('225', 'CD'), ('selected', None), ('issues', 'NNS'), (',', ','), ('which', 'WDT'), ('*T*-1', '-NONE-'), ('gained', 'VBD'), ('132', None), ('points', 'NNS'), ('Tuesday', 'NNP'), (',', ','), ('added', 'VBD'), ('14.99', None), ('points', 'NNS'), ('to', 'TO'), ('35564.43', None), ('.', '.')] 

Evaluate: 0.8575868767537232


### Evaluate test_sents

In [6]:
from nltk.corpus import treebank
test_sents = treebank.tagged_sents()[3000:]

tagger.evaluate(test_sents)

0.8585365853658536

### Overriding the context model

In [9]:
tagger = UnigramTagger(model={'Pierre': 'NN'})
tagger.tag(treebank.sents()[0])

[('Pierre', 'NN'),
 ('Vinken', None),
 (',', None),
 ('61', None),
 ('years', None),
 ('old', None),
 (',', None),
 ('will', None),
 ('join', None),
 ('the', None),
 ('board', None),
 ('as', None),
 ('a', None),
 ('nonexecutive', None),
 ('director', None),
 ('Nov.', None),
 ('29', None),
 ('.', None)]

### Minimum frequency cutoff

The ContextTagger class uses frequency of occurrence to decide which tag is most likely for a given context. By default, it will do this even if the context word and tag occurs only once. If you'd like to set a minimum frequency threshold, then you can pass a cutoff value to the UnigramTagger class.

In [10]:
tagger = UnigramTagger(train_sents, cutoff=3)
tagger.evaluate(test_sents)

0.7756529246708397

# 3. Combining tagger with backoff tagging

### Backoff tagging

In [28]:
from nltk.tag import DefaultTagger
tagger1 = DefaultTagger('NN')

from nltk.tag import UnigramTagger
from nltk.corpus import treebank
train_sents = treebank.tagged_sents()[:3000]
tagger2 = UnigramTagger(train_sents, backoff=tagger1)

In [29]:
test_sents = treebank.tagged_sents()[3000:]
tagger2.evaluate(test_sents)

0.8742499460392834

In [14]:
tagger2._taggers

[<UnigramTagger: size=8818>, <DefaultTagger: tag=NN>]

### Saving and loading a trained tagger with pickle

In [30]:
# save the tagger
import pickle

with open('tagger.pickle', 'wb') as f:
    pickle.dump(tagger, f)

In [31]:
# load the tagger
import pickle

with open('tagger.pickle', 'rb') as f:
    tagger = pickle.load(f)

# 4. Training and combining ngram taggers

In [7]:
# datas
from nltk.corpus import treebank

train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:]

# 3. BigramTagger

In [13]:
from nltk.tag import BigramTagger
bitagger = BigramTagger(train_sents)
print(bitagger.tag(treebank.sents()[3000]), "\n")
bitagger.evaluate(test_sents)

[('At', 'IN'), ('Tokyo', 'NNP'), (',', ','), ('the', 'DT'), ('Nikkei', None), ('index', None), ('of', None), ('225', None), ('selected', None), ('issues', None), (',', None), ('which', None), ('*T*-1', None), ('gained', None), ('132', None), ('points', None), ('Tuesday', None), (',', None), ('added', None), ('14.99', None), ('points', None), ('to', None), ('35564.43', None), ('.', None)] 



0.11305849341679257

# 4. TrigramTagger

In [12]:
from nltk.tag import TrigramTagger

tritagger = TrigramTagger(train_sents)
print(tritagger.tag(treebank.sents()[3000]), "\n")
tritagger.evaluate(test_sents)

[('At', 'IN'), ('Tokyo', 'NNP'), (',', ','), ('the', 'DT'), ('Nikkei', None), ('index', None), ('of', None), ('225', None), ('selected', None), ('issues', None), (',', None), ('which', None), ('*T*-1', None), ('gained', None), ('132', None), ('points', None), ('Tuesday', None), (',', None), ('added', None), ('14.99', None), ('points', None), ('to', None), ('35564.43', None), ('.', None)] 



0.06850852579322253

# `tag_util.py`

In [None]:
def backoff_tagger(train_sents, tagger_classes, backoff=None):
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    
    return backoff

In [25]:
from tag_util import backoff_tagger
from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

backoff = DefaultTagger('NN')
tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger,
                                     TrigramTagger], backoff=backoff)
tagger.evaluate(test_sents)

ImportError: No module named 'tag_util'

### Quadgram tagger

In [23]:
from nltk.tag import NgramTagger

quadtagger = NgramTagger(4, train_sents)
quadtagger.evaluate(test_sents)

0.05836391107273905

# `taggers.py`

In [24]:
from nltk.tag import NgramTagger

class QuadgramTagger(NgramTagger):
    def __init__(self, *args, **kwargs):
        NgramTagger.__init__(self, 4, *args, **kwargs)

In [26]:
from taggers import QuadgramTagger

quadtagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger,
                                          TrigramTagger, QuadgramTagger],
                           backoff = backoff)
quadtagger.evaluate(test_sents)

0.8809842434707533

# 5. Creating a model of likely word tags

# `tag_util.py`

we can construct a model of the 200 most frequent words as keys, with the most frequent tag for each word as a value. 

In [3]:
from nltk.probability import FreqDist, ConditionalFreqDist

def word_tag_model(words, tagged_words, limit=200):
    fd = FreqDist(words)
    cfd = ConditionalFreqDist(tagged_words)
    
    most_freq = (word for word, count in fd.most_common(limit))
    
    return dict((word, cfd[word].max()) for word in most_freq)

In [1]:
from tag_util import word_tag_model
from nltk.corpus import treebank
from nltk.tag import UnigramTagger

model = word_tag_model(treebank.words(), treebank.tagged_words())
tagger = UnigramTagger(model=model)

test_sents = treebank.tagged_sents()[3000:]
tagger.evaluate(test_sents)

0.5594215411180661

In [5]:
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
from tag_util import backoff_tagger

default_tagger = DefaultTagger('NN')
likely_tagger = UnigramTagger(model=model, backoff=default_tagger)

train_sents = treebank.tagged_sents()[:3000]
tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger],
                       backoff=likely_tagger)
tagger.evaluate(test_sents)

0.8790848262464925

In [6]:
tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger,
TrigramTagger], backoff=default_tagger)

likely_tagger = UnigramTagger(model=model, backoff=tagger)
likely_tagger.evaluate(test_sents)

0.8810274120440319

# 6. Tagging with regular expressions

# `tag_util.py`

In [7]:
patterns = [
     (r'^\d+$', 'CD'), # cardinal numbers i.e 1 2 3
     (r'.*ing$', 'VBG'), # gerunds, i.e. wondering
     (r'.*ment$', 'NN'), # i.e. wonderment
     (r'.*ful$', 'JJ') # i.e. wonderful
]

In [3]:
from tag_util import patterns
from nltk.tag import RegexpTagger
from nltk.corpus import treebank

tagger = RegexpTagger(patterns)

test_sents = treebank.tagged_sents()[3000:]
tagger.evaluate(test_sents)

0.037470321605870924

The default arguments for an **AffixTagger** class specify three-character suffixes, and that words must be at least five characters long. If a word is less than five characters, then None is returned as the tag.

In [1]:
from nltk.corpus import treebank
train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:]

# 5. Affix tagging

In [21]:
from nltk.tag import AffixTagger

tagger = AffixTagger(train_sents)
print(tagger.tag(treebank.sents()[3000]), "\n")
tagger.evaluate(test_sents)

[('At', None), ('Tokyo', 'NNP'), (',', None), ('the', None), ('Nikkei', None), ('index', 'NN'), ('of', None), ('225', None), ('selected', 'VBN'), ('issues', 'NNS'), (',', None), ('which', 'WDT'), ('*T*-1', '-NONE-'), ('gained', 'VBN'), ('132', None), ('points', 'NNS'), ('Tuesday', 'NN'), (',', None), ('added', 'VBD'), ('14.99', None), ('points', 'NNS'), ('to', None), ('35564.43', None), ('.', None)] 



0.27502698035829914

In [6]:
prefix_tagger = AffixTagger(train_sents, affix_length=3)
prefix_tagger.evaluate(test_sents)

0.23621843298078998

In [7]:
suffix_tagger = AffixTagger(train_sents, affix_length=2)
suffix_tagger.evaluate(test_sents)

0.3004101014461472

# 9. Training a TnT tagger

In [3]:
from nltk.tag import tnt
tnt_tagger = tnt.TnT()
tnt_tagger.train(train_sents)
tnt_tagger.evaluate(test_sents)

0.8756313403842003

In [4]:
from nltk.tag import DefaultTagger
unk = DefaultTagger('NN')
tnt_tagger = tnt.TnT(unk=unk, Trained=True)
tnt_tagger.train(train_sents)
tnt_tagger.evaluate(test_sents)

0.8925102525361537

In [5]:
tnt_tagger = tnt.TnT(N=100)
tnt_tagger.train(train_sents)
tnt_tagger.evaluate(test_sents)

0.8756313403842003

# 10. Using WordNet for tagging

In [2]:
from nltk.corpus import treebank
train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:]

| WordNet tag        | Treebank tag|
|-------------:| -----:|
| n      | NN | 
| a      | JJ      |   
| s | JJ      |
| r|RB |
| v|VB |

# `taggers.py`

Now we can create a class that will look up words in **WordNet**, and then choose the most common tag from the **Synsets** it finds. The **WordNetTagger** class defined in the following code can be found in **taggers.py**:

In [3]:
from nltk.tag import SequentialBackoffTagger
from nltk.corpus import wordnet
from nltk.probability import FreqDist

# Now we can create a class that will look up words in WordNet,
# and then choose the most common tag from the Synsets it finds.
class WordNetTagger(SequentialBackoffTagger):
    '''
    >>> wt = WordNetTagger()
    >>> wt.tag(['food', 'is', 'great'])
    [('food', 'NN'), ('is', 'VB'), ('great', 'JJ')]
    '''
    def __init__(self, *args, **kwargs):
        SequentialBackoffTagger.__init__(self, *args, **kwargs)

        self.wordnet_tag_map = {
            'n': 'NN',
            's': 'JJ',
            'a': 'JJ',
            'r': 'RB',
            'v': 'VB'
        }

    def choose_tag(self, tokens, index, history):
        word = tokens[index]
        fd = FreqDist()

        for synset in wordnet.synsets(word):
            fd[synset.pos()] += 1

        if not fd:
            return None
        return self.wordnet_tag_map.get(fd.max())

In [4]:
from taggers import WordNetTagger
wn_tagger = WordNetTagger()
wn_tagger.evaluate(train_sents)

0.1858445898001574

In [5]:
from tag_util import backoff_tagger
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger

tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger,
TrigramTagger], backoff=wn_tagger)

tagger.evaluate(test_sents)

0.8846967407727174

# 11. Tagging proper names

In [6]:
from nltk.tag import SequentialBackoffTagger
from nltk.corpus import names

class NamesTagger(SequentialBackoffTagger):
    def __init__(self, *args, **kwargs):
        SequentialBackoffTagger.__init__(self, *args, **kwargs)
        self.name_set = set([n.lower() for n in names.words()])

        def choose_tag(self, tokens, index, history):
            word = tokens[index]
            if word.lower() in self.name_set:
                return 'NNP'
            else:
                return None

In [8]:
len(names.words())

7944

In [1]:
from taggers import NamesTagger

nt = NamesTagger()
nt.tag(['Jacob'])

[('Jacob', 'NNP')]

# 12. Classifier-based tagging

In [3]:
from nltk.tag.sequential import ClassifierBasedPOSTagger

tagger = ClassifierBasedPOSTagger(train=train_sents)
tagger.evaluate(test_sents)

0.9309734513274336

# 13. NLTK-Trainer

### Train entire treebank corpus

The default training algorithm is **aubt**, which is shorthand for a sequential backoff tagger composed of 

`AffixTagger + UnigramTagger + BigramTagger + TrigramTagger`.

```sh
$ python train_tagger.py treebank
loading treebank
3914 tagged sents, training on 3914
training AffixTagger with affix -3 and backoff <DefaultTagger: tag=-None->
training <class 'nltk.tag.sequential.UnigramTagger'> tagger with backoff <AffixTagger: size=2536>
training <class 'nltk.tag.sequential.BigramTagger'> tagger with backoff <UnigramTagger: size=4940>
training <class 'nltk.tag.sequential.TrigramTagger'> tagger with backoff <BigramTagger: size=2328>
evaluating TrigramTagger
accuracy: 0.992362
dumping TrigramTagger to /Users/Bya/nltk_data/taggers/treebank_aubt.pickle
```

### The train_tagger.py script roughly performers the following steps:
1. Construct training and testing sentences from corpus arguments. 
2. Build tagger training function from tagger arguments.
3. Train a tagger on the training sentences using the training function. 
4. Evaluate and/or save the tagger.

```sh
python train_tagger.py treebank --fraction 0.75 --no-pickle
loading treebank
3914 tagged sents, training on 2936
training AffixTagger with affix -3 and backoff <DefaultTagger: tag=-None->
training <class 'nltk.tag.sequential.UnigramTagger'> tagger with backoff <AffixTagger: size=2287>
training <class 'nltk.tag.sequential.BigramTagger'> tagger with backoff <UnigramTagger: size=4176>
training <class 'nltk.tag.sequential.TrigramTagger'> tagger with backoff <BigramTagger: size=1836>
evaluating TrigramTagger
accuracy: 0.906082
```

### Default Tagger

Using **--default NN** lets us assign a default tag of **NN**, while **--sequential ''** disables the default **aubt** sequential backoff algorithm. The **--fraction** argument is omitted in this case because there's not actually any training happening.

```sh
$ python train_tagger.py treebank --no-pickle --default NN --sequential ''
loading treebank
3914 tagged sents, training on 3914
evaluating DefaultTagger
accuracy: 0.130776
```

### Unigram Tagger

Specifying **--sequential u** tells **train_tagger.py** to train with a **unigram tagger**. 

```sh
$ python train_tagger.py treebank --no-pickle --fraction 0.75 --sequential u
loading treebank
3914 tagged sents, training on 2936
training <class 'nltk.tag.sequential.UnigramTagger'> tagger with backoff <DefaultTagger: tag=-None->
evaluating UnigramTagger
accuracy: 0.856327
```

As we did earlier, we can boost the accuracy a bit by using a default tagger:

```sh
$ python train_tagger.py treebank --no-pickle --default NN --fraction 0.75 --sequential u
loading treebank
3914 tagged sents, training on 2936
training <class 'nltk.tag.sequential.UnigramTagger'> tagger with backoff <DefaultTagger: tag=NN>
evaluating UnigramTagger
accuracy: 0.874387
```

### Adding a Bigram Tagger and Trigram Tagger:

```sh
$ python train_tagger.py treebank --no-pickle --default NN --fraction 0.75 --sequential ubt
loading treebank
3914 tagged sents, training on 2936
training <class 'nltk.tag.sequential.UnigramTagger'> tagger with backoff <DefaultTagger: tag=NN>
training <class 'nltk.tag.sequential.BigramTagger'> tagger with backoff <UnigramTagger: size=8709>
training <class 'nltk.tag.sequential.TrigramTagger'> tagger with backoff <BigramTagger: size=1836>
evaluating TrigramTagger
accuracy: 0.879213
```

### Affix
The default training algorithm is **--sequential aubt**, and the default affix is **-3**. But you can modify this with one or more **-a** arguments. So, if we want to use an affix of **-2** as well as an affix of **-3**, you can do the following:

```sh
$ python train_tagger.py treebank --no-pickle --default NN --fraction 0.75 -a -3 -a -2
loading treebank
3914 tagged sents, training on 2936
training AffixTagger with affix -3 and backoff <DefaultTagger: tag=NN>
training AffixTagger with affix -2 and backoff <AffixTagger: size=2143>
training <class 'nltk.tag.sequential.UnigramTagger'> tagger with backoff <AffixTagger: size=248>
training <class 'nltk.tag.sequential.BigramTagger'> tagger with backoff <UnigramTagger: size=5207>
training <class 'nltk.tag.sequential.TrigramTagger'> tagger with backoff <BigramTagger: size=1836>
evaluating TrigramTagger
accuracy: 0.907328
```

The order of multiple **-a** arguments matters, and if you switch the order, the results and accuracy will change, because the backoff order changes:

```sh
$ python train_tagger.py treebank --no-pickle --default NN --fraction 0.75 -a -2 -a -3
loading treebank
3914 tagged sents, training on 2936
training AffixTagger with affix -2 and backoff <DefaultTagger: tag=NN>
training AffixTagger with affix -3 and backoff <AffixTagger: size=606>
training <class 'nltk.tag.sequential.UnigramTagger'> tagger with backoff <AffixTagger: size=1318>
training <class 'nltk.tag.sequential.BigramTagger'> tagger with backoff <UnigramTagger: size=4176>
training <class 'nltk.tag.sequential.TrigramTagger'> tagger with backoff <BigramTagger: size=1836>
evaluating TrigramTagger
accuracy: 0.914166
```

### Classifier-Based Tagger

Finally, you can train a **classifier-based** tagger with the **--classifier argument**, which specifies the name of a classifier. Be sure to also pass in **--sequential ''** because, as we learned previously, training a sequential backoff tagger in addition to a classifier-based tagger is useless. The **--default** argument is also useless, because the classifier will always guess something.

```sh
$ python train_tagger.py treebank --no-pickle --fraction 0.75 --sequential '' --classifier NaiveBayes
loading treebank
3914 tagged sents, training on 2936
training ['NaiveBayes'] ClassifierBasedPOSTagger
Constructing training corpus for classifier.
Training classifier (75814 instances)
training NaiveBayes classifier
evaluating ClassifierBasedPOSTagger
accuracy: 0.928686
```

While **classifier-based** taggers tend to be **more accurate**, they are also slower to train, and much slower at tagging. If speed is important to you, I recommend sticking with sequential taggers.

### Saving a pickled tagger

Without the **--no-pickle** argument, train_tagger.py will save a pickled tagger at **~/nltk_data/taggers/NAME.pickle**, where **NAME** is a combination of the corpus name and training algorithm. You can specify a custom filename for your tagger using the **--filename** argument like this:

```sh
$ python train_tagger.py treebank --filename path/to/tagger.pickle
```

### Training on a custom corpus

If you have a **custom corpus** that you want to use for training a tagger, you can do that by passing in the path to the corpus and the classname of a corpus reader in the **--reader** argument. The corpus path can either be absolute or relative to a **nltk_data** directory. The corpus reader class must provide a **tagged_sents()** method. Here's an example using a relative path to the treebank tagged corpus:

```sh
$ python train_tagger.py corpora/treebank/tagged --reader nltk.corpus.reader.ChunkedCorpusReader --no-pickle --fraction 0.75
loading corpora/treebank/tagged
51002 tagged sents, training on 38252
training AffixTagger with affix -3 and backoff <DefaultTagger: tag=-None->
training <class 'nltk.tag.sequential.UnigramTagger'> tagger with backoff <AffixTagger: size=2092>
training <class 'nltk.tag.sequential.BigramTagger'> tagger with backoff <UnigramTagger: size=4125>
training <class 'nltk.tag.sequential.TrigramTagger'> tagger with backoff <BigramTagger: size=1626>
evaluating TrigramTagger
accuracy: 0.882123
```

### Training with universal tags

Because the universal tagset has fewer tags, these taggers tend to be more accurate;

```sh
$ python train_tagger.py treebank --no-pickle --fraction 0.75 --tagset universal
loading treebank
using universal tagset
3914 tagged sents, training on 2936
training AffixTagger with affix -3 and backoff <DefaultTagger: tag=-None->
training <class 'nltk.tag.sequential.UnigramTagger'> tagger with backoff <AffixTagger: size=2287>
training <class 'nltk.tag.sequential.BigramTagger'> tagger with backoff <UnigramTagger: size=2884>
training <class 'nltk.tag.sequential.TrigramTagger'> tagger with backoff <BigramTagger: size=1023>
evaluating TrigramTagger
accuracy: 0.934116
```

### Analyzing a tagger against a tagged corpus

In [24]:
import nltk 
text=nltk.word_tokenize("We are going out. Just you and me.")
nltk.pos_tag(text)

[('We', 'PRP'),
 ('are', 'VBP'),
 ('going', 'VBG'),
 ('out', 'RP'),
 ('.', '.'),
 ('Just', 'NNP'),
 ('you', 'PRP'),
 ('and', 'CC'),
 ('me', 'PRP'),
 ('.', '.')]

In [41]:
# load the tagger
import pickle

with open('/Users/Bya/nltk_data/taggers/treebank_NaiveBayes.pickle', 'rb') as f:
    tagger_nb = pickle.load(f)

with open('/Users/Bya/nltk_data/taggers/treebank_aubt.pickle', 'rb') as f:
    tagger_aubt = pickle.load(f)

In [47]:
sample1 = ['Come', 'on', 'gunners', '!', '#NUFCvAFC', '#Arsenal'] 
sample2 = ['come', 'gunner', '!', 'nufcvafc', 'arsenal'] 
print(tagger_aubt.tag(sample1))
print(tagger_nb.tag(sample1))
print("\n")
print(tagger_aubt.tag(sample2))
print(tagger_nb.tag(sample2))

[('Come', 'DT'), ('on', 'IN'), ('gunners', 'NNS'), ('!', '.'), ('#NUFCvAFC', 'NN'), ('#Arsenal', 'JJ')]
[('Come', 'DT'), ('on', 'IN'), ('gunners', 'NNS'), ('!', '.'), ('#NUFCvAFC', '-RRB-'), ('#Arsenal', 'JJ')]


[('come', 'VB'), ('gunner', 'NN'), ('!', '.'), ('nufcvafc', 'NN'), ('arsenal', 'JJ')]
[('come', 'DT'), ('gunner', 'JJR'), ('!', '.'), ('nufcvafc', 'PDT'), ('arsenal', 'JJ')]
