In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from nltk import bigrams
import nltk

import ssl
ssl._create_default_https_context = ssl._create_unverified_context

nltk.download('averaged_perceptron_tagger', quiet=True)
_ = nltk.download('punkt', quiet=True)

In [2]:
train = pd.read_csv('data/WSJ_02-21.pos', sep='\t', header=0, names=['word', 'pos'])
train.head()

Unnamed: 0,word,pos
0,an,DT
1,Oct.,NNP
2,19,CD
3,review,NN
4,of,IN


In [3]:
test = pd.read_csv('data/WSJ_24.pos', sep='\t', header=0, names=['word', 'pos'])
test.head()

Unnamed: 0,word,pos
0,economy,NN
1,'s,POS
2,temperature,NN
3,will,MD
4,be,VB


In [4]:
len(test), len(train)

(32852, 950027)

## First approach is to find the most common tag for a word and just use that

In [5]:
# for each word get the most popular tuple
winners = {}
for row in Counter(list(train.itertuples(index=False, name=None))):
    if row[0] not in winners:
        winners[row[0]] = row[1]
    elif winners[row[0]]< row[1]:
        winners[row[0]] = row[1]
        
print(f'There are {len(winners)} tuples in the dict')

for key in list(winners.keys())[:5]:
    print(key, winners[key])

There are 44389 tuples in the dict
an DT
Oct. NNP
19 CD
review VBP
of RP


In [6]:
import json
with open('data/dicts/most_common_tag_for_word.json', 'wt') as f:
    f.write(json.dumps(winners))

In [7]:
winners['an']

'DT'

### Make our predictions on the test set and calculate our score

In [46]:
predictions = []
unknown = '<UNK>'
for row in list(test.itertuples(index=False, name=None)):
    if row[0] in winners:
        predictions.append(winners[row[0]])
    else:
        predictions.append(unknown)
print(predictions[:20])

results = pd.DataFrame({'ground_truth': test['pos'].to_list(), 'prediction': predictions})
np.mean(results['ground_truth']==results['prediction'])

['NN', 'VBZ', 'NN', 'VB', 'VBP', 'VBN', 'IN', 'JJ', 'NN', 'VBZ', 'RB', 'NN', ',', 'RP', 'NNS', 'RP', 'VBP', ',', 'NN', ',']


0.5858090831608426

## Next try bigrams

In [47]:
# Iterate the bigrams to find most common pairing of first tuple in key and first element of second tuple
bigram_counter = Counter(list(bigrams(train.itertuples(index=False, name=None))))
winners = {}
for row in bigram_counter:
    key = row[0], row[1][0]
    if key not in winners:
        winners[key] = bigram_counter[row]
    elif winners[key]> bigram_counter[row]:
        winners[key] = bigram_counter[row]

In [48]:
predictions = []
unknown = '<UNK>'
for row in list(bigrams(test.itertuples(index=False, name=None))):
    key = row[0], row[1][0]
    if key in winners:
        predictions.append(row[1][1])
    else:
        predictions.append(unknown)
print(predictions[:20])

['POS', '<UNK>', '<UNK>', 'VB', 'VBN', 'IN', 'JJ', '<UNK>', '<UNK>', 'DT', 'NN', ',', 'IN', '<UNK>', 'IN', 'NN', ',', 'NN', ',', 'NN']


In [49]:
# Note that because we are using bigrams there is one less prediction than elements in the test set.
# Indexing from [1:] in the DataFrame takes care of that
len(predictions), len(test['pos'])

(32851, 32852)

In [50]:
results = pd.DataFrame({'ground_truth': test['pos'].to_list()[1:], 'prediction': predictions})
np.mean(results['ground_truth']==results['prediction'])

0.7000700130894036

## Next try the NLTK library's built in tagger.

In [51]:
# The above bigram method uses maximum liklihood of tag given word and previous tag,word pair. 
# Let's try the model just using predict tag given previous tag
nltk_tags = nltk.pos_tag(test['word'].to_list())

In [52]:
results = pd.DataFrame({'ground_truth': test['pos'].to_list(), 'prediction': [tag[1] for tag in nltk_tags]})
np.mean(results['ground_truth']==results['prediction'])

0.961767928893218