In [2]:
from pathlib import Path

# Start from the working directory and climb to the project root, i.e. the
# nearest directory at or above it that is named 'cs329'.
path = Path.cwd()
print(path)

while path.name != 'cs329':
    # The filesystem root is its own parent; without this check the loop
    # never terminates when the notebook is run outside the project tree.
    if path.parent == path:
        raise FileNotFoundError("no directory named 'cs329' found at or above {}".format(Path.cwd()))
    path = path.parent

print(path, type(path))

/Users/carol/PycharmProjects/cs329/src
/Users/carol/PycharmProjects/cs329 <class 'pathlib.PosixPath'>


In [3]:
# Point `path` at the POS data directory and create it if needed. The check
# keeps the cell idempotent: re-running it does not append 'dat/pos' a second
# time (which would also create a nested dat/pos/dat/pos directory).
if path.parts[-2:] != ('dat', 'pos'):
    path /= 'dat/pos'
path.mkdir(parents=True, exist_ok=True)
print(path)

/Users/carol/PycharmProjects/cs329/dat/pos


In [4]:
import requests

def download(remote_addr: str, local_addr: str, timeout: float = 30.0):
    """
    Fetches a URL with an HTTP GET and saves the response body to a local file.

    :param remote_addr: the URL to download.
    :param local_addr: the path of the file to write; it is opened in 'wb' mode, so existing content is replaced.
    :param timeout: seconds to wait for the server before giving up.
    :raises requests.HTTPError: if the server answers with an error status code.
    """
    r = requests.get(remote_addr, timeout=timeout)
    # Fail loudly instead of silently saving an error page as if it were data.
    r.raise_for_status()

    with open(local_addr, 'wb') as fout:
        fout.write(r.content)

In [5]:
def read_data(filename: str):
    """
    Reads a tagged corpus in which every non-blank line holds whitespace-separated columns
    (the word first, its POS tag second) and blank lines separate sentences.

    :param filename: the path of the file to read.
    :return: a list of sentences where each sentence is a list of (word, pos) tuples.
    """
    data, sentence = [], []

    # The context manager closes the file even if a malformed line raises.
    with open(filename) as fin:
        for line in fin:
            cols = line.split()
            if cols:
                sentence.append((cols[0], cols[1]))
            elif sentence:
                # A blank line ends the current sentence; repeated blank lines add nothing.
                data.append(sentence)
                sentence = []

    # Keep the last sentence when the file does not end with a blank line.
    if sentence:
        data.append(sentence)

    return data

In [6]:
# Load the training split from the data directory.
trn_data = read_data(path / 'wsj-pos.trn.gold.tsv')
print(len(trn_data))  # number of sentences
print(trn_data[0])    # first sentence, as a sanity check of the (word, pos) format

38219
[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]


In [7]:
from typing import List, Tuple

def word_count(data: List[List[Tuple[str, str]]]) -> int:
    """
    Counts the tokens in a tagged corpus.

    :param data: a list of sentences where each sentence is a list of (word, pos) pairs.
    :return: the number of (word, pos) pairs summed over all sentences.
    """
    total = 0
    for sentence in data:
        total += len(sentence)
    return total

In [8]:
# Total number of tokens in the training data.
print(word_count(trn_data))

912344


predict: unigram model

In [9]:
from collections import Counter
from typing import Dict

def create_uni_pos_dict(data: List[List[Tuple[str, str]]]) -> Dict[str, List[Tuple[str, float]]]:
    """
    Estimates P(pos | word) by relative frequency.

    :param data: a list of sentences where each sentence is a list of (word, pos) pairs.
    :return: a dictionary mapping each word to its (pos, probability) pairs, most probable first.
    """
    counts = dict()

    # Pass 1: count how often each tag occurs with each word.
    for sentence in data:
        for word, pos in sentence:
            if word not in counts:
                counts[word] = Counter()
            counts[word][pos] += 1

    # Pass 2: turn the counts of every word into probabilities.
    probs = dict()
    for word, tags in counts.items():
        ranked = tags.most_common()
        denom = sum(n for _, n in ranked)
        probs[word] = [(pos, n / denom) for pos, n in ranked]

    return probs

In [12]:
# P(p_i | w_{i-1}, w_i) = P(w_{i-1}, w_i, p_i) / P(w_{i-1}, w_i)
from collections import Counter
from typing import Dict

def create_first_pos_dict(data: List[List[Tuple[str, str]]]) -> Dict[Tuple[str,str], List[Tuple[str, float]]]:
    """
    Estimates P(pos | previous word, current word) by relative frequency.

    :param data: a list of sentences where each sentence is a list of (word, pos) pairs.
    :return: a dictionary mapping (previous word, current word) to the (pos, probability) pairs seen in that context,
             most probable first. The first word of a sentence is paired with the placeholder '!@#$'.
    """
    PREV_DUMMY = '!@#$'
    counts = dict()

    for sentence in data:
        prev_word = PREV_DUMMY
        for word, pos in sentence:
            context = (prev_word, word)
            if context not in counts:
                counts[context] = Counter()
            counts[context][pos] += 1
            prev_word = word

    probs = dict()
    for context, tags in counts.items():
        ranked = tags.most_common()
        denom = sum(n for _, n in ranked)
        probs[context] = [(pos, n / denom) for pos, n in ranked]

    return probs

In [18]:
# P(p_i | w_i, w_{i+1}) = P(w_i, w_{i+1}, p_i) / P(w_i, w_{i+1})
from collections import Counter
from typing import Dict

def create_second_pos_dict(data: List[List[Tuple[str, str]]]) -> Dict[Tuple[str,str], List[Tuple[str, float]]]:
    """
    Estimates P(pos | current word, next word) by relative frequency.

    :param data: a list of sentences where each sentence is a list of (word, pos) pairs.
    :return: a dictionary mapping (current word, next word) to the (pos, probability) pairs seen in that context,
             most probable first. The last word of a sentence is paired with the placeholder '!@#$'.
    """
    NEXT_DUMMY = '!@#$'
    counts = dict()

    for sentence in data:
        # The words shifted left by one position, padded with the placeholder at the end.
        followers = [w for w, _ in sentence[1:]] + [NEXT_DUMMY]
        for (word, pos), next_word in zip(sentence, followers):
            context = (word, next_word)
            if context not in counts:
                counts[context] = Counter()
            counts[context][pos] += 1

    probs = dict()
    for context, tags in counts.items():
        ranked = tags.most_common()
        denom = sum(n for _, n in ranked)
        probs[context] = [(pos, n / denom) for pos, n in ranked]

    return probs

In [20]:
# P(p_i | p_{i-1}, w_i) = P(p_{i-1}, w_i, p_i) / P(p_{i-1}, w_i)
from collections import Counter
from typing import Dict

def create_third_pos_dict(data: List[List[Tuple[str, str]]]) -> Dict[Tuple[str,str], List[Tuple[str, float]]]:
    """
    Estimates P(pos | previous pos, current word) by relative frequency.

    :param data: a list of sentences where each sentence is a list of (word, pos) pairs.
    :return: a dictionary mapping (previous pos, current word) to the (pos, probability) pairs seen in that context,
             most probable first. The first word of a sentence uses the placeholder '!@#$' as its previous pos.
    """
    PREV_DUMMY = '!@#$'
    counts = dict()

    for sentence in data:
        prev_pos = PREV_DUMMY
        for word, pos in sentence:
            context = (prev_pos, word)
            if context not in counts:
                counts[context] = Counter()
            counts[context][pos] += 1
            prev_pos = pos

    probs = dict()
    for context, tags in counts.items():
        ranked = tags.most_common()
        denom = sum(n for _, n in ranked)
        probs[context] = [(pos, n / denom) for pos, n in ranked]

    return probs

In [22]:
# P(p_i | p_{i-1}, p_{i+1}) = P(p_{i-1}, p_{i+1}, p_i) / P(p_{i-1}, p_{i+1})
from collections import Counter
from typing import Dict

def create_fourth_pos_dict(data: List[List[Tuple[str, str]]]) -> Dict[Tuple[str,str], List[Tuple[str, float]]]:
    """
    Estimates P(pos | previous pos, next pos) by relative frequency.

    :param data: a list of sentences where each sentence is a list of (word, pos) pairs.
    :return: a dictionary mapping (previous pos, next pos) to the (pos, probability) pairs seen in that context,
             most probable first. The placeholder '!@#$' stands in for the missing neighbor at either sentence boundary.
    """
    PREV_DUMMY = '!@#$'
    NEXT_DUMMY = '!@#$'
    counts = dict()

    for sentence in data:
        # Pad the tag sequence on both sides so every tag has a left and a right neighbor.
        padded = [PREV_DUMMY] + [pos for _, pos in sentence] + [NEXT_DUMMY]
        for k in range(1, len(padded) - 1):
            context = (padded[k - 1], padded[k + 1])
            if context not in counts:
                counts[context] = Counter()
            counts[context][padded[k]] += 1

    probs = dict()
    for context, tags in counts.items():
        ranked = tags.most_common()
        denom = sum(n for _, n in ranked)
        probs[context] = [(pos, n / denom) for pos, n in ranked]

    return probs

In [24]:
# Smoke test: run create_fourth_pos_dict on two tiny hand-tagged sentences.
test = [
    [('Pierre', 'CD'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('nonexecutive', 'NNP'), ('director', 'CD')],
    [('Pierre', 'CD'), ('nonexecutive', 'NNP'), ('director', 'NN')],
]
result = create_fourth_pos_dict(test)
print(result)


{('!@#$', 'JJ'): [('CD', 1.0)], ('CD', 'NN'): [('JJ', 0.5), ('NNP', 0.5)], ('JJ', 'NNP'): [('NN', 1.0)], ('NN', 'CD'): [('NNP', 1.0)], ('NNP', '!@#$'): [('CD', 0.5), ('NN', 0.5)], ('!@#$', 'NNP'): [('CD', 1.0)]}


In [30]:
# Train the unigram model on the training data.
uni_pos_dict = create_uni_pos_dict(trn_data)

In [31]:
# Inspect the tag distributions learned for two words that occur with more than one tag.
print(uni_pos_dict['man'])
print(uni_pos_dict['buy'])

[('NN', 0.9714285714285714), ('VB', 0.01904761904761905), ('UH', 0.009523809523809525)]
[('VB', 0.8293216630196937), ('VBP', 0.08971553610503283), ('NN', 0.06564551422319474), ('JJ', 0.015317286652078774)]


In [63]:
def predict_uni_pos_dict(uni_pos_dict: Dict[str, List[Tuple[str, float]]], tokens: List[str], pprint=False) -> List[Tuple[str, float]]:
    """
    Tags each token with the first (most probable) entry the unigram model holds for it.

    :param uni_pos_dict: a dictionary mapping a word to its (pos, probability) pairs, most probable first.
    :param tokens: the words to tag.
    :param pprint: if True, prints one aligned row (token, pos, probability) per token.
    :return: one (pos, probability) pair per token; ('XX', 0.0) for tokens the model has no entry for.
    """
    output = []
    for token in tokens:
        candidates = uni_pos_dict.get(token, None)
        if candidates:
            output.append(candidates[0])
        else:
            output.append(('XX', 0.0))

    if pprint:
        for token, best in zip(tokens, output):
            print('{:<15}{:<8}{:.2f}'.format(token, best[0], best[1]))

    return output

In [64]:
# Tag a whitespace-tokenized example sentence, printing one row per token.
tokens = "I bought a car yesterday that was blue".split()
predict_uni_pos_dict(uni_pos_dict, tokens, True)

I              PRP     0.99
bought         VBD     0.65
a              DT      1.00
car            NN      1.00
yesterday      NN      0.98
that           IN      0.60
was            VBD     1.00
blue           JJ      0.86


[('PRP', 0.9915824915824916),
 ('VBD', 0.6474820143884892),
 ('DT', 0.9987005955603682),
 ('NN', 1.0),
 ('NN', 0.9813432835820896),
 ('IN', 0.6039103975139195),
 ('VBD', 1.0),
 ('JJ', 0.8571428571428571)]

In [65]:

def evaluate_uni_pos(uni_pos_dict: Dict[str, List[Tuple[str, float]]], data: List[List[Tuple[str, str]]]):
    """
    Prints the token-level accuracy of the unigram tagger against gold tags.

    :param uni_pos_dict: the unigram model passed to predict_uni_pos_dict.
    :param data: a list of sentences where each sentence is a list of (word, gold pos) pairs.
    """
    total, correct = 0, 0
    for sentence in data:
        if not sentence:
            # zip(*sentence) yields nothing to unpack for an empty sentence.
            continue
        tokens, gold = tuple(zip(*sentence))
        pred = [t[0] for t in predict_uni_pos_dict(uni_pos_dict, tokens)]
        total += len(tokens)
        correct += sum(1 for g, p in zip(gold, pred) if g == p)

    # Report 0% instead of dividing by zero when there is nothing to score.
    accuracy = 100.0 * correct / total if total else 0.0
    print('{:5.2f}% ({}/{})'.format(accuracy, correct, total))

In [66]:
# Load the development split and score the unigram tagger on it.
dev_data = read_data(path / 'wsj-pos.dev.gold.tsv')
evaluate_uni_pos(uni_pos_dict, dev_data)

90.88% (119754/131768)


Bigram model

In [67]:
from typing import Any
# Placeholder used as the context (neighboring word or tag) where a sentence boundary leaves none.
PREV_DUMMY = '!@#$'

def to_probs(model: Dict[Any, Counter]):
    """
    Converts, in place, the tag counts stored under every feature into probabilities.

    :param model: a dictionary mapping a feature to a Counter of POS tags; its values are replaced.
    :return: the same dictionary, where each value is now the list of (pos, probability) pairs, most probable first.
    """
    for feature in list(model):
        ranked = model[feature].most_common()
        denom = sum(n for _, n in ranked)
        model[feature] = [(pos, n / denom) for pos, n in ranked]
    return model

def create_bi_pos_dict(data: List[List[Tuple[str, str]]]) -> Dict[str, List[Tuple[str, float]]]:
    """
    Estimates P(current pos | previous pos) by relative frequency.

    :param data: a list of sentences where each sentence is a list of (word, pos) pairs.
    :return: a dictionary mapping the previous POS tag to the (pos, probability) pairs that follow it, most probable first.
             The first tag of a sentence is counted under PREV_DUMMY.
    """
    counts = dict()

    for sentence in data:
        prev_pos = PREV_DUMMY
        for _, curr_pos in sentence:
            if prev_pos not in counts:
                counts[prev_pos] = Counter()
            counts[prev_pos][curr_pos] += 1
            prev_pos = curr_pos

    return to_probs(counts)

In [68]:
# Train the previous-tag bigram model on the training data.
bi_pos_dict = create_bi_pos_dict(trn_data)

In [69]:
def predict_bi_pos_dict(uni_pos_dict: Dict[str, List[Tuple[str, float]]], bi_pos_dict: Dict[str, List[Tuple[str, float]]], tokens: List[str]) -> List[Tuple[str, float]]:
    """
    Tags tokens with the unigram model, backing off to the bigram model for words the unigram model does not contain.

    :param uni_pos_dict: a dictionary mapping a word to its (pos, probability) pairs, most probable first.
    :param bi_pos_dict: a dictionary mapping the previous POS tag to the (pos, probability) pairs that follow it.
    :param tokens: the words to tag.
    :return: one (pos, probability) pair per token; ('XX', 0.0) when neither model offers a candidate.
    """
    output = []

    for token in tokens:
        candidates = uni_pos_dict.get(token, None)
        if candidates is None:
            # Unseen word: condition on the tag just predicted (or the placeholder at sentence start).
            context = output[-1][0] if output else PREV_DUMMY
            candidates = bi_pos_dict.get(context, None)
        if candidates:
            output.append(candidates[0])
        else:
            output.append(('XX', 0.0))

    return output

In [70]:
def evaluate_bi_pos(uni_pos_dict: Dict[str, List[Tuple[str, float]]], bi_pos_dict: Dict[str, List[Tuple[str, float]]], data: List[List[Tuple[str, str]]]):
    """
    Prints the token-level accuracy of the unigram tagger with bigram back-off against gold tags.

    :param uni_pos_dict: the unigram model passed to predict_bi_pos_dict.
    :param bi_pos_dict: the bigram model passed to predict_bi_pos_dict.
    :param data: a list of sentences where each sentence is a list of (word, gold pos) pairs.
    """
    total, correct = 0, 0
    for sentence in data:
        if not sentence:
            # zip(*sentence) yields nothing to unpack for an empty sentence.
            continue
        tokens, gold = tuple(zip(*sentence))
        pred = [t[0] for t in predict_bi_pos_dict(uni_pos_dict, bi_pos_dict, tokens)]
        total += len(tokens)
        correct += sum(1 for g, p in zip(gold, pred) if g == p)

    # Report 0% instead of dividing by zero when there is nothing to score.
    accuracy = 100.0 * correct / total if total else 0.0
    print('{:5.2f}% ({}/{})'.format(accuracy, correct, total))

In [71]:
# Score the unigram tagger with bigram back-off on the development split.
evaluate_bi_pos(uni_pos_dict, bi_pos_dict, dev_data)

92.01% (121234/131768)


In [72]:
def create_bi_wp_dict(data: List[List[Tuple[str, str]]]) -> Dict[str, List[Tuple[str, float]]]:
    """
    Estimates P(current pos | previous word) by relative frequency.

    :param data: a list of sentences where each sentence is a list of (word, pos) pairs.
    :return: a dictionary mapping the previous word to the (pos, probability) pairs of the token after it, most probable first.
             The first token of a sentence is counted under PREV_DUMMY.
    """
    counts = dict()

    for sentence in data:
        prev_word = PREV_DUMMY
        for word, curr_pos in sentence:
            if prev_word not in counts:
                counts[prev_word] = Counter()
            counts[prev_word][curr_pos] += 1
            prev_word = word

    return to_probs(counts)


def create_bi_wn_dict(data: List[List[Tuple[str, str]]]) -> Dict[str, List[Tuple[str, float]]]:
    """
    Estimates P(current pos | next word) by relative frequency.

    :param data: a list of sentences where each sentence is a list of (word, pos) pairs.
    :return: a dictionary where the key is the next word and the value is the list of possible POS tags of the
             token before it, with probabilities in descending order. The last token of a sentence is counted
             under PREV_DUMMY, the placeholder shared with the previous-context models.
    """
    model = dict()

    for sentence in data:
        # The words shifted left by one position, padded with the placeholder at the end.
        next_words = [word for word, _ in sentence[1:]] + [PREV_DUMMY]
        for (_, curr_pos), next_word in zip(sentence, next_words):
            model.setdefault(next_word, Counter()).update([curr_pos])

    return to_probs(model)

In [73]:
# Train the previous-word and next-word models on the training data.
bi_wp_dict = create_bi_wp_dict(trn_data)
bi_wn_dict = create_bi_wn_dict(trn_data)

In [74]:
def predict_interporlation(
        uni_pos_dict: Dict[str, List[Tuple[str, float]]],
        bi_pos_dict: Dict[str, List[Tuple[str, float]]],
        bi_wp_dict: Dict[str, List[Tuple[str, float]]],
        bi_wn_dict: Dict[str, List[Tuple[str, float]]],
        uni_pos_weight: float,
        bi_pos_weight: float,
        bi_wp_weight: float,
        bi_wn_weight: float,
        tokens: List[str]) -> List[Tuple[str, float]]:
    """
    Tags tokens left to right by linearly interpolating four models: each candidate tag is scored by the
    weighted sum of the probabilities the models assign to it, and the highest-scoring tag is chosen.

    :param uni_pos_dict: a dictionary mapping the current word to (pos, probability) pairs.
    :param bi_pos_dict: a dictionary mapping the previously predicted POS tag to (pos, probability) pairs.
    :param bi_wp_dict: a dictionary mapping the previous word to (pos, probability) pairs.
    :param bi_wn_dict: a dictionary mapping the next word to (pos, probability) pairs.
    :param uni_pos_weight: the weight of uni_pos_dict.
    :param bi_pos_weight: the weight of bi_pos_dict.
    :param bi_wp_weight: the weight of bi_wp_dict.
    :param bi_wn_weight: the weight of bi_wn_dict.
    :param tokens: the words to tag.
    :return: one (pos, score) pair per token; ('XX', 0.0) when no model proposes any tag.
    """
    output = []
    last = len(tokens) - 1

    for i, curr_word in enumerate(tokens):
        # PREV_DUMMY stands in for whichever neighbor is missing at a sentence boundary.
        prev_pos = output[i-1][0] if i > 0 else PREV_DUMMY
        prev_word = tokens[i-1] if i > 0 else PREV_DUMMY
        next_word = tokens[i+1] if i < last else PREV_DUMMY

        # (model, lookup key, weight) in the order in which the models contribute to the scores;
        # the order matters because ties are won by the tag that entered the scores first.
        features = (
            (uni_pos_dict, curr_word, uni_pos_weight),
            (bi_pos_dict, prev_pos, bi_pos_weight),
            (bi_wp_dict, prev_word, bi_wp_weight),
            (bi_wn_dict, next_word, bi_wn_weight),
        )

        scores = dict()
        for table, key, weight in features:
            for pos, prob in table.get(key, ()):
                scores[pos] = scores.get(pos, 0) + prob * weight

        if scores:
            best = max(scores.items(), key=lambda item: item[1])
        else:
            best = ('XX', 0.0)
        output.append(best)

    return output

In [78]:
def evaluate_interpolation(
        uni_pos_dict: Dict[str, List[Tuple[str, float]]],
        bi_pos_dict: Dict[str, List[Tuple[str, float]]],
        bi_wp_dict: Dict[str, List[Tuple[str, float]]],
        bi_wn_dict: Dict[str, List[Tuple[str, float]]],
        uni_pos_weight: float,
        bi_pos_weight: float,
        bi_wp_weight: float,
        bi_wn_weight: float,
        data: List[List[Tuple[str, str]]],
        pprint=False):
    """
    Measures the token-level accuracy of the interpolated tagger against gold tags.

    :param uni_pos_dict: the current-word model passed to predict_interporlation.
    :param bi_pos_dict: the previous-tag model passed to predict_interporlation.
    :param bi_wp_dict: the previous-word model passed to predict_interporlation.
    :param bi_wn_dict: the next-word model passed to predict_interporlation.
    :param uni_pos_weight: the weight of uni_pos_dict.
    :param bi_pos_weight: the weight of bi_pos_dict.
    :param bi_wp_weight: the weight of bi_wp_dict.
    :param bi_wn_weight: the weight of bi_wn_dict.
    :param data: a list of sentences where each sentence is a list of (word, gold pos) pairs.
    :param pprint: if True, prints the accuracy together with the four weights.
    :return: the accuracy as a percentage; 0.0 when the data contains no tokens.
    """
    total, correct = 0, 0
    for sentence in data:
        if not sentence:
            # zip(*sentence) yields nothing to unpack for an empty sentence.
            continue
        tokens, gold = tuple(zip(*sentence))
        pred = [t[0] for t in predict_interporlation(uni_pos_dict, bi_pos_dict, bi_wp_dict, bi_wn_dict, uni_pos_weight, bi_pos_weight, bi_wp_weight, bi_wn_weight, tokens)]
        total += len(tokens)
        correct += sum(1 for g, p in zip(gold, pred) if g == p)

    # Report 0% instead of dividing by zero when there is nothing to score.
    accuracy = 100.0 * correct / total if total else 0.0
    # Honor the flag so that callers looping over many weight settings can stay quiet.
    if pprint:
        print('{:5.2f}% - uni_pos: {:3.1f}, bi_pos: {:3.1f}, bi_wp: {:3.1f}, bi_np: {:3.1f}'.format(accuracy, uni_pos_weight, bi_pos_weight, bi_wp_weight, bi_wn_weight))
    return accuracy

In [83]:
# Baseline: weight all four models equally and score the interpolated tagger on the development split.
uni_pos_weight = 1.0
bi_pos_weight = 1.0
bi_wp_weight = 1.0
bi_wn_weight = 1.0
evaluate_interpolation(uni_pos_dict, bi_pos_dict, bi_wp_dict, bi_wn_dict, uni_pos_weight, bi_pos_weight, bi_wp_weight, bi_wn_weight, dev_data, True)

91.25129014631777

In [84]:
# Grid search over the four interpolation weights on the development split.
# A wider grid such as [0.1, 0.5, 1.0] can be used instead; the number of runs is len(grid) ** 4.
grid = [0.2, 0.5]
best = (0, None)
worst = (100, None)
count = 0

# Every assignment of a grid value to each of the four weights, in nested-loop order.
combos = ((w1, w2, w3, w4) for w1 in grid for w2 in grid for w3 in grid for w4 in grid)

for uni_pos_weight, bi_pos_weight, bi_wp_weight, bi_wn_weight in combos:
    count += 1
    weights = (uni_pos_weight, bi_pos_weight, bi_wp_weight, bi_wn_weight)
    acc = evaluate_interpolation(uni_pos_dict, bi_pos_dict, bi_wp_dict, bi_wn_dict, *weights, dev_data)
    if acc > best[0]:
        best = (acc,) + weights
    if acc < worst[0]:
        worst = (acc,) + weights

print(count)
print('==========================================================')
print('Best : {:5.2f}% - uni_pos: {:3.1f}, bi_pos: {:3.1f}, bi_wp: {:3.1f}, bi_np: {:3.1f}'.format(*best))
print('Worst: {:5.2f}% - uni_pos: {:3.1f}, bi_pos: {:3.1f}, bi_wp: {:3.1f}, bi_np: {:3.1f}'.format(*worst))


16
Best : 93.41% - uni_pos: 0.5, bi_pos: 0.2, bi_wp: 0.2, bi_np: 0.2
Worst: 73.46% - uni_pos: 0.2, bi_pos: 0.5, bi_wp: 0.5, bi_np: 0.5


NLTK

In [85]:
import ssl

# NOTE(review): this swaps the default HTTPS context factory for the unverified one, which
# affects every later use of that default in this process, not only the NLTK downloads.
# Presumably a workaround for certificate errors in nltk.download; confirm it is still needed.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This ssl module has no such helper; keep the default context.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

In [88]:
import nltk

# Fetch the NLTK resources: 'punkt' is required by nltk.word_tokenize; the tagger model is
# presumably what nltk.pos_tag loads. Both downloads need network access.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Error loading punkt: <urlopen error [Errno 61] Connection
[nltk_data]     refused>
[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [Errno 61] Connection refused>


False

In [87]:
# Tokenize an example sentence with NLTK; this fails with a LookupError until 'punkt' is installed.
tokens = nltk.word_tokenize("I bought a car yesterday that was blue.")
print(tokens)

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/Users/carol/nltk_data'
    - '/Users/carol/PycharmProjects/cs329/venv/nltk_data'
    - '/Users/carol/PycharmProjects/cs329/venv/share/nltk_data'
    - '/Users/carol/PycharmProjects/cs329/venv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


In [None]:
# Tag the tokenized example sentence with NLTK's tagger.
nltk.pos_tag(tokens)

In [None]:
def evaluate_nltk(data: List[List[Tuple[str, str]]]):
    """
    Prints the token-level accuracy of nltk.pos_tag against gold tags.

    :param data: a list of sentences where each sentence is a list of (word, gold pos) pairs.
    """
    total, correct = 0, 0
    for sentence in data:
        if not sentence:
            # zip(*sentence) yields nothing to unpack for an empty sentence.
            continue
        tokens, gold = tuple(zip(*sentence))
        pred = [pos for token, pos in nltk.pos_tag(tokens)]
        total += len(tokens)
        correct += sum(1 for g, p in zip(gold, pred) if g == p)

    # Report 0% instead of dividing by zero when there is nothing to score.
    accuracy = 100.0 * correct / total if total else 0.0
    print('{:5.2f}% ({}/{})'.format(accuracy, correct, total))

In [None]:
# Score NLTK's tagger on the development split for comparison with the models above.
evaluate_nltk(dev_data)