In [12]:
import pandas as pd
from collections import defaultdict, Counter
import random
import math
import csv

In [10]:
def _read_first_column(path, encoding=None):
    """Return the first field of every non-blank row of the CSV file at *path*."""
    # newline='' lets the csv module do its own line-ending handling, so
    # line breaks embedded in quoted fields come back unmodified.
    with open(path, 'r', newline='', encoding=encoding) as handle:
        # csv.reader yields [] for a completely blank line; such rows have no
        # first field, so they are skipped instead of raising IndexError.
        return [row[0] for row in csv.reader(handle) if row]


def load_data(file_path_x, file_path_y=None, encoding=None):
    """Load the first column of a words CSV and, optionally, of a tags CSV.

    Args:
        file_path_x: Path of the CSV file whose first column holds the words.
        file_path_y: Optional path of the CSV file whose first column holds
            the tags. When omitted (or otherwise falsy) no tag file is read.
        encoding: Text encoding passed to ``open`` for both files. ``None``
            (the default) keeps the platform default encoding.

    Returns:
        A ``(words, tags)`` tuple of lists of strings. ``tags`` is an empty
        list when ``file_path_y`` is not given.

    Note:
        Every non-blank row is returned, including a header row if the file
        has one; completely blank lines are ignored.
    """
    words = _read_first_column(file_path_x, encoding)
    tags = _read_first_column(file_path_y, encoding) if file_path_y else []
    return words, tags

# Read every dataset split from disk. The train and dev splits each have a
# companion tag file; the test split is loaded from its word file alone, so
# the empty tag list that comes back with it is discarded.
train_words, train_tags = load_data(
    file_path_x='train_x.csv',
    file_path_y='train_y.csv',
)
dev_words, dev_tags = load_data(
    file_path_x='dev_x.csv',
    file_path_y='dev_y.csv',
)
test_words, _ = load_data(file_path_x='test_x.csv')


In [None]:
class TrigramTagger:
    """Greedy tagger driven by tag-trigram counts and word-emission counts.

    ``train`` collects counts from tagged sentences. ``tag_sentence`` then
    walks a sentence left to right and, for every word, keeps the tag with the
    highest ``trigram_prob * emission_prob`` given the two previously chosen
    tags. Both probabilities use add-one smoothing.
    """

    def __init__(self):
        # trigram_counts[(t1, t2)][t3]: how often tag t3 (or the end marker
        # '</s>') followed the two-tag context (t1, t2).
        self.trigram_counts = defaultdict(lambda: defaultdict(int))
        # Same events as trigram_counts but without the '</s>' end marker.
        # No method of this class reads it; it stays populated so that code
        # which still reads the attribute keeps working.
        self.bigram_counts = defaultdict(lambda: defaultdict(int))
        # unigram_counts[tag]: number of training tokens carrying the tag.
        self.unigram_counts = defaultdict(int)
        # emission_counts[tag][word]: how often the lower-cased word carried
        # the tag.
        self.emission_counts = defaultdict(lambda: defaultdict(int))
        # Total number of training tokens seen.
        self.total_tags = 0
        # Every distinct tag seen during training.
        self.tagset = set()

    def train(self, sentences):
        """Accumulate counts from tagged sentences.

        Args:
            sentences: Iterable of sentences, each an iterable of
                ``(word, tag)`` pairs. Words are lower-cased before counting.
                Calling ``train`` again adds to the existing counts.
        """
        for sentence in sentences:
            context = ('<s>', '<s>')
            for word, tag in sentence:
                self.trigram_counts[context][tag] += 1
                self.bigram_counts[context][tag] += 1
                self.unigram_counts[tag] += 1
                self.emission_counts[tag][word.lower()] += 1
                self.total_tags += 1
                # tag_sentence enumerates this set; without it no tag could
                # ever be proposed.
                self.tagset.add(tag)
                context = (context[1], tag)
            # Record how the sentence ended.
            self.trigram_counts[context]['</s>'] += 1

    def trigram_prob(self, prev_tags, tag):
        """Return the add-one smoothed probability of *tag* after *prev_tags*.

        Computed as ``(count(context, tag) + 1) / (count(context) + V)``
        where ``count(context)`` is the number of recorded events (tags and
        the ``'</s>'`` marker) that followed the context, and ``V`` is the
        number of possible outcomes: every known tag plus ``'</s>'``.

        Args:
            prev_tags: The two preceding tags, as a list or tuple.
            tag: The candidate tag (``'</s>'`` is accepted as well).

        Returns:
            A float in ``(0, 1]``. The counts are not modified by the lookup.
        """
        # .get() instead of indexing: indexing a defaultdict would insert an
        # empty entry for every context or tag that is merely queried.
        followers = self.trigram_counts.get(tuple(prev_tags), {})
        outcomes = len(self.tagset) + 1
        return (followers.get(tag, 0) + 1) / (sum(followers.values()) + outcomes)

    def emission_prob(self, tag, word):
        """Return the add-one smoothed probability of *word* given *tag*.

        Computed as ``(count(tag, word) + 1) / (count(tag) + types(tag))``
        where ``types(tag)`` is the number of distinct lower-cased words seen
        with the tag in training.

        Args:
            tag: The tag to condition on.
            word: The word; it is lower-cased before the lookup.

        Returns:
            A float; ``0.0`` when the tag has no training data at all. The
            counts are not modified by the lookup.
        """
        # .get() keeps the lookup read-only; indexing would add the queried
        # word with a zero count and silently enlarge types(tag) for every
        # later call.
        word_counts = self.emission_counts.get(tag, {})
        denominator = self.unigram_counts.get(tag, 0) + len(word_counts)
        if denominator == 0:
            return 0.0
        return (word_counts.get(word.lower(), 0) + 1) / denominator

    def tag_sentence(self, sentence):
        """Tag *sentence* greedily, one word at a time.

        Args:
            sentence: Iterable of word strings.

        Returns:
            A list of ``(word, tag)`` tuples in input order. If the tagger
            has not been trained, every tag is ``None``.
        """
        context = ('<s>', '<s>')
        tagged_sentence = []
        # A fixed candidate order makes ties resolve the same way on every
        # run; iteration order of a set of strings is not stable across runs.
        candidates = sorted(self.tagset)
        for word in sentence:
            best_tag = None
            max_prob = 0
            for tag in candidates:
                prob = self.trigram_prob(context, tag) * self.emission_prob(tag, word)
                if prob > max_prob:
                    max_prob = prob
                    best_tag = tag
            tagged_sentence.append((word, best_tag))
            context = (context[1], best_tag)
        return tagged_sentence


In [None]:
class TrigramTaggerWithSmoothing(TrigramTagger):
    """TrigramTagger whose transition probabilities use add-k smoothing."""

    def __init__(self, k=1):
        """Create an untrained tagger.

        Args:
            k: Pseudo-count added to every transition count. Must not be
                negative.

        Raises:
            ValueError: If ``k`` is negative, which could produce negative
                probabilities.
        """
        super().__init__()
        if k < 0:
            raise ValueError("k must be non-negative")
        self.k = k

    def trigram_prob(self, prev_tags, tag):
        """Return the add-k smoothed probability of *tag* after *prev_tags*.

        Computed as ``(count(context, tag) + k) / (count(context) + k * V)``
        where ``count(context)`` is the number of recorded events (tags and
        the ``'</s>'`` marker) that followed the context, and ``V`` is the
        number of possible outcomes: every known tag plus ``'</s>'``.

        Args:
            prev_tags: The two preceding tags, as a list or tuple.
            tag: The candidate tag.

        Returns:
            A float; ``0.0`` when the denominator is zero (``k == 0`` and a
            context that was never seen). The counts are not modified by the
            lookup.
        """
        # .get() instead of indexing: indexing a defaultdict would insert an
        # empty entry for every context or tag that is merely queried.
        followers = self.trigram_counts.get(tuple(prev_tags), {})
        outcomes = len(self.tagset) + 1
        denominator = sum(followers.values()) + self.k * outcomes
        if denominator == 0:
            return 0.0
        return (followers.get(tag, 0) + self.k) / denominator
