In [0]:
from collections import Counter
import re
import operator


class HMM:
    def __init__(self, initial_probabilities, transition_probabilities, emission_probabilities, initial_frequencies,
                 transition_frequencies, emission_frequencies, all_tags, all_tags_and_counts):
        self.initial_probabilities = initial_probabilities
        self.transition_probabilities = transition_probabilities
        self.emission_probabilities = emission_probabilities
        self.initial_frequencies = initial_frequencies
        self.transition_frequencies = transition_frequencies
        self.emission_frequencies = emission_frequencies
        self.alltags = all_tags
        self.all_tags_and_counts = all_tags_and_counts

    # This function takes path as an argument, reads and returns the list of sentences
    def dataset(path):
        with open(path) as f:
            read_data = f.read()
        # Split data into paragraphs
        paragraphs = re.split('\n\n', read_data)
        sentences = []
        all_tags = []
        for line in paragraphs:
            # Split paragraphs into lines
            lines = re.split('\n', line)
            sentence = ''
            tag = ''
            for word in lines:
                # Split lines into words
                words = re.split(' ', word)
                if len(words) is 4:
                    if words[0] != '-DOCSTART-':
                        if path == '/content/drive/My Drive/497/train.txt':
                            # Take first words from each lines, merge them with tags and create tagged sentences
                            sentence = sentence + words[0].lower() + '|' + words[3] + ' '
                        if path == '/content/drive/My Drive/497/test.txt':
                            sentence = sentence + words[0].lower() + ' '
                        all_tags.append(words[3])
            # Remove spaces from the end of each sentence and store them in a list
            if sentence != '':
                sentences.append(sentence[:len(sentence) - 1])
        HMM.all_tags_and_counts = dict(Counter(all_tags))
        # Remove duplicates and store all tags in a list
        alltags = list(dict.fromkeys(all_tags))
        HMM.alltags = alltags
        return sentences

    # This function takes sentences and create HMM model.
    # Returns initial, transition and emission probabilities of HMM model.
    def hmm(sentences):
        tags = []
        tag_bigrams = []

        all_words_with_tags = []    # All words and tags from train.txt (word | tag)
        for sentence in sentences:
            # Split each sentence into words
            words = sentence.split(' ')
            all_words_with_tags.extend(words)
            all_tags = []
            for i in range(0, len(words)):
                y = words[i].split('|')
                all_tags.append(y[1])
                if i == 0:
                  tags.append(y[1])
            for j in range(0, len(words) - 1):
                tag_bigrams.append(all_tags[j + 1] + "|" + all_tags[j])


        initial_sum = len(tags)

        # ---FREQUENCY LISTS FOR CALCULATE INITIAL, TRANSITION AND EMISSION PROBABILITIES---

        # I keep first tags of sentences and their frequencies in a dictionary
        HMM.initial_frequencies = dict(Counter(tags))

        # I keep the frequencies of the tags coming after another tag (bigram) in a dictionary
        HMM.transition_frequencies = dict(Counter(tag_bigrams))

        # I keep word - tag pairs and their frequencies in a dictionary
        HMM.emission_frequencies = dict(Counter(all_words_with_tags))

        initial_probabilities = {}
        transition_probabilities = {}
        emission_probabilities = {}

        # INITIAL PROBABILITY IS BEING CALCULATED
        for key in HMM.initial_frequencies:
            initial_probabilities[key] = float(HMM.initial_frequencies[key] / initial_sum)

        # TRANSITION PROBABILITY IS BEING CALCULATED
        for key in HMM.transition_frequencies:
            tags = key.split('|')
            transition_probabilities[key] = float(HMM.transition_frequencies[key] / HMM.all_tags_and_counts[tags[1]])
 
        # EMISSION PROBABILITY IS BEING CALCULATED
        for key in HMM.emission_frequencies:
            tags = key.split('|')
            emission_probabilities[key] = float(HMM.emission_frequencies[key] / HMM.all_tags_and_counts[tags[1]])



        return initial_probabilities, transition_probabilities, emission_probabilities

    # This function takes test sentences and apply Viterbi Algorithm on them.
    # Then returns the tagged test sentences.

    def viterbi(sentences):

        sentences_with_tags = []

        for sentence in sentences:
            sentence_with_tags = ''
            # Split each sentence into words
            words = sentence.split(' ')
            rows = len(HMM.alltags)
            columns = len(words)
            # Fills whole matrix with 1's
            viterbi_matrix = [[1 for x in range(0, columns)] for y in range(0, rows)]

            # Fills the first column of the matrix with initial probabilities * emission probabilities
            for i in range(0, rows):
                # Checking emission probability existence in HMM Model 
                try:
                    emission = HMM.emission_probabilities[words[0] + '|' + HMM.alltags[i]]
                    viterbi_matrix[i][0] = float(float(HMM.initial_probabilities[HMM.alltags[i]]) * float(emission))
                except:
                    emission = 0
                    viterbi_matrix[i][0] = float(emission)

            transitions = {}
            if columns > 1:
                for j in range(1, columns):
                    for i in range(0, rows):
                        # Checking emission probability existence in HMM Model
                        try:
                            # Getting emission probability from HMM Model for current word and tag
                            emission = HMM.emission_probabilities[words[j] + '|' + HMM.alltags[i]]
                        # If there is no emission probability in HMM Model, Smoothing algorithm not applied.
                        except:
                            emission = 0
                        for k in range(0, rows):
                            # Checking transition probability existence in HMM Model
                            try:
                                # For each tag we multiply emission probability,transition probability and  previous probabilities
                                # Then we take the maximum of these probabilities for each tag
                                transitions[k] = HMM.transition_probabilities[HMM.alltags[k] + '|' + HMM.alltags[i]] * emission * viterbi_matrix[k][j - 1]
                            # If transition probability does not exists in HMM Model Laplace smoothing applied
                            except:
                                transitions[k] = (1 / ((HMM.all_tags_and_counts[HMM.alltags[i]] + 1))) * emission * viterbi_matrix[k][j - 1]
                                
                        max_transition_index = max(transitions.items(), key=operator.itemgetter(1))[0]
                        # We take the maximum probabilities for each tag and add them to our matrix
                        viterbi_matrix[i][j] = transitions[max_transition_index]

            maxs = []
            index = 0
            all_columns = {}
            for j in range(0, columns):
                each_column = []
                for i in range(0, rows):
                    each_column.append(viterbi_matrix[i][j])
                all_columns[j] = each_column
                element = max(all_columns[j])
                maxs.append(each_column.index(element))
            # Predicted tags are added to the test sentences
            for i in maxs:
                sentence_with_tags = sentence_with_tags + (words[index] + '|' + HMM.alltags[i] + ' ')
                index = index + 1

            sentences_with_tags.append(sentence_with_tags)

        return sentences_with_tags

    # It takes gold sequences and predicted sequences and returns the accuracy
    def accuracy(gold_sequences, predicted_sequences):
        total_words = 0
        correct_found_tags = 0
        for i in range(0, len(gold_sequences)):
            gold_words = gold_sequences[i].split(' ')
            predicted_words = predicted_sequences[i].split(' ')
            for j in range(0, len(gold_words)):
                if gold_words[j] == predicted_words[j]:
                    total_words = total_words + 1
                    correct_found_tags = correct_found_tags + 1
                else:
                    total_words = total_words + 1

        return correct_found_tags / total_words

# Reread test.txt to obtain gold_sequence
def read_sentences(path):
    with open(path) as f:
        read_data = f.read()
        paragraphs = re.split('\n\n', read_data)  # split data into paragraphs
        sentences = []
        for line in paragraphs:
            lines = re.split('\n', line)  # split paragraphs into lines
            sentence = ''
            tag = ''

            for word in lines:
                words = re.split(' ', word)  # split lines into words
                if len(words) is 4:
                    if words[0] != '-DOCSTART-':
                        sentence = sentence + words[0].lower() + '|' + words[3] + ' '
            if sentence != '':
                sentences.append(sentence[:len(sentence) - 1])
    return sentences


sentences = HMM.dataset("/content/drive/My Drive/497/train.txt")

HMM.initial_probabilities, HMM.transition_probabilities, HMM.emission_probabilities = HMM.hmm(sentences)

test_sentences = HMM.dataset('/content/drive/My Drive/497/test.txt')

predicted_sentences = HMM.viterbi(test_sentences)

test_sentences_with_tags = read_sentences('/content/drive/My Drive/497/test.txt')

accuracy = HMM.accuracy(test_sentences_with_tags, predicted_sentences)
print("Accuracy is  : " + str(accuracy))



Accuracy is  : 0.7506535947712418


In [0]:
from google.colab import drive
drive.mount('/content/drive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/
