In [3]:
import pandas as pd
import ast
from collections import Counter
import math
import re

In [4]:
def get_bigram_corpus(wordlist):
    """
    Returns the bigram corpus given by the input [wordlist]
    
    get_bigram_corpus creates a dictionary with bigrams from the [wordlist]
    as keys and counts the instances of each bigram to assign values (except
    for the bigram ('.', '<s>') which represents the start of one review and
    the end of another)
    
    wordlist: a list of words (strings)
    """
    corpus = {}
    for i, word in enumerate(wordlist[1:], start=1):
        if word != '<s>':
            if (wordlist[i-1], word) not in corpus:
                corpus[(wordlist[i-1], word)] = 1
            else:
                corpus[(wordlist[i-1], word)] += 1
    return corpus

def get_smooth_bigram_corpus(tokenlist, bigram_corpus):
    """
    Returns a dataframe object where the columns and rows are labeled with the tokens
    in [tokenlist] and the elements are the counts of each bigram (defaulting to 1 to handle
    add-one smoothing) in the format (row, column) so that the count for bigram (x, y) is 
    found with df.loc[x, by]
    
    get_smooth_bigram_corpus also appends the unknown word character [<UNK>] to handle 
    unknown words
    
    tokenlist: a list of tokens (strings)
    bigram_corpus: a dictionary of bigram:count pairings
    """
    tokenlist.append('<UNK>')
    df = pd.DataFrame(1, index = tokenlist, columns = tokenlist) 
    for bigram in bigram_corpus:
        df.loc[bigram[0], bigram[1]] += bigram_corpus[bigram]
    return df

def get_smooth_bigram_prob(bigram, smooth_bigram_corpus):
    """
    Returns the probability of a given [bigram] on a given [smooth_bigram_corpus]
    
    get_smooth_bigram_prob takes the ratio of the value of [bigram] (df.loc(bigram[0], bigram[1]))
    in the table [smooth_bigram_corpus] to the sum of all elements in the same row
    
    bigram: a bigram (tuple of strings)
    smooth_bigram_corpus: a dataframe with tokens as row and column names and bigram counts as values
    """
    return smooth_bigram_corpus.loc[bigram[0], bigram[1]] / smooth_bigram_corpus.loc[bigram[0]].sum()

In [5]:
train_df = pd.read_csv('./data_release/train.csv', encoding='cp1252')


In [6]:

for x in train_df.iterrows():
    print(x[1][0])
    break

Ca n't fail to be entertaining .


In [7]:
train_df

Unnamed: 0,sentence,pos_seq,label_seq
0,Ca n't fail to be entertaining .,"['VERB', 'ADV', 'VERB', 'PART', 'VERB', 'ADJ',...","[0, 0, 0, 0, 0, 0, 0]"
1,How much was he going to tell her ?,"['ADV', 'ADJ', 'VERB', 'PRON', 'VERB', 'PART',...","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,"Up until that news hit the Committee , Don had...","['ADP', 'ADP', 'DET', 'NOUN', 'VERB', 'DET', '...","[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
3,Could go on to the rugby and go with them coul...,"['VERB', 'VERB', 'PART', 'ADP', 'DET', 'NOUN',...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,"Finally , we went to the office and they gave ...","['ADV', 'PUNCT', 'PRON', 'VERB', 'ADP', 'DET',...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ..."
5,It must be shown that the defendant intended (...,"['PRON', 'VERB', 'VERB', 'VERB', 'ADP', 'DET',...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
6,The Englishman nodded and poured himself more ...,"['DET', 'PROPN', 'VERB', 'CCONJ', 'VERB', 'PRO...","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
7,I demanded .,"['PRON', 'VERB', 'PUNCT']","[0, 0, 0]"
8,What is not known is information on the locati...,"['NOUN', 'VERB', 'ADV', 'VERB', 'VERB', 'NOUN'...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
9,That 's getting in the,"['DET', 'VERB', 'VERB', 'ADP', 'DET']","[1, 0, 0, 0, 0]"


# HMM Model

In [92]:
class HMM():
    def __init__(self, training_df):
        self.bigram_counts = {}
        self.POS_counts = Counter()
        
        self.lex_gen = {}
        self.word_pattern = re.compile("(\w+|<s> |[,.!?;\(\)])")

        self.training_df = training_df
            
        for row in self.training_df.iterrows():
 
            POS_list = ast.literal_eval(row[1][1])
            POS_list.insert(0, '<START>')
            
            POS_bigram_counts = get_bigram_corpus(POS_list)
            
            sentence_list = row[1][0].split()
            sentence_list.insert(0, '<s>')
            
            
            for i, word in enumerate(sentence_list):
                if word not in self.lex_gen:
                    self.lex_gen[word] = {POS_list[i] : 1}
                else:
                    if POS_list[i] not in self.lex_gen[word]:
                        self.lex_gen[word][POS_list[i]] = 1
                    else:
                        self.lex_gen[word][POS_list[i]] += 1
            
            # Updates list of POS counts across whole training set 
            self.POS_counts.update(POS_list)
            
            for bigram in POS_bigram_counts: 
                if bigram in self.bigram_counts:
                    self.bigram_counts[bigram] += POS_bigram_counts[bigram]
                else:
                    self.bigram_counts[bigram] = POS_bigram_counts[bigram]
        
        self.smoothed_bigram_counts_df = get_smooth_bigram_corpus(list(self.POS_counts), self.bigram_counts)
    


      
    def prob_tagged_sequence(self, sequence):
        '''
        
        sequence: tuple where first element is sentence as a string, second element is a 
            list of POS associated with each word of sentence 
        '''
        
        sentence_list = self.word_pattern.findall(sequence[0])
        POS_list = sequence[1]
        
        log_prob_acc = 0

        for i, POS in enumerate(POS_list):
            if i == 0:
                continue

            POS_bigram = (POS_list[i-1], POS) 
            log_prob_acc += math.log(get_smooth_bigram_prob(POS_bigram, self.smoothed_bigram_counts_df))

            log_prob_acc += math.log(self.lex_gen[sentence_list[i]].get(POS, 1)/dict(self.POS_counts)[POS]) 
        
        return math.exp(log_prob_acc)
    
    
    def viterbi(self, sentence):
        '''
        sentence: string
        '''
        
        sentence_list = self.word_pattern.findall(sentence)

        backpointers = []
        scores = []
        previous_scores = []

        POS_list = list(self.POS_counts)
        num_lex_categories = len(self.POS_counts)

        #initialization
        for i in range(1, num_lex_categories):
            POS = POS_list[i]

            initial_prob = get_smooth_bigram_prob(('<START>', POS), self.smoothed_bigram_counts_df)
            lex_gen_prob = self.lex_gen.get(sentence_list[0], '<UNK>').get(POS, 1)/self.POS_counts.get(POS)

            previous_scores.append(initial_prob * lex_gen_prob)

        #iteration
        for t in range(1, len(sentence_list)):

            t_backpointers = []

            for i in range(1, num_lex_categories):

                temp_backpointer = None
                max_score = (0, -1)

                for j in range(1, num_lex_categories):
                    transition_prob = get_smooth_bigram_prob((POS_list[j], POS_list[i]), self.smoothed_bigram_counts_df) 
                    lex_gen_prob = self.lex_gen.get(sentence_list[t], '<UNK>').get(POS, 1) / self.POS_counts.get(POS)
                    #index j-1 accounts for exclusion of START POS during init of previous_scores
                    score = previous_scores[j-1] * transition_prob * lex_gen_prob
                    scores.append(score)
                    
                    print(j)
                    print(score)

                    if score > max_score[0]:
                        max_score = (score, j)
                        temp_backpointer = j

                t_backpointers.append(temp_backpointer)
                previous_scores = scores

            backpointers.append(t_backpointers)

        #backtracking
        tags = []
        max_index = max_score[1]
        print(backpointers)
        for bptr_list in reversed(backpointers):
            print(len(bptr_list))
            print(max_index)
            tags.insert(0, POS_list[max_index])
            
            #index max_index-1 accounts for exclusion of START POS during init of backpointer lists
            max_index = bptr_list[max_index-1]
            
        return tags
            


In [93]:
len(hmm.POS_counts)
    

16

In [94]:
hmm = HMM(train_df)
# print(hmm.POS_counts)
hmm.viterbi('this is a test of the system')

1
1.5167990953765255e-07
2
9.038506584413841e-07
3
1.873855668941158e-07
4
4.56313622348304e-08
5
1.6333434369515513e-08
6
5.245444268798631e-06
7
5.5362615051921065e-08
8
5.127625824756266e-05
9
9.304380261570424e-08
10
4.770139038290372e-07
11
5.663576997891677e-07
12
6.410807647250503e-08
13
1.3337831563964874e-06
14
1.2788814050643707e-07
15
2.8677946659019223e-07
1
4.072022186818519e-09
2
1.913821962486589e-08
3
6.592726304799744e-10
4
1.1999550686040918e-10
5
2.6505502037363253e-10
6
5.71046055251829e-08
7
2.39257198010862e-10
8
2.0535221621347058e-07
9
5.919269509149837e-10
10
1.7403769864311478e-09
11
1.1760774046847494e-08
12
1.9298036265052692e-10
13
9.487923957950389e-09
14
1.382574491961482e-09
15
2.6070860599108388e-09
1
2.008300340705265e-09
2
2.4770175263790997e-09
3
2.7892303597229687e-10
4
2.040868463137668e-10
5
1.860035230692158e-11
6
2.0145656591112272e-08
7
2.1674868112482275e-11
8
2.186620820791585e-08
9
5.536204302086211e-10
10
6.091319452509018e-09
11
9.98261988

9
1.3680900252272348e-11
10
1.506095469026955e-10
11
5.303266811575793e-10
12
2.3157643518063224e-11
13
2.1530288981502807e-08
14
6.91287245980741e-10
15
2.6070860599108388e-09
1
1.4584606686312746e-12
2
2.6073868698727367e-11
3
1.2678319816922585e-11
4
2.8345395321356496e-12
5
2.3250440383651977e-12
6
3.0294220437762816e-10
7
4.168243867785052e-12
8
1.9014094093839866e-09
9
2.73618005045447e-12
10
6.6937576401198e-11
11
3.119568712691643e-11
12
7.719214506021074e-12
13
3.6492015222886113e-10
14
2.765148983922964e-09
15
2.6070860599108388e-09
1
1.4584606686312746e-12
2
2.6073868698727367e-11
3
1.2678319816922585e-11
4
9.448465107118833e-13
5
4.650088076730395e-13
6
1.5147110218881408e-10
7
8.336487735570105e-13
8
9.507047046919933e-10
9
2.73618005045447e-12
10
5.02031823008985e-11
11
3.119568712691643e-11
12
1.543842901204215e-11
13
3.6492015222886113e-10
14
6.91287245980741e-10
15
2.6070860599108388e-09
1
5.3715106425689845e-09
2
5.556341419698802e-08
3
2.2009563202177606e-08
4
5.0076

4
5.405466887782684e-09
5
2.35294456682558e-10
6
8.93679502914003e-09
7
1.977414890877229e-09
8
5.803101517439928e-06
9
1.9116777952508565e-09
10
5.5558188412994335e-09
11
1.8311868343499944e-08
12
3.6280308178299056e-09
13
8.393163501263805e-09
14
2.765148983922964e-09
15
7.821258179732516e-09
1
5.221289193699963e-10
2
1.4862105158274599e-09
3
6.085593512122841e-10
4
1.6440329286386768e-10
5
2.4924472091274924e-10
6
1.2117688175105126e-09
7
1.0112159623246535e-09
8
8.157046366257303e-07
9
1.0215072188363356e-10
10
2.4833840844844457e-08
11
8.703596708409684e-09
12
6.329755894937283e-10
13
6.933482892348362e-09
14
2.0738617379422232e-09
15
2.6070860599108388e-09
1
3.675320884950812e-10
2
3.181011981244739e-09
3
5.958810313953615e-10
4
2.919575718099719e-10
5
3.501516321777988e-10
6
1.0451506051028171e-08
7
3.91814923571795e-11
8
1.9964798798531863e-08
9
1.1391629610058777e-09
10
5.33827171799554e-09
11
2.8076118414224785e-10
12
5.635026589395385e-10
13
8.393163501263805e-09
14
1.382574

['SYM', 'SYM', 'SYM', 'SYM', 'SYM', 'SYM']

In [75]:
k = '["hello", "hi"]'

In [11]:
y = ast.literal_eval(k)

In [12]:
hmm

<__main__.HMM at 0x1109212e8>

In [13]:
diction = {'hello' : 1, 'apple' : 2}

5
4
3
2
1


In [78]:
for i in range(1, 20):
    print(i)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
