In [3]:
import pandas as pd
import ast
from collections import Counter
import math
import re

In [4]:
def get_bigram_corpus(wordlist):
    """
    Returns the bigram corpus given by the input [wordlist]
    
    get_bigram_corpus creates a dictionary with bigrams from the [wordlist]
    as keys and counts the instances of each bigram to assign values (except
    for the bigram ('.', '<s>') which represents the start of one review and
    the end of another)
    
    wordlist: a list of words (strings)
    """
    corpus = {}
    for i, word in enumerate(wordlist[1:], start=1):
        if word != '<s>':
            if (wordlist[i-1], word) not in corpus:
                corpus[(wordlist[i-1], word)] = 1
            else:
                corpus[(wordlist[i-1], word)] += 1
    return corpus

def get_smooth_bigram_corpus(tokenlist, bigram_corpus):
    """
    Returns a dataframe object where the columns and rows are labeled with the tokens
    in [tokenlist] and the elements are the counts of each bigram (defaulting to 1 to handle
    add-one smoothing) in the format (row, column) so that the count for bigram (x, y) is 
    found with df.loc[x, by]
    
    get_smooth_bigram_corpus also appends the unknown word character [<UNK>] to handle 
    unknown words
    
    tokenlist: a list of tokens (strings)
    bigram_corpus: a dictionary of bigram:count pairings
    """
    tokenlist.append('<UNK>')
    df = pd.DataFrame(1, index = tokenlist, columns = tokenlist) 
    for bigram in bigram_corpus:
        df.loc[bigram[0], bigram[1]] += bigram_corpus[bigram]
    return df

def get_smooth_bigram_prob(bigram, smooth_bigram_corpus):
    """
    Returns the probability of a given [bigram] on a given [smooth_bigram_corpus]
    
    get_smooth_bigram_prob takes the ratio of the value of [bigram] (df.loc(bigram[0], bigram[1]))
    in the table [smooth_bigram_corpus] to the sum of all elements in the same row
    
    bigram: a bigram (tuple of strings)
    smooth_bigram_corpus: a dataframe with tokens as row and column names and bigram counts as values
    """
    return smooth_bigram_corpus.loc[bigram[0], bigram[1]] / smooth_bigram_corpus.loc[bigram[0]].sum()

In [5]:
train_df = pd.read_csv('./data_release/train.csv', encoding='cp1252')


In [6]:

for x in train_df.iterrows():
    print(x[1][0])
    break

Ca n't fail to be entertaining .


In [7]:
train_df

Unnamed: 0,sentence,pos_seq,label_seq
0,Ca n't fail to be entertaining .,"['VERB', 'ADV', 'VERB', 'PART', 'VERB', 'ADJ',...","[0, 0, 0, 0, 0, 0, 0]"
1,How much was he going to tell her ?,"['ADV', 'ADJ', 'VERB', 'PRON', 'VERB', 'PART',...","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,"Up until that news hit the Committee , Don had...","['ADP', 'ADP', 'DET', 'NOUN', 'VERB', 'DET', '...","[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
3,Could go on to the rugby and go with them coul...,"['VERB', 'VERB', 'PART', 'ADP', 'DET', 'NOUN',...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,"Finally , we went to the office and they gave ...","['ADV', 'PUNCT', 'PRON', 'VERB', 'ADP', 'DET',...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ..."
5,It must be shown that the defendant intended (...,"['PRON', 'VERB', 'VERB', 'VERB', 'ADP', 'DET',...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
6,The Englishman nodded and poured himself more ...,"['DET', 'PROPN', 'VERB', 'CCONJ', 'VERB', 'PRO...","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
7,I demanded .,"['PRON', 'VERB', 'PUNCT']","[0, 0, 0]"
8,What is not known is information on the locati...,"['NOUN', 'VERB', 'ADV', 'VERB', 'VERB', 'NOUN'...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
9,That 's getting in the,"['DET', 'VERB', 'VERB', 'ADP', 'DET']","[1, 0, 0, 0, 0]"


# HMM Model

In [95]:
class HMM():
    def __init__(self, training_df):
        self.bigram_counts = {}
        self.POS_counts = Counter()
        
        self.lex_gen = {}
        self.word_pattern = re.compile("(\w+|<s> |[,.!?;\(\)])")

        self.training_df = training_df
            
        for row in self.training_df.iterrows():
 
            POS_list = ast.literal_eval(row[1][1])
            POS_list.insert(0, '<START>')
            
            POS_bigram_counts = get_bigram_corpus(POS_list)
            
            sentence_list = row[1][0].split()
            sentence_list.insert(0, '<s>')
            
            
            for i, word in enumerate(sentence_list):
                if word not in self.lex_gen:
                    self.lex_gen[word] = {POS_list[i] : 1}
                else:
                    if POS_list[i] not in self.lex_gen[word]:
                        self.lex_gen[word][POS_list[i]] = 1
                    else:
                        self.lex_gen[word][POS_list[i]] += 1
            
            # Updates list of POS counts across whole training set 
            self.POS_counts.update(POS_list)
            
            for bigram in POS_bigram_counts: 
                if bigram in self.bigram_counts:
                    self.bigram_counts[bigram] += POS_bigram_counts[bigram]
                else:
                    self.bigram_counts[bigram] = POS_bigram_counts[bigram]
        
        self.smoothed_bigram_counts_df = get_smooth_bigram_corpus(list(self.POS_counts), self.bigram_counts)
    


      
    def prob_tagged_sequence(self, sequence):
        '''
        
        sequence: tuple where first element is sentence as a string, second element is a 
            list of POS associated with each word of sentence 
        '''
        
        sentence_list = self.word_pattern.findall(sequence[0])
        POS_list = sequence[1]
        
        log_prob_acc = 0

        for i, POS in enumerate(POS_list):
            if i == 0:
                continue

            POS_bigram = (POS_list[i-1], POS) 
            log_prob_acc += math.log(get_smooth_bigram_prob(POS_bigram, self.smoothed_bigram_counts_df))

            log_prob_acc += math.log(self.lex_gen[sentence_list[i]].get(POS, 1)/dict(self.POS_counts)[POS]) 
        
        return math.exp(log_prob_acc)
    
    
    def viterbi(self, sentence):
        
        '''
        
        sentence: string
        '''
        sentence_list = self.word_pattern.findall(sentence)

        backpointers = [[]]
        scores = []
        previous_scores = []
        
        POS_list = list(self.POS_counts)
        
        
        #initialization
        for i in range(1, len(self.POS_counts)):
            POS = POS_list[i]
            
            initial_transition_prob = self.smoothed_bigram_counts_df.loc['<START>', POS] / len(self.training_df) 
            lex_gen_prob = self.lex_gen.get(sentence_list[0], '<UNK>').get(POS, 1)/self.POS_counts.get(POS)
            
            previous_scores.append(initial_transition_prob * lex_gen_prob)
            
    
        #iteration
        for t in range(2, len(sentence_list)):
            for i in range(1, len(self.POS_counts)):
                
                temp_backpointers = []
                
                max_score = (0, -1)
                
                for j in range(1, len(self.POS_counts)-1):
                    transition_prob = self.smoothed_bigram_counts_df.loc[POS_list[j], POS_list[i]]
                    lex_gen_prob = self.lex_gen.get(sentence_list[t], '<UNK>').get(POS, 1)/self.POS_counts.get(POS)
                    

                    score = previous_scores[j] * transition_prob * lex_gen_prob
                    scores.append(score)


                    if score > max_score[0]:
                        max_score = (score, j)
                        temp_backpointers.append(j)
                
                backpointers.append(temp_backpointers)
                
                previous_scores = scores
                

        #backtracking
#         for backpointer in backpointers:
        print(backpointers)
            
                
            
        

    

In [96]:
len(hmm.POS_counts)
    

16

In [97]:
hmm = HMM(train_df)
# print(hmm.POS_counts)
hmm.viterbi('this is a test')

[[], [1, 7], [1, 5, 6], [1, 4, 6], [1, 2, 4, 6], [1, 2, 4, 6], [1, 5, 6], [1, 4, 5, 6], [1, 5, 6], [1, 4, 8], [1, 4, 5, 8], [1, 2, 4, 5, 6], [1, 4, 5, 6], [1, 5, 6], [1, 2, 4, 5, 6, 14], [1, 2, 4, 6], [1, 2, 4, 5, 6], [1, 5, 6], [1, 4, 6], [1, 2, 4, 6], [1, 2, 4, 6], [1, 5, 6], [1, 4, 5, 6], [1, 5, 6], [1, 4, 8], [1, 4, 5, 8], [1, 2, 4, 5, 6], [1, 4, 5, 6], [1, 5, 6], [1, 2, 4, 5, 6, 14], [1, 2, 4, 6]]


In [10]:
k = '["hello", "hi"]'

In [11]:
y = ast.literal_eval(k)

In [12]:
hmm

<__main__.HMM at 0x1109212e8>

In [13]:
diction = {'hello' : 1, 'apple' : 2}

6337

In [65]:
for i in range(1, 20):
    print(i)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
