In [55]:
import pandas as pd
import ast
from collections import Counter

In [82]:
def get_bigram_corpus(wordlist):
    """
    Builds a bigram -> count dictionary from the input [wordlist]

    Every pair of adjacent entries (previous, current) in [wordlist] is
    tallied, except pairs whose second entry is the start marker '<s>':
    such a pair straddles the end of one review and the start of the next,
    so it is left out of the counts

    wordlist: a list of words (strings)

    Returns a dictionary whose keys are (previous, current) tuples and whose
    values are the number of times that pair occurs; a list with fewer than
    two entries gives an empty dictionary
    """
    counts = {}
    # Pair each word with its predecessor by zipping the list with itself
    # shifted by one position.
    for previous, current in zip(wordlist, wordlist[1:]):
        if current == '<s>':
            continue
        pair = (previous, current)
        counts[pair] = counts.get(pair, 0) + 1
    return counts

def get_smooth_bigram_corpus(tokenlist, bigram_corpus):
    """
    Returns a dataframe object where the columns and rows are labeled with the tokens
    in [tokenlist] and the elements are the counts of each bigram (defaulting to 1 to handle
    add-one smoothing) in the format (row, column) so that the count for bigram (x, y) is
    found with df.loc[x, y]

    get_smooth_bigram_corpus also adds the unknown word token '<UNK>' as a final
    row/column label to handle unknown words. The label is added to a private copy of
    [tokenlist], so the caller's list is left unchanged, and it is added only if it is
    not already present, so calling the function repeatedly never creates duplicate labels

    tokenlist: a list of tokens (strings)
    bigram_corpus: a dictionary of bigram:count pairings; both tokens of every bigram
        must be among the labels, otherwise the lookup raises KeyError
    """
    # Work on a copy: appending to the argument itself would leak '<UNK>' into
    # the caller's list and duplicate it on every subsequent call.
    labels = list(tokenlist)
    if '<UNK>' not in labels:
        labels.append('<UNK>')

    # Every cell starts at 1 (add-one smoothing); observed counts go on top.
    df = pd.DataFrame(1, index=labels, columns=labels)
    for bigram, count in bigram_corpus.items():
        df.loc[bigram[0], bigram[1]] += count
    return df

In [77]:
# Load the training set from the data_release folder; the file is decoded as
# Windows-1252 (cp1252).
train_df = pd.read_csv('./data_release/train.csv', encoding='cp1252')


In [87]:

# Sanity check: print the first cell of the first row (the sentence text).
# .iloc is used for explicit positional access; a bare integer key on a
# string-labelled row Series is deprecated as a positional lookup in pandas.
# The length guard keeps the cell silent on an empty frame.
if len(train_df) > 0:
    print(train_df.iloc[0, 0])

Ca n't fail to be entertaining .


In [88]:
# Display the loaded training frame to inspect its columns and a few rows.
train_df

Unnamed: 0,sentence,pos_seq,label_seq
0,Ca n't fail to be entertaining .,"['VERB', 'ADV', 'VERB', 'PART', 'VERB', 'ADJ',...","[0, 0, 0, 0, 0, 0, 0]"
1,How much was he going to tell her ?,"['ADV', 'ADJ', 'VERB', 'PRON', 'VERB', 'PART',...","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,"Up until that news hit the Committee , Don had...","['ADP', 'ADP', 'DET', 'NOUN', 'VERB', 'DET', '...","[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
3,Could go on to the rugby and go with them coul...,"['VERB', 'VERB', 'PART', 'ADP', 'DET', 'NOUN',...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,"Finally , we went to the office and they gave ...","['ADV', 'PUNCT', 'PRON', 'VERB', 'ADP', 'DET',...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ..."
5,It must be shown that the defendant intended (...,"['PRON', 'VERB', 'VERB', 'VERB', 'ADP', 'DET',...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
6,The Englishman nodded and poured himself more ...,"['DET', 'PROPN', 'VERB', 'CCONJ', 'VERB', 'PRO...","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
7,I demanded .,"['PRON', 'VERB', 'PUNCT']","[0, 0, 0]"
8,What is not known is information on the locati...,"['NOUN', 'VERB', 'ADV', 'VERB', 'VERB', 'NOUN'...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
9,That 's getting in the,"['DET', 'VERB', 'VERB', 'ADP', 'DET']","[1, 0, 0, 0, 0]"


# HMM Model

In [94]:
class HMM():
    """
    Collects the counts needed for a part-of-speech hidden Markov model
    from a training dataframe.

    The dataframe is read by column position: the first column holds the
    sentence as a whitespace-separated string and the second column holds
    the string form of a Python list of POS tags (one tag per word).

    Attributes set by the constructor:
    bigram_counts: dictionary of (previous_tag, tag):count pairings, counted
        within each row only (never across two rows)
    POS_counts: Counter of tag:count pairings
    lex_gen: dictionary of word:{tag: count} pairings
    training_df: the dataframe passed to the constructor
    smoothed_bigram_counts_df: the dataframe returned by
        get_smooth_bigram_corpus for the observed tags and bigram_counts
    """
    def __init__(self, training_df):
        """
        Counts tags, tag bigrams and word/tag pairings in [training_df] and
        builds the smoothed tag-bigram table.

        training_df: dataframe whose first two columns are the sentence
            string and the stringified list of POS tags
        """
        self.bigram_counts = {}
        self.POS_counts = Counter()

        self.lex_gen = {}

        self.training_df = training_df

        self.count_stuff()

        # list(...) hands the helper its own list of the distinct tags.
        self.smoothed_bigram_counts_df = get_smooth_bigram_corpus(list(self.POS_counts), self.bigram_counts)

    def count_stuff(self):
        """
        Walks every row of self.training_df and adds its counts to
        self.lex_gen, self.POS_counts and self.bigram_counts.

        Counts are accumulated, not reset, so calling this again adds the
        same rows a second time. Raises IndexError if a sentence has more
        words than its tag list has tags.
        """
        # Plain tuples give true positional access to the first two columns;
        # integer keys on a string-labelled row Series (as produced by
        # iterrows) are deprecated as positional lookups in pandas.
        for row in self.training_df.itertuples(index=False, name=None):
            sentence, pos_string = row[0], row[1]

            # The tag column stores the list as text, e.g. "['VERB', 'ADV']".
            POS_list = ast.literal_eval(pos_string)
            sentence_list = sentence.split()

            # Word/tag pairing counts; the i-th word goes with the i-th tag.
            for i, word in enumerate(sentence_list):
                tag = POS_list[i]
                tag_counts = self.lex_gen.setdefault(word, {})
                tag_counts[tag] = tag_counts.get(tag, 0) + 1

            # Updates list of POS counts across whole training set
            self.POS_counts.update(POS_list)

            # Merge this row's tag bigrams into the running totals.
            for bigram, count in get_bigram_corpus(POS_list).items():
                self.bigram_counts[bigram] = self.bigram_counts.get(bigram, 0) + count
  
    

In [97]:
# Build the model counts from the training frame, then display the
# word -> {tag: count} table to check it by eye.
hmm = HMM(train_df)
hmm.lex_gen

{'Ca': {'VERB': 8},
 "n't": {'ADV': 574},
 'fail': {'VERB': 2},
 'to': {'PART': 1667, 'ADP': 966},
 'be': {'VERB': 658},
 'entertaining': {'ADJ': 1},
 '.': {'PUNCT': 4923},
 'How': {'ADV': 31},
 'much': {'ADJ': 52, 'ADV': 53},
 'was': {'VERB': 878},
 'he': {'PRON': 593},
 'going': {'VERB': 115},
 'tell': {'VERB': 46},
 'her': {'PRON': 132, 'ADJ': 217},
 '?': {'PUNCT': 570},
 'Up': {'ADP': 3},
 'until': {'ADP': 26},
 'that': {'DET': 328, 'ADP': 597, 'ADJ': 179, 'ADV': 11},
 'news': {'NOUN': 8},
 'hit': {'VERB': 5},
 'the': {'DET': 5651},
 'Committee': {'PROPN': 14},
 ',': {'PUNCT': 5468},
 'Don': {'PROPN': 4},
 'had': {'VERB': 472},
 'won': {'VERB': 14},
 'day': {'NOUN': 55},
 'with': {'ADP': 658},
 'his': {'ADJ': 390, 'PRON': 1},
 'UK': {'PROPN': 43},
 'Vehicle': {'PROPN': 1},
 'Division': {'PROPN': 2},
 'proposals': {'NOUN': 5},
 'Could': {'VERB': 5},
 'go': {'VERB': 188, 'NOUN': 2},
 'on': {'PART': 79, 'ADP': 638, 'ADV': 32},
 'rugby': {'NOUN': 4},
 'and': {'CCONJ': 2535},
 'them': {

In [42]:
# Scratch example: a string containing the text of a Python list literal.
k = '["hello", "hi"]'

In [43]:
# Parse the list-literal string into an actual Python list.
y = ast.literal_eval(k)

In [44]:
# Display the parsed value.
y

['hello', 'hi']

In [91]:
# Scratch dictionary; NOTE(review): not referenced by any other visible cell —
# confirm whether it is still needed.
diction = {'hello' : 1, 'apple' : 2}