In [41]:
import pandas as pd
import ast

In [11]:
def get_bigram_corpus(wordlist):
    """
    Returns the bigram corpus given by the input [wordlist]
    
    get_bigram_corpus creates a dictionary with bigrams from the [wordlist]
    as keys and counts the instances of each bigram to assign values (except
    for the bigram ('.', '<s>') which represents the start of one review and
    the end of another)
    
    wordlist: a list of words (strings)
    """
    corpus = {}
    for i, word in enumerate(wordlist[1:], start=1):
        if word != '<s>':
            if (wordlist[i-1], word) not in corpus:
                corpus[(wordlist[i-1], word)] = 1
            else:
                corpus[(wordlist[i-1], word)] += 1
    return corpus

{('Hello', 'my'): 1, ('my', 'name'): 1, ('name', 'is'): 1}

In [49]:
train_df = pd.read_csv('./data_release/train.csv', encoding='cp1252')

In [31]:

for x in train_df.iterrows():
    print(x[1][0])
    break

Ca n't fail to be entertaining .


# HMM Model

In [47]:
class HMM():
    def __init__(self, training_df):
        self.bigram_counts = {}
        self.training_df = training_df
        
        self.count_bigrams()
    
    def count_bigrams(self):
        for row in self.training_df.iterrows():
            sentence_bigram_counts = get_bigram_corpus(ast.literal_eval(row[1][1]))
            for bigram in sentence_bigram_counts: 
                if bigram in self.bigram_counts:
                    self.bigram_counts[bigram] += sentence_bigram_counts[bigram]
                else:
                    self.bigram_counts[bigram] = sentence_bigram_counts[bigram]
  
        return self.bigram_counts
            

In [48]:
hmm = HMM(train_df)
hmm.count_bigrams()

{('VERB', 'ADV'): 5582,
 ('ADV', 'VERB'): 4260,
 ('VERB', 'PART'): 2752,
 ('PART', 'VERB'): 3470,
 ('VERB', 'ADJ'): 3158,
 ('ADJ', 'PUNCT'): 1456,
 ('ADV', 'ADJ'): 1666,
 ('ADJ', 'VERB'): 1058,
 ('VERB', 'PRON'): 3306,
 ('PRON', 'VERB'): 10176,
 ('PRON', 'PUNCT'): 1376,
 ('ADP', 'ADP'): 584,
 ('ADP', 'DET'): 9848,
 ('DET', 'NOUN'): 12206,
 ('NOUN', 'VERB'): 6582,
 ('VERB', 'DET'): 5424,
 ('DET', 'PROPN'): 1714,
 ('PROPN', 'PUNCT'): 2810,
 ('PUNCT', 'PROPN'): 1070,
 ('PROPN', 'VERB'): 2002,
 ('VERB', 'VERB'): 7364,
 ('NOUN', 'ADP'): 11322,
 ('ADP', 'ADJ'): 3906,
 ('ADJ', 'PROPN'): 346,
 ('PROPN', 'PROPN'): 2966,
 ('PROPN', 'NOUN'): 662,
 ('NOUN', 'PUNCT'): 10858,
 ('PART', 'ADP'): 406,
 ('NOUN', 'CCONJ'): 2496,
 ('CCONJ', 'VERB'): 1300,
 ('VERB', 'ADP'): 6132,
 ('ADP', 'PRON'): 2126,
 ('ADV', 'PRON'): 1000,
 ('ADV', 'PUNCT'): 2188,
 ('PUNCT', 'PRON'): 1610,
 ('CCONJ', 'PRON'): 870,
 ('PRON', 'DET'): 272,
 ('PUNCT', 'ADJ'): 1028,
 ('VERB', 'PUNCT'): 3284,
 ('PUNCT', 'ADV'): 1138,
 ('PUNC

In [42]:
k = '["hello", "hi"]'

In [43]:
y = ast.literal_eval(k)

In [44]:
y

['hello', 'hi']