In [1]:
# Importing libraries
import nltk
import numpy as np
from sklearn.model_selection import train_test_split

# download the treebank corpus from nltk
# nltk.download('treebank')             
# nltk.download('universal_tagset')

# Load the treebank sentences as lists of (word, tag) pairs using the
# universal tagset, then hold out 5% of the sentences for testing.
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))
train_set, test_set = train_test_split(
    nltk_data,
    train_size=0.95,
    test_size=0.05,
    random_state=123,
)

# Flatten the sentence lists into flat lists of (word, tag) pairs.
train_tagged_words = [pair for sentence in train_set for pair in sentence]
test_tagged_words = [pair for sentence in test_set for pair in sentence]
test_words_without_tags = [pair[0] for pair in test_tagged_words]

# number of tagged words in training, then number of test words
print(len(train_tagged_words))
print(len(test_words_without_tags))

# Unique training words (order is whatever the set iteration yields).
training_words = [pair[0] for pair in train_tagged_words]
vocabulary = list(set(training_words))
print(f'The size of the vocabulary is: {len(vocabulary)}')

# Unique training tags plus an extra 'OOV' (out of vocabulary) label.
# NOTE: `tags` is rebound to a plain set of training tags (without 'OOV')
# further down in this cell, before the matrices are built.
training_tags = [pair[1] for pair in train_tagged_words]
tags = list(set(training_tags)) + ['OOV']
print(f'The possible tags are: {tags}')

##

def word_given_tag(word, tag, train_bag = train_tagged_words):
    """Count how often `tag` occurs and how often it labels `word`.

    Args:
        word: the word to look for (compared against element 0 of each pair).
        tag: the tag to look for (compared against element 1 of each pair).
        train_bag: iterable of (word, tag) pairs; defaults to the training pairs.

    Returns:
        A tuple (count_w_given_tag, count_tag): the number of pairs carrying
        `tag` whose word equals `word`, and the total number of pairs
        carrying `tag`. Dividing the first by the second gives the
        relative frequency of `word` among occurrences of `tag`.
    """
    count_tag = 0
    count_w_given_tag = 0
    for pair in train_bag:
        if pair[1] == tag:
            count_tag += 1
            if pair[0] == word:
                count_w_given_tag += 1
    return (count_w_given_tag, count_tag)

def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    """Count how often `t1` occurs and how often `t2` directly follows it.

    Args:
        t2: the following tag.
        t1: the preceding tag.
        train_bag: sequence of (word, tag) pairs; defaults to the training pairs.

    Returns:
        A tuple (count_t2_t1, count_t1): the number of adjacent positions
        where `t1` is immediately followed by `t2`, and the total number of
        occurrences of `t1`. Adjacency is taken over the flat pair sequence
        exactly as given.
    """
    tag_sequence = [pair[1] for pair in train_bag]
    count_t1 = sum(1 for label in tag_sequence if label == t1)
    # Pair every tag with its successor; the last tag has no successor.
    count_t2_t1 = sum(
        1
        for current_tag, next_tag in zip(tag_sequence, tag_sequence[1:])
        if current_tag == t1 and next_tag == t2
    )
    return (count_t2_t1, count_t1)

# Collect the unique tags seen in the training set.
# NOTE: this rebinds `tags` (earlier a list that also contained 'OOV') to a set.
tags = {tag for word,tag in train_tagged_words}

# Freeze one iteration order so column/row j refers to the same tag in both
# matrices.
tag_order = list(tags)

# Count everything in a single pass over the training pairs instead of
# rescanning the whole corpus for every (word, tag) cell.
tag_count = {}        # tag -> number of occurrences
word_tag_count = {}   # (word, tag) -> number of times word carries tag
for word, tag in train_tagged_words:
    tag_count[tag] = tag_count.get(tag, 0) + 1
    word_tag_count[(word, tag)] = word_tag_count.get((word, tag), 0) + 1

# (t1, t2) -> number of times t2 directly follows t1 in the flat tag sequence
tag_sequence = [pair[1] for pair in train_tagged_words]
bigram_count = {}
for bigram in zip(tag_sequence, tag_sequence[1:]):
    bigram_count[bigram] = bigram_count.get(bigram, 0) + 1


# make matrices
# Emission: emmision_matrix[i, j] = count(word_i tagged tag_j) / count(tag_j),
# i.e. the ratio of the two values returned by word_given_tag(word, tag).
# (The previous code called t2_given_t1(word, tag) here, which counts tag
# bigrams and therefore did not produce emission probabilities.)
emmision_matrix = np.zeros((len(vocabulary), len(tags)))
for i, word in enumerate(vocabulary):
    for j, tag in enumerate(tag_order):
        emmision_matrix[i, j] = word_tag_count.get((word, tag), 0) / tag_count[tag]

# Transition: tags_matrix[i, j] = count(t1_i followed by t2_j) / count(t1_i),
# i.e. the ratio of the two values returned by t2_given_t1(t2, t1).
tags_matrix = np.zeros((len(tags), len(tags)))
for i, t1 in enumerate(tag_order):
    for j, t2 in enumerate(tag_order):
        tags_matrix[i, j] = bigram_count.get((t1, t2), 0) / tag_count[t1]

print(emmision_matrix)

95656
5020
The size of the vocabulary is: 12102
The possible tags are: ['DET', 'PRON', 'CONJ', 'ADP', '.', 'VERB', 'NOUN', 'ADJ', 'PRT', 'ADV', 'X', 'NUM', 'OOV']


In [None]:
import math

em_matrix = emmision_matrix
trans_matrix = tags_matrix

# Map each vocabulary word to its row index once, so every test word is a
# dictionary lookup instead of two linear scans of the vocabulary list.
# setdefault keeps the first index of a word, matching list.index().
vocab_row = {}
for vocab_idx, vocab_word in enumerate(vocabulary):
    vocab_row.setdefault(vocab_word, vocab_idx)

# emission[row, :] holds the emission scores of test word `row` for every tag.
emission = np.zeros((len(test_words_without_tags), len(tags)))
for row in range(len(test_words_without_tags)):
  w = test_words_without_tags[row]
  if w in vocab_row:
    emission[row, :] = em_matrix[vocab_row[w], :]
  else:
    # Word never seen in training: fall back to a uniform score over the tags.
    emission[row, :] = 1 / len(tags)

transition = trans_matrix
n = len(test_words_without_tags)  # upper bound of the sum over i (word positions)
m = len(tags)                     # upper bound of the sum over j (tags)
print(n,m)
set_I  = range(0, n)       # all word positions
set_J  = range(0, m)       # all tag indices
set_I_mod = range(1,n)     # word positions that have a predecessor

In [None]:
# CONSTRAINTS
# <= constraints  : z_ijk =< z_ij                 for all i,j,k
# constraints = {i,j,k : 
# m.addConstr(
#         lhs=z[i,k]] * z[i-1,j],
#         sense=grb.GRB.LESS_EQUAL,
#         rhs= z[i,j], 
#         name="constraint_{0}".format(i))
#     for j in set_J for i in set_I for k in set_k}
# >= constraints  : z_ijk => z_i-1_j + z_ik -1    for all i,j,k
# constraints = {i,j,k : 
# m.addConstr(
#         lhs=z[i-1, j] * z[i, k]
#         sense=GREATER_EQUAL,
#         rhs=z[i-1, j] + z[i, k] - 1, 
#         name="constraint_{0}".format(i))
#     for i in set_I for j in set_J for K in set_J}
# == constraints  : sum_j z_ij = 1     for all i
# constraints = {i : 
# m.addConstr(
#         lhs=grb.quicksum(z[i,j] for j in set_J),
#         sense=grb.GRB.EQUAL,
#         rhs=1, 
#         name="constraint_{0}".format(i))
#     for i in set_I}

In [None]:
# cplex version
import docplex.mp.model as cpx
import cplex

m = cpx.Model(name="MIP Model")

# VARIABLES
# z[i, j]: one binary variable per (word position i, tag index j).
z = dict(
    ((pos, tag_idx), m.binary_var(name="z_{0}_{1}".format(pos, tag_idx)))
    for pos in set_I
    for tag_idx in set_J
)

# z3[i, j, k]: one binary variable per (word position i, tag index j, tag index k);
# the constraints below tie it to z[i-1, j] and z[i, k].
z3 = dict(
    (
        (pos, first_tag, second_tag),
        m.binary_var(name="z_{0}_{1}_{2}".format(pos, first_tag, second_tag)),
    )
    for pos in set_I
    for first_tag in set_J
    for second_tag in set_J
)


# CONSTRAINTS
# z3[i,j,k] linearizes the product z[i-1,j] * z[i,k]
# ("word i-1 has tag j AND word i has tag k"), which needs
#     z3 <= z[i-1,j],   z3 <= z[i,k],   z3 >= z[i-1,j] + z[i,k] - 1.
# The previous upper bound z3[i,j,k] <= z[i,j] was inconsistent with the lower
# bound: together with "exactly one tag per word" it made any change of tag
# between consecutive words infeasible.
# Position 0 has no predecessor, so its z3 variables stay unconstrained; they
# are not linked to any z[i-1, j].

# <= constraints  : z_ijk <= z_(i-1)j              for all i >= 1, j, k
prev_tag_constraints = {(i,j,k): m.add_constraint(
    ct= z3[i,j,k] <= z[i-1,j],
    ctname="constraint_prev_{0}_{1}_{2}".format(i,j,k))
       for i in set_I_mod for j in set_J for k in set_J}

# <= constraints  : z_ijk <= z_ik                  for all i >= 1, j, k
curr_tag_constraints = {(i,j,k): m.add_constraint(
    ct= z3[i,j,k] <= z[i,k],
    ctname="constraint_curr_{0}_{1}_{2}".format(i,j,k))
       for i in set_I_mod for j in set_J for k in set_J}

# >= constraints  : z_ijk >= z_(i-1)j + z_ik - 1   for all i >= 1, j, k
pair_constraints = {(i,j,k): m.add_constraint(
    ct= z3[i,j,k] >= z[i-1,j] + z[i,k] - 1,
    ctname="constraint_pair_{0}_{1}_{2}".format(i,j,k))
       for i in set_I_mod for j in set_J for k in set_J}

# == constraints  : sum_j z_ij = 1     for all i   (exactly one tag per word)
# `constraints` ends up bound to this last family, as it did before.
constraints = {i : m.add_constraint(
    ct=m.sum(z[i,j] for j in set_J) == 1,
    ctname="constraint_{0}".format(i))
       for i in set_I}


# OBJECTIVE       : sum_i sum_j log(emission[i,j]) * z_ij
#                 + sum_{i>=1} sum_jk log(transition[j,k]) * z_ijk
# The 1e-15 inside each log keeps log() defined when a probability is 0.
# The emission term covers every word position (set_I); it previously ran over
# set_I_mod and so dropped the first word's emission score.
sum_j = m.sum(math.log(emission[i,j]+1e-15) * z[i,j] for i in set_I for j in set_J)
# The transition term only exists for positions that have a predecessor.
sum_jk = m.sum((math.log(transition[j,k]+1e-15) * z3[i, j, k]) for i in set_I_mod for j in set_J for k in set_J)
objective = sum_j + sum_jk


# SOLVE
m.maximize(objective)
m.solve()


# SAVE
import pandas as pd

# One row per z variable, keyed by its (i, j) index.
opt_df = pd.DataFrame.from_dict(z, orient="index",
                                columns = ["variable_object"])
opt_df.index = pd.MultiIndex.from_tuples(opt_df.index, names=["column_i", "column_j"])
opt_df = opt_df.reset_index()
# Read the solved value out of each variable; the column previously just
# duplicated the variable objects themselves.
opt_df["solution_value"] = opt_df["variable_object"].apply(lambda var: var.solution_value)