In [11]:
import pandas as pd
from sklearn.metrics import accuracy_score
import collections
import json

## Task 1: Vocabulary Creation

In [12]:
traindf = pd.read_csv('data/train', sep="\t", header=None)
traindf.head()

Unnamed: 0,0,1,2
0,1,Pierre,NNP
1,2,Vinken,NNP
2,3,",",","
3,4,61,CD
4,5,years,NNS


In [13]:
# Storing train words, indices and labels separately
indices, words, labels = traindf[0], traindf[1], traindf[2]

In [14]:
# calculating word frequencies / occurences
wordFrequencies = collections.Counter()
for word in words:
  wordFrequencies[word] += 1

In [15]:
knownVocab = {}
numOfUnknowns = 0

for word in wordFrequencies:
  if wordFrequencies[word] >= 3:
    knownVocab[word] = wordFrequencies[word]
  else:
    numOfUnknowns += wordFrequencies[word]

In [16]:
sortedVocab = sorted(wordFrequencies.items(), key=lambda x: x[1], reverse=True)

In [17]:
with open('output/vocab.txt', 'w') as vocab:
  vocab.write("'<unk>'\t0\t{}".format(numOfUnknowns))
  for i in range(len(sortedVocab)):
    vocab.write("{}\t{}\t{}\n".format(sortedVocab[i][0], i+1, sortedVocab[i][1]))

In [18]:
print("The selected threshold for unknown words replacement is 3")
print("The total size of my vocabulary", len(sortedVocab))
print("The total occurrences of the special token ‘<unk>’ after replacement", numOfUnknowns)

The selected threshold for unknown words replacement is 3
The total size of my vocabulary 43193
The total occurrences of the special token ‘<unk>’ after replacement 32537


## Task 2: Model Learning

In [19]:
knownWords = list(knownVocab.keys())

words = words.apply(lambda x: x if x in knownWords else '<unk>')

In [20]:
sTos, sTox = {}, {}
startTag, unkTag = '<s>', '<unk>'

# calculating counts of transitions one pos to next pos
for i in range(len(words)-1):
  if indices[i] < indices[i+1]:
    if (labels[i], labels[i+1]) in sTos:
      sTos[(labels[i], labels[i+1])] += 1
    else:
      sTos[(labels[i], labels[i+1])] = 1

for i in range(len(words)):
  if indices[i] == 1:
    if (startTag, labels[i]) in sTos:
      sTos[(startTag, labels[i])] += 1
    else:
      sTos[(startTag, labels[i])] = 1

In [21]:
# calculating counts of emissions i.e word | pos
for i in range(len(words)-1):
  if indices[i] < indices[i+1]:
    if (labels[i], words[i]) in sTox:
      sTox[(labels[i], words[i])] += 1
    else:
      sTox[(labels[i], words[i])] = 1

for i in range(len(words)-1):
  if words[i] == "<unk>":
    if (labels[i], "<unk>") in sTox:
      sTox[(labels[i], "<unk>")] += 1
    else:
      sTox[(labels[i], "<unk>")] = 1

In [22]:
# calculating the frequencies of labels i.e pos tags
labelFrequencies = collections.Counter()
for label in labels:
  labelFrequencies[label] += 1

labelFrequencies[startTag] = 0
for i in indices:
  if i == 1:
    labelFrequencies[startTag] += 1

In [23]:
# calculating the emissions and transitions probability
emissions, transitions = {}, {}

for key, value in sTos.items():
  transitions[str(key)] = value / labelFrequencies[key[0]]

for key, value in sTox.items():
  emissions[str(key)] = value / labelFrequencies[key[0]]

In [24]:
# creating hmm.json and storing emission and transition parameters
hmmJson = {"transition":transitions, "emission":emissions}
json_object = json.dumps(hmmJson, indent=2)
with open('output/hmm.json', 'w') as op:
    op.write(json_object)

In [25]:
print("The number of transition parameters are", len(transitions))
print("The number of emission parameters are", len(emissions))

The number of transition parameters are 1392
The number of emission parameters are 23350


## Task 3: Greedy Decoding with HMM

In [26]:
disticntLabels = list(labelFrequencies.keys())

In [27]:
# reading the emission and transition params that we stored in hmm.json
with open('output/hmm.json', 'r') as hmm:
  jsonData = json.load(hmm)

emissionParams, transitionParams = jsonData["emission"], jsonData["transition"]

In [28]:
# reading dev data
devData = pd.read_csv('data/dev', sep='\t', header=None)
devData.head()

Unnamed: 0,0,1,2
0,1,The,DT
1,2,Arizona,NNP
2,3,Corporations,NNP
3,4,Commission,NNP
4,5,authorized,VBD


In [29]:
# Storing dev words, indices and labels separately
devIndices, devWords, devLabels = devData[0], devData[1], devData[2]

In [30]:
listOfDevSentences, listofDevLabelsSentences = [], []
wordSample, labelSample = [], []
for i in range(len(devData)-1):
  if devIndices[i] < devIndices[i+1]:
    wordSample.append(devWords[i])
    labelSample.append(devLabels[i])
  else:
    wordSample.append(devWords[i])
    listOfDevSentences.append(wordSample)
    wordSample = []

    labelSample.append(devLabels[i])
    listofDevLabelsSentences.append(labelSample)
    labelSample = []

In [31]:
def greedyDecoding(sentence):
  predLabels = []
    
  if sentence[0] not in knownWords:
    sentence[0] = unkTag
  
  maxProb = 0
  state0 = 'UNK'
    
  for label in labelFrequencies:
    try:
      probab = emissionParams[str((label, sentence[0]))] * transitionParams[str((startTag, label))]
      if probab > maxProb:
        maxProb = probab
        state0 = label
    except:
      pass
      
  predLabels.append(state0)
  
  # predicting the labels of the remaining words
  
  for i in range(1, len(sentence)):
    if sentence[i] not in knownWords:
      sentence[i] = unkTag
      
    maxProb = 0
    state = 'UNK'
    
    for label in labelFrequencies:
      try:
        probab = emissionParams[str((label, sentence[i]))] * transitionParams[str((predLabels[-1], label))]
        if probab > maxProb:
          maxProb = probab
          state = label
      except:
        pass
          
    predLabels.append(state)
  
  return predLabels

In [32]:
# applying greedy decoding algorithm to devData
lablesGreedy = [greedyDecoding(sent) for sent in listOfDevSentences]

In [33]:
# creating a single list of all the predicted labels i.e. pos tags using greedy decoding
predGreedyLabels = []
for i in range(len(lablesGreedy)):
  for label in lablesGreedy[i]:
    predGreedyLabels.append(label)

In [34]:
# creating a single list of all the dev labels i.e. pos tags
actualDevLabels = []
for i in range(len(listofDevLabelsSentences)):
  for label in listofDevLabelsSentences[i]:
    actualDevLabels.append(label)

In [35]:
# calculating accuracy on the dev data using Greedy Decoding
print('The prediction accuracy on the dev data using Greedy Decoding is {:.2f}%'.format(accuracy_score(actualDevLabels, predGreedyLabels) * 100))

The prediction accuracy on the dev data using Greedy Decoding is 92.67%


In [36]:
# creating file containing greedy labels (POS tags)
with open('output/predGreedy', 'w') as file:
  for i in range(len(predGreedyLabels)):
    if devIndices[i] == 1 and i > 35:
      file.write("\n")
    file.write("{}\t{}\t{}\n".format(devIndices[i], devWords[i], predGreedyLabels[i]))

In [37]:
! python eval.py -p "output/predGreedy" -g "data/dev"

output/predGreedy
data/dev
'' '38\t.\t.' 131751
total: 131751, correct: 122099, accuracy: 92.67%


## Task 4: Viterbi Decoding with HMM

In [38]:
def viterbiDecoding(sentence):
  seq = {i:{} for i in range(len(sentence))}
  initialpos = {i:{} for i in range(len(sentence))}    
  
  if sentence[0] not in knownWords:
    sentence[0] = unkTag
      
  for label in labelFrequencies:
    if str((startTag, label)) in transitionParams:
      try:
        seq[0][label] = transitionParams[str(((startTag, label)))] * emissionParams[str((label, sentence[0]))]
      except:
        seq[0][label] = 0
  
  for state in seq[0].keys():
    initialpos[0][state] = '<s>'
  
  for i in range(1, len(sentence)):
    # still, check if the word is in the vocabulary
    if sentence[i] not in knownWords:
      sentence[i] = unkTag
          
    for pos in seq[i-1].keys():
      for label in labelFrequencies:
        if str((pos, label)) in transitionParams:
          if label in seq[i]:
            try:
              temp = seq[i-1][pos] *  transitionParams[str((pos, label))] * emissionParams[str((label, sentence[i]))]
              if  temp > seq[i][label]:
                seq[i][label] = temp
                initialpos[i][label] = pos
            except:
              pass
          else:
            try:
              seq[i][label] = seq[i-1][pos] *  transitionParams[str((pos, label))] * emissionParams[str((label, sentence[i]))]
              initialpos[i][label] = pos
            except:
              seq[i][label] = 0
  
  predLabels = []
  
  maxProbab = max(seq[len(sentence)-1].values())
  maxIndex = list(seq[len(sentence)-1].values()).index(maxProbab)
  maxState = list(seq[len(sentence)-1].keys())[maxIndex]
  predLabels.append(maxState)
  
  for i in range(len(sentence)-1, 0, -1):
    try:
      maxState = initialpos[i][maxState]
      predLabels.append(maxState)
    except:
      predLabels.append('UNK')
      
  predLabels = [predLabels[i] for i in range(len(predLabels)-1, -1, -1)]
  return predLabels

In [39]:
labelsViterbi = [viterbiDecoding(sent) for sent in listOfDevSentences]

In [40]:
predViterbiLabels = []
for i in range(len(labelsViterbi)):
  for label in labelsViterbi[i]:
    predViterbiLabels.append(label)

In [41]:
actualDevLabels = []
for i in range(len(listofDevLabelsSentences)):
  for label in listofDevLabelsSentences[i]:
    actualDevLabels.append(label)

In [42]:
# calculating accuracy on the dev data using viterbi Decoding
print('The prediction accuracy on the dev data using viterbi Decoding is {:.2f}%'.format(accuracy_score(actualDevLabels, predViterbiLabels) * 100))

The prediction accuracy on the dev data using viterbi Decoding is 94.36%


In [43]:
# creating file containing Viterbi labels (POS tags)
with open('output/predViterbi', 'w') as file:
  for i in range(len(predViterbiLabels)):
    if devIndices[i] == 1 and i > 35:
      file.write("\n")
    file.write("{}\t{}\t{}\n".format(devIndices[i], devWords[i], predViterbiLabels[i]))

In [44]:
! python eval.py -p "output/predViterbi" -g "data/dev"

output/predViterbi
data/dev
'' '38\t.\t.' 131751
total: 131751, correct: 124322, accuracy: 94.36%


Creating greedy.out and viterbi.out using the Test Data

In [45]:
# reading test data
testData = pd.read_csv('data/test', sep='\t', header=None)
testData.head()

Unnamed: 0,0,1
0,1,Influential
1,2,members
2,3,of
3,4,the
4,5,House


In [47]:
# Storing test words, indices and labels separately
testIndices, testWords = testData[0], testData[1]

In [48]:
listOfTestSentences, wordSampleTest = [], []
for i in range(len(testData)-1):
  if devIndices[i] < testIndices[i+1]:
    wordSampleTest.append(testWords[i])
  else:
    wordSampleTest.append(testWords[i])
    listOfTestSentences.append(wordSampleTest)
    wordSampleTest = []

In [49]:
# applying greedy decoding algorithm to test data 
lablesGreedyTest = [greedyDecoding(sent) for sent in listOfTestSentences]

In [50]:
# applying viterbi decoding algorithm to test data 
lablesViterbiTest = [viterbiDecoding(sent) for sent in listOfTestSentences]

In [51]:
# creating a single list of all the predicted labels i.e. pos tags using greedy 
predGreedyLabelsTest = []
for i in range(len(lablesGreedyTest)):
  for label in lablesGreedyTest[i]:
    predGreedyLabelsTest.append(label)

In [52]:
# creating file greedy.out containing greedy labels (POS tags) for test data
with open('output/greedy.out', 'w') as file:
  for i in range(len(predGreedyLabelsTest)):
    if testIndices[i] == 1 and i > 35:
      file.write("\n")
    file.write("{}\t{}\t{}\n".format(testIndices[i], testWords[i], predGreedyLabelsTest[i]))

In [53]:
# creating a single list of all the predicted labels i.e. pos tags using viterbi 
predViterbiLabelsTest = []
for i in range(len(lablesViterbiTest)):
  for label in lablesViterbiTest[i]:
    predViterbiLabelsTest.append(label)

In [54]:
# creating file viterbi.out containing viterbi labels (POS tags) for test data
with open('output/viterbi.out', 'w') as file:
  for i in range(len(predViterbiLabelsTest)):
    if testIndices[i] == 1 and i > 35:
      file.write("\n")
    file.write("{}\t{}\t{}\n".format(testIndices[i], testWords[i], predViterbiLabelsTest[i]))