# Setting up the enviroment

In [1]:
!pip install conllu
!pip install transformers

Collecting conllu
  Downloading conllu-6.0.0-py3-none-any.whl.metadata (21 kB)
Downloading conllu-6.0.0-py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-6.0.0


In [2]:
from conllu import parse
import numpy as np
from collections import Counter
from random import shuffle,randint
from google.colab import drive
from scipy.optimize import linear_sum_assignment
from sklearn.metrics.cluster import homogeneity_score, completeness_score, v_measure_score
from sklearn.cluster import KMeans, MiniBatchKMeans
from transformers import AutoTokenizer, BertModel
import torch

In [3]:
# Connecting with my google drive to save and load trained weights, embeddings, and kmeans results.
drive.mount('/content/drive')
uposPath="/content/drive/MyDrive/L390.3/uposHMM/"
xposPath="/content/drive/MyDrive/L390.3/xposHMM/"

Mounted at /content/drive


# Preprocessing Dataset

In [4]:
# Read the file content
with open("/content/drive/MyDrive/L390.3/ptb-train.conllu", "r", encoding="utf-8") as file:
    data = file.read()

# Parse the file content
sentences = parse(data)

In [5]:
upos=set() # all unique universial part of speech tags
xpos=set() # all unique language specific part of speech tags
vocab=set() # all unique words
uposCorpus=[] # List of sentences where each sentence is composed of words and their universial part of speech tags
xposCorpus=[] # List of sentences where each sentence is composed of words and their language specific part of speech tags
allUnlabeldSentences = []
num_words=0
longestSentenceLength = 0
shortestSentenceLength = float("inf")
for s in sentences:
  longestSentenceLength=max(longestSentenceLength,len(s))
  shortestSentenceLength = min(shortestSentenceLength,len(s))
  num_words+=len(s)
  uposSentence=[]
  xposSentence=[]
  unlabeledSentence=[]
  for token in s:
    word=token["form"]
    uposTag=token["upos"]
    xposTag=token["xpos"]
    upos.add(uposTag)
    xpos.add(xposTag)
    vocab.add(word)
    unlabeledSentence.append(word)
    uposSentence.append({"word":word,"tag":uposTag})
    xposSentence.append({"word":word,"tag":xposTag})
  allUnlabeldSentences.append(unlabeledSentence)
  uposCorpus.append(uposSentence)
  xposCorpus.append(xposSentence)
allUnlabeldSentences.sort()
uposCorpus.sort(key=lambda a: [word["word"] for word in a])
xposCorpus.sort(key=lambda a: [word["word"] for word in a])
upos=sorted(upos)
xpos=sorted(xpos)
vocab=sorted(vocab)
wordToIndex = {vocab[i] : i for i in range(len(vocab))}
uposTagToIndex = {upos[i] : i for i in range(len(upos))}
xposTagToIndex = {xpos[i] : i for i in range(len(xpos))}
print(f"Number of sentences in the dataset = {len(sentences)}")
print(f"Total number of words in the sentences = {num_words}")
print(f"Number of unique words = {len(vocab)}")
print(f"Sentence average length = {num_words/len(sentences)}")
print(f"Length of the shortest sentence = {shortestSentenceLength}")
print(f"Length of the longest sentence = {longestSentenceLength}")
print(f"Number of unique universial part of speech tags = {len(upos)}")
print(f"Number of unique language-specific part of speech tags = {len(xpos)}")
print(f"How does an unlabelled sentence look like {allUnlabeldSentences[0]}")
print(f"How does a sentence look like in the processed Corpus {uposCorpus[0]}")

Number of sentences in the dataset = 39832
Total number of words in the sentences = 950028
Number of unique words = 44389
Sentence average length = 23.850873669411527
Length of the shortest sentence = 1
Length of the longest sentence = 141
Number of unique universial part of speech tags = 17
Number of unique language-specific part of speech tags = 45
How does an unlabelled sentence look like ['#', '200', 'million', 'of', 'undated', 'variable-rate', 'notes', 'priced', 'at', 'par', 'via', 'Merill', 'Lynch', 'International', 'Ltd', '.']
How does a sentence look like in the processed Corpus [{'word': '#', 'tag': 'SYM'}, {'word': '200', 'tag': 'NUM'}, {'word': 'million', 'tag': 'NUM'}, {'word': 'of', 'tag': 'ADP'}, {'word': 'undated', 'tag': 'ADJ'}, {'word': 'variable-rate', 'tag': 'ADJ'}, {'word': 'notes', 'tag': 'NOUN'}, {'word': 'priced', 'tag': 'VERB'}, {'word': 'at', 'tag': 'ADP'}, {'word': 'par', 'tag': 'ADJ'}, {'word': 'via', 'tag': 'ADP'}, {'word': 'Merill', 'tag': 'PROPN'}, {'word'

In [6]:
# Creating a smaller dataset to test the model on and estimate the best hyperparmeters
# The code on this cell was only used on the development process. For the final models I used the whole datasets.
testCorpus=uposCorpus[:10000]
testTags=set()
testVocab=set()
for s in testCorpus:
  for word in s:
    testTags.add(word["tag"])
    testVocab.add(word["word"])
testTags=list(testTags)
testVocab=list(testVocab)

testUnlabeldSentences=[]
for sentenceWithTag in testCorpus:
  s=[word["word"] for word in sentenceWithTag]
  testUnlabeldSentences.append(s)

print(f"number of sentences in the testCorpus = {len(testCorpus)}")
print(f"number of unique tags in the testCorpus = {len(testTags)}")
print(f"number of unique words in the testCorpus = {len(testVocab)}")

number of sentences in the testCorpus = 10000
number of unique tags in the testCorpus = 17
number of unique words in the testCorpus = 22061


# HMM Models

## I trained 4 HMM models.

### 1-HMM model trained on the universal part of speech corpus using a supervised approch.
### 2-HMM model trained on the language-specific part of speech corpus using a supervised approch.
### 3-HMM model trained on the universal part of speech corpus using an unsupervised approach (Forward Backward algorithim).
### 4-HMM model trained on the language-specific part of speech corpus using an unsupervised approach (Forward Backward algorithim).

In [7]:
SMALLEST_REPRESENTABLE_FLOAT=np.finfo(np.float64).tiny

class HMM:
  # all probabilities are in log space to avoid underflow
  def __init__(self,hiddenStates,startFromScratch=True):
    self.hiddenStates = hiddenStates
    self.stateToIndex = {self.hiddenStates[i] : i for i in range(len(self.hiddenStates))}
    if startFromScratch:
      self.__initialize_startProbability_randomly()
      self.__initialize_transitionMatrix_randomly()
      self.__initialize_emissionMatrix_randomly()
    else:
      # Load the previously trained parameters
      path = uposPath if len(self.hiddenStates)==17 else xposPath
      self.startProbability = np.load(path + "startProbability.npy")
      self.transitionMatrix = np.load(path + "transitionMatrix.npy")
      self.emissionMatrix = np.load(path + "emissionMatrix.npy")

  def __logSpaceAdd(self,numpyArray,axis=None):
    # I am using the log-sum-exp trick
    if axis==None:
      if numpyArray.shape[0]==1:
        axis=1
      elif numpyArray.shape[1]==1:
        axis=0
      else:
        raise Exception("Axis was not specified")
    maxValue=np.max(numpyArray,axis=axis,keepdims=True)
    temp = np.log(np.sum(np.exp(numpyArray - maxValue),axis=axis,keepdims=True))
    ans=maxValue + temp
    if ans.shape==(1,1):
      ans=ans[0][0]
    return ans

  def __logSpaceAdd2Columns(self,col1,col2):
    ans=np.column_stack((col1,col2))
    return self.__logSpaceAdd(ans,axis=1)

  def __logSpaceAdd2Matricies(self,mat1,mat2):
    ans=self.__logSpaceAdd(np.stack((mat1,mat2),axis=0),axis=0)
    return ans.reshape(ans.shape[1:])


  def __initialize_startProbability_randomly(self):
    self.startProbability = np.random.rand(len(self.hiddenStates),1)+1e-10
    self.startProbability/=np.sum(self.startProbability)
    self.startProbability=np.log(self.startProbability) # covert to log space
    return

  def __initialize_transitionMatrix_randomly(self):
    self.transitionMatrix =  np.random.rand(len(self.hiddenStates),len(self.hiddenStates)) + 1e-10
    for i in range(len(self.hiddenStates)):
      self.transitionMatrix[i]/=np.sum(self.transitionMatrix[i])
    self.transitionMatrix=np.log(self.transitionMatrix) # covert to log space
    return

  def __initialize_emissionMatrix_randomly(self):
    self.emissionMatrix = np.random.rand(len(self.hiddenStates),len(vocab)) + 1e-10
    for i in range(len(self.hiddenStates)):
      self.emissionMatrix[i]/=np.sum(self.emissionMatrix[i])
    self.emissionMatrix=np.log(self.emissionMatrix) # covert to log space
    return

  def viterbiAlgorithm(self,sentence):
    # multiplication is addition in log space
    numTimeSteps,numHiddenStates=len(sentence),len(self.hiddenStates)
    viterbi=np.zeros((numHiddenStates,numTimeSteps))
    backPointer=np.full((numHiddenStates,numTimeSteps),-1)
    firstWordIndex=wordToIndex[sentence[0]]
    viterbi[:,0:1] = self.startProbability + self.emissionMatrix[:,firstWordIndex:firstWordIndex+1]
    for timeStep in range(1,numTimeSteps):
      wordIndex=wordToIndex[sentence[timeStep]]
      viterbi[:,timeStep:timeStep+1] = np.max(viterbi[:,timeStep-1:timeStep] + self.transitionMatrix, axis=0, keepdims=True).T + self.emissionMatrix[:,wordIndex:wordIndex+1]
      backPointer[:,timeStep:timeStep+1] = np.argmax(viterbi[:,timeStep-1:timeStep] + self.transitionMatrix, axis=0, keepdims=True).T

    maxProbability , lastState = np.max(viterbi[:,numTimeSteps-1:numTimeSteps]),np.argmax(viterbi[:,numTimeSteps-1:numTimeSteps])
    mostProbableSequence,currentState,currentTimeStep = [], lastState,numTimeSteps-1
    while currentState != -1:
      mostProbableSequence.append(currentState) # we are appendeing the index of the state
      currentState=backPointer[currentState][currentTimeStep]
      currentTimeStep-=1
    mostProbableSequence.reverse()
    return mostProbableSequence

  def forwardAlgorithm(self,sentence):
    # multiplication is addition in log space
    # for addition in log space I am using the log-sum-exp trick
    numTimeSteps,numHiddenStates=len(sentence),len(self.hiddenStates)
    forward=np.zeros((numHiddenStates,numTimeSteps)) # forward[i][t] the joint probability to see word(1),word(2),..word(t) and state(t) = i.
    firstWordIndex=wordToIndex[sentence[0]]
    forward[:,0:1] = self.startProbability + self.emissionMatrix[: ,firstWordIndex:firstWordIndex+1]
    for timeStep in range(1,numTimeSteps):
      wordIndex=wordToIndex[sentence[timeStep]]
      forward[:,timeStep:timeStep+1] = self.__logSpaceAdd(forward[:,timeStep-1:timeStep] + self.transitionMatrix, axis=0).T + self.emissionMatrix[:,wordIndex:wordIndex+1]
    forwardProbability=self.__logSpaceAdd(forward[:,numTimeSteps-1:numTimeSteps])
    return forwardProbability,forward

  def backwardAlgorithm(self,sentence):
    # multiplication is addition in log space
    # for addition in log space I am using the log-sum-exp trick
    numTimeSteps,numHiddenStates=len(sentence),len(self.hiddenStates)
    backward=np.zeros((numHiddenStates,numTimeSteps)) # backward[i][t] the joint probability to see word(t+1),word(t+2),..word(numTimeSteps-1) and state(t) = i.
    backward[:,numTimeSteps-1:numTimeSteps]=np.zeros((numHiddenStates,1)) # log(1) = 0; these values should be 1 but we are opreating in log space
    for timeStep in range(numTimeSteps-2,-1,-1):
      wordIndex=wordToIndex[sentence[timeStep+1]]
      backward[:,timeStep:timeStep+1] = self.__logSpaceAdd((backward[:,timeStep+1:timeStep+2] + self.emissionMatrix[:,wordIndex:wordIndex+1]).T + self.transitionMatrix, axis=1)
    firstWordIndex=wordToIndex[sentence[0]]
    backwardProbability= self.__logSpaceAdd(self.startProbability+self.emissionMatrix[:,firstWordIndex:firstWordIndex+1]+backward[:,0:1])
    return backwardProbability,backward

  def forwardBackwardAlgorithm(self, batch):
    # multiplication is addition in log space
    # for addition in log space I am using the log-sum-exp trick

    # estimation step
    numHiddenStates,vocabSize=len(self.hiddenStates), len(vocab)
    zeroInLogSpace = np.log(SMALLEST_REPRESENTABLE_FLOAT)
    estimatedStateStartCount = np.full((numHiddenStates,1),zeroInLogSpace)
    estimatedStateTransitionCount = np.full((numHiddenStates,numHiddenStates),zeroInLogSpace)
    estimatedStateEmissionCount = np.full((numHiddenStates,vocabSize),zeroInLogSpace)

    for sentence in batch:
      numTimeSteps=len(sentence)
      sentenceProbability,forward=self.forwardAlgorithm(sentence)
      sentenceProbability,backward=self.backwardAlgorithm(sentence)


      # estimating the state start count
      firstWordIndex=wordToIndex[sentence[0]]
      current_estimated_count = (self.startProbability + self.emissionMatrix[:, firstWordIndex:firstWordIndex+1] + backward[:, 0:1]) - sentenceProbability
      estimatedStateStartCount = self.__logSpaceAdd2Columns(estimatedStateStartCount,current_estimated_count)

      for timeStep in range(numTimeSteps-1):
        # estimating the state transitions count
        wordIndex=wordToIndex[sentence[timeStep+1]]
        current_estimated_count = (((forward[:,timeStep:timeStep+1] + self.transitionMatrix) + self.emissionMatrix[:,wordIndex:wordIndex+1].T) + backward[:,timeStep+1:timeStep+2].T) - sentenceProbability
        estimatedStateTransitionCount = self.__logSpaceAdd2Matricies(estimatedStateTransitionCount,current_estimated_count)

      for timeStep in range(numTimeSteps):
        # estimating the state emission count
        wordIndex = wordToIndex[sentence[timeStep]]
        current_estimated_count = forward[:,timeStep:timeStep+1] + backward[:,timeStep:timeStep+1] - sentenceProbability
        estimatedStateEmissionCount[:,wordIndex:wordIndex+1] = self.__logSpaceAdd2Columns(estimatedStateEmissionCount[:,wordIndex:wordIndex+1],current_estimated_count)

    # maxmization step

    # calculating startprobability matrix
    self.startProbability=estimatedStateStartCount-self.__logSpaceAdd(estimatedStateStartCount)
    # calculating transition matrix
    self.transitionMatrix = estimatedStateTransitionCount-self.__logSpaceAdd(estimatedStateTransitionCount,axis=1)
    # calculating emission matrix
    self.emissionMatrix = estimatedStateEmissionCount-self.__logSpaceAdd(estimatedStateEmissionCount,axis=1)
    return

  def unsupervisedTraining(self,numEpochs,batchSize,sentences):
    for epoch in range(numEpochs):
      for i in range(0,len(sentences),batchSize):
        currentBatch=sentences[i:min(i+batchSize,len(sentences))]
        self.forwardBackwardAlgorithm(currentBatch)
      print(f"Finished Epoch Number {epoch+1}")
      # save the model parameters after each epoch
      path = uposPath if len(self.hiddenStates)==17 else xposPath
      np.save(path + "startProbability.npy",self.startProbability)
      np.save(path + "transitionMatrix.npy",self.transitionMatrix)
      np.save(path + "emissionMatrix.npy",self.emissionMatrix)
    return

  def supervisedTraining(self,corpus):
    numHiddenStates , vocabSize= len(self.hiddenStates) , len(vocab)
    state_i_to_state_j_cnt = [[0] * numHiddenStates for _ in range(numHiddenStates)]
    state_start_cnt = [0] * numHiddenStates
    state_to_word_cnt = [[0] * vocabSize for _ in range(numHiddenStates)]

    for sentence in corpus:
      startStateIndex=self.stateToIndex[sentence[0]["tag"]]
      startWordIndex=wordToIndex[sentence[0]["word"]]
      state_start_cnt[startStateIndex] += 1
      state_to_word_cnt[startStateIndex][startWordIndex] += 1
      for i in range(1,len(sentence)):
        previousStateIndex=self.stateToIndex[sentence[i-1]["tag"]]
        currentSateIndex=self.stateToIndex[sentence[i]["tag"]]
        currentWordIndex=wordToIndex[sentence[i]["word"]]
        state_i_to_state_j_cnt[previousStateIndex][currentSateIndex] += 1
        state_to_word_cnt[currentSateIndex][currentWordIndex] +=1

    # esitmating transition matrix
    for state1 in range(numHiddenStates):
      state1_cnt=sum(state_i_to_state_j_cnt[state1])
      for state2 in range(numHiddenStates):
        self.transitionMatrix[state1][state2] = state_i_to_state_j_cnt[state1][state2]/state1_cnt
        if self.transitionMatrix[state1][state2]==0:
          self.transitionMatrix[state1][state2]=SMALLEST_REPRESENTABLE_FLOAT

    # estimating start probability
    num_starts=sum(state_start_cnt)
    for state in range(numHiddenStates):
      self.startProbability[state]=state_start_cnt[state]/num_starts
      if self.startProbability[state]==0:
        self.startProbability[state]=SMALLEST_REPRESENTABLE_FLOAT

    # estimating emission matrix
    for state in range(numHiddenStates):
      num_occurances=sum(state_to_word_cnt[state])
      for word in range(vocabSize):
        self.emissionMatrix[state][word]= state_to_word_cnt[state][word]/num_occurances
        if self.emissionMatrix[state][word]==0:
          self.emissionMatrix[state][word]=SMALLEST_REPRESENTABLE_FLOAT

    # convert to log space
    self.startProbability=np.log(self.startProbability)
    self.transitionMatrix=np.log(self.transitionMatrix)
    self.emissionMatrix=np.log(self.emissionMatrix)
    return

In [8]:
# Model 1
supervisedUposHmm=HMM(upos)
supervisedUposHmm.supervisedTraining(uposCorpus)

In [9]:
# Model 2
supervisedXposHmm=HMM(xpos)
supervisedXposHmm.supervisedTraining(xposCorpus)

In [10]:
# Model 3
numEpochs, batchSize=20, 512
unsupervisedUposHmm=HMM(upos)
unsupervisedUposHmm.unsupervisedTraining(numEpochs,batchSize,allUnlabeldSentences)
unsupervisedUposHmm.unsupervisedTraining(numEpochs,2*batchSize,allUnlabeldSentences)
unsupervisedUposHmm.unsupervisedTraining(numEpochs,2*2*batchSize,allUnlabeldSentences)


In [11]:
# Model 4
numEpochs, batchSize= 20, 512
unsupervisedXposHmm=HMM(xpos)
unsupervisedXposHmm.unsupervisedTraining(numEpochs,batchSize,allUnlabeldSentences)
unsupervisedXposHmm.unsupervisedTraining(numEpochs,2*batchSize,allUnlabeldSentences)
unsupervisedXposHmm.unsupervisedTraining(numEpochs,2*2*batchSize,allUnlabeldSentences)

# K-Means Clustring with BERT Embeddings

## 1- K-Means on the Universal part of speech tags corpus

## 2- K-Means on the Language specific part of speech tags corpus

In [None]:
# Defining BERT Model
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = BertModel.from_pretrained("google-bert/bert-base-uncased")
model.eval()

In [None]:
# Device configuration (GPU/CPU)
# GPU is recommended to get the embeddings (5 min on a GPU vs 4 hours on a CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"The current device is {device}")

The current device is cuda


In [None]:
# Getting the word embeddings
wordEmbeddings = []
sentences=[' '.join(s) for s in allUnlabeldSentences]
batchSize=250
sentenceIndex=0
for startIndex in range(0,len(sentences),batchSize):
    batch = sentences[startIndex:min(len(sentences),startIndex+batchSize)]
    inputs = tokenizer(batch, padding=True,return_tensors="pt",add_special_tokens=False)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        # For this task I found out that the first layer embeddings are the best ones to use.
        batch_hidden_states = outputs[2][1].cpu()

    inputs = tokenizer(batch, padding=True,return_tensors="pt",add_special_tokens=False)
    for i in range(len(batch_hidden_states)):
        originalWords = sentences[sentenceIndex].split()
        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][i])
        tokenEmbeddings = batch_hidden_states[i]
        tokenIndex=0
        for word in originalWords:
            word=word.lower()
            constructedWord = ""
            wordTokensEmbeddings = []
            while constructedWord!=word:
                currentToken = tokens[tokenIndex] if len(tokens[tokenIndex])<3 or tokens[tokenIndex][:2]!="##" else tokens[tokenIndex][2:]
                constructedWord += currentToken
                wordTokensEmbeddings.append(tokenEmbeddings[tokenIndex])
                tokenIndex+=1
            wordEmbeddings.append(np.mean(wordTokensEmbeddings,axis=0))
        sentenceIndex+=1
np.save("/content/drive/MyDrive/L390.3/wordEmbeddings.npy",wordEmbeddings)

In [12]:
# loading pretrained embeddings.
wordEmbeddings=np.load("/content/drive/MyDrive/L390.3/wordEmbeddings.npy")

In [13]:
# Model 5
# training the upos kmeans model
uposKmeans = KMeans(n_clusters=17, random_state=0) # I tried multiple random_state values and then choose the best one. You can do this automatically by changing the value of the parameter n_init (see https://scikit-learn.org/1.5/modules/generated/sklearn.cluster.KMeans.html).
uposKmeans.fit(wordEmbeddings)
uposKmeansPredictions = uposKmeans.labels_
np.save("/content/drive/MyDrive/L390.3/uposKmeansPredictions.npy",uposKmeansPredictions)
print("Number of iterations:", uposKmeans.n_iter_)

Number of iterations: 18


In [14]:
# Model 6
# training the xpos kmeans model
xposKmeans = KMeans(n_clusters=45, random_state=2) # I tried multiple random_state values and then choose the best one. You can do this automatically by changing the value of the parameter n_init (see https://scikit-learn.org/1.5/modules/generated/sklearn.cluster.KMeans.html).
xposKmeans.fit(wordEmbeddings)
xposKmeansPredictions = xposKmeans.labels_
np.save("/content/drive/MyDrive/L390.3/xposKmeansPredictions.npy",xposKmeansPredictions)
print("Number of iterations:", xposKmeans.n_iter_)

Number of iterations: 59


# Measurment

## I am using 3 main type of measurments to compare the performance of the models

### 1- Word Level Accuracy
### 2- V-measure
### 3- Variation of information

In [15]:
# Defining the measurement functions
def calculate_entropy(cluster):
  """Calculate the entropy of a clustering."""
  total_points = len(cluster)
  if total_points == 0:
      return 0
  label_counts = Counter(cluster)
  probabilities = [count / total_points for count in label_counts.values()]
  entropy = -sum(p * np.log2(p) for p in probabilities)
  return entropy

def calculate_mutual_information(U, V):
  """Calculate the mutual information between two clusterings."""
  total_points = len(U)
  mutual_info = 0
  U_labels, V_labels = set(U), set(V)
  for u in U_labels:
      for v in V_labels:
          intersection_size = sum(1 for i in range(total_points) if U[i] == u and V[i] == v)
          if intersection_size == 0:
              continue
          p_u = sum(1 for x in U if x == u) / total_points
          p_v = sum(1 for x in V if x == v) / total_points
          p_uv = intersection_size / total_points
          mutual_info += p_uv * np.log2(p_uv / (p_u * p_v))
  return mutual_info

# The 3 measures I am using

def calculate_accuracy(goldStandard,prediction):
  numTags=len(set(goldStandard))
  confusionMatrix= np.zeros((numTags,numTags),dtype=int) # confusionMatrix[i][j] => if cluster i is mapped to tag with index j, how many correct tags will I get.
  for i in range(len(prediction)):
    confusionMatrix[prediction[i]][goldStandard[i]]+=1
  _,clusterToTagIndex=linear_sum_assignment(-confusionMatrix) # Hungarian algorithm implementaion to find the optimal matching that maxmizes the accuracy.
  prediction = [clusterToTagIndex[cluster] for cluster in prediction]
  correct_words = sum([prediction[i]==goldStandard[i] for i in range(len(goldStandard))])
  return 100 * correct_words/len(goldStandard)

def calculate_v_measure(true_labels, predicted_labels):
  homo_score = homogeneity_score(true_labels, predicted_labels)
  comp_score = completeness_score(true_labels, predicted_labels)
  v_score = v_measure_score(true_labels, predicted_labels)
  return homo_score, comp_score, v_score

def calculate_variation_of_information(U, V):
  """Calculate the variation of information between two clusterings."""
  entropy_U = calculate_entropy(U)
  entropy_V = calculate_entropy(V)
  mutual_information = calculate_mutual_information(U, V)
  variation_of_information = entropy_U + entropy_V - 2 * mutual_information
  return variation_of_information, variation_of_information / (entropy_U + entropy_V)

In [16]:
# To calculate the measures across different sentences, I concatnated the results for different sentences as if the corups is a one long sentence.
# I decided to do this because the clusters are uniform across sentences and the task is to see how good the models are in clustring words, regardless of the sentence level performance.

# Generating the Gold standards
uposGoldStandard=[]
for sentenceWithTag in uposCorpus:
  uposGoldStandard.extend([uposTagToIndex[word["tag"]] for word in sentenceWithTag])

xposGoldStandard=[]
for sentenceWithTag in xposCorpus:
  xposGoldStandard.extend([xposTagToIndex[word["tag"]] for word in sentenceWithTag])

# Generating HMM models outputs
supervisedUposHmmPredictions=[]
supervisedXposHmmPredictions=[]
unsupervisedUposHmmPredictions=[]
unsupervisedXposHmmPredictions=[]
for s in allUnlabeldSentences:
  supervisedUposHmmPredictions.extend(supervisedUposHmm.viterbiAlgorithm(s))
  supervisedXposHmmPredictions.extend(supervisedXposHmm.viterbiAlgorithm(s))
  unsupervisedUposHmmPredictions.extend(unsupervisedUposHmm.viterbiAlgorithm(s))
  unsupervisedXposHmmPredictions.extend(unsupervisedXposHmm.viterbiAlgorithm(s))

# Loading Kmeans outputs
uposKmeansPredictions = np.load("/content/drive/MyDrive/L390.3/uposKmeansPredictions.npy")
xposKmeansPredictions = np.load("/content/drive/MyDrive/L390.3/xposKmeansPredictions.npy")


In [17]:
# Reporting the resuls of the models on the 3 measurments

# supervisedUposHmm
print(f"The Word Level Accuracy of supervisedUposHmm = {calculate_accuracy(uposGoldStandard,supervisedUposHmmPredictions)} %")
print(f"The v measure of supervisedUposHmm = {calculate_v_measure(uposGoldStandard,supervisedUposHmmPredictions)}")
print(f"The variation of information of supervisedUposHmm = {calculate_variation_of_information(uposGoldStandard,supervisedUposHmmPredictions)}\n")

# supervisedxposHmm
print(f"The Word Level Accuracy of supervisedxposHmm = {calculate_accuracy(xposGoldStandard,supervisedXposHmmPredictions)} %")
print(f"The v measure of supervisedxposHmm = {calculate_v_measure(xposGoldStandard,supervisedXposHmmPredictions)}")
print(f"The variation of information of supervisedxposHmm = {calculate_variation_of_information(xposGoldStandard,supervisedXposHmmPredictions)}\n")

# unsupervisedUposHmm
print(f"The Word Level Accuracy of unsupervisedUposHmm = {calculate_accuracy(uposGoldStandard,unsupervisedUposHmmPredictions)} %")
print(f"The v measure of unsupervisedUposHmm = {calculate_v_measure(uposGoldStandard,unsupervisedUposHmmPredictions)}")
print(f"The variation of information of unsupervisedUposHmm = {calculate_variation_of_information(uposGoldStandard,unsupervisedUposHmmPredictions)}\n")

# unsupervisedXposHmm
print(f"The Word Level Accuracy of unsupervisedXposHmm = {calculate_accuracy(xposGoldStandard,unsupervisedXposHmmPredictions)} %")
print(f"The v measure of unsupervisedXposHmm = {calculate_v_measure(xposGoldStandard,unsupervisedXposHmmPredictions)}")
print(f"The variation of information of unsupervisedXposHmm = {calculate_variation_of_information(xposGoldStandard,unsupervisedXposHmmPredictions)}\n")

# upos Kmeans
print(f"The Word Level Accuracy of uposKmeans = {calculate_accuracy(uposGoldStandard,uposKmeansPredictions)} %")
print(f"The v measure of uposKmeans = {calculate_v_measure(uposGoldStandard,uposKmeansPredictions)}")
print(f"The variation of information of uposKmeans = {calculate_variation_of_information(uposGoldStandard,uposKmeansPredictions)}\n")

# xpos Kmeans
print(f"The Word Level Accuracy of xposKmeans = {calculate_accuracy(xposGoldStandard,xposKmeansPredictions)} %")
print(f"The v measure of xposKmeans = {calculate_v_measure(xposGoldStandard,xposKmeansPredictions)}")
print(f"The variation of information of xposKmeans = {calculate_variation_of_information(xposGoldStandard,xposKmeansPredictions)}")

The Word Level Accuracy of supervisedUposHmm = 96.03485370957488 %
The v measure of supervisedUposHmm = (0.9224970294144421, 0.9209958896710262, 0.9217458483604221)
The variation of information of supervisedUposHmm = (0.5535692454803467, 0.07825415163957822)

The Word Level Accuracy of supervisedxposHmm = 97.02966649404017 %
The v measure of supervisedxposHmm = (0.9492879103476413, 0.947555596457507, 0.9484209623749003)
The variation of information of supervisedxposHmm = (0.44820092764638986, 0.05157903762510059)

The Word Level Accuracy of unsupervisedUposHmm = 47.64606937900778 %
The v measure of unsupervisedUposHmm = (0.45771267530741094, 0.41971122908796255, 0.43788902618237335)
The variation of information of unsupervisedUposHmm = (4.152998001030268, 0.5621109738176269)

The Word Level Accuracy of unsupervisedXposHmm = 36.57123790035662 %
The v measure of unsupervisedXposHmm = (0.5501194312375207, 0.47343491238617813, 0.5089045762002461)
The variation of information of unsupervise

# Comparing performance on short vs long sentences

In [67]:
numWordsBeforeIt = 0
hmmShortSentenceAcc=[]
hmmLongSentenceAcc=[]
kmeansShortSentenceAcc=[]
kmeansLongSentenceAcc=[]
for sentenceIdx in range(len(allUnlabeldSentences)):
  sentence = allUnlabeldSentences[sentenceIdx]
  goldStandard = uposGoldStandard[numWordsBeforeIt : numWordsBeforeIt + len(sentence)]
  unsupervisedUposHmmOutput,hmmAcc = uposClusterToTag(goldStandard,unsupervisedUposHmmPredictions[numWordsBeforeIt : numWordsBeforeIt + len(sentence)])
  uposKmeansOutput,kmeansAcc = uposClusterToTag(goldStandard,uposKmeansPredictions[numWordsBeforeIt : numWordsBeforeIt + len(sentence)])
  goldStandard = [upos[i] for i in goldStandard]
  numWordsBeforeIt += sentenceLength

  if len(sentence)<10:
    hmmShortSentenceAcc.append(hmmAcc)
    kmeansShortSentenceAcc.append(kmeansAcc)
  else:
    hmmLongSentenceAcc.append(hmmAcc)
    kmeansLongSentenceAcc.append(kmeansAcc)

print(f"HMM accuracy for short sentences = {np.mean(hmmShortSentenceAcc)} Vs Kmeans accuracy for short sentences = {np.mean(kmeansShortSentenceAcc)}")
print(f"HMM accuracy for long sentences = {np.mean(hmmLongSentenceAcc)} Vs Kmeans accuracy for long sentences = {np.mean(kmeansLongSentenceAcc)}")


HMM accuracy for short sentences = 78.68333574564578 Vs Kmeans accuracy for short sentences = 74.59705939113235
HMM accuracy for long sentences = 61.985082840133906 Vs Kmeans accuracy for long sentences = 63.75613207518351


# Looking closer at sentences to measure the models qualitatively.

In [40]:
# Define a cluster to tag fucntion
def uposClusterToTag(goldStandard,prediction):
  numTags=17
  confusionMatrix= np.zeros((numTags,numTags),dtype=int) # confusionMatrix[i][j] => if cluster i is mapped to tag with index j, how many correct tags will I get.
  for i in range(len(prediction)):
    confusionMatrix[prediction[i]][goldStandard[i]]+=1
  _,clusterToTagIndex=linear_sum_assignment(-confusionMatrix) # Hungarian algorithm implementaion to find the optimal matching that maxmizes the accuracy.
  prediction = [clusterToTagIndex[cluster] for cluster in prediction]
  correct_words = sum([prediction[i]==goldStandard[i] for i in range(len(goldStandard))])
  acc=100 * correct_words/len(goldStandard)
  prediction = [upos[i] for i in prediction]
  return prediction,acc

In [86]:
# Get a random sentence goldStandard answer and the prediction of the models.
# You can add some constrains to the code to see a particular difference between the model's performance
# The code below gives us a sentence with length < 30 where the word "open" is tagged as "VERB" by the models and the gold standard.
while 1:
  sentenceIdx = randint(0,len(allUnlabeldSentences)-1)
  # sentenceIdx = 39095
  sentence = allUnlabeldSentences[sentenceIdx]
  numWordsBeforeIt = sum([len(allUnlabeldSentences[i]) for i in range(sentenceIdx)])
  sentenceLength = len(allUnlabeldSentences[sentenceIdx])

  goldStandard = uposGoldStandard[numWordsBeforeIt : numWordsBeforeIt + sentenceLength]
  # converting clusters to tags.
  unsupervisedUposHmmOutput,hmmAcc = uposClusterToTag(goldStandard,unsupervisedUposHmmPredictions[numWordsBeforeIt : numWordsBeforeIt + sentenceLength])
  uposKmeansOutput,kmeansAcc = uposClusterToTag(goldStandard,uposKmeansPredictions[numWordsBeforeIt : numWordsBeforeIt + sentenceLength])
  goldStandard = [upos[i] for i in goldStandard]

  if "open" in sentence and uposKmeansOutput[sentence.index("open")]=="VERB" and len(sentence)<30 and unsupervisedUposHmmOutput[sentence.index("open")]=="VERB" and goldStandard[sentence.index("open")]=="VERB":
    # printing statistics
    print(f"Sentence Index is {sentenceIdx}")
    print(*sentence)
    print(f"HMM Accuracy = {hmmAcc} %")
    print(f"K-means Accuracy = {kmeansAcc} %")
    for i in range(len(sentence)):
      print(sentence[i],goldStandard[i],unsupervisedUposHmmOutput[i],uposKmeansOutput[i])
    break

Sentence Index is 30045
The hotel is scheduled to open in 1992 .
HMM Accuracy = 77.77777777777777 %
K-means Accuracy = 66.66666666666667 %
The DET DET VERB
hotel NOUN NOUN VERB
is AUX PART AUX
scheduled VERB PROPN VERB
to PART PART PART
open VERB VERB VERB
in ADP ADP ADP
1992 NUM NUM VERB
. PUNCT PUNCT PUNCT


In [69]:
# The code below to check which words have different tags, so that I can invstigate the performance of the models when the word is used in different contexts
wordToTag={word:set() for word in vocab}
for s in uposCorpus:
  for w in s:
    wordToTag[w["word"]].add(w["tag"])

In [70]:
for word in vocab:
  if len(wordToTag[word])>4:
    print(word)
    print(wordToTag[word])

's
{'VERB', 'PART', 'PROPN', 'AUX', 'PRON', 'NOUN'}
back
{'ADJ', 'ADP', 'VERB', 'ADV', 'NOUN'}
down
{'ADJ', 'ADP', 'VERB', 'ADV', 'NOUN'}
open
{'ADJ', 'ADP', 'VERB', 'ADV', 'NOUN'}
that
{'ADP', 'VERB', 'ADV', 'SCONJ', 'PRON', 'DET', 'NOUN'}
the
{'ADJ', 'PROPN', 'VERB', 'DET', 'NOUN'}
vs.
{'ADJ', 'X', 'CONJ', 'ADP', 'NOUN'}
