<a href="https://colab.research.google.com/github/ZizZu94/nlu-second-assignment/blob/main/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **[NLU] Second Assignment** 
*   **Zihadul Azam**
*   Id: 221747
*   zihadul.azam@studenti.unitn.it




Commands to set Colab environment

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
import sys
sys.path.insert(0,'/content/drive/My Drive/Colab Notebooks/nlu/second-assignment/nlu-second-assignment')

In [3]:
%cd /content/drive/My Drive/Colab Notebooks/nlu/second-assignment/nlu-second-assignment

/content/drive/My Drive/Colab Notebooks/nlu/second-assignment/nlu-second-assignment


In [4]:
!pwd

/content/drive/My Drive/Colab Notebooks/nlu/second-assignment/nlu-second-assignment


# **Requirements**


*   SpaCy: run `pip install spacy`
*   Sk

In [39]:
import re
import pandas as pd
import numpy as np
import spacy
from spacy.tokens import Doc
import conll

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from collections import defaultdict

# utils
import utils

Load nlp and set white-space tokenizer

In [6]:
nlp = spacy.load('en')

class WhitespaceTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(" ")
        return Doc(self.vocab, words=words)

nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

#### **Global vars**

In [7]:
data_folder_path = './data'
train_path = data_folder_path + '/train.txt'
test_path = data_folder_path + '/test.txt'

#### **Load input data**
Load conll 2003 data from file and preprocess 

In [8]:
def load_conll_file(path):
    with open(path, 'r', encoding='utf-8') as f:
        data = f.read()
    sentences = data.split('\n\n')
    OUTPUT_DATA = []
    entities = []
    for sent in sentences:
        tokens = sent.split('\n')
        sentence = []
        ent_sentence_spacy = []
        ents = []

        if tokens[0] != '-DOCSTART- -X- -X- O' and tokens[0] != '':
            for x in tokens:
                x_split = x.split()
                # if not short length
                if len(x) > 0 and len(x_split) >= 3:
                    word = x_split[0]
                    word = word.strip()

                    if len(word) > 0:
                        sentence.append(word)
                        try:
                            ent = x_split[-1]
                        except IndexError:
                            print('Index Error: ', x_split)
                        ents.append((word, ent))
                # else:
                    #print('Short length x: ', x, ' . Removed.')

            processed_sentence = ' '.join(sentence)  # .lower()
            OUTPUT_DATA.append((processed_sentence, {'entities': ents}))

    print('Done getting data !')
    print('There are %d sentences.' % (len(sentences)))
    return OUTPUT_DATA

In [9]:
# load conll2003 test data
conll_test_data = load_conll_file(test_path)

print('\n')
print('----- Sample data: first 2 sentences -----')
for k in conll_test_data[:2]:
    print('Sentence: ', k[0])
    print('Entities:')
    for e in k[1]['entities']:
        print(e[0], '|', e[1])

    print('************************************')

Done getting data !
There are 3685 sentences.


----- Sample data: first 2 sentences -----
Sentence:  SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT .
Entities:
SOCCER | O
- | O
JAPAN | B-LOC
GET | O
LUCKY | O
WIN | O
, | O
CHINA | B-PER
IN | O
SURPRISE | O
DEFEAT | O
. | O
************************************
Sentence:  Nadim Ladki
Entities:
Nadim | B-PER
Ladki | I-PER
************************************


Mapper

In [10]:
def convert_spacy_entity_to_conll(iob, type):
    mapper = {
        'PERSON': 'PER',
        'NORP': 'MISC',
        'FAC': 'MISC',
        'ORG': 'ORG',
        'GPE': 'LOC',
        'LOC': 'LOC',
        'PRODUCT': 'MISC',
        'EVENT': 'MISC',
        'WORK_OF_ART': 'MISC',
        'LAW': 'MISC',
        'LANGUAGE': 'MISC'
    }

    if not type in mapper:
        return 'O'
    return iob + '-' + mapper[type]

For each sentence get real entity and predicted entity (with IOB)

In [11]:
def get_refs_hyps(data):
    refs = []
    hyps = []

    for sent in data:
        # get entities of this sentence
        entities = sent[1]['entities']
        hyp = []
        doc = nlp(sent[0])

        for token in doc:
          # get predicted entity
          conll_entity = convert_spacy_entity_to_conll(
                token.ent_iob_, token.ent_type_)
          # add predicted entity to the list
          hyp.append((token.text, conll_entity))

        # add real entities of this sentence
        sent_refs = [(entity[0], entity[1]) for entity in entities]
        refs.append(sent_refs)
        # add predicted entities of this sentence
        hyps.append(hyp)
    return refs, hyps

In [12]:
print('----- Sample get_refs_hyps(data): first 2 sentences -----')
test_refs, test_hyps = get_refs_hyps(conll_test_data[:2])
print("--> refs list:")
print(*test_refs, sep='\n')
print('\n')
print("--> hyps list:")
print(*test_hyps, sep='\n')

----- Sample get_refs_hyps(data): first 2 sentences -----
--> refs list:
[('SOCCER', 'O'), ('-', 'O'), ('JAPAN', 'B-LOC'), ('GET', 'O'), ('LUCKY', 'O'), ('WIN', 'O'), (',', 'O'), ('CHINA', 'B-PER'), ('IN', 'O'), ('SURPRISE', 'O'), ('DEFEAT', 'O'), ('.', 'O')]
[('Nadim', 'B-PER'), ('Ladki', 'I-PER')]


--> hyps list:
[('SOCCER', 'O'), ('-', 'O'), ('JAPAN', 'O'), ('GET', 'O'), ('LUCKY', 'O'), ('WIN', 'B-ORG'), (',', 'O'), ('CHINA', 'B-LOC'), ('IN', 'O'), ('SURPRISE', 'O'), ('DEFEAT', 'O'), ('.', 'O')]
[('Nadim', 'O'), ('Ladki', 'O')]


# **Task 1**

> ## **Task 1.1**



In [13]:
# get refs and hyps
refs, hyps = get_refs_hyps(conll_test_data)

Get confusion matrix

In [14]:
# create flat real values list and predicted values lists
entity_refs = []
for sent in refs:
    for token in sent:
        entity_refs.append(token[1])

entity_hyps = []
for sent in hyps:
    for token in sent:
        entity_hyps.append(token[1])

# get confusion_matrix
labels = np.unique(entity_refs)
conf_mat = confusion_matrix(entity_refs, entity_hyps, labels=labels)
# print confusion matrix
pd_table = pd.DataFrame(conf_mat, index=labels, columns=labels)
display(pd_table)

Unnamed: 0,B-LOC,B-MISC,B-ORG,B-PER,I-LOC,I-MISC,I-ORG,I-PER,O
B-LOC,1129,30,143,52,26,2,40,6,240
B-MISC,19,396,48,23,2,26,13,1,174
B-ORG,178,36,558,150,0,2,94,24,619
B-PER,50,28,168,989,0,1,25,49,307
I-LOC,3,0,0,0,130,5,50,7,62
I-MISC,0,6,4,1,8,94,34,6,63
I-ORG,11,4,6,4,33,17,464,99,197
I-PER,1,0,2,7,20,9,80,887,150
O,53,64,189,63,32,46,279,163,37434


Calculate accuracy per class and overall accuracy

$Accuracy=\frac{TP+TN}{TP+TN+FP+FN}$

In [97]:
def get_accuracy_per_class(confusion_matrix, class_labels):
  """
  this function takes confusion matrix and class names in input
  return a dict with accuracy per class
  """
  class_accuracies = {}

  # Calculate the accuracy for each one of our classes
  for idx, cls in enumerate(class_labels):
    # True negatives are all the samples that are not our current GT class (not the current row) 
    # and were not predicted as the current class (not the current column)
    true_negatives = np.sum(np.delete(np.delete(confusion_matrix, idx, axis=0), idx, axis=1))
    
    # True positives are all the samples of our current class that were predicted as such
    true_positives = confusion_matrix[idx, idx]
    
    # The accuracy for the current class is ratio between correct predictions to all predictions
    class_accuracies[cls] = (true_positives + true_negatives) / np.sum(confusion_matrix)
  return class_accuracies

# get accuracy per class
accuracies_per_clas = get_accuracy_per_class(conf_mat, labels)
# print accuracy per class
accuracy_pd_table = pd.DataFrame().from_dict(accuracies_per_clas, orient='index')
accuracy_pd_table.columns = ['Accuracy']
display(accuracy_pd_table.round(3))

# print overall accuracy (sum_accurecies_all_classes / num_classes)
overall_acc = (np.sum(accuracy_pd_table) / len(accuracy_pd_table))
print('\n')
print('> Overall accuracy: {}'.format(np.format_float_positional(overall_acc, 3)))
print('> Overall accuracy in percentage: {} %'.format(np.format_float_positional(overall_acc*100, 3)))

Unnamed: 0,Accuracy
B-LOC,0.982
B-MISC,0.99
B-ORG,0.964
B-PER,0.98
I-LOC,0.995
I-MISC,0.995
I-ORG,0.979
I-PER,0.987
O,0.942




> Overall accuracy: 0.979
> Overall accuracy in percentage: 97.916 %


> **Extra:** other metrices

In [87]:
class_report = classification_report(entity_refs, entity_hyps, target_names=labels, output_dict=True)
# print confusion matrix
pd_table = pd.DataFrame(class_report)
display(pd_table.round(3))

Unnamed: 0,B-LOC,B-MISC,B-ORG,B-PER,I-LOC,I-MISC,I-ORG,I-PER,O,accuracy,macro avg,weighted avg
precision,0.782,0.702,0.499,0.767,0.518,0.465,0.43,0.714,0.954,0.906,0.648,0.901
recall,0.677,0.564,0.336,0.612,0.506,0.435,0.556,0.767,0.977,0.906,0.603,0.906
f1-score,0.726,0.626,0.402,0.681,0.512,0.45,0.485,0.74,0.965,0.906,0.621,0.902
support,1668.0,702.0,1661.0,1617.0,257.0,216.0,835.0,1156.0,38323.0,0.906,46435.0,46435.0


We can see here that the classification_report has a lower total accuracy w.r.t my calculated Overall accuracy.
I think they are different metrices, **Scikit Learn classification report **contains the simple accuracy, which does not consider **TN** cases:
* classification report accuracy = $\frac{TP}{TP+TN+FP+FN}$

so, to test it I calculated it from my confusion matrix:

In [102]:
total_predictions = conf_mat.sum().sum()
total_correct_pred = np.diag(conf_mat).sum()
test_accuracy = total_correct_pred / total_predictions

print('Total predictions: ', total_predictions)
print('Correct predictions: ', total_correct_pred)
print('Accuracy (Correct predictions / Total predictions): ', test_accuracy)

Total predictions:  46435
Correct predictions:  42081
Accuracy (Correct predictions / Total predictions):  0.9062345213739637


Here I can see, that my hypothesis was right.
So, **Scikit Learn classification report** calculate the simple accuracy, without considering **TN**

> ## **Task 1.1**

In [18]:
accuracy = conll.evaluate(refs, hyps)

table = pd.DataFrame().from_dict(accuracy, orient='index')
display(table.round(3))

Unnamed: 0,p,r,f,s
PER,0.724,0.577,0.642,1617
MISC,0.681,0.547,0.607,702
ORG,0.441,0.297,0.355,1661
LOC,0.771,0.668,0.716,1668
total,0.662,0.518,0.581,5648


# **Task 2**: Grouping of Entities
Write a function to group recognized named entities using noun_chunks method of spaCy. Analyze the groups in terms of most frequent combinations (i.e. NER types that go together).

In [73]:
def get_tokens_indexes(token_list):
  """
    this function returns the unique set of token indexes
  """
  return set([tok.i for tok in token_list])


def get_doc_GOE(doc: spacy.tokens.Doc):
  """
    this function takes a doc as input
    returns list of entities groups of Doc
  """
  # entity list of each chunk (if has any)
  chunk_ents_list = defaultdict(lambda:[])
  unassigned = []

  for ent in doc.ents:
    ent_indexes = get_tokens_indexes(ent)
    found = False

    # check if this ent is part of a chunk
    for i, chunk in enumerate(doc.noun_chunks):
      chunk_indexs = get_chunk_tokens_indexes(chunk)
      if len(ent_indexes.intersection(chunk_indexs)) > 0:
        chunk_ents_list[i].append(ent[0].ent_type_)
        found = True

    # if this ent is not part of a chunk add to unassigned list
    if not found:
      unassigned.append(ent[0].ent_type_)

  # join chunk_ents_list and unassigned list togather and return
  return [ents for _, ents in chunk_ents_list.items()] + [[u] for u in unassigned]

def grouping_of_entities_freq(data):
  """
    this function takes a corpus as input
    return frequency of all groups of entities
  """
  # init result list
  result = defaultdict(lambda:0)

  for sent in data:
      doc = nlp(sent[0])
      # get doc group of entities (GOE)
      doc_GOE = get_doc_GOE(doc)
      # update frequencies in result dictionary
      for group in doc_GOE:
        result['-'.join(group)] += 1
  return result

group_entities_frequency = grouping_of_entities_freq(conll_test_data)
group_entities_frequency = dict(sorted(group_entities_frequency.items(), key=lambda item: item[1], reverse=True))

In [75]:
print('----- Grouping of Entities frequency (top 40) -----')
for key, value in list(group_entities_frequency.items())[:40]:
  print(key, ': ', value)

----- Grouping of Entities frequency (top 40) -----
CARDINAL :  1821
GPE :  1267
PERSON :  1050
ORG :  1043
DATE :  953
NORP :  302
MONEY :  148
ORDINAL :  115
TIME :  104
CARDINAL-PERSON :  91
PERCENT :  87
QUANTITY :  81
EVENT :  64
LOC :  53
NORP-PERSON :  45
PRODUCT :  29
ORG-PERSON :  28
GPE-PERSON :  25
CARDINAL-ORG :  21
FAC :  19
CARDINAL-GPE :  18
CARDINAL-NORP :  17
PERSON-PERSON :  13
WORK_OF_ART :  13
GPE-ORG :  11
PERSON-GPE :  10
LAW :  10
DATE-EVENT :  9
ORG-ORG :  9
GPE-GPE :  9
LANGUAGE :  7
NORP-ORG :  6
ORG-GPE :  5
DATE-ORG :  5
DATE-TIME :  5
DATE-NORP :  5
ORG-DATE :  5
DATE-PERSON :  3
PERSON-ORG :  3
GPE-ORDINAL :  3


> **Extra**: frequency by class number

> example: `"CARDINAL-NORP"` is built with 2 classes or entities

In [79]:
class_num_freq = defaultdict(lambda:0)
for key, value in group_entities_frequency.items():
  classes = key.split('-')
  class_num_freq[len(classes)] += value

print('----- Frequency by number of classes -----')
for key, value in class_num_freq.items():
  print('Group with ', key, ' class: ', value)

----- Frequency by number of classes -----
Group with  1  class:  7166
Group with  2  class:  405
Group with  3  class:  20
Group with  4  class:  3


# **Task 3**: Post-processing 
Write a function that extends the entity span to cover the full noun-compounds. Make use of `compound` dependency relation.