# Oueslati Amine (W9GDX2)-Assignment#2-NLP

### Importing the necessary libraries 

In [None]:
import nltk
from nltk.corpus import brown

import spacy
from spacy.tokens import Doc


nlp = spacy.load('en')
nltk.download('brown')
nltk.download('universal_tagset')

!pip install sklearn-crfsuite

import sklearn
import sklearn_crfsuite

from sklearn_crfsuite import metrics

from sklearn.model_selection import train_test_split

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
Collecting sklearn-crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/79/47/58f16c46506139f17de4630dbcfb877ce41a6355a1bbf3c443edb9708429/python_crfsuite-0.9.7-cp37-cp37m-manylinux1_x86_64.whl (743kB)
[K     |████████████████████████████████| 747kB 7.4MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6


### Getting tagged sentences with the universal tagset

In [None]:
# Load the Brown corpus as tagged sentences, requesting the universal tagset.
# Each sentence is a list of (word, tag) pairs.
data = brown.tagged_sents(tagset='universal')
# Display the corpus view to check its structure.
data

[[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')], [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ('term-end', 'NOUN'), ('presentments', 'NOUN'), ('that', 'ADP'), ('the', 'DET'), ('City', 'NOUN'), ('Executive', 'ADJ'), ('Committee', 'NOUN'), (',', '.'), ('which', 'DET'), ('had', 'VERB'), ('over-all', 'ADJ'), ('charge', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('election', 'NOUN'), (',', '.'), ('``', '.'), ('deserves', 'VERB'), ('the', 'DET'), ('praise', 'NOUN'), ('and', 'CONJ'), ('thanks', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('City

---

`zip(*sent)` transposes the (word, tag) pairs of a sentence into one tuple of words and one tuple of tags; taking index 0 keeps only the words.

Thus, the result will be a list of lists of tokens.

In [None]:
# For every sentence, zip(*sent) turns the list of (word, tag) pairs into a
# tuple of words and a tuple of tags; index 0 selects the words, which are
# then converted to a plain list.
words=[list(list(zip(*sent))[0]) for sent in data]
# Show the tokens of the first sentence.
words[0]

['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of',
 "Atlanta's",
 'recent',
 'primary',
 'election',
 'produced',
 '``',
 'no',
 'evidence',
 "''",
 'that',
 'any',
 'irregularities',
 'took',
 'place',
 '.']

The same method is used here with index 1 to get the labels, which gives a list of lists of tags (one list per sentence).

In [None]:
# Same transposition as for the words, but index 1 selects the tuple of tags,
# giving one list of tags per sentence.
labels=[list(list(zip(*sent))[1]) for sent in data]
# Show the tags of the first sentence.
labels[0]

['DET',
 'NOUN',
 'NOUN',
 'ADJ',
 'NOUN',
 'VERB',
 'NOUN',
 'DET',
 'NOUN',
 'ADP',
 'NOUN',
 'ADJ',
 'NOUN',
 'NOUN',
 'VERB',
 '.',
 'DET',
 'NOUN',
 '.',
 'ADP',
 'DET',
 'NOUN',
 'VERB',
 'NOUN',
 '.']

---
### Feature extraction
This function transforms a list of tokens into a list of dictionaries, one per token, each containing the features required by the CRF model.

SpaCy is used for the feature extraction, therefore, the input list should be transformed to a spaCy document. 

Since a CRF benefits from the context of each word, every dictionary contains three sets of the same features: one for the previous word, one for the current word, and one for the next word. The first word has no previous word and the last word has no next word, so they get a `BOS` (beginning of sentence) or `EOS` (end of sentence) flag instead.


In [None]:
def _token_features(token, prefix=''):
    """Return the six lexical/shape features of one spaCy token.

    Every key is prepended with ``prefix`` ('' for the current token,
    '-1_' / '+1_' for its neighbours), so the same feature set can be
    reused for the context words.
    """
    return {
        prefix + 'lower': token.lower_,
        prefix + 'suffix': token.suffix_,
        prefix + 'prefix': token.prefix_,
        prefix + 'isupper': token.is_upper,
        prefix + 'istitle': token.is_title,
        prefix + 'isdigit': token.is_digit,
    }


def token2features(list):
    """Convert one tokenised sentence into CRF feature dictionaries.

    Parameters
    ----------
    list : list of str
        The tokens of a single sentence. The parameter keeps its original
        name for backward compatibility even though it shadows the builtin.

    Returns
    -------
    list of dict
        One dictionary per token holding the features of the token itself,
        of the previous token (keys prefixed with '-1_') and of the next
        token (keys prefixed with '+1_'). The first token carries
        ``'BOS': True`` instead of previous-token features and the last
        token carries ``'EOS': True`` instead of next-token features.
        An empty sentence yields an empty list.
    """
    # Build the Doc straight from the given words so the tokenisation is
    # exactly the one supplied by the caller.
    doc = Doc(nlp.vocab, words=list)
    last_index = len(doc) - 1
    listOfFeatures = []

    for i, token in enumerate(doc):
        features = _token_features(token)

        if i > 0:
            features.update(_token_features(doc[i - 1], '-1_'))
        else:
            features['BOS'] = True

        if i < last_index:
            features.update(_token_features(doc[i + 1], '+1_'))
        else:
            features['EOS'] = True

        listOfFeatures.append(features)

    return listOfFeatures

Since the **token2features** function works on a single list of tokens (one sentence), it has to be applied to every sentence in order to preprocess the whole data set.

In [None]:
# Extract features sentence by sentence: the result holds, for every sentence,
# one feature dictionary per token.
listOFListsOfDictionaries  = [token2features(sent) for sent in words]

In [None]:
# Inspect the feature dictionaries of the first sentence as a sanity check.
listOFListsOfDictionaries[0]

[{'+1_isdigit': False,
  '+1_istitle': True,
  '+1_isupper': False,
  '+1_lower': 'fulton',
  '+1_prefix': 'F',
  '+1_suffix': 'ton',
  'BOS': True,
  'isdigit': False,
  'istitle': True,
  'isupper': False,
  'lower': 'the',
  'prefix': 'T',
  'suffix': 'The'},
 {'+1_isdigit': False,
  '+1_istitle': True,
  '+1_isupper': False,
  '+1_lower': 'county',
  '+1_prefix': 'C',
  '+1_suffix': 'nty',
  '-1_isdigit': False,
  '-1_istitle': True,
  '-1_isupper': False,
  '-1_lower': 'the',
  '-1_prefix': 'T',
  '-1_suffix': 'The',
  'isdigit': False,
  'istitle': True,
  'isupper': False,
  'lower': 'fulton',
  'prefix': 'F',
  'suffix': 'ton'},
 {'+1_isdigit': False,
  '+1_istitle': True,
  '+1_isupper': False,
  '+1_lower': 'grand',
  '+1_prefix': 'G',
  '+1_suffix': 'and',
  '-1_isdigit': False,
  '-1_istitle': True,
  '-1_isupper': False,
  '-1_lower': 'fulton',
  '-1_prefix': 'F',
  '-1_suffix': 'ton',
  'isdigit': False,
  'istitle': True,
  'isupper': False,
  'lower': 'county',
  'prefi

### Splitting the data

The labels are already stored in the **labels** list and the text has been preprocessed, so the data can now be split into a training set and a test set.

In [None]:
# Hold out 20% of the sentences for testing; the fixed random_state makes the
# split reproducible. Features and labels are split together so they stay aligned.
X_train, X_test, y_train, y_test = train_test_split(listOFListsOfDictionaries, labels, test_size = 0.2, random_state = 42)

### Training the model

In [None]:
# Configure the CRF model, trained with the L-BFGS algorithm for at most
# 100 iterations.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,  # L1 regularisation coefficient (per the sklearn-crfsuite docs)
    c2=0.1,  # L2 regularisation coefficient (per the sklearn-crfsuite docs)
    max_iterations=100,
    all_possible_transitions=True
)
# Fit on the training split: one list of feature dictionaries and one list of
# tags per sentence.
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

To evaluate the trained model, the labels of the test set are predicted and compared with the true ones using the weighted F1 score.

In [None]:
# Predict the tag sequence of every held-out sentence.
y_pred = crf.predict(X_test)
# Weighted-average F1 between the true and the predicted tags of the test split.
metrics.flat_f1_score(y_test, y_pred, average='weighted')

0.9784649763630752

### Pos_tagger function
This function accepts a string or a list of tokens and outputs the predicted POS tags.

The type of the input is verified, so if the input is a string it will be transformed to a list of tokens. Then the CRF model will predict the tags.

In [None]:
def pos_tagger(sent):
    """Predict the universal POS tags of one sentence with the trained CRF.

    Parameters
    ----------
    sent : str or list/tuple of str
        Either raw text, which is tokenised with the spaCy pipeline ``nlp``,
        or an already tokenised sentence.

    Returns
    -------
    list of list of str
        The prediction for a one-sentence batch, so the tags of the sentence
        are at index 0. Empty input yields ``[[]]``.

    Raises
    ------
    TypeError
        If ``sent`` is neither a string nor a list/tuple of tokens.
    """
    if isinstance(sent, str):
        tokens = [token.text for token in nlp(sent)]
    elif isinstance(sent, (list, tuple)):
        tokens = [token for token in sent]
    else:
        raise TypeError(
            "pos_tagger expects a string or a list of tokens, got "
            + type(sent).__name__
        )

    # Nothing to tag: keep the one-sentence batch shape without calling the model.
    if not tokens:
        return [[]]

    # The CRF predicts on batches of sentences, hence the wrapping list.
    return crf.predict([token2features(tokens)])


In [None]:
# Read one sentence from standard input.
sent = input()
# pos_tagger returns the prediction for a one-sentence batch, so index 0 holds
# the tags of the sentence that was typed in.
print("\n", pos_tagger(sent)[0])