In [None]:
!pip install sklearn-crfsuite
!pip install conllu
!pip install pyconll



In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from conllu import parse

In [None]:
import conllu 
from pathlib import Path

from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

## READ THE DATASET

In [None]:
# Colab-only step: `files.upload()` is used to bring the dataset files into
# the runtime's working directory. The saved output of this cell lists the two
# files that were uploaded: 'hi-ud-test .conllu' (note the space before the
# extension) and 'hi-ud-train.conllu'.
# NOTE(review): `uploaded` is not referenced again in the visible cells; the
# files are re-opened from disk by name further down.
from google.colab import files
uploaded = files.upload()

Saving hi-ud-test .conllu to hi-ud-test .conllu
Saving hi-ud-train.conllu to hi-ud-train.conllu


In [None]:
def breaker(data,delim):
  """Group raw dataset lines into sentences of column lists.

  Args:
    data: iterable of text lines, one token per line. A line equal to
      ',,' or '\t\t' marks the boundary between two sentences.
    delim: column delimiter used to split every non-boundary line.

  Returns:
    A list of sentences. Each sentence is a list of rows and each row is the
    list of columns of that line with the first column dropped. The first row
    of the first sentence is dropped as well (it is sliced off unconditionally).
    Boundary lines always open a new sentence, so consecutive or trailing
    boundaries yield empty sentences.
  """
  sentences = [[]]
  for line in data:
    if line in (',,', '\t\t'):
      # Boundary line: start collecting the next sentence.
      sentences.append([])
      continue
    # Keep every column except the first one.
    sentences[-1].append(line.split(delim)[1:])
  # Discard the very first row of the file.
  sentences[0] = sentences[0][1:]
  return sentences

In [None]:
# Read both dataset files as lists of lines.
# - `with` guarantees the file handles are closed (the bare open().read()
#   form left them to the garbage collector).
# - The encoding is stated explicitly so the Hindi text is decoded the same
#   way regardless of the platform's default encoding.
# - The test file name really contains a space before the extension; it
#   matches the name shown in the upload cell's output.
with open('hi-ud-train.conllu', 'r', encoding='utf-8') as train_file:
    train_lines = train_file.read().strip().split('\n')
with open('hi-ud-test .conllu', 'r', encoding='utf-8') as test_file:
    test_lines = test_file.read().strip().split('\n')

# The training file is comma-delimited and the test file is tab-delimited.
train_sents = breaker(train_lines, ',')
test_sents = breaker(test_lines, '\t')



## Features Chosen:
**Word**           - The Word Itself  
**Word.Lower()**   - The Word reduced to lowercase  
**Word.isTitle()** - True if first character is in UpperCase  
**Word.isUpper()** - True if all characters of the string are UpperCase  
**Word.hasUpper()** - Boolean True if at least one character of the string is UpperCase  
**Word.isDigit()** - True if all characters of the string are Digits  
**Prefix-1**       - Word[0:1]  
**Prefix-2**       - Word[0:2]  
**Prefix-3**       - Word[0:3]   
**Suffix-1**       - Word[-1:]  
**Suffix-2**       - Word[-2:]  
**Suffix-3**       - Word[-3:]  
**has_Hyphen**     - Whether word has hyphen in it  

**BOS**               - If Word is the Beginning of the Sentence  
**-1:Word.Lower()**   - Previous Word reduced to LowerCase   
**-1:Word.isTitle()** - Boolean True if first character of the Previous Word is in UpperCase  
**-1:Word.isUpper()** - Boolean True if all characters of the Previous word are UpperCase  
**-1:Word.Suffix-3**: - The last three characters of the previous word, Word[-3:]  

**EOS**               - If Word is the End of the Sentence  
**+1:Word.Lower()**   - Next Word reduced to LowerCase  
**+1:Word.isTitle()** - Boolean True if first character of the Next Word is in UpperCase  
**+1:Word.isUpper()** - Boolean True if all characters of the Next word are UpperCase  
**+1:Word.Prefix-3**: - Take prefix of the next word Word[0:3]

sklearn-crfsuite (and python-crfsuite) supports several feature formats; here we use feature dicts.

In [None]:
def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Every element of ``sent`` is indexed with ``[0]`` to get the token text.
    The result always contains the token's own features. When a previous
    token exists its '-1:' features are added, otherwise 'BOS' is set; when
    a next token exists its '+1:' features are added, otherwise 'EOS' is set.

    Prefix/suffix features of length 2 and 3 are the empty string when the
    token is shorter than the requested length.
    """
    token = sent[i][0]
    length = len(token)

    features = {
        'Word':           token,
        'Word.Lower()':   token.lower(),
        'Word.isTitle()': token.istitle(),
        'Word.isUpper()': token.isupper(),
        'Word.hasupper':       any(map(str.isupper, token)),
        'Word.isDigit()': token.isdigit(),
        # Slicing yields '' for an empty token, so no length guard is needed
        # for the one-character affixes.
        'Prefix-1':       token[:1],
        'Prefix-2':       token[:2] if length >= 2 else '',
        'Prefix-3':       token[:3] if length >= 3 else '',
        'Suffix-1':       token[-1:],
        'Suffix-2':       token[-2:] if length >= 2 else '',
        'Suffix-3':       token[-3:] if length >= 3 else '',
        'has_Hyphen':     '-' in token,
    }

    # Left context: either the previous token's features or the BOS marker.
    if i <= 0:
        features['BOS'] = True
    else:
        previous = sent[i - 1][0]
        features['-1:word.lower()'] = previous.lower()
        features['-1:word.istitle()'] = previous.istitle()
        features['-1:word.isupper()'] = previous.isupper()
        features['-1:Suffix-3'] = previous[-3:] if len(previous) >= 3 else ''

    # Right context: either the next token's features or the EOS marker.
    if i >= len(sent) - 1:
        features['EOS'] = True
    else:
        following = sent[i + 1][0]
        features['+1:word.lower()'] = following.lower()
        features['+1:word.istitle()'] = following.istitle()
        features['+1:word.isupper()'] = following.isupper()
        features['+1:Prefix-3'] = following[:3] if len(following) >= 3 else ''

    return features


def sent2features(sent):
    """Return one feature dict per token of ``sent``, in sentence order."""
    return [word2features(sent, position) for position, _ in enumerate(sent)]

def sent2labels(sent):
    """Return the element at index 1 of every row of ``sent`` (used as the tag)."""
    labels = []
    for row in sent:
        labels.append(row[1])
    return labels


Extract features from the data:

In [None]:
%%time
# Convert every sentence into its per-token feature dicts (X) and the
# matching sequence of tags (y), for both splits.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

CPU times: user 55.7 ms, sys: 2.02 ms, total: 57.7 ms
Wall time: 58.1 ms


## Training


In [None]:
%%time
# Training settings, gathered in one place so they are easy to tune.
# c1 and c2 are the regularisation coefficients (L1 and L2 respectively,
# per the sklearn-crfsuite documentation).
crf_settings = dict(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=300,
    all_possible_transitions=True,
)
crf = sklearn_crfsuite.CRF(**crf_settings)
crf.fit(X_train, y_train)

CPU times: user 3.14 s, sys: 3.71 ms, total: 3.14 s
Wall time: 3.14 s


RUNNING MODEL ON TRAINING DATA

In [None]:
# Evaluate the fitted model on the data it was trained on.
print("MODEL PREDICTION ON TRAINING DATA".center(53))
print("-"*53)

Y_TrainPredicted = crf.predict(X_train)

# Per-tag precision / recall / F1 table.
print(metrics.flat_classification_report(y_train, Y_TrainPredicted))

# Support-weighted summary scores, followed by plain token accuracy.
for metric_label, metric_fn in (
    ('precision: ', metrics.flat_precision_score),
    ('recall:    ', metrics.flat_recall_score),
    ('f1-score:  ', metrics.flat_f1_score),
):
    print(metric_label, metric_fn(y_train, Y_TrainPredicted, average='weighted'))
print('accuracy:  ', metrics.flat_accuracy_score(y_train, Y_TrainPredicted))

          MODEL PREDICTION ON TRAINING DATA          
-----------------------------------------------------
              precision    recall  f1-score   support

         ADJ       1.00      1.00      1.00       570
         ADP       1.00      1.00      1.00      1387
         ADV       1.00      0.99      1.00       111
         AUX       0.99      1.00      1.00       730
       CCONJ       1.00      1.00      1.00       150
       COMMA       1.00      1.00      1.00       114
         DET       1.00      1.00      1.00       231
        NOUN       1.00      1.00      1.00      1597
         NUM       1.00      1.00      1.00       152
        PART       1.00      1.00      1.00       163
        PRON       1.00      1.00      1.00       431
       PROPN       1.00      1.00      1.00       708
       PUNCT       1.00      1.00      1.00       564
       SCONJ       1.00      1.00      1.00        61
        VERB       1.00      0.99      0.99       640
           X       1.00    

RUNNING MODEL ON TEST DATA

In [None]:
# Evaluate the fitted model on the held-out test split.
print("MODEL PREDICTION ON TESTING DATA".center(53))
print("-"*53)

Y_TestPredicted = crf.predict(X_test)

# Per-tag precision / recall / F1 table.
print(metrics.flat_classification_report(y_test, Y_TestPredicted))

# Support-weighted summary scores, followed by plain token accuracy.
for metric_label, metric_fn in (
    ('precision: ', metrics.flat_precision_score),
    ('recall:    ', metrics.flat_recall_score),
    ('f1-score:  ', metrics.flat_f1_score),
):
    print(metric_label, metric_fn(y_test, Y_TestPredicted, average='weighted'))
print('accuracy:  ', metrics.flat_accuracy_score(y_test, Y_TestPredicted))

           MODEL PREDICTION ON TESTING DATA          
-----------------------------------------------------
              precision    recall  f1-score   support

         ADJ       0.68      0.72      0.70        94
         ADP       0.96      0.97      0.97       309
         ADV       0.64      0.43      0.51        21
         AUX       0.91      0.94      0.93       139
       CCONJ       1.00      1.00      1.00        25
         DET       0.82      0.89      0.85        36
        NOUN       0.79      0.86      0.82       329
         NUM       0.92      0.92      0.92        25
        PART       0.97      0.97      0.97        33
        PRON       0.91      0.82      0.86        65
       PROPN       0.61      0.55      0.58       145
       PUNCT       1.00      0.84      0.91       135
       SCONJ       0.75      1.00      0.86         3
        VERB       0.85      0.86      0.85        99

    accuracy                           0.85      1458
   macro avg       0.84   

Print Transitions

In [None]:
# Prints each transition POS_Tag1 -> POS_Tag2 together with its weight.
def printTransitions(transitions):
    """Print one aligned line per ``(edge, weight)`` pair.

    ``edge[0]`` is printed as the source tag, ``edge[1]`` as the target tag,
    and ``weight`` with five decimal places.
    """
    row_template = "%-6s =>  %-9s %0.5f"
    for edge, weight in transitions:
        source_tag, target_tag = edge[0], edge[1]
        print(row_template % (source_tag, target_tag, weight))

PRINTING THE 10 MOST AND LEAST LIKELY TRANSITION FEATURES (RANKED BY LEARNED WEIGHT)

In [None]:
from collections import Counter

print("Top 10 Most Common POS Transition Features:")
print("-"*43)
# Sort all learned transitions once, from highest to lowest weight; the two
# reports below are simply the head and the tail of that ranking.
ranked_transitions = Counter(crf.transition_features_).most_common()
printTransitions(ranked_transitions[:10])
print("\n")

print("Top 10 Least Common POS Transition Features:")
print("-"*44)
printTransitions(ranked_transitions[-10:])
print("\n")

Top 10 Most Common POS Transition Features:
-------------------------------------------
VERB   =>  AUX       4.21551
PROPN  =>  PROPN     3.27241
ADJ    =>  NOUN      2.76868
DET    =>  NOUN      2.37537
NUM    =>  NOUN      2.28368
AUX    =>  AUX       2.02066
NOUN   =>  ADP       1.99948
PROPN  =>  ADP       1.95803
VERB   =>  SCONJ     1.83607
PART   =>  NUM       1.65821


Top 10 Least Common POS Transition Features:
--------------------------------------------
ADP    =>  CCONJ     -1.12419
CCONJ  =>  AUX       -1.16132
AUX    =>  PART      -1.17216
ADV    =>  AUX       -1.28991
PROPN  =>  PART      -1.34432
AUX    =>  ADP       -1.37030
PROPN  =>  AUX       -1.51801
ADJ    =>  PRON      -1.89973
DET    =>  ADP       -1.94040
ADJ    =>  ADP       -2.26893


