In [1]:
import warnings
warnings.filterwarnings('ignore')  # "error", "ignore", "always", "default", "module" or "once"

import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

import pandas as pd
import scipy.stats

import sklearn
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score, RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import CRF
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

from collections import Counter

# Preprocessing
The format of file is not compatible with pyconllu reader library, thus we adopt a different implementation to read the file and parse it: 
- Open the file
- Read lines and break whenever a NULL tag is encountered
- We use `strip()` to remove all unnecessary spaces from words and tags
- We use `split()` to parse and divide lines w.r.t the given splitter :
    - In case of Train data splitter = ','
    - In case of Test data splitter = '\t'
 


In [2]:
def preproc(fname, fg):
    set_tags, tags = list(), list()
    set_sents, sents = list(), list()
    fp = open(fname, encoding="utf-8")
    lines = fp.readlines() # reading data rowise
    
    for line in lines[1:]: # iterate over all rows
        if fg == 1: # in case of test data
            index, word, tag = line.split("\t") # split on tabs
            index = index.strip()
            word = word.strip()
            tag = tag.strip()
        if fg == 0: # in case of test data
            index, word, tag = line.strip().split(',') # split on commas
        if len(tag) > 0:
            if tag == "COMMA":
                sents.append(",")
                tags.append("COMMA")
            else:
                sents.append(word)
                tags.append(tag)
        else: # if a newline is encountered
            set_sents.append(sents) # append current list to main list
            set_tags.append(tags)
            sents, tags = list(), list() # reinitialize
    return set_sents, set_tags

In [3]:
# build train set and test set using preproc functions
train_sents, train_tags = preproc("hi-ud-train.conllu", 0)
test_sents, test_tags = preproc("hi-ud-test.conllu", 1)

In [4]:
# print some data 
print("# train_sents : ", len(train_sents))
print(pd.DataFrame(list(zip(train_sents[5], train_tags[5]))))

print("\n\n# test_sents : ", len(test_sents))
print(pd.DataFrame(list(zip(test_sents[5], test_tags[5])) ) )


# train_sents :  499
            0      1
0       yahAz   PRON
1      lagane   VERB
2        vAlA    ADP
3        wIna    NUM
4        xina   NOUN
5          kA    ADP
6      ijwimA   NOUN
7        pUre    ADJ
8        xeSa   NOUN
9          ke    ADP
10      logoM   NOUN
11         ko    ADP
12  AmaMwriwa    ADJ
13     karawA   VERB
14         hE    AUX
15          .  PUNCT


# test_sents :  99
                  0      1
0              1876  PROPN
1               meM    ADP
2              yaha    DET
3             sWAna   NOUN
4               eka    NUM
5              bAra   NOUN
6              Pira    ADV
7           prakASa   NOUN
8               meM    ADP
9               AyA   VERB
10              ","  PUNCT
11             jaba   PRON
12        wawkAlIna    ADJ
13   purAwawvavewwA   NOUN
14           lOYrda   NOUN
15       karniMGama  PROPN
16               ne    ADP
17  mahAparinirvANa  PROPN
18            mUrwi  PROPN
19               kI    ADP
20             Koja   NOUN
21     

# Feature Extraction 

| S. No. | Feature | Convinience | Implementation |
|--------|---------|-------------|----------------|
|2| LowerCase| Reduces the ambiguity of case in words | `word.lower()`|
|3| IsUpperCase| Check if the word is all upper case, can check for emphasis in words | `word.isupper()`|
|4| IsTittleCase| Check if the word is all upper case, can check for emphasis in words | `word.istitle()`|
|5| IsDigit| Filtering digits adds in details to words and reduces chances of certain POS numerics can't be assigned | `word.isdigit()`|
|6| Suffix[-3:]| Extracting last 3 letters as suffix as most hindi suffixes involve 2 or 3 letters | `word[-3:]`|       
|7| Suffix[-2:]| -do- | `word[-2:]`|
|8| Prefix[3:]| Extracting first 3 letters as prefix as most hindi prefixes involve 2 or 3 letters | `word[:3]`|        
|9| Prefix[2:]| -do- | `word[:2]`|
|10| Stem | Extracting Stem from the word helps in removing common prefixes and suffixes to get root form of words reducing possible vocabulary | `ps.stem(word)`|
|11| Lemma | Extracting Lemma from the word helps in  reducing possible vocabulary size and reduces ambiguity among words with same base meaning | `ws.lemmatize(word)`|
|12| -1_word | Previous word improves results by introducing context | `sent[i-1]`|
|13| -1_word_Lowercase | -do- | `sent[i-1].lower()`|
|14| -1_word_istitlecase |-do-| `sent[i-1].istitle()`|
|15| -1_word_isuppercase|-do-| `sent[i-1].isupper()`|
|16| -1_word_Stem |-do-|`ps.stem(sent[i-1])`|
|17| -1_word_Lemma |-do-|`ws.lemmatize(sent[i-1])` |
|18| START |  True if the word is first word of sentence or begining of sentence | |
|19| +1_word | Next word improves results by introducing context | `sent[i+1]`|
|20| +1_word_Lowercase | -do- | `sent[i+1].lower()`|
|21| +1_word_istitlecase |-do-| `sent[i+1].istitle()`|
|22| +1_word_isuppercase|-do-| `sent[i+1].isupper()`|
|23| +1_word_Stem |-do-|`ps.stem(sent[i+1])`|
|24| +1_word_Lemma |-do-|`ws.lemmatize(sent[i+1])` |
|25| END |  True if the word is last word of sentence or emd of sentence | |
|26| -2_word|Next word improves results by introducing context | `sent[i-2]`|
|27| NextToStart | True if the word is second word of sentence | |
|28| -2_word|Next word improves results by introducing context | `sent[i+2]`|
|29| PrevToEnd | True if the word is second last word of sentence | |
   

In [5]:
ps = PorterStemmer()
ws = WordNetLemmatizer()

def extractFeatures(sent, tags, i):
    word = sent[i]
    postag = tags[i]

    features = {
        'bias': 1.0,
        'LowerCase': word.lower(),
        'IsUpperCase': word.isupper(),
        'IsTittleCase': word.istitle(),
        'IsDigit': word.isdigit(),
        'Suffix[-3:]': word[-3:],
        'Suffix[-2:]': word[-2:],
        'Prefix[3:]': word[:3],
        'Prefix[2:]': word[:2],
        'Stem': ps.stem(word),
        'Lemma' : ws.lemmatize(word),
    }
    if i > 0:
        features.update({
            '-1_word' : sent[i-1],
#             '-1_word_isdigit()': sent[i-1].isdigit(),
            '-1_word_Lowercase': sent[i-1].lower(),
            '-1_word_istitlecase': sent[i-1].istitle(),
            '-1_word_isuppercase': sent[i-1].isupper(),
            '-1_word_Stem': ps.stem(sent[i-1]),
            '-1_word_Lemma' : ws.lemmatize(sent[i-1]),
        })
    else:
        features['START'] = True

    if i < len(sent)-1:
        features.update({
            '+1:word' : sent[i+1],
#             '+1_word_isdigit()': sent[i+1].isdigit(),
            '+1:word_Lower': sent[i+1].lower(),
            '+1:word_istitlecase': sent[i+1].istitle(),
            '+1:word_isuppercase': sent[i+1].isupper(),
            '+1_word_Stem': ps.stem(sent[i+1]),
            '+1_word_Lemma' : ws.lemmatize(sent[i+1]),
        })
    else:
        features['END'] = True
    
    if i > 1:
        features.update({
            '-2_word' : sent[i-2],
#             '-2_word_isdigit()': sent[i-2].isdigit(),
#             '-2_word_islower()': sent[i-2].lower(),
#             '-2_word_istitle()': sent[i-2].istitle(),
#             '-2_word_isupper()': sent[i-2].isupper(),
#             '-2_word_Stem': ps.stem(sent[i-2]),
#             '-2_word_Lemma' : ws.lemmatize(sent[i-2]),
        })
    else:
        features['NextToStart'] = True

    if i < len(sent)-2:
        features.update({
            '+2:word' : sent[i+2],
# #             '+2_word_isdigit()': sent[i+1].isdigit(),
#             '+2:word.lower()': sent[i+2].lower(),
#             '+2:word.istitle()': sent[i+2].istitle(),
#             '+2:word.isupper()': sent[i+2].isupper(),
#             '+2_word_Stem': ps.stem(sent[i+2]),
#             '+2_word_Lemma' : ws.lemmatize(sent[i+2]),
        })
    else:
        features['PrevToEnd'] = True
    
    return features

In [6]:
def sentFeat(sent, tags):
    return [extractFeatures(sent, tags, i) for i in range(len(sent))]

In [7]:
X_train = [sentFeat(train_sents[i], train_tags[i]) for i in range(len(train_sents))]
y_train = train_tags #[sent2labels(s) for s in training_sentences]

X_test = [sentFeat(test_sents[i], test_tags[i]) for i in range(len(test_sents))]
y_test = test_tags #[sent2labels(s) for s in training_sentences]


In [8]:
# print pre processed data
print(len(X_train))           
print(pd.DataFrame(X_train[5]))
print(y_train[5], "\n")

print(len(X_test))           
print(pd.DataFrame(X_test[5]))
print(y_test[5])

499
    bias  LowerCase  IsUpperCase  IsTittleCase  IsDigit Suffix[-3:]  \
0    1.0      yahaz        False         False    False         hAz   
1    1.0     lagane        False         False    False         ane   
2    1.0       vala        False         False    False         AlA   
3    1.0       wina        False         False    False         Ina   
4    1.0       xina        False         False    False         ina   
5    1.0         ka        False         False    False          kA   
6    1.0     ijwima        False         False    False         imA   
7    1.0       pure        False         False    False         Ure   
8    1.0       xesa        False         False    False         eSa   
9    1.0         ke        False         False    False          ke   
10   1.0      logom        False         False    False         goM   
11   1.0         ko        False         False    False          ko   
12   1.0  amamwriwa        False         False    False         iwa   
13

In [9]:
# train with out hyper parameter optimization
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train) # fit the model


CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [10]:
labels = list(crf.classes_) # extract labels from crf model 
print(labels)

['DET', 'PROPN', 'ADP', 'ADV', 'ADJ', 'NOUN', 'NUM', 'AUX', 'PUNCT', 'PRON', 'VERB', 'CCONJ', 'PART', 'COMMA', 'SCONJ', 'X']


In [11]:
y_pred_t = crf.predict(X_train)
print("Accuracy, F1 Score on Train Set with out hyper parameter optimization: ", metrics.flat_accuracy_score(y_train, y_pred_t), metrics.flat_f1_score(y_train, y_pred_t, average='weighted', labels=labels))

y_pred = crf.predict(X_test)
print("Accuracy, F1 Score on Test Set with out hyper parameter optimization: ", metrics.flat_accuracy_score(y_test, y_pred), metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels))

Accuracy, F1 Score on Train Set with out hyper parameter optimization:  0.9996051592524349 0.9996051006702492
Accuracy, F1 Score on Test Set with out hyper parameter optimization:  0.8518005540166205 0.8511909604809106


In [12]:
# prediction, recall, f1 results for both test and train sets
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print("Metrics for Train Set :\n", metrics.flat_classification_report(
    y_train, y_pred_t, labels=sorted_labels, digits=3
))

sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print("\n\nMetrics for Test Set :\n", metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

Metrics for Train Set :
               precision    recall  f1-score   support

           X      1.000     1.000     1.000         2
        PART      1.000     1.000     1.000       163
       CCONJ      1.000     1.000     1.000       150
       SCONJ      1.000     1.000     1.000        61
         ADJ      1.000     1.000     1.000       569
         ADP      1.000     1.000     1.000      1384
         ADV      1.000     1.000     1.000       110
        VERB      1.000     0.995     0.998       639
         DET      1.000     1.000     1.000       230
       COMMA      1.000     1.000     1.000       114
        NOUN      1.000     1.000     1.000      1596
        PRON      1.000     1.000     1.000       430
       PROPN      1.000     1.000     1.000       707
         NUM      1.000     1.000     1.000       152
       PUNCT      1.000     1.000     1.000       563
         AUX      0.996     1.000     0.998       728

    accuracy                          1.000      7598
 

In [13]:
def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions with out hyper parameter optimization:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions with out hyper parameter optimization:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions with out hyper parameter optimization:
VERB   -> AUX     4.064907
PROPN  -> PROPN   2.651279
ADJ    -> NOUN    2.504953
NUM    -> NOUN    2.080024
PROPN  -> ADP     2.045232
AUX    -> AUX     2.043706
DET    -> NOUN    1.876709
PRON   -> ADP     1.678233
NOUN   -> ADP     1.675150
AUX    -> SCONJ   1.665517
PART   -> NUM     1.510606
VERB   -> SCONJ   1.371483
PROPN  -> PUNCT   1.356431
DET    -> ADJ     1.319981
PROPN  -> CCONJ   1.249946
ADV    -> VERB    1.171623
CCONJ  -> PROPN   1.112683
NOUN   -> CCONJ   1.025188
NUM    -> NUM     0.998060
ADJ    -> VERB    0.986983

Top unlikely transitions with out hyper parameter optimization:
NUM    -> PROPN   -0.758209
PROPN  -> PART    -0.760037
VERB   -> NUM     -0.787313
AUX    -> ADP     -0.787440
DET    -> CCONJ   -0.792342
VERB   -> ADJ     -0.813883
ADP    -> AUX     -0.830124
VERB   -> VERB    -0.832582
PRON   -> CCONJ   -0.865954
NUM    -> PRON    -0.868445
ADP    -> CCONJ   -0.905165
PROPN  -> AUX     -0.9072

In [14]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive features with out hyper parameter optimization:")
print_state_features(Counter(crf.state_features_).most_common(10))

print("\nTop negative features with out hyper parameter optimization:")
print_state_features(Counter(crf.state_features_).most_common()[-10:])

Top positive features with out hyper parameter optimization:
4.627470 NUM      IsDigit
3.543670 ADJ      Suffix[-3:]:iwa
3.257322 VERB     Suffix[-2:]:ne
3.056400 NOUN     Suffix[-2:]:oM
2.796039 PRON     Prefix[3:]:apa
2.752691 PRON     Prefix[3:]:Apa
2.730343 PRON     Prefix[2:]:Ap
2.597030 PRON     Prefix[2:]:is
2.513943 PRON     Prefix[2:]:ap
2.497892 NOUN     bias

Top negative features with out hyper parameter optimization:
-1.126591 ADP      -2_word:nihArane
-1.276119 NOUN     Prefix[2:]:ra
-1.283902 X        bias
-1.287043 CCONJ    +1:word_isuppercase
-1.321439 AUX      -2_word:xeKane
-1.332726 NOUN     Suffix[-3:]:Ina
-1.346224 PROPN    IsTittleCase
-1.449749 AUX      -2_word:jagaha
-1.455890 PROPN    Suffix[-2:]:oM
-1.555633 NOUN     IsDigit


In [15]:
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# randomized search for two hyperparameters
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  4.3min finished


RandomizedSearchCV(cv=3,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True,
                                 keep_tempfiles=None, max_iterations=100),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x20100A70>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x20100BD0>},
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['DET', 'PROPN', 'ADP', 'ADV', 'ADJ', 'NOUN', 'NUM', 'AUX', 'PUNCT', 'PRON', 'VERB', 'CCONJ', 'PART', 'COMMA', 'SCONJ', 'X']),
                   verbose=1)

In [16]:
warnings.filterwarnings('ignore')
y_pred_t = rs.predict(X_train)
print("Accuracy, F1 Score on Train Set with Hyperparameter Optimization: ", metrics.flat_accuracy_score(y_train, y_pred_t), metrics.flat_f1_score(y_train, y_pred_t, average='weighted', labels=labels))
warnings.filterwarnings('ignore')
y_pred = rs.predict(X_test)
print("Accuracy, F1 Score on Test Set with Hyperparameter Optimization: ", metrics.flat_accuracy_score(y_test, y_pred), metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels))

Accuracy, F1 Score on Train Set with Hyperparameter Optimization:  0.9998683864174783 0.9998683800510951
Accuracy, F1 Score on Test Set with Hyperparameter Optimization:  0.8587257617728532 0.857706394995462


In [23]:
# prediction, recall, f1 results
sorted_labels = sorted(
    list(rs.classes_),
    key=lambda name: (name[1:], name[0])
)
print("Metrics for Train Set with Hyperparameter Optimization:\n", metrics.flat_classification_report(
    y_train, y_pred_t, labels=sorted_labels, digits=3
))

sorted_labels = sorted(
    list(rs.classes_),
    key=lambda name: (name[1:], name[0])
)
print("\n\nMetrics for Test Set with Hyperparameter Optimization :\n", metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))


Metrics for Train Set with Hyperparameter Optimization:
               precision    recall  f1-score   support

           X      1.000     1.000     1.000         2
        PART      1.000     1.000     1.000       163
       CCONJ      1.000     1.000     1.000       150
       SCONJ      1.000     1.000     1.000        61
         ADJ      1.000     1.000     1.000       569
         ADP      1.000     1.000     1.000      1384
         ADV      1.000     1.000     1.000       110
        VERB      1.000     0.998     0.999       639
         DET      1.000     1.000     1.000       230
       COMMA      1.000     1.000     1.000       114
        NOUN      1.000     1.000     1.000      1596
        PRON      1.000     1.000     1.000       430
       PROPN      1.000     1.000     1.000       707
         NUM      1.000     1.000     1.000       152
       PUNCT      1.000     1.000     1.000       563
         AUX      0.999     1.000     0.999       728

    accuracy           