In [55]:
def readdata(filename,delimiter):
    """Read a delimited token file and group its rows into sentences.

    The first line of the file is skipped (it is treated as a header). Every
    remaining row is split on ``delimiter``; the second column, with any
    surrounding double quotes removed, is taken as the word and the third
    column as its tag. A row made of exactly two delimiters (three empty
    columns) or a blank/whitespace-only line closes the current sentence.

    Args:
        filename: Path of the UTF-8 encoded file to read.
        delimiter: Column separator used in the file (e.g. ',' or a tab).

    Returns:
        A list of sentences, each a non-empty list of ``(word, tag)`` tuples.
        Empty sentences (from a trailing or repeated separator row, or from an
        empty file) are not emitted.

    Raises:
        IndexError: If a token row has fewer than three columns.
    """
    separator = delimiter * 2
    sentences = []
    sentence = []
    with open(filename, "r", encoding='utf8') as f:
        next(f, None)  # skip the header line without loading the whole file
        for line in f:
            # Compare without the line terminator so that a separator on the
            # last line of a file lacking a final newline is still recognised
            # instead of being parsed as a token with an empty word and tag.
            row = line.rstrip('\r\n')
            if row == separator or not row.strip():
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            fields = row.strip().split(delimiter)
            sentence.append((fields[1].strip('\"'), fields[2]))
    # Flush the last sentence when the file does not end with a separator row.
    if sentence:
        sentences.append(sentence)
    return sentences


## Features Chosen:
**word** – the word itself <br />
**word.Lower()** - the word is reduced to lowercase <br />
**word.isTitle()** - Boolean True if only the first character is in uppercase and rest are lowercase<br />
**word.isUpper()** - Boolean True if all characters of string are uppercase<br />
**word.isDigit()** - Boolean True if all characters of string are Digits<br />
**Prefix-1** - word[0] first character of word<br />
**Prefix-2** - word[0:2] first 2 characters of word<br />
**Prefix-3** - word[0:3] first 3 characters of word<br />
**Suffix-1** - word[-1] last character of word<br />
**Suffix-2** - word[-2:] last 2 characters of word<br />
**Suffix-3** - word[-3:] last 3 characters of word<br />
**has_Hyphen** - Boolean True if word has hyphen in it<br />

**BOS** - If word is the Beginning of the Sentence<br />
**-1:word.Lower()** - previous word reduced to lowercase<br />
**-1:word.isTitle()** - Boolean True if only first character of previous word is uppercase and rest are lowercase<br />
**-1:word.isUpper()** - Boolean True if all characters of the previous word are uppercase<br />
**-1:postag** - pos tag of previous word<br />

**EOS** - If word is end of the sentence<br />
**+1:word.Lower()** - next word reduced to lowercase<br />
**+1:word.isTitle()** - Boolean True if only first character of the next word is in uppercase and rest in lowercase<br />
**+1:word.isUpper()** - Boolean True if all characters of next word are in uppercase<br />
**+1:postag** - pos tag of next word<br />

The word itself is useful in determining the POS tag. Checking whether the word is title-cased is useful because proper nouns are generally title-cased. isDigit() is used for identifying a number. Prefixes and suffixes of different lengths can carry information about the tense of the word. Information can also be obtained from the previous word and the next word in the sentence: if we know the POS tags of the previous and next words, the POS tag of the current word can be estimated to a certain extent. Note that these neighbouring tags are read directly from the annotated data. The BOS and EOS features are useful because sentences usually begin and end with certain tags.



In [56]:
def WordToFeatures(sentence, i):
    """Build the feature dictionary for the token at position ``i``.

    Args:
        sentence: Sequence of ``(word, pos_tag)`` pairs.
        i: Index of the token to describe.

    Returns:
        A dict mapping feature names to values. It always holds the features
        of the token itself, followed by either the ``-1:*`` features of the
        previous token or ``BOS``, and then either the ``+1:*`` features of
        the next token or ``EOS``.
    """
    token = sentence[i][0]
    size = len(token)

    # Features derived from the token itself.
    features = dict()
    features['word'] = token
    features['word.Lower()'] = token.lower()
    features['word.isTitle()'] = token.istitle()
    features['word.isUpper()'] = token.isupper()
    features['word.isDigit()'] = token.isdigit()
    # An affix of length k is only emitted when the token has at least k
    # characters; otherwise the feature is the empty string.
    features['Prefix-1'] = token[0] if size >= 1 else ''
    features['Prefix-2'] = token[:2] if size >= 2 else ''
    features['Prefix-3'] = token[:3] if size >= 3 else ''
    features['Suffix-1'] = token[-1] if size >= 1 else ''
    features['Suffix-2'] = token[-2:] if size >= 2 else ''
    features['Suffix-3'] = token[-3:] if size >= 3 else ''
    features['has_Hyphen'] = '-' in token

    # Left context: previous token's shape and tag, or the sentence-start flag.
    if i <= 0:
        features['BOS'] = True
    else:
        left = sentence[i - 1]
        left_word = left[0]
        left_tag = left[1]
        features['-1:word.Lower()'] = left_word.lower()
        features['-1:word.isTitle()'] = left_word.istitle()
        features['-1:word.isUpper()'] = left_word.isupper()
        features['-1:postag'] = left_tag

    # Right context: next token's shape and tag, or the sentence-end flag.
    if i + 1 >= len(sentence):
        features['EOS'] = True
    else:
        right = sentence[i + 1]
        right_word = right[0]
        right_tag = right[1]
        features['+1:word.Lower()'] = right_word.lower()
        features['+1:word.isTitle()'] = right_word.istitle()
        features['+1:word.isUpper()'] = right_word.isupper()
        features['+1:postag'] = right_tag

    return features



def SentenceToFeatures(sentence):
    """Return one feature dict per token of ``sentence``.

    Args:
        sentence: List of ``(word, pos_tag)`` tuples.

    Returns:
        A list of feature dicts, in token order, each produced by
        ``WordToFeatures`` for the corresponding position.
    """
    feature_rows = []
    for position in range(len(sentence)):
        feature_rows.append(WordToFeatures(sentence, position))
    return feature_rows


def SentenceToLabels(sentence):
    """Return the tag (second element) of every token in ``sentence``.

    Args:
        sentence: List of ``(word, pos_tag)`` tuples.

    Returns:
        A list with the second element of each tuple, in token order.
    """
    labels = []
    for token in sentence:
        labels.append(token[1])
    return labels

In [57]:
# Load both splits; the training file is read with ',' as the column
# delimiter and the test file with a tab.
Train_Set = readdata('hi-ud-train.conllu', ',')
Test_Set = readdata('hi-ud-test.conllu', '\t')

# Per-sentence feature dicts (X) and tag sequences (Y) for the training split.
X_Train = list(map(SentenceToFeatures, Train_Set))
Y_TrainTrue = list(map(SentenceToLabels, Train_Set))

# Same representation for the test split.
X_Test = list(map(SentenceToFeatures, Test_Set))
Y_TestTrue = list(map(SentenceToLabels, Test_Set))

In [58]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# Training settings passed to the CRF estimator as keyword arguments
# (see the sklearn_crfsuite.CRF documentation for the meaning of each option).
crf_settings = dict(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=300,
    all_possible_transitions=True,
)

Model = sklearn_crfsuite.CRF(**crf_settings)
# Kept as the last expression so the notebook displays the fitted estimator.
Model.fit(X_Train, Y_TrainTrue)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=300,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [59]:
print("                MODEL PREDICTION ON TRAINING DATA")
print('')

# Tag the sentences the model was trained on.
Y_TrainPredicted = Model.predict(X_Train)

# Per-tag report first, then the aggregate scores.
print(metrics.flat_classification_report(Y_TrainTrue, Y_TrainPredicted))

# These three scores are all computed with average='weighted'.
for metric_label, metric_fn in (
    ('precision: ', metrics.flat_precision_score),
    ('recall:    ', metrics.flat_recall_score),
    ('f1-score:  ', metrics.flat_f1_score),
):
    print(metric_label, metric_fn(Y_TrainTrue, Y_TrainPredicted, average = 'weighted'))
print('accuracy:  ',  metrics.flat_accuracy_score(Y_TrainTrue, Y_TrainPredicted))

                MODEL PREDICTION ON TRAINING DATA

              precision    recall  f1-score   support

         ADJ       1.00      1.00      1.00       570
         ADP       1.00      1.00      1.00      1387
         ADV       0.99      0.99      0.99       111
         AUX       0.99      1.00      0.99       730
       CCONJ       1.00      1.00      1.00       150
       COMMA       1.00      1.00      1.00       114
         DET       1.00      0.99      0.99       231
        NOUN       1.00      1.00      1.00      1597
         NUM       1.00      1.00      1.00       152
        PART       1.00      1.00      1.00       163
        PRON       1.00      1.00      1.00       431
       PROPN       1.00      1.00      1.00       708
       PUNCT       1.00      1.00      1.00       564
       SCONJ       0.98      1.00      0.99        61
        VERB       1.00      0.98      0.99       640
           X       1.00      1.00      1.00         2

   micro avg       1.00      

In [60]:
print("                   MODEL PREDICTION ON TESTING DATA")
print(" ")

# Tag the test-split sentences.
Y_TestPredicted = Model.predict(X_Test)

# Per-tag report first, then the aggregate scores.
print(metrics.flat_classification_report(Y_TestTrue, Y_TestPredicted))

# These three scores are all computed with average='weighted'.
for metric_label, metric_fn in (
    ('precision: ', metrics.flat_precision_score),
    ('recall:    ', metrics.flat_recall_score),
    ('f1-score:  ', metrics.flat_f1_score),
):
    print(metric_label, metric_fn(Y_TestTrue, Y_TestPredicted, average = 'weighted'))
print('accuracy:  ',  metrics.flat_accuracy_score(Y_TestTrue, Y_TestPredicted))

                   MODEL PREDICTION ON TESTING DATA
 
              precision    recall  f1-score   support

         ADJ       0.74      0.78      0.76        94
         ADP       0.96      0.98      0.97       309
         ADV       0.71      0.48      0.57        21
         AUX       0.96      0.97      0.97       139
       CCONJ       1.00      1.00      1.00        25
         DET       0.89      0.92      0.90        36
        NOUN       0.83      0.89      0.86       329
         NUM       1.00      0.92      0.96        25
        PART       1.00      0.97      0.98        33
        PRON       0.92      0.88      0.90        65
       PROPN       0.81      0.69      0.74       145
       PUNCT       1.00      1.00      1.00       135
       SCONJ       0.60      1.00      0.75         3
        VERB       0.90      0.88      0.89        99

   micro avg       0.90      0.90      0.90      1458
   macro avg       0.88      0.88      0.88      1458
weighted avg       0.90   

In [64]:
def printTransitions(transitions):
    """Print one ``tag1 => tag2 weight`` line per transition.

    Args:
        transitions: Iterable of ``(edge, weight)`` pairs, where ``edge[0]``
            and ``edge[1]`` are the source and destination tags and
            ``weight`` is a number (printed with five decimals).
    """
    row_format = "%-6s =>  %-9s %0.5f"
    for entry in transitions:
        edge, weight = entry
        source_tag = edge[0]
        target_tag = edge[1]
        print(row_format % (source_tag, target_tag, weight))

In [65]:
from collections import Counter

# Counter.most_common orders entries by value, so the first ten entries are
# the transitions with the highest weights and the last ten those with the
# lowest weights.
transition_weights = Counter(Model.transition_features_)

print("Top 10 Most Common POS Transition Features:")
print("")
highest_weighted = transition_weights.most_common(10)
printTransitions(highest_weighted)
print("\n")

print("Top 10 Least Common POS Transition Features:")
print("")
lowest_weighted = transition_weights.most_common()[-10:]
printTransitions(lowest_weighted)
print("\n")

Top 10 Most Common POS Transition Features:

VERB   =>  AUX       2.20021
AUX    =>  AUX       1.55228
NUM    =>  NOUN      1.47586
ADJ    =>  NOUN      1.37534
PROPN  =>  ADP       1.35694
PROPN  =>  PROPN     1.33343
NOUN   =>  ADP       1.28564
VERB   =>  SCONJ     1.25156
DET    =>  NOUN      1.22831
NOUN   =>  VERB      1.09714


Top 10 Least Common POS Transition Features:

AUX    =>  ADP       -1.06835
COMMA  =>  ADP       -1.08245
NUM    =>  PRON      -1.10012
PUNCT  =>  PUNCT     -1.10883
ADP    =>  COMMA     -1.22990
DET    =>  CCONJ     -1.33245
CCONJ  =>  AUX       -1.47313
ADJ    =>  PRON      -1.84110
ADJ    =>  ADP       -2.05786
DET    =>  ADP       -2.30424


