In [None]:
!pip install conllu

### Parsing the data and creating the train, test, and dev sets

In [None]:
from conllu import parse_incr
from io import open
# Parse the training treebank into a list of sentences. The list is built
# inside the `with` block so the handle is still open while parse_incr iterates
# over it, and is closed as soon as parsing finishes (or fails).
with open('hi_hdtb-ud-train.conllu', 'r', encoding='utf-8') as file:
    ud_files = list(parse_incr(file))

In [None]:
def dataset(ud_files):
    """Turn parsed sentences into ``(tokens, tags)`` pairs.

    Args:
        ud_files: Iterable of sentences, each an iterable of token mappings
            that expose the keys ``'form'`` and ``'upostag'``.

    Returns:
        A list with one ``(tokens, tags)`` tuple per sentence, where ``tokens``
        holds every token's ``'form'`` and ``tags`` every token's
        ``'upostag'``, both in sentence order.
    """
    bank = []
    for sentence in ud_files:
        # Read both fields in a single pass over the sentence.
        form_tag_pairs = [(token['form'], token['upostag']) for token in sentence]
        forms = [form for form, _ in form_tag_pairs]
        pos_tags = [tag for _, tag in form_tag_pairs]
        bank.append((forms, pos_tags))
    return bank

In [None]:
# Convert the parsed training sentences into (tokens, tags) pairs.
train=dataset(ud_files)

In [None]:
def separate(bank):
    """Split a bank of paired entries into two parallel lists.

    Args:
        bank: Sequence of entries whose element 0 is the token list and whose
            element 1 is the matching tag list.

    Returns:
        A tuple ``(X, y)``: ``X`` collects element 0 of every entry and ``y``
        collects element 1, both in the order of ``bank``.
    """
    X = [entry[0] for entry in bank]
    y = [entry[1] for entry in bank]
    return X, y

In [None]:
# Split the training pairs into token sequences (X) and tag sequences (y).
X,y=separate(train)

In [None]:
# Parse the test treebank; the `with` block guarantees the handle is closed
# once all sentences have been read.
with open('hi_hdtb-ud-test.conllu', 'r', encoding='utf-8') as file:
    ud_files = list(parse_incr(file))
# Convert the parsed sentences into (tokens, tags) pairs.
test = dataset(ud_files)

In [None]:
# Parse the dev treebank; the `with` block guarantees the handle is closed
# once all sentences have been read.
with open('hi_hdtb-ud-dev.conllu', 'r', encoding='utf-8') as file:
    ud_files = list(parse_incr(file))
# Convert the parsed sentences into (tokens, tags) pairs.
dev = dataset(ud_files)

In [None]:
# Split the test pairs into token sequences and tag sequences.
Xtest,ytest=separate(test)

In [None]:
# Split the dev pairs into token sequences and tag sequences.
Xdev,ydev=separate(dev)

### Creating a function to extract features and applying it to the train, test, and dev sets

In [None]:
def extract_features(sentence, index):
    """Build the feature dictionary for one token of a sentence.

    Args:
        sentence: Sequence of token strings.
        index: Position in ``sentence`` of the token to describe.

    Returns:
        A dict with the token itself (``'word'``), position flags
        (``'is_first'``, ``'is_last'``), its leading 1-4 characters
        (``'prefix-1'`` .. ``'prefix-4'``), its trailing 1-4 characters
        (``'suffix-1'`` .. ``'suffix-4'``), the neighbouring tokens
        (``'prev_word'``, ``'next_word'``; ``''`` at the sentence edges) and
        the flags ``'has_hyphen'`` and ``'is_numeric'``.

    Raises:
        IndexError: If ``index`` is outside ``sentence``.
    """
    word = sentence[index]
    last_index = len(sentence) - 1
    return {
        'word': word,
        'is_first': index == 0,
        'is_last': index == last_index,
        # Slices rather than word[0] / word[-1], so an empty token yields ''
        # instead of raising IndexError.
        'prefix-1': word[:1],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        # The 4-character affixes get their own keys; reusing 'prefix-3' /
        # 'suffix-3' would overwrite the 3-character features in the dict.
        'prefix-4': word[:4],
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'suffix-4': word[-4:],
        'next_word': sentence[index + 1] if index < last_index else '',
        'prev_word': '' if index == 0 else sentence[index - 1],
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
    }

In [None]:
# One list of per-token feature dicts for every training sentence.
xtrain = [
    [extract_features(tokens, position) for position in range(len(tokens))]
    for tokens in X
]

In [None]:
# One list of per-token feature dicts for every test sentence.
xtest = [
    [extract_features(tokens, position) for position in range(len(tokens))]
    for tokens in Xtest
]

In [None]:
# One list of per-token feature dicts for every dev sentence.
xdev = [
    [extract_features(tokens, position) for position in range(len(tokens))]
    for tokens in Xdev
]

### Importing sklearn_crfsuite, then initializing and training the model

In [None]:
import warnings
warnings.filterwarnings('ignore')
!pip install sklearn_crfsuite
from sklearn_crfsuite import CRF


# Hyperparameters for the CRF tagger, kept in one place so they are easy to tune.
crf_settings = {
    'algorithm': 'lbfgs',              # L-BFGS training algorithm
    'c1': 0.20,                        # L1 regularization coefficient
    'c2': 0.3,                         # L2 regularization coefficient
    'max_iterations': 100,
    'all_possible_transitions': True,  # also model tag transitions unseen in training
}
hindi_crf = CRF(**crf_settings)

# Fit on the per-sentence feature lists and their gold tag sequences.
print("Started training ")
hindi_crf.fit(xtrain, y)
print("Finished training ")

In [None]:
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
print("##nltk##")
# Tag every test sentence with the trained CRF.
y_pred = hindi_crf.predict(xtest)
print("F1 score on Test Data")
print(metrics.flat_f1_score(ytest, y_pred,average='weighted',labels=hindi_crf.classes_))

# This presents class wise score. Helps see which classes (tags) are the ones with most problems.
print("Class wise score:")
print(metrics.flat_classification_report(
    ytest, y_pred, labels=hindi_crf.classes_, digits=3
))
from sklearn.metrics import accuracy_score
# ytest and y_pred are lists of per-sentence tag lists. sklearn's accuracy_score
# expects flat label arrays, so use the flattening variant to get token-level
# accuracy.
metrics.flat_accuracy_score(ytest, y_pred)

In [None]:
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
# Tag every dev sentence with the trained CRF. The fitted model is bound to
# `hindi_crf`; no `model` name is defined in this notebook.
y_pred = hindi_crf.predict(xdev)
print("F1 score on Dev Data")
print(metrics.flat_f1_score(ydev, y_pred,average='weighted',labels=hindi_crf.classes_))
# Token-level accuracy over all dev sentences.
print(metrics.flat_accuracy_score(ydev, y_pred))

### Testing the model on a sample sentence

In [None]:
sentence='पत्तेदार सब्जियां आपके स्वास्थ्य के लिए अच्छी होती हैं ।'
# Whitespace-tokenise the sentence and wrap it in a list, giving the same
# list-of-sentences shape as the other feature sets.
list1 = [sentence.split()]
# One list of per-token feature dicts for every sentence in list1.
xtesting = [
    [extract_features(words, position) for position in range(len(words))]
    for words in list1
]

In [None]:
# Predict the tag sequence for the sample sentence's features.
pred = hindi_crf.predict(xtesting)

In [None]:
# Show the predicted tags as the cell output.
pred