# This notebook contains several classification models used for morphological tagging of the Ukrainian language

Data used for creating morph tags dataset: https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-2837

For morphological tagging we can use any classification model. It is a supervised learning task in which we train a model to predict one of several morph tags.

In [1]:
import warnings
import numpy as np
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, Perceptron, PassiveAggressiveClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

from data_processing import process_data, create_language_voc, extract_morph_tags, get_dataset

### Preparing dataset

In [2]:
# Derive the (x, y) dataset from the processed data via the data_processing helpers.
processed_data = process_data()
words = create_language_voc(processed_data)
tags = extract_morph_tags(processed_data)
x, y = get_dataset(processed_data, words, tags)
del processed_data  # not needed once x and y exist

# Fixed seed: every model below must be scored on the same held-out third of
# the data, and a Restart & Run All must reproduce the split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=42
)
del x
del y

In [3]:
# Convert the training samples to a NumPy int8 array.
# NOTE(review): int8 only holds -128..127 — confirm get_dataset never emits larger values.
x_train = np.array(x_train, dtype=np.int8)

In [4]:
# Convert the test samples to a NumPy int8 array, matching the dtype used for x_train.
# NOTE(review): int8 only holds -128..127 — confirm get_dataset never emits larger values.
x_test = np.array(x_test, dtype=np.int8)

### Metrics for model comparison

In [6]:
def metrics(model, x, y):
    """Return the classification report of ``model`` on the given samples.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    x : samples handed to ``model.predict``.
    y : reference labels the predictions are compared against.

    Returns
    -------
    The value produced by ``classification_report(y, predictions)``.
    """
    return classification_report(y, model.predict(x))

### Logistic Regression

In [6]:
# Baseline: logistic regression with default hyper-parameters.
LR = LogisticRegression()
LR.fit(x_train, y_train)
# Score the estimator fitted in this cell. The original evaluated `clf`, which
# is only created by the grid-search cell below and is undefined on a fresh kernel.
print(metrics(LR, x_test, y_test))

0.8762189259717071

In [7]:
# Grid search (5-fold CV) over penalty type, regularisation strength C and the
# intercept term of a logistic regression.
# NOTE(review): "l1" requires a solver that supports it; recent scikit-learn
# defaults to lbfgs, which does not — confirm the solver for the target version.
LR = LogisticRegression()
parameters = {
    "penalty": ("l1", "l2"),
    "C": [0.1, 1, 2, 5, 10],
    # Real booleans: the strings 'True' and 'False' are both truthy, so the
    # "no intercept" variant was never actually tried (and newer scikit-learn
    # releases validate this parameter as a boolean).
    "fit_intercept": (True, False),
}
clf = GridSearchCV(LR, parameters, cv=5)
clf.fit(x_train, y_train)
print(metrics(clf, x_test, y_test))

             precision    recall  f1-score   support

        ADJ       0.96      0.72      0.82      2794
        ADP       1.00      0.99      0.99      2625
        ADV       0.93      0.82      0.87      1585
        AUX       0.76      1.00      0.86       249
      CCONJ       0.93      0.96      0.95      1159
        DET       0.97      0.99      0.98      1140
       INTJ       0.94      0.46      0.62        35
       NOUN       0.80      0.98      0.88      6847
        NUM       0.89      0.73      0.80       340
       PART       0.93      0.84      0.88       887
       PRON       0.99      0.96      0.97      1318
      PROPN       0.92      0.69      0.79       770
      PUNCT       1.00      1.00      1.00      5432
      SCONJ       0.84      0.93      0.89       611
        SYM       0.82      0.38      0.51        24
       VERB       0.92      0.81      0.86      3186
          X       0.90      0.50      0.64       111
          _       1.00      1.00      1.00   

In [12]:
# Drop the fitted grid-search object (presumably to free memory before the
# next models are trained). LR is not deleted here.
del clf

### Perceptron

In [6]:
# Perceptron with scikit-learn's default settings, fitted on the training
# split and reported on the test split.
per = Perceptron().fit(x_train, y_train)
print(metrics(model=per, x=x_test, y=y_test))

             precision    recall  f1-score   support

        ADJ       0.97      0.71      0.82      2792
        ADP       0.99      0.99      0.99      2599
        ADV       0.95      0.79      0.87      1576
        AUX       0.78      0.94      0.85       272
      CCONJ       0.90      0.96      0.93      1178
        DET       0.97      0.98      0.98      1112
       INTJ       0.43      0.59      0.50        27
       NOUN       0.75      0.99      0.85      6774
        NUM       0.89      0.73      0.80       342
       PART       0.96      0.72      0.82       920
       PRON       0.99      0.95      0.97      1317
      PROPN       0.94      0.69      0.79       781
      PUNCT       0.99      1.00      1.00      5519
      SCONJ       0.78      0.96      0.86       546
        SYM       0.89      0.77      0.83        22
       VERB       1.00      0.69      0.82      3226
          X       0.94      0.58      0.72       110
          _       0.16      1.00      0.28   

### PassiveAggressiveClassifier

In [7]:
# Passive-aggressive classifier with default hyper-parameters.
# The seed is pinned because the estimator shuffles the training data between
# epochs; without it each run reports different scores.
pac = PassiveAggressiveClassifier(random_state=42)
pac.fit(x_train, y_train)
print(metrics(pac, x_test, y_test))

             precision    recall  f1-score   support

        ADJ       0.98      0.70      0.82      2803
        ADP       1.00      0.99      0.99      2615
        ADV       0.76      0.86      0.81      1619
        AUX       0.77      0.93      0.84       263
      CCONJ       0.95      0.89      0.92      1160
        DET       0.98      0.97      0.97      1126
       INTJ       0.54      0.49      0.51        39
       NOUN       0.87      0.93      0.90      6758
        NUM       0.84      0.79      0.81       332
       PART       0.84      0.85      0.85       903
       PRON       0.99      0.95      0.97      1302
      PROPN       0.90      0.68      0.78       760
      PUNCT       1.00      1.00      1.00      5540
      SCONJ       0.83      0.90      0.86       567
        SYM       0.91      0.38      0.54        26
       VERB       0.78      0.88      0.83      3183
          X       0.95      0.55      0.69       110
          _       0.56      1.00      0.72   

### Support Vector Machine

In [None]:
# Support vector classifier with scikit-learn's default settings, fitted on
# the training split and reported on the test split.
svm = SVC().fit(x_train, y_train)
print(metrics(model=svm, x=x_test, y=y_test))

### Decision Tree

In [None]:
# Decision tree with default hyper-parameters.
# The seed is pinned because features are randomly permuted at each split, so
# an unseeded tree is not guaranteed to be identical between runs.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(x_train, y_train)
print(metrics(dtc, x_test, y_test))

### Neural Networks

In [None]:
# Multi-layer perceptron with default architecture and optimiser settings.
# The seed is pinned because weight initialisation and batch shuffling are
# random; without it the reported scores change from run to run.
mlpc = MLPClassifier(random_state=42)
mlpc.fit(x_train, y_train)
print(metrics(mlpc, x_test, y_test))