# Kannada POS Tagger using Linear Chain CRF

We will now train and tag the words using the features extracted from Word2Vec in the previous step.

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

#### Loading the word features and labels

In [2]:
X = np.load("kannada-features.numpy").astype(np.float32)
Y = np.load("kannada-labels.numpy").astype(np.int32)

np.random.seed(546)
indices = np.arange(X.shape[0])
np.random.shuffle(indices)

X = X[indices]
Y = Y[indices]

**Each sentence has varying number of words. So, we need to determine sequence lengths before hand**

In [3]:
num_examples, num_words, num_features = X.shape
num_tags = np.unique(Y).size

sequence_lengths = np.full(num_examples, 0, dtype=np.int32)
for idx, row in enumerate(X):
    count = 0
    for word in row:
        if np.all(word == -1):
            break
        count += 1
    sequence_lengths[idx] = count

#### Splitting the data into Training and Test sets

In [4]:
split = 100
x_test = X[-split:,:,:]
y_test = Y[-split:,:]
s_test = sequence_lengths[-split:]

x = X[0:-split,:,:]
y = Y[0:-split,:]
sequence_lengths = sequence_lengths[0:-split]

#### Training on Linear Chain CRF

Tensorflow kinda makes it all easier

In [5]:
with tf.Graph().as_default():
    with tf.Session() as session:
        x_t = tf.constant(x)
        xt_t = tf.constant(x_test)
        y_t = tf.constant(y)
        yt_t = tf.constant(y_test)
        sequence_lengths_t = tf.constant(sequence_lengths)
        st_t = tf.constant(s_test)
        
        weights = tf.get_variable("weights", [num_features, num_tags])
        matricized_x_t = tf.reshape(x_t, [-1, num_features])
        matricized_unary_scores = tf.matmul(matricized_x_t, weights)
        unary_scores = tf.reshape(matricized_unary_scores, [num_examples-split, num_words, num_tags])
        
        matricized_xt_t = tf.reshape(xt_t, [-1, num_features])
        matricized_ust = tf.matmul(matricized_xt_t, weights)
        ust = tf.reshape(matricized_ust, [split, num_words, num_tags])
        
        log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(unary_scores, y_t, sequence_lengths_t)
        
        loss = tf.reduce_mean(-log_likelihood)
        train_op = tf.train.GradientDescentOptimizer(0.03).minimize(loss)
        
        session.run(tf.global_variables_initializer())
        for i in range(100):
            tf_ust, tf_unary_scores, tf_transition_params, _ = session.run([ust, unary_scores, transition_params, train_op])
            if i%5 == 0:
                correct_labels = 0
                total_labels = 0
                for tf_unary_scores_, y_, sequence_length_ in zip(tf_unary_scores, y, sequence_lengths):
                    tf_unary_scores_ = tf_unary_scores_[:sequence_length_]
                    y_ = y_[:sequence_length_]
                    
                    viterbi_sequence, _ = tf.contrib.crf.viterbi_decode(tf_unary_scores_, tf_transition_params)
                    
                    correct_labels += np.sum(np.equal(viterbi_sequence, y_))
                    total_labels += sequence_length_
                accuracy = 100.0 * correct_labels / float(total_labels)
                print "Classification Accuracy (Training set): ", accuracy
        correct_labels = 0
        total_labels = 0
        pred_labels = []
        actual_labels = []
        for a, b, c in zip(tf_ust, y_test, s_test):
            a = a[:c]
            b = b[:c]
            
            vs, _ = tf.contrib.crf.viterbi_decode(a, tf_transition_params)
            correct_labels += np.sum(np.equal(vs, b))
            total_labels += c
            
            actual_labels = actual_labels + b.tolist()
            pred_labels = pred_labels + vs
            
        accuracy = 100.0 * correct_labels / float(total_labels)
        print "-------------------------------------------------"
        print "Classification Accuracy (Test set): ", accuracy

Classification Accuracy (Training set):  5.61625455242
Classification Accuracy (Training set):  58.8269120184
Classification Accuracy (Training set):  61.5487828254
Classification Accuracy (Training set):  66.9541882308
Classification Accuracy (Training set):  67.0308606479
Classification Accuracy (Training set):  69.7335633506
Classification Accuracy (Training set):  70.1169254361
Classification Accuracy (Training set):  71.4970289438
Classification Accuracy (Training set):  71.7270461951
Classification Accuracy (Training set):  72.5321065747
Classification Accuracy (Training set):  73.1071497029
Classification Accuracy (Training set):  73.5671842055
Classification Accuracy (Training set):  73.8738738739
Classification Accuracy (Training set):  74.3339083765
Classification Accuracy (Training set):  74.583093732
Classification Accuracy (Training set):  74.6981023577
Classification Accuracy (Training set):  75.0239601303
Classification Accuracy (Training set):  75.2923135902
Classificat

### Model Evaluation - Classification Report

In [6]:
from sklearn.metrics import classification_report

target_names = np.array(['CC', 'DEM', 'DET', 'INJ', 'IRR', 'JJ', 'NN', 'NUM', 'PRP', 'PSP',
       'QC', 'RB', 'SYM', 'UT', 'VM', 'WQ'])

print classification_report(actual_labels, pred_labels, target_names=target_names[np.unique(actual_labels)].tolist())

             precision    recall  f1-score   support

         CC       0.90      0.60      0.72        15
        DEM       0.85      0.65      0.73        17
        DET       0.00      0.00      0.00        12
         JJ       0.14      0.08      0.10        49
         NN       0.60      0.78      0.68       306
        NUM       0.00      0.00      0.00         2
        PRP       0.74      0.58      0.65       110
        PSP       0.80      0.29      0.42        14
         QC       0.69      0.45      0.55        20
         RB       0.74      0.30      0.43        46
        SYM       0.99      0.99      0.99        83
         UT       1.00      0.89      0.94         9
         VM       0.64      0.72      0.67       190
         WQ       0.00      0.00      0.00         2

avg / total       0.65      0.66      0.64       875



  'precision', 'predicted', average, warn_for)
