In [26]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite
import random

In [27]:
def load_data(files):
    data, sent = [], []
    for file in files:
        with open(file, 'r') as rf:
            for line in rf:
                if line.strip() != '':
                    # Note: the shared corpus is already tokenized
                    sent.append(line.strip().split('\t'))
                else:
                    if len(sent) > 0:
                        data.append(sent)
                        sent = []
    return data

sents = load_data(['FB_HI_EN_CR.txt', 'TWT_HI_EN_CR.txt', 'WA_HI_EN_CR.txt',"WA_TE_EN_CR.txt", "FB_BN_EN_CR.txt","FB_TE_EN_CR.txt" ,"TWT_BN_EN_CR.txt" ,"TWT_TE_EN_CR.txt" ,"WA_BN_EN_CR.txt" ])

In [28]:
random.seed(42)
random.shuffle(sents)
train_sents = sents[:int(0.8*len(sents))]
valid_sents = sents[int(0.8*len(sents)):]
print("# Train sentences: %d" % (len(train_sents)))
print("# Validation sentences: %d" % (len(valid_sents)))

# Train sentences: 4186
# Validation sentences: 1047


In [29]:
print(train_sents[0])

[['@nandi_harika', 'univ', '@'], ['elanti', 'te', 'G_R'], ['innovative', 'en', 'G_J'], ['ideas', 'univ', 'G_N'], ['ala', 'te', 'G_X'], ['vastai', 'en', 'G_X'], ['asalu', 'te', 'G_X'], ['nvu', 'univ', 'G_X'], ['super', 'en', 'G_R'], ['machi', 'univ', 'G_X'], [':)', 'univ', 'E']]


In [30]:
def word2features(sent, k):
    word = sent[k][0]
    lang = sent[k][1]
    wordClean = ''.join([ch for ch in word if ch in 'asdfghjklqwertyuiopzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM']).lower()
    normalizedWord = wordClean.lower()
    anyCap = any(char.isupper() for char in word)
    allCap = all(char.isupper() for char in word)
    hasSpecial = any(ord(char) > 32 and ord(char) < 65 for char in word)
    hashTag = word[0] == '#'
    mention = word[0] == '@'
    features = [
        'token=%s' % (word),
        'lang=%s' % (lang),                                              
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'word.lower.normalized='+ normalizedWord,
        'anyCap=%s' %anyCap,
        'allCap=%s' %allCap,
        'hasSpecial=%s' %hasSpecial,
        'hasTag=%s' % hashTag,
        'mention=%s'% mention,
        'vowelPercentage=%s' %vowelPercentage(word),
    ]
    for i in range(1,10):
        # if the value of n is greater than the word length, we exit the loop
        if i > len(word):
            break
        character_features = [word[j:j+i] for j in range(len(word)-i+1)]
        features.extend([
            # is count of individual n-grams important? is the order important?
            "char-%d-gram=%s" % (i, ' '.join(list(set(character_features))))
        ])
    if k == 0:
            
        # first word in the sentence
        features.append('BOS')
    else:
        if k == len(sent)-1:
            features.extend(["-1:word=%s" % (sent[k-1][0])])
            features.extend(["-1:lang=%s" % (sent[k-1][1])])
            features.extend(["-1:tag=%s" % (sent[k-1][2])])
            features.extend(['previous_word.lower=' + sent[k-1][0].lower()])
            features.extend(['word-1.isupper=%s' % sent[k-1][0].isupper()])
            features.extend(['word-1.istitle=%s' % sent[k-1][0].istitle()])
            features.extend(['word-1.isdigit=%s' % sent[k-1][0].isdigit()])
            features.extend(['word-1.vowel=%s' % vowelPercentage(sent[k-1][0])])      
        else:
            features.extend(['word+1.isupper=%s' % sent[k+1][0].isupper()])
            features.extend(['ahead_word.lower=' + sent[k+1][0].lower()])
            features.extend(['word+1.istitle=%s' % sent[k+1][0].istitle()])
            features.extend(['word+1.isdigit=%s' % sent[k+1][0].isdigit()])
            features.extend(['word+1.vowel=%s' % vowelPercentage(sent[k+1][0])])
            features.extend(['word-1.vowel=%s' % vowelPercentage(sent[k-1][0])])
            features.extend(['word-1.isupper=%s' % sent[k-1][0].isupper()])
            features.extend(['previous_word.lower=' + sent[k-1][0].lower()])
            features.extend(['word-1.istitle=%s' % sent[k-1][0].istitle()])
            features.extend(['word-1.isdigit=%s' % sent[k-1][0].isdigit()])
            features.extend(["+1:word=%s" % (sent[k+1][0])])
            features.extend(["+1:lang=%s" % (sent[k+1][1])])
            features.extend(["+1:tag=%s" % (sent[k+1][2])])
            features.extend(["-1:tag=%s" % (sent[k-1][2])])
            features.extend(["-1:lang=%s" % (sent[k-1][1])])
            features.extend(["-1:word=%s" % (sent[k-1][0])])           
            
    if i == len(sent):
        # last word in the sentence         
        features.append('EOS')
 
    return features
        
def sent2features(sent):
    # generating features for all the words/tokens in a sentence `sent`    
    return [word2features(sent, i) for i in range(len(sent))]

def sent2langs(sent):
    return [language_label for token, language_label, pos_tag in sent]

def sent2pos(sent):
    return [pos_tag for token, language_label, pos_tag in sent]

def sent2tokens(sent):
    return [token for token, language_label, pos_tag in sent]

def vowelPercentage(s):
    vowels = "aeiou"
    count = 0.
    for char in s:
        if char in vowels:
            count += 1
    return (count/len(s))

In [31]:
%%time
X_train = [sent2features(sent) for sent in train_sents]
# for training a pos-tagging system
y_train = [sent2pos(sent) for sent in train_sents]

X_test = [sent2features(sent) for sent in valid_sents]
y_test = [sent2pos(sent) for sent in valid_sents]

CPU times: user 2.24 s, sys: 144 ms, total: 2.39 s
Wall time: 2.39 s


In [32]:
print(X_train[0])

[['token=@nandi_harika', 'lang=univ', 'word.isupper=False', 'word.istitle=False', 'word.isdigit=False', 'word.lower.normalized=nandiharika', 'anyCap=False', 'allCap=False', 'hasSpecial=True', 'hasTag=False', 'mention=True', 'vowelPercentage=0.38461538461538464', 'char-1-gram=h r a _ k d @ n i', 'char-2-gram=ha _h di ri @n an na nd i_ ka ar ik', 'char-3-gram=ndi _ha nan and rik i_h ika di_ ari @na har', 'char-4-gram=rika hari ndi_ @nan andi i_ha arik _har nand di_h', 'char-5-gram=_hari di_ha @nand andi_ harik arika nandi i_har ndi_h', 'char-6-gram=i_hari @nandi harika andi_h nandi_ di_har ndi_ha _harik', 'char-7-gram=ndi_har di_hari nandi_h i_harik @nandi_ _harika andi_ha', 'char-8-gram=andi_har di_harik nandi_ha ndi_hari i_harika @nandi_h', 'char-9-gram=andi_hari ndi_harik @nandi_ha nandi_har di_harika', 'BOS'], ['token=elanti', 'lang=te', 'word.isupper=False', 'word.istitle=False', 'word.isdigit=False', 'word.lower.normalized=elanti', 'anyCap=False', 'allCap=False', 'hasSpecial=False'

In [33]:
%%time
trainer = pycrfsuite.Trainer(verbose=False)
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

CPU times: user 1.69 s, sys: 0 ns, total: 1.69 s
Wall time: 1.69 s


In [34]:
trainer.set_params({
    'c1': 1e-4,   # coefficient for L1 penalty
    'c2': 0.5,  # coefficient for L2 penalty
    'max_iterations': 300,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [35]:
trainer.params()

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

In [36]:
%%time
trainer.train('POS_model')

CPU times: user 1min 6s, sys: 44 ms, total: 1min 6s
Wall time: 1min 6s


In [37]:
tagger = pycrfsuite.Tagger()
tagger.open('POS_model')

<contextlib.closing at 0x7f49de7efe10>

In [38]:
example_sent = valid_sents[5]
print(' '.join(sent2tokens(example_sent)), end='\n\n')

print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent))))
print("Correct:  ", ' '.join(sent2pos(example_sent)))

@ANI_news tumhari party hamesha note hi toh leti hai , bahut saare note . 💵

Predicted: @ G_N G_N G_N G_N G_N G_PRP G_V G_V G_X G_N G_N G_N G_X G_X
Correct:   @ G_PRP G_N G_R G_N G_PRT CC G_N G_V G_X G_SYM G_SYM G_N G_X G_X


In [39]:
y_pred = [tagger.tag(xseq) for xseq in X_test]

In [40]:
def unique(list1): 
  
    # intilize a null list 
    unique_list = [] 
      
    # traverse for all elements 
    for x in list1: 
        for y in x:
            if y not in unique_list: 
                unique_list.append(y) 
    # print list 
    #for x in unique_list: 
       # print(x)
    return unique_list

In [41]:
labels = {}
for i,j in enumerate(unique(y_train)):
    labels.update({j:i})
labels

{'@': 0,
 'G_R': 1,
 'G_J': 2,
 'G_N': 3,
 'G_X': 4,
 'E': 5,
 'G_V': 6,
 '$': 7,
 'G_PRT': 8,
 'G_PRP': 9,
 'CC': 10,
 'PSP': 11,
 'DT': 12,
 'G_SYM': 13,
 '#': 14,
 'U': 15,
 '~': 16,
 'null': 17}

In [42]:
uniq = unique(y_train)
uniq

['@',
 'G_R',
 'G_J',
 'G_N',
 'G_X',
 'E',
 'G_V',
 '$',
 'G_PRT',
 'G_PRP',
 'CC',
 'PSP',
 'DT',
 'G_SYM',
 '#',
 'U',
 '~',
 'null']

In [43]:
import numpy as np
# Convert the sequences of tags into a 1-dimensional array
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

# Print out the classification report
print(classification_report(
    truths, predictions,
    target_names=uniq))

              precision    recall  f1-score   support

           @       0.95      1.00      0.98       442
         G_R       0.84      0.67      0.75       501
         G_J       0.81      0.63      0.71       854
         G_N       0.75      0.89      0.82      5276
         G_X       0.87      0.86      0.87      2824
           E       0.96      0.86      0.91       175
         G_V       0.80      0.73      0.76      2296
           $       0.75      0.69      0.72       164
       G_PRT       0.69      0.52      0.60       494
       G_PRP       0.82      0.76      0.79      1175
          CC       0.77      0.68      0.72       314
         PSP       0.76      0.73      0.74       938
          DT       0.86      0.86      0.86       497
       G_SYM       0.74      0.42      0.54       140
           #       0.87      0.94      0.90       175
           U       0.95      0.84      0.89       133
           ~       0.75      0.60      0.67        20
        null       0.00    

In [44]:
predict = np.array([tag for row in y_pred for tag in row])
true = np.array([tag for row in y_test for tag in row])

In [45]:
count = 0
accuracy = 0
for i in range(len(predict)):
    if predict[i] == true[i]:
        count+=1
accuracy = count/len(true)
accuracy

0.8012018938934078

In [46]:
print(confusion_matrix(true, predict ,labels=uniq))

[[ 441    0    0    1    0    0    0    0    0    0    0    0    0    0
     0    0    0    0]
 [   0  338   16   76    8    0   27    0   10    4    0   13    3    4
     2    0    0    0]
 [   1   19  534  237    4    0   41    3    3    4    1    2    5    0
     0    0    0    0]
 [  15    7   62 4695  144    2  231    1   23   39    0   27   13    7
     6    3    0    1]
 [   1    4    5  276 2428    5   30    1   10   27    0   21    7    0
     1    1    4    3]
 [   0    1    0    6   13  151    1    1    0    1    0    0    1    0
     0    0    0    0]
 [   1   11   15  490   58    0 1680    0   11   13    0   14    1    0
     2    0    0    0]
 [   0    0    0   16   14    0    1  113    1    0    0    1    0    1
    13    0    0    4]
 [   1    7    4   98   20    0   23    0  258   14   10   51    2    3
     1    1    0    1]
 [   0    6    6  129   29    0   35    0   19  895    3   23   26    3
     0    1    0    0]
 [   1    0    1   16    6    0    5    0    9    