# Part-of-Speech tagging of Code-Mixed Social Media Text


A program to tag parts of speech on code-mixed data from the http://www.amitavadas.com/Code-Mixing.html competition, which consisted of Bengali-English, Hindi-English and Telugu-English code-mixed texts.
We extracted n-gram features, neighboring words and language labels to train three CRF models (one per language pair), achieving F1 scores of 0.79, 0.79 and 0.71 for Bengali, Hindi and Telugu respectively.

##########################################

Task at hand : POS tagging on Code Mixed Text 


Input Data : Text with Language and POS tags 

##########################################

Features used : character 1 - 5 grams + Language + Next word + Previous word + Beginning of Sentence + End of Sentence

Model Selected : Conditional Random Fields (CRF)

Parameters
    
    Algorithm : L-BFGS - Gradient descent using the L-BFGS method
    'c1': 0.09
    'c2': 0.50
    
##########################################

Accuracy Measure 

F1_score 

    BENGALI - ENGLISH
    Micro : 0.79
    Macro : 0.68
    Weighted : 0.78
    
    HINDI - ENGLISH
    Micro : 0.79
    Macro : 0.75
    Weighted : 0.79
    
    TELUGU - ENGLISH
    Micro : 0.71
    Macro : 0.63
    Weighted : 0.71

## Importing libraries

In [1]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite
import random

## Loading data

In [2]:
def load_data(files):
    """Read tab-separated, blank-line-delimited token files into sentences.

    Every non-blank line is stripped and split on tabs into one token
    record; a blank line closes the sentence being collected.

    Args:
        files: iterable of file paths, read in the given order.

    Returns:
        A list of sentences. Each sentence is a non-empty list of token
        records, and each record is the list of tab-separated fields of
        one input line.
    """
    data = []
    for path in files:
        sent = []
        # Decode explicitly instead of depending on the platform's default encoding.
        with open(path, 'r', encoding='utf-8') as rf:
            for line in rf:
                stripped = line.strip()
                if stripped:
                    # Note: the shared corpus is already tokenized
                    sent.append(stripped.split('\t'))
                elif sent:
                    data.append(sent)
                    sent = []
        # A file that does not end with a blank line still holds a pending
        # sentence; flush it here so it is neither lost nor merged into the
        # first sentence of the next file.
        if sent:
            data.append(sent)
    return data

# One corpus per language pair, each assembled from its FB_, TWT_ and WA_
# files (read in that order).
sents_BN = load_data([
    'data/FB_BN_EN_CR.txt',
    'data/TWT_BN_EN_CR.txt',
    'data/WA_BN_EN_CR.txt',
])
sents_HI = load_data([
    'data/FB_HI_EN_CR.txt',
    'data/TWT_HI_EN_CR.txt',
    'data/WA_HI_EN_CR.txt',
])
sents_TE = load_data([
    'data/FB_TE_EN_CR.txt',
    'data/TWT_TE_EN_CR.txt',
    'data/WA_TE_EN_CR.txt',
])

## Train - Test data split

In [3]:
# Fixed seed so the in-place shuffles (and therefore the split) are repeatable.
random.seed(7)
for _corpus in (sents_BN, sents_HI, sents_TE):
    random.shuffle(_corpus)

# The first 80% of each shuffled corpus is used for training, the rest for validation.
_cut_BN = int(0.8 * len(sents_BN))
train_sents_BN, valid_sents_BN = sents_BN[:_cut_BN], sents_BN[_cut_BN:]

_cut_HI = int(0.8 * len(sents_HI))
train_sents_HI, valid_sents_HI = sents_HI[:_cut_HI], sents_HI[_cut_HI:]

_cut_TE = int(0.8 * len(sents_TE))
train_sents_TE, valid_sents_TE = sents_TE[:_cut_TE], sents_TE[_cut_TE:]

print("# Train sentences for BENGALI-ENGLISH: %d" % len(train_sents_BN))
print("# Validation sentences for BENGALI-ENGLISH: %d" % len(valid_sents_BN))
print("# Train sentences for HINDI-ENGLISH: %d" % len(train_sents_HI))
print("# Validation sentences for HINDI-ENGLISH: %d" % len(valid_sents_HI))
print("# Train sentences for TELUGU-ENGLISH: %d" % len(train_sents_TE))
print("# Validation sentences for TELUGU-ENGLISH: %d" % len(valid_sents_TE))

# Train sentences for BENGALI-ENGLISH: 499
# Validation sentences for BENGALI-ENGLISH: 125
# Train sentences for HINDI-ENGLISH: 2104
# Validation sentences for HINDI-ENGLISH: 526
# Train sentences for TELUGU-ENGLISH: 1583
# Validation sentences for TELUGU-ENGLISH: 396


## Feature Extraction

N - gram vectors : 1 - 5

Language extraction

End of Sentence

Beginning of Sentence

Next word

Previous word

In [4]:
def word2langs(sent):
    """Return the language label of each (token, language, POS tag) record in `sent`."""
    langs = []
    for _token, lang, _pos in sent:
        langs.append(lang)
    return langs

def word2features(sent, k):
    """Build the CRF feature strings for the token at position `k` of `sent`.

    Args:
        sent: sequence of token records; index 0 of a record is the token
            text and index 1 is its language label.
        k: position in `sent` of the token to describe.

    Returns:
        A list of feature strings, in this order: the token itself, its
        language label, one feature per n-gram size (n = 1..5, never larger
        than the word length) listing the distinct character n-grams of the
        token, then the previous token (or 'BOS' for the first token) and
        the next token (or 'EOS' for the last token).
    """
    word = sent[k][0]
    lang = sent[k][1]
    features = ['token=%s' % (word), lang]

    # extracting n-grams, for n=1 to 5; n never exceeds the word length
    for n in range(1, min(5, len(word)) + 1):
        ngrams = {word[j:j + n] for j in range(len(word) - n + 1)}
        # Sort the distinct n-grams before joining. Iteration order of a set
        # of strings can differ between interpreter runs (hash
        # randomization), so an unsorted join could give the same word
        # different feature strings when training and when tagging.
        features.append("char-%d-gram=%s" % (n, ' '.join(sorted(ngrams))))

    if k == 0:
        # first word in the sentence
        features.append('BOS')
    else:
        features.append("-1:word=%s" % (sent[k - 1][0]))

    if k == len(sent) - 1:
        # last word in the sentence
        features.append('EOS')
    else:
        features.append("+1:word=%s" % (sent[k + 1][0]))

    return features
        
def sent2features(sent):
    """Return the feature list of every token position in `sent`, in sentence order."""
    return [word2features(sent, position) for position, _ in enumerate(sent)]



def sent2pos(sent):
    """Return the POS tag of each (token, language, POS tag) record in `sent`."""
    tags = []
    for _token, _lang, tag in sent:
        tags.append(tag)
    return tags

def sent2tokens(sent):
    """Return the token text of each (token, language, POS tag) record in `sent`."""
    tokens = []
    for text, _lang, _pos in sent:
        tokens.append(text)
    return tokens

In [5]:
# For each language pair: X_* holds one feature sequence per sentence and
# y_* the matching POS-tag sequence, for the train and validation splits.

# BENGALI-ENGLISH
X_train_BN = list(map(sent2features, train_sents_BN))
y_train_BN = list(map(sent2pos, train_sents_BN))

X_test_BN = list(map(sent2features, valid_sents_BN))
y_test_BN = list(map(sent2pos, valid_sents_BN))


# HINDI-ENGLISH
X_train_HI = list(map(sent2features, train_sents_HI))
y_train_HI = list(map(sent2pos, train_sents_HI))

X_test_HI = list(map(sent2features, valid_sents_HI))
y_test_HI = list(map(sent2pos, valid_sents_HI))


# TELUGU-ENGLISH
X_train_TE = list(map(sent2features, train_sents_TE))
y_train_TE = list(map(sent2pos, train_sents_TE))

X_test_TE = list(map(sent2features, valid_sents_TE))
y_test_TE = list(map(sent2pos, valid_sents_TE))

## Model Training

### Bengali-English

In [7]:
# BENGALI-ENGLISH: CRF trainer using the L-BFGS algorithm
trainer_BN = pycrfsuite.Trainer(algorithm='lbfgs', verbose=False)

# Feed the training split one sentence at a time (feature sequence + tag sequence)
for sentence_features, sentence_tags in zip(X_train_BN, y_train_BN):
    trainer_BN.append(sentence_features, sentence_tags)

trainer_BN.set_params({
    'c1': 0.09,  # coefficient for L1 penalty
    'c2': 0.50,  # coefficient for L2 penalty
    'max_iterations': 1000,  # cap on the number of training iterations
    'linesearch': 'StrongBacktracking',
    'num_memories': 15,
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True,
})

# Show the parameter names the trainer exposes, then train into 'BN.crfsuite'
display(trainer_BN.params())

trainer_BN.train('BN.crfsuite')


['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

### Hindi-English

In [9]:
# HINDI-ENGLISH: CRF trainer using the L-BFGS algorithm
trainer_HI = pycrfsuite.Trainer(algorithm='lbfgs', verbose=False)

# Feed the training split one sentence at a time (feature sequence + tag sequence)
for sentence_features, sentence_tags in zip(X_train_HI, y_train_HI):
    trainer_HI.append(sentence_features, sentence_tags)

trainer_HI.set_params({
    'c1': 0.09,  # coefficient for L1 penalty
    'c2': 0.50,  # coefficient for L2 penalty
    'max_iterations': 1000,  # cap on the number of training iterations
    'linesearch': 'StrongBacktracking',
    'num_memories': 15,
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True,
})

# Show the parameter names the trainer exposes, then train into 'HI.crfsuite'
display(trainer_HI.params())

trainer_HI.train('HI.crfsuite')

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

### Telugu-English

In [10]:
# TELUGU-ENGLISH: CRF trainer using the L-BFGS algorithm
trainer_TE = pycrfsuite.Trainer(algorithm='lbfgs', verbose=False)

# Feed the training split one sentence at a time (feature sequence + tag sequence)
for sentence_features, sentence_tags in zip(X_train_TE, y_train_TE):
    trainer_TE.append(sentence_features, sentence_tags)

trainer_TE.set_params({
    'c1': 0.09,  # coefficient for L1 penalty
    'c2': 0.50,  # coefficient for L2 penalty
    'max_iterations': 1000,  # cap on the number of training iterations
    'linesearch': 'StrongBacktracking',
    'num_memories': 15,
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True,
})

# Show the parameter names the trainer exposes, then train into 'TE.crfsuite'
display(trainer_TE.params())

trainer_TE.train('TE.crfsuite')

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

## Model Evaluation

### Bengali-English

In [8]:
# Open the Bengali-English model file ('BN.crfsuite') for tagging.
# NOTE: the name `tagger` is rebound by each language's evaluation cell.
tagger = pycrfsuite.Tagger()
tagger.open('BN.crfsuite')

def bio_classification_report(y_true, y_pred):
    """
    Token-level classification report for lists of tag sequences.

    The per-sentence tag sequences are flattened, one-hot encoded with a
    LabelBinarizer and passed to scikit-learn's ``classification_report``.
    A label named "O", if present, is left out of the report.

    The encoder is fitted on the gold AND the predicted tags. Fitting on
    the gold tags alone would encode a tag that is predicted but never
    occurs in ``y_true`` as an all-zero row, so its false positives would
    go uncounted.

    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!

    Args:
        y_true: iterable of gold tag sequences, one per sentence.
        y_pred: iterable of predicted tag sequences, aligned with ``y_true``.

    Returns:
        Whatever ``classification_report`` returns for the encoded labels
        (printed as a text table by the callers in this file).
    """
    gold_labels = list(chain.from_iterable(y_true))
    predicted_labels = list(chain.from_iterable(y_pred))

    lb = LabelBinarizer()
    lb.fit(gold_labels + predicted_labels)
    y_true_combined = lb.transform(gold_labels)
    y_pred_combined = lb.transform(predicted_labels)

    tagset = set(lb.classes_) - {'O'}
    # Order by the part after the first '-' and then by the prefix; a tag
    # without '-' is ordered by its own text.
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )


# Tag every Bengali-English validation sentence, then print the per-tag scores.
y_pred_BN = list(map(tagger.tag, X_test_BN))

print(bio_classification_report(y_test_BN, y_pred_BN))

              precision    recall  f1-score   support

           #       0.83      0.91      0.87        33
           $       1.00      0.43      0.60        21
           @       0.94      0.83      0.88        41
          CC       0.93      0.76      0.84        68
          DT       1.00      0.70      0.82        33
           E       0.69      0.50      0.58        18
         G_J       0.84      0.54      0.66       137
         G_N       0.72      0.92      0.81      1072
       G_PRP       0.82      0.82      0.82       286
       G_PRT       0.74      0.49      0.59       102
         G_R       0.73      0.43      0.54        89
       G_SYM       0.64      0.47      0.54        60
         G_V       0.78      0.64      0.70       424
         G_X       0.98      0.89      0.93       463
         PSP       0.76      0.74      0.75       207
           U       0.00      0.00      0.00         2

   micro avg       0.79      0.79      0.79      3056
   macro avg       0.77   

  'precision', 'predicted', average, warn_for)


### Hindi-English

In [11]:
# Open the Hindi-English model file ('HI.crfsuite') for tagging.
# NOTE: the name `tagger` is rebound by each language's evaluation cell.
tagger = pycrfsuite.Tagger()
tagger.open('HI.crfsuite')

def bio_classification_report(y_true, y_pred):
    """
    Token-level classification report for lists of tag sequences.

    The per-sentence tag sequences are flattened, one-hot encoded with a
    LabelBinarizer and passed to scikit-learn's ``classification_report``.
    A label named "O", if present, is left out of the report.

    The encoder is fitted on the gold AND the predicted tags. Fitting on
    the gold tags alone would encode a tag that is predicted but never
    occurs in ``y_true`` as an all-zero row, so its false positives would
    go uncounted.

    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!

    Args:
        y_true: iterable of gold tag sequences, one per sentence.
        y_pred: iterable of predicted tag sequences, aligned with ``y_true``.

    Returns:
        Whatever ``classification_report`` returns for the encoded labels
        (printed as a text table by the callers in this file).
    """
    gold_labels = list(chain.from_iterable(y_true))
    predicted_labels = list(chain.from_iterable(y_pred))

    lb = LabelBinarizer()
    lb.fit(gold_labels + predicted_labels)
    y_true_combined = lb.transform(gold_labels)
    y_pred_combined = lb.transform(predicted_labels)

    tagset = set(lb.classes_) - {'O'}
    # Order by the part after the first '-' and then by the prefix; a tag
    # without '-' is ordered by its own text.
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )


# Tag every Hindi-English validation sentence, then print the per-tag scores.
y_pred_HI = list(map(tagger.tag, X_test_HI))

print(bio_classification_report(y_test_HI, y_pred_HI))

              precision    recall  f1-score   support

           #       0.85      0.64      0.73       121
           $       0.82      0.53      0.64        70
           @       0.70      0.95      0.80       222
          CC       0.71      0.54      0.62       136
          DT       0.92      0.86      0.88       269
           E       0.78      0.78      0.78        58
         G_J       0.74      0.52      0.61       382
         G_N       0.74      0.90      0.81      2421
       G_PRP       0.85      0.80      0.82       604
       G_PRT       0.65      0.52      0.58       242
         G_R       0.89      0.74      0.81       265
       G_SYM       0.74      0.53      0.62        60
         G_V       0.83      0.75      0.79      1271
         G_X       0.90      0.85      0.88      1185
         PSP       0.80      0.70      0.75       475
           U       0.71      0.72      0.71        57
           ~       1.00      1.00      1.00        10

   micro avg       0.79   

### Telugu-English

In [12]:
# Open the Telugu-English model file ('TE.crfsuite') for tagging.
# NOTE: the name `tagger` is rebound by each language's evaluation cell.
tagger = pycrfsuite.Tagger()
tagger.open('TE.crfsuite')

def bio_classification_report(y_true, y_pred):
    """
    Token-level classification report for lists of tag sequences.

    The per-sentence tag sequences are flattened, one-hot encoded with a
    LabelBinarizer and passed to scikit-learn's ``classification_report``.
    A label named "O", if present, is left out of the report.

    The encoder is fitted on the gold AND the predicted tags. Fitting on
    the gold tags alone would encode a tag that is predicted but never
    occurs in ``y_true`` as an all-zero row, so its false positives would
    go uncounted.

    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!

    Args:
        y_true: iterable of gold tag sequences, one per sentence.
        y_pred: iterable of predicted tag sequences, aligned with ``y_true``.

    Returns:
        Whatever ``classification_report`` returns for the encoded labels
        (printed as a text table by the callers in this file).
    """
    gold_labels = list(chain.from_iterable(y_true))
    predicted_labels = list(chain.from_iterable(y_pred))

    lb = LabelBinarizer()
    lb.fit(gold_labels + predicted_labels)
    y_true_combined = lb.transform(gold_labels)
    y_pred_combined = lb.transform(predicted_labels)

    tagset = set(lb.classes_) - {'O'}
    # Order by the part after the first '-' and then by the prefix; a tag
    # without '-' is ordered by its own text.
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )


# Tag every Telugu-English validation sentence, then print the per-tag scores.
y_pred_TE = list(map(tagger.tag, X_test_TE))

print(bio_classification_report(y_test_TE, y_pred_TE))

              precision    recall  f1-score   support

           #       0.82      0.56      0.67        48
           $       0.82      0.46      0.58        68
           @       0.91      0.89      0.90       246
          CC       0.94      0.87      0.90        68
          DT       0.90      0.77      0.83       123
           E       0.93      0.80      0.86        82
         G_J       0.85      0.65      0.73       271
         G_N       0.70      0.82      0.76      2148
       G_PRP       0.70      0.49      0.57       221
       G_PRT       0.76      0.55      0.64       140
         G_R       0.81      0.66      0.73       118
       G_SYM       0.00      0.00      0.00        12
         G_V       0.75      0.51      0.61       534
         G_X       0.61      0.70      0.65      1147
         PSP       0.75      0.62      0.68       203
           U       0.91      0.97      0.94        70
        null       0.08      0.02      0.04        44
           ~       0.67    