In [9]:
from codemixeddata import LoadDataSet
import posfeatures as pos
import lidffeatures as lidf
import chunkfeatures as chunk
from pipeline import *
from customutils import *
from sklearn_crfsuite import scorers, metrics, CRF
import numpy as np
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline
import pandas as pd

In [10]:
# Load the dataset: the columns named in `xlabels` are requested as inputs (X)
# and the 'CHUNK' column as the label (y).
# NOTE(review): exact shape/type of X and y depends on LoadDataSet — confirm.
X, y = LoadDataSet(
    xlabels=['WORD', 'POSITION', 'EPOS', 'EPOSSCORE', 'LANG', 'POS'],
    ylabel='CHUNK'
)

In [11]:
# Feature stack for the model that is later run as the "LANG" step.
# Each entry is a (name, transformer) pair handed to FeatureStacker.
lidf_features = FeatureStacker([
    ('lex', lidf.AddLexTransformer()),
    ('bnc_count', lidf.BNCCountsTransformer()),
    ('lex_norm', lidf.LexNormCountsTransformer()),
    ('in_bnc_or_lexnorm', lidf.LexNormBNCSpecialTransformer()),
    ('hindi_dict', lidf.HindiDictionaryCountsTransformer()),
    ('capitalization', lidf.CapitalizationTransformer()),
    ('cngram', lidf.CharacterNgramTransformer()),
    ('en_rest_pos', lidf.PoSTransformer()),
    ('en_rest_pos_confidence', lidf.PoSConfidenceTransformer()),
    # NOTE(review): this transformer comes from `pos`, not `lidf` — confirm intended.
    ('affixes', pos.AffixesTransformer()),
])

# Two-stage pipeline: the stacked features feed a CRF built with default arguments.
lidf_model = Pipeline(steps=[
    ('features', lidf_features),
    ('classifier', CRF()),
])

In [12]:
# Feature stack for the POS model; every transformer here comes from the
# `pos` feature module and is combined by FeatureStacker.
pos_features = FeatureStacker([
    ('lex', pos.AddLexTransformer()),
    ('predicted_lang', pos.PredictedLangTransformer()),
    ('en_rest_pos', pos.PoSTransformer()),
    ('en_rest_pos_confidence', pos.PoSConfidenceTransformer()),
    ('context', pos.ContextTransformer()),
    ('affixes', pos.AffixesTransformer()),
    ('en_clusters', pos.HWCTransformer()),
])

# Two-stage pipeline: the stacked features feed a CRF built with default arguments.
pos_model = Pipeline(steps=[
    ('features', pos_features),
    ('classifier', CRF()),
])

In [13]:
# Run the "LANG" step with lidf_model and rebind X to whatever run_step returns.
# NOTE(review): X is overwritten, so re-running this cell passes the previous
# result back into run_step; use Restart & Run All to reproduce from the loaded data.
X = run_step(X, lidf_model, "LANG")

In [14]:
# NOTE(review): the "POS" step is disabled; X goes on to the following cells without it.
#X = run_step(X, pos_model, "POS")

In [21]:
# Feature stack for the chunking model.
# Transformers currently switched off (kept here for reference):
#   ('en_rest_pos_confidence', pos.PoSConfidenceTransformer())
#   ('context', pos.ContextTransformer())
#   ('affixes', pos.AffixesTransformer())
#   ('en_clusters', pos.HWCTransformer())
#   ('pos_context', chunk.POSContextTransformer())
chunk_features = FeatureStacker([
    ('lex', pos.AddLexTransformer()),
    ('predicted_lang', pos.PredictedLangTransformer()),
    ('en_rest_pos', pos.PoSTransformer()),
    ('predicted_pos', chunk.PoSTransformer()),
    ('lex__predicted_pos', chunk.LexPoSTransformer()),
])

# Two-stage pipeline: the stacked features feed a CRF built with default arguments.
chunk_model = Pipeline(steps=[
    ('features', chunk_features),
    ('classifier', CRF()),
])
# Disabled parameter override; it targets the 'affixes' feature, which is switched off above:
# chunk_model.set_params(features__affixes__prefix_len=2, features__affixes__suffix_len=2)

In [22]:
# Cross-validate the chunking pipeline with cv=10, scoring each fold with
# sklearn_crfsuite's flat-accuracy scorer and using all cores (n_jobs=-1).
# The scorer is passed by keyword on purpose: the 4th positional parameter is
# `scoring` in sklearn.cross_validation but `groups` in sklearn.model_selection,
# so the positional form changes meaning (or fails) if the import is migrated.
chunk_acc = cross_val_score(chunk_model, X, y, scoring=scorers.flat_accuracy, cv=10, n_jobs=-1)

Accuracy: {{'%.2f' % (chunk_acc.mean() * 100)}}%