In [17]:
from codemixeddata import *
from customclassifiers import *
from preprocessor import *
import posfeatures as pos
import lidffeatures as lidf
from pipeline import *
from customutils import *
from sklearn_crfsuite import scorers, metrics, CRF
import numpy as np
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
import pandas as pd
from sklearn.base import BaseEstimator
import random

In [5]:
# Build the dataset-with-features file, asking for the POS tagger to be re-run
# (rerun_pos_tagger=True).
# NOTE(review): presumably this produces the data later read by
# LoadDataSetWFeatures -- confirm in the module that defines this helper.
create_dataset_with_features_file(rerun_pos_tagger=True)

In [None]:
# Load the data for chunking: WORD, LANG, NORM and POS are requested as inputs
# (xlabels) and CHUNK as the target (ylabel).
X, y = LoadDataSet(
    xlabels=['WORD', 'LANG', 'NORM', 'POS'],
    ylabel='CHUNK',
)

In [None]:
# Run a NormEstimator(script=1, rebuild=True) step over X under the 'NORM' label
# and rebind X to whatever run_step returns.
# NOTE(review): presumably this overwrites the NORM field with predicted
# normalizations -- confirm in `run_step` / `NormEstimator`.
# X is rebound from itself, so re-running this cell without reloading X applies
# the step to its own previous output.
X = run_step(X, NormEstimator(script=1, rebuild=True), 'NORM')

### Shallow parser accuracy with gold POS tags, gold normalization, and gold language IDs

In [None]:
# Fresh load for the shallow-parsing experiment. Rebinding X and y here replaces
# any X modified by earlier cells (e.g. the NormEstimator step), so the features
# used below are exactly what LoadDataSet returns.
X, y = LoadDataSet(
    xlabels=['WORD', 'LANG', 'NORM', 'POS'],
    ylabel='CHUNK',
)

In [None]:
# 10-fold cross-validation of a default CRF on the chunking data, scored with
# sklearn-crfsuite's flat accuracy scorer; all CPU cores are used (n_jobs=-1).
# `scoring` is passed by keyword on purpose: the fourth positional slot is
# `scoring` only in the legacy sklearn.cross_validation API. In
# sklearn.model_selection.cross_val_score that slot is `groups`, so a positional
# scorer would be mis-bound (or rejected) after the import is migrated.
results = cross_val_score(
    CRF(),
    X,
    y,
    scoring=scorers.flat_accuracy,
    cv=10,
    n_jobs=-1,
)

Accuracy: {{'%.2f' % (results.mean() * 100)}}%

## Language Identification

In [18]:
# Load the feature-augmented data for language identification: WORD, EPOS,
# EPOSSCORE and POSITION are requested as inputs, LANG as the target.
X, y = LoadDataSetWFeatures(
    xlabels=['WORD', 'EPOS', 'EPOSSCORE', 'POSITION'],
    ylabel='LANG',
)

In [19]:
# Language-identification feature set. Each entry is a (name, transformer) pair
# handed to FeatureStacker; the names are the handles used for nested
# `set_params` keys (e.g. features__cngram__...).
# Entries that are commented out are feature groups left out of this run.
features = FeatureStacker(
    [
        # ('lex', lidf.AddLexTransformer()),
        # ('position', lidf.AddPositionTransformer()),
        ('bnc_count', lidf.BNCCountsTransformer()),
        ('lex_norm', lidf.LexNormCountsTransformer()),
        ('in_bnc_or_lexnorm', lidf.LexNormBNCSpecialTransformer()),
        ('hindi_dict', lidf.HindiDictionaryCountsTransformer()),
        # ('word_length', lidf.WordLengthTransformer()),
        ('capitalization', lidf.CapitalizationTransformer()),
        ('cngram', lidf.CharacterNgramTransformer()),
        # ('en_rest_pos', lidf.PoSTransformer()),
        # ('en_rest_pos_confidence', lidf.PoSConfidenceTransformer()),
        ('affixes', lidf.AffixesTransformer()),
    ]
)

# Feature extraction followed by a CRF with default settings.
model = Pipeline(
    steps=[
        ('features', features),
        ('classifier', CRF()),
    ]
)

# Disabled override of the character n-gram transformer's `indices` parameter:
# model.set_params(features__cngram__indices=range(1, 4))

In [8]:
# 10-fold cross-validation of the language-identification pipeline, scored with
# sklearn-crfsuite's flat accuracy scorer; all CPU cores are used (n_jobs=-1).
# `scoring` is passed by keyword on purpose: the fourth positional slot is
# `scoring` only in the legacy sklearn.cross_validation API. In
# sklearn.model_selection.cross_val_score that slot is `groups`, so a positional
# scorer would be mis-bound (or rejected) after the import is migrated.
lidf_acc = cross_val_score(
    model,
    X,
    y,
    scoring=scorers.flat_accuracy,
    cv=10,
    n_jobs=-1,
)

Accuracy: {{'%.2f' % (lidf_acc.mean() * 100)}}%

## POS tagger

In [14]:
# Load the feature-augmented data for POS tagging; POS is the target. The input
# fields include both HPOS_GOLD and HPOS_PRED.
X, y = LoadDataSetWFeatures(
    xlabels=[
        'WORD',
        'POSITION',
        'EPOS',
        'EPOSSCORE',
        'LANG',
        'NORM',
        'HPOS_GOLD',
        'HPOS_PRED',
    ],
    ylabel='POS',
)

In [16]:
# Pass the loaded data through PruneSentencesPreprocessor and rebind X and y to
# the pair it returns.
# NOTE(review): presumably this drops some sentences from both X and y --
# confirm the pruning criterion where the preprocessor is defined.
# X and y are rebound from themselves, so re-running this cell prunes the
# already-pruned data.
X, y = PruneSentencesPreprocessor(X, y)

In [11]:
# Run a NormEstimator(script=1, rebuild=True) step over X under the 'NORM' label
# and rebind X to whatever run_step returns.
# NOTE(review): presumably this overwrites the NORM field with predicted
# normalizations -- confirm in `run_step` / `NormEstimator`.
# NOTE(review): in the saved notebook this cell's execution count (11) is lower
# than that of the load cell above (14); use Restart & Run All to be sure the
# reported POS accuracy actually includes this step.
X = run_step(X, NormEstimator(script=1, rebuild=True), 'NORM')

In [15]:
# POS-tagging feature set. Each entry is a (name, transformer) pair handed to
# FeatureStacker; the names are the handles used in the nested `set_params`
# keys at the bottom of this cell.
# The commented-out entry is a feature group left out of this run.
features = FeatureStacker(
    [
        ('lex', pos.AddLexTransformer()),
        ('gold_lang', pos.GoldLangTransformer()),
        ('en_rest_pos', pos.EnRestPoSTransformer()),
        # ('combined_pos', pos.CombinedPoSTransformer()),
        ('hi_pos', pos.HiPoSTransformer()),
        ('normlex', pos.NormLexTransformer()),
        ('en_rest_pos_confidence', pos.PoSConfidenceTransformer()),
        ('affixes', pos.AffixesTransformer()),
        ('en_clusters', pos.HWCTransformer()),
    ]
)

# Feature extraction followed by a CRF with default settings.
model = Pipeline(
    steps=[
        ('features', features),
        ('classifier', CRF()),
    ]
)

# Configure two feature transformers through the pipeline: `form=1` on the
# 'affixes' step and `gold=True` on the 'hi_pos' step. This stays the last
# expression of the cell so the configured pipeline is displayed as output.
model.set_params(
    features__affixes__form=1,
    features__hi_pos__gold=True,
)

Pipeline(steps=[('features', FeatureStacker(transformer_list=[('lex', AddLexTransformer()), ('gold_lang', GoldLangTransformer()), ('en_rest_pos', EnRestPoSTransformer(ignore=False)), ('hi_pos', HiPoSTransformer(gold=True, ignore=False)), ('normlex', NormLexTransformer(ignore=False)), ('en_rest_pos_confidence...
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False))])

In [16]:
# 10-fold cross-validation of the POS-tagging pipeline, scored with
# sklearn-crfsuite's flat accuracy scorer; all CPU cores are used (n_jobs=-1).
# `scoring` is passed by keyword on purpose: the fourth positional slot is
# `scoring` only in the legacy sklearn.cross_validation API. In
# sklearn.model_selection.cross_val_score that slot is `groups`, so a positional
# scorer would be mis-bound (or rejected) after the import is migrated.
pos_acc = cross_val_score(
    model,
    X,
    y,
    scoring=scorers.flat_accuracy,
    cv=10,
    n_jobs=-1,
)

Accuracy: {{'%.2f' % (pos_acc.mean() * 100)}}%