In [6]:
'''
Runner file for the pipeline
'''
from codemixeddata import *
from customclassifiers import *
from preprocessor import *
from customutils import *
from estimators import *
from pipeline import CMSTPipeline

import posfeatures as pos
import lidffeatures as lidf
import chunkfeatures as chunk

from sklearn_crfsuite import scorers, metrics, CRF

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.cross_validation import cross_val_score, train_test_split, cross_val_predict, KFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import make_scorer, accuracy_score
from collections import namedtuple
from sklearn.externals import joblib

from subprocess import check_output

In [11]:
def _crf_pipeline(features):
    '''Build a two-step Pipeline: the given feature extractor followed by a default CRF.'''
    return Pipeline([('features', features), ('classifier', CRF())])


# ---- Language identification (LANG) -----------------------------------------
lidf_features = FeatureStacker(
    [
        ('bnc_count', lidf.BNCCountsTransformer()),
        ('lex_norm', lidf.LexNormCountsTransformer()),
        ('in_bnc_or_lexnorm', lidf.LexNormBNCSpecialTransformer()),
        ('hindi_dict', lidf.HindiDictionaryCountsTransformer()),
        ('capitalization', lidf.CapitalizationTransformer()),
        ('cngram', lidf.CharacterNgramTransformer()),
        ('affixes', lidf.AffixesTransformer()),
    ]
)
lidf_model = _crf_pipeline(lidf_features)

# ---- Part-of-speech tagging (POS) -------------------------------------------
# Currently disabled extractors (kept here for reference):
#   ('lex', pos.AddLexTransformer())
#   ('computed_pos_confidence', pos.PoSConfidenceTransformer())
#   ('en_clusters', pos.HWCTransformer())
pos_features = FeatureStacker(
    [
        ('normlex', pos.NormLexTransformer()),
        ('lang', pos.AddLangTransformer()),
        ('computed_pos', pos.ComputedPOSTransformer()),
        ('affixes', pos.AffixesTransformer()),
        ('context', pos.ContextTransformer()),
    ]
)
pos_model = _crf_pipeline(pos_features)
# Alternative tried earlier: only features__affixes__strategy='all_norm'.
pos_model.set_params(**{
    'features__affixes__strategy': 'all_norm',
    'features__computed_pos__strategy': 'combined',
})

# ---- Chunking (CHUNK) -------------------------------------------------------
# Currently disabled extractors (kept here for reference):
#   ('lex', pos.AddLexTransformer())
#   ('lang', pos.AddLangTransformer())
#   ('context', pos.ContextTransformer())
#   ('en_clusters', pos.HWCTransformer())
chunk_features = FeatureStacker(
    [
        ('affixes', pos.AffixesTransformer()),
        ('pos', chunk.PoSTransformer()),
        ('pos_context', chunk.POSContextTransformer()),
        ('lex__predicted_pos', chunk.LexPoSTransformer()),
        ('norm_lex', pos.NormLexTransformer()),
    ]
)
chunk_model = _crf_pipeline(chunk_features)
chunk_model.set_params(**{'features__affixes__strategy': 'all_norm'})

# ---- Normalisation (NORM) and Hindi POS (HPOS) ------------------------------
# The normaliser is used bare; an earlier variant wrapped
# NormEstimator(rebuild=True) in a single-step Pipeline.
norm_model = NormEstimator(rebuild=False)
hpos_model = Pipeline([('classifier', HPOSEstimator())])

# (name, model) record used for the entries handed to CMSTPipeline.
step = namedtuple('step', 'name model')


In [13]:

# Load the data set; LoadDataSetWFeatures2 is not defined in this notebook and
# presumably comes from one of the star imports in the first cell.
data = LoadDataSetWFeatures2()
# Multi-stage pipeline with only the LANG step enabled (the other steps are
# commented out). NOTE(review): `model` is built here but never fitted in this cell.
model = CMSTPipeline([
        step('LANG', lidf_model),
#        step('NORM', norm_model),
#        step('HPOS', hpos_model),
#        step('POS', pos_model),
#        step('CHUNK', chunk_model),
])

# Split `data` into inputs X and targets y on the 'CHUNK' column
# (presumably y holds the chunk labels — verify against SeperateColumn).
X, y = SeperateColumn(data, 'CHUNK')

# Fit the chunking pipeline on all of X/y (no hold-out split here); as the last
# expression, the fitted Pipeline repr becomes the cell output.
chunk_model.fit(X, y)

# accuracy = []
# kf = KFold(len(X), n_folds=10)
# fold_num = 1

# for train_index, test_index in kf:
#     print "fold number %d" % fold_num    

#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)

#     current = postagger_accuracy_score(y_pred, y_test)
#     print '\t%.2f' % (current * 100)

#     fold_num += 1
#     accuracy.append(current)

# print '%.2f' % (np.array(accuracy).mean() * 100)



Pipeline(steps=[('features', FeatureStacker(transformer_list=[('affixes', AffixesTransformer(prefix_len=5, strategy='all_norm', suffix_len=5)), ('pos', PoSTransformer(ignore=False)), ('pos_context', POSContextTransformer(end=2, start=-2)), ('lex__predicted_pos', LexPoSTransformer()), ('norm_lex', NormLexTran...
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False))])

In [14]:
# Persist the chunking pipeline to disk with joblib; the list of written paths is the cell output.
joblib.dump(chunk_model, './models/chunk.p')

['./models/chunk.p']

In [9]:
# NOTE(review): the recorded run of this cell raised NameError because X_test was not
# defined at that point — the cell depends on out-of-order execution.
pos_model.predict(X_test)

NameError: name 'X_test' is not defined

In [5]:
# Attempt to persist the normaliser with joblib.
# NOTE(review): the recorded run failed with PicklingError (an instancemethod could not be pickled).
joblib.dump(norm_model, './models/norm.p')

PicklingError: Can't pickle <type 'instancemethod'>: it's not found as __builtin__.instancemethod

In [67]:
def tokenize_epos_eposscore(sentence, input_path='input_file',
                            tagger='./ark_tweet/runTagger.sh'):
    '''
    Run the external tagger script on one sentence and parse its CoNLL output.

    The sentence is written (followed by a newline) to ``input_path``, the
    tagger script is invoked with ``--output-format conll`` on that file, and
    every non-blank output line is split on tabs into three fields.

    Parameters
    ----------
    sentence : str
        Raw sentence text to tag.
    input_path : str, optional
        File the sentence is written to before tagging. Defaults to
        ``'input_file'`` in the current working directory.
    tagger : str, optional
        Path of the tagger script to execute.

    Returns
    -------
    numpy.ndarray
        Array holding a single sentence; each token is a dict with the keys
        ``'WORD'``, ``'EPOS'`` and ``'EPOSSCORE'`` (all values are strings).

    Raises
    ------
    subprocess.CalledProcessError
        If the tagger script exits with a non-zero status.
    ValueError
        If a non-blank output line does not have exactly three tab-separated
        fields.
    '''
    # `with` guarantees the file is flushed and closed before the tagger reads it.
    with open(input_path, 'w') as inpf:
        inpf.write(sentence + '\n')

    # Build argv explicitly so that paths containing whitespace stay one argument.
    tpos_command = [tagger, '--output-format', 'conll', input_path]
    # universal_newlines=True makes check_output return text rather than raw bytes.
    out = check_output(tpos_command, universal_newlines=True)

    x = []
    for line in out.split('\n'):
        line = line.strip()
        if not line:
            continue
        WORD, EPOS, EPOSSCORE = [field.strip() for field in line.split('\t')]
        x.append({'WORD': WORD, 'EPOS': EPOS, 'EPOSSCORE': EPOSSCORE})

    # Outer array = list of sentences (always exactly one here).
    return np.array([np.array(x)])

In [68]:
# Example mixed-language input used for the manual end-to-end check below.
sentence = 'What is this? tum apna kaam karo'
# Tokenise and tag the sentence; X_test holds one sentence of per-token dicts.
X_test = tokenize_epos_eposscore(sentence)

In [69]:
# Rebind lidf_model to a pipeline loaded from disk (shadows the one built earlier in the notebook).
# NOTE(review): joblib.load unpickles the file, so only load model files from a trusted source;
# no cell shown in this notebook writes './models/lidf.p'.
lidf_model = joblib.load('./models/lidf.p')
def add_language_idf(X_test):
    '''Return what the module-level ``lidf_model`` predicts for ``X_test``.'''
    return lidf_model.predict(X_test)

In [70]:
# Attach the predicted language tags to each token under the 'LANG' key (see the output below).
AddColumn(X_test, add_language_idf(X_test), 'LANG')

array([[{'EPOSSCORE': '0.9968', 'EPOS': 'O', 'WORD': 'What', 'LANG': 'en'},
        {'EPOSSCORE': '0.9994', 'EPOS': 'V', 'WORD': 'is', 'LANG': 'en'},
        {'EPOSSCORE': '0.8788', 'EPOS': 'O', 'WORD': 'this', 'LANG': 'en'},
        {'EPOSSCORE': '0.9983', 'EPOS': ',', 'WORD': '?', 'LANG': 'rest'},
        {'EPOSSCORE': '0.8029', 'EPOS': 'G', 'WORD': 'tum', 'LANG': 'hi'},
        {'EPOSSCORE': '0.8650', 'EPOS': 'G', 'WORD': 'apna', 'LANG': 'hi'},
        {'EPOSSCORE': '0.8247', 'EPOS': 'G', 'WORD': 'kaam', 'LANG': 'hi'},
        {'EPOSSCORE': '0.5832', 'EPOS': 'G', 'WORD': 'karo', 'LANG': 'hi'}]], dtype=object)

In [47]:
# NOTE(review): in the cells shown here `inpf` is only assigned inside tokenize_epos_eposscore,
# so this scratch line presumably relies on a leftover kernel variable.
tpos_command = './ark_tweet/runTagger.sh --output-format conll %s' % inpf.name 

In [48]:

# Scratch: show the raw tagger output string.
# NOTE(review): `out` is only assigned inside tokenize_epos_eposscore in the visible cells.
out

'What\tO\t0.9968\nis\tV\t0.9994\nthis\tO\t0.8788\n?\t,\t0.9983\ntum\tG\t0.8029\napna\tG\t0.8650\nkaam\tG\t0.8247\nkaro\tG\t0.5832\n\n'

In [49]:
# Scratch: print the tagger output so the tab-separated columns are readable (Python 2 print statement).
print out

What	O	0.9968
is	V	0.9994
this	O	0.8788
?	,	0.9983
tum	G	0.8029
apna	G	0.8650
kaam	G	0.8247
karo	G	0.5832




In [60]:
# Scratch: display X_test; in the recorded output the tokens have no 'LANG' key yet.
X_test

array([[{'EPOSSCORE': '0.9968', 'EPOS': 'O', 'WORD': 'What'},
        {'EPOSSCORE': '0.9994', 'EPOS': 'V', 'WORD': 'is'},
        {'EPOSSCORE': '0.8788', 'EPOS': 'O', 'WORD': 'this'},
        {'EPOSSCORE': '0.9983', 'EPOS': ',', 'WORD': '?'},
        {'EPOSSCORE': '0.8029', 'EPOS': 'G', 'WORD': 'tum'},
        {'EPOSSCORE': '0.8650', 'EPOS': 'G', 'WORD': 'apna'},
        {'EPOSSCORE': '0.8247', 'EPOS': 'G', 'WORD': 'kaam'},
        {'EPOSSCORE': '0.5832', 'EPOS': 'G', 'WORD': 'karo'}]], dtype=object)

In [62]:
# Predict with the CMSTPipeline held in `model`.
# NOTE(review): no visible cell calls model.fit outside commented-out code — confirm where `model` was trained.
y_pred_lang = model.predict(X_test)

In [63]:
# Attach the pipeline's predictions to each token under the 'LANG' key (see the output below).
AddColumn(X_test, y_pred_lang, 'LANG')

array([[{'EPOSSCORE': '0.9968', 'EPOS': 'O', 'WORD': 'What', 'LANG': 'en'},
        {'EPOSSCORE': '0.9994', 'EPOS': 'V', 'WORD': 'is', 'LANG': 'en'},
        {'EPOSSCORE': '0.8788', 'EPOS': 'O', 'WORD': 'this', 'LANG': 'en'},
        {'EPOSSCORE': '0.9983', 'EPOS': ',', 'WORD': '?', 'LANG': 'rest'},
        {'EPOSSCORE': '0.8029', 'EPOS': 'G', 'WORD': 'tum', 'LANG': 'hi'},
        {'EPOSSCORE': '0.8650', 'EPOS': 'G', 'WORD': 'apna', 'LANG': 'hi'},
        {'EPOSSCORE': '0.8247', 'EPOS': 'G', 'WORD': 'kaam', 'LANG': 'hi'},
        {'EPOSSCORE': '0.5832', 'EPOS': 'G', 'WORD': 'karo', 'LANG': 'hi'}]], dtype=object)

In [56]:
# Scratch: NOTE(review): `word` is not assigned at top level in any visible cell (leftover kernel state).
word

'What'

In [57]:
# Scratch: NOTE(review): the recorded output is a single token dict, whereas tokenize_epos_eposscore
# returns a nested array of sentences — this output presumably comes from an older X_test.
X_test[0]

{'EPOS': 'O', 'EPOSSCORE': '0.9968', 'WORD': 'What'}

In [58]:
# Scratch: show the first element of X to compare its token keys with those in X_test.
X[0]

[{'CHUNK': 'B-NP',
  'EPOS': '^',
  'EPOSSCORE': '0.7803',
  'HPOS': 'PRP',
  'NORM': '\xe0\xa4\xa4\xe0\xa5\x87\xe0\xa4\xb0\xe0\xa5\x87',
  'POS': 'PRON',
  'POSITION': 0.0,
  'WORD': 'tere',
  '_HPOS': 'PRP'},
 {'CHUNK': 'B-NP',
  'EPOS': 'G',
  'EPOSSCORE': '0.3119',
  'HPOS': 'PSP',
  'NORM': '\xe0\xa4\xae\xe0\xa4\xbe',
  'POS': 'NOUN',
  'POSITION': 0.08333333333333333,
  'WORD': 'ma',
  '_HPOS': 'PSP'},
 {'CHUNK': 'B-NP',
  'EPOS': 'G',
  'EPOSSCORE': '0.8136',
  'HPOS': 'NN',
  'NORM': '\xe0\xa4\xac\xe0\xa4\xbe\xe0\xa4\xaa',
  'POS': 'NOUN',
  'POSITION': 0.16666666666666666,
  'WORD': 'baap',
  '_HPOS': 'NN'},
 {'CHUNK': 'I-NP',
  'EPOS': 'V',
  'EPOSSCORE': '0.3765',
  'HPOS': 'PSP',
  'NORM': '\xe0\xa4\x95\xe0\xa5\x8b',
  'POS': 'ADP',
  'POSITION': 0.25,
  'WORD': 'ko',
  '_HPOS': 'PSP'},
 {'CHUNK': 'B-NP',
  'EPOS': 'O',
  'EPOSSCORE': '0.9694',
  'HPOS': 'DEM',
  'NORM': '\xe0\xa4\x89\xe0\xa4\xb8',
  'POS': 'PRON',
  'POSITION': 0.3333333333333333,
  'WORD': 'us',
  '_HPOS'

In [79]:
def add_hi_pos_tagger(X):
    '''
    Tag every sentence in ``X`` with the remote Hindi POS tagger service.

    For the duration of the call, tokens whose ``'LANG'`` is ``'en'`` get
    their ``'NORM'`` replaced by the eng->hin transliteration of their
    ``'WORD'``. Each sentence is then sent to the tagger as the
    space-joined ``'NORM'`` values. On exit (including on error) ``'NORM'``
    of every ``'en'`` token is set to its ``'WORD'``.

    Parameters
    ----------
    X : iterable of sentences
        Each sentence is an iterable of token dicts with the keys ``'WORD'``
        and ``'LANG'``; non-``'en'`` tokens must already carry ``'NORM'``.
        The dicts are mutated in place.

    Returns
    -------
    list of list
        One list per sentence holding the third whitespace-separated field
        of every response line that starts with a digit.

    Raises
    ------
    requests.exceptions.HTTPError
        If the tagger service answers with an error status.

    NOTE(review): ``transliterator`` and ``requests`` are not imported
    explicitly in the visible import cell — presumably a star import
    provides them; confirm.
    '''
    ypos = []
    trn = transliterator(source='eng', target='hin')
    try:
        # Transliterate the token text itself (the original passed the language
        # tag obv['LANG'], i.e. the literal 'en', to the transliterator).
        for x in X:
            for obv in x:
                if obv['LANG'] == 'en':
                    obv['NORM'] = trn.transform(obv['WORD'])

        for x in X:
            sent = ' '.join(obv['NORM'] for obv in x)
            payload = {'input': sent}
            r = requests.post('http://api.ilmt.iiit.ac.in/hin/pan/1/4', data=payload)
            # Fail loudly on HTTP errors instead of on a confusing JSON/KeyError later.
            r.raise_for_status()
            pos_output = r.json()['postagger-4']
            _y = []
            for line in pos_output.split('\n'):
                # Only lines whose first character is a digit carry a tag;
                # empty lines (IndexError) and markup lines (ValueError) are skipped.
                try:
                    int(line[0])
                except (IndexError, ValueError):
                    continue
                _y.append(line.split()[2])
            ypos.append(_y)
    finally:
        # Always undo the temporary transliteration so X is left consistent.
        for x in X:
            for obv in x:
                if obv['LANG'] == 'en':
                    obv['NORM'] = obv['WORD']
    return ypos


In [81]:
# NOTE(review): run_hindi_pos_tagger is not defined in the visible cells (presumably star-imported);
# the recorded run failed with IOError because 'data/final_normalization_predictions_wx' was missing.
run_hindi_pos_tagger(X_test, gold=False)

IOError: [Errno 2] No such file or directory: 'data/final_normalization_predictions_wx'