# Evaluating feature extraction

In [1]:
import sys
import os
package_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.insert(0, package_dir)
from pyseqlab.features_extraction import FOFeatureExtractor, HOFeatureExtractor, SeqsRepresenter
from pyseqlab.ho_crf_ad import HOCRFAD, HOCRFADModelRepresentation
from pyseqlab.fo_crf import FirstOrderCRF, FirstOrderCRFModelRepresentation
from pyseqlab.hosemi_crf_ad import HOSemiCRFAD, HOSemiCRFADModelRepresentation
from pyseqlab.workflow import TrainingWorkflow
from pyseqlab.utilities import ReaderWriter, SequenceStruct, TemplateGenerator, \
                               create_directory, generate_updated_model, generate_trained_model
from pyseqlab.attributes_extraction import GenericAttributeExtractor, NERSegmentAttributeExtractor
from pyseqlab.crf_learning import Learner

# Frequently used directories: the notebook's working directory and the
# directory one level above it (the same location that is put on sys.path
# so that the pyseqlab package can be imported).
current_dir = os.getcwd()
root_dir = os.path.normpath(os.path.join(current_dir, os.pardir))

def trainconfig_general(options):
    """Build the feature templates and auxiliary settings for one training run.

    Args:
        options: nested tuple ``((x_options, y_options_x), y_options)``.
            ``x_options`` and ``y_options_x`` are forwarded to
            ``TemplateGenerator.generate_template_XY`` for the ``'w'`` track.
            ``y_options`` is forwarded to ``generate_template_Y`` unless it is
            the empty string.

    Returns:
        Tuple ``(templateXY, templateY, ascaler_class, filter_obj)``. The last
        two entries are always ``None`` in this configuration.
    """
    generator = TemplateGenerator()
    (track_x_spec, track_y_spec), label_spec = options

    # Template for the 'w' track; the generator populates the dict in place.
    xy_template = {}
    generator.generate_template_XY('w', track_x_spec, track_y_spec, xy_template)

    # An empty Y specification bypasses the generator and uses a fixed template.
    y_template = ({'Y': ()} if label_spec == ""
                  else generator.generate_template_Y(label_spec))

    print("templateY : ", y_template)
    print("templateXY : ", xy_template)

    # No attribute scaler and no feature filter are used here.
    return xy_template, y_template, None, None

def load_seqs():
    """Return two small labelled word sequences wrapped in ``SequenceStruct``.

    Each observation is a dict holding the word under the ``'w'`` key, and each
    sequence carries one label per word.
    """
    labelled_sentences = (
        (('The', 'dog', 'barks'), ['DT', 'N', 'V']),
        (('Cool', 'dog'), ['ADJ', 'N']),
    )
    return [
        SequenceStruct([{'w': word} for word in words], labels)
        for words, labels in labelled_sentences
    ]

def load_segments():
    """Return two small segment-labelled word sequences as ``SequenceStruct``.

    Each observation is a dict holding the word under the ``'w'`` key. Labels
    are ``'L'`` or ``'O'``, and ``'O'`` is also passed as the third constructor
    argument (presumably the "outside"/non-entity symbol -- confirm against
    ``SequenceStruct``).
    """
    labelled_sentences = (
        (('New', 'Haven', 'is', 'beautiful'),
         ['L', 'L', 'O', 'O']),
        (('England', 'is', 'part', 'of', 'United', 'Kingdom'),
         ['L', 'O', 'O', 'O', 'L', 'L']),
    )
    return [
        SequenceStruct([{'w': word} for word in words], labels, 'O')
        for words, labels in labelled_sentences
    ]

def train_crfs(model_type, optimization_options, dsplit_options, trainconfig_options):
    """Train and evaluate one CRF variant on the toy data set.

    Args:
        model_type: one of ``"HO_AD"``, ``"FO"`` or ``"HOSemi_AD"``; selects
            the model class, its representation class and the feature
            extractor class.
        optimization_options: dict of optimizer settings handed to
            ``TrainingWorkflow``.
        dsplit_options: dict of data-split settings handed to
            ``seq_parsing_workflow``.
        trainconfig_options: nested tuple accepted by ``trainconfig_general``.

    Returns:
        Whatever ``TrainingWorkflow.traineval_folds`` returns for the run.

    Raises:
        ValueError: if ``model_type`` is not one of the supported names.
    """
    if model_type == "HO_AD":
        crf_model = HOCRFAD
        model_repr = HOCRFADModelRepresentation
        fextractor = HOFeatureExtractor
    elif model_type == "FO":
        crf_model = FirstOrderCRF
        model_repr = FirstOrderCRFModelRepresentation
        fextractor = FOFeatureExtractor
    elif model_type == "HOSemi_AD":
        crf_model = HOSemiCRFAD
        model_repr = HOSemiCRFADModelRepresentation
        fextractor = HOFeatureExtractor
    else:
        # Fail early with a clear message instead of an UnboundLocalError
        # further down when the class variables are first used.
        raise ValueError(
            "unknown model_type {!r}; expected 'HO_AD', 'FO' or 'HOSemi_AD'".format(model_type)
        )

    template_xy, template_y, ascaler_class, filter_obj = trainconfig_general(trainconfig_options)

    # Working directory 'wd' is created under the notebook's directory.
    wd = create_directory('wd', current_dir)
    # The same attribute extractor is used for every model type; the empty
    # string is passed in the position following it (presumably "no scaling"
    # -- confirm against TrainingWorkflow).
    workflow_trainer = TrainingWorkflow(template_y, template_xy, model_repr,
                                        crf_model, fextractor, NERSegmentAttributeExtractor,
                                        "", optimization_options, wd, filter_obj)

    # The semi-CRF is exercised on segment-labelled data, the others on
    # plain labelled sequences.
    if model_type == "HOSemi_AD":
        seqs = load_segments()
    else:
        seqs = load_seqs()

    data_split = workflow_trainer.seq_parsing_workflow(seqs, dsplit_options)
    models_info = workflow_trainer.traineval_folds(data_split, meval=True)
    # Hand the result back so callers can inspect it; existing callers that
    # ignore the return value are unaffected.
    return models_info



In [2]:
# Data-split settings handed unchanged to every training run.
dsplit_options = dict(method="none")

# Two optimizer configurations that are compared below.
bfgs_optimization = dict(
    method="L-BFGS-B",
    regularization_type='l2',
    regularization_value=0,
)
perceptron_optimization = dict(
    method='COLLINS-PERCEPTRON',
    num_epochs=10,
    update_type='max-fast',
    beam_size=-1,
    shuffle_seq=True,
    avg_scheme='avg_uniform',
    tolerance=1e-16,
)

# Template specifications: observation n-grams over a window of offsets,
# the label patterns paired with them, and the label-only patterns to sweep.
x_range = range(1)
x_ngrams = "1-gram"
y_ngrams_x = "1-state:2-states"
y_ngrams = [
    "",
    "2-states",
    "2-states:3-states",
    "1-state",
    "1-state:2-states",
]

# Sweep every (model type, optimizer, label template) combination and print
# a separator after each level so the log can be read per run.
for model_type in ("FO", "HO_AD", "HOSemi_AD"):
    print("model_type: ", model_type)
    for optimiz_option in (bfgs_optimization, perceptron_optimization):
        print("optimiz_option: ", optimiz_option)
        for y_ngram in y_ngrams:
            xy_spec = ((x_ngrams, x_range), y_ngrams_x)
            options = (xy_spec, y_ngram)
            print("y_ngram: ", y_ngram)
            train_crfs(model_type, optimiz_option, dsplit_options, options)
            print(40 * "-")
        print(50 * "*")
    print(60 * "|")
    

model_type:  FO
optimiz_option:  {'regularization_value': 0, 'method': 'L-BFGS-B', 'regularization_type': 'l2'}
y_ngram:  
templateY :  {'Y': ()}
templateXY :  {'w': {(0,): ((0,), (-1, 0))}}
dumping globalfeatures -- processed seqs:  1
dumping globalfeatures -- processed seqs:  2
constructing model -- processed seqs:  1
constructing model -- processed seqs:  2
identifying model active features -- processed seqs:  1
identifying model active features -- processed seqs:  2
iteration  1
iteration  2
iteration  3
iteration  4
iteration  5
iteration  6
iteration  7
iteration  8
iteration  9
iteration  10
iteration  11
iteration  12
iteration  13
sequence decoded -- 1 sequences are left
sequence decoded -- 0 sequences are left
f1 1.0
performance  1.0
----------------------------------------
y_ngram:  2-states
templateY :  {'Y': [(-1, 0)]}
templateXY :  {'w': {(0,): ((0,), (-1, 0))}}
dumping globalfeatures -- processed seqs:  1
dumping globalfeatures -- processed seqs:  2
constructing model --