In [1]:
from utils import sp_utils
from utils.NERcorpus import NERCorpus
import pandas as pd
import skseq
from skseq.sequences.sequence import Sequence
from skseq.sequences.sequence_list import SequenceList
from skseq.sequences.structured_perceptron import StructuredPerceptron
from skseq.sequences.id_feature import IDFeatures
from seqeval.metrics import classification_report
from collections import defaultdict
import skseq.sequences.structured_perceptron as spc
from skseq.sequences import extended_feature

# Perceptron

In [2]:
# Display labels for the four feature groups. The order (initial, transition,
# final, emission) matches the order of the groups printed for a prediction.
feature_type = [
    "Initial features",
    "Transition features",
    "Final features",
    "Emission features",
]

## Corpus

In [3]:
# Load the NER train/test splits. NERCorpus is imported in the notebook's
# first (imports) cell so that all dependencies are declared in one place.
data_path = "../nlp_d2_data/"
# A single NERCorpus instance reads both files; presumably this keeps
# corpus.word_dict / corpus.tag_dict shared across the two splits
# (TODO confirm in utils.NERcorpus).
corpus = NERCorpus()
train_seq = corpus.read_sequence_list_csv(f"{data_path}train_data_ner.csv")
test_seq = corpus.read_sequence_list_csv(f"{data_path}test_data_ner.csv")

In [4]:
# Inspect the tag -> integer id mapping held by the corpus.
corpus.tag_dict

{'O': 0,
 'B-geo': 1,
 'B-gpe': 2,
 'B-tim': 3,
 'B-org': 4,
 'I-geo': 5,
 'B-per': 6,
 'I-per': 7,
 'I-org': 8,
 'B-art': 9,
 'I-art': 10,
 'I-tim': 11,
 'I-gpe': 12,
 'B-nat': 13,
 'I-nat': 14,
 'B-eve': 15,
 'I-eve': 16}

## Simple Features

In [5]:
# Baseline feature set over the training sequences. IDFeatures is the class
# imported in the first cell from skseq.sequences.id_feature.
feature_mapper = IDFeatures(train_seq)
feature_mapper.build_features()

In [6]:
# Structured perceptron wired to the baseline (ID) feature mapper.
sp = StructuredPerceptron(
    corpus.word_dict,
    corpus.tag_dict,
    feature_mapper,
)
sp.num_epochs = 5
# Shown as (number of states, number of observations).
sp.get_num_states(), sp.get_num_observations()

(17, 55145)

In [None]:
# Load the saved baseline model from saved_models/sp_simple; the call prints
# the loaded parameter vector and feature/sequence counts.
sp.load_model(dir="saved_models/sp_simple")

[ 6.2  7.8 10.4 ... -3.2  1.2  0. ]
Number of features learned: 39802
Number of sequences: 38366


In [None]:
# Report train/test accuracy of the baseline model.
sp_utils.evaluate_model(sp, train_seq, test_seq)

SP -  Accuracy Train: 0.812 Test: 0.284


In [7]:
# Reverse lookup for the baseline mapper: feature id -> feature name
# (feature_dict itself is keyed by feature name).
inv_feature_dict = {
    feature_id: feature_name
    for feature_name, feature_id in feature_mapper.feature_dict.items()
}

In [9]:
# Tag a sample sentence with the baseline model; the helper prints the tagged
# sentence followed by the active features of each group.
p = "David had been asked to write a challenging program for Maria ."
sp_utils.predict_new_sentance(sp, feature_mapper, p, train_seq, feature_type, inv_feature_dict)

David/O had/O been/O asked/O to/O write/O a/O challenging/O program/O for/O Maria/O ./O 
([[0]], [[3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3]], [[28]], [[], [532], [455], [659], [10], [7318], [63], [6045], [594], [251], [], [27]])
Initial features
	 [0]
		 init_tag:O


Transition features
	 [3]
		 prev_tag:O::O
	 [3]
		 prev_tag:O::O
	 [3]
		 prev_tag:O::O
	 [3]
		 prev_tag:O::O
	 [3]
		 prev_tag:O::O
	 [3]
		 prev_tag:O::O
	 [3]
		 prev_tag:O::O
	 [3]
		 prev_tag:O::O
	 [3]
		 prev_tag:O::O
	 [3]
		 prev_tag:O::O
	 [3]
		 prev_tag:O::O


Final features
	 [28]
		 final_prev_tag:O


Emission features
	 []
	 [532]
		 id:had::O
	 [455]
		 id:been::O
	 [659]
		 id:asked::O
	 [10]
		 id:to::O
	 [7318]
		 id:write::O
	 [63]
		 id:a::O
	 [6045]
		 id:challenging::O
	 [594]
		 id:program::O
	 [251]
		 id:for::O
	 []
	 [27]
		 id:.::O


None


## Extended Features

In [None]:
# Extended feature set over the training sequences, via the extended_feature
# module imported in the first cell.
feature_ext_mapper = extended_feature.ExtendedFeatures(train_seq)
feature_ext_mapper.build_features()

In [6]:
# Structured perceptron wired to the extended feature mapper.
sp_ext = StructuredPerceptron(
    corpus.word_dict,
    corpus.tag_dict,
    feature_ext_mapper,
)
sp_ext.num_epochs = 5
# Shown as (number of states, number of observations).
sp_ext.get_num_states(), sp_ext.get_num_observations()

(17, 55145)

In [None]:
# Load the saved extended-feature model from saved_models/sp_extended.
sp_ext.load_model(dir="saved_models/sp_extended")

[21.8  4.4  5.4 ...  1.6  0.   0. ]
Number of features learned: 99696
Number of sequences: 38366


In [None]:
# Report train/test accuracy of the extended-feature model.
sp_utils.evaluate_model(sp_ext, train_seq, test_seq)

SP EXT -  Accuracy Train: 0.809 Test: 0.557


In [14]:
# Reverse lookup for the extended mapper: feature id -> feature name.
# This rebinds inv_feature_dict, replacing the baseline mapper's lookup.
inv_feature_dict = {
    feature_id: feature_name
    for feature_name, feature_id in feature_ext_mapper.feature_dict.items()
}

In [None]:
# Tag the sample sentence with the extended-feature model.
# The mapper passed here must be feature_ext_mapper: it is the one sp_ext was
# constructed with and the one inv_feature_dict was derived from. Passing the
# baseline feature_mapper would mix feature ids from two different mappers.
p = "David had been asked to write a challenging program for Maria ."
sp_utils.predict_new_sentance(sp_ext, feature_ext_mapper, p, train_seq, feature_type, inv_feature_dict)

David/B-per had/O been/O asked/O to/O write/O a/O challenging/O program/O for/O Maria/B-per ./O 
([[343]], [[582], [10], [10], [10], [10], [10], [10], [10], [10], [258], [582]], [[94]], [[19626, 19627, 254, 11487, 19628, 348], [1725, 1726, 1727, 1728, 9], [1498, 1499, 1500, 890, 9], [2102, 2103, 1858, 2104, 19], [35, 36, 19], [19759, 19760, 12551, 3469, 19], [212, 213, 19], [16477, 16478, 897, 292, 19], [1919, 1920, 40, 1921, 19], [857, 858, 521, 859, 86], [24139, 24140, 254, 7710, 17097, 2930], [91, 92, 93]])
Initial features
	 [343]
		 init_tag:B-per


Transition features
	 [582]
		 prev_tag:B-per::O
	 [10]
		 prev_tag:O::O
	 [10]
		 prev_tag:O::O
	 [10]
		 prev_tag:O::O
	 [10]
		 prev_tag:O::O
	 [10]
		 prev_tag:O::O
	 [10]
		 prev_tag:O::O
	 [10]
		 prev_tag:O::O
	 [10]
		 prev_tag:O::O
	 [258]
		 prev_tag:O::B-per
	 [582]
		 prev_tag:B-per::O


Final features
	 [94]
		 final_prev_tag:O


Emission features
	 [19626, 19627, 254, 11487, 19628, 348]
		 id:David::B-per
		 lower:david::

## Cython Optimization

In [5]:
from skseq.sequences.structured_perceptron_optimized import StructuredPerceptronOptimized

In [6]:
# Build the extended feature set for the optimized model (same construction
# as in the "Extended Features" section; presumably repeated so this section
# can run without it).
feature_ext_mapper = extended_feature.ExtendedFeatures(train_seq)
feature_ext_mapper.build_features()

In [7]:
# Optimized perceptron implementation, wired to the extended feature mapper.
sp_opt = StructuredPerceptronOptimized(corpus.word_dict, corpus.tag_dict, feature_ext_mapper)

In [None]:
# Load the saved optimized model from saved_models/sp_optimized.
sp_opt.load_model(dir="saved_models/sp_optimized")

[1.61916e+04 1.57200e+02 1.66200e+02 ... 6.00000e+00 0.00000e+00
 0.00000e+00]
Number of features learned: 99696
Number of sequences: 38366


In [None]:
# Report train/test accuracy of the optimized model.
# NOTE(review): the recorded output shows train accuracy 0.290 here versus
# 0.809 for sp_ext, which uses the same extended features — verify the saved
# optimized weights and their feature-id mapping.
sp_utils.evaluate_model(sp_opt, train_seq, test_seq)

SP EXT -  Accuracy Train: 0.290 Test: 0.230


In [12]:
# Reverse lookup for the extended mapper: feature id -> feature name.
inv_feature_dict = {
    feature_id: feature_name
    for feature_name, feature_id in feature_ext_mapper.feature_dict.items()
}

In [None]:
# Tag the sample sentence with the optimized model.
# Use feature_ext_mapper: sp_opt was constructed with it and inv_feature_dict
# was derived from it. The baseline feature_mapper is only defined in the
# "Simple Features" section, so referencing it here would also make this cell
# depend on that section having been executed.
p = "David had been asked to write a challenging program for Maria ."
sp_utils.predict_new_sentance(sp_opt, feature_ext_mapper, p, train_seq, feature_type, inv_feature_dict)

David/B-geo had/O been/O asked/O to/O write/O a/O challenging/O program/O for/B-tim Maria/B-geo ./O 
([[237]], [[37], [10], [10], [10], [10], [10], [10], [10], [109], [10606], [37]], [[94]], [[30, 23692, 243], [1725, 1726, 1727, 1728, 9], [1498, 1499, 1500, 890, 9], [2102, 2103, 1858, 2104, 19], [35, 36, 19], [19759, 19760, 12551, 3469, 19], [212, 213, 19], [16477, 16478, 897, 292, 19], [1919, 1920, 40, 1921, 19], [41624, 1744], [30, 16466, 705, 193], [91, 92, 93]])
Initial features
	 [237]
		 init_tag:B-geo


Transition features
	 [37]
		 prev_tag:B-geo::O
	 [10]
		 prev_tag:O::O
	 [10]
		 prev_tag:O::O
	 [10]
		 prev_tag:O::O
	 [10]
		 prev_tag:O::O
	 [10]
		 prev_tag:O::O
	 [10]
		 prev_tag:O::O
	 [10]
		 prev_tag:O::O
	 [109]
		 prev_tag:O::B-tim
	 [10606]
		 prev_tag:B-tim::B-geo
	 [37]
		 prev_tag:B-geo::O


Final features
	 [94]
		 final_prev_tag:O


Emission features
	 [30, 23692, 243]
		 capitalized::B-geo
		 prefix:dav::B-geo
		 pos_bucket:start::B-geo
	 [1725, 1726, 1727, 17