Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bugfix: Fix failing dev check in CRF #283

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
10 changes: 5 additions & 5 deletions src/ports/postgres/modules/crf/test/crf_test_small.sql_in
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
(2,E'But analysts reckon underlying support for sterling has been eroded by the chancellor ''s failure to announce any new policy measures in his Mansion House speech last Thursday .'),
(3,E'His actions prevent disaster.');
analyze crf_document;

-- Features table
CREATE TABLE crf_feature_test (id integer,name text,prev_label_id integer,label_id integer,weight float);
INSERT INTO crf_feature_test VALUES
Expand Down Expand Up @@ -90,7 +90,7 @@
(18,'PRP$'),(19,'RB'), (20,'RBR'), (21,'RBS'), (22,'RP'), (23,'SYM'), (24,'TO'), (25,'UH'), (26,'VB'),
(27,'VBD'), (28,'VBG'),(29,'VBN'), (30,'VBP'), (31,'VBZ'),(32,'WDT'), (33,'WP'), (34,'WP$'),(35,'WRB'),
(36,'$'), (37,'#'), (38,''''''), (39,'``'), (40,'('), (41,')'), (42,','), (43,'.'), (44,':');
analyze crf_label;
analyze test_crf_label;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Assuming that the table crf_label doesn't exist, why wasn't the CRF install check always red?


-- Segment table
CREATE TABLE test_segmenttbl (start_pos integer,doc_id integer,seg_text text,max_pos integer);
Expand All @@ -106,15 +106,15 @@
(13,2,'''s',28), (14,2,'failure',28), (15,2,'to',28), (16,2,'announce',28), (17,2,'any',28),
(18,2,'new',28), (19,2,'policy',28), (20,2,'measures',28),(21,2,'in',28), (22,2,'his',28),
(23,2,'mansion',28), (24,2,'house',28), (25,2,'speech',28), (26,2,'last',28), (27,2,'thursday',28),
(28,2,'.',28), (0,3,'his',4), (1,3,'actions',4), (2,3,'prevent',4), (3,3,'disaster',4),
(28,2,'.',28), (0,3,'his',4), (1,3,'actions',4), (2,3,'prevent',4), (3,3,'disaster',4),
(4,3,'.',4);
analyze test_segmenttbl;

-- extract features for tokens stored in segmenttbl
-- extract features for tokens stored in segmenttbl
SELECT crf_test_fgen('test_segmenttbl','crf_dictionary','test_crf_label','crf_regex','crf_feature_test','viterbi_mtbl','viterbi_rtbl');

-- Expected viterbi labeling result
-- The result is produced from Dr. Sunita's CRF java package with the same input
-- The result is produced from Dr. Sunita's CRF java package with the same input
CREATE TABLE expected_extraction(doc_id integer, start_pos integer, seg_text text, label character varying);
INSERT INTO expected_extraction VALUES
(1,0,'chancellor','NNP'),(1,1,'of','IN'), (1,2,'the','DT'), (1,3,'exchequer','NNP'), (1,4,'nigel','NNP'),
Expand Down
84 changes: 49 additions & 35 deletions src/ports/postgres/modules/crf/test/crf_train_large.sql_in
Original file line number Diff line number Diff line change
Expand Up @@ -234,26 +234,40 @@ INSERT INTO train_new_segmenttbl VALUES
(30, 7, 'years', 13, 31),
(31, 7, '.', 44, 31);

CREATE TABLE train_new_regex(pattern text,name text);
CREATE TABLE train_new_regex(pattern text,name text);
INSERT INTO train_new_regex VALUES
('^[A-Z][a-z]+$','InitCapital'), ('^[A-Z]+$','isAllCapital'),
('^[A-Z][a-z]+$','InitCapital'), ('^[A-Z]+$','isAllCapital'),
('^.*[0-9]+.*$','containsDigit'),('^.+[.]$','endsWithDot'),
('^.+[,]$','endsWithComma'), ('^.+er$','endsWithER'),
('^.+est$','endsWithEst'), ('^.+ed$','endsWithED'),
('^.+s$','endsWithS'), ('^.+ing$','endsWithIng'),
('^.+ly$','endsWithly'), ('^.+-.+$','isDashSeparatedWords'),
('^.*@.*$','isEmailId');
analyze train_new_regex;
analyze train_new_regex;

SELECT crf_train_fgen('train_new_segmenttbl', 'train_new_regex', 'crf_label', 'train_new_dictionary', 'train_new_featuretbl','train_new_featureset');
CREATE TABLE crf_label_new (id integer,label character varying);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The two files crf_test_small.sql_in and crf_train_large.sql_in have different indentation. Can we make them consistent?

INSERT INTO crf_label_new VALUES
(0,'CC'), (1,'CD'), (2,'DT'), (3,'EX'), (4,'FW'), (5,'IN'), (6,'JJ'), (7,'JJR'), (8,'JJS'),
(9,'LS'), (10,'MD'), (11,'NN'), (12,'NNS'), (13,'NNP'),(14,'NNPS'),(15,'PDT'),(16,'POS'),(17,'PRP'),
(18,'PRP$'),(19,'RB'), (20,'RBR'), (21,'RBS'), (22,'RP'), (23,'SYM'), (24,'TO'), (25,'UH'), (26,'VB'),
(27,'VBD'), (28,'VBG'),(29,'VBN'), (30,'VBP'), (31,'VBZ'),(32,'WDT'), (33,'WP'), (34,'WP$'),(35,'WRB'),
(36,'$'), (37,'#'), (38,'''''');
INSERT INTO crf_label_new VALUES
(39,<!'``'!>);
m4_changequote(,)
INSERT INTO crf_label_new VALUES
(40,'('), (41,')'), (42,','), (43,'.'), (44,':');
analyze crf_label_new;

SELECT lincrf_train('train_new_featuretbl', 'train_new_featureset', 'crf_label', 'train_new_stats', 'train_new_crf_feature', 30);
SELECT crf_train_fgen('train_new_segmenttbl', 'train_new_regex', 'crf_label_new', 'train_new_dictionary', 'train_new_featuretbl','train_new_featureset');

-- Expected feature table
-- The result is produced from Dr. Sunita's CRF java package with the same input
CREATE TABLE expected_crf_feature_new(id integer,name text,prev_label integer,label integer,weight float);
SELECT lincrf_train('train_new_featuretbl', 'train_new_featureset', 'crf_label_new', 'train_new_stats', 'train_new_crf_feature', 30);

INSERT INTO expected_crf_feature_new VALUES
-- Expected feature table
-- The result is produced from Dr. Sunita's CRF java package with the same input
CREATE TABLE expected_crf_feature_new(id integer,name text,prev_label integer,label integer,weight float);

INSERT INTO expected_crf_feature_new VALUES
(0, 'S.', -1, 12, 0.5516753522178934),
(1, 'W_freight', -1, 12, 5.959241076198326),
(2, 'E.', 12, 13, 2.0789747316372034),
Expand Down Expand Up @@ -545,33 +559,33 @@ INSERT INTO train_new_regex VALUES
(288, 'E.', 27, 13, 0.6748848167259296),
(289, 'W_past', -1, 7, 2.852378831268221);

SELECT assert(
SUM(abs(c1.weight-c2.weight)) < 0.1,
'Total difference between extracted feature weights and expected feature weights is > 0.1.')
FROM expected_crf_feature_new c1, train_new_crf_feature c2
WHERE c1.name = c2.name AND c1.prev_label = c2.prev_label_id AND c1.label = c2.label_id;;
SELECT assert(
SUM(abs(c1.weight-c2.weight)) < 0.1,
'Total difference between extracted feature weights and expected feature weights is > 0.1.')
FROM expected_crf_feature_new c1, train_new_crf_feature c2
WHERE c1.name = c2.name AND c1.prev_label = c2.prev_label_id AND c1.label = c2.label_id;;

-- Compare the expected features and the extraction features. It fails
-- if the features do not match.
SELECT assert(s1.count+s2.count = 0, 'Features extracted do not match expected features.')
FROM (
SELECT count(*) FROM(
SELECT name, prev_label, label
FROM expected_crf_feature_new
EXCEPT ALL
SELECT name, prev_label_id, label_id
FROM train_new_crf_feature
) AS U
)s1,
(
SELECT count(*) FROM(
SELECT name, prev_label_id, label_id
FROM train_new_crf_feature
EXCEPT ALL
SELECT name, prev_label, label
FROM expected_crf_feature_new
) AS U
)s2;
-- Compare the expected features and the extraction features. It fails
-- if the features do not match.
SELECT assert(s1.count+s2.count = 0, 'Features extracted do not match expected features.')
FROM (
SELECT count(*) FROM(
SELECT name, prev_label, label
FROM expected_crf_feature_new
EXCEPT ALL
SELECT name, prev_label_id, label_id
FROM train_new_crf_feature
) AS U
)s1,
(
SELECT count(*) FROM(
SELECT name, prev_label_id, label_id
FROM train_new_crf_feature
EXCEPT ALL
SELECT name, prev_label, label
FROM expected_crf_feature_new
) AS U
)s2;

!>)
m4_changequote(<!`!>,<!'!>)