From a1f6efca70f6f4ff44bf7ca0a23b7922dda35bbf Mon Sep 17 00:00:00 2001 From: qibaoyuan Date: Mon, 27 Apr 2015 22:28:14 +0800 Subject: [PATCH 1/4] Fix the ArffLoader bug in processing --- .../com/yahoo/labs/samoa/instances/ArffLoader.java | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/ArffLoader.java b/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/ArffLoader.java index feb5702d..66dff009 100644 --- a/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/ArffLoader.java +++ b/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/ArffLoader.java @@ -30,7 +30,7 @@ import java.util.logging.Logger; /** - * + * * @author abifet */ public class ArffLoader implements Serializable { @@ -92,7 +92,7 @@ public Instance readInstanceDense() { if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) { // System.out.println(streamTokenizer.nval + "Num "); this.setValue(instance, numAttribute, streamTokenizer.nval, true); - numAttribute++; + //numAttribute++; } else if (streamTokenizer.sval != null && (streamTokenizer.ttype == StreamTokenizer.TT_WORD || streamTokenizer.ttype == 34)) { @@ -104,12 +104,14 @@ public Instance readInstanceDense() { } else if (isNumeric == true) { value = Double.valueOf(streamTokenizer.sval).doubleValue(); } else { - value = this.instanceInformation.attribute(numAttribute).indexOfValue(streamTokenizer.sval); + value = this.instanceInformation.attribute(numAttribute).indexOfValue( + streamTokenizer.sval); } this.setValue(instance, numAttribute, value, isNumeric); - numAttribute++; + //numAttribute++; } + numAttribute++; streamTokenizer.nextToken(); } streamTokenizer.nextToken(); From 5d24253f3e6faef1050af939ef57683ce93b90a0 Mon Sep 17 00:00:00 2001 From: qibaoyuan Date: Tue, 5 May 2015 16:38:07 +0800 Subject: [PATCH 2/4] SAMOA-26: Fix the ArffLoader bug --- .../labs/samoa/instances/ArffLoader.java | 97 ++++++++++++------- .../yahoo/labs/samoa/instances/Attribute.java | 62 ++++++------ .../labs/samoa/instances/DenseInstance.java | 6 +- 3 files changed, 98 insertions(+), 67 deletions(-) diff --git a/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/ArffLoader.java b/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/ArffLoader.java index 66dff009..dc22bb82 100644 --- a/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/ArffLoader.java +++ b/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/ArffLoader.java @@ -19,6 +19,7 @@ * limitations under the License. * #L% */ + import java.io.BufferedReader; import java.io.IOException; import java.io.Reader; @@ -30,7 +31,6 @@ import java.util.logging.Logger; /** - * * @author abifet */ public class ArffLoader implements Serializable { @@ -87,15 +87,16 @@ public Instance readInstanceDense() { while (numAttribute == 0 && streamTokenizer.ttype != StreamTokenizer.TT_EOF) { // For each line while (streamTokenizer.ttype != StreamTokenizer.TT_EOL - && streamTokenizer.ttype != StreamTokenizer.TT_EOF) { + && streamTokenizer.ttype != StreamTokenizer.TT_EOF) { // For each item if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) { // System.out.println(streamTokenizer.nval + "Num "); this.setValue(instance, numAttribute, streamTokenizer.nval, true); //numAttribute++; - } else if (streamTokenizer.sval != null && (streamTokenizer.ttype == StreamTokenizer.TT_WORD - || streamTokenizer.ttype == 34)) { + } else if (streamTokenizer.sval != null && ( + streamTokenizer.ttype == StreamTokenizer.TT_WORD + || streamTokenizer.ttype == 34 || streamTokenizer.ttype == 39)) { // System.out.println(streamTokenizer.sval + "Str"); boolean isNumeric = attributes.get(numAttribute).isNumeric(); double value; @@ -121,13 +122,15 @@ public Instance readInstanceDense() { } catch (IOException ex) { Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex); } + //System.out.println(instance); return (numAttribute > 0) ? instance : null; } private void setValue(Instance instance, int numAttribute, double value, boolean isNumber) { double valueAttribute; - if (isNumber && this.instanceInformation.attribute(numAttribute).isNominal) { - valueAttribute = this.instanceInformation.attribute(numAttribute).indexOfValue(Double.toString(value)); + if (this.instanceInformation.attribute(numAttribute).isNominal) { + valueAttribute = value; + //this.instanceInformation.attribute(numAttribute).indexOfValue(Double.toString(value)); // System.out.println(value +"/"+valueAttribute+" "); } else { @@ -146,7 +149,7 @@ private void setValue(Instance instance, int numAttribute, double value, boolean private Instance readInstanceSparse() { // Return a Sparse Instance Instance instance = new SparseInstance(1.0, null); // (this.instanceInformation.numAttributes() - // + 1); + // + 1); // System.out.println(this.instanceInformation.numAttributes()); int numAttribute; ArrayList attributeValues = new ArrayList(); @@ -156,7 +159,7 @@ private Instance readInstanceSparse() { streamTokenizer.nextToken(); // Remove the '{' char // For each line while (streamTokenizer.ttype != StreamTokenizer.TT_EOL - && streamTokenizer.ttype != StreamTokenizer.TT_EOF) { + && streamTokenizer.ttype != StreamTokenizer.TT_EOF) { while (streamTokenizer.ttype != '}') { // For each item // streamTokenizer.nextToken(); @@ -173,18 +176,22 @@ private Instance readInstanceSparse() { if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) { // System.out.print(streamTokenizer.nval + " "); - this.setSparseValue(instance, indexValues, attributeValues, numAttribute, streamTokenizer.nval, true); + this.setSparseValue(instance, indexValues, attributeValues, numAttribute, + streamTokenizer.nval, true); // numAttribute++; - } else if (streamTokenizer.sval != null && (streamTokenizer.ttype == StreamTokenizer.TT_WORD + } else if (streamTokenizer.sval != null && ( + streamTokenizer.ttype == StreamTokenizer.TT_WORD || streamTokenizer.ttype == 34)) { // System.out.print(streamTokenizer.sval + "-"); if (attributes.get(numAttribute).isNumeric()) { this.setSparseValue(instance, indexValues, attributeValues, numAttribute, - Double.valueOf(streamTokenizer.sval).doubleValue(), true); + Double.valueOf(streamTokenizer.sval).doubleValue(), true); } else { - this.setSparseValue(instance, indexValues, attributeValues, numAttribute, this.instanceInformation - .attribute(numAttribute).indexOfValue(streamTokenizer.sval), false); + this.setSparseValue(instance, indexValues, attributeValues, numAttribute, + this.instanceInformation + .attribute(numAttribute).indexOfValue(streamTokenizer.sval), + false); } } streamTokenizer.nextToken(); @@ -204,16 +211,19 @@ private Instance readInstanceSparse() { arrayIndexValues[i] = indexValues.get(i).intValue(); arrayAttributeValues[i] = attributeValues.get(i).doubleValue(); } - instance.addSparseValues(arrayIndexValues, arrayAttributeValues, this.instanceInformation.numAttributes()); + instance.addSparseValues(arrayIndexValues, arrayAttributeValues, + this.instanceInformation.numAttributes()); return instance; } - private void setSparseValue(Instance instance, List indexValues, List attributeValues, - int numAttribute, double value, boolean isNumber) { + private void setSparseValue(Instance instance, List indexValues, + List attributeValues, + int numAttribute, double value, boolean isNumber) { double valueAttribute; if (isNumber && this.instanceInformation.attribute(numAttribute).isNominal) { - valueAttribute = this.instanceInformation.attribute(numAttribute).indexOfValue(Double.toString(value)); + valueAttribute = + this.instanceInformation.attribute(numAttribute).indexOfValue(Double.toString(value)); } else { valueAttribute = value; } @@ -237,7 +247,7 @@ private Instance readDenseInstanceSparse() { streamTokenizer.nextToken(); // Remove the '{' char // For each line while (streamTokenizer.ttype != StreamTokenizer.TT_EOL - && streamTokenizer.ttype != StreamTokenizer.TT_EOF) { + && streamTokenizer.ttype != StreamTokenizer.TT_EOF) { while (streamTokenizer.ttype != '}') { // For each item // streamTokenizer.nextToken(); @@ -251,15 +261,18 @@ private Instance readDenseInstanceSparse() { this.setValue(instance, numAttribute, streamTokenizer.nval, true); // numAttribute++; - } else if (streamTokenizer.sval != null && (streamTokenizer.ttype == StreamTokenizer.TT_WORD + } else if (streamTokenizer.sval != null && ( + streamTokenizer.ttype == StreamTokenizer.TT_WORD || streamTokenizer.ttype == 34)) { // System.out.print(streamTokenizer.sval + // "/"+this.instanceInformation.attribute(numAttribute).indexOfValue(streamTokenizer.sval)+" "); if (attributes.get(numAttribute).isNumeric()) { - this.setValue(instance, numAttribute, Double.valueOf(streamTokenizer.sval).doubleValue(), true); + this.setValue(instance, numAttribute, + Double.valueOf(streamTokenizer.sval).doubleValue(), true); } else { this.setValue(instance, numAttribute, - this.instanceInformation.attribute(numAttribute).indexOfValue(streamTokenizer.sval), false); + this.instanceInformation.attribute(numAttribute) + .indexOfValue(streamTokenizer.sval), false); // numAttribute++; } } @@ -289,7 +302,8 @@ private InstanceInformation getHeader() { while (streamTokenizer.ttype != StreamTokenizer.TT_EOF) { // For each line // if (streamTokenizer.ttype == '@') { - if (streamTokenizer.ttype == StreamTokenizer.TT_WORD && streamTokenizer.sval.startsWith("@") == true) { + if (streamTokenizer.ttype == StreamTokenizer.TT_WORD + && streamTokenizer.sval.startsWith("@") == true) { // streamTokenizer.nextToken(); String token = streamTokenizer.sval.toUpperCase(); if (token.startsWith("@RELATION")) { @@ -307,22 +321,12 @@ private InstanceInformation getHeader() { String type = streamTokenizer.sval; // System.out.println("* " + name + ":" + type + " "); if (streamTokenizer.ttype == '{') { + parseDoubleBrackests(name); + } else if (streamTokenizer.ttype == 10) {//for the buggy non-formal input arff file streamTokenizer.nextToken(); - List attributeLabels = new ArrayList(); - while (streamTokenizer.ttype != '}') { - - if (streamTokenizer.sval != null) { - attributeLabels.add(streamTokenizer.sval); - // System.out.print(streamTokenizer.sval + ","); - } else { - attributeLabels.add(Double.toString(streamTokenizer.nval)); - // System.out.print(streamTokenizer.nval + ","); - } - - streamTokenizer.nextToken(); + if (streamTokenizer.ttype == '{') { + parseDoubleBrackests(name); } - // System.out.println(); - attributes.add(new Attribute(name, attributeLabels)); } else { // Add attribute attributes.add(new Attribute(name)); @@ -343,6 +347,27 @@ private InstanceInformation getHeader() { return new InstanceInformation(relation, attributes); } + private void parseDoubleBrackests(String name) throws IOException { + + streamTokenizer.nextToken(); + List attributeLabels = new ArrayList(); + while (streamTokenizer.ttype != '}') { + + if (streamTokenizer.sval != null) { + attributeLabels.add(streamTokenizer.sval); + // System.out.print(streamTokenizer.sval + ","); + } else { + attributeLabels.add(Double.toString(streamTokenizer.nval)); + // System.out.print(streamTokenizer.nval + ","); + } + + streamTokenizer.nextToken(); + } + // System.out.println(); + attributes.add(new Attribute(name, attributeLabels)); + + } + private void initStreamTokenizer(Reader reader) { BufferedReader br = new BufferedReader(reader); diff --git a/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/Attribute.java b/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/Attribute.java index 8609d6e0..6ebd6782 100644 --- a/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/Attribute.java +++ b/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/Attribute.java @@ -32,37 +32,38 @@ import java.util.Map; /** - * * @author abifet */ public class Attribute implements Serializable { public static final String ARFF_ATTRIBUTE = "@attribute"; public static final String ARFF_ATTRIBUTE_NUMERIC = "NUMERIC"; + public static final String ARFF_ATTRIBUTE_NOMINAL = "NOMINAL"; + public static final String ARFF_ATTRIBUTE_DATE = "DATE"; /** - * - */ + * + */ protected boolean isNominal; /** - * - */ + * + */ protected boolean isNumeric; /** - * - */ + * + */ protected boolean isDate; /** - * - */ + * + */ protected String name; /** - * - */ + * + */ protected List attributeValues; /** - * + * * @return */ public List getAttributeValues() { @@ -70,12 +71,12 @@ public List getAttributeValues() { } /** - * - */ + * + */ protected int index; /** - * + * * @param string */ public Attribute(String string) { @@ -84,7 +85,7 @@ public Attribute(String string) { } /** - * + * * @param attributeName * @param attributeValues */ @@ -95,14 +96,14 @@ public Attribute(String attributeName, List attributeValues) { } /** - * - */ + * + */ public Attribute() { this(""); } /** - * + * * @return */ public boolean isNominal() { @@ -110,7 +111,7 @@ public boolean isNominal() { } /** - * + * * @return */ public String name() { @@ -118,7 +119,7 @@ public String name() { } /** - * + * * @param value * @return */ @@ -127,7 +128,7 @@ public String value(int value) { } /** - * + * * @return */ public boolean isNumeric() { @@ -135,20 +136,19 @@ public boolean isNumeric() { } /** - * + * * @return */ public int numValues() { if (isNumeric()) { return 0; - } - else { + } else { return attributeValues.size(); } } /** - * + * * @return */ public int index() { // RuleClassifier @@ -167,7 +167,7 @@ boolean isDate() { private Map valuesStringAttribute; /** - * + * * @param value * @return */ @@ -198,7 +198,13 @@ public String toString() { text.append(ARFF_ATTRIBUTE).append(" ").append(Utils.quote(this.name)).append(" "); - text.append(ARFF_ATTRIBUTE_NUMERIC); + if (isNominal) { + text.append(ARFF_ATTRIBUTE_NOMINAL); + } else if (isNumeric) { + text.append(ARFF_ATTRIBUTE_NUMERIC); + } else if (isDate) { + text.append(ARFF_ATTRIBUTE_DATE); + } return text.toString(); } diff --git a/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/DenseInstance.java b/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/DenseInstance.java index 984675e7..57d1bfd7 100644 --- a/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/DenseInstance.java +++ b/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/DenseInstance.java @@ -25,7 +25,6 @@ */ /** - * * @author abifet */ public class DenseInstance extends SingleLabelInstance { @@ -62,9 +61,10 @@ public DenseInstance(double numberAttributes) { public String toString() { StringBuffer text = new StringBuffer(); - for (int i = 0; i < this.instanceInformation.numAttributes(); i++) { - if (i > 0) + for (int i = 0; i < this.instanceData.numAttributes(); i++) { + if (i > 0) { text.append(","); + } text.append(this.value(i)); } text.append(",").append(this.weight()); From d2e5531da243750e8c8a4db5e8e10ad31e3ff827 Mon Sep 17 00:00:00 2001 From: qibaoyuan Date: Wed, 13 May 2015 16:19:46 +0800 Subject: [PATCH 3/4] add test for ArffLoader --- .../labs/samoa/instances/ArffLoaderTest.java | 107 ++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 samoa-instances/src/test/java/com/yahoo/labs/samoa/instances/ArffLoaderTest.java diff --git a/samoa-instances/src/test/java/com/yahoo/labs/samoa/instances/ArffLoaderTest.java b/samoa-instances/src/test/java/com/yahoo/labs/samoa/instances/ArffLoaderTest.java new file mode 100644 index 00000000..d40f6400 --- /dev/null +++ b/samoa-instances/src/test/java/com/yahoo/labs/samoa/instances/ArffLoaderTest.java @@ -0,0 +1,107 @@ +package com.yahoo.labs.samoa.instances; + +/* + * #%L + * SAMOA + * %% + * Copyright (C) 2014 - 2015 Apache Software Foundation + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ +import org.junit.Before; +import org.junit.Test; + +import java.io.StringReader; + +import static org.junit.Assert.assertEquals; + +public class ArffLoaderTest { + + private ArffLoader loader; + + private StringReader reader; + + @Before + public void setUp() { + String inputString = "@relation test.txt\n" + + "\n" + + "@attribute Dur numeric\n" + + "@attribute Proto {udp,tcp,icmp,arp,ipx/spx,ipv6-icmp,pim,esp,igmp,rtcp,rtp,ipv6,udt}\n" + + "@attribute Dir {' <->',' ',' ->',' ?>',' who',' <-',' ',...,0,0,2,252,145,Background\n" + + "1471.787109,udp,' <->',CON,0,0,2,252,145,Background"; + reader = new StringReader(inputString); + int size = 0; + int classAttribute = 10; + loader = new ArffLoader(reader, size, classAttribute); + + } + + @Test + public void testGetHeader() { + InstanceInformation header = loader.getStructure(); + assertEquals(10, header.numAttributes()); + assertEquals(9, header.classIndex()); + assertEquals(true,header.attribute(0).isNumeric()); + assertEquals(false,header.attribute(1).isNumeric()); + assertEquals(false,header.attribute(2).isNumeric()); + assertEquals(false,header.attribute(3).isNumeric()); + assertEquals(true,header.attribute(4).isNumeric()); + assertEquals(true,header.attribute(5).isNumeric()); + assertEquals(true,header.attribute(6).isNumeric()); + assertEquals(true,header.attribute(7).isNumeric()); + assertEquals(true,header.attribute(8).isNumeric()); + assertEquals(false,header.attribute(9).isNumeric()); + + assertEquals(7,header.attribute(2).numValues()); + assertEquals(" <->",header.attribute(2).value(0)); + assertEquals(" ",header.attribute(2).value(1)); + assertEquals(" ->",header.attribute(2).value(2)); + assertEquals(" ?>",header.attribute(2).value(3)); + assertEquals(" who",header.attribute(2).value(4)); + assertEquals(" <-",header.attribute(2).value(5)); + assertEquals(" Date: Wed, 13 May 2015 16:30:46 +0800 Subject: [PATCH 4/4] add test for ArffLoader --- .../labs/samoa/instances/ArffLoaderTest.java | 65 ++++++++++--------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/samoa-instances/src/test/java/com/yahoo/labs/samoa/instances/ArffLoaderTest.java b/samoa-instances/src/test/java/com/yahoo/labs/samoa/instances/ArffLoaderTest.java index d40f6400..62fd7b74 100644 --- a/samoa-instances/src/test/java/com/yahoo/labs/samoa/instances/ArffLoaderTest.java +++ b/samoa-instances/src/test/java/com/yahoo/labs/samoa/instances/ArffLoaderTest.java @@ -19,6 +19,7 @@ * limitations under the License. * #L% */ + import org.junit.Before; import org.junit.Test; @@ -63,45 +64,45 @@ public void testGetHeader() { InstanceInformation header = loader.getStructure(); assertEquals(10, header.numAttributes()); assertEquals(9, header.classIndex()); - assertEquals(true,header.attribute(0).isNumeric()); - assertEquals(false,header.attribute(1).isNumeric()); - assertEquals(false,header.attribute(2).isNumeric()); - assertEquals(false,header.attribute(3).isNumeric()); - assertEquals(true,header.attribute(4).isNumeric()); - assertEquals(true,header.attribute(5).isNumeric()); - assertEquals(true,header.attribute(6).isNumeric()); - assertEquals(true,header.attribute(7).isNumeric()); - assertEquals(true,header.attribute(8).isNumeric()); - assertEquals(false,header.attribute(9).isNumeric()); + assertEquals(true, header.attribute(0).isNumeric()); + assertEquals(false, header.attribute(1).isNumeric()); + assertEquals(false, header.attribute(2).isNumeric()); + assertEquals(false, header.attribute(3).isNumeric()); + assertEquals(true, header.attribute(4).isNumeric()); + assertEquals(true, header.attribute(5).isNumeric()); + assertEquals(true, header.attribute(6).isNumeric()); + assertEquals(true, header.attribute(7).isNumeric()); + assertEquals(true, header.attribute(8).isNumeric()); + assertEquals(false, header.attribute(9).isNumeric()); - assertEquals(7,header.attribute(2).numValues()); - assertEquals(" <->",header.attribute(2).value(0)); - assertEquals(" ",header.attribute(2).value(1)); - assertEquals(" ->",header.attribute(2).value(2)); - assertEquals(" ?>",header.attribute(2).value(3)); - assertEquals(" who",header.attribute(2).value(4)); - assertEquals(" <-",header.attribute(2).value(5)); - assertEquals(" ", header.attribute(2).value(0)); + assertEquals(" ", header.attribute(2).value(1)); + assertEquals(" ->", header.attribute(2).value(2)); + assertEquals(" ?>", header.attribute(2).value(3)); + assertEquals(" who", header.attribute(2).value(4)); + assertEquals(" <-", header.attribute(2).value(5)); + assertEquals("