diff --git a/suppressions.xml b/suppressions.xml index 3b929c8b..b23a50d4 100644 --- a/suppressions.xml +++ b/suppressions.xml @@ -28,7 +28,6 @@ - diff --git a/tigon-client/src/main/java/co/cask/tigon/StandaloneMain.java b/tigon-client/src/main/java/co/cask/tigon/StandaloneMain.java index 61a2b848..1c115bdf 100644 --- a/tigon-client/src/main/java/co/cask/tigon/StandaloneMain.java +++ b/tigon-client/src/main/java/co/cask/tigon/StandaloneMain.java @@ -95,7 +95,7 @@ public static void main(String[] args) { List arguments = Lists.newArrayList(); arguments.add("/Users/gandu/workspace/tigon/tigon-examples/SentimentAnalysis/target/" + "SentimentAnalysis-0.1.0-SNAPSHOT.jar"); - arguments.add("co.cask.tigon.SentimentAnalysis"); + arguments.add("co.cask.tigon.sentiment.SentimentAnalysis"); args = arguments.toArray(new String[arguments.size()]); System.out.println("Tigon Standalone Client"); if (args.length > 0) { diff --git a/tigon-examples/SentimentAnalysis/pom.xml b/tigon-examples/SentimentAnalysis/pom.xml index f81d705e..592a04d4 100644 --- a/tigon-examples/SentimentAnalysis/pom.xml +++ b/tigon-examples/SentimentAnalysis/pom.xml @@ -28,7 +28,7 @@ SentimentAnalysis - co.cask.tigon.sentimentanalysis.SentimentAnalysis + co.cask.tigon.sentiment.SentimentAnalysis @@ -37,6 +37,11 @@ tigon-api ${project.version} + + com.lingpipe + lingpipe + 4.1.0 + org.slf4j slf4j-api @@ -66,5 +71,4 @@ - \ No newline at end of file diff --git a/tigon-examples/SentimentAnalysis/src/main/java/co/cask/tigon/sentiment/Analysis.java b/tigon-examples/SentimentAnalysis/src/main/java/co/cask/tigon/sentiment/Analysis.java new file mode 100644 index 00000000..e4bf842f --- /dev/null +++ b/tigon-examples/SentimentAnalysis/src/main/java/co/cask/tigon/sentiment/Analysis.java @@ -0,0 +1,87 @@ +/* + * Copyright © 2014 Cask Data, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. 
You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package co.cask.tigon.sentiment; + +import co.cask.tigon.api.annotation.Batch; +import co.cask.tigon.api.annotation.ProcessInput; +import co.cask.tigon.api.flow.flowlet.AbstractFlowlet; +import co.cask.tigon.api.flow.flowlet.FlowletContext; +import com.google.common.base.Throwables; +import com.google.common.io.ByteStreams; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.ObjectInputStream; + +/** + * Basic java-based sentiment classifier. + */ +public class Analysis extends AbstractFlowlet { + + private static final Logger LOG = LoggerFactory.getLogger(Analysis.class); + private static final String LOCALIZED_FILENAME = "localized.txt"; + + TextClassifier classifierModel = null; + + @Override + public void initialize(FlowletContext context) throws Exception { + super.initialize(context); + + InputStream in = null; + FileOutputStream out = null; + try { + in = this.getClass().getClassLoader().getResourceAsStream("java_trained_classifier.txt"); + out = new FileOutputStream(LOCALIZED_FILENAME); // localized within container, so it get cleaned. 
+ ByteStreams.copy(in, out); + } catch (IOException e) { + throw Throwables.propagate(e); + } finally { + try { + if (in != null) { + in.close(); + } + if (out != null) { + out.close(); + } + } catch (IOException e) { + throw Throwables.propagate(e); + } + } + InputStream modelInputStream = new FileInputStream(new File(LOCALIZED_FILENAME)); + classifierModel = TextClassifier.createFromObjectStream(new ObjectInputStream(modelInputStream)); + LOG.info("Initialized Analysis flowlet."); + } + + @Batch(100) + @ProcessInput + public void classifyTweet(String tweet) throws FileNotFoundException, ClassifierResultException { +// while (tweetIterator.hasNext()) { +// System.out.println(classify(tweetIterator.next())); +// } + System.out.println(classify(tweet).toString()); + } + + public ClassificationResult classify(String text) throws FileNotFoundException, ClassifierResultException { + return classifierModel.classify(text); + } +} diff --git a/tigon-examples/SentimentAnalysis/src/main/java/co/cask/tigon/sentiment/ClassificationResult.java b/tigon-examples/SentimentAnalysis/src/main/java/co/cask/tigon/sentiment/ClassificationResult.java new file mode 100644 index 00000000..25174511 --- /dev/null +++ b/tigon-examples/SentimentAnalysis/src/main/java/co/cask/tigon/sentiment/ClassificationResult.java @@ -0,0 +1,56 @@ +/* + * Copyright © 2014 Cask Data, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package co.cask.tigon.sentiment; + +import com.google.common.base.Objects; + +/** + * This class represents the value and confidence of any classification. + */ +public class ClassificationResult { + private String value; + private double confidence; + private Sentiment sentiment; + + public ClassificationResult(String value, double confidence) throws ClassifierResultException { + this.value = value; + this.confidence = confidence; + + if (value.equals("pos")) { + sentiment = Sentiment.positive; + } else if (value.equals("neg")) { + sentiment = Sentiment.negative; + } else if (value.equals("neu")) { + sentiment = Sentiment.neutral; + } else { + throw new ClassifierResultException("Classifier return result not recognized. "); + } + } + + @Override + public String toString() { + return Objects.toStringHelper(this) + .add("sentiment", sentiment.toString()) + .add("value", value) + .add("confidence", confidence).toString(); + } + + public static enum Sentiment { + positive, neutral, negative; + } + +} diff --git a/tigon-examples/SentimentAnalysis/src/main/java/co/cask/tigon/sentiment/ClassifierResultException.java b/tigon-examples/SentimentAnalysis/src/main/java/co/cask/tigon/sentiment/ClassifierResultException.java new file mode 100644 index 00000000..8452b3c7 --- /dev/null +++ b/tigon-examples/SentimentAnalysis/src/main/java/co/cask/tigon/sentiment/ClassifierResultException.java @@ -0,0 +1,26 @@ +/* + * Copyright © 2014 Cask Data, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
/**
 * Exception when the Classifier couldn't classify correctly.
 */
public class ClassifierResultException extends Exception {
  private static final long serialVersionUID = 1L;

  /**
   * Creates the exception with a description of the failure.
   *
   * @param message description of what went wrong
   */
  public ClassifierResultException(String message) {
    super(message);
  }

  /**
   * Creates the exception with a description of the failure and the underlying cause.
   *
   * @param message description of what went wrong
   * @param cause the exception that triggered this failure; kept so the stack trace is not lost
   */
  public ClassifierResultException(String message, Throwable cause) {
    super(message, cause);
  }
}
- */ private OutputEmitter out; @ProcessInput @Batch(100) public void process(String text) { if (text != null) { - LOG.info(text); + LOG.info("Received tweet: " + text); out.emit(text); } } diff --git a/tigon-examples/SentimentAnalysis/src/main/java/SentimentAnalysis.java b/tigon-examples/SentimentAnalysis/src/main/java/co/cask/tigon/sentiment/SentimentAnalysis.java similarity index 90% rename from tigon-examples/SentimentAnalysis/src/main/java/SentimentAnalysis.java rename to tigon-examples/SentimentAnalysis/src/main/java/co/cask/tigon/sentiment/SentimentAnalysis.java index e8d37d25..34437d28 100644 --- a/tigon-examples/SentimentAnalysis/src/main/java/SentimentAnalysis.java +++ b/tigon-examples/SentimentAnalysis/src/main/java/co/cask/tigon/sentiment/SentimentAnalysis.java @@ -13,6 +13,7 @@ * License for the specific language governing permissions and limitations under * the License. */ +package co.cask.tigon.sentiment; import co.cask.tigon.api.flow.Flow; import co.cask.tigon.api.flow.FlowSpecification; @@ -30,8 +31,10 @@ public FlowSpecification configure() { .withFlowlets() .add(new TweetCollector()) .add(new Normalization()) + .add(new Analysis()) .connect() .from(new TweetCollector()).to(new Normalization()) + .from(new Normalization()).to(new Analysis()) .build(); } } diff --git a/tigon-examples/SentimentAnalysis/src/main/java/co/cask/tigon/sentiment/TextClassifier.java b/tigon-examples/SentimentAnalysis/src/main/java/co/cask/tigon/sentiment/TextClassifier.java new file mode 100644 index 00000000..7b16f0bb --- /dev/null +++ b/tigon-examples/SentimentAnalysis/src/main/java/co/cask/tigon/sentiment/TextClassifier.java @@ -0,0 +1,51 @@ +/* + * Copyright © 2014 Cask Data, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. 
You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package co.cask.tigon.sentiment; + +import com.aliasi.classify.Classification; +import com.aliasi.classify.LMClassifier; + +import java.io.IOException; +import java.io.ObjectInputStream; + +/** + * Basic Text Classification + * + * Comments: has no vectorization strategy beyond using the default + * implementation + * + * + */ +public class TextClassifier{ + + private final LMClassifier classifier; + + private TextClassifier(LMClassifier classifier) { + this.classifier = classifier; + } + + public ClassificationResult classify(String data) throws ClassifierResultException { + Classification classification = classifier.classify(data); + return new ClassificationResult(classification.bestCategory(), 1.0); + } + + public static TextClassifier createFromObjectStream(ObjectInputStream inputStream) + throws IOException, ClassNotFoundException { + LMClassifier classifier = (LMClassifier) inputStream.readObject(); + return new TextClassifier(classifier); + } +} diff --git a/tigon-examples/SentimentAnalysis/src/main/java/TweetCollector.java b/tigon-examples/SentimentAnalysis/src/main/java/co/cask/tigon/sentiment/TweetCollector.java similarity index 87% rename from tigon-examples/SentimentAnalysis/src/main/java/TweetCollector.java rename to tigon-examples/SentimentAnalysis/src/main/java/co/cask/tigon/sentiment/TweetCollector.java index 679d68d3..c95ddc3f 100644 --- a/tigon-examples/SentimentAnalysis/src/main/java/TweetCollector.java +++ b/tigon-examples/SentimentAnalysis/src/main/java/co/cask/tigon/sentiment/TweetCollector.java @@ -1,8 +1,25 @@ +/* + 
* Copyright © 2014 Cask Data, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package co.cask.tigon.sentiment; + import co.cask.tigon.api.annotation.Tick; import co.cask.tigon.api.flow.flowlet.AbstractFlowlet; import co.cask.tigon.api.flow.flowlet.FlowletContext; import co.cask.tigon.api.flow.flowlet.OutputEmitter; -import co.cask.tigon.api.metrics.Metrics; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import twitter4j.Status; diff --git a/tigon-examples/SentimentAnalysis/src/main/resources/java_trained_classifier.txt b/tigon-examples/SentimentAnalysis/src/main/resources/java_trained_classifier.txt new file mode 100644 index 00000000..b2bb6512 Binary files /dev/null and b/tigon-examples/SentimentAnalysis/src/main/resources/java_trained_classifier.txt differ diff --git a/tigon-examples/SentimentAnalysis/src/main/resources/twitter4j.properties b/tigon-examples/SentimentAnalysis/src/main/resources/twitter4j.properties index 35606a27..5ce613af 100644 --- a/tigon-examples/SentimentAnalysis/src/main/resources/twitter4j.properties +++ b/tigon-examples/SentimentAnalysis/src/main/resources/twitter4j.properties @@ -1,5 +1,21 @@ -oauth.consumerKey=guzpQsLtdKs0jlap64nY1nX4N -oauth.consumerSecret=z7Ux5TPQyeOXd8xlXmm87V3qxi1vABSV9NFVelDaCus8m39tIe -oauth.accessToken=366210197-mpzoVZgENXzrEnVXvgdOqoDkCv55m2M5IYSp4ouv -oauth.accessTokenSecret=Q9ST3W4d68KgBnIKmQYWgFaBdCBrVsQXtE54ol8UhudoL -debug=false \ No newline at end of file +# +# Copyright © 
2014 Cask Data, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. +# + +oauth.consumerKey=5bPCAbHLyeNIzGbaCyFZIQMMl +oauth.consumerSecret=OTH2It9nYhyqolWIi0068v7Oa1WLXviW3swT7DpYJTCxD9mx6B +oauth.accessToken=2827531970-P8HdLEm8bYusoXXMM0bUHivCVqJRQO7wDHlTeRB +oauth.accessTokenSecret=18togr81QeDw9GM7F4OSmo8aSZgLKmoUjfeAJmnKD0445 +debug=false