This repository has been archived by the owner on Dec 27, 2022. It is now read-only.
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Parth Gandhi
committed
Sep 23, 2014
1 parent
a5087a7
commit e965075
Showing
12 changed files
with
272 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
87 changes: 87 additions & 0 deletions
87
tigon-examples/SentimentAnalysis/src/main/java/co/cask/tigon/sentiment/Analysis.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
/* | ||
* Copyright © 2014 Cask Data, Inc. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); you may not | ||
* use this file except in compliance with the License. You may obtain a copy of | ||
* the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | ||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | ||
* License for the specific language governing permissions and limitations under | ||
* the License. | ||
*/ | ||
|
||
package co.cask.tigon.sentiment; | ||
|
||
import co.cask.tigon.api.annotation.Batch; | ||
import co.cask.tigon.api.annotation.ProcessInput; | ||
import co.cask.tigon.api.flow.flowlet.AbstractFlowlet; | ||
import co.cask.tigon.api.flow.flowlet.FlowletContext; | ||
import com.google.common.base.Throwables; | ||
import com.google.common.io.ByteStreams; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import java.io.File; | ||
import java.io.FileInputStream; | ||
import java.io.FileNotFoundException; | ||
import java.io.FileOutputStream; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.io.ObjectInputStream; | ||
|
||
/** | ||
* Basic java-based sentiment classifier. | ||
*/ | ||
public class Analysis extends AbstractFlowlet { | ||
|
||
private static final Logger LOG = LoggerFactory.getLogger(Analysis.class); | ||
private static final String LOCALIZED_FILENAME = "localized.txt"; | ||
|
||
TextClassifier classifierModel = null; | ||
|
||
@Override | ||
public void initialize(FlowletContext context) throws Exception { | ||
super.initialize(context); | ||
|
||
InputStream in = null; | ||
FileOutputStream out = null; | ||
try { | ||
in = this.getClass().getClassLoader().getResourceAsStream("java_trained_classifier.txt"); | ||
out = new FileOutputStream(LOCALIZED_FILENAME); // localized within container, so it get cleaned. | ||
ByteStreams.copy(in, out); | ||
} catch (IOException e) { | ||
throw Throwables.propagate(e); | ||
} finally { | ||
try { | ||
if (in != null) { | ||
in.close(); | ||
} | ||
if (out != null) { | ||
out.close(); | ||
} | ||
} catch (IOException e) { | ||
throw Throwables.propagate(e); | ||
} | ||
} | ||
InputStream modelInputStream = new FileInputStream(new File(LOCALIZED_FILENAME)); | ||
classifierModel = TextClassifier.createFromObjectStream(new ObjectInputStream(modelInputStream)); | ||
LOG.info("Initialized Analysis flowlet."); | ||
} | ||
|
||
@Batch(100) | ||
@ProcessInput | ||
public void classifyTweet(String tweet) throws FileNotFoundException, ClassifierResultException { | ||
// while (tweetIterator.hasNext()) { | ||
// System.out.println(classify(tweetIterator.next())); | ||
// } | ||
System.out.println(classify(tweet).toString()); | ||
} | ||
|
||
public ClassificationResult classify(String text) throws FileNotFoundException, ClassifierResultException { | ||
return classifierModel.classify(text); | ||
} | ||
} |
56 changes: 56 additions & 0 deletions
56
...xamples/SentimentAnalysis/src/main/java/co/cask/tigon/sentiment/ClassificationResult.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
/* | ||
* Copyright © 2014 Cask Data, Inc. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); you may not | ||
* use this file except in compliance with the License. You may obtain a copy of | ||
* the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | ||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | ||
* License for the specific language governing permissions and limitations under | ||
* the License. | ||
*/ | ||
|
||
package co.cask.tigon.sentiment; | ||
|
||
import com.google.common.base.Objects; | ||
|
||
/** | ||
* This class represents the value and confidence of any classification. | ||
*/ | ||
public class ClassificationResult { | ||
private String value; | ||
private double confidence; | ||
private Sentiment sentiment; | ||
|
||
public ClassificationResult(String value, double confidence) throws ClassifierResultException { | ||
this.value = value; | ||
this.confidence = confidence; | ||
|
||
if (value.equals("pos")) { | ||
sentiment = Sentiment.positive; | ||
} else if (value.equals("neg")) { | ||
sentiment = Sentiment.negative; | ||
} else if (value.equals("neu")) { | ||
sentiment = Sentiment.neutral; | ||
} else { | ||
throw new ClassifierResultException("Classifier return result not recognized. "); | ||
} | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return Objects.toStringHelper(this) | ||
.add("sentiment", sentiment.toString()) | ||
.add("value", value) | ||
.add("confidence", confidence).toString(); | ||
} | ||
|
||
public static enum Sentiment { | ||
positive, neutral, negative; | ||
} | ||
|
||
} |
26 changes: 26 additions & 0 deletions
26
...es/SentimentAnalysis/src/main/java/co/cask/tigon/sentiment/ClassifierResultException.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
/* | ||
* Copyright © 2014 Cask Data, Inc. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); you may not | ||
* use this file except in compliance with the License. You may obtain a copy of | ||
* the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | ||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | ||
* License for the specific language governing permissions and limitations under | ||
* the License. | ||
*/ | ||
|
||
package co.cask.tigon.sentiment; | ||
|
||
/**
 * Exception when the Classifier couldn't classify correctly.
 */
public class ClassifierResultException extends Exception {
  private static final long serialVersionUID = 1L;

  /**
   * Creates the exception with a description of the failure.
   *
   * @param message description of what went wrong
   */
  public ClassifierResultException(String message) {
    super(message);
  }

  /**
   * Creates the exception with a description of the failure and the underlying cause.
   *
   * @param message description of what went wrong
   * @param cause the exception that triggered this failure
   */
  public ClassifierResultException(String message, Throwable cause) {
    super(message, cause);
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
51 changes: 51 additions & 0 deletions
51
tigon-examples/SentimentAnalysis/src/main/java/co/cask/tigon/sentiment/TextClassifier.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
/* | ||
* Copyright © 2014 Cask Data, Inc. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); you may not | ||
* use this file except in compliance with the License. You may obtain a copy of | ||
* the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | ||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | ||
* License for the specific language governing permissions and limitations under | ||
* the License. | ||
*/ | ||
|
||
package co.cask.tigon.sentiment; | ||
|
||
import com.aliasi.classify.Classification; | ||
import com.aliasi.classify.LMClassifier; | ||
|
||
import java.io.IOException; | ||
import java.io.ObjectInputStream; | ||
|
||
/** | ||
* Basic Text Classification | ||
* | ||
* Comments: has no vectorization strategy beyond using the default | ||
* implementation | ||
* | ||
* | ||
*/ | ||
public class TextClassifier{ | ||
|
||
private final LMClassifier classifier; | ||
|
||
private TextClassifier(LMClassifier classifier) { | ||
this.classifier = classifier; | ||
} | ||
|
||
public ClassificationResult classify(String data) throws ClassifierResultException { | ||
Classification classification = classifier.classify(data); | ||
return new ClassificationResult(classification.bestCategory(), 1.0); | ||
} | ||
|
||
public static TextClassifier createFromObjectStream(ObjectInputStream inputStream) | ||
throws IOException, ClassNotFoundException { | ||
LMClassifier classifier = (LMClassifier) inputStream.readObject(); | ||
return new TextClassifier(classifier); | ||
} | ||
} |
19 changes: 18 additions & 1 deletion
19
...nalysis/src/main/java/TweetCollector.java → .../cask/tigon/sentiment/TweetCollector.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file added
BIN
+54.3 KB
tigon-examples/SentimentAnalysis/src/main/resources/java_trained_classifier.txt
Binary file not shown.
26 changes: 21 additions & 5 deletions
26
tigon-examples/SentimentAnalysis/src/main/resources/twitter4j.properties
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,21 @@ | ||
oauth.consumerKey=guzpQsLtdKs0jlap64nY1nX4N | ||
oauth.consumerSecret=z7Ux5TPQyeOXd8xlXmm87V3qxi1vABSV9NFVelDaCus8m39tIe | ||
oauth.accessToken=366210197-mpzoVZgENXzrEnVXvgdOqoDkCv55m2M5IYSp4ouv | ||
oauth.accessTokenSecret=Q9ST3W4d68KgBnIKmQYWgFaBdCBrVsQXtE54ol8UhudoL | ||
debug=false | ||
# | ||
# Copyright © 2014 Cask Data, Inc. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); you may not | ||
# use this file except in compliance with the License. You may obtain a copy of | ||
# the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | ||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | ||
# License for the specific language governing permissions and limitations under | ||
# the License. | ||
# | ||
|
||
# NOTE(review): the four oauth.* values below are credentials committed in plain text. | ||
# They should be supplied from outside version control and rotated; confirm with the owner. | ||
oauth.consumerKey=5bPCAbHLyeNIzGbaCyFZIQMMl | ||
oauth.consumerSecret=OTH2It9nYhyqolWIi0068v7Oa1WLXviW3swT7DpYJTCxD9mx6B | ||
oauth.accessToken=2827531970-P8HdLEm8bYusoXXMM0bUHivCVqJRQO7wDHlTeRB | ||
oauth.accessTokenSecret=18togr81QeDw9GM7F4OSmo8aSZgLKmoUjfeAJmnKD0445 | ||
debug=false |