Skip to content

Commit

Permalink
Correcting bugs related to floating point errors, refactoring code
Browse files Browse the repository at this point in the history
Floating point errors were caused by a discrepancy in the way Python and
Java handle floats. This has not been fully resolved, but it has been observed
that it does not affect the results much. Most errors were resolved after the
rounding function was rewritten.

The test file created using getNltkVader.py was not handling non-ASCII
values correctly, as a result of which the Java version was giving a different
value. That has been resolved.

While porting the code, the part where idioms are handled was incorrectly
implemented in the Java version. That has been corrected.

Added tests for comparing results of Amazon Reviews, Movie Reviews and
NYTimes Editorial snippets.
  • Loading branch information
Animesh Pandey committed Jan 13, 2017
1 parent e89db58 commit d1d30c4
Show file tree
Hide file tree
Showing 13 changed files with 43,377 additions and 4,277 deletions.
29 changes: 29 additions & 0 deletions plugin.iml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
<?xml version="1.0" encoding="UTF-8"?>
<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
<component name="FacetManager">
<facet type="Python" name="Python">
<configuration sdkName="Python 2.7.11 (E:\Miniconda2\python.exe)" />
</facet>
</component>
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_7" inherit-compiler-output="false">
<output url="file://$MODULE_DIR$/target/classes" />
<output-test url="file://$MODULE_DIR$/target/test-classes" />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
<sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
<sourceFolder url="file://$MODULE_DIR$/src/test/resources" type="java-test-resource" />
<excludeFolder url="file://$MODULE_DIR$/src/test/resources/temp" />
<excludeFolder url="file://$MODULE_DIR$/target" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-analyzers-common:5.5.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-core:5.5.0" level="project" />
<orderEntry type="library" name="Maven: commons-lang:commons-lang:2.6" level="project" />
<orderEntry type="library" name="Maven: log4j:log4j:1.2.17" level="project" />
<orderEntry type="library" name="Maven: junit:junit:4.12" level="project" />
<orderEntry type="library" name="Maven: org.hamcrest:hamcrest-core:1.3" level="project" />
<orderEntry type="library" name="Python 2.7.11 (E:\Miniconda2\python.exe) interpreter library" level="application" />
</component>
</module>
68 changes: 30 additions & 38 deletions src/main/java/com/vader/SentimentAnalyzer.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,6 @@
import org.apache.commons.lang.StringUtils;

import java.io.IOException;
import java.math.RoundingMode;
import java.text.DecimalFormat;
import java.text.ParseException;
import java.util.*;

/**
Expand Down Expand Up @@ -51,9 +48,8 @@ private float valenceModifier(String precedingWord, float currentValence) {
scalar = Utils.BOOSTER_DICTIONARY.get(precedingWordLower);
if (currentValence < 0.0)
scalar *= -1.0;
if (Utils.isUpper(precedingWord) && inputStringProperties.isCapDIff()) {
if (Utils.isUpper(precedingWord) && inputStringProperties.isCapDIff())
scalar = (currentValence > 0.0) ? scalar + Utils.ALL_CAPS_BOOSTER_SCORE : scalar - Utils.ALL_CAPS_BOOSTER_SCORE;
}
}
return scalar;
}
Expand All @@ -74,8 +70,7 @@ private float checkForNever(float currentValence, int startI, int i, int closeTo
if (startI == 1) {
String wordAtDistanceTwoLeft = wordsAndEmoticons.get(i - 2);
String wordAtDistanceOneLeft = wordsAndEmoticons.get(i - 1);
if ((wordAtDistanceTwoLeft.equals("never")) &&
(wordAtDistanceOneLeft.equals("so") || (wordAtDistanceOneLeft.equals("this")))) {
if ((wordAtDistanceTwoLeft.equals("never")) && (wordAtDistanceOneLeft.equals("so") || (wordAtDistanceOneLeft.equals("this")))) {
currentValence *= 1.5f;
} else if (isNegative(new ArrayList<>(Collections.singletonList(wordsAndEmoticons.get(closeTokenIndex))))) {
currentValence *= Utils.N_SCALAR;
Expand Down Expand Up @@ -248,7 +243,10 @@ else if (valence < 0.0f)
neutralSentimentCount += 1;
}
return new ArrayList<>(Arrays.asList(
positiveSentimentScore, negativeSentimentScore, (float) neutralSentimentCount));
positiveSentimentScore,
negativeSentimentScore,
(float) neutralSentimentCount)
);
}

private HashMap<String, Float> polarityScores(ArrayList<Float> currentSentimentState) {
Expand All @@ -269,7 +267,7 @@ else if (totalValence < 0.0f)

float compoundPolarity = normalizeScore(totalValence);

logger.debug(currentSentimentState);
logger.debug("Final token-wise sentiment state: " + currentSentimentState);

ArrayList<Float> siftedScores = siftSentimentScores(currentSentimentState);
float positiveSentimentScore = siftedScores.get(0);
Expand All @@ -288,16 +286,24 @@ else if (positiveSentimentScore < Math.abs(negativeSentimentScore))

logger.debug("Normalization Factor: " + normalizationFactor);

logger.debug(String.format("Pre-Normalized Scores: %s %s %s %s",
Math.abs(positiveSentimentScore),
Math.abs(negativeSentimentScore),
Math.abs(neutralSentimentCount),
compoundPolarity
));

logger.debug(String.format("Pre-Round Scores: %s %s %s %s",
Math.abs(positiveSentimentScore / normalizationFactor),
Math.abs(negativeSentimentScore / normalizationFactor),
Math.abs(neutralSentimentCount / normalizationFactor),
compoundPolarity));
compoundPolarity
));

final float normalizedPositivePolarity = roundScores(Math.abs(positiveSentimentScore / normalizationFactor), 3);
final float normalizedNegativePolarity = roundScores(Math.abs(negativeSentimentScore / normalizationFactor), 3);
final float normalizedNeutralPolarity = roundScores(Math.abs(neutralSentimentCount / normalizationFactor), 3);
final float normalizedCompoundPolarity = roundScores(compoundPolarity, 4);
final float normalizedPositivePolarity = roundDecimal(Math.abs(positiveSentimentScore / normalizationFactor), 3);
final float normalizedNegativePolarity = roundDecimal(Math.abs(negativeSentimentScore / normalizationFactor), 3);
final float normalizedNeutralPolarity = roundDecimal(Math.abs(neutralSentimentCount / normalizationFactor), 3);
final float normalizedCompoundPolarity = roundDecimal(compoundPolarity, 4);

return new HashMap<String, Float>() {{
put("compound", normalizedCompoundPolarity);
Expand Down Expand Up @@ -381,7 +387,8 @@ private boolean hasNegativeWord(ArrayList<String> tokenList, ArrayList<String> n
/**
 * Checks whether a token list contains a negation, using extra negation
 * words supplied by the caller in addition to {@code Utils.NEGATIVE_WORDS}.
 * <p>
 * The list passed as {@code newNegWords} is not modified; the combined word
 * list is built on a private copy.
 *
 * @param tokenList         tokens to inspect
 * @param newNegWords       additional negation words to look for
 * @param checkContractions when {@code true}, the result of
 *                          {@code hasContraction(tokenList)} is also taken
 *                          into account; when {@code false} it is skipped
 * @return {@code true} if {@code hasNegativeWord} or {@code hasAtLeast}
 * reports a match, or (when requested) {@code hasContraction} does
 */
private boolean isNegative(ArrayList<String> tokenList, ArrayList<String> newNegWords, boolean checkContractions) {
    // Copy first: appending to the caller's list would grow it on every call.
    ArrayList<String> negationWords = new ArrayList<>(newNegWords);
    negationWords.addAll(Utils.NEGATIVE_WORDS);

    boolean result = hasNegativeWord(tokenList, negationWords) || hasAtLeast(tokenList);
    // The flag enables the contraction check; previously the condition was
    // inverted, so passing true skipped the check it was meant to request.
    if (!checkContractions) {
        return result;
    }
    return result || hasContraction(tokenList);
}

Expand All @@ -394,34 +401,19 @@ private boolean isNegative(ArrayList<String> tokenList) {
return hasNegativeWord(tokenList, Utils.NEGATIVE_WORDS) || hasAtLeast(tokenList) || hasContraction(tokenList);
}

private Float normalizeScore(Float score, Float alpha) {
double normalizedScore = score / Math.sqrt((score.doubleValue() * score.doubleValue()) + alpha.doubleValue());
private float normalizeScore(float score, float alpha) {
double normalizedScore = score / Math.sqrt((score * score) + alpha);
return (float) normalizedScore;
}

private Float normalizeScore(Float score) {
double normalizedScore = score / Math.sqrt((score.doubleValue() * score.doubleValue()) + 15.0);
private float normalizeScore(float score) {
double normalizedScore = score / Math.sqrt((score * score) + 15.0f);
return (float) normalizedScore;
}

/**
 * Rounds a score to a fixed number of decimal places by formatting it with a
 * {@link DecimalFormat} pattern and parsing the formatted text back.
 * <p>
 * Rounding follows the {@code DecimalFormat} default rounding mode, which the
 * JDK documents as {@code RoundingMode.HALF_EVEN}.
 *
 * @param currentScore value to round
 * @param roundTo      number of digits to keep after the decimal point
 * @return the rounded value, or {@code currentScore} unchanged if the
 * formatted text cannot be parsed back
 */
private static float roundScores(float currentScore, int roundTo) {
    StringBuilder pattern = new StringBuilder("##.");
    for (int i = 0; i < roundTo; i++) {
        pattern.append('0');
    }
    DecimalFormat df = new DecimalFormat(pattern.toString());
    try {
        // parse() yields a Long for whole-number text such as "1.000" and a
        // Double otherwise; go through Number so both cases are handled.
        // Casting the result straight to double failed for the Long case and
        // silently returned the unrounded input.
        Number parsed = df.parse(df.format(currentScore));
        return parsed.floatValue();
    } catch (ParseException e) {
        // Best effort: keep the caller's value when the round trip fails.
        return currentScore;
    }
}

public static void main(String[] args) {
System.out.println(roundScores(0.3125f, 3));
/**
 * Rounds a value to the requested number of decimal places by scaling it up
 * by a power of ten, rounding to the nearest integer with
 * {@link Math#round(float)}, and scaling back down.
 *
 * @param currentValue value to round
 * @param roundTo      number of digits to keep after the decimal point
 * @return {@code currentValue} rounded to {@code roundTo} decimal places
 */
private static float roundDecimal(float currentValue, int roundTo) {
    final float scale = (float) Math.pow(10.0, roundTo);
    final int scaledAndRounded = Math.round(currentValue * scale);
    return scaledAndRounded / scale;
}
}
23 changes: 0 additions & 23 deletions src/test/VaderTest/getNltkVader.py

This file was deleted.

70 changes: 54 additions & 16 deletions src/test/java/com/vader/sentiment/TestNLTKTweets.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,48 +4,86 @@
import org.junit.Assert;
import org.junit.Test;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URL;
import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;


/**
* @author Animesh Pandey
* Created on 4/14/2016.
*/
public class TestNLTKTweets {

private static final ClassLoader loader = TestNLTKTweets.class.getClassLoader();
// Names of the ground-truth TSV resources that the test iterates over.
private static List<String> testFiles = new ArrayList<>();

static {
    final String[] groundTruthFiles = {
        "amazonReviewSnippets_GroundTruth_vader.tsv",
        "movieReviewSnippets_GroundTruth_vader.tsv",
        "nytEditorialSnippets_GroundTruth_vader.tsv",
        "tweets_GroundTruth_vader.tsv"
    };
    for (String groundTruthFile : groundTruthFiles) {
        testFiles.add(groundTruthFile);
    }
}

@Test
public void readGroundTruth() {
URL gtFile = loader.getResource("tweets_GroundTruth_Vader.tsv");
if (gtFile != null) {
try (BufferedReader br = new BufferedReader(new FileReader(gtFile.getFile()))) {
for (String fileName : testFiles) {
InputStream inputStream = loader.getResourceAsStream(fileName);
try (BufferedReader br = new BufferedReader(new InputStreamReader(inputStream))) {
String line;
while ((line = br.readLine()) != null) {
String[] gtFileData = line.split("\\t");

float negativeScore = Float.parseFloat(gtFileData[1]);
float neutralScore = Float.parseFloat(gtFileData[2]);
float positiveScore = Float.parseFloat(gtFileData[3]);
float compoundScore = Float.parseFloat(gtFileData[4]);
float expectedNegativeScore = Float.parseFloat(gtFileData[1]);
float expectedNeutralScore = Float.parseFloat(gtFileData[2]);
float expectedPositiveScore = Float.parseFloat(gtFileData[3]);
float expectedCompoundScore = Float.parseFloat(gtFileData[4]);
String inputString = gtFileData[5];

SentimentAnalyzer sentimentAnalyzer = new SentimentAnalyzer(inputString);
sentimentAnalyzer.analyse();

HashMap<String, Float> inputStringPolarity = sentimentAnalyzer.getPolarity();
float actualNegativeScore = inputStringPolarity.get("negative");
float actualPositiveScore = inputStringPolarity.get("positive");
float actualNeutralScore = inputStringPolarity.get("neutral");
float actualCompoundScore = inputStringPolarity.get("compound");

Assert.assertTrue(inputStringPolarity.get("negative") == negativeScore);
Assert.assertTrue(inputStringPolarity.get("positive") == positiveScore);
Assert.assertTrue(inputStringPolarity.get("neutral") == neutralScore);
Assert.assertTrue(inputStringPolarity.get("compound") == compoundScore);
Assert.assertFalse(error(actualNegativeScore, expectedNegativeScore));
Assert.assertFalse(error(actualPositiveScore, expectedPositiveScore));
Assert.assertFalse(error(actualNeutralScore, expectedNeutralScore));
Assert.assertFalse(error(actualCompoundScore, expectedCompoundScore));
}
} catch (IOException e) {
e.printStackTrace();
}
System.out.println("Test Passed for " + fileName);
}
}

/**
 * Counts the characters that follow the decimal point in
 * {@link Float#toString(float)} of the value's magnitude.
 * <p>
 * NOTE(review): for magnitudes that {@code Float.toString} renders in
 * scientific notation (e.g. "1.0E-4") the count also includes the exponent
 * characters, so it is only a digit count for plainly rendered values.
 *
 * @param value the number to inspect; its sign is ignored
 * @return number of characters after the first '.' in the rendered text
 */
private int noOfDecimalDigits(float value) {
    String text = Float.toString(Math.abs(value));
    return text.length() - text.indexOf('.') - 1;
}

/**
 * Decides whether two scores differ by more than one unit in the last
 * decimal place.
 * <p>
 * Because of floating point precision and rounding differences, a value
 * such as 0.0345 from NLTK may come out as 0.0344 or 0.0346 in Java. Both
 * values are therefore scaled by ten to the power of the larger decimal
 * digit count and compared as whole units; a gap of at most one unit is
 * tolerated.
 * <p>
 * error(0.0345, 0.0344) => false
 * error(0.0345, 0.0346) => false
 * error(0.0345, 0.0348) => true
 *
 * @param actual     the score computed by this implementation
 * @param experiment the reference score it is compared against
 * @return true iff the two values are more than one last-place unit apart
 */
private boolean error(float actual, float experiment) {
    int maxPlaces = Math.max(noOfDecimalDigits(actual), noOfDecimalDigits(experiment));
    // Scale by 10^maxPlaces, not by maxPlaces itself: multiplying by the
    // digit count left the difference far below 1 and hid real mismatches.
    double scale = Math.pow(10.0, maxPlaces);
    // Rounding to whole units absorbs the float-to-double representation noise.
    long actualUnits = Math.round(actual * scale);
    long experimentUnits = Math.round(experiment * scale);
    // Signed comparison, so scores of opposite sign are not treated as equal.
    return Math.abs(actualUnits - experimentUnits) > 1;
}
}
Loading

0 comments on commit d1d30c4

Please sign in to comment.