Permalink
Browse files

- Added target for the nearest neighbor finder tool

- Reduced the default timeout for long-running tests

- Added a jaccardIndex overload for two sets of elements

- Fixed javadoc

- Made class serializable

- Added support for maximum path length

- Reworked to use the new SimpleDependencyPath class

- Revised error message for clarity

- Updated to use new SimpleDependencyPath features

- Updated to use new SimpleDependencyPath features

- Major rewrite for clarity

- Fixed toString()

- Added static methods for testing the category of particular POS tag

- Implemented missing methods

- Added more imports (not sure why...)

- Added logging

- Added support for shuffling the edges of a graph using a fixed Random for reproducibility

- Multithreaded the edge similarity comparison

- Probably some bug fixes too

- Updated to use IntPair instead of Pair<Integer>

- Updated to use IntSet

- Major overhaul to bring performance in line with the earlier primitive
  collection enhancements

- Probably needs a lot of clean up still

- Updates to use new primitive collections

- Updates to use new primitive collections

- Added support for clustering to a fixed number of clusters

- Added (limited) support for writing in GEXF format

- Added (limited) support for writing in Pajek format

- Made default output verbose

- Remove extra temporary file that was just hanging around unneeded

- Added support for getting the String that backs the document

- Added a new class for testing the association between two terms, which is
  already partially supported by the BigramExtractor, but this class supports
  limiting the number of items being associated which enables better scaling
  through iterative association testing.

- This class needs a lot of work.

- Added an iterator for the documents in WaCkypedia

- Fixed HTML bug in javadoc

- Added new tool for running the NearestNeighborFinder from the command line

- Fixed javadoc

- Added support for changing the logging level of any logger namespace

- Added new tool for speeding up repeated nearest-neighbor computations by
  partitioning a SemanticSpace into clusters (using K-means) and then only
  searching a subset.

- Added support for reading and writing to streams and byte[] arrays

- Updated to be an IntSet

- Added Pair implementation

- Fixed iterator remove bug

- Added more extensive real-world unit tests

- Added unit test details that match the example in the Ahn et al. paper

- Added more tests
  • Loading branch information...
1 parent ec8c85f commit dc6cdb6cf315d6fdf7ee782ba4dde0da70ea574a David Jurgens committed Dec 1, 2011
Showing with 3,154 additions and 832 deletions.
  1. +23 −5 build.xml
  2. +17 −1 src/edu/ucla/sspace/common/Similarity.java
  3. +4 −1 src/edu/ucla/sspace/common/VectorMapSemanticSpace.java
  4. +37 −16 src/edu/ucla/sspace/dependency/BreadthFirstPathIterator.java
  5. +2 −1 src/edu/ucla/sspace/dependency/DependencyExtractorManager.java
  6. +55 −17 src/edu/ucla/sspace/dependency/DependencyIterator.java
  7. +8 −23 src/edu/ucla/sspace/dependency/FilteredDependencyIterator.java
  8. +118 −85 src/edu/ucla/sspace/dependency/SimpleDependencyPath.java
  9. +31 −0 src/edu/ucla/sspace/dv/PennTags.java
  10. +13 −14 src/edu/ucla/sspace/graph/DirectedMultigraph.java
  11. +3 −0 src/edu/ucla/sspace/graph/GraphIO.java
  12. +80 −6 src/edu/ucla/sspace/graph/Graphs.java
  13. +96 −58 src/edu/ucla/sspace/graph/LinkClustering.java
  14. +6 −6 src/edu/ucla/sspace/graph/SimpleGraphIterator.java
  15. +4 −12 src/edu/ucla/sspace/graph/SparseDirectedTypedEdgeSet.java
  16. +625 −112 src/edu/ucla/sspace/graph/SparseTypedEdgeSet.java
  17. +13 −5 src/edu/ucla/sspace/graph/SubgraphIterator.java
  18. +429 −445 src/edu/ucla/sspace/graph/UndirectedMultigraph.java
  19. +17 −4 src/edu/ucla/sspace/graph/WeightedLinkClustering.java
  20. +261 −0 src/edu/ucla/sspace/graph/io/GexfIO.java
  21. +94 −0 src/edu/ucla/sspace/graph/io/PajekIO.java
  22. +2 −2 src/edu/ucla/sspace/mains/GenericMain.java
  23. +1 −0 src/edu/ucla/sspace/matrix/SvdlibjDriver.java
  24. +12 −0 src/edu/ucla/sspace/text/StringDocument.java
  25. +182 −0 src/edu/ucla/sspace/text/TermAssociationFinder.java
  26. +125 −0 src/edu/ucla/sspace/text/WaCkypediaDocumentIterator.java
  27. +1 −1 src/edu/ucla/sspace/tools/BigramExtractor.java
  28. +182 −0 src/edu/ucla/sspace/tools/NearestNeighborFinderTool.java
  29. +2 −1 src/edu/ucla/sspace/util/Counter.java
  30. +19 −0 src/edu/ucla/sspace/util/LoggerUtil.java
  31. +438 −0 src/edu/ucla/sspace/util/NearestNeighborFinder.java
  32. +72 −6 src/edu/ucla/sspace/util/SerializableUtil.java
  33. +9 −5 src/edu/ucla/sspace/util/primitive/CompactIntSet.java
  34. +67 −0 src/edu/ucla/sspace/util/primitive/IntPair.java
  35. +1 −1 src/edu/ucla/sspace/util/primitive/TroveIntSet.java
  36. +86 −2 test/edu/ucla/sspace/dependency/BreadthFirstPathIteratorTest.java
  37. +12 −0 test/edu/ucla/sspace/graph/LinkClusteringTests.java
  38. +7 −3 test/edu/ucla/sspace/graph/UndirectedMultigraphTests.java
View
@@ -204,8 +204,7 @@
</artifact:mvn>
</target>
- <target name="tools" depends="sse-jar,svd-jar,tc-jar">
- </target>
+ <target name="tools" depends="sse-jar,svd-jar,tc-jar,nnf-jar"/>
<!--
**
@@ -217,7 +216,7 @@
<target name="jar" depends="compile">
- <jar destfile="${bin.dir}/sspace-lib.jar" basedir="classes">
+ <jar destfile="${dist.dir}/sspace-lib.jar" basedir="classes">
<include name="**/*.class"/>
<exclude name="jnt/*"/>
<manifest>
@@ -521,6 +520,21 @@
</jar>
</target>
+ <target name="nnf-jar" depends="compile">
+ <jar destfile="${tools.dir}/nnf.jar" basedir="classes">
+ <include name="**/*.class"/>
+ <manifest>
+ <!-- Who is building this jar? -->
+ <attribute name="Built-By" value="${user.name}"/>
+ <!-- Information about the program itself -->
+ <attribute name="Implementation-Vendor" value="AIRhead Research"/>
+ <attribute name="Implementation-Title" value="SVD"/>
+ <attribute name="Implementation-Version" value="${version}"/>
+ <attribute name="Main-Class" value="edu.ucla.sspace.tools.NearestNeighborFinderTool"/>
+ </manifest>
+ </jar>
+ </target>
+
<!--
**
**
@@ -567,11 +581,15 @@
<cobertura-report srcdir="${src.dir}" destdir="${coverage.html.dir}" />
</target>
+ <property name="junit.default.timeout" value="5000" /> <!-- 5 minute default timeout is 300000 -->
+
<target name="test" depends="compile-tests">
<delete dir="reports"/>
<mkdir dir="reports"/>
- <junit printsummary="yes" fork="yes" haltonfailure="no">
+ <junit printsummary="yes" fork="yes" haltonfailure="no" timeout="5000">
+ <!-- <sysproperty key="junit.default.timeout" value="${junit.default.timeout}" />-->
+
<jvmarg value="-Xmx1g"/>
<jvmarg value="-debug"/>
<classpath location="${build.instrumented.dir}"/>
@@ -587,7 +605,7 @@
<include name="**/*Tests.java"/>
</fileset>
</batchtest>
-
+ <!-- -->
</junit>
</target>
@@ -878,6 +878,21 @@ public static double euclideanSimilarity(Vector a, Vector b) {
/**
* Computes the <a href="http://en.wikipedia.org/wiki/Jaccard_index">Jaccard
+ * index</a> of the two sets of elements.
+ */
+ public static double jaccardIndex(Set<?> a, Set<?> b) {
+ int intersection = 0;
+ for (Object o : a) {
+ if (b.contains(o))
+ intersection++;
+ }
+
+ double union = a.size() + b.size() - intersection;
+ return intersection / union;
+ }
+
+ /**
+ * Computes the <a href="http://en.wikipedia.org/wiki/Jaccard_index">Jaccard
* index</a> comparing the similarity both arrays when viewed as sets of
* samples.
*/
@@ -2264,7 +2279,8 @@ else if (a instanceof SparseVector && b instanceof SparseVector) {
}
/**
- * Returns the cosine similarity of the two {@code IntegerVector} instances
+ * Returns the Tanimoto Coefficient of the two {@code IntegerVector}
+ * instances
*
* @throws IllegalArgumentException when the length of the two vectors are
* not the same.
@@ -62,7 +62,10 @@
*
* @author Keith Stevens
*/
-public class VectorMapSemanticSpace<T extends Vector> implements SemanticSpace {
+public class VectorMapSemanticSpace<T extends Vector>
+ implements SemanticSpace, java.io.Serializable {
+
+ private static final long serialVersionUID = 1L;
private static final Logger LOGGER =
Logger.getLogger(VectorMapSemanticSpace.class.getName());
@@ -50,49 +50,70 @@
* The paths that have been expanded from the starting node but have not yet
* been returned.
*/
- protected final Queue<DependencyPath> frontier;
-
+ protected final Queue<SimpleDependencyPath> frontier;
+
+ /**
+ * The maximum length of any path returned by the iterator.
+ */
+ private final int maxPathLength;
+
/**
* Creates a new iterator over all the paths starting at the provided index.
*
* @param startNode the node that will start all the paths to be generated.
*/
public BreadthFirstPathIterator(DependencyTreeNode startNode) {
- frontier = new ArrayDeque<DependencyPath>();
+ this(startNode, Integer.MAX_VALUE);
+ }
+
+ /**
+ * Creates a new iterator over all the paths starting at the provided index
+ * that will only return paths up to the specified maximum length.
+ *
+ * @param startNode the node that will start all the paths to be generated.
+ * @param maxPathLength the maximum path length to return
+ *
+ * @throws IllegalArgumentException if {@code maxPathLength} is &lt; 1.
+ */
+ public BreadthFirstPathIterator(DependencyTreeNode startNode,
+ int maxPathLength) {
+ if (maxPathLength < 1)
+ throw new IllegalArgumentException(
+ "Must specify a path length greater than or equal to 1");
+ this.maxPathLength = maxPathLength;
+ frontier = new ArrayDeque<SimpleDependencyPath>();
// Base-case: find all the paths of length 1
for (DependencyRelation rel : startNode.neighbors()) {
// Orient the path depending on whether the root was the head of the
// relationship or not. This ensures that the root is always the
// first node in the path and any expansion will continue away from
// the root.
- SimpleDependencyPath p = new SimpleDependencyPath(
- Collections.singletonList(rel),
- rel.headNode().equals(startNode));
- frontier.offer(p);
+ frontier.offer(new SimpleDependencyPath(
+ rel, rel.headNode().equals(startNode)));
}
}
/**
* Expands the breadth-first frontier by adding all the new paths one link
* away to the end of {@code frontier}.
*/
- /* package-private */ void advance(DependencyPath path) {
+ /* package-private */ void advance(SimpleDependencyPath path) {
+ if (path.length() >= maxPathLength)
+ return;
+
// Get the last node and last relation to decide how to expand.
- DependencyTreeNode last = path.last();
- DependencyRelation lastRel = path.lastRelation();
+ DependencyRelation lastRelation = path.lastRelation();
+ DependencyTreeNode last = path.last();
// Expand all of the possible relations from the last node, creating a
// new path for each, except if the relation is the one that generated
// this path.
for (DependencyRelation rel : last.neighbors()) {
// Skip re-adding the current relation
- if (lastRel.equals(rel))
+ if (lastRelation.equals(rel))
continue;
- // Use an extension of the path, rather than having to copy all of
- // the nodes again. This just creates a view of path with rel as
- // the last relation in path
- DependencyPath extended = new ExtendedPathView(path, rel);
+ SimpleDependencyPath extended = path.extend(rel);
frontier.offer(extended);
}
}
@@ -109,7 +130,7 @@ public boolean hasNext() {
* or greater than the previously returned path.
*/
public DependencyPath next() {
- DependencyPath p = frontier.remove();
+ SimpleDependencyPath p = frontier.remove();
// Expand the frontier 1 link starting from the current path
advance(p);
return p;
@@ -123,7 +123,8 @@ public static synchronized DependencyExtractor getExtractor(String name) {
*/
public static synchronized DependencyExtractor getDefaultExtractor() {
if (defaultExtractor == null)
- throw new IllegalStateException("No extractors available");
+ throw new IllegalStateException(
+ "No DependencyExtractors available.");
return defaultExtractor;
}
}
@@ -21,6 +21,7 @@
package edu.ucla.sspace.dependency;
+import java.util.ArrayDeque;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
@@ -38,7 +39,7 @@
*
* Note that this class is <b>NOT</b> thread safe.
*/
-public class DependencyIterator extends BreadthFirstPathIterator {
+public class DependencyIterator implements Iterator<DependencyPath> {
/**
* The maximum length of the returned paths. The length is considedered to
@@ -47,6 +48,12 @@
private final int maxPathLength;
/**
+ * The paths that have been expanded from the starting node but have not yet
+ * been returned.
+ */
+ protected final Queue<SimpleDependencyPath> frontier;
+
+ /**
* The {@link DependencyRelationAcceptor} that validates each link before it is
* traversed and returned as part of a {@link DependencyPath}.
*/
@@ -68,42 +75,73 @@
public DependencyIterator(DependencyTreeNode startNode,
DependencyRelationAcceptor acceptor,
int maxPathLength) {
- super(startNode);
if (maxPathLength < 1)
throw new IllegalArgumentException(
- "Must specify a path length greater than 1");
-
- this.acceptor = acceptor;
+ "Must specify a path length greater than or equal to 1");
this.maxPathLength = maxPathLength;
+ this.acceptor = acceptor;
+ frontier = new ArrayDeque<SimpleDependencyPath>();
+
+ // Base-case: find all the paths of length 1
+ for (DependencyRelation rel : startNode.neighbors()) {
+ // Orient the path depending on whether the root was the head of the
+ // relationship or not. This ensures that the root is always the
+ // first node in the path and any expansion will continue away from
+ // the root.
+ if (acceptor.accept(rel)) {
+ frontier.offer(new SimpleDependencyPath(
+ rel, rel.headNode().equals(startNode)));
+ }
+ }
}
/**
* Extends the path in its growth direction and adds to the frontier those
* relations that are shorter than the maximum path length and that are
* accepted by the {@code DependencyRelationAcceptor} used by this instance.
*/
- @Override void advance(DependencyPath path) {
- // Skip processing paths that would exceed the maximum length
- if (path.length() == maxPathLength)
+ void advance(SimpleDependencyPath path) {
+ if (path.length() >= maxPathLength)
return;
// Get the last node and last relation to decide how to expand.
- DependencyTreeNode last = path.last();
- DependencyRelation lastRel = path.lastRelation();
+ DependencyRelation lastRelation = path.lastRelation();
+ DependencyTreeNode last = path.last();
// Expand all of the possible relations from the last node, creating a
// new path for each, except if the relation is the one that generated
// this path.
for (DependencyRelation rel : last.neighbors()) {
- // Skip re-adding the current relation and those relations that do
- // not pass the filter
- if (lastRel.equals(rel) || !acceptor.accept(rel))
+ // Skip re-adding the current relation
+ if (lastRelation.equals(rel) || !acceptor.accept(rel))
continue;
- // Use an extension of the path, rather than having to copy all of
- // the nodes again. This just creates a view of path with rel as
- // the last relation in path
- DependencyPath extended = new ExtendedPathView(path, rel);
+ SimpleDependencyPath extended = path.extend(rel);
frontier.offer(extended);
}
}
+
+ /**
+ * Returns {@code true} if there are still paths to return for the tree.
+ */
+ public boolean hasNext() {
+ return !frontier.isEmpty();
+ }
+
+ /**
+ * Returns the next {@code DependencyPath} in the tree whose length is equal
+ * or greater than the previously returned path.
+ */
+ public DependencyPath next() {
+ SimpleDependencyPath p = frontier.remove();
+ // Expand the frontier 1 link starting from the current path
+ advance(p);
+ return p;
+ }
+
+ /**
+ * Throws an {@code UnsupportedOperationException} if called
+ */
+ public void remove() {
+ throw new UnsupportedOperationException("Removal is not possible");
+ }
}
@@ -42,12 +42,6 @@
public class FilteredDependencyIterator implements Iterator<DependencyPath> {
/**
- * The maximum length of the returned paths. The length is considedered to
- * not include the first term.
- */
- private final int maxPathLength;
-
- /**
* The {@link DependencyPathAcceptor} that validates each link before it is
* traversed and returned as part of a {@link DependencyPath}.
*/
@@ -89,19 +83,15 @@ public FilteredDependencyIterator(DependencyTreeNode startNode,
* @param startNode the node that will start all the paths to be generated.
* @param acceptor The {@link DependencyPathAcceptor} that will validate
* the paths returned by this iterator
- * @param maxPathLength the maximum number of nodes in any path
+ * @param maxPathLength the maximum number of relations in any path
*
* @throws IllegalArgumentException if {@code maxPathLength} is less than 1
*/
public FilteredDependencyIterator(DependencyTreeNode startNode,
DependencyPathAcceptor acceptor,
int maxPathLength) {
- if (maxPathLength < 1)
- throw new IllegalArgumentException(
- "Must specify a path length greater than 1");
- this.iterator = new BreadthFirstPathIterator(startNode);
+ this.iterator = new BreadthFirstPathIterator(startNode, maxPathLength);
this.acceptor = acceptor;
- this.maxPathLength = maxPathLength;
advance();
}
@@ -110,20 +100,15 @@ public FilteredDependencyIterator(DependencyTreeNode startNode,
* the value to {@code null} if no further paths exist.
*/
private void advance() {
- DependencyPath p = null;
+ next = null;
// While the underlying iterator has paths, check whether any are
- // accepted by the filter. If a path is over the maximum path length,
- // break, since no further returned paths will be smaller.
- while (iterator.hasNext()) {
- p = iterator.next();
- if (p.length() > maxPathLength) {
- p = null;
- break;
- } else if (acceptor.accepts(p))
- break;
+ // accepted by the filter.
+ while (iterator.hasNext() && next == null) {
+ DependencyPath p = iterator.next();
+ if (acceptor.accepts(p))
+ next = p;
}
- next = p;
}
/**
Oops, something went wrong.

0 comments on commit dc6cdb6

Please sign in to comment.