Skip to content

Commit

Permalink
Luchen's code to dump out docids of docs in the index.
Browse files Browse the repository at this point in the history
  • Loading branch information
jimmy0017 committed Oct 23, 2015
2 parents 14dfe45 + 048b407 commit 491dd9e
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 0 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,13 @@ sh target/appassembler/bin/TweetSearcher -index twitter-index
```

The demo starts up an HTTP server on port `8080`, but this can be changed with the `-port` option. Query via a web browser at `http://localhost:8080/search?query=query`. Try `birthday`, as there are always birthdays being celebrated.

### IndexCounter:

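Outputs the document IDs (the stored `docname` field) of all documents in a Lucene index, one ID per line. Note that the tool opens the `index` subdirectory of the directory given by `-indexPath`.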

```sh
sh target/appassembler/bin/IndexCounter -indexPath /path/to/index \
-docIdPath /path/to/save/docIds
```
4 changes: 4 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@
<mainClass>io.anserini.nrts.TweetSearcher</mainClass>
<name>TweetSearcher</name>
</program>
<program>
<mainClass>io.anserini.util.IndexCounter</mainClass>
<name>IndexCounter</name>
</program>
</programs>
</configuration>
</plugin>
Expand Down
51 changes: 51 additions & 0 deletions src/main/java/io/anserini/util/IndexCounter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package io.anserini;

import java.io.File;
import java.io.IOException;
import java.io.BufferedWriter;
import java.io.FileWriter;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.FSDirectory;

/**
 * Command-line utility that dumps the {@code docname} stored field of the
 * documents in a Lucene index to a text file, one value per line.
 *
 * <p>NOTE(review): the build registers this program as
 * {@code io.anserini.util.IndexCounter}, while this file declares package
 * {@code io.anserini} -- confirm which one is intended.
 */
public class IndexCounter {

  /** Progress is reported whenever the low 16 bits of the doc number are zero (every 65536 docs). */
  private static final int PROGRESS_MASK = 65535;

  /**
   * Reads documents from the index and writes each document's {@code docname}
   * field to the output file, followed by a newline.
   *
   * <p>The index directory, the index reader and the output writer are all
   * closed when this method returns, whether normally or via an exception.
   *
   * @param indexDir path of the Lucene index directory to open
   * @param docIdPath path of the text file to (over)write with the IDs
   * @throws IOException if the index cannot be opened or read, or the output
   *     file cannot be written
   */
  public void readIndex(String indexDir, String docIdPath) throws IOException {
    // Resources are acquired in the same order as before (index first, then
    // the output file) so a bad index path does not create an output file.
    try (FSDirectory dir = FSDirectory.open(new File(indexDir).toPath());
         DirectoryReader reader = DirectoryReader.open(dir);
         BufferedWriter bw = new BufferedWriter(new FileWriter(new File(docIdPath)))) {
      // NOTE(review): numDocs() is used as the upper bound of the doc number
      // range; if the index can contain deleted documents this may need to be
      // maxDoc() combined with a live-docs check -- confirm against the
      // Lucene version in use.
      final int len = reader.numDocs();
      for (int i = 0; i < len; i++) {
        // A document without a "docname" field is written as the text "null".
        String docName = reader.document(i).get("docname");
        bw.write(docName + "\n");
        if ((i & PROGRESS_MASK) == 0) {
          System.out.println("IndexCounter: " + i + " docs got");
        }
      }
    }
  }

  /**
   * Entry point. Expects {@code -indexPath} (the directory that contains an
   * {@code index} subdirectory) and {@code -docIdPath} (the output file).
   *
   * @param clArgs raw command-line arguments
   */
  public static void main(String[] clArgs) {
    Args args = new Args(clArgs);
    // The Lucene index itself lives in the "index" subdirectory of -indexPath.
    final String indexDir = args.getString("-indexPath") + "/index";
    final String docIdPath = args.getString("-docIdPath");

    args.check();

    System.out.println("Index path: " + indexDir);
    System.out.println("DocId path: " + docIdPath);
    final IndexCounter ic = new IndexCounter();
    try {
      ic.readIndex(indexDir, docIdPath);
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}

0 comments on commit 491dd9e

Please sign in to comment.