diff --git a/pom.xml b/pom.xml index e66f27f94f..a002094962 100644 --- a/pom.xml +++ b/pom.xml @@ -1,4 +1,5 @@ - + 4.0.0 anserini ingester @@ -71,11 +72,13 @@ + org.apache.lucene lucene-benchmark - ${LUCENE_VERSION} + ${LUCENE_VERSION} + org.apache.lucene lucene-core @@ -87,16 +90,13 @@ commons-logging 1.2 + org.apache.httpcomponents httpclient 4.5.1 - - commons-cli - commons-cli - 1.2 - + org.apache.commons commons-lang3 @@ -126,6 +126,7 @@ log4j-api 2.4 + org.apache.logging.log4j log4j-core @@ -137,20 +138,24 @@ gson 2.4 + org.twitter4j twitter4j-stream 4.0.4 + com.twitter twitter-text 1.13.0 + com.google.guava guava 19.0-rc2 + diff --git a/src/main/java/io/anserini/document/ClueWeb09WarcRecord.java b/src/main/java/io/anserini/document/ClueWeb09WarcRecord.java index 63f7ada340..461fd5d1e9 100644 --- a/src/main/java/io/anserini/document/ClueWeb09WarcRecord.java +++ b/src/main/java/io/anserini/document/ClueWeb09WarcRecord.java @@ -46,579 +46,579 @@ public final class ClueWeb09WarcRecord { - public static String WARC_VERSION = "WARC/0.18"; - public static String WARC_VERSION_LINE = "WARC/0.18\n"; - private static String NEWLINE = "\n"; - - private static byte MASK_THREE_BYTE_CHAR = (byte) (0xE0); - private static byte MASK_TWO_BYTE_CHAR = (byte) (0xC0); - private static byte MASK_TOPMOST_BIT = (byte) (0x80); - private static byte MASK_BOTTOM_SIX_BITS = (byte) (0x1F); - private static byte MASK_BOTTOM_FIVE_BITS = (byte) (0x3F); - private static byte MASK_BOTTOM_FOUR_BITS = (byte) (0x0F); - private WarcHeader warcHeader = new WarcHeader(); - private byte[] warcContent = null; - private String warcFilePath = ""; - - /** - * Default Constructor - */ - public ClueWeb09WarcRecord() { - } - - /** - * Copy Constructor - * - * @param o - */ - public ClueWeb09WarcRecord(ClueWeb09WarcRecord o) { - this.warcHeader = new WarcHeader(o.warcHeader); - this.warcContent = o.warcContent; - } - - /** - * Our read line implementation. We cannot allow buffering here (for gzip - * streams) so, we need to use DataInputStream. Also - we need to account - * for java's UTF8 implementation - * - * @param in the input data stream - * @return the read line (or null if eof) - * @throws java.io.IOException - */ - public static String readLineFromInputStream(DataInputStream in) throws IOException { - StringBuilder retString = new StringBuilder(); - - boolean keepReading = true; - try { - do { - char thisChar = 0; - byte readByte = in.readByte(); - - // check to see if it's a multibyte character - if ((readByte & MASK_THREE_BYTE_CHAR) == MASK_THREE_BYTE_CHAR) { - // need to read the next 2 bytes - if (in.available() < 2) { - // treat these all as individual characters - retString.append((char) readByte); - int numAvailable = in.available(); - for (int i = 0; i < numAvailable; i++) { - retString.append((char) (in.readByte())); - } - continue; - } - byte secondByte = in.readByte(); - byte thirdByte = in.readByte(); - // ensure the topmost bit is set - if (((secondByte & MASK_TOPMOST_BIT) != MASK_TOPMOST_BIT) - || ((thirdByte & MASK_TOPMOST_BIT) != MASK_TOPMOST_BIT)) { - // treat these as individual characters - retString.append((char) readByte); - retString.append((char) secondByte); - retString.append((char) thirdByte); - continue; - } - int finalVal = (thirdByte & MASK_BOTTOM_FIVE_BITS) + 64 - * (secondByte & MASK_BOTTOM_FIVE_BITS) + 4096 - * (readByte & MASK_BOTTOM_FOUR_BITS); - thisChar = (char) finalVal; - } else if ((readByte & MASK_TWO_BYTE_CHAR) == MASK_TWO_BYTE_CHAR) { - // need to read next byte - if (in.available() < 1) { - // treat this as individual characters - retString.append((char) readByte); - continue; - } - byte secondByte = in.readByte(); - if ((secondByte & MASK_TOPMOST_BIT) != MASK_TOPMOST_BIT) { - retString.append((char) readByte); - retString.append((char) secondByte); - continue; - } - int finalVal = (secondByte & MASK_BOTTOM_FIVE_BITS) + 64 - * (readByte & MASK_BOTTOM_SIX_BITS); - thisChar = (char) finalVal; - } else { - // interpret it as a single byte - thisChar = (char) readByte; - } - - if (thisChar == '\n') { - keepReading = false; - } else { - retString.append(thisChar); - } - } while (keepReading); - } catch (EOFException eofEx) { - return null; - } - - if (retString.length() == 0) { - return ""; - } - - return retString.toString(); - } - - /** - * The actual heavy lifting of reading in the next WARC record - * - * @param in the data input stream - * @param headerBuffer a blank string buffer to contain the WARC header - * @return the content byts (w/ the headerBuffer populated) - * @throws java.io.IOException - */ - private static byte[] readNextRecord(DataInputStream in, StringBuilder headerBuffer) - throws IOException { - if (in == null) { - return null; - } - if (headerBuffer == null) { - return null; - } - - String line = null; - boolean foundMark = false; - boolean inHeader = true; - byte[] retContent = null; - - // cannot be using a buffered reader here!!!! - // just read the header - // first - find our WARC header - while ((!foundMark) && ((line = readLineFromInputStream(in)) != null)) { - if (line.startsWith(WARC_VERSION)) { - foundMark = true; + public static String WARC_VERSION = "WARC/0.18"; + public static String WARC_VERSION_LINE = "WARC/0.18\n"; + private static String NEWLINE = "\n"; + + private static byte MASK_THREE_BYTE_CHAR = (byte) (0xE0); + private static byte MASK_TWO_BYTE_CHAR = (byte) (0xC0); + private static byte MASK_TOPMOST_BIT = (byte) (0x80); + private static byte MASK_BOTTOM_SIX_BITS = (byte) (0x1F); + private static byte MASK_BOTTOM_FIVE_BITS = (byte) (0x3F); + private static byte MASK_BOTTOM_FOUR_BITS = (byte) (0x0F); + private WarcHeader warcHeader = new WarcHeader(); + private byte[] warcContent = null; + private String warcFilePath = ""; + + /** + * Default Constructor + */ + public ClueWeb09WarcRecord() { + } + + /** + * Copy Constructor + * + * @param o + */ + public ClueWeb09WarcRecord(ClueWeb09WarcRecord o) { + this.warcHeader = new WarcHeader(o.warcHeader); + this.warcContent = o.warcContent; + } + + /** + * Our read line implementation. We cannot allow buffering here (for gzip + * streams) so, we need to use DataInputStream. Also - we need to account + * for java's UTF8 implementation + * + * @param in the input data stream + * @return the read line (or null if eof) + * @throws java.io.IOException + */ + public static String readLineFromInputStream(DataInputStream in) throws IOException { + StringBuilder retString = new StringBuilder(); + + boolean keepReading = true; + try { + do { + char thisChar = 0; + byte readByte = in.readByte(); + + // check to see if it's a multibyte character + if ((readByte & MASK_THREE_BYTE_CHAR) == MASK_THREE_BYTE_CHAR) { + // need to read the next 2 bytes + if (in.available() < 2) { + // treat these all as individual characters + retString.append((char) readByte); + int numAvailable = in.available(); + for (int i = 0; i < numAvailable; i++) { + retString.append((char) (in.readByte())); } + continue; + } + byte secondByte = in.readByte(); + byte thirdByte = in.readByte(); + // ensure the topmost bit is set + if (((secondByte & MASK_TOPMOST_BIT) != MASK_TOPMOST_BIT) + || ((thirdByte & MASK_TOPMOST_BIT) != MASK_TOPMOST_BIT)) { + // treat these as individual characters + retString.append((char) readByte); + retString.append((char) secondByte); + retString.append((char) thirdByte); + continue; + } + int finalVal = (thirdByte & MASK_BOTTOM_FIVE_BITS) + 64 + * (secondByte & MASK_BOTTOM_FIVE_BITS) + 4096 + * (readByte & MASK_BOTTOM_FOUR_BITS); + thisChar = (char) finalVal; + } else if ((readByte & MASK_TWO_BYTE_CHAR) == MASK_TWO_BYTE_CHAR) { + // need to read next byte + if (in.available() < 1) { + // treat this as individual characters + retString.append((char) readByte); + continue; + } + byte secondByte = in.readByte(); + if ((secondByte & MASK_TOPMOST_BIT) != MASK_TOPMOST_BIT) { + retString.append((char) readByte); + retString.append((char) secondByte); + continue; + } + int finalVal = (secondByte & MASK_BOTTOM_FIVE_BITS) + 64 + * (readByte & MASK_BOTTOM_SIX_BITS); + thisChar = (char) finalVal; + } else { + // interpret it as a single byte + thisChar = (char) readByte; } - // no WARC mark? - if (!foundMark) { - return null; - } - - // then read to the first newline - // make sure we get the content length here - int contentLength = -1; - boolean foundContentLength = false; - while (!foundContentLength && inHeader && ((line = readLineFromInputStream(in)) != null)) { - if ((line.trim().length() == 0) && foundContentLength) { - inHeader = false; - } else { - headerBuffer.append(line); - headerBuffer.append(NEWLINE); - String[] thisHeaderPieceParts = line.split(":", 2); - if (thisHeaderPieceParts.length == 2) { - if (thisHeaderPieceParts[0].toLowerCase(Locale.US).startsWith("content-length")) { - foundContentLength = true; - try { - contentLength = Integer.parseInt(thisHeaderPieceParts[1].trim()); - } catch (NumberFormatException nfEx) { - contentLength = -1; - } - } - } - } - } - - if (contentLength < 0) { - return null; - } - - // now read the bytes of the content - retContent = new byte[contentLength]; - int totalWant = contentLength; - int totalRead = 0; - while (totalRead < contentLength) { - try { - int numRead = in.read(retContent, totalRead, totalWant); - if (numRead < 0) { - return null; - } else { - totalRead += numRead; - totalWant = contentLength - totalRead; - } // end if (numRead < 0) / else - } catch (EOFException eofEx) { - // resize to what we have - if (totalRead > 0) { - byte[] newReturn = new byte[totalRead]; - System.arraycopy(retContent, 0, newReturn, 0, totalRead); - return newReturn; - } else { - return null; - } - } // end try/catch (EOFException) - } // end while (totalRead < contentLength) - - return retContent; - } - - /** - * Reads in a WARC record from a data input stream - * - * @param in the input stream - * @return a WARC record (or null if eof) - * @throws java.io.IOException - */ - public static ClueWeb09WarcRecord readNextWarcRecord(DataInputStream in) throws IOException { - StringBuilder recordHeader = new StringBuilder(); - byte[] recordContent = readNextRecord(in, recordHeader); - if (recordContent == null) { - return null; - } - - // extract out our header information - String thisHeaderString = recordHeader.toString(); - String[] headerLines = thisHeaderString.split(NEWLINE); - - ClueWeb09WarcRecord retRecord = new ClueWeb09WarcRecord(); - for (int i = 0; i < headerLines.length; i++) { - String[] pieces = headerLines[i].split(":", 2); - if (pieces.length != 2) { - retRecord.addHeaderMetadata(pieces[0], ""); - continue; - } - String thisKey = pieces[0].trim(); - String thisValue = pieces[1].trim(); - - // check for known keys - if (thisKey.equals("WARC-Type")) { - retRecord.setWarcRecordType(thisValue); - } else if (thisKey.equals("WARC-Date")) { - retRecord.setWarcDate(thisValue); - } else if (thisKey.equals("WARC-Record-ID")) { - retRecord.setWarcUUID(thisValue); - } else if (thisKey.equals("Content-Type")) { - retRecord.setWarcContentType(thisValue); - } else { - retRecord.addHeaderMetadata(thisKey, thisValue); - } + if (thisChar == '\n') { + keepReading = false; + } else { + retString.append(thisChar); } - - // set the content - retRecord.setContent(recordContent); - - return retRecord; - } - - /** - * Retrieves the total record length (header and content) - * - * @return total record length - */ - public int getTotalRecordLength() { - int headerLength = warcHeader.toString().length(); - return (headerLength + warcContent.length); + } while (keepReading); + } catch (EOFException eofEx) { + return null; } - /** - * Sets the record content (copy) - * - * @param o record to copy from - */ - public void set(ClueWeb09WarcRecord o) { - this.warcHeader = new WarcHeader(o.warcHeader); - this.warcContent = o.warcContent; + if (retString.length() == 0) { + return ""; } - /** - * Gets the file path from this WARC file (if set) - */ - public String getWarcFilePath() { - return warcFilePath; + return retString.toString(); + } + + /** + * The actual heavy lifting of reading in the next WARC record + * + * @param in the data input stream + * @param headerBuffer a blank string buffer to contain the WARC header + * @return the content byts (w/ the headerBuffer populated) + * @throws java.io.IOException + */ + private static byte[] readNextRecord(DataInputStream in, StringBuilder headerBuffer) + throws IOException { + if (in == null) { + return null; } - - /** - * Sets the warc file path (optional - for use with getWarcFilePath) - * - * @param path - */ - public void setWarcFilePath(String path) { - warcFilePath = path; + if (headerBuffer == null) { + return null; } - /** - * Sets the record type string - * - * @param recordType - */ - public void setWarcRecordType(String recordType) { - warcHeader.recordType = recordType; + String line = null; + boolean foundMark = false; + boolean inHeader = true; + byte[] retContent = null; + + // cannot be using a buffered reader here!!!! + // just read the header + // first - find our WARC header + while ((!foundMark) && ((line = readLineFromInputStream(in)) != null)) { + if (line.startsWith(WARC_VERSION)) { + foundMark = true; + } } - /** - * Sets the content type string - * - * @param contentType - */ - public void setWarcContentType(String contentType) { - warcHeader.contentType = contentType; + // no WARC mark? + if (!foundMark) { + return null; } - /** - * Sets the WARC header date string - * - * @param dateString - */ - public void setWarcDate(String dateString) { - warcHeader.dateString = dateString; + // then read to the first newline + // make sure we get the content length here + int contentLength = -1; + boolean foundContentLength = false; + while (!foundContentLength && inHeader && ((line = readLineFromInputStream(in)) != null)) { + if ((line.trim().length() == 0) && foundContentLength) { + inHeader = false; + } else { + headerBuffer.append(line); + headerBuffer.append(NEWLINE); + String[] thisHeaderPieceParts = line.split(":", 2); + if (thisHeaderPieceParts.length == 2) { + if (thisHeaderPieceParts[0].toLowerCase(Locale.US).startsWith("content-length")) { + foundContentLength = true; + try { + contentLength = Integer.parseInt(thisHeaderPieceParts[1].trim()); + } catch (NumberFormatException nfEx) { + contentLength = -1; + } + } + } + } } - /** - * Sets the WARC uuid string - * - * @param UUID - */ - public void setWarcUUID(String UUID) { - warcHeader.UUID = UUID; + if (contentLength < 0) { + return null; } - /** - * Adds a key/value pair to a WARC header. This is needed to filter out - * known keys - * - * @param key - * @param value - */ - public void addHeaderMetadata(String key, String value) { - // don't allow addition of known keys - if (key.equals("WARC-Type")) { - return; + // now read the bytes of the content + retContent = new byte[contentLength]; + int totalWant = contentLength; + int totalRead = 0; + while (totalRead < contentLength) { + try { + int numRead = in.read(retContent, totalRead, totalWant); + if (numRead < 0) { + return null; + } else { + totalRead += numRead; + totalWant = contentLength - totalRead; + } // end if (numRead < 0) / else + } catch (EOFException eofEx) { + // resize to what we have + if (totalRead > 0) { + byte[] newReturn = new byte[totalRead]; + System.arraycopy(retContent, 0, newReturn, 0, totalRead); + return newReturn; + } else { + return null; } - if (key.equals("WARC-Date")) { - return; - } - if (key.equals("WARC-Record-ID")) { - return; - } - if (key.equals("Content-Type")) { - return; - } - if (key.equals("Content-Length")) { - return; - } - - warcHeader.metadata.put(key, value); + } // end try/catch (EOFException) + } // end while (totalRead < contentLength) + + return retContent; + } + + /** + * Reads in a WARC record from a data input stream + * + * @param in the input stream + * @return a WARC record (or null if eof) + * @throws java.io.IOException + */ + public static ClueWeb09WarcRecord readNextWarcRecord(DataInputStream in) throws IOException { + StringBuilder recordHeader = new StringBuilder(); + byte[] recordContent = readNextRecord(in, recordHeader); + if (recordContent == null) { + return null; } - /** - * Clears all metadata items from a header - */ - public void clearHeaderMetadata() { - warcHeader.metadata.clear(); + // extract out our header information + String thisHeaderString = recordHeader.toString(); + String[] headerLines = thisHeaderString.split(NEWLINE); + + ClueWeb09WarcRecord retRecord = new ClueWeb09WarcRecord(); + for (int i = 0; i < headerLines.length; i++) { + String[] pieces = headerLines[i].split(":", 2); + if (pieces.length != 2) { + retRecord.addHeaderMetadata(pieces[0], ""); + continue; + } + String thisKey = pieces[0].trim(); + String thisValue = pieces[1].trim(); + + // check for known keys + if (thisKey.equals("WARC-Type")) { + retRecord.setWarcRecordType(thisValue); + } else if (thisKey.equals("WARC-Date")) { + retRecord.setWarcDate(thisValue); + } else if (thisKey.equals("WARC-Record-ID")) { + retRecord.setWarcUUID(thisValue); + } else if (thisKey.equals("Content-Type")) { + retRecord.setWarcContentType(thisValue); + } else { + retRecord.addHeaderMetadata(thisKey, thisValue); + } } - /** - * Gets the set of metadata items from the header - */ - public Set> getHeaderMetadata() { - return warcHeader.metadata.entrySet(); + // set the content + retRecord.setContent(recordContent); + + return retRecord; + } + + /** + * Retrieves the total record length (header and content) + * + * @return total record length + */ + public int getTotalRecordLength() { + int headerLength = warcHeader.toString().length(); + return (headerLength + warcContent.length); + } + + /** + * Sets the record content (copy) + * + * @param o record to copy from + */ + public void set(ClueWeb09WarcRecord o) { + this.warcHeader = new WarcHeader(o.warcHeader); + this.warcContent = o.warcContent; + } + + /** + * Gets the file path from this WARC file (if set) + */ + public String getWarcFilePath() { + return warcFilePath; + } + + /** + * Sets the warc file path (optional - for use with getWarcFilePath) + * + * @param path + */ + public void setWarcFilePath(String path) { + warcFilePath = path; + } + + /** + * Sets the record type string + * + * @param recordType + */ + public void setWarcRecordType(String recordType) { + warcHeader.recordType = recordType; + } + + /** + * Sets the content type string + * + * @param contentType + */ + public void setWarcContentType(String contentType) { + warcHeader.contentType = contentType; + } + + /** + * Sets the WARC header date string + * + * @param dateString + */ + public void setWarcDate(String dateString) { + warcHeader.dateString = dateString; + } + + /** + * Sets the WARC uuid string + * + * @param UUID + */ + public void setWarcUUID(String UUID) { + warcHeader.UUID = UUID; + } + + /** + * Adds a key/value pair to a WARC header. This is needed to filter out + * known keys + * + * @param key + * @param value + */ + public void addHeaderMetadata(String key, String value) { + // don't allow addition of known keys + if (key.equals("WARC-Type")) { + return; } - - /** - * Gets a value for a specific header metadata key - * - * @param key - */ - public String getHeaderMetadataItem(String key) { - return warcHeader.metadata.get(key); + if (key.equals("WARC-Date")) { + return; } - - /** - * Sets the byte content for this record - * - * @param content - */ - public void setContent(byte[] content) { - warcContent = content; - warcHeader.contentLength = content.length; + if (key.equals("WARC-Record-ID")) { + return; } - - /** - * Retrieves the byte content for this record - */ - public byte[] getByteContent() { - return warcContent; + if (key.equals("Content-Type")) { + return; } - - /** - * Retrieves the bytes content as a UTF-8 string - */ - public String getContentUTF8() { - return new String(warcContent, StandardCharsets.UTF_8); + if (key.equals("Content-Length")) { + return; } + warcHeader.metadata.put(key, value); + } + + /** + * Clears all metadata items from a header + */ + public void clearHeaderMetadata() { + warcHeader.metadata.clear(); + } + + /** + * Gets the set of metadata items from the header + */ + public Set> getHeaderMetadata() { + return warcHeader.metadata.entrySet(); + } + + /** + * Gets a value for a specific header metadata key + * + * @param key + */ + public String getHeaderMetadataItem(String key) { + return warcHeader.metadata.get(key); + } + + /** + * Sets the byte content for this record + * + * @param content + */ + public void setContent(byte[] content) { + warcContent = content; + warcHeader.contentLength = content.length; + } + + /** + * Retrieves the byte content for this record + */ + public byte[] getByteContent() { + return warcContent; + } + + /** + * Retrieves the bytes content as a UTF-8 string + */ + public String getContentUTF8() { + return new String(warcContent, StandardCharsets.UTF_8); + } + + /** + * Gets the header record type string + */ + public String getHeaderRecordType() { + return warcHeader.recordType; + } + + @Override + public String toString() { + StringBuilder retBuffer = new StringBuilder(); + retBuffer.append(warcHeader.toString()); + retBuffer.append(NEWLINE); + retBuffer.append(warcContent); + return retBuffer.toString(); + } + + /** + * Gets the WARC header as a string + */ + public String getHeaderString() { + return warcHeader.toString(); + } + + /** + * Serialization output + * + * @param out + * @throws java.io.IOException + */ + public void write(DataOutput out) throws IOException { + warcHeader.write(out); + out.write(warcContent); + } + + /** + * Serialization input + * + * @param in + * @throws java.io.IOException + */ + public void readFields(DataInput in) throws IOException { + warcHeader.readFields(in); + int contentLengthBytes = warcHeader.contentLength; + warcContent = new byte[contentLengthBytes]; + in.readFully(warcContent); + } + + public String getDocid() { + return getHeaderMetadataItem("WARC-TREC-ID"); + } + + public String getURL() { + return getHeaderMetadataItem("WARC-Target-URI"); + } + + public String getContent() { + String str = getContentUTF8(); + int i = str.indexOf("Content-Length:"); + int j = str.indexOf("\n", i); + + return str.substring(j + 1); + } + + /** + * Sets the byte content for this record + * + * @param content + */ + public void setContent(String content) { + setContent(content.getBytes()); + } + + public String getDisplayContentType() { + return "text/html"; + } + + /** + * Warc header class + */ + public class WarcHeader { + public String contentType = ""; + public String UUID = ""; + public String dateString = ""; + public String recordType = ""; + public HashMap metadata = new HashMap(); + public int contentLength = 0; + /** - * Gets the header record type string + * Default constructor */ - public String getHeaderRecordType() { - return warcHeader.recordType; - } - - @Override - public String toString() { - StringBuilder retBuffer = new StringBuilder(); - retBuffer.append(warcHeader.toString()); - retBuffer.append(NEWLINE); - retBuffer.append(warcContent); - return retBuffer.toString(); + public WarcHeader() { } /** - * Gets the WARC header as a string + * Copy Constructor + * + * @param o other WARC header */ - public String getHeaderString() { - return warcHeader.toString(); + public WarcHeader(WarcHeader o) { + this.contentType = o.contentType; + this.UUID = o.UUID; + this.dateString = o.dateString; + this.recordType = o.recordType; + this.metadata.putAll(o.metadata); + this.contentLength = o.contentLength; } /** * Serialization output * - * @param out + * @param out the data output stream * @throws java.io.IOException */ public void write(DataOutput out) throws IOException { - warcHeader.write(out); - out.write(warcContent); + out.writeUTF(contentType); + out.writeUTF(UUID); + out.writeUTF(dateString); + out.writeUTF(recordType); + out.writeInt(metadata.size()); + Iterator> metadataIterator = metadata.entrySet().iterator(); + while (metadataIterator.hasNext()) { + Entry thisEntry = metadataIterator.next(); + out.writeUTF(thisEntry.getKey()); + out.writeUTF(thisEntry.getValue()); + } + out.writeInt(contentLength); } /** * Serialization input * - * @param in + * @param in the data input stream * @throws java.io.IOException */ public void readFields(DataInput in) throws IOException { - warcHeader.readFields(in); - int contentLengthBytes = warcHeader.contentLength; - warcContent = new byte[contentLengthBytes]; - in.readFully(warcContent); - } - - public String getDocid() { - return getHeaderMetadataItem("WARC-TREC-ID"); - } - - public String getURL() { - return getHeaderMetadataItem("WARC-Target-URI"); - } - - public String getContent() { - String str = getContentUTF8(); - int i = str.indexOf("Content-Length:"); - int j = str.indexOf("\n", i); - - return str.substring(j + 1); - } - - /** - * Sets the byte content for this record - * - * @param content - */ - public void setContent(String content) { - setContent(content.getBytes()); - } - - public String getDisplayContentType() { - return "text/html"; + contentType = in.readUTF(); + UUID = in.readUTF(); + dateString = in.readUTF(); + recordType = in.readUTF(); + metadata.clear(); + int numMetaItems = in.readInt(); + for (int i = 0; i < numMetaItems; i++) { + String thisKey = in.readUTF(); + String thisValue = in.readUTF(); + metadata.put(thisKey, thisValue); + } + contentLength = in.readInt(); } - /** - * Warc header class - */ - public class WarcHeader { - public String contentType = ""; - public String UUID = ""; - public String dateString = ""; - public String recordType = ""; - public HashMap metadata = new HashMap(); - public int contentLength = 0; - - /** - * Default constructor - */ - public WarcHeader() { - } - - /** - * Copy Constructor - * - * @param o other WARC header - */ - public WarcHeader(WarcHeader o) { - this.contentType = o.contentType; - this.UUID = o.UUID; - this.dateString = o.dateString; - this.recordType = o.recordType; - this.metadata.putAll(o.metadata); - this.contentLength = o.contentLength; - } - - /** - * Serialization output - * - * @param out the data output stream - * @throws java.io.IOException - */ - public void write(DataOutput out) throws IOException { - out.writeUTF(contentType); - out.writeUTF(UUID); - out.writeUTF(dateString); - out.writeUTF(recordType); - out.writeInt(metadata.size()); - Iterator> metadataIterator = metadata.entrySet().iterator(); - while (metadataIterator.hasNext()) { - Entry thisEntry = metadataIterator.next(); - out.writeUTF(thisEntry.getKey()); - out.writeUTF(thisEntry.getValue()); - } - out.writeInt(contentLength); - } - - /** - * Serialization input - * - * @param in the data input stream - * @throws java.io.IOException - */ - public void readFields(DataInput in) throws IOException { - contentType = in.readUTF(); - UUID = in.readUTF(); - dateString = in.readUTF(); - recordType = in.readUTF(); - metadata.clear(); - int numMetaItems = in.readInt(); - for (int i = 0; i < numMetaItems; i++) { - String thisKey = in.readUTF(); - String thisValue = in.readUTF(); - metadata.put(thisKey, thisValue); - } - contentLength = in.readInt(); - } - - @Override - public String toString() { - StringBuilder retBuffer = new StringBuilder(); + @Override + public String toString() { + StringBuilder retBuffer = new StringBuilder(); - retBuffer.append(WARC_VERSION); - retBuffer.append(NEWLINE); + retBuffer.append(WARC_VERSION); + retBuffer.append(NEWLINE); - retBuffer.append("WARC-Type: " + recordType + NEWLINE); - retBuffer.append("WARC-Date: " + dateString + NEWLINE); + retBuffer.append("WARC-Type: " + recordType + NEWLINE); + retBuffer.append("WARC-Date: " + dateString + NEWLINE); - retBuffer.append("WARC-Record-ID: " + UUID + NEWLINE); - Iterator> metadataIterator = metadata.entrySet().iterator(); - while (metadataIterator.hasNext()) { - Entry thisEntry = metadataIterator.next(); - retBuffer.append(thisEntry.getKey()); - retBuffer.append(": "); - retBuffer.append(thisEntry.getValue()); - retBuffer.append(NEWLINE); - } + retBuffer.append("WARC-Record-ID: " + UUID + NEWLINE); + Iterator> metadataIterator = metadata.entrySet().iterator(); + while (metadataIterator.hasNext()) { + Entry thisEntry = metadataIterator.next(); + retBuffer.append(thisEntry.getKey()); + retBuffer.append(": "); + retBuffer.append(thisEntry.getValue()); + retBuffer.append(NEWLINE); + } - retBuffer.append("Content-Type: " + contentType + NEWLINE); - retBuffer.append("Content-Length: " + contentLength + NEWLINE); + retBuffer.append("Content-Type: " + contentType + NEWLINE); + retBuffer.append("Content-Length: " + contentLength + NEWLINE); - return retBuffer.toString(); - } + return retBuffer.toString(); } + } } diff --git a/src/main/java/io/anserini/index/IndexArgs.java b/src/main/java/io/anserini/index/IndexArgs.java index 32fe3cd794..34b78ec790 100644 --- a/src/main/java/io/anserini/index/IndexArgs.java +++ b/src/main/java/io/anserini/index/IndexArgs.java @@ -24,25 +24,25 @@ */ public class IndexArgs { - // required arguments + // required arguments - @Option(name = "-input", metaVar = "[Path]", required = true, usage = "Collection Directory") - String input; + @Option(name = "-input", metaVar = "[Path]", required = true, usage = "Collection Directory") + String input; - @Option(name = "-index", metaVar = "[Path]", required = true, usage = "Lucene index") - String index; + @Option(name = "-index", metaVar = "[Path]", required = true, usage = "Lucene index") + String index; - @Option(name = "-threads", metaVar = "[Number]", required = true, usage = "Number of Threads") - int threads; + @Option(name = "-threads", metaVar = "[Number]", required = true, usage = "Number of Threads") + int threads; - // optional arguments + // optional arguments - @Option(name = "-positions", usage = "Boolean switch to index positions") - boolean positions = false; + @Option(name = "-positions", usage = "Boolean switch to index positions") + boolean positions = false; - @Option(name = "-optimize", usage = "Boolean switch to optimize index (force merge)") - boolean optimize = false; + @Option(name = "-optimize", usage = "Boolean switch to optimize index (force merge)") + boolean optimize = false; - @Option(name = "-doclimit", metaVar = "[Number]", required = false, usage = "Maximum number of *.warc documents to index (-1 to index everything)") - int doclimit = -1; + @Option(name = "-doclimit", metaVar = "[Number]", required = false, usage = "Maximum number of *.warc documents to index (-1 to index everything)") + int doclimit = -1; } \ No newline at end of file diff --git a/src/main/java/io/anserini/index/IndexClueWeb09b.java b/src/main/java/io/anserini/index/IndexClueWeb09b.java index a565cc40c1..80fbd1091a 100644 --- a/src/main/java/io/anserini/index/IndexClueWeb09b.java +++ b/src/main/java/io/anserini/index/IndexClueWeb09b.java @@ -57,244 +57,244 @@ */ public final class IndexClueWeb09b { - private static final Logger LOG = LogManager.getLogger(IndexClueWeb09b.class); + private static final Logger LOG = LogManager.getLogger(IndexClueWeb09b.class); - public static final String FIELD_BODY = "contents"; - public static final String FIELD_ID = "id"; - private static final String RESPONSE = "response"; + public static final String FIELD_BODY = "contents"; + public static final String FIELD_ID = "id"; + private static final String RESPONSE = "response"; - private final class IndexerThread extends Thread { + private final class IndexerThread extends Thread { - final private Path inputWarcFile; + final private Path inputWarcFile; - final private IndexWriter writer; + final private IndexWriter writer; - volatile int addCount; + volatile int addCount; - public IndexerThread(IndexWriter writer, Path inputWarcFile) throws IOException { - this.writer = writer; - this.inputWarcFile = inputWarcFile; - setName(inputWarcFile.getFileName().toString()); - } + public IndexerThread(IndexWriter writer, Path inputWarcFile) throws IOException { + this.writer = writer; + this.inputWarcFile = inputWarcFile; + setName(inputWarcFile.getFileName().toString()); + } - private int indexWarcFile() throws IOException { + private int indexWarcFile() throws IOException { - int i = 0; + int i = 0; - try (DataInputStream inStream = new DataInputStream(new GZIPInputStream(Files.newInputStream(inputWarcFile, StandardOpenOption.READ)))) { + try (DataInputStream inStream = new DataInputStream(new GZIPInputStream(Files.newInputStream(inputWarcFile, StandardOpenOption.READ)))) { - // iterate through our stream - ClueWeb09WarcRecord wDoc; - while ((wDoc = ClueWeb09WarcRecord.readNextWarcRecord(inStream)) != null) { - // see if it's a response record - if (RESPONSE.equals(wDoc.getHeaderRecordType())) { + // iterate through our stream + ClueWeb09WarcRecord wDoc; + while ((wDoc = ClueWeb09WarcRecord.readNextWarcRecord(inStream)) != null) { + // see if it's a response record + if (RESPONSE.equals(wDoc.getHeaderRecordType())) { - String id = wDoc.getDocid(); + String id = wDoc.getDocid(); - org.jsoup.nodes.Document jDoc = Jsoup.parse(wDoc.getContent()); + org.jsoup.nodes.Document jDoc = Jsoup.parse(wDoc.getContent()); - String contents = jDoc.text(); - // don't index empty documents - if (contents.trim().length() == 0) { - System.err.println(id); - continue; - } + String contents = jDoc.text(); + // don't index empty documents + if (contents.trim().length() == 0) { + System.err.println(id); + continue; + } - // make a new, empty document - Document document = new Document(); + // make a new, empty document + Document document = new Document(); - // document ID - document.add(new StringField(FIELD_ID, id, Field.Store.YES)); + // document ID + document.add(new StringField(FIELD_ID, id, Field.Store.YES)); - // entire document - if (positions) - document.add(new TextField(FIELD_BODY, contents, Field.Store.NO)); - else - document.add(new NoPositionsTextField(FIELD_BODY, contents)); + // entire document + if (positions) + document.add(new TextField(FIELD_BODY, contents, Field.Store.NO)); + else + document.add(new NoPositionsTextField(FIELD_BODY, contents)); - writer.addDocument(document); - i++; - } - } - } - return i; + writer.addDocument(document); + i++; + } } + } + return i; + } - @Override - public void run() { - try { - addCount = indexWarcFile(); - System.out.println("*./" + inputWarcFile.getParent().getFileName().toString() + File.separator + inputWarcFile.getFileName().toString() + " " + addCount); - } catch (IOException ioe) { - System.out.println(Thread.currentThread().getName() + ": ERROR: unexpected IOException:"); - ioe.printStackTrace(System.out); - } - } + @Override + public void run() { + try { + addCount = indexWarcFile(); + System.out.println("*./" + inputWarcFile.getParent().getFileName().toString() + File.separator + inputWarcFile.getFileName().toString() + " " + addCount); + } catch (IOException ioe) { + System.out.println(Thread.currentThread().getName() + ": ERROR: unexpected IOException:"); + ioe.printStackTrace(System.out); + } } + } - private final Path indexPath; - private final Path docDir; + private final Path indexPath; + private final Path docDir; - private boolean positions = false; + private boolean positions = false; - public void setPositions(boolean positions) { - this.positions = positions; - } + public void setPositions(boolean positions) { + this.positions = positions; + } - private boolean optimize = false; + private boolean optimize = false; - public void setOptimize(boolean optimize) { - this.optimize = optimize; - } + public void setOptimize(boolean optimize) { + this.optimize = optimize; + } - private int doclimit = -1; + private int doclimit = -1; - public void setDocLimit(int doclimit) { - this.doclimit = doclimit; - } + public void setDocLimit(int doclimit) { + this.doclimit = doclimit; + } - public IndexClueWeb09b(String docsPath, String indexPath) throws IOException { + public IndexClueWeb09b(String docsPath, String indexPath) throws IOException { - this.indexPath = Paths.get(indexPath); - if (!Files.exists(this.indexPath)) - Files.createDirectories(this.indexPath); + this.indexPath = Paths.get(indexPath); + if (!Files.exists(this.indexPath)) + Files.createDirectories(this.indexPath); - docDir = Paths.get(docsPath); - if (!Files.exists(docDir) || !Files.isReadable(docDir) || !Files.isDirectory(docDir)) { - System.out.println("Document directory '" + docDir.toString() + "' does not exist or is not readable, please check the path"); - System.exit(1); - } + docDir = Paths.get(docsPath); + if (!Files.exists(docDir) || !Files.isReadable(docDir) || !Files.isDirectory(docDir)) { + System.out.println("Document directory '" + docDir.toString() + "' does not exist or is not readable, please check the path"); + System.exit(1); } + } - private final static PathMatcher matcher = FileSystems.getDefault().getPathMatcher("glob:*.warc.gz"); + private final static PathMatcher matcher = FileSystems.getDefault().getPathMatcher("glob:*.warc.gz"); - static List discoverWarcFiles(Path p) { + static List discoverWarcFiles(Path p) { - final List warcFiles = new ArrayList<>(); + final List warcFiles = new ArrayList<>(); - FileVisitor fv = new SimpleFileVisitor() { - @Override - public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { + FileVisitor fv = new SimpleFileVisitor() { + @Override + public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { - Path name = file.getFileName(); - if (name != null && matcher.matches(name)) - warcFiles.add(file); - return FileVisitResult.CONTINUE; - } - }; + Path name = file.getFileName(); + if (name != null && matcher.matches(name)) + warcFiles.add(file); + return FileVisitResult.CONTINUE; + } + }; - try { - Files.walkFileTree(p, fv); - } catch (IOException e) { - e.printStackTrace(); - } - return warcFiles; + try { + Files.walkFileTree(p, fv); + } catch (IOException e) { + e.printStackTrace(); } - - /** - * KStemAnalyzer: Filters {@link ClassicTokenizer} with {@link org.apache.lucene.analysis.standard.ClassicFilter}, - * {@link org.apache.lucene.analysis.core.LowerCaseFilter} and {@link org.apache.lucene.analysis.en.KStemFilter}. - * - * @return KStemAnalyzer - * @throws IOException - */ - public static Analyzer analyzer() throws IOException { - return CustomAnalyzer.builder() - .withTokenizer("classic") - .addTokenFilter("classic") - .addTokenFilter("lowercase") - .addTokenFilter("kstem") - .build(); + return warcFiles; + } + + /** + * KStemAnalyzer: Filters {@link ClassicTokenizer} with {@link org.apache.lucene.analysis.standard.ClassicFilter}, + * {@link org.apache.lucene.analysis.core.LowerCaseFilter} and {@link org.apache.lucene.analysis.en.KStemFilter}. + * + * @return KStemAnalyzer + * @throws IOException + */ + public static Analyzer analyzer() throws IOException { + return CustomAnalyzer.builder() + .withTokenizer("classic") + .addTokenFilter("classic") + .addTokenFilter("lowercase") + .addTokenFilter("kstem") + .build(); + } + + public int indexWithThreads(int numThreads) throws IOException, InterruptedException { + + System.out.println("Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'..."); + + final Directory dir = FSDirectory.open(indexPath); + + final IndexWriterConfig iwc = new IndexWriterConfig(analyzer()); + + iwc.setSimilarity(new BM25Similarity()); + iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); + iwc.setRAMBufferSizeMB(256.0); + iwc.setUseCompoundFile(false); + iwc.setMergeScheduler(new ConcurrentMergeScheduler()); + + final IndexWriter writer = new IndexWriter(dir, iwc); + + final ExecutorService executor = Executors.newFixedThreadPool(numThreads); + + List warcFiles = discoverWarcFiles(docDir); + if (doclimit > 0 && warcFiles.size() < doclimit) + warcFiles = warcFiles.subList(0, doclimit); + + for (Path f : warcFiles) + executor.execute(new IndexerThread(writer, f)); + + //add some delay to let some threads spawn by scheduler + Thread.sleep(30000); + executor.shutdown(); // Disable new tasks from being submitted + + try { + // Wait for existing tasks to terminate + while (!executor.awaitTermination(5, TimeUnit.MINUTES)) { + Thread.sleep(1000); + } + } catch (InterruptedException ie) { + // (Re-)Cancel if current thread also interrupted + executor.shutdownNow(); + // Preserve interrupt status + Thread.currentThread().interrupt(); } - public int indexWithThreads(int numThreads) throws IOException, InterruptedException { - - System.out.println("Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'..."); - - final Directory dir = FSDirectory.open(indexPath); + int numIndexed = writer.maxDoc(); - final IndexWriterConfig iwc = new IndexWriterConfig(analyzer()); - - iwc.setSimilarity(new BM25Similarity()); - iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); - iwc.setRAMBufferSizeMB(256.0); - iwc.setUseCompoundFile(false); - iwc.setMergeScheduler(new ConcurrentMergeScheduler()); - - final IndexWriter writer = new IndexWriter(dir, iwc); - - final ExecutorService executor = Executors.newFixedThreadPool(numThreads); - - List warcFiles = discoverWarcFiles(docDir); - if (doclimit > 0 && warcFiles.size() < doclimit) - warcFiles = warcFiles.subList(0, doclimit); + try { + writer.commit(); + if (optimize) + writer.forceMerge(1); + } finally { + writer.close(); + } - for (Path f : warcFiles) - executor.execute(new IndexerThread(writer, f)); + return numIndexed; + } - //add some delay to let some threads spawn by scheduler - Thread.sleep(30000); - executor.shutdown(); // Disable new tasks from being submitted + public static void main(String[] args) throws IOException, InterruptedException { - try { - // Wait for existing tasks to terminate - while (!executor.awaitTermination(5, TimeUnit.MINUTES)) { - Thread.sleep(1000); - } - } catch (InterruptedException ie) { - // (Re-)Cancel if current thread also interrupted - executor.shutdownNow(); - // Preserve interrupt status - Thread.currentThread().interrupt(); - } + IndexArgs indexArgs = new IndexArgs(); - int numIndexed = writer.maxDoc(); + CmdLineParser parser = new CmdLineParser(indexArgs, ParserProperties.defaults().withUsageWidth(90)); - try { - writer.commit(); - if (optimize) - writer.forceMerge(1); - } finally { - writer.close(); - } - - return numIndexed; + try { + parser.parseArgument(args); + } catch (CmdLineException e) { + System.err.println(e.getMessage()); + parser.printUsage(System.err); + System.err.println("Example: IndexClueWeb09b" + parser.printExample(OptionHandlerFilter.REQUIRED)); + return; } - public static void main(String[] args) throws IOException, InterruptedException { - - IndexArgs indexArgs = new IndexArgs(); + final long start = System.nanoTime(); + IndexClueWeb09b indexer = new IndexClueWeb09b(indexArgs.input, indexArgs.index); - CmdLineParser parser = new CmdLineParser(indexArgs, ParserProperties.defaults().withUsageWidth(90)); - - try { - parser.parseArgument(args); - } catch (CmdLineException e) { - System.err.println(e.getMessage()); - parser.printUsage(System.err); - System.err.println("Example: IndexClueWeb09b" + parser.printExample(OptionHandlerFilter.REQUIRED)); - return; - } + indexer.setPositions(indexArgs.positions); + indexer.setOptimize(indexArgs.optimize); + indexer.setDocLimit(indexArgs.doclimit); - final long start = System.nanoTime(); - IndexClueWeb09b indexer = new IndexClueWeb09b(indexArgs.input, indexArgs.index); + LOG.info("Index path: " + indexArgs.index); + LOG.info("Threads: " + indexArgs.threads); + LOG.info("Positions: " + indexArgs.positions); + LOG.info("Optimize (merge segments): " + indexArgs.optimize); + LOG.info("Doc limit: " + (indexArgs.doclimit == -1 ? "all docs" : "" + indexArgs.doclimit)); - indexer.setPositions(indexArgs.positions); - indexer.setOptimize(indexArgs.optimize); - indexer.setDocLimit(indexArgs.doclimit); + LOG.info("Indexer: start"); - LOG.info("Index path: " + indexArgs.index); - LOG.info("Threads: " + indexArgs.threads); - LOG.info("Positions: " + indexArgs.positions); - LOG.info("Optimize (merge segments): " + indexArgs.optimize); - LOG.info("Doc limit: " + (indexArgs.doclimit == -1 ? "all docs" : "" + indexArgs.doclimit)); - - LOG.info("Indexer: start"); - - int numIndexed = indexer.indexWithThreads(indexArgs.threads); - final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS); - LOG.info("Total " + numIndexed + " documents indexed in " + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss")); - } + int numIndexed = indexer.indexWithThreads(indexArgs.threads); + final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS); + LOG.info("Total " + numIndexed + " documents indexed in " + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss")); + } } diff --git a/src/main/java/io/anserini/search/SearchClueWeb09b.java b/src/main/java/io/anserini/search/SearchClueWeb09b.java index a7ec003ac0..03f14fa1f8 100644 --- a/src/main/java/io/anserini/search/SearchClueWeb09b.java +++ b/src/main/java/io/anserini/search/SearchClueWeb09b.java @@ -48,155 +48,155 @@ */ public final class SearchClueWeb09b implements Closeable { - private final IndexReader reader; + private final IndexReader reader; - public SearchClueWeb09b(String indexDir) throws IOException { + public SearchClueWeb09b(String indexDir) throws IOException { - Path indexPath = Paths.get(indexDir); + Path indexPath = Paths.get(indexDir); - if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) { - throw new IllegalArgumentException(indexDir + " does not exist or is not a directory."); - } - - this.reader = DirectoryReader.open(FSDirectory.open(indexPath)); + if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) { + throw new IllegalArgumentException(indexDir + " does not exist or is not a directory."); } - @Override - public void close() throws IOException { - reader.close(); - } + this.reader = DirectoryReader.open(FSDirectory.open(indexPath)); + } - private static String extract(String line, String tag) { + @Override + public void close() throws IOException { + reader.close(); + } - int i = line.indexOf(tag); + private static String extract(String line, String tag) { - if (i == -1) throw new IllegalArgumentException("line does not contain the tag : " + tag); + int i = line.indexOf(tag); - int j = line.indexOf("\"", i + tag.length() + 2); + if (i == -1) throw new IllegalArgumentException("line does not contain the tag : " + tag); - if (j == -1) throw new IllegalArgumentException("line does not contain quotation"); + int j = line.indexOf("\"", i + tag.length() + 2); - return line.substring(i + tag.length() + 2, j); - } + if (j == -1) throw new IllegalArgumentException("line does not contain quotation"); - /** - * @param topicsFile One of: topics.web.1-50.txt topics.web.51-100.txt topics.web.101-150.txt topics.web.151-200.txt - * @return SortedMap where keys are query/topic IDs and values are title portions of the topics - * @throws IOException - */ - static SortedMap readQueries(Path topicsFile) throws IOException { + return line.substring(i + tag.length() + 2, j); + } - SortedMap map = new TreeMap<>(); - List lines = Files.readAllLines(topicsFile, StandardCharsets.UTF_8); + /** + * @param topicsFile One of: topics.web.1-50.txt topics.web.51-100.txt topics.web.101-150.txt topics.web.151-200.txt + * @return SortedMap where keys are query/topic IDs and values are title portions of the topics + * @throws IOException + */ + static SortedMap readQueries(Path topicsFile) throws IOException { - String number = ""; - String query = ""; + SortedMap map = new TreeMap<>(); + List lines = Files.readAllLines(topicsFile, StandardCharsets.UTF_8); - for (String line : lines) { + String number = ""; + String query = ""; - line = line.trim(); + for (String line : lines) { - if (line.startsWith("") && line.endsWith("")) - query = line.substring(7, line.length() - 8).trim(); + if (line.startsWith("")) - map.put(Integer.parseInt(number), query); + if (line.startsWith("") && line.endsWith("")) + query = line.substring(7, line.length() - 8).trim(); - } + if (line.startsWith("")) + map.put(Integer.parseInt(number), query); - lines.clear(); - return map; } - /** - * Prints TREC submission file to the standard output stream. - * - * @param topicsFile One of: topics.web.1-50.txt topics.web.51-100.txt topics.web.101-150.txt topics.web.151-200.txt - * @param operator Default search operator: AND or OR - * @throws IOException - * @throws ParseException - */ - - public void search(String topicsFile, String submissionFile, QueryParser.Operator operator) throws IOException, ParseException { - - Path topicsPath = Paths.get(topicsFile); - - if (!Files.exists(topicsPath) || !Files.isRegularFile(topicsPath) || !Files.isReadable(topicsPath)) { - throw new IllegalArgumentException("Topics file : " + topicsFile + " does not exist or is not a (readable) file."); - } - - IndexSearcher searcher = new IndexSearcher(reader); - searcher.setSimilarity(new BM25Similarity()); - - - final String runTag = "BM25_Krovetz_" + FIELD_BODY + "_" + operator.toString(); - - PrintWriter out = new PrintWriter(Files.newBufferedWriter(Paths.get(submissionFile), StandardCharsets.US_ASCII)); - - - QueryParser queryParser = new QueryParser(FIELD_BODY, analyzer()); - queryParser.setDefaultOperator(operator); - - - SortedMap topics = readQueries(topicsPath); - - for (Map.Entry entry : topics.entrySet()) { - - int qID = entry.getKey(); - String queryString = entry.getValue(); - Query query = queryParser.parse(queryString); - - /** - * For Web Tracks 2010,2011,and 2012; an experimental run consists of the top 10,000 documents for each topic query. - */ - ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs; - - /** - * the first column is the topic number. - * the second column is currently unused and should always be "Q0". - * the third column is the official document identifier of the retrieved document. - * the fourth column is the rank the document is retrieved. - * the fifth column shows the score (integer or floating point) that generated the ranking. - * the sixth column is called the "run tag" and should be a unique identifier for your - */ - for (int i = 0; i < hits.length; i++) { - int docId = hits[i].doc; - Document doc = searcher.doc(docId); - out.print(qID); - out.print("\tQ0\t"); - out.print(doc.get(FIELD_ID)); - out.print("\t"); - out.print(i); - out.print("\t"); - out.print(hits[i].score); - out.print("\t"); - out.print(runTag); - out.println(); - } - } - out.flush(); - out.close(); + lines.clear(); + return map; + } + + /** + * Prints TREC submission file to the standard output stream. + * + * @param topicsFile One of: topics.web.1-50.txt topics.web.51-100.txt topics.web.101-150.txt topics.web.151-200.txt + * @param operator Default search operator: AND or OR + * @throws IOException + * @throws ParseException + */ + + public void search(String topicsFile, String submissionFile, QueryParser.Operator operator) throws IOException, ParseException { + + Path topicsPath = Paths.get(topicsFile); + + if (!Files.exists(topicsPath) || !Files.isRegularFile(topicsPath) || !Files.isReadable(topicsPath)) { + throw new IllegalArgumentException("Topics file : " + topicsFile + " does not exist or is not a (readable) file."); } - public static void main(String[] args) throws IOException, ParseException { + IndexSearcher searcher = new IndexSearcher(reader); + searcher.setSimilarity(new BM25Similarity()); + + + final String runTag = "BM25_Krovetz_" + FIELD_BODY + "_" + operator.toString(); - if (args.length != 3) { - System.err.println("Usage: SearcherCW09B "); - System.err.println("topicsFile: input file containing queries. One of: topics.web.1-50.txt topics.web.51-100.txt topics.web.101-150.txt topics.web.151-200.txt"); - System.err.println("submissionFile: redirect stdout to capture the submission file for trec_eval or gdeval.pl"); - System.err.println("indexDir: index directory"); - System.exit(1); - } + PrintWriter out = new PrintWriter(Files.newBufferedWriter(Paths.get(submissionFile), StandardCharsets.US_ASCII)); - String topicsFile = args[0]; - String submissionFile = args[1]; - String indexDir = args[2]; - SearchClueWeb09b searcher = new SearchClueWeb09b(indexDir); - searcher.search(topicsFile, submissionFile, QueryParser.Operator.OR); - searcher.close(); + QueryParser queryParser = new QueryParser(FIELD_BODY, analyzer()); + queryParser.setDefaultOperator(operator); + + + SortedMap topics = readQueries(topicsPath); + + for (Map.Entry entry : topics.entrySet()) { + + int qID = entry.getKey(); + String queryString = entry.getValue(); + Query query = queryParser.parse(queryString); + + /** + * For Web Tracks 2010,2011,and 2012; an experimental run consists of the top 10,000 documents for each topic query. + */ + ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs; + + /** + * the first column is the topic number. + * the second column is currently unused and should always be "Q0". + * the third column is the official document identifier of the retrieved document. + * the fourth column is the rank the document is retrieved. + * the fifth column shows the score (integer or floating point) that generated the ranking. + * the sixth column is called the "run tag" and should be a unique identifier for your + */ + for (int i = 0; i < hits.length; i++) { + int docId = hits[i].doc; + Document doc = searcher.doc(docId); + out.print(qID); + out.print("\tQ0\t"); + out.print(doc.get(FIELD_ID)); + out.print("\t"); + out.print(i); + out.print("\t"); + out.print(hits[i].score); + out.print("\t"); + out.print(runTag); + out.println(); + } + } + out.flush(); + out.close(); + } + + public static void main(String[] args) throws IOException, ParseException { + + if (args.length != 3) { + System.err.println("Usage: SearcherCW09B "); + System.err.println("topicsFile: input file containing queries. One of: topics.web.1-50.txt topics.web.51-100.txt topics.web.101-150.txt topics.web.151-200.txt"); + System.err.println("submissionFile: redirect stdout to capture the submission file for trec_eval or gdeval.pl"); + System.err.println("indexDir: index directory"); + System.exit(1); } + + String topicsFile = args[0]; + String submissionFile = args[1]; + String indexDir = args[2]; + + SearchClueWeb09b searcher = new SearchClueWeb09b(indexDir); + searcher.search(topicsFile, submissionFile, QueryParser.Operator.OR); + searcher.close(); + } }