src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.poi.hwpf.extractor;

import java.io.IOException;
import java.io.InputStream;
import java.io.FileInputStream;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;

import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.usermodel.HeaderStories;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;

/**
 * Class to extract the text from a Word Document.
 * 
 * You should use either getParagraphText() or getText() unless
 *  you have a strong reason otherwise.
 *
 * @author Nick Burch (nick at torchbox dot com)
 */
public class WordExtractor extends POIOLE2TextExtractor {
	/**
	 * The filesystem the document was loaded from. Only set by the
	 *  POIFSFileSystem constructor (and the InputStream constructor,
	 *  which delegates to it).
	 */
	private POIFSFileSystem fs;
	/** The document that text is extracted from */
	private HWPFDocument doc;
	
	/**
	 * Create a new Word Extractor
	 * @param is InputStream containing the word file
	 * @throws IOException if the stream cannot be read as a word file
	 */
	public WordExtractor(InputStream is) throws IOException {
		this( HWPFDocument.verifyAndBuildPOIFS(is) );
	}

	/**
	 * Create a new Word Extractor
	 * @param fs POIFSFileSystem containing the word file
	 * @throws IOException if the document cannot be built from the filesystem
	 */
	public WordExtractor(POIFSFileSystem fs) throws IOException {
		this(new HWPFDocument(fs));
		this.fs = fs;
	}
	
	/**
	 * Create a new Word Extractor
	 * @param dir DirectoryNode containing the word file
	 * @throws IOException if the document cannot be built from the directory
	 */
	public WordExtractor(DirectoryNode dir) throws IOException {
		this(new HWPFDocument(dir));
		// No POIFSFileSystem is supplied on this path, so the fs field
		//  is deliberately left unset (it used to be assigned to itself)
	}
	
	/**
	 * Create a new Word Extractor
	 * @param doc The HWPFDocument to extract from
	 * @throws IOException declared for compatibility with the other constructors
	 */
	public WordExtractor(HWPFDocument doc) throws IOException {
		super(doc);
		this.doc = doc;
	}

	/**
	 * Command line extractor, so people will stop moaning that
	 *  they can't just run this.
	 * Prints the extracted text of the file named by the first
	 *  argument to standard out. With no arguments, prints a usage
	 *  message to standard error and exits with status 1.
	 *
	 * @param args command line arguments; only the first one is used
	 * @throws IOException if the file cannot be opened or read
	 */
	public static void main(String[] args) throws IOException {
		if(args.length == 0) {
			System.err.println("Use:");
			System.err.println("   java org.apache.poi.hwpf.extractor.WordExtractor <filename>");
			System.exit(1);
		}

		// Process the first argument as a file
		FileInputStream fin = new FileInputStream(args[0]);
		try {
			WordExtractor extractor = new WordExtractor(fin);
			System.out.println(extractor.getText());
		} finally {
			// Always release the file handle, even if extraction fails
			fin.close();
		}
	}
	
	/**
	 * Get the text from the word file, as an array with one String
	 *  per paragraph.
	 * Any paragraph ending in "\r" has a "\n" added after it. If
	 *  building the paragraphs fails for any reason, a single element
	 *  array holding the result of getTextFromPieces() is returned
	 *  instead.
	 *
	 * @return the paragraph texts, or a one element fallback array
	 */
	public String[] getParagraphText() {
		String[] ret;
		
		// Extract using the model code
		try {
			Range r = doc.getRange();

			ret = new String[r.numParagraphs()];
			for(int i=0; i<ret.length; i++) {
				Paragraph p = r.getParagraph(i);
				ret[i] = p.text();
				
				// Fix the line ending
				if(ret[i].endsWith("\r")) {
					ret[i] = ret[i] + "\n";
				}
			}
		} catch(Exception e) {
			// Something's up with turning the text pieces into paragraphs
			// Fall back to ripping out the text pieces
			// (the broad catch is a deliberate best-effort fallback)
			ret = new String[1];
			ret[0] = getTextFromPieces();
		}
		
		return ret;
	}
	
	/**
	 * Add the header/footer text, if it's not empty.
	 * Every '\r' is turned into '\n', and the appended text always
	 *  ends with a newline: one is added if missing, and one is
	 *  dropped if the text ends with two.
	 *
	 * @param text the header or footer text, may be null or empty
	 * @param out the buffer to append to
	 */
	private void appendHeaderFooter(String text, StringBuffer out) {
		if(text == null || text.length() == 0)
			return;

		text = text.replace('\r', '\n');
		if(! text.endsWith("\n")) {
			// No trailing newline, so supply one
			out.append(text);
			out.append('\n');
			return;
		}
		if(text.endsWith("\n\n")) {
			// Drop one of the two trailing newlines
			out.append(text.substring(0, text.length()-1));
			return;
		}
		out.append(text);
	}

	/**
	 * Grab the text from the headers, in the order
	 *  first, even, odd. Missing or empty headers are skipped.
	 *
	 * @return the combined header text, empty if there is none
	 */
	public String getHeaderText() {
		HeaderStories hs = new HeaderStories(doc);
		
		// appendHeaderFooter ignores null and empty text itself
		StringBuffer ret = new StringBuffer();
		appendHeaderFooter(hs.getFirstHeader(), ret);
		appendHeaderFooter(hs.getEvenHeader(), ret);
		appendHeaderFooter(hs.getOddHeader(), ret);
		
		return ret.toString();
	}

	/**
	 * Grab the text from the footers, in the order
	 *  first, even, odd. Missing or empty footers are skipped.
	 *
	 * @return the combined footer text, empty if there is none
	 */
	public String getFooterText() {
		HeaderStories hs = new HeaderStories(doc);
		
		// appendHeaderFooter ignores null and empty text itself
		StringBuffer ret = new StringBuffer();
		appendHeaderFooter(hs.getFirstFooter(), ret);
		appendHeaderFooter(hs.getEvenFooter(), ret);
		appendHeaderFooter(hs.getOddFooter(), ret);
		
		return ret.toString();
	}
	
	/**
	 * Grab the text out of the text pieces. Might also include various
	 *  bits of crud, but will work in cases where the text piece -> paragraph
	 *  mapping is broken. Fast too.
	 * Unicode pieces are decoded as UTF-16LE, all others as Cp1252.
	 *
	 * @return the text of all the text pieces, joined together
	 */
	public String getTextFromPieces() {
		StringBuffer textBuf = new StringBuffer();
		
		Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
		while (textPieces.hasNext()) {
			TextPiece piece = (TextPiece) textPieces.next();

			String encoding = "Cp1252";
			if (piece.isUnicode()) {
				encoding = "UTF-16LE";
			}
			try {
				String text = new String(piece.getRawBytes(), encoding);
				textBuf.append(text);
			} catch(UnsupportedEncodingException e) {
				// Keep the original exception as the cause, so it isn't lost
				InternalError err = new InternalError("Standard Encoding " + encoding + " not found, JVM broken");
				err.initCause(e);
				throw err;
			}
		}
		
		String text = textBuf.toString();
		
		// Fix line endings (Note - won't get all of them)
		// Only runs of two or three \r are expanded to \r\n pairs
		text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n");
		text = text.replaceAll("\r\r", "\r\n\r\n");
		
		if(text.endsWith("\r")) {
			text += "\n";
		}
		
		return text;
	}
	
	/**
	 * Grab the text, based on the paragraphs. Shouldn't include any crud,
	 *  but slightly slower than getTextFromPieces().
	 * The result is the header text, then every paragraph, then the
	 *  footer text.
	 *
	 * @return the header, body and footer text of the document
	 */
	public String getText() {
		StringBuffer ret = new StringBuffer();
		
		ret.append(getHeaderText());
		
		String[] text = getParagraphText();
		for(int i=0; i<text.length; i++) {
			ret.append(text[i]);
		}
		
		ret.append(getFooterText());
		
		return ret.toString();
	}
	
	/**
	 * Removes any fields (eg macros, page markers etc)
	 *  from the string. Delegates to Range.stripFields.
	 *
	 * @param text the text to remove the fields from
	 * @return the text as returned by Range.stripFields
	 */
	public static String stripFields(String text) {
		return Range.stripFields(text);
	}
}