wicket-core/src/main/java/org/apache/wicket/markup/parser/XmlPullParser.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.wicket.markup.parser;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.text.ParseException;

import org.apache.wicket.markup.parser.XmlTag.TagType;
import org.apache.wicket.markup.parser.XmlTag.TextSegment;
import org.apache.wicket.util.io.FullyBufferedReader;
import org.apache.wicket.util.io.IOUtils;
import org.apache.wicket.util.io.XmlReader;
import org.apache.wicket.util.lang.Args;
import org.apache.wicket.util.parse.metapattern.parsers.TagNameParser;
import org.apache.wicket.util.parse.metapattern.parsers.VariableAssignmentParser;
import org.apache.wicket.util.string.Strings;

/**
 * A fairly shallow markup pull parser which parses a markup string of a given type of markup (for
 * example, html, xml, vxml or wml) into ComponentTag and RawMarkup tokens.
 * 
 * @author Jonathan Locke
 * @author Juergen Donnerstag
 */
public final class XmlPullParser implements IXmlPullParser
{
	/** */
	public static final String STYLE = "style";

	/** */
	public static final String SCRIPT = "script";

	/**
	 * The encoding of the XML.
	 */
	private String encoding;

	/**
	 * A XML independent reader which loads the whole source data into memory and which provides
	 * convenience methods to access the data.
	 */
	private FullyBufferedReader input;

	/** temporary variable which will hold the name of the closing tag. */
	private String skipUntilText;

	/** The last substring selected from the input */
	private CharSequence lastText;

	/** Everything in between &lt;!DOCTYPE ... &gt; */
	private CharSequence doctype;

	/** The type of what is in lastText */
	private HttpTagType lastType = HttpTagType.NOT_INITIALIZED;

	/** The last tag found */
	private XmlTag lastTag;

	/**
	 * Construct.
	 */
	public XmlPullParser()
	{
	}

	@Override
	public final String getEncoding()
	{
		return encoding;
	}

	@Override
	public final CharSequence getDoctype()
	{
		return doctype;
	}

	@Override
	public final CharSequence getInputFromPositionMarker(final int toPos)
	{
		return input.getSubstring(toPos);
	}

	@Override
	public final CharSequence getInput(final int fromPos, final int toPos)
	{
		return input.getSubstring(fromPos, toPos);
	}

	/**
	 * Whatever will be in between the current index and the closing tag, will be ignored (and thus
	 * treated as raw markup (text). This is useful for tags like 'script'.
	 * 
	 * @throws ParseException
	 */
	private void skipUntil() throws ParseException
	{
		// this is a tag with non-XHTML text as body - skip this until the
		// skipUntilText is found.
		final int startIndex = input.getPosition();
		final int tagNameLen = skipUntilText.length();

		int pos = input.getPosition() - 1;
		String endTagText = null;
		int lastPos = 0;
		while (!skipUntilText.equalsIgnoreCase(endTagText))
		{
			pos = input.find("</", pos + 1);
			if ((pos == -1) || ((pos + (tagNameLen + 2)) >= input.size()))
			{
				throw new ParseException(
					skipUntilText + " tag not closed" + getLineAndColumnText(), startIndex);
			}

			lastPos = pos + 2;
			endTagText = input.getSubstring(lastPos, lastPos + tagNameLen).toString();
		}

		input.setPosition(pos);
		lastText = input.getSubstring(startIndex, pos);
		lastType = HttpTagType.BODY;

		// Check that the tag is properly closed
		lastPos = input.find('>', lastPos + tagNameLen);
		if (lastPos == -1)
		{
			throw new ParseException(skipUntilText + " tag not closed" + getLineAndColumnText(),
				startIndex);
		}

		// Reset the state variable
		skipUntilText = null;
	}

	/**
	 * 
	 * @return line and column number
	 */
	private String getLineAndColumnText()
	{
		return " (line " + input.getLineNumber() + ", column " + input.getColumnNumber() + ")";
	}

	/**
	 * @return XXX
	 * @throws ParseException
	 */
	@Override
	public final HttpTagType next() throws ParseException
	{
		// Reached end of markup file?
		if (input.getPosition() >= input.size())
		{
			return HttpTagType.NOT_INITIALIZED;
		}

		if (skipUntilText != null)
		{
			skipUntil();
			return lastType;
		}

		// Any more tags in the markup?
		final int openBracketIndex = input.find('<');

		// Tag or Body?
		if (input.charAt(input.getPosition()) != '<')
		{
			// It's a BODY
			if (openBracketIndex == -1)
			{
				// There is no next matching tag.
				lastText = input.getSubstring(-1);
				input.setPosition(input.size());
				lastType = HttpTagType.BODY;
				return lastType;
			}

			lastText = input.getSubstring(openBracketIndex);
			input.setPosition(openBracketIndex);
			lastType = HttpTagType.BODY;
			return lastType;
		}

		// Determine the line number
		input.countLinesTo(openBracketIndex);

		// Get index of closing tag and advance past the tag
		int closeBracketIndex = -1;

		if (openBracketIndex != -1 && openBracketIndex < input.size() - 1)
		{
			char nextChar = input.charAt(openBracketIndex + 1);

			if ((nextChar == '!') || (nextChar == '?'))
				closeBracketIndex = input.find('>', openBracketIndex);
			else
				closeBracketIndex = input.findOutOfQuotes('>', openBracketIndex);
		}

		if (closeBracketIndex == -1)
		{
			throw new ParseException("No matching close bracket at" + getLineAndColumnText(),
				input.getPosition());
		}

		// Get the complete tag text
		lastText = input.getSubstring(openBracketIndex, closeBracketIndex + 1);

		// Get the tagtext between open and close brackets
		String tagText = lastText.subSequence(1, lastText.length() - 1).toString();
		if (tagText.length() == 0)
		{
			throw new ParseException("Found empty tag: '<>' at" + getLineAndColumnText(),
				input.getPosition());
		}

		// Type of the tag, to be determined next
		final TagType type;

		// If the tag ends in '/', it's a "simple" tag like <foo/>
		if (tagText.endsWith("/"))
		{
			type = TagType.OPEN_CLOSE;
			tagText = tagText.substring(0, tagText.length() - 1);
		}
		else if (tagText.startsWith("/"))
		{
			// The tag text starts with a '/', it's a simple close tag
			type = TagType.CLOSE;
			tagText = tagText.substring(1);
		}
		else
		{
			// It must be an open tag
			type = TagType.OPEN;

			// If open tag and starts with "s" like "script" or "style", than ...
			if ((tagText.length() > STYLE.length()) &&
				((tagText.charAt(0) == 's') || (tagText.charAt(0) == 'S')))
			{
				final String lowerCase = tagText.toLowerCase();
				if (lowerCase.startsWith(SCRIPT))
				{
					String typeAttr = "type=";
					int idxOfType = lowerCase.indexOf(typeAttr);
					if (idxOfType > 0)
					{
						// +1 to remove the ' or "
						String typePrefix = lowerCase.substring(idxOfType + typeAttr.length() + 1);
						if (typePrefix.startsWith("text/javascript"))
						{
							// prepare to skip everything between the open and close tag
							skipUntilText = SCRIPT;
						}
						// any other type is assumed to be a template so it can contain child nodes.
						// See WICKET-5288
					}
					else
					{
						// no type attribute so it is 'text/javascript'
						// prepare to skip everything between the open and close tag
						skipUntilText = SCRIPT;
					}
				}
				else if (lowerCase.startsWith(STYLE))
				{
					// prepare to skip everything between the open and close tag
					skipUntilText = STYLE;
				}
			}
		}

		// Handle special tags like <!-- and <![CDATA ...
		final char firstChar = tagText.charAt(0);
		if ((firstChar == '!') || (firstChar == '?'))
		{
			specialTagHandling(tagText, openBracketIndex, closeBracketIndex);

			input.countLinesTo(openBracketIndex);
			TextSegment text = new TextSegment(lastText, openBracketIndex, input.getLineNumber(),
				input.getColumnNumber());
			lastTag = new XmlTag(text, type);

			return lastType;
		}

		TextSegment text = new TextSegment(lastText, openBracketIndex, input.getLineNumber(),
			input.getColumnNumber());
		XmlTag tag = new XmlTag(text, type);
		lastTag = tag;

		// Parse the tag text and populate tag attributes
		if (parseTagText(tag, tagText))
		{
			// Move to position after the tag
			input.setPosition(closeBracketIndex + 1);
			lastType = HttpTagType.TAG;
			return lastType;
		}
		else
		{
			throw new ParseException("Malformed tag" + getLineAndColumnText(), openBracketIndex);
		}
	}

	/**
	 * Handle special tags like <!-- --> or <![CDATA[..]]> or <?xml>
	 * 
	 * @param tagText
	 * @param openBracketIndex
	 * @param closeBracketIndex
	 * @throws ParseException
	 */
	protected void specialTagHandling(String tagText, final int openBracketIndex,
		int closeBracketIndex) throws ParseException
	{
		// Handle comments
		if (tagText.startsWith("!--"))
		{
			// downlevel-revealed conditional comments e.g.: <!--[if (gt IE9)|!(IE)]><!-->
			if (tagText.contains("![endif]--"))
			{
				lastType = HttpTagType.CONDITIONAL_COMMENT_ENDIF;

				// Move to position after the tag
				input.setPosition(closeBracketIndex + 1);
				return;
			}

			// Conditional comment? E.g.
			// "<!--[if IE]><a href='test.html'>my link</a><![endif]-->"
			if (tagText.startsWith("!--[if ") && tagText.endsWith("]"))
			{
				int pos = input.find("]-->", openBracketIndex + 1);
				if (pos == -1)
				{
					throw new ParseException("Unclosed conditional comment beginning at" +
						getLineAndColumnText(), openBracketIndex);
				}

				pos += 4;
				lastText = input.getSubstring(openBracketIndex, pos);

				// Actually it is no longer a comment. It is now
				// up to the browser to select the section appropriate.
				input.setPosition(closeBracketIndex + 1);
				lastType = HttpTagType.CONDITIONAL_COMMENT;
			}
			else
			{
				// Normal comment section.
				// Skip ahead to "-->". Note that you can not simply test for
				// tagText.endsWith("--") as the comment might contain a '>'
				// inside.
				int pos = input.find("-->", openBracketIndex + 1);
				if (pos == -1)
				{
					throw new ParseException("Unclosed comment beginning at" +
						getLineAndColumnText(), openBracketIndex);
				}

				pos += 3;
				lastText = input.getSubstring(openBracketIndex, pos);
				lastType = HttpTagType.COMMENT;
				input.setPosition(pos);
			}
			return;
		}

		// The closing tag of a conditional comment, e.g.
		// "<!--[if IE]><a href='test.html'>my link</a><![endif]-->
		// and also <!--<![endif]-->"
		if (tagText.equals("![endif]--"))
		{
			lastType = HttpTagType.CONDITIONAL_COMMENT_ENDIF;
			input.setPosition(closeBracketIndex + 1);
			return;
		}

		// CDATA sections might contain "<" which is not part of an XML tag.
		// Make sure escaped "<" are treated right
		if (tagText.startsWith("!["))
		{
			final String startText = (tagText.length() <= 8 ? tagText : tagText.substring(0, 8));
			if (startText.toUpperCase().equals("![CDATA["))
			{
				int pos1 = openBracketIndex;
				do
				{
					// Get index of closing tag and advance past the tag
					closeBracketIndex = findChar('>', pos1);

					if (closeBracketIndex == -1)
					{
						throw new ParseException("No matching close bracket at" +
							getLineAndColumnText(), input.getPosition());
					}

					// Get the tagtext between open and close brackets
					tagText = input.getSubstring(openBracketIndex + 1, closeBracketIndex)
						.toString();

					pos1 = closeBracketIndex + 1;
				}
				while (tagText.endsWith("]]") == false);

				// Move to position after the tag
				input.setPosition(closeBracketIndex + 1);

				lastText = tagText;
				lastType = HttpTagType.CDATA;
				return;
			}
		}

		if (tagText.charAt(0) == '?')
		{
			lastType = HttpTagType.PROCESSING_INSTRUCTION;

			// Move to position after the tag
			input.setPosition(closeBracketIndex + 1);
			return;
		}

		if (tagText.startsWith("!DOCTYPE"))
		{
			lastType = HttpTagType.DOCTYPE;

			// Get the tagtext between open and close brackets
			doctype = input.getSubstring(openBracketIndex + 1, closeBracketIndex);

			// Move to position after the tag
			input.setPosition(closeBracketIndex + 1);
			return;
		}

		// Move to position after the tag
		lastType = HttpTagType.SPECIAL_TAG;
		input.setPosition(closeBracketIndex + 1);
	}

	/**
	 * @return MarkupElement
	 */
	@Override
	public final XmlTag getElement()
	{
		return lastTag;
	}

	/**
	 * @return The xml string from the last element
	 */
	@Override
	public final CharSequence getString()
	{
		return lastText;
	}

	/**
	 * @return The next XML tag
	 * @throws ParseException
	 */
	public final XmlTag nextTag() throws ParseException
	{
		while (next() != HttpTagType.NOT_INITIALIZED)
		{
			switch (lastType)
			{
				case TAG :
					return lastTag;

				case BODY :
					break;

				case COMMENT :
					break;

				case CONDITIONAL_COMMENT :
					break;

				case CDATA :
					break;

				case PROCESSING_INSTRUCTION :
					break;

				case SPECIAL_TAG :
					break;
			}
		}

		return null;
	}

	/**
	 * Find the char but ignore any text within ".." and '..'
	 * 
	 * @param ch
	 *            The character to search
	 * @param startIndex
	 *            Start index
	 * @return -1 if not found, else the index
	 */
	private int findChar(final char ch, int startIndex)
	{
		char quote = 0;

		for (; startIndex < input.size(); startIndex++)
		{
			final char charAt = input.charAt(startIndex);
			if (quote != 0)
			{
				if (quote == charAt)
				{
					quote = 0;
				}
			}
			else if ((charAt == '"') || (charAt == '\''))
			{
				quote = charAt;
			}
			else if (charAt == ch)
			{
				return startIndex;
			}
		}

		return -1;
	}

	/**
	 * Parse the given string.
	 * <p>
	 * Note: xml character encoding is NOT applied. It is assumed the input provided does have the
	 * correct encoding already.
	 * 
	 * @param string
	 *            The input string
	 * @throws IOException
	 *             Error while reading the resource
	 */
	@Override
	public void parse(final CharSequence string) throws IOException
	{
		Args.notNull(string, "string");

		this.input = new FullyBufferedReader(new StringReader(string.toString()));
		this.encoding = null;
	}

	/**
	 * Reads and parses markup from an input stream, using UTF-8 encoding by default when not
	 * specified in XML declaration.
	 * 
	 * @param in
	 *            The input stream to read and parse
	 * @throws IOException
	 * 
	 * @see {@link #parse(InputStream, String)}
	 */
	@Override
	public void parse(final InputStream in) throws IOException
	{
		// When XML declaration does not specify encoding, it defaults to UTF-8
		parse(in, "UTF-8");
	}

	/**
	 * Reads and parses markup from an input stream.
	 * <p>
	 * Note: The input is closed after parsing.
	 * 
	 * @param inputStream
	 *            The input stream to read and parse
	 * @param encoding
	 *            The default character encoding of the input
	 * @throws IOException
	 */
	@Override
	public void parse(final InputStream inputStream, final String encoding) throws IOException
	{
		Args.notNull(inputStream, "inputStream");

		try
		{
			XmlReader xmlReader = new XmlReader(new BufferedInputStream(inputStream, 4000),
				encoding);
			this.input = new FullyBufferedReader(xmlReader);
			this.encoding = xmlReader.getEncoding();
		}
		finally
		{
			IOUtils.closeQuietly(inputStream);
		}
	}

	@Override
	public final void setPositionMarker()
	{
		input.setPositionMarker(input.getPosition());
	}

	@Override
	public final void setPositionMarker(final int pos)
	{
		input.setPositionMarker(pos);
	}

	@Override
	public String toString()
	{
		return input.toString();
	}

	/**
	 * Parses the text between tags. For example, "a href=foo.html".
	 * 
	 * @param tag
	 * @param tagText
	 *            The text between tags
	 * @return false in case of an error
	 * @throws ParseException
	 */
	private boolean parseTagText(final XmlTag tag, final String tagText) throws ParseException
	{
		// Get the length of the tagtext
		final int tagTextLength = tagText.length();

		// If we match tagname pattern
		final TagNameParser tagnameParser = new TagNameParser(tagText);
		if (tagnameParser.matcher().lookingAt())
		{
			// Extract the tag from the pattern matcher
			tag.name = tagnameParser.getName();
			tag.namespace = tagnameParser.getNamespace();

			// Are we at the end? Then there are no attributes, so we just
			// return the tag
			int pos = tagnameParser.matcher().end(0);
			if (pos == tagTextLength)
			{
				return true;
			}

			// Extract attributes
			final VariableAssignmentParser attributeParser = new VariableAssignmentParser(tagText);
			while (attributeParser.matcher().find(pos))
			{
				// Get key and value using attribute pattern
				String value = attributeParser.getValue();

				// In case like <html xmlns:wicket> will the value be null
				if (value == null)
				{
					value = "";
				}

				// Set new position to end of attribute
				pos = attributeParser.matcher().end(0);

				// Chop off double quotes or single quotes
				if (value.startsWith("\"") || value.startsWith("\'"))
				{
					value = value.substring(1, value.length() - 1);
				}

				// Trim trailing whitespace
				value = value.trim();

				// Unescape
				value = Strings.unescapeMarkup(value).toString();

				// Get key
				final String key = attributeParser.getKey();

				// Put the attribute in the attributes hash
				if (null != tag.getAttributes().put(key, value))
				{
					throw new ParseException("Same attribute found twice: " + key +
						getLineAndColumnText(), input.getPosition());
				}

				// The input has to match exactly (no left over junk after
				// attributes)
				if (pos == tagTextLength)
				{
					return true;
				}
			}

			return true;
		}

		return false;
	}
}