Skip to content

Commit

Permalink
JAMES-2019 Remove title tag out of JsoupTextExtract when text/html
Browse files (browse the repository at this point in the history)
  • Loading branch information
quynhn authored and chibenwa committed May 15, 2017
1 parent 58178c5 commit a56cdba
Show file tree
Hide file tree
Showing 3 changed files with 1,182 additions and 3 deletions.
Expand Up @@ -26,23 +26,28 @@
import org.apache.commons.io.IOUtils;
import org.apache.james.mailbox.extractor.ParsedContent;
import org.apache.james.mailbox.extractor.TextExtractor;

import com.google.common.base.Charsets;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.google.common.collect.Maps;


public class JsoupTextExtractor implements TextExtractor {
private static final String TITLE_HTML_TAG = "title";

@Override
public ParsedContent extractContent(InputStream inputStream, String contentType, String fileName) throws Exception {
Map<String, List<String>> emptyMetadata = Maps.newHashMap();
if (contentType != null) {
if (contentType.equals("text/plain")) {
return new ParsedContent(IOUtils.toString(inputStream), emptyMetadata);
return new ParsedContent(IOUtils.toString(inputStream, Charsets.UTF_8), emptyMetadata);
}
if (contentType.equals("text/html")) {
String text = Jsoup.parse(IOUtils.toString(inputStream)).text();
return new ParsedContent(text, emptyMetadata);
Document doc = Jsoup.parse(IOUtils.toString(inputStream, Charsets.UTF_8));
doc.select(TITLE_HTML_TAG).remove();
return new ParsedContent(doc.text(), emptyMetadata);
}
}
return new ParsedContent(null, emptyMetadata);
Expand Down
@@ -0,0 +1,46 @@
/****************************************************************
* Licensed to the Apache Software Foundation (ASF) under one *
* or more contributor license agreements. See the NOTICE file *
* distributed with this work for additional information *
* regarding copyright ownership. The ASF licenses this file *
* to you under the Apache License, Version 2.0 (the *
* "License"); you may not use this file except in compliance *
* with the License. You may obtain a copy of the License at *
* *
* http://www.apache.org/licenses/LICENSE-2.0 *
* *
* Unless required by applicable law or agreed to in writing, *
* software distributed under the License is distributed on an *
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
* KIND, either express or implied. See the License for the *
* specific language governing permissions and limitations *
* under the License. *
****************************************************************/

package org.apache.james.mailbox.inmemory;

import static org.assertj.core.api.Assertions.assertThat;
import org.apache.james.mailbox.extractor.TextExtractor;

import java.io.InputStream;

import org.junit.Before;
import org.junit.Test;

/**
 * Unit test for {@link JsoupTextExtractor}, exercised through the
 * {@link TextExtractor} interface.
 */
public class JsoupTextExtractorTest {
    /** Classpath location of the HTML fixture fed to the extractor. */
    private static final String HTML_FIXTURE = "documents/html.txt";
    /** Content type under which the fixture is submitted. */
    private static final String HTML_CONTENT_TYPE = "text/html";
    /**
     * Text that must be absent from the extracted content.
     * NOTE(review): presumably the content of the fixture's title element,
     * going by the test name; the fixture is not visible here, so confirm.
     */
    private static final String TITLE_CONTENT = "*|MC:SUBJECT|*";

    private TextExtractor textExtractor;

    /** Creates a fresh extractor before each test. */
    @Before
    public void setUp() {
        textExtractor = new JsoupTextExtractor();
    }

    /**
     * Extracts the HTML fixture as {@code text/html}, with no file name, and
     * checks that the resulting textual content does not contain the title marker.
     */
    @Test
    public void extractedTextFromHtmlShouldNotContainTheContentOfTitleTag() throws Exception {
        InputStream htmlStream = ClassLoader.getSystemResourceAsStream(HTML_FIXTURE);

        assertThat(textExtractor.extractContent(htmlStream, HTML_CONTENT_TYPE, null).getTextualContent())
            .doesNotContain(TITLE_CONTENT);
    }

}

0 comments on commit a56cdba

Please sign in to comment.