Skip to content

Commit

Permalink
feat: move HTML2x from ui to importer
Browse files Browse the repository at this point in the history
  • Loading branch information
kptdobe committed May 23, 2022
1 parent ac2e569 commit 729f842
Show file tree
Hide file tree
Showing 3 changed files with 295 additions and 0 deletions.
159 changes: 159 additions & 0 deletions src/importer/HTML2x.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
/*
* Copyright 2022 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
/* eslint-disable class-methods-use-this, no-console */

import path from 'path';
import { Response } from 'node-fetch';
import { JSDOM } from 'jsdom';
import PageImporter from './PageImporter.js';
import PageImporterResource from './PageImporterResource.js';
import MemoryHandler from '../storage/MemoryHandler.js';

// import docxStylesXML from '../resources/styles.xml';

/**
 * Writes the computed CSS `background-image` of container elements into their
 * inline style, so the value is still present once the DOM gets serialized.
 *
 * The document is updated in place. Nothing is written when the document
 * exposes no `defaultView.getComputedStyle`.
 *
 * @param {Document} document DOM document to preprocess
 */
function preprocessDOM(document) {
  const containers = document.querySelectorAll('body, header, footer, div, span, section, main');
  const computeStyle = document.defaultView?.getComputedStyle;
  if (!computeStyle) {
    return;
  }

  containers.forEach((node) => {
    // css background images will be lost -> write them in the DOM
    const backgroundImage = computeStyle(node)['background-image'];
    const hasImage = Boolean(backgroundImage) && backgroundImage.toLowerCase() !== 'none';
    if (hasImage) {
      // eslint-disable-next-line no-param-reassign
      node.style['background-image'] = backgroundImage;
    }
  });
}

/**
 * Default DOM transformation: hands back the body of the document as is.
 *
 * @param {object} params
 * @param {string} params.url URL of the page (unused by the default)
 * @param {Document} params.document DOM document of the page
 * @param {string} params.html serialized HTML of the page (unused by the default)
 * @returns {Promise<Element>} the `body` of the given document
 */
// eslint-disable-next-line no-unused-vars
async function defaultTransformDOM({ url, document, html }) {
  const { body } = document;
  return body;
}

/**
 * Default strategy to compute the path of the generated document from the page URL.
 *
 * The URL pathname is used as a base: a trailing `/` becomes `/index`, the value
 * is URI-decoded and lower-cased, a trailing `.html` is removed and every character
 * which is not `a-z`, `0-9` or `/` is replaced by `-`.
 *
 * @param {object} params
 * @param {string} params.url absolute URL of the page
 * @param {Document} params.document DOM document of the page (unused by the default)
 * @returns {Promise<string>} the sanitized document path
 * @throws {TypeError} if `url` cannot be parsed by the `URL` constructor
 */
// eslint-disable-next-line no-unused-vars
async function defaultGenerateDocumentPath({ url, document }) {
  let p = new URL(url).pathname;
  if (p.endsWith('/')) {
    p = `${p}index`;
  }

  let decoded;
  try {
    decoded = decodeURIComponent(p);
  } catch (e) {
    // A stray "%" (e.g. "/100%") is kept as is by the URL parser but makes
    // decodeURIComponent throw a URIError: fall back to the raw pathname, the
    // sanitization below replaces the offending characters anyway.
    decoded = p;
  }

  return decoded
    .toLowerCase()
    .replace(/\.html$/, '')
    .replace(/[^a-z0-9/]/g, '-');
}

/**
 * Runs the importer pipeline on an already parsed document and collects the
 * outputs from an in-memory storage.
 *
 * @param {string} url URL of the page, handed to the transformation hooks
 * @param {Document} doc DOM document of the page
 * @param {object} [transformCfg] optional hooks: `transformDOM` and
 *   `generateDocumentPath`; a missing hook is replaced by its default. The
 *   object itself is never modified.
 * @param {boolean} toMd true to read the Markdown output back into the result
 * @param {boolean} toDocx true to read the docx output back into the result
 * @param {object} [options] `preprocess` (set to `false` to skip the DOM
 *   preprocessing), `docxStylesXML` and `svg2png` (forwarded to the importer
 *   as `mdast2docxOptions`)
 * @returns {Promise<object>} object with `html`, `path` and, when requested,
 *   `md` and `docx`
 */
async function html2x(url, doc, transformCfg, toMd, toDocx, options = {}) {
  let name = 'index';
  let dirname = '';

  // Resolve the hooks into locals instead of writing the defaults back into
  // the configuration object, which belongs to the caller.
  const transformDOM = transformCfg?.transformDOM || defaultTransformDOM;
  const generateDocumentPath = transformCfg?.generateDocumentPath
    || defaultGenerateDocumentPath;

  if (options.preprocess !== false) {
    preprocessDOM(doc);
  }

  const html = doc.documentElement.outerHTML;

  // Importer which serves the provided HTML instead of fetching the URL.
  class InternalImporter extends PageImporter {
    async fetch() {
      return new Response(html);
    }

    async process(document) {
      // hooks are invoked with the configuration as `this`, like a method call
      let output = await transformDOM.call(transformCfg, { url, document, html });
      output = output || document.body;

      let p = await generateDocumentPath.call(transformCfg, { url, document });
      if (!p) {
        // provided function returns null -> apply default
        p = await defaultGenerateDocumentPath({ url, document });
      }

      name = path.basename(p);
      dirname = path.dirname(p);

      const pir = new PageImporterResource(name, dirname, output, null, {
        html: output.outerHTML,
      });
      return [pir];
    }
  }

  // only warnings and errors are reported, both on stderr
  const logger = {
    debug: () => {},
    info: () => {},
    log: () => {},
    warn: (...args) => console.error(...args),
    error: (...args) => console.error(...args),
  };

  const storageHandler = new MemoryHandler(logger);
  const importer = new InternalImporter({
    storageHandler,
    skipDocxConversion: !toDocx,
    skipMDFileCreation: !toMd,
    logger,
    mdast2docxOptions: {
      stylesXML: options.docxStylesXML,
      svg2png: options.svg2png,
    },
  });

  const pirs = await importer.import(url);

  const res = {
    html: pirs[0].extra.html,
    // Join with a separator: plain concatenation turned "/folder" + "page"
    // into "/folderpage". The posix flavour keeps "/" on every platform.
    path: path.posix.join(dirname, name),
  };

  if (toMd) {
    res.md = await storageHandler.get(pirs[0].md);
  }
  if (toDocx) {
    res.docx = await storageHandler.get(pirs[0].docx);
  }
  return res;
}

/**
 * Converts a page to Markdown.
 *
 * @param {string} url URL of the page
 * @param {Document|string} document DOM document of the page, or its HTML source
 *   (a string is parsed with JSDOM first)
 * @param {object} [transformCfg] optional transformation hooks, forwarded to html2x
 * @param {object} [options] conversion options, forwarded to html2x
 * @returns {Promise<object>} the html2x result, computed with the Markdown output
 *   enabled and the docx output disabled
 */
async function html2md(url, document, transformCfg, options = {}) {
  const doc = typeof document === 'string'
    ? new JSDOM(document, { runScripts: undefined }).window.document
    : document;
  return html2x(url, doc, transformCfg, true, false, options);
}

/**
 * Converts a page to docx.
 *
 * @param {string} url URL of the page
 * @param {Document|string} document DOM document of the page, or its HTML source
 *   (a string is parsed with JSDOM first)
 * @param {object} [transformCfg] optional transformation hooks, forwarded to html2x
 * @param {object} [options] conversion options, forwarded to html2x
 * @returns {Promise<object>} the html2x result, computed with both the Markdown
 *   and the docx outputs enabled
 */
async function html2docx(url, document, transformCfg, options = {}) {
  const doc = typeof document === 'string'
    ? new JSDOM(document, { runScripts: undefined }).window.document
    : document;
  return html2x(url, doc, transformCfg, true, true, options);
}

export {
html2md,
html2docx,
defaultTransformDOM,
defaultGenerateDocumentPath,
};
4 changes: 4 additions & 0 deletions src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ import WPAdminAjaxPager from './wp/explorers/WPAdminAjaxPager.js';
import WPContentPager from './wp/explorers/WPContentPager.js';
import WPPostWrapPager from './wp/explorers/WPPostWrapPager.js';

import { html2md, html2docx } from './importer/HTML2x.js';

export {
PagingExplorer,
PagingExplorerParams,
Expand All @@ -47,4 +49,6 @@ export {
WPAdminAjaxPager,
WPContentPager,
WPPostWrapPager,
html2md,
html2docx,
};
132 changes: 132 additions & 0 deletions test/importers/HTML2x.spec.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
/*
* Copyright 2020 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/

import { ok, strictEqual } from 'assert';
import { describe, it } from 'mocha';
import { JSDOM } from 'jsdom';
import { docx2md } from '@adobe/helix-docx2md';
import MockMediaHandler from '../mocks/MockMediaHandler.js';

import DOMUtils from '../../src/utils/DOMUtils.js';
import {
html2md,
html2docx,
defaultGenerateDocumentPath,
defaultTransformDOM,
} from '../../src/importer/HTML2x.js';

// The default transformation must hand back the document body unchanged.
describe('defaultTransformDOM tests', () => {
  it('default transformation', async () => {
    const dom = new JSDOM('<html><body><h1>Hello World</h1></body></html>', { runScripts: undefined });
    const transformed = await defaultTransformDOM({ document: dom.window.document });
    strictEqual(transformed.outerHTML, '<body><h1>Hello World</h1></body>');
  });
});

// Each entry pairs a page URL with the document path expected from the default strategy.
describe('defaultGenerateDocumentPath tests', () => {
  it('default paths', async () => {
    const expectations = [
      ['https://wwww.sample.com', '/index'],
      ['https://wwww.sample.com/', '/index'],
      ['https://wwww.sample.com/index.html', '/index'],
      ['https://wwww.sample.com/index', '/index'],
      ['https://wwww.sample.com/page', '/page'],
      ['https://wwww.sample.com/page.html', '/page'],
      ['https://wwww.sample.com/folder/page', '/folder/page'],
      ['https://wwww.sample.com/folder/page.html', '/folder/page'],
      ['https://wwww.sample.com/folder/page/', '/folder/page/index'],
      ['https://wwww.sample.com/folder/page with spaces.html', '/folder/page-with-spaces'],
      ['https://wwww.sample.com/folder/PagE_with_3xtr4_charactére.html', '/folder/page-with-3xtr4-charact-re'],
    ];

    await Promise.all(expectations.map(async ([url, expected]) => {
      strictEqual(await defaultGenerateDocumentPath({ url }), expected);
    }));
  });
});

// html2md scenarios: default hooks, custom hooks, hooks returning null and
// the css background image preprocessing.
describe('html2md tests', () => {
  const PAGE_URL = 'https://www.sample.com/page.html';
  const HELLO_HTML = '<html><body><h1>Hello World</h1></body></html>';

  it('html2md provides a default transformation', async () => {
    const { html, md, path } = await html2md(PAGE_URL, HELLO_HTML);
    strictEqual(html.trim(), '<body><h1>Hello World</h1></body>');
    strictEqual(md.trim(), '# Hello World');
    strictEqual(path, '/page');
  });

  it('html2md handles a custom transformations', async () => {
    const transformCfg = {
      transformDOM: ({ document }) => {
        const paragraph = document.createElement('p');
        paragraph.innerHTML = 'My Hello to the World';
        return paragraph;
      },
      generateDocumentPath: () => '/my-custom-path',
    };
    const { html, md, path } = await html2md(PAGE_URL, HELLO_HTML, transformCfg);
    strictEqual(html.trim(), '<p>My Hello to the World</p>');
    strictEqual(md.trim(), 'My Hello to the World');
    strictEqual(path, '/my-custom-path');
  });

  it('html2md can deal with null returning transformation', async () => {
    const transformCfg = {
      transformDOM: () => null,
      generateDocumentPath: () => null,
    };
    // null results fall back to the default body and the default path
    const { html, md, path } = await html2md(PAGE_URL, HELLO_HTML, transformCfg);
    strictEqual(html.trim(), '<body><h1>Hello World</h1></body>');
    strictEqual(md.trim(), '# Hello World');
    strictEqual(path, '/page');
  });

  it('css background images are stored on elements and can be used during transformation', async () => {
    const source = '<html><head><style>div { background-image: url("./image.png"); }</style></head><body><div>div witth background image!</div></body></html>';
    const transformCfg = {
      transformDOM: ({ document }) => {
        const div = document.querySelector('div');
        const img = DOMUtils.getImgFromBackground(div, document);
        div.replaceWith(img);
        return document.body;
      },
    };
    const { html } = await html2md(PAGE_URL, source, transformCfg);
    strictEqual(html.trim(), '<body><img src="./image.png"></body>');
  });
});

// html2docx scenarios: the generated docx is converted back to Markdown and
// compared with the Markdown produced by the same call.
describe('html2docx tests', () => {
  const PAGE_URL = 'https://www.sample.com/page.html';
  const HELLO_HTML = '<html><body><h1>Hello World</h1></body></html>';

  // converts a docx back to Markdown, using a mock media handler
  const docxToMd = (docx) => docx2md(docx, {
    mediaHandler: new MockMediaHandler(),
  });

  it('html2docx provides a default transformation', async () => {
    const result = await html2docx(PAGE_URL, HELLO_HTML);
    strictEqual(result.html.trim(), '<body><h1>Hello World</h1></body>');
    strictEqual(result.md.trim(), '# Hello World');
    strictEqual(result.path, '/page');
    ok(result.docx);
    strictEqual(result.md, await docxToMd(result.docx));
  });

  it('html2docx handles a custom transformations', async () => {
    const transformCfg = {
      transformDOM: ({ document }) => {
        const paragraph = document.createElement('p');
        paragraph.innerHTML = 'My Hello to the World';
        return paragraph;
      },
      generateDocumentPath: () => '/my-custom-path',
    };
    const result = await html2docx(PAGE_URL, HELLO_HTML, transformCfg);
    strictEqual(result.html.trim(), '<p>My Hello to the World</p>');
    strictEqual(result.md.trim(), 'My Hello to the World');
    strictEqual(result.path, '/my-custom-path');

    ok(result.docx);
    strictEqual(result.md, await docxToMd(result.docx));
  });
});

0 comments on commit 729f842

Please sign in to comment.