Skip to content
This repository has been archived by the owner on Nov 4, 2023. It is now read-only.

Commit

Permalink
feat: wip class scraping
Browse files Browse the repository at this point in the history
  • Loading branch information
cdaringe committed Jan 4, 2021
1 parent 8ed7ff7 commit d0cad9b
Show file tree
Hide file tree
Showing 12 changed files with 5,906 additions and 271 deletions.
1,158 changes: 1,158 additions & 0 deletions packages/json-schema-producer/factorio.schema.d.ts

Large diffs are not rendered by default.

4,455 changes: 4,455 additions & 0 deletions packages/json-schema-producer/factorio.schema.json

Large diffs are not rendered by default.

9 changes: 4 additions & 5 deletions packages/json-schema-producer/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,17 @@
"name": "@dino-dna/factorio-ts-json-schema-producer",
"version": "1.0.0",
"dependencies": {
"@dino-dna/factorio-ts-json-schema-producer": "/home/cdaringe/src/factorio-ts/packages/json-schema-producer",
"@types/puppeteer": "^5.4.2",
"@types/turndown": "^5.0.0",
"ajv": "7.0.3",
"cd": "^0.3.3",
"bluebird": "^3.7.2",
"happy-dom": "^2.1.3",
"puppeteer": "5.4.1",
"turndown": "^7.0.0",
"whatwg-encoding": "^1.0.5"
},
"devDependencies": {
"@types/bluebird": "^3.5.33",
"@types/json-schema": "^7.0.6",
"@types/puppeteer": "^5.4.2",
"@types/turndown": "^5.0.0",
"ava": "^3.15.0",
"json-schema-to-typescript": "^10.0.3"
},
Expand Down
290 changes: 100 additions & 190 deletions packages/json-schema-producer/src/from-website.ts
Original file line number Diff line number Diff line change
@@ -1,157 +1,60 @@
import { writeFileSync } from "fs";
import { Window } from "happy-dom";
import Bluebird from "bluebird";
import { promises as fs } from "fs";
import { JSONSchema4 } from "json-schema";
import { compile } from "json-schema-to-typescript";
import pup from "puppeteer";
import { fromLuaType } from "./json-schema";
import TurndownService from "turndown";
import { Page, Browser, launch } from "puppeteer";
import { classNames } from "./globals";
import { scrapeClass } from "./scrape/classes";
import { scrapeDefines } from "./scrape/defines";
import { loadVirtualPage, toDocument } from "./scrape/dom";

type PageMeta = { baseUrl: string; pageBasename: string };

/**
 * Converts an HTML fragment to markdown and rewrites page-relative link
 * targets into absolute URLs.
 *
 * @param s - HTML fragment to convert.
 * @param pageMeta - `baseUrl` is the URL of the page the fragment came from;
 *   `pageBasename` is the file name portion of that URL (e.g. `defines.html`).
 * @returns The markdown produced by turndown. Link targets are only rewritten
 *   when the input contains `href`.
 */
const asUrlCorrectedMarkdown = (
  s: string,
  { baseUrl, pageBasename }: PageMeta
) => {
  // Escape regex metacharacters so e.g. the "." in "defines.html" is literal.
  const escapeRegExp = (text: string) =>
    text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

  const md = new TurndownService({}).turndown(s);
  if (!/href/.test(s)) return md;

  // cases
  // 0. absolute urls
  //    no op. the following rules must not meddle with http(s):// urls
  // 1. on-page urls/anchors: `](defines.html#x` -> `](<baseUrl>#x`
  const onPageLink = new RegExp(`\\]\\(${escapeRegExp(pageBasename)}`, "gi");
  // 2. relative urls: `](Other.html#x` -> `](<baseUrl with Other.html>#x`
  //    The target is matched lazily and may not contain ")" or whitespace, so
  //    one match can never span two links on the same line.
  const relativeLink = /\]\((?!http)([^)\s]+?\.html)/gi;

  // Replacer functions are used so "$" in urls is never treated as a
  // replacement pattern, and so each link is rewritten with its own path.
  return md
    .replace(onPageLink, () => `](${baseUrl}`)
    .replace(
      relativeLink,
      (_match, relPath: string) =>
        `](${baseUrl.replace(pageBasename, () => relPath)}`
    );
};

export type FactorioApi = {
defines: Record<string, Record<string, string>>;
/**
 * Builds a task that loads the `defines.html` page under `urlRoot` in a new
 * browser page and scrapes it into a schema via `scrapeDefines`.
 *
 * @param urlRoot - Root URL of the API docs; `defines.html` is appended to it.
 * @param browser - Puppeteer browser used to open the page.
 * @returns A zero-argument async function that performs the scrape when called.
 */
const createGetDefines = (urlRoot: string, browser: Browser) => async () => {
  const pageBasename = "defines.html";
  const baseUrl = `${urlRoot}/${pageBasename}`;
  const page = await browser.newPage();
  try {
    await page.goto(baseUrl, {
      waitUntil: "networkidle2",
    });
    // The HTML is captured as a string, so the page itself is not needed
    // once `content()` resolves.
    const html = await page.content();
    return scrapeDefines(toDocument(html), {
      baseUrl,
      pageBasename,
    });
  } finally {
    // Release the tab even when navigation or scraping fails; otherwise each
    // call leaves a page open until the whole browser is closed.
    await page.close();
  }
};

/** Schema type produced for the Factorio API; an alias of JSON Schema draft-04. */
export type FactorioJsonSchema = JSONSchema4;

const scrapeNestedDefines = (rootEl: Element, pageMeta: PageMeta) => {
const [headerEl, contentEl] = Array.from(rootEl.children) as [
HTMLElement,
HTMLElement
];
if (!headerEl || !contentEl) {
throw new Error(`unexpected defines HTML structure ${rootEl.innerHTML}`);
}
const name = headerEl.innerText.trim();
const descriptionEl = Array.from(contentEl.children).find(
(el) => el.tagName === "p"
) as HTMLElement | undefined;

const briefMembersEl = Array.from(contentEl.children).find(
(el) => el.className === "brief-members"
);
if (!briefMembersEl) {
throw new Error(`unable to locate brief-members el`);
}
const localFields = Array.from(briefMembersEl.querySelectorAll("tr")).map(
(el) => {
const parts = el.id.split(".");
return {
name: parts[parts.length - 1],
description: asUrlCorrectedMarkdown(
el.querySelector(".description")?.innerHTML.trim() || "",
pageMeta
),
};
}
);

const localProperties = localFields.reduce((acc, { name, description }) => {
const subschema: JSONSchema4 = {
type: "string",
description,
// type: "object",
};
// hack for ts mapping support
Object.defineProperty(subschema, "tsType", {
enumerable: false,
get: () => "unknown",
const createGetClasses = (
browser: Browser,
classLinks: { text: string; href: string }[]
) =>
classLinks.map(({ text: className, href }) => async () => {
const page = await browser.newPage();
await page.goto(href, {
waitUntil: "networkidle2",
});
const parts = href.split("/");
return {
...acc,
[name]: subschema,
className,
schema: scrapeClass(toDocument(await page.content()), className, {
baseUrl: href,
pageBasename: parts[parts.length - 1],
}),
};
}, {} as Required<JSONSchema4>["properties"]);

const nestedProperties = Array.from(contentEl.children)
.filter((el) => el.className === "element")
.map((v) => scrapeNestedDefines(v, pageMeta))
.reduce((acc, { name, schema }) => {
return {
...acc,
[name]: schema,
};
}, {} as Required<JSONSchema4>["properties"]);

const schema: JSONSchema4 = {
type: "object",
description: descriptionEl?.innerText.trim() || "",
properties: {
...localProperties,
...nestedProperties,
},
required: [
...Object.keys(localProperties),
...Object.keys(nestedProperties),
],
additionalProperties: false,
};
return { name, schema };
};

/**
 * Creates a fresh virtual window and writes the given HTML into its document.
 *
 * @param html - Markup to write into the new document.
 * @returns The populated `document` together with its owning `window`.
 */
const loadVirtualPage = (html: string) => {
  const virtualWindow = new Window();
  const { document: virtualDocument } = virtualWindow;
  virtualDocument.write(html);
  return { document: virtualDocument, window: virtualWindow };
};
});

const scrapeDefines = (html: string, pageMeta: PageMeta) => {
const { document } = loadVirtualPage(html);
const l2s = Array.from(document.querySelectorAll("body > .element")).map(
(l1) => {
const { name, schema } = scrapeNestedDefines(
(l1 as any) as Element,
pageMeta
);
return { name, schema };
}
);
const schema: JSONSchema4 = {
type: "object",
description:
"Factorio constants, persistent handles.\n@{see https://lua-api.factorio.com/latest/defines.html}\nFactorio does not include types associated with defines :/",
properties: l2s.reduce(
(acc, { name, schema }) => ({
...acc,
[name]: schema,
}),
{}
),
required: l2s.map(({ name }) => name),
additionalProperties: false,
};
return schema;
/**
 * Lists the class links found on the already-loaded index page.
 *
 * Starting at the element with id `Classes`, the code walks five element
 * siblings forward and two first-children down, then reads one link per child
 * node of the node it lands on.
 *
 * @param page - Puppeteer page whose current HTML content is parsed.
 * @param baseUrl - Prefix joined with `/` in front of every discovered href.
 * @returns The link text and absolute href of each class.
 * @throws Error when the expected structure, an anchor, its text or its href
 *   is missing.
 */
const enumerateClasses = async (
  page: Page,
  baseUrl: string
): Promise<{ text: string; href: string }[]> => {
  const { document } = loadVirtualPage(await page.content());

  // Navigation path from the "Classes" element to the node holding the rows.
  const pathToRows = [
    "nextElementSibling",
    "nextElementSibling",
    "nextElementSibling",
    "nextElementSibling",
    "nextElementSibling",
    "firstChild",
    "firstChild",
  ] as const;

  // The virtual DOM typings are not relied on here (the original cast to any).
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  let cursor: any = document.getElementById("Classes");
  if (!cursor) throw new Error(`unable to find element with id "Classes"`);
  pathToRows.forEach((step, index) => {
    cursor = cursor[step];
    if (!cursor) {
      // Fail with the exact step instead of an opaque "of null" TypeError.
      throw new Error(
        `unexpected index page structure: no ${step} at step ${
          index + 1
        } after #Classes`
      );
    }
  });

  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  return Array.from(cursor.childNodes as ArrayLike<any>).map((row) => {
    // Optional chaining lets childless rows (e.g. text nodes) reach the
    // descriptive error below rather than crashing on `null.firstChild`.
    const classAnchor = row.firstChild?.firstChild;
    if (!classAnchor) throw new Error("no class anchor element found");
    const text: string | null = classAnchor.textContent;
    const href: string | null = classAnchor.getAttribute?.("href") ?? null;
    if (!text || !href) throw new Error(`unable to find text or href`);
    return { text, href: `${baseUrl}/${href}` };
  });
};

export const produce = async ({
Expand All @@ -165,65 +68,72 @@ export const produce = async ({
string,
{
slug: string;
parse: (page: pup.Page) => JSONSchema4;
parse: (page: Page) => JSONSchema4;
}
>;
}) => {
const browser = await pup.launch({ headless: true });
const browser = await launch({ headless: true });
const page = await browser.newPage();
const pageBasename = "defines.html";
const baseUrl = `${urls.apiRoot}/${pageBasename}`;
await page.goto(baseUrl, {
waitUntil: "networkidle2",
});
const defines = await scrapeDefines(await page.content(), {
baseUrl,
pageBasename,
});
const schema: FactorioJsonSchema = {
await page.goto(urls.apiRoot, { waitUntil: "networkidle2" });
const classLinks = await enumerateClasses(page, urls.apiRoot);
const [defines, ...classes] = Bluebird.map(
[
createGetDefines(urls.apiRoot, browser),
...createGetClasses(browser, classLinks),
],
(fn) => fn(),
{
concurrency: 5,
}
);
const schema: JSONSchema4 = {
type: "object",
description: "Factorio Lua API",
required: ["defines"],
required: [...classNames, "defines"],
properties: {
defines,
...globalClasses,
},
additionalProperties: false,
};
const tso = await compile(schema, "FactorioApi");
writeFileSync("debug.d.ts", tso);
await Promise.all([
fs.writeFile("factorio.schema.json", JSON.stringify(schema, null, 2)),
fs.writeFile("factorio.schema.d.ts", tso),
]);
browser.close();
};

/**
 * Parses argument text of the form `name :: type (optional): description`.
 *
 * @param text - Raw text of one argument entry.
 * @returns The argument name, whether it is marked optional, the type as
 *   converted by `fromLuaType`, and the remaining description.
 * @throws Error when the `::` separator is missing or no alphanumeric type
 *   name follows it.
 */
export const parseArgText = (text: string) => {
  // Split on the FIRST "::" only, so a description that itself contains "::"
  // is kept intact.
  const separatorIndex = text.indexOf("::");
  if (separatorIndex < 0) {
    throw new Error(
      `unable to parse argument text (missing "::" separator): ${text}`
    );
  }
  const name = text.slice(0, separatorIndex).trim();
  const rest = text.slice(separatorIndex + 2).trim();

  const typeMatch = /^([a-zA-Z0-9]+)\s*/.exec(rest);
  if (!typeMatch) {
    throw new Error(`unable to parse argument type from: ${text}`);
  }
  const type = typeMatch[1];

  // Everything after the leading type token.
  const remainder = rest.slice(typeMatch[0].length).trim();
  const optional = /^\(optional/.test(remainder);
  const description = remainder.replace("(optional):", "").trim();
  return {
    name,
    optional,
    type: fromLuaType(type),
    description,
  };
};
// export const parseArgText = (text: string) => {
// const [name, r1] = text.split("::").map((s) => s.trim());
// const [_, type] = r1.match(/^([a-zA-Z0-9]+)\s*/)!;
// const r2 = r1.replace(type, "").trim();
// const optional = !!r2.match(/^\(optional/);
// const description = r2.replace("(optional):", "").trim();
// return {
// name,
// optional,
// type: fromLuaType(type),
// description,
// };
// };

/**
 * Extracts the name, description and arguments of one event element.
 *
 * Structure read by the code: the first child holds the name; the second
 * child holds a description element, an ignored element, and a detail element
 * whose second child contains one element per argument.
 *
 * @param el - Root element of a single event entry.
 * @returns The name text, the description HTML (empty string when absent)
 *   and the arguments parsed by `parseArgText` (empty when there is no
 *   detail section).
 * @throws Error when the element lacks its name or body child.
 */
const parseEventHtml = (el: Element) => {
  const [nameEl, bodyEl] = Array.from(el.children);
  if (!nameEl || !bodyEl) {
    throw new Error(`unexpected event HTML structure ${el.innerHTML}`);
  }
  const name = nameEl.textContent;
  const [descriptionEl, , detailEl] = Array.from(bodyEl.children);
  const description = descriptionEl?.innerHTML || "";
  // Array.from(undefined) throws, so missing sections must be defaulted
  // BEFORE conversion rather than with `|| []` afterwards.
  const [, detailContent] = detailEl ? Array.from(detailEl.children) : [];
  const args = Array.from(detailContent?.children ?? []).map((node) =>
    parseArgText((node as HTMLElement).innerText)
  );
  return {
    name,
    description,
    args,
  };
};
// const parseEventHtml = (el: Element) => {
// const [c1, c2] = Array.from(el.children) || [];
// const name = c1!.textContent;
// const [descriptionEl, _empty, detailEl] = Array.from(c2.children) || [];
// const description = descriptionEl?.innerHTML || "";
// const [_detailHeader, detailContent] = Array.from(detailEl.children) || [];
// const args = (Array.from(detailContent?.children) || [])
// .filter(Boolean)
// .map((node) => parseArgText((node as HTMLElement).innerText));
// return {
// name,
// description,
// args,
// };
// };

/**
 * Parses every element whose id contains `on_` with `parseEventHtml`.
 *
 * NOTE(review): `page` is not used; the elements are read from the ambient
 * `document`, so this only works where a global `document` exists — confirm
 * the intended execution context.
 *
 * @param page - Currently unused.
 * @returns The parsed events, in document order.
 */
const parseEvents = (page: pup.Page) => {
  // Return the mapped result; previously it was computed and then discarded.
  return Array.from(document.querySelectorAll(`[id*=on_]`)).map((el) =>
    parseEventHtml(el)
  );
};
// const parseEvents = (page: pup.Page) => {
// Array.from(document.querySelectorAll(`[id*=on_]`)).map(parseEventHtml);
// };
8 changes: 8 additions & 0 deletions packages/json-schema-producer/src/globals.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
/**
 * Names listed, together with `defines`, as required top-level properties of
 * the produced schema.
 */
export const classNames: string[] = [
  "game",
  "script",
  "remote",
  "commands",
  "settings",
  "rcon",
];
1 change: 1 addition & 0 deletions packages/json-schema-producer/src/interfaces.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/** Location details of a scraped page, used when rewriting its links. */
export type PageMeta = {
  /** URL of the page being scraped. */
  baseUrl: string;
  /** File name portion of `baseUrl`, e.g. `defines.html`. */
  pageBasename: string;
};
26 changes: 26 additions & 0 deletions packages/json-schema-producer/src/markdown.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import TurndownService from "turndown";
import { PageMeta } from "./interfaces";

/**
 * Converts an HTML fragment to markdown and rewrites page-relative link
 * targets into absolute URLs.
 *
 * @param s - HTML fragment to convert.
 * @param pageMeta - `baseUrl` is the URL of the page the fragment came from;
 *   `pageBasename` is the file name portion of that URL (e.g. `defines.html`).
 * @returns The markdown produced by turndown. Link targets are only rewritten
 *   when the input contains `href`.
 */
export const asUrlCorrectedMarkdown = (
  s: string,
  { baseUrl, pageBasename }: PageMeta
) => {
  // Escape regex metacharacters so e.g. the "." in "defines.html" is literal.
  const escapeRegExp = (text: string) =>
    text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

  const md = new TurndownService({}).turndown(s);
  if (!/href/.test(s)) return md;

  // cases
  // 0. absolute urls
  //    no op. the following rules must not meddle with http(s):// urls
  // 1. on-page urls/anchors: `](defines.html#x` -> `](<baseUrl>#x`
  const onPageLink = new RegExp(`\\]\\(${escapeRegExp(pageBasename)}`, "gi");
  // 2. relative urls: `](Other.html#x` -> `](<baseUrl with Other.html>#x`
  //    The target is matched lazily and may not contain ")" or whitespace, so
  //    one match can never span two links on the same line.
  const relativeLink = /\]\((?!http)([^)\s]+?\.html)/gi;

  // Replacer functions are used so "$" in urls is never treated as a
  // replacement pattern, and so each link is rewritten with its own path.
  return md
    .replace(onPageLink, () => `](${baseUrl}`)
    .replace(
      relativeLink,
      (_match, relPath: string) =>
        `](${baseUrl.replace(pageBasename, () => relPath)}`
    );
};

0 comments on commit d0cad9b

Please sign in to comment.