Skip to content

Commit

Permalink
Flatten result object formats, use native promises
Browse files Browse the repository at this point in the history
  • Loading branch information
blakeembrey committed Dec 18, 2016
1 parent ce737a8 commit 4f46001
Show file tree
Hide file tree
Showing 20 changed files with 63,559 additions and 61,520 deletions.
14 changes: 8 additions & 6 deletions src/extract.ts
Original file line number Diff line number Diff line change
@@ -1,23 +1,25 @@
import defaultSnippets from './snippets'
import { scrapeUrl } from './scrape'
import { ScrapeResult, Snippet, ScrapeOptions, ExtractOptions } from './interfaces'
import { Result, Snippet, ScrapeOptions, ExtractOptions } from './interfaces'

/**
* Extract rich snippets from the scraping result.
*/
export async function extract (result: ScrapeResult<any>, options: ExtractOptions = {}): Promise<Snippet | undefined> {
export async function extract (result: Result, options: ExtractOptions = {}): Promise<Snippet | undefined> {
if (result == null) {
return
}

const snippets = options.snippets || defaultSnippets
const extract = snippets[result.type]

if (extract) {
return extract(result, options)
if (result.type == null || !snippets[result.type]) {
return {
url: result.url,
encodingFormat: result.encodingFormat
}
}

return
return snippets[result.type](result, options)
}

/**
Expand Down
3 changes: 2 additions & 1 deletion src/interfaces/index.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
export * from './formats'
export * from './options'
export * from './results'
export * from './snippets'
export * from './formats'
26 changes: 7 additions & 19 deletions src/interfaces/options.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { Readable } from 'stream'
import Promise = require('any-promise')
import { Snippet } from './snippets'
import { BaseResult, Result } from './results'

/**
* HTTP headers interface.
Expand Down Expand Up @@ -29,7 +29,7 @@ export interface RequestResult {
* Content scraping options.
*/
export interface ScrapeOptions {
scrapers?: Scraper<any>[]
scrapers?: Scraper[]
useOEmbed?: boolean
fallbackOnFavicon?: boolean
makeRequest? (url: string): Promise<RequestResult>
Expand All @@ -46,35 +46,23 @@ export interface ExtractOptions {
extractExifData? (url: string, stream: Readable, abort: AbortFn): Promise<any>
}

/**
* Re-used base interface for scraped and extracted information.
*/
export interface ScrapeResult <T extends any> {
// Kind of content; the stream scraper starts from 'link' and the HTML scraper overwrites it with 'html'.
type: 'html' | 'image' | 'video' | 'pdf' | 'link' | string
// Scraper-specific payload; the stream scraper initialises it to `null` before a scraper fills it in.
content: T
// URL the content was read from, as passed to the stream scraper.
contentUrl: string
// Numeric value of the `content-length` header, or undefined when that value is not finite.
contentSize?: number
// Media type parsed from the `content-type` header, or undefined when the header is absent.
encodingFormat?: string
}

/**
* Format for detecting support for scraping information.
*/
export interface Scraper <T> {
supported (result: ScrapeResult<any>, headers: Headers): boolean
export interface Scraper {
supported (result: BaseResult, headers: Headers): boolean
handle (
result: ScrapeResult<any>,
headers: Headers,
result: BaseResult,
stream: Readable,
abort: AbortFn,
options: ScrapeOptions
): ScrapeResult<T> | Promise<ScrapeResult<T>>
): Result | Promise<Result>
}

/**
* Interface to extract information from the scraped content.
*/
export type Extract = (result: ScrapeResult<any>, options: ExtractOptions) => undefined | Snippet | Promise<Snippet>
export type Extract = (result: Result, options: ExtractOptions) => undefined | Snippet | Promise<Snippet>

/**
* Map of methods for extracting.
Expand Down
53 changes: 53 additions & 0 deletions src/interfaces/results.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import {
ResultJsonLd,
ResultApplinks,
ResultDublinCore,
ResultHtml,
ResultSailthru,
ResultTwitter,
Icon,
Alternative
} from 'htmlmetaparser'

import { Headers } from './options'
import { ExifData } from './formats'

/**
* Response-level fields shared by every scrape result.
*
* The stream scraper builds an object of this shape from the URL, status and
* headers it is given, hands it to each scraper, and returns it unchanged when
* no scraper reports support for it.
*/
export interface BaseResult {
// Discriminator for the concrete result shape; not set on the base object the stream scraper builds.
type?: string
// URL the stream scraper was called with.
url: string
// Status the stream scraper was called with.
status: number
// Headers the stream scraper was called with.
headers: Headers
// Media type parsed from the `content-type` header; undefined when the header is absent.
encodingFormat?: string
}

/**
* Result produced by the HTML scraper: the base response fields plus the
* metadata parsed out of the document, stored flat on the result object.
*/
export interface HtmlResult extends BaseResult {
type: 'html'
// Structured data in three syntaxes, all typed with the parser's JSON-LD result type.
jsonld?: ResultJsonLd
rdfa?: ResultJsonLd
microdata?: ResultJsonLd
// The fields below are copied from the parsed HTML metadata by the HTML scraper.
twitter?: ResultTwitter
html?: ResultHtml
dublincore?: ResultDublinCore
applinks?: ResultApplinks
sailthru?: ResultSailthru
// When this list is empty the HTML scraper may append `/favicon.ico` (unless `fallbackOnFavicon` is false and only on a 200 response).
icons: Array<Icon>
// Alternate links from the document; the HTML scraper searches these for a `text/json+oembed` entry.
alternate: Array<Alternative>
// JSON parsed from the oEmbed alternate link's response; left unset when `useOEmbed` is false, the response is not 200, or parsing fails.
oembed?: any
}

/**
* Result for responses tagged `type: 'image'`: the base response fields plus
* EXIF data.
*/
export interface ImageResult extends BaseResult {
type: 'image'
// NOTE(review): presumably produced via the `extractExifData` option - the image scraper is not visible here, confirm.
exifData: ExifData
}

/**
* Result for responses tagged `type: 'video'`: the base response fields plus
* EXIF data.
*/
export interface VideoResult extends BaseResult {
type: 'video'
// NOTE(review): presumably produced via the `extractExifData` option - the video scraper is not visible here, confirm.
exifData: ExifData
}

/**
* Result for responses tagged `type: 'pdf'`: the base response fields plus
* EXIF data.
*/
export interface PdfResult extends BaseResult {
type: 'pdf'
// NOTE(review): presumably produced via the `extractExifData` option - the PDF scraper is not visible here, confirm.
exifData: ExifData
}

/**
* Any value a scrape can resolve to. `BaseResult` is a member of the union
* because the stream scraper returns the bare base object when no scraper
* handles the response, so `type` may be undefined and must be checked before
* relying on a specific result shape.
*/
export type Result = HtmlResult | ImageResult | VideoResult | PdfResult | BaseResult
19 changes: 9 additions & 10 deletions src/interfaces/snippets.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
export interface SnippetBase {
type: 'html' | 'image' | 'video' | 'pdf' | 'link' | string
contentUrl: string
export interface BaseSnippet {
type?: string
url: string
canonicalUrl?: string
contentSize?: number
encodingFormat?: string
}

Expand Down Expand Up @@ -101,7 +100,7 @@ export interface RichEntity {

export type Entity = ArticleEntity | VideoEntity | ImageEntity | RichEntity

export interface HtmlSnippet extends SnippetBase {
export interface HtmlSnippet extends BaseSnippet {
type: 'html'
entity?: Entity
image?: HtmlSnippetImage | HtmlSnippetImage[]
Expand All @@ -127,11 +126,11 @@ export interface HtmlSnippet extends SnippetBase {
apps?: HtmlSnippetApps
}

export interface VideoSnippet extends SnippetBase {
export interface VideoSnippet extends BaseSnippet {
type: 'video'
}

export interface ImageSnippet extends SnippetBase {
export interface ImageSnippet extends BaseSnippet {
type: 'image'
dateModified?: Date
dateCreated?: Date
Expand All @@ -146,7 +145,7 @@ export interface ImageSnippet extends SnippetBase {
megapixels?: number
}

export interface PdfSnippet extends SnippetBase {
export interface PdfSnippet extends BaseSnippet {
type: 'pdf'
author?: string
title?: string
Expand All @@ -157,8 +156,8 @@ export interface PdfSnippet extends SnippetBase {
dateModified?: Date
}

export interface LinkSnippet extends SnippetBase {
export interface LinkSnippet extends BaseSnippet {
type: 'link'
}

export type Snippet = PdfSnippet | LinkSnippet | VideoSnippet | ImageSnippet | HtmlSnippet
export type Snippet = PdfSnippet | LinkSnippet | VideoSnippet | ImageSnippet | HtmlSnippet | BaseSnippet
36 changes: 16 additions & 20 deletions src/scrape.ts
Original file line number Diff line number Diff line change
@@ -1,54 +1,50 @@
import Promise = require('any-promise')
import { Readable } from 'stream'
import { parse } from 'content-type'
import defaultScrapers from './scrapers'
import { Headers, AbortFn, ScrapeResult, ScrapeOptions } from './interfaces'
import { Headers, AbortFn, Result, ScrapeOptions, BaseResult } from './interfaces'
import { makeRequest as defaultMakeRequest } from './support'

/**
* Scrape metadata from a URL.
*/
export function scrapeUrl (url: string, options: ScrapeOptions = {}): Promise<ScrapeResult<any>> {
export async function scrapeUrl (url: string, options: ScrapeOptions = {}): Promise<Result> {
const makeRequest = options.makeRequest || defaultMakeRequest
const res = await makeRequest(url)

return makeRequest(url).then(res => {
return scrapeStream(res.url, res.headers, res.stream, res.abort, options)
})
return scrapeStream(res.url, res.status, res.headers, res.stream, res.abort, options)
}

/**
* Scrape metadata from a stream (with headers/URL).
*/
export function scrapeStream (
contentUrl: string,
export async function scrapeStream (
url: string,
status: number,
headers: Headers,
stream: Readable,
abort?: AbortFn,
options: ScrapeOptions = {}
): Promise<ScrapeResult<any>> {
): Promise<Result> {
const encodingFormat = headers['content-type'] ? parse(headers['content-type']).type : undefined
const contentLength = Number(headers['content-length'])
const contentSize = isFinite(contentLength) ? contentLength : undefined
const close = abort || (() => stream.resume())
const scrapers = options.scrapers || defaultScrapers

const result: ScrapeResult<null> = {
type: 'link',
content: null,
contentUrl,
encodingFormat,
contentSize
const base: BaseResult = {
url,
status,
headers,
encodingFormat
}

// Traverse the available scrapers to extract information.
for (const rule of scrapers) {
if (rule.supported(result, headers)) {
return Promise.resolve(rule.handle(result, headers, stream, close, options))
if (rule.supported(base, headers)) {
return rule.handle(base, stream, close, options)
}
}

// Abort unhandled types.
close()

return Promise.resolve(result)
return base
}
59 changes: 27 additions & 32 deletions src/scrapers/html.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,32 +5,25 @@ import { resolve } from 'url'
import { promises as Jsonld } from 'jsonld'
import { parse } from 'content-type'
import { makeRequest as defaultMakeRequest, concat } from '../support'

import {
Headers,
AbortFn,
ScrapeResult,
ScrapeOptions
} from '../interfaces'
import { AbortFn, BaseResult, HtmlResult, ScrapeOptions } from '../interfaces'

/**
* Check support for HTML.
*/
export function supported ({ encodingFormat }: ScrapeResult<null>) {
export function supported ({ encodingFormat }: BaseResult) {
return encodingFormat === 'text/html'
}

export async function handle (
result: ScrapeResult<HtmlContent>,
_headers: Headers,
base: BaseResult,
stream: Readable,
_abort: AbortFn,
options: ScrapeOptions
): Promise<ScrapeResult<HtmlContent>> {
const { contentUrl } = result
): Promise<HtmlResult> {
const { url } = base
const makeRequest = options.makeRequest || defaultMakeRequest

const parsed = await parseHtml(stream, contentUrl)
const parsed = await parseHtml(stream, url)
const { twitter, html, icons, dublincore, applinks, sailthru, alternate } = parsed

const [jsonld, rdfa, microdata] = await Promise.all([
Expand All @@ -39,32 +32,34 @@ export async function handle (
Jsonld.expand(parsed.microdata || {})
])

result.type = 'html'

result.content = {
jsonld,
rdfa,
microdata,
twitter,
html,
icons,
dublincore,
applinks,
sailthru,
alternate
}
const result: HtmlResult = Object.assign(
{
type: 'html' as 'html',
jsonld,
rdfa,
microdata,
twitter,
html,
icons,
dublincore,
applinks,
sailthru,
alternate
},
base
)

// Attempt to read OEmbed metadata.
if (options.useOEmbed !== false) {
for (const alternate of result.content.alternate) {
for (const alternate of result.alternate) {
if (alternate.type === 'text/json+oembed') {
const res = await makeRequest(alternate.href)

if (res.status === 200) {
const content = await concat(res.stream)

try {
result.content.oembed = JSON.parse(content.toString('utf8'))
result.oembed = JSON.parse(content.toString('utf8'))
} catch (e) { /* Ignore parse errors. */ }
}

Expand All @@ -74,15 +69,15 @@ export async function handle (
}

// Follow the default browser behaviour to find `favicon.ico`.
if (options.fallbackOnFavicon !== false && result.content.icons.length === 0) {
const href = resolve(contentUrl, '/favicon.ico')
if (options.fallbackOnFavicon !== false && result.icons.length === 0) {
const href = resolve(url, '/favicon.ico')
const res = await makeRequest(href)

// Ignore the actual response body, it's not important.
res.abort()

if (res.status === 200) {
result.content.icons.push({
result.icons.push({
type: res.headers['content-type'] ? parse(res.headers['content-type']).type : undefined,
href
})
Expand Down
Loading

0 comments on commit 4f46001

Please sign in to comment.