Merged
36 commits
fa1b07e
add handlebars unit test
andykais Jan 24, 2019
30106d6
config v2, basic build structure
andykais Jan 24, 2019
ef0911a
convert to structure defs
andykais Jan 27, 2019
5866067
clean up lint
andykais Jan 27, 2019
0bbc9f3
remove unnecessary Rx.of observable
andykais Jan 27, 2019
c48e7bb
hacky fix for ts-runtime/issues/21
andykais Jan 31, 2019
c7685df
tighter parser&downloader class types
andykais Feb 1, 2019
c52a183
add FMap wrapper around Map
andykais Feb 14, 2019
50a4405
Merge branch 'master' of github.com:andykais/scrape-pages into config…
andykais Feb 14, 2019
88ce72d
refactor runOptions into options
andykais Feb 14, 2019
76392b0
adjust prettier width from 80 to 100
andykais Feb 14, 2019
ac4c90d
upgrade typescript-eslint
andykais Feb 14, 2019
8a924f9
ensure input keys & scrapers are valid, unreserved names
andykais Feb 14, 2019
e55725f
rename global testing vars
andykais Feb 14, 2019
dd71ea1
add pre-push check, fix handlebars vulnerability
andykais Feb 14, 2019
fff0cf2
serialize options & config
andykais Feb 14, 2019
359d26b
Merge branch 'config-structure-defs' of github.com:andykais/scrape-pa…
andykais Feb 14, 2019
ed5107e
simplify query output
andykais Feb 19, 2019
81b02a2
add logfile rotation
andykais Feb 19, 2019
d4cf36c
expose query.prepare to user
andykais Feb 21, 2019
0b963a6
add osx to travis, move npm deploy inside .travis.yml
andykais Feb 22, 2019
3d42562
add more fetch error catches, more map helpers
andykais Feb 22, 2019
54b060c
Merge branch 'config-structure-defs' of github.com:andykais/scrape-pa…
andykais Feb 22, 2019
b5168f5
add reusable options type
andykais Feb 22, 2019
ca16b4a
add value limit per ScrapeConfig
andykais Feb 25, 2019
f434200
push incomplete params third arg
andykais Feb 25, 2019
6a88f10
separate options & params
andykais Feb 26, 2019
8347690
add type assertion test
andykais Feb 26, 2019
bc809ab
remove lint
andykais Feb 26, 2019
ae1ee5a
rename config fields
andykais Feb 26, 2019
580a0e6
update readme
andykais Feb 26, 2019
0495b2e
add keyword to package.json
andykais Feb 26, 2019
f620897
remove `import` from config
andykais Feb 27, 2019
3cc622e
rename url -> downloadData in sql
andykais Feb 27, 2019
b912e18
give query function to user before creating folders
andykais Feb 27, 2019
168c2bc
update readme
andykais Feb 27, 2019
2 changes: 0 additions & 2 deletions .eslintignore

This file was deleted.

15 changes: 10 additions & 5 deletions .eslintrc
@@ -1,12 +1,17 @@
{
"parser": "eslint-plugin-typescript/parser",
"plugins": ["typescript", "no-only-tests"],
"parser": "@typescript-eslint/parser",
"parserOptions": {
"project": "./tsconfig.json",
"sourceType": "module"
},
"plugins": ["@typescript-eslint", "no-only-tests"],
"rules": {
"no-unused-expressions": "error",
"no-console": "warn",
"typescript/no-unused-vars": "error",
"typescript/explicit-member-accessibility": "error",
"typescript/member-ordering": "error",
"no-unreachable": "error",
"@typescript-eslint/no-unused-vars": "error",
"@typescript-eslint/explicit-member-accessibility": "error",
"@typescript-eslint/member-ordering": "error",
"no-only-tests/no-only-tests": "error"
}
}
3 changes: 2 additions & 1 deletion .prettierrc
@@ -1,4 +1,5 @@
{
"semi": false,
"singleQuote": true
"singleQuote": true,
"printWidth": 100
}
87 changes: 47 additions & 40 deletions .travis.yml
@@ -1,59 +1,66 @@
language: node_js

node_js:
- '11'
- '10'
- '9'
- '8'

os:
- linux
- osx
# - windows

cache: npm

stages:
- name: typecheck-test-lint
- name: typecheck
if: branch = master OR type IN (pull_request)
- name: test
if: branch = master OR type IN (pull_request)
- name: deploy-npm
- name: lint
if: branch = master OR type IN (pull_request)
- name: deploy npm
if: tag IS present
- name: deploy-website
- name: deploy website
if: branch = master AND NOT type IN (pull_request)

cache:
directories:
- $HOME/.npm

jobs:
include:
- stage: typecheck-test-lint
name: typecheck
node_js: '8'
- stage: typecheck
script: npm run typecheck
skip_cleanup: true

- stage: typecheck-test-lint
name: test node:10
node_js: '10'
script: npm run test:unit && npm run test:functional
- stage: typecheck-test-lint
name: test node:9
node_js: '9'
script: npm run test:unit && npm run test:functional
- stage: typecheck-test-lint
name: test node:8
node_js: '8'
script: npm run test:unit && npm run test:functional

- stage: typecheck-test-lint
name: lint
node_js: '8'
script: npm run lint

- stage: typecheck-test-lint
name: check formatting
node_js: '8'
script: npm run format:check
- stage: lint
script: npm run lint && npm run format:check
skip_cleanup: true

- stage: deploy-npm
node_js: '8'
- stage: deploy npm
script: skip
deploy:
provider: script
script: bash publish.sh
skip_cleanup: true
on:
tags: true
script:
- |
(
set -Eeuo pipefail
set -x
# bump version based on github release tag
npm version --no-git-tag-version "$TRAVIS_TAG"
# publish package to npmjs.org
echo '//registry.npmjs.org/:_authToken=${NPM_AUTH_TOKEN}' >> ~/.npmrc
npm run build
cd lib
npm publish
cd ..
# push updated package.json to github
git checkout -b master
git add package.json package-lock.json
git commit --message "release $TRAVIS_TAG"
git remote add deploy https://${GITHUB_TOKEN}@github.com/${TRAVIS_REPO_SLUG}.git
git push deploy master
)

- stage: deploy-website
node_js: '8'
- stage: deploy website
before_install: cd website
script: npm run build
deploy:
118 changes: 72 additions & 46 deletions README.md
@@ -17,49 +17,74 @@ npm install scrape-pages

## Usage

lets download the five most recent images from NASA's image of the day archive
Let's download the ten most recent images from NASA's image of the day archive. First, define a `config`,
`options`, and `params` to be passed into the scraper.

```javascript
const { scraper } = require('scrape-pages')
// create a config file
const config = {
scrape: {
download: 'https://apod.nasa.gov/apod/archivepix.html',
parse: {
selector: 'body > b > a:nth-child(-n+10)',
attribute: 'href'
// define some scrapers
scrapers: {
index: {
download: 'https://apod.nasa.gov/apod/archivepix.html',
parse: {
selector: 'body > b > a',
attribute: 'href'
},
limitValuesTo: 10
},
scrapeEach: {
post: {
download: 'https://apod.nasa.gov/apod/{value}',
parse: {
selector: 'a[href^="image"]',
attribute: 'href'
},
}
},
image: {
download: 'https://apod.nasa.gov/apod/{value}'
}
},
// describe how they work together
run: {
scraper: 'index',
scrapeEach: {
scraper: 'post',
scrapeEach: {
name: 'image',
download: 'https://apod.nasa.gov/apod/{value}'
scraper: 'image'
}
}
}
}

const options = {
folder: './downloads',
logLevel: 'info',
logFile: './nasa-download.log'
optionsEach: {
image: {
read: false,
write: true
}
}
}
// params are separated from config & options so params can change while reusing configs & options.
const params = {
folder: './downloads'
}
```

// load the config into a new 'scraper'
const scraper = await scrape(config, options)
const { on, emit, query } = scraper
on('image:compete', id => {
console.log('COMPLETED image', id)
})
After declaring your settings, using the library is straightforward: you can start the scraper, listen to
its events, emit events back to it, and query the scraped data.

```javascript
const { scrape } = require('scrape-pages')

// create an executable scraper and a querier
const { start, query } = scrape(config, options, params)
// begin scraping here
const { on, emit } = await start()
// listen to events
on('image:complete', id => console.log('COMPLETED image', id))
on('done', () => {
console.log('finished.')
const result = query({ scrapers: ['image'] })
// result = [{
// images: [{ filename: 'img1.jpg' }, { filename: 'img2.jpg' }, ...]
// }]
// result = [[{ filename: 'img1.jpg' }, { filename: 'img2.jpg' }, ...]]
})
```

@@ -73,46 +73,98 @@ provided, each run will work independently. `scraper.run` returns **emitter**

### scrape

| param | type | required | type file | description |
| ------- | ---------------- | -------- | -------------------------------------------------------------- | ----------------------------- |
| config | `ConfigInit` | Yes | [src/settings/config/types.ts](src/settings/config/types.ts) | _what_ is being downloaded |
| options | `RunOptionsInit` | Yes | [src/settings/options/types.ts](src/settings/options/types.ts) | _how_ something is downloaded |

| argument | type | required | type file | description |
| -------- | ------------- | -------- | -------------------------------------------------------------- | ----------------------------- |
| config | `ConfigInit` | Yes | [src/settings/config/types.ts](src/settings/config/types.ts) | _what_ is being downloaded |
| options | `OptionsInit` | Yes | [src/settings/options/types.ts](src/settings/options/types.ts) | _how_ something is downloaded |
| params | `ParamsInit` | Yes | [src/settings/params/types.ts](src/settings/params/types.ts) | _who_ is being downloaded |

### scraper
The `scrape` function returns a promise which yeilds these utilities (`on`, `emit`, and `query`)

The `scrape` function returns a promise which yields these utilities (`on`, `emit`, and `query`)

#### on

Listen for events from the scraper

| event | callback arguments | description |
| ---------------------- | --------------------- | ------------------------------------------ |
| `'done'` | queryFor | when the scraper has completed |
| `'error'` | error | if the scraper encounters an error |
| `'<scraper>:progress'` | queryFor, download id | emits progress of download until completed |
| `'<scraper>:queued'` | queryFor, download id | when a download is queued |
| `'<scraper>:complete'` | queryFor, download id | when a download is completed |
| event | callback arguments | description |
| ---------------------- | ------------------ | ------------------------------------------ |
| `'done'` | | when the scraper has completed |
| `'error'` | Error | if the scraper encounters an error |
| `'<scraper>:progress'` | download id | emits progress of download until completed |
| `'<scraper>:queued'` | download id | when a download is queued |
| `'<scraper>:complete'` | download id | when a download is completed |

#### emit

While the scraper is working, you can affect its behavior by emitting these events:

| event | arguments | description |
| --- | --- | --- |
| `'useRateLimiter'` | boolean | turn on or off the rate limit defined in the run options |
| `'stop'` | | stop the crawler (note that in progress requests will still complete) |
| event | arguments | description |
| ------------------ | --------- | --------------------------------------------------------------------- |
| `'useRateLimiter'` | boolean | turn on or off the rate limit defined in the run options |
| `'stop'`           |           | stop the crawler (note that in-progress requests will still complete) |

each event will return the **queryFor** function as its first argument

#### query

This function is an argument in the emitter callback and is used to get data back out of the scraper whenever
you need it. These are its arguments:

| name | type | required | description |
| --- | --- | --- | --- |
| `scrapers` | `string[]` | Yes | scrapers who will return their filenames and parsed values, in order |
| `groupBy` | `string` | Yes | name of a scraper which will delineate the values in `scrapers` |
| name | type | required | description |
| ---------- | ---------- | -------- | -------------------------------------------------------------------- |
| `scrapers` | `string[]` | Yes      | scrapers that will return their filenames and parsed values, in order |
| `groupBy` | `string` | Yes | name of a scraper which will delineate the values in `scrapers` |

## Motivation

8 changes: 4 additions & 4 deletions custom.d.ts
@@ -4,13 +4,13 @@ declare module '*.sql' {
}

declare module 'flow-runtime' {
const content: any
const content: {}
export default content
}

type ArgumentTypes<F extends Function> = F extends (...args: infer A) => any
? A
: never
type ArgumentTypes<F extends Function> = F extends (...args: infer A) => any ? A : never

type Omit<T, K> = Pick<T, Exclude<keyof T, K>>

type Nullable<T> = T | null
type Voidable<T> = T | void
2 changes: 1 addition & 1 deletion examples/__tests__/simple-config.unit.test.ts
@@ -2,7 +2,7 @@ import { assertConfigType } from '../../src/settings/config'
import * as testingConfigs from '../../testing/resources/testing-configs'

describe('example simple config', () => {
const simpleConfig = testingConfigs.__SIMPLE_CONFIG__
const simpleConfig = testingConfigs.SIMPLE_CONFIG

it('is properly typed', () => {
assertConfigType(simpleConfig)
33 changes: 22 additions & 11 deletions examples/deviantart.config.json
@@ -1,22 +1,33 @@
{
"input": "artist",
"scrape": {
"download": {
"increment": 24,
"urlTemplate": "https://{artist}.deviantart.com/gallery/?catpath=/&offset={_index}"
},
"parse": {
"selector": ".torpedo-thumb-link",
"attribute": "href"
"defs": {
"gallery": {
"download": {
"urlTemplate": "https://{{ artist }}.deviantart.com/gallery/?catpath=/&offset={{'*' index 24}}"
},
"parse": {
"selector": ".torpedo-thumb-link",
"attribute": "href"
},
"incrementUntil": "failed-download"
},
"scrapeEach": {
"post": {
"download": "{value}",
"parse": {
"selector": ".dev-view-deviation",
"attribute": "src"
},
}
},
"media": {
"download": "{value}"
}
},
"structure": {
"scraper": "gallery",
"scrapeEach": {
"scraper": "post",
"scrapeEach": {
"download": "{value}"
"scraper": "media"
}
}
}