/
router.ts
206 lines (184 loc) · 6.76 KB
/
router.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
import type { Dictionary } from '@crawlee/types';
import type { CrawlingContext, RestrictedCrawlingContext } from './crawlers/crawler_commons';
import { MissingRouteError } from './errors';
import type { Request } from './request';
import type { Awaitable } from './typedefs';
/**
 * Module-private key under which the default route handler is stored in a router's
 * route map. Being a unique symbol, it cannot collide with any label supplied by callers.
 */
const defaultRoute = Symbol('default-route');
/**
 * A router that is also directly callable with a crawling context.
 * This is the shape returned by `Router.create()`: it exposes the `Router` members
 * and can be invoked as `(ctx) => Awaitable<void>`.
 */
export interface RouterHandler<Context extends Omit<RestrictedCrawlingContext, 'enqueueLinks'> = CrawlingContext> extends Router<Context> {
// Call signature: invoking the router itself with a context.
(ctx: Context): Awaitable<void>;
}
/**
 * Extracts the `UserData` type argument from a `Request<UserData>` type.
 * Resolves to `never` when `T` is not a `Request`.
 */
export type GetUserDataFromRequest<T> = T extends Request<infer Y> ? Y : never;
/**
 * Dictionary of route handlers keyed by label (string or symbol).
 * Each handler receives the crawling context with its `request` property
 * re-typed to `Request<UserData>`.
 */
export type RouterRoutes<Context, UserData extends Dictionary> = Record<
    string | symbol,
    (ctx: Omit<Context, 'request'> & { request: Request<UserData> }) => Awaitable<void>
>;
/**
* Simple router that works based on request labels. This instance can then serve as a `requestHandler` of your crawler.
*
* ```ts
* import { Router, CheerioCrawler, CheerioCrawlingContext } from 'crawlee';
*
* const router = Router.create<CheerioCrawlingContext>();
*
* // we can also use factory methods for specific crawling contexts, the above equals to:
* // import { createCheerioRouter } from 'crawlee';
* // const router = createCheerioRouter();
*
* router.addHandler('label-a', async (ctx) => {
* ctx.log.info('...');
* });
* router.addDefaultHandler(async (ctx) => {
* ctx.log.info('...');
* });
*
* const crawler = new CheerioCrawler({
* requestHandler: router,
* });
* await crawler.run();
* ```
*
* Alternatively we can use the default router instance from crawler object:
*
* ```ts
* import { CheerioCrawler } from 'crawlee';
*
* const crawler = new CheerioCrawler();
*
* crawler.router.addHandler('label-a', async (ctx) => {
* ctx.log.info('...');
* });
* crawler.router.addDefaultHandler(async (ctx) => {
* ctx.log.info('...');
* });
*
* await crawler.run();
* ```
*
* For convenience, we can also define the routes right when creating the router:
*
* ```ts
* import { CheerioCrawler, createCheerioRouter } from 'crawlee';
* const crawler = new CheerioCrawler({
* requestHandler: createCheerioRouter({
* 'label-a': async (ctx) => { ... },
* 'label-b': async (ctx) => { ... },
* }),
* });
* await crawler.run();
* ```
*
* Middlewares are also supported via the `router.use` method. There can be multiple
* middlewares for a single router, they will be executed sequentially in the same
* order as they were registered.
*
* ```ts
* crawler.router.use(async (ctx) => {
* ctx.log.info('...');
* });
* ```
*/
export class Router<Context extends Omit<RestrictedCrawlingContext, 'enqueueLinks'>> {
    /** Registered handlers keyed by label; the default handler is stored under `defaultRoute`. */
    private readonly routes: Map<string | symbol, (ctx: Context) => Awaitable<void>> = new Map();

    /** Middlewares in registration order; all of them run before the matched route handler. */
    private readonly middlewares: ((ctx: Context) => Awaitable<void>)[] = [];

    /**
     * use Router.create() instead!
     * @ignore
     */
    protected constructor() {}

    /**
     * Registers new route handler for given label.
     *
     * @param label Label (string or symbol) the handler should be bound to.
     * @param handler Function invoked with the crawling context of a request carrying this label.
     * @throws Error when a handler for the same label is already registered.
     */
    addHandler<UserData extends Dictionary = GetUserDataFromRequest<Context['request']>>(
        label: string | symbol,
        handler: (ctx: Omit<Context, 'request'> & { request: Request<UserData> }) => Awaitable<void>,
    ) {
        this.validate(label);
        this.routes.set(label, handler);
    }

    /**
     * Registers default route handler, used when no handler matches the request label.
     *
     * @param handler Function invoked with the crawling context of an unmatched request.
     * @throws Error when a default handler is already registered.
     */
    addDefaultHandler<UserData extends Dictionary = GetUserDataFromRequest<Context['request']>>(
        handler: (ctx: Omit<Context, 'request'> & { request: Request<UserData> }) => Awaitable<void>,
    ) {
        this.validate(defaultRoute);
        this.routes.set(defaultRoute, handler);
    }

    /**
     * Registers a middleware that will be fired before the matching route handler.
     * Multiple middlewares can be registered, they will be fired in the same order.
     *
     * @param middleware Function invoked with the crawling context before the route handler.
     */
    use(middleware: (ctx: Context) => Awaitable<void>) {
        this.middlewares.push(middleware);
    }

    /**
     * Returns route handler for given label. If no label is provided, or no handler is
     * registered for it, the default request handler will be returned.
     *
     * @param label Label to look up; may be omitted to request the default handler.
     * @returns The handler registered for `label`, or the default handler.
     * @throws MissingRouteError when neither a matching nor a default handler exists.
     */
    getHandler(label?: string | symbol): (ctx: Context) => Awaitable<void> {
        // A nullish check (instead of truthiness) lets a handler registered under
        // the empty-string label be resolved like any other label.
        if (label != null) {
            const labelled = this.routes.get(label);
            if (labelled) {
                return labelled;
            }
        }

        const fallback = this.routes.get(defaultRoute);
        if (fallback) {
            return fallback;
        }

        throw new MissingRouteError(
            `Route not found for label '${String(label)}'.`
            + ' You must set up a route for this label or a default route.'
            + ' Use `requestHandler`, `router.addHandler` or `router.addDefaultHandler`.',
        );
    }

    /**
     * Throws when the label already exists in our registry.
     *
     * @param label Label (or the internal default-route symbol) about to be registered.
     */
    private validate(label: string | symbol) {
        if (this.routes.has(label)) {
            const message = label === defaultRoute
                ? `Default route is already defined!`
                : `Route for label '${String(label)}' is already defined!`;
            throw new Error(message);
        }
    }

    /**
     * Creates new router instance. This instance can then serve as a `requestHandler` of your crawler.
     *
     * ```ts
     * import { Router, CheerioCrawler, CheerioCrawlingContext } from 'crawlee';
     *
     * const router = Router.create<CheerioCrawlingContext>();
     * router.addHandler('label-a', async (ctx) => {
     *     ctx.log.info('...');
     * });
     * router.addDefaultHandler(async (ctx) => {
     *     ctx.log.info('...');
     * });
     *
     * const crawler = new CheerioCrawler({
     *     requestHandler: router,
     * });
     * await crawler.run();
     * ```
     *
     * @param routes Optional dictionary of handlers keyed by label. Every own enumerable
     *   key is registered, including symbol keys.
     * @returns A callable router: invoking it runs all middlewares and then the handler
     *   matching `context.request.label`.
     */
    static create<
        Context extends Omit<RestrictedCrawlingContext, 'enqueueLinks'> = CrawlingContext,
        UserData extends Dictionary = GetUserDataFromRequest<Context['request']>,
    >(routes?: RouterRoutes<Context, UserData>): RouterHandler<Context> {
        const router = new Router<Context>();

        // The returned function gets this object as its prototype, so it stays a
        // function while also exposing the router methods bound to `router`.
        const obj = Object.create(Function.prototype);
        obj.addHandler = router.addHandler.bind(router);
        obj.addDefaultHandler = router.addDefaultHandler.bind(router);
        obj.getHandler = router.getHandler.bind(router);
        obj.use = router.use.bind(router);

        if (routes) {
            // `Object.entries` skips symbol keys, yet labels may be symbols. Walk every
            // own key and keep only enumerable ones, so string keys behave exactly as
            // before and symbol-labelled routes are no longer silently dropped.
            for (const label of Reflect.ownKeys(routes)) {
                if (Object.prototype.propertyIsEnumerable.call(routes, label)) {
                    router.addHandler(label, routes[label]);
                }
            }
        }

        const func = async function (context: Context) {
            const { url, loadedUrl, label } = context.request;
            context.log.debug('Page opened.', { label, url: loadedUrl ?? url });

            // Middlewares run sequentially, in registration order, before the handler.
            for (const middleware of router.middlewares) {
                await middleware(context);
            }

            return router.getHandler(label)(context);
        };

        Object.setPrototypeOf(func, obj);

        return func as unknown as RouterHandler<Context>;
    }
}