Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[pull] master from diygod:master #1

Merged
merged 2 commits into from
Apr 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
159 changes: 120 additions & 39 deletions lib/routes/gov/moa/moa.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,34 @@ import { load } from 'cheerio';
import { parseRelativeDate } from '@/utils/parse-date';

// Base URL of the Ministry of Agriculture and Rural Affairs site; keeps its trailing slash.
const hostUrl = 'http://www.moa.gov.cn/';
// Parsed form of hostUrl, used further down to compare hosts.
const hostUrlObj = new URL(hostUrl);

/**
 * Route metadata for the moa.gov.cn feeds.
 *
 * The `suburl` path parameter is declared with the `{.+}` pattern and the example
 * passes a multi-segment value (`gk/zcjd/`); `handler` receives the request.
 * Each key appears exactly once: `name` and `maintainers` carry the real values,
 * not the placeholder `'Unknown'` / `[]` entries they used to be shadowing.
 */
export const route: Route = {
    path: '/moa/:suburl{.+}',
    categories: ['government'],
    example: '/gov/moa/gk/zcjd/',
    radar: [
        {
            source: ['moa.gov.cn/'],
            target: '/moa/:suburl',
        },
    ],
    parameters: { suburl: '下级目录,请使用最下级的目录' },
    name: '中华人民共和国农业农村部 - 新闻',
    maintainers: ['Origami404', 'lyqluis'],
    handler,
    url: 'moa.gov.cn/',
    description: `更多例子:
- \`农业农村部动态\`的网页链接是\`http://www.moa.gov.cn/xw/zwdt/\`, 对应的\`suburl\`是\`xw/zwdt\`
- \`财务公开\`的网页链接是\`http://www.moa.gov.cn/gk/cwgk_1/\`, 对应的\`suburl\`是\`gk/cwgk_1\`
- 像[政策法规](http://www.moa.gov.cn/gk/zcfg/)这种页面(\`http://www.moa.gov.cn/gk/zcfg/\`), 它**不是**一个合法的分类目录,它是\`法律\`, \`行政法规\`, \`部门规章\`等一堆栏目的集合,这时候请点开对应栏目的\`更多 >>\`进入栏目的最下级目录,再根据上面的规则提取\`suburl\`
- 特别地,\`图片新闻\`对应的\`suburl\`为\`xw/tpxw/\`, \`最新公开\`对应的\`suburl\`为\`govpublic\`, \`数据>最新发布\`对应的\`suburl\`为\`sj/zxfb\``,
};

async function handler(ctx) {
const rawSuburl = ctx.req.param('suburl');
const suburl = rawSuburl.slice(-1) === '/' ? rawSuburl : rawSuburl + '/';

// 特殊处理两个, 其他的栏目都可以找到那种一个列表下去的目录
// 特殊处理两个,其他的栏目都可以找到那种一个列表下去的目录
if (suburl === 'xw/tpxw/') {
// 图片新闻
return await dealChannel(suburl, {
Expand All @@ -34,11 +41,22 @@ async function handler(ctx) {
titleSelector: 'a[class="block w_fill ellipsis adc ahc"]',
dateSelector: 'span',
});
} else if (suburl === 'govpublic/') {
// 公开公告
return await dealChannel('govpublic/1/index.htm', {
} else if (suburl.startsWith('sj/zxfb')) {
// 数据 - 最新发布
return await dealLatestDataChannel();
} else if (suburl.startsWith('gk')) {
// 公开
return await dealChannel(suburl, {
channelTitleSelector: 'title',
listSelector: '.gongkai_centerRList li',
listSelector: '.commonlist li',
titleSelector: 'a',
dateSelector: 'span',
});
} else if (suburl.startsWith('govpublic')) {
// 最新公开
return await dealChannel('govpublic/1/index.htm', {
channelTitleText: '最新公开',
listSelector: '.commonlist li',
titleSelector: 'a',
dateSelector: 'span',
});
Expand All @@ -52,16 +70,16 @@ async function handler(ctx) {
}
}

// 处理文章列表, 从那里获得一堆要爬取的页面, 然后爬取
// 处理文章列表从那里获得一堆要爬取的页面然后爬取
async function dealChannel(suburl, selectors) {
const { channelTitleSelector, listSelector, titleSelector, dateSelector } = selectors;
const { channelTitleSelector, listSelector, titleSelector, dateSelector, channelTitleText } = selectors;

// 为了与下面解析相对链接的dealLink配合, 这里末尾必须保证有一条斜杠
const url = hostUrl + suburl;
const respone = await got.get(url);
const $ = load(respone.data);
// 为了与下面解析相对链接的 dealLink 配合,这里末尾必须保证有一条斜杠
const url = suburl.startsWith('http') ? suburl : hostUrl + suburl;
const response = await got.get(url);
const $ = load(response.data);

const channelTitle = $(channelTitleSelector).text();
const channelTitle = channelTitleText ?? $(channelTitleSelector).text();

const pageInfos = $(listSelector)
.map((i, e) => {
Expand Down Expand Up @@ -101,8 +119,7 @@ async function dealChannel(suburl, selectors) {
item = await dealGovpublicPage(link, item);
} else {
// 外部文章
item.description = `外部链接: ${item.link}`;
item.author = 'unknown';
item.description = `外部链接:${item.link}`;
}

cache.set(link, JSON.stringify(item));
Expand All @@ -117,18 +134,30 @@ async function dealChannel(suburl, selectors) {
};
}

// 处理正常文章, 例子: http://www.moa.gov.cn/gk/rsxx_1/202004/t20200421_6342037.htm
// 处理正常文章,例子:http://www.moa.gov.cn/xw/zwdt/202309/t20230915_6436615.htm
async function dealNormalPage(link, item) {
const reponse = await got.get(link);
const $ = load(reponse.data);
const metaElements = $('.bjjMAuthorBox span.dc_3').toArray();
const response = await got.get(link);
const $ = load(response.data);

// 政府网站变动不频繁, 写死第几个应该没有多大关系
// 互动-直播访谈
if (link.includes('zbft')) {
const pageHeader = $('.nybzb').html() ?? '';
const pics = $('.tpsl').html() ?? '';
const content = $('.wzsl').html() ?? '';

item.description = pageHeader + pics + content;
return item;
}

// normal news
const metaElements = $('.bjjMAuthorBox span.dc_2').toArray();

// 政府网站变动不频繁,写死第几个应该没有多大关系
const author = $(metaElements[1]).text();
const source = $(metaElements[2]).text();
item.author = `${author} ${source}`;

// 对于这个网站内的链接, 能提供更精确的时间
// 对于这个网站内的链接能提供更精确的时间
// 这个的日期跟时间之间的空格数量好像会乱变的
const exactTime = $(metaElements[0]).text();
const dateMatch = /\d{4}-\d{2}-\d{2}/.exec(exactTime);
Expand All @@ -140,34 +169,86 @@ async function dealNormalPage(link, item) {
return item;
}

// 处理那种带索引号的公示文章, 例子: http://www.moa.gov.cn/govpublic/XZQYJ/202004/t20200420_6341913.htm
// 处理那种带索引号的公示文章,例子:http://www.moa.gov.cn/gk/zcjd/202402/t20240219_6448654.htm
/**
 * Enrich a feed item from an indexed "public disclosure" article page,
 * e.g. http://www.moa.gov.cn/gk/zcjd/202402/t20240219_6448654.htm
 *
 * @param link - URL of the article page to fetch.
 * @param item - Feed item to enrich; it is mutated in place and returned.
 * @returns The same `item`, with `author` and `description` set, and `pubDate`
 *          overwritten only when a date could be extracted from the page.
 */
async function dealGovpublicPage(link, item) {
    // PDF links have no HTML body to parse, so the item is returned untouched.
    if (item.link.endsWith('.pdf')) {
        return item;
    }

    const response = await got.get(link);
    const $ = load(response.data);

    // `match` returns null when the publish-time text has no recognisable date.
    // Fall back to an empty array so the destructuring cannot throw; the guard
    // below then simply leaves the existing pubDate in place.
    const [, year, month, date] =
        $('.pubtime')
            .text()
            .match(/:(\d{4})[|年-](\d{1,2})[|月-](\d{1,2})日?/) ?? [];

    // The author is whatever follows the colon in the source line; empty string if absent.
    const [, author] = $('.pubtime.source').text().match(/:(.+)/) ?? [null, ''];

    if (year && month && date) {
        item.pubDate = `${year}-${month}-${date}`;
    }
    item.author = author;
    item.description = $('.gsj_htmlcon_bot').html();
    return item;
}

const head = $('ul.head');
const body = $('.arc_body');
/**
 * Build the "数据 - 最新发布" (data, latest releases) feed.
 *
 * Posts to the `getMessages` endpoint on zdscxx.moa.gov.cn for the first 20
 * entries, then loads each entry's detail through the cache, keyed by its
 * view-page link.
 *
 * @returns Feed data with a fixed title, the list-page link and the enriched items.
 */
async function dealLatestDataChannel() {
    const res = await got({
        url: 'http://zdscxx.moa.gov.cn:8080/nyb/getMessages',
        method: 'post',
        json: {
            page: 1,
            rows: 20,
            type: '最新发布',
            isLatestMessage: true,
        },
    });

    const items = await Promise.all(
        res.data.result.table.map((item) => {
            const { date, id } = item;
            const link = `http://zdscxx.moa.gov.cn:8080/nyb/pc/messageView.jsp?id=${id}`;

            item.pubDate = date;
            item.link = link;

            // Only the detail fields come from the second request; everything else
            // on the item is kept exactly as the list endpoint returned it.
            return cache.tryGet(link, async () => {
                const { content, source } = await getLatestDataArticleDetail(id);

                item.description = content;
                item.author = source;

                return item;
            });
        })
    );

    return {
        title: '中华人民共和国农业农村部 - 数据 - 最新发布',
        link: 'http://zdscxx.moa.gov.cn:8080/nyb/pc/messageList.jsp',
        item: items,
    };
}

/**
 * Fetch the detail record of a single "latest release" message.
 *
 * @param id - Message id, sent as a form field to the `getMessagesById` endpoint.
 * @returns The `result` member of the response payload.
 */
async function getLatestDataArticleDetail(id) {
    const detailResponse = await got({
        method: 'post',
        url: 'http://zdscxx.moa.gov.cn:8080/nyb/getMessagesById',
        form: { id },
    });
    const payload = detailResponse.data;

    return payload.result;
}

// 处理相对url 和 按链接对文章类型进行分类
// 处理相对 url 和 按链接对文章类型进行分类
function dealLink(element, url) {
const rawLink = element.attr('href');
const { host, href } = new URL(rawLink, url);

// host不同的是外部文章, outside
// url里带govpublic的都是公示文章, govpublic
// 其他的都算普通文章, normal
// host 不同的是外部文章,outside
// url 里带 govpublic 的都是公示文章,govpublic
// 其他的都算普通文章normal
let pageType = null;
if (host === hostUrlObj.host) {
pageType = href.includes('govpublic') ? 'govpublic' : 'normal';
pageType = href.includes('gk') || href.includes('govpublic') ? 'govpublic' : 'normal';
} else {
pageType = 'outside';
}
Expand Down
135 changes: 135 additions & 0 deletions lib/routes/zhonglun/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
import { Route } from '@/types';
import { getCurrentPath } from '@/utils/helpers';
const __dirname = getCurrentPath(import.meta.url);

import cache from '@/utils/cache';
import got from '@/utils/got';
import { load } from 'cheerio';
import { parseDate } from '@/utils/parse-date';
import { art } from '@/utils/render';
import path from 'node:path';

/**
 * Build the Zhong Lun "research articles" feed for one language edition of the site.
 *
 * Reads the optional `language` route parameter (default `zh`, served from `www`;
 * any other value is sanitised and used as the sub-domain) and the optional `limit`
 * query parameter (default 30). The article list is scraped first, then every
 * article page is fetched through the cache to fill in the full text, author,
 * publish date and lead image.
 *
 * @param ctx - Request context; only `ctx.req.param()` and `ctx.req.query('limit')` are read.
 * @returns Feed data: title, description, link, items, logo image, author and language.
 */
export const handler = async (ctx) => {
    const defaultLimit = 30;
    const templatePath = path.join(__dirname, 'templates/description.art');

    const { language = 'zh' } = ctx.req.param();

    // A non-numeric `limit` parses to NaN, and `slice(0, NaN)` would silently produce
    // an empty feed, so fall back to the default instead.
    const parsedLimit = Number.parseInt(ctx.req.query('limit') ?? '', 10);
    const limit = Number.isNaN(parsedLimit) ? defaultLimit : parsedLimit;

    const subdomain = language === 'zh' ? 'www' : language.replaceAll(/[^\dA-Za-z-]/g, '');
    const rootUrl = `https://${subdomain}.zhonglun.com`;
    const currentUrl = new URL('research/articles', rootUrl).href;

    const { data: response } = await got(currentUrl);
    const $ = load(response);

    // First pass: one stub per heading in the list, with the teaser text as description.
    const listItems = $('div#dataList h3')
        .slice(0, limit)
        .toArray()
        .map((element) => {
            const heading = $(element);

            return {
                title: heading.text(),
                description: art(templatePath, { intro: heading.next().text() }),
                pubDate: parseDate(heading.find('span').first().text()),
                link: heading.find('a').prop('href'),
                language,
            };
        });

    // Second pass: replace the stub fields with what the article page itself provides.
    const items = await Promise.all(
        listItems.map((item) =>
            cache.tryGet(item.link, async () => {
                const { data: detailResponse } = await got(item.link);
                const $$ = load(detailResponse);

                const articleBody = $$('div.edit_con_original');
                // Keep the teaser rendered in the first pass and append the full article HTML.
                const description = item.description + art(templatePath, { description: articleBody.html() });
                const image = $$('img.raw-image').first().prop('src');

                item.title = $$('div.news_dtitle h2').text();
                item.description = description;
                item.pubDate = parseDate($$('span.posttime').text());
                // Keep only the part after the last colon of the author line.
                item.author = $$('span.author').text().split(/:/).pop();
                item.content = {
                    html: description,
                    text: articleBody.text(),
                };
                item.image = image;
                item.banner = image;
                item.language = language;

                return item;
            })
        )
    );

    // `new URL(undefined, rootUrl)` would resolve to a bogus ".../undefined" address,
    // so the logo is only resolved when the <img> actually carries a src.
    const logoSrc = $('header.header h1 a img').prop('src');
    const image = logoSrc ? new URL(logoSrc, rootUrl).href : undefined;

    return {
        title: `${$('title').text()} - ${$('div.siteban_text').text()}`,
        description: $('meta[name="description"]').prop('content'),
        link: currentUrl,
        item: items,
        allowEmpty: true,
        image,
        author: $('meta[name="author"]').prop('content'),
        language,
    };
};

/**
 * Route metadata for the Zhong Lun research-articles feed.
 *
 * The only path parameter is the optional `:language`, so `parameters` documents
 * it under that same name. The radar rules map each language sub-domain's
 * article list to the matching feed path.
 */
export const route: Route = {
    path: '/research/article/:language{[a-zA-Z0-9-]+}?',
    name: '中伦研究专业文章',
    url: 'zhonglun.com',
    maintainers: ['nczitzk'],
    handler,
    example: '/zhonglun/research/article/zh',
    // Key must match the `:language` placeholder in `path`.
    parameters: { language: '语言,默认为 zh,即简体中文,可在对应分类页 URL 中找到' },
    description: `
| ENG | 简体中文 | 日本語 | 한국어 |
| --- | -------- | ------ | ------ |
| en | zh | ja | kr |
`,
    categories: ['new-media'],

    features: {
        requireConfig: false,
        requirePuppeteer: false,
        antiCrawler: false,
        supportRadar: true,
        supportBT: false,
        supportPodcast: false,
        supportScihub: false,
    },
    radar: [
        {
            title: '专业文章',
            source: ['zhonglun.com/research/articles'],
            target: '/research/article/zh',
        },
        {
            title: ' Articles',
            source: ['en.zhonglun.com/research/articles'],
            target: '/research/article/en',
        },
        {
            title: '論評',
            source: ['ja.zhonglun.com/research/articles'],
            target: '/research/article/ja',
        },
        {
            title: '전문기사',
            source: ['kr.zhonglun.com/research/articles'],
            target: '/research/article/kr',
        },
    ],
};
8 changes: 8 additions & 0 deletions lib/routes/zhonglun/namespace.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import type { Namespace } from '@/types';

/**
 * Namespace metadata for the Zhong Lun Law Firm routes: display name, site URL,
 * an empty description and the categories the namespace is listed under.
 */
export const namespace: Namespace = {
    name: '中伦律师事务所',
    url: 'zhonglun.com',
    description: '',
    categories: ['new-media'],
};
7 changes: 7 additions & 0 deletions lib/routes/zhonglun/templates/description.art
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{{ if intro }}
<blockquote>{{ intro }}</blockquote>
{{ /if }}

{{ if description }}
{{@ description }}
{{ /if }}