From 910d7a22512416978b82c4a97e37048afab64075 Mon Sep 17 00:00:00 2001 From: Jacob Logan Date: Fri, 1 Mar 2024 13:19:18 -0700 Subject: [PATCH 1/3] update link checker to wait for each call to complete before making the next and remove # from urls being checked --- tasks/link-checker.js | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/tasks/link-checker.js b/tasks/link-checker.js index ff1cf1a6ab9..748e8c1ed46 100644 --- a/tasks/link-checker.js +++ b/tasks/link-checker.js @@ -1,5 +1,5 @@ -const puppeteer = require('puppeteer'); -const axios = require('axios'); +const puppeteer = require('puppeteer'); // eslint-disable-line +const axios = require('axios'); // eslint-disable-line const SITEMAP_URL = 'https://docs.amplify.aws/sitemap.xml'; const DOMAIN = 'https://docs.amplify.aws'; @@ -62,7 +62,9 @@ const retrieveLinks = async (siteMapUrls, visitedLinks, localDomain) => { let response = await page.goto(url, { waitUntil: 'domcontentloaded' }); await new Promise((r) => setTimeout(r, 100)); // localhost hangs on wait for idle so use a short timeout instead if (response && response.status() && response.status() === 200) { - console.log(`successfully visited ${url} to retrieve links`); + console.log( + `successfully visited ${url} to retrieve links ${urlList.length} links found` + ); visitedLinks[url] = true; const urlList = await page.evaluate(async (url) => { @@ -128,11 +130,9 @@ const linkChecker = async (localDomain) => { localDomain ); - let allPromises = []; - for (let i = 0; i < urlsToVisit.length; i++) { const link = urlsToVisit[i]; - let href = link.url; + let href = link.url.split('#')[0]; if (href.startsWith(GITHUB_CREATE_ISSUE_LINK)) { // remove query parameters from github new issue links href = href.split('?')[0]; @@ -163,11 +163,9 @@ const linkChecker = async (localDomain) => { } }); - allPromises.push(request); + await request; } - await Promise.all(allPromises); - console.log(statusCodes); console.log(brokenLinks); From 8d435b7234170faa612810da0f40b6367af45976 Mon Sep 17 00:00:00 2001 From: Jacob Logan Date: Fri, 1 Mar 2024 13:33:07 -0700 Subject: [PATCH 2/3] update logging to show how many links found on each page --- tasks/link-checker.js | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tasks/link-checker.js b/tasks/link-checker.js index 748e8c1ed46..c0714226257 100644 --- a/tasks/link-checker.js +++ b/tasks/link-checker.js @@ -62,9 +62,6 @@ const retrieveLinks = async (siteMapUrls, visitedLinks, localDomain) => { let response = await page.goto(url, { waitUntil: 'domcontentloaded' }); await new Promise((r) => setTimeout(r, 100)); // localhost hangs on wait for idle so use a short timeout instead if (response && response.status() && response.status() === 200) { - console.log( - `successfully visited ${url} to retrieve links ${urlList.length} links found` - ); visitedLinks[url] = true; const urlList = await page.evaluate(async (url) => { @@ -84,6 +81,10 @@ const retrieveLinks = async (siteMapUrls, visitedLinks, localDomain) => { return urls; }, url); + console.log( + `successfully visited ${url} to retrieve links ${urlList.length} links found` + ); + urlList.forEach((link) => { if ( !CRAWLER_EXCEPTIONS.includes(link.url) && From b774fb411eac7890d87b35fe0095ab8a2e4c5b2b Mon Sep 17 00:00:00 2001 From: Scott Rees <6165315+reesscot@users.noreply.github.com> Date: Wed, 6 Mar 2024 13:55:59 -0700 Subject: [PATCH 3/3] Update tasks/link-checker.js --- tasks/link-checker.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/link-checker.js b/tasks/link-checker.js index c0714226257..23a935105c0 100644 --- a/tasks/link-checker.js +++ b/tasks/link-checker.js @@ -82,7 +82,7 @@ const retrieveLinks = async (siteMapUrls, visitedLinks, localDomain) => { }, url); console.log( - `successfully visited ${url} to retrieve links ${urlList.length} links found` + `successfully visited ${url} to retrieve links. ${urlList.length} links found` ); urlList.forEach((link) => {