From 94d692a4607088a69402927062b4f6672d2de3f0 Mon Sep 17 00:00:00 2001 From: Damian Zehnder <16799758+dzehnder@users.noreply.github.com> Date: Mon, 25 Mar 2024 16:46:59 +0100 Subject: [PATCH] feat: introduce top-pages command using ahrefs (#3) --- README.md | 7 +++ all-assessments.js | 15 +++++- assessment/ahrefs-lib.js | 85 ++++++++++++++++++++++++++++++++++ assessment/assessment-lib.js | 8 +--- assessment/canonical.js | 90 ++++++++++++++++++++++++++++-------- assessment/file-lib.js | 19 ++++++++ assessment/sitemap.js | 15 ++++-- 7 files changed, 206 insertions(+), 33 deletions(-) create mode 100644 assessment/ahrefs-lib.js create mode 100644 assessment/file-lib.js diff --git a/README.md b/README.md index cf509f8..51f3d89 100644 --- a/README.md +++ b/README.md @@ -5,3 +5,10 @@ Misc SEO research `npm run all ` +## How to trigger canonical assessment + +`node ./assessment/canonical.js <siteUrl> [options]` + +Options: +- `--top-pages=<number>` - Run audit for top pages (default 200), based on estimated organic traffic +- `--sitemap=<url>` - Specify a sitemap location (default: fetched from robots.txt or /sitemap.xml); especially useful for pages in development, as they are not yet listed in robots.txt or sitemap_index.xml diff --git a/all-assessments.js b/all-assessments.js index f83d9f8..50d5364 100644 --- a/all-assessments.js +++ b/all-assessments.js @@ -1,5 +1,16 @@ -import {canonical} from "./assessment/canonical.js"; -import {sitemap} from "./assessment/sitemap.js"; +/* + * Copyright 2024 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +import { canonical } from './assessment/canonical.js'; +import { sitemap } from './assessment/sitemap.js'; (async () => { await sitemap; diff --git a/assessment/ahrefs-lib.js b/assessment/ahrefs-lib.js new file mode 100644 index 0000000..0253493 --- /dev/null +++ b/assessment/ahrefs-lib.js @@ -0,0 +1,85 @@ +/* + * Copyright 2024 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ + +import path from 'path'; +import fs from 'fs'; +import { fileURLToPath } from 'url'; +import { csv2json, json2csv } from 'json-2-csv'; +import { generateFileName } from './file-lib.js'; + +const AHREFS_API_BASE_URL = 'https://api.ahrefs.com/v3'; +const OUTPUT_DIR = path.join(path.dirname(fileURLToPath(import.meta.url)), 'output'); + +const sendRequest = async (endpoint, queryParams = {}) => { + const queryParamsKeys = Object.keys(queryParams); + const queryString = queryParamsKeys.length > 0 + ? 
`?${queryParamsKeys + .map((key) => `${encodeURIComponent(key)}=${encodeURIComponent(queryParams[key])}`) + .join('&')}` : ''; + + const fullAuditRef = `${AHREFS_API_BASE_URL}${endpoint}${queryString}`; + const response = await fetch(fullAuditRef, { + method: 'GET', + headers: { + 'Content-Type': 'application/json', + Authorization: `Bearer ${process.env.AHREFS_API_KEY}`, + }, + }); + + if (!response.ok) { + throw new Error(`Ahrefs API request failed with status: ${response.status}`); + } + + try { + const result = await response.json(); + return { + result, + fullAuditRef, + }; + } catch (e) { + throw new Error(`Error parsing Ahrefs API response: ${e.message}`); + } +}; + +export const getTopPages = async (target, limit) => { + // check if file exists that starts with and return immediately if it does + const files = fs.readdirSync(OUTPUT_DIR); + const existingFile = files.find((file) => file.startsWith(`${generateFileName(target, `top-pages-${limit}`)}`)); + if (existingFile) { + console.log(`Using cached file to avoid Ahrefs API call: ${existingFile}`); + const cachedContent = fs.readFileSync(`${OUTPUT_DIR}/${existingFile}`); + return csv2json(cachedContent.toString()); + } + + const queryParams = { + select: [ + 'url', + 'sum_traffic', + ].join(','), + limit, + order_by: 'sum_traffic_merged', + target, + date: new Date().toISOString().split('T')[0], + date_compared: new Date(Date.now() - 30 * 24 * 60 * 60 * 1000).toISOString().split('T')[0], + output: 'json', + }; + // safe result as csv to cache + const { result } = await sendRequest('/site-explorer/top-pages', queryParams); + if (result.pages) { + const csvResult = json2csv(result.pages); + const FILE_PATH = path.join(OUTPUT_DIR, `${generateFileName(target, `top-pages-${limit}`)}-${Date.now()}.csv`); + fs.writeFileSync(FILE_PATH, csvResult); + return result.pages; + } else { + throw new Error('No pages found in Ahrefs API response.'); + } +}; diff --git a/assessment/assessment-lib.js 
b/assessment/assessment-lib.js index 943c2cb..a8a4131 100644 --- a/assessment/assessment-lib.js +++ b/assessment/assessment-lib.js @@ -10,18 +10,14 @@ * governing permissions and limitations under the License. */ -import { fileURLToPath } from 'url'; import fs from 'fs'; import path from 'path'; import { json2csv } from 'json-2-csv'; +import { generateFileName, OUTPUT_DIR } from './file-lib.js'; import { getSiteByBaseUrl } from '../spacecat-lib.js'; export const USER_AGENT = 'basecode/seo-research-crawler/1.0'; -const OUTPUT_DIR = path.join(path.dirname(fileURLToPath(import.meta.url)), 'output'); - -const sanitizeFilename = (url) => url.replace(/[^a-zA-Z0-9]/g, '_').toLowerCase(); - const hrtimeToSeconds = (hrtime) => { const [seconds, nanoseconds] = hrtime; // Destructuring for clarity return (seconds * 1e9 + nanoseconds) / 1e9; // Simplified calculation @@ -39,7 +35,7 @@ export const createAssessment = async (userSite, userTitle) => { console.log('Check if URL is qualified to be assessed. 
Needs to be part of spacecat catalogue'); const SITE = await getSiteByBaseUrl(userSite); const SITE_URL = SITE.baseURL; - const FILE_PATH = path.join(OUTPUT_DIR, `${sanitizeFilename(userTitle)}-${sanitizeFilename(SITE_URL)}-${Date.now()}.csv`); + const FILE_PATH = path.join(OUTPUT_DIR, `${generateFileName(SITE_URL, userTitle)}-${Date.now()}.csv`); console.log(`${userTitle}: Assessment for ${SITE_URL}`); diff --git a/assessment/canonical.js b/assessment/canonical.js index 6bdd5a1..3d4f600 100644 --- a/assessment/canonical.js +++ b/assessment/canonical.js @@ -12,11 +12,18 @@ import { JSDOM } from 'jsdom'; import { createAssessment } from './assessment-lib.js'; import { fetchSitemapsFromBaseUrl } from './sitemap.js'; +import { getTopPages } from './ahrefs-lib.js'; const TRACKING_PARAM = '?utm'; const userSiteUrl = process.argv[2]; -const checkForCanonical = async (url, assessment) => { +const options = { + topPages: undefined, + sitemapSrc: undefined, +}; + +// eslint-disable-next-line consistent-return +const checkForCanonical = async (url, assessment, source = 'ahrefs', retries = 3, backoff = 300) => { try { const response = await fetch(url); const contentType = response.headers.get('content-type'); @@ -32,9 +39,10 @@ const checkForCanonical = async (url, assessment) => { if (canonicalLink) { assessment.addColumn({ url, + source, canonicalExists: true, response: response.status, - presentInSiteMap: url === canonicalLink, + presentInSiteMap: source === 'sitemap' ? url === canonicalLink : '', www: url.startsWith('https://www.'), hasTrailingSlash: url.endsWith('/'), hasHtmlExtension: url.endsWith('.html'), @@ -54,37 +62,79 @@ const checkForCanonical = async (url, assessment) => { }); } } catch (error) { - assessment.addColumn({ - url, - error: `Error fetching URL ${url}: ${error.message}`, - }); + if (retries > 0) { + console.log(`Error fetching URL ${url}: ${error.message}. 
Retrying in ${backoff}ms`); + await new Promise((resolve) => { + setTimeout(resolve, backoff); + }); + return checkForCanonical(url, assessment, source, retries - 1, backoff * 2); + } else { + assessment.addColumn({ + url, + error: `Error fetching URL ${url}: ${error.message} after ${retries} retries`, + }); + } } }; const canonicalAudit = async (siteUrl, assessment) => { - // TODO: fetch sitemap url from file if already exists - const sitemaps = await fetchSitemapsFromBaseUrl(siteUrl); - return Promise.all(sitemaps.map((sitemap) => { - if (sitemap.page) { - return checkForCanonical(sitemap.page, assessment); - } - })); + if (options.topPages) { + // if top pages are specified, get pages from ahrefs + // default, get pages from sitemap + console.log(`Fetching top ${options.topPages} pages from Ahrefs`); + const pages = await getTopPages(siteUrl, options.topPages); + // eslint-disable-next-line consistent-return,array-callback-return + return Promise.all(pages.map((page) => { + if (page.url && page.sum_traffic > 0) { + return checkForCanonical(page.url, assessment); + } + })); + } else { + console.log(`Fetching pages from sitemap ${options.sitemapSrc ? 
`provided at ${options.sitemapSrc}` : ''}`); + const pages = await fetchSitemapsFromBaseUrl(siteUrl, options.sitemapSrc); + // eslint-disable-next-line array-callback-return,consistent-return + return Promise.all(pages.map((page) => { + if (page.page) { + return checkForCanonical(page.page, assessment, 'sitemap'); + } + })); + } }; export const canonical = (async () => { + process.argv.slice(3).forEach((arg) => { + if (arg.startsWith('--top-pages')) { + const [, value] = arg.split('='); + const number = parseInt(value, 10); + if (Number.isNaN(number) || number <= 0) { + console.log('Defaulting to top 200 pages'); + options.topPages = 200; + } else { + options.topPages = number; + } + } else if (arg.startsWith('--sitemap')) { + const [, value] = arg.split('='); + options.sitemapSrc = value; + } else { + console.error(`Error: Unknown option '${arg}'`); + process.exit(1); + } + }); const assessment = await createAssessment(userSiteUrl, 'Canonical'); assessment.setRowHeadersAndDefaults({ url: '', - canonicalExists: false, + source: '', + canonicalExists: '', response: '', - presentInSiteMap: false, - www: undefined, - hasTrailingSlash: undefined, - hasHtmlExtension: undefined, - hasTrackingParams: undefined, + presentInSiteMap: '', + www: '', + hasTrailingSlash: '', + hasHtmlExtension: '', + hasTrackingParams: '', error: '', warning: '', }); - await canonicalAudit(userSiteUrl, assessment); + await canonicalAudit(userSiteUrl, assessment, options); assessment.end(); + process.exit(0); })(); diff --git a/assessment/file-lib.js b/assessment/file-lib.js new file mode 100644 index 0000000..f380cfb --- /dev/null +++ b/assessment/file-lib.js @@ -0,0 +1,19 @@ +/* + * Copyright 2024 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ + +import path from 'path'; +import { fileURLToPath } from 'url'; + +export const OUTPUT_DIR = path.join(path.dirname(fileURLToPath(import.meta.url)), 'output'); +export const sanitizeFilename = (url) => url.replace(/[^a-zA-Z0-9]/g, '_').toLowerCase(); + +export const generateFileName = (siteUrl, title) => `${sanitizeFilename(title)}-${sanitizeFilename(siteUrl)}`; diff --git a/assessment/sitemap.js b/assessment/sitemap.js index 3cb9f4b..52d451d 100644 --- a/assessment/sitemap.js +++ b/assessment/sitemap.js @@ -110,7 +110,12 @@ async function fetchSitemapsFromRobots(siteUrl) { return fetchSitemapsFromSource(sitemapSources); } -export async function fetchSitemapsFromBaseUrl(url) { +export async function fetchSitemapsFromBaseUrl(url, sitemapSrc) { + if (sitemapSrc) { + return fetchSitemapsFromSource([ + { url: new URL(sitemapSrc, url).toString(), source: 'user provided' }, + ]); + } let sitemaps = await fetchSitemapsFromRobots(userSiteUrl); if (!sitemaps.length) { sitemaps = await fetchSitemapsFromSource([ @@ -123,7 +128,7 @@ export async function fetchSitemapsFromBaseUrl(url) { } } return sitemaps; -}; +} export const sitemap = (async () => { const assessment = await createAssessment(userSiteUrl, 'Sitemap'); @@ -138,10 +143,10 @@ export const sitemap = (async () => { const sitemaps = await fetchSitemapsFromBaseUrl(userSiteUrl); // Assessment for sitemaps - sitemaps.forEach(async (sitemap) => { - if (sitemap.url) { + sitemaps.forEach(async (sm) => { + if (sm.url) { assessment.addColumn({ - sitemapOrPage: sitemap.url, source: sitemap.source, locs: 
sitemap.locs, error: sitemap.error || '', warning: sitemap.warning || '', + sitemapOrPage: sm.url, source: sm.source, locs: sm.locs, error: sm.error || '', warning: sm.warning || '', }); } });