diff --git a/src/discover/matcher/github.js b/src/discover/matcher/github.js index 2dc19cf..6e96cb8 100644 --- a/src/discover/matcher/github.js +++ b/src/discover/matcher/github.js @@ -17,11 +17,12 @@ export default class GithubMatcher { /** * Find the inventory entries that have the given github URL. * + * @param {import('../support/AdminContext').AdminContext} context context * @param {URL} url google document or spreadsheet * @param {Inventory} inventory inventory of entries */ // eslint-disable-next-line class-methods-use-this - filter(url, inventory) { + filter(context, url, inventory) { const segs = url.pathname.split('/'); const [, owner, repo] = segs; const codeBusId = `${owner}/${repo}`; diff --git a/src/discover/matcher/google.js b/src/discover/matcher/google.js index 6dc11ea..4d7378f 100644 --- a/src/discover/matcher/google.js +++ b/src/discover/matcher/google.js @@ -39,8 +39,16 @@ function buildRoots(inventory) { * Matcher that filters inventory entries against known google drives. */ export default class GoogleMatcher { - constructor(context) { - this.context = context; + constructor(env) { + this.customUserProjects = (env.HLX_CUSTOM_GOOGLE_USERS ?? '').split(',') + .map((project) => { + const [org, site] = project.trim().split('/'); + return { + org, + site, + match: (entry) => org === entry.org && (site === '*' || site === entry.site), + }; + }); } /** @@ -50,36 +58,30 @@ export default class GoogleMatcher { * @returns {CustomUser[]} */ #getCustomUsers(entries) { - const { env } = this.context; - - return (env.HLX_CUSTOM_GOOGLE_USERS ?? '').split(',') - .map((project) => { - const [org, site] = project.trim().split('/'); - return { org, site }; - }) - .reduce((users, { org, site }) => { - // for orgs (i.e. site = '*'), return just the first custom user - // adorned project in that org. this avoids doing a lookup with - // the same registered user multiple times - const entry = entries.find((e) => !!e.customUser - && e.org === org && (site === '*' || e.site === site)); - if (entry) { - const { contentBusId } = entry; - users.push({ project: `${org}/${entry.site}`, contentBusId }); - } - return users; - }, []); + return this.customUserProjects.reduce((users, { org, site }) => { + // for orgs (i.e. site = '*'), return just the first custom user + // adorned project in that org. this avoids doing a lookup with + // the same registered user multiple times + const entry = entries.find((e) => !!e.customUser + && e.org === org && (site === '*' || e.site === site)); + if (entry) { + const { contentBusId } = entry; + users.push({ project: `${org}/${entry.site}`, contentBusId }); + } + return users; + }, []); } /** * Find the inventory entries that have the given google document, spreadsheet * or folder in their tree. * + * @param {import('../support/AdminContext').AdminContext} context context * @param {URL} url google document or spreadsheet * @param {Inventory} inventory inventory of entries */ - async filter(url, inventory) { - const { log } = this.context; + async filter(context, url, inventory) { + const { log } = context; const segs = url.pathname.split('/'); let id = segs.pop(); @@ -117,7 +119,7 @@ export default class GoogleMatcher { // resolve using the default user const roots = buildRoots(entries); - let client = await this.context.getGoogleClient(); + let client = await context.getGoogleClient(); let hierarchy = await client.getItemsFromId(id, roots); if (hierarchy.length) { const { id: rootId } = hierarchy[hierarchy.length - 1]; @@ -139,7 +141,7 @@ export default class GoogleMatcher { if (!ret) { try { // eslint-disable-next-line no-await-in-loop - client = await this.context.getGoogleClient(contentBusId); + client = await context.getGoogleClient(contentBusId); // eslint-disable-next-line no-await-in-loop hierarchy = await client.getItemsFromId(id, roots); if (hierarchy.length) { @@ -166,44 +168,54 @@ export default class GoogleMatcher { } } - /** - * Test whether this class can handle an URL - * - * @param {URL} url url to match - * @param {Inventory} inventory - * @returns true if this class can handle the URL - */ - static match(url, inventory) { - return inventory.getHostType(url.hostname) === 'google' || url.hostname.match(/^.*\.google\.com$/); - } - /** * Extract some data from a URL to store in the inventory. * - * @param {import('../../index.js').AdminContext} context context + * @param {import('../support/AdminContext').AdminContext} context context * @param {URL} url url to extract data from + * @param {import('../inventory.js').InventoryEntry} entry entry * @returns object that contains additional entries to store in inventory */ - static async extract(context, url, entry) { + async extract(context, url, entry) { const match = url.pathname.match(/\/.*\/folders\/([^?/]+)$/); - if (match) { + if (!match) { + return; + } + + // eslint-disable-next-line no-param-reassign + [, entry.gdriveId] = match; + if (!entry.contentBusId) { + return; + } + + // do not search for custom users in org/sites that + // are not listed in env.HLX_CUSTOM_GOOGLE_USERS + if (!this.customUserProjects.some((project) => project.match(entry))) { + return; + } + + const { code: codeBucket, content: contentBucket } = context.attributes.bucketMap; + const plugin = await getCachePlugin({ + log: context.log, + contentBusId: entry.contentBusId, + readOnly: true, + codeBucket, + contentBucket, + }, 'google'); + if (!plugin.key.startsWith('default/.helix-auth/')) { // eslint-disable-next-line no-param-reassign - [, entry.gdriveId] = match; - // check for custom user - if (entry.contentBusId) { - const { code: codeBucket, content: contentBucket } = context.attributes.bucketMap; - const plugin = await getCachePlugin({ - log: context.log, - contentBusId: entry.contentBusId, - readOnly: true, - codeBucket, - contentBucket, - }, 'google'); - if (!plugin.key.startsWith('default/.helix-auth/')) { - // eslint-disable-next-line no-param-reassign - entry.customUser = true; - } - } + entry.customUser = true; } } + + /** + * Test whether this class can handle an URL + * + * @param {URL} url url to match + * @param {Inventory} inventory + * @returns true if this class can handle the URL + */ + static match(url, inventory) { + return inventory.getHostType(url.hostname) === 'google' || url.hostname.match(/^.*\.google\.com$/); + } } diff --git a/src/discover/matcher/sharepoint.js b/src/discover/matcher/sharepoint.js index 4919510..31e45e7 100644 --- a/src/discover/matcher/sharepoint.js +++ b/src/discover/matcher/sharepoint.js @@ -67,21 +67,18 @@ function stripAccessSpecifiers(pathname) { * Matcher that filters inventory entries against known sharepoint sites. */ export default class SharepointMatcher { - constructor(context) { - this.context = context; - } - /** * Returns a matcher for document URLs given as `/_layouts/15/Doc.aspx` * + * @param {import('../support/AdminContext').AdminContext} context context * @param {String[]} segs segments to use for site lookup * @param {URL} url original URL * @param {import('../inventory.js').InventoryEntry} candidate candidate entry * that can be used to determine content bus ID and owner * @returns matcher */ - async documentMatcher(segs, url, candidate) { - const { attributes, env, log } = this.context; + async documentMatcher(context, segs, url, candidate) { + const { attributes, env, log } = context; try { const client = await getOneDriveClient({ @@ -114,12 +111,13 @@ export default class SharepointMatcher { /** * Returns a matcher for the given URL. * + * @param {import('../support/AdminContext').AdminContext} context context * @param {URL} url url to resolve * @param {import('../inventory.js').Inventory} inventory inventory * @returns resolved URL */ - async getMatcher(url, inventory) { - const { log } = this.context; + async getMatcher(context, url, inventory) { + const { log } = context; let { pathname } = url; pathname = stripAccessSpecifiers(pathname); @@ -146,7 +144,7 @@ export default class SharepointMatcher { return () => false; } } - return this.documentMatcher(segs, url, candidate); + return this.documentMatcher(context, segs, url, candidate); } if (ALLITEMS_REGEX.test(pathname)) { @@ -187,41 +185,30 @@ export default class SharepointMatcher { * Find the inventory entries that have the given sharepoint document, spreadsheet * or folder in their tree. * + * @param {import('../support/AdminContext').AdminContext} context context * @param {URL} url google document or spreadsheet * @param {Inventory} inventory inventory of entries */ - async filter(url, inventory) { + async filter(context, url, inventory) { const suffix = DEFENDER_DNS_SUFFIXES.find((s) => url.hostname.endsWith(s)); if (suffix) { // eslint-disable-next-line no-param-reassign url.hostname = url.hostname.substring(0, url.hostname.length - suffix.length); } - const matcher = await this.getMatcher(url, inventory); + const matcher = await this.getMatcher(context, url, inventory); return inventory.entries() .filter(({ sharepointSite }) => sharepointSite && matcher(sharepointSite)) .sort(({ sharepointSite: site1, sharepointSite: site2 }) => site1.length - site2.length); } - /** - * Test whether this class can handle an URL - * - * @param {URL} url url to match - * @param {Inventory} inventory - * @returns true if this class can handle the URL - */ - static match(url, inventory) { - return inventory.getHostType(url.hostname) === 'sharepoint' - || DEFENDER_DNS_SUFFIXES.some((suffix) => url.hostname.endsWith(suffix)); - } - /** * Extract some data from a URL to store in the inventory. * * @param {URL} url url to extract data from - * @param entry entry to extract into + * @param {import('../inventory.js').InventoryEntry} entry entry to extract into * @returns object that contains additional entries to store in inventory */ - static async extract(context, url, entry) { + async extract(context, url, entry) { let pathname = stripAccessSpecifiers(url.pathname); if (ALLITEMS_REGEX.test(pathname)) { pathname = url.searchParams.get('id'); @@ -229,4 +216,16 @@ export default class SharepointMatcher { // eslint-disable-next-line no-param-reassign entry.sharepointSite = new URL(pathname, url).href; } + + /** + * Test whether this class can handle an URL + * + * @param {URL} url url to match + * @param {Inventory} inventory + * @returns true if this class can handle the URL + */ + static match(url, inventory) { + return inventory.getHostType(url.hostname) === 'sharepoint' + || DEFENDER_DNS_SUFFIXES.some((suffix) => url.hostname.endsWith(suffix)); + } } diff --git a/src/discover/query.js b/src/discover/query.js index 1f924cb..30a9573 100644 --- a/src/discover/query.js +++ b/src/discover/query.js @@ -139,8 +139,8 @@ export default async function query(context) { if (!Matcher) { return errorResponse(log, 404, `no matcher found for ${url}`); } - const matcher = new Matcher(context); - entries = await matcher.filter(url, inventory); + const matcher = new Matcher(context.env); + entries = await matcher.filter(context, url, inventory); } const { originalSites, gdriveIds } = lookupHiddenForks(entries); diff --git a/src/discover/reindex.js b/src/discover/reindex.js index a3bae59..495afbd 100644 --- a/src/discover/reindex.js +++ b/src/discover/reindex.js @@ -33,9 +33,10 @@ const MATCHERS = { * @param {import('@adobe/helix-shared-storage').Bucket} contentBus content bus ticket * @param {string} org org * @param {string} site site + * @param {object} matchers matchers * @returns {Promise} */ -async function createEntry(context, contentBus, org, site) { +async function createEntry(context, contentBus, org, site, matchers) { const config = await loadSiteConfig(context, org, site); if (!config) { return null; @@ -62,7 +63,7 @@ async function createEntry(context, contentBus, org, site) { entry.originalRepository = hlx['original-repository']; } - const matcher = MATCHERS[content.source.type]; + const matcher = matchers[content.source.type]; if (matcher) { await matcher.extract(context, new URL(entry.contentSourceUrl), entry); } @@ -70,26 +71,26 @@ async function createEntry(context, contentBus, org, site) { } /** - * Create the complete repository inventory for all owners found in the code bus. + * Create the complete repository inventory for all sites found in the config bus. * * @param {import('../support/AdminContext').AdminContext} context context * @param {import('@adobe/helix-shared-storage').Bucket} contentBus content bus bucket + * @param {object} matchers matchers * @returns {Promise} */ -async function createInventory(context, contentBus) { +async function createInventory(context, contentBus, matchers) { const { log } = context; const inventory = new Inventory(contentBus, log); const configBus = HelixStorage.fromContext(context).configBus(); - const orgs = await configBus.listFolders('orgs/'); - log.info(`Found ${orgs.length} orgs.`); + const folders = await configBus.listFolders('orgs/'); + log.info(`found ${folders.length} folders in /orgs/`); const sites = []; - - await processQueue(orgs, async (folder) => { + await processQueue(folders, async (folder) => { const org = folder.split('/')[1]; - const siteObjects = await configBus.list(`${folder}sites`, true); + const siteObjects = await configBus.list(`${folder}sites/`, true); for (const { path } of siteObjects) { if (path.endsWith('.json')) { const site = basename(path, '.json'); @@ -100,10 +101,10 @@ async function createInventory(context, contentBus) { } } }, 64); - log.info(`Found ${sites.length} sites`); + log.info(`found ${sites.length} sites`); await processQueue(sites, async ({ org, site }) => { - const entry = await createEntry(context, contentBus, org, site); + const entry = await createEntry(context, contentBus, org, site, matchers); if (entry) { inventory.appendEntry(entry); } @@ -120,7 +121,12 @@ async function createInventory(context, contentBus) { export async function reindexAll(context) { const contentBus = HelixStorage.fromContext(context).contentBus(); - const inventory = await createInventory(context, contentBus); + const matchers = Object.fromEntries( + Object.entries(MATCHERS) + .map(([name, Matcher]) => [name, new Matcher(context.env)]), + ); + + const inventory = await createInventory(context, contentBus, matchers); await inventory.save(); return new Response(); @@ -138,8 +144,12 @@ export async function reindexProject(context, org, site) { const { log } = context; const contentBus = HelixStorage.fromContext(context).contentBus(); + const matchers = Object.fromEntries( + Object.entries(MATCHERS) + .map(([name, Matcher]) => [name, new Matcher(context.env)]), + ); - const entry = await createEntry(context, contentBus, org, site); + const entry = await createEntry(context, contentBus, org, site, matchers); if (!entry) { return new Response('', { status: 404, diff --git a/src/discover/remove.js b/src/discover/remove.js index 029673b..e33c759 100644 --- a/src/discover/remove.js +++ b/src/discover/remove.js @@ -21,9 +21,9 @@ import { Inventory } from './inventory.js'; * @param {string} site repo * @returns {Promise} response */ -export async function removeProject(ctx, org, site) { - const { log } = ctx; - const contentBus = HelixStorage.fromContext(ctx).contentBus(); +export async function removeProject(context, org, site) { + const { log } = context; + const contentBus = HelixStorage.fromContext(context).contentBus(); const inventory = new Inventory(contentBus, log); await inventory.load(); diff --git a/src/live/status.js b/src/live/status.js index 7226b5a..5a3cab7 100644 --- a/src/live/status.js +++ b/src/live/status.js @@ -19,8 +19,8 @@ import getLiveInfo from './info.js'; * @param {import('../support/RequestInfo').RequestInfo} info request info * @returns {Promise} response */ -export default async function liveStatus(ctx, info) { - const live = await getLiveInfo(ctx, info); +export default async function liveStatus(context, info) { + const live = await getLiveInfo(context, info); // return error if not 404 if (live.status !== 200 && live.status !== 404) { diff --git a/test/discover/query.test.js b/test/discover/query.test.js index 2702fe5..cf50be9 100644 --- a/test/discover/query.test.js +++ b/test/discover/query.test.js @@ -154,6 +154,7 @@ describe('Discover query tests', () => { .reply(200, { entries: [{ sharepointSite: 'https://other.sharepoint.com/', + codeBusId: 'owner/repo', }], hostTypes: { 'other.sharepoint.com': 'sharepoint', @@ -170,7 +171,8 @@ describe('Discover query tests', () => { assert.strictEqual(response.status, 200); assert.deepStrictEqual(await response.json(), [{ - githubUrl: 'https://github.com/undefined', + codeBusId: 'owner/repo', + githubUrl: 'https://github.com/owner/repo', originalRepository: false, originalSite: false, }]); diff --git a/test/discover/reindex.test.js b/test/discover/reindex.test.js index 6541a1d..767df0d 100644 --- a/test/discover/reindex.test.js +++ b/test/discover/reindex.test.js @@ -51,6 +51,7 @@ describe('Discover reindex tests', () => { function setupTest(org, site, { authInfo = new AuthInfo().withRole('index'), + env, } = {}) { const suffix = '/discover'; const query = new URLSearchParams(Object.entries({ org, site }).filter(([, v]) => !!v)); @@ -72,6 +73,7 @@ describe('Discover reindex tests', () => { CLOUDFLARE_R2_SECRET_ACCESS_KEY: 'cloudflare-secret', AZURE_HELIX_SERVICE_CLIENT_ID: 'client-id', AZURE_HELIX_SERVICE_CLIENT_SECRET: 'client-secret', + ...env, }, }; return { request, context }; @@ -109,28 +111,38 @@ describe('Discover reindex tests', () => { .reply(() => [200, new xml2js.Builder().buildObject({ ListBucketResult: { CommonPrefixes: [{ - Prefix: 'orgs/org/', + Prefix: 'orgs/org1/', + }, { + Prefix: 'orgs/org2/', }], }, })]) - .get('/?delimiter=%2F&list-type=2&prefix=orgs/org/sites') + .get('/?delimiter=%2F&list-type=2&prefix=orgs/org1/sites/') .reply(() => [200, new xml2js.Builder().buildObject({ ListBucketResult: { KeyCount: 1, Contents: [ { - Key: 'orgs/org/sites/site1.json', + Key: 'orgs/org1/sites/site1.json', LastModified: '2023-10-06T08:05:00.000Z', }, + ], + }, + })]) + .get('/?delimiter=%2F&list-type=2&prefix=orgs/org2/sites/') + .reply(() => [200, new xml2js.Builder().buildObject({ + ListBucketResult: { + KeyCount: 1, + Contents: [ { - Key: 'orgs/org/sites/site2.json', + Key: 'orgs/org2/sites/site2.json', LastModified: '2023-10-06T08:05:00.000Z', }, ], }, })]); nock('https://config.aem.page') - .get('/main--site1--org/config.json?scope=admin') + .get('/main--site1--org1/config.json?scope=admin') .reply(200, { content: { contentBusId: 1234, @@ -144,7 +156,7 @@ describe('Discover reindex tests', () => { repo: 'repo', }, }) - .get('/main--site2--org/config.json?scope=admin') + .get('/main--site2--org2/config.json?scope=admin') .reply(200, { content: { contentBusId: 5678, @@ -161,43 +173,45 @@ describe('Discover reindex tests', () => { nock('https://helix-content-bus.s3.us-east-1.amazonaws.com') .get('/1234/.hlx.json?x-id=GetObject') .reply(200, { - 'original-site': 'org/site1', + 'original-site': 'org1/site1', 'original-repq': 'owner/repo', }) .head('/1234/.helix-auth/auth-google-content.json') .reply(404) .get('/5678/.hlx.json?x-id=GetObject') .reply(200, { - 'original-site': 'org/site2', + 'original-site': 'org2/site2', 'original-repq': 'owner/repo', - }) - .head('/5678/.helix-auth/auth-google-content.json') - .reply(404); + }); - const { request, context } = setupTest('*'); + const { request, context } = setupTest('*', undefined, { + env: { + HLX_CUSTOM_GOOGLE_USERS: 'org1/*', + }, + }); const response = await main(request, context); assert.strictEqual(response.status, 200); assert.deepStrictEqual(inventory, { entries: [ - { - codeBusId: 'owner/repo', - contentBusId: 1234, - contentSourceUrl: 'https://drive.google.com/drive/folders/1N2zij7EMeS95cIFiRuxfjY0OxllX8my1', - gdriveId: '1N2zij7EMeS95cIFiRuxfjY0OxllX8my1', - org: 'org', - originalSite: 'org/site1', - site: 'site1', - }, { codeBusId: 'owner/repo', contentBusId: 5678, contentSourceUrl: 'https://drive.google.com/drive/folders/1N2zij7EMeS95cIFiRuxfjY0OxllX8my2', gdriveId: '1N2zij7EMeS95cIFiRuxfjY0OxllX8my2', - org: 'org', - originalSite: 'org/site2', + org: 'org2', + originalSite: 'org2/site2', site: 'site2', }, + { + codeBusId: 'owner/repo', + contentBusId: 1234, + contentSourceUrl: 'https://drive.google.com/drive/folders/1N2zij7EMeS95cIFiRuxfjY0OxllX8my1', + gdriveId: '1N2zij7EMeS95cIFiRuxfjY0OxllX8my1', + org: 'org1', + originalSite: 'org1/site1', + site: 'site1', + }, ], hostTypes: { 'drive.google.com': 'google', @@ -232,7 +246,11 @@ describe('Discover reindex tests', () => { codeBusId: 'owner/repo', }]); - const { request, context } = setupTest('org', 'site'); + const { request, context } = setupTest('org', 'site', { + env: { + HLX_CUSTOM_GOOGLE_USERS: 'org/*', + }, + }); const response = await main(request, context); assert.strictEqual(response.status, 201); @@ -305,9 +323,7 @@ describe('Discover reindex tests', () => { .get(`/${SITE_CONFIG.content.contentBusId}/.hlx.json?x-id=GetObject`) .reply(200, { 'original-site': 'org/site', - }) - .head(`/${SITE_CONFIG.content.contentBusId}/.helix-auth/auth-google-content.json`) - .reply(404); + }); nock.inventory([{ codeBusId: 'owner/repo',