Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/discover/matcher/github.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@ export default class GithubMatcher {
/**
* Find the inventory entries that have the given github URL.
*
* @param {import('../support/AdminContext').AdminContext} context context
* @param {URL} url github URL whose path starts with `/<owner>/<repo>`
* @param {Inventory} inventory inventory of entries
*/
// eslint-disable-next-line class-methods-use-this
filter(url, inventory) {
filter(context, url, inventory) {
const segs = url.pathname.split('/');
const [, owner, repo] = segs;
const codeBusId = `${owner}/${repo}`;
Expand Down
122 changes: 67 additions & 55 deletions src/discover/matcher/google.js
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,16 @@ function buildRoots(inventory) {
* Matcher that filters inventory entries against known google drives.
*/
export default class GoogleMatcher {
constructor(context) {
this.context = context;
constructor(env) {
this.customUserProjects = (env.HLX_CUSTOM_GOOGLE_USERS ?? '').split(',')
.map((project) => {
const [org, site] = project.trim().split('/');
return {
org,
site,
match: (entry) => org === entry.org && (site === '*' || site === entry.site),
};
});
}

/**
Expand All @@ -50,36 +58,30 @@ export default class GoogleMatcher {
* @returns {CustomUser[]}
*/
#getCustomUsers(entries) {
const { env } = this.context;

return (env.HLX_CUSTOM_GOOGLE_USERS ?? '').split(',')
.map((project) => {
const [org, site] = project.trim().split('/');
return { org, site };
})
.reduce((users, { org, site }) => {
// for orgs (i.e. site = '*'), return just the first custom user
// adorned project in that org. this avoids doing a lookup with
// the same registered user multiple times
const entry = entries.find((e) => !!e.customUser
&& e.org === org && (site === '*' || e.site === site));
if (entry) {
const { contentBusId } = entry;
users.push({ project: `${org}/${entry.site}`, contentBusId });
}
return users;
}, []);
return this.customUserProjects.reduce((users, { org, site }) => {
// for orgs (i.e. site = '*'), return just the first custom user
// adorned project in that org. this avoids doing a lookup with
// the same registered user multiple times
const entry = entries.find((e) => !!e.customUser
&& e.org === org && (site === '*' || e.site === site));
if (entry) {
const { contentBusId } = entry;
users.push({ project: `${org}/${entry.site}`, contentBusId });
}
return users;
}, []);
}

/**
* Find the inventory entries that have the given google document, spreadsheet
* or folder in their tree.
*
* @param {import('../support/AdminContext').AdminContext} context context
* @param {URL} url google document or spreadsheet
* @param {Inventory} inventory inventory of entries
*/
async filter(url, inventory) {
const { log } = this.context;
async filter(context, url, inventory) {
const { log } = context;

const segs = url.pathname.split('/');
let id = segs.pop();
Expand Down Expand Up @@ -117,7 +119,7 @@ export default class GoogleMatcher {

// resolve using the default user
const roots = buildRoots(entries);
let client = await this.context.getGoogleClient();
let client = await context.getGoogleClient();
let hierarchy = await client.getItemsFromId(id, roots);
if (hierarchy.length) {
const { id: rootId } = hierarchy[hierarchy.length - 1];
Expand All @@ -139,7 +141,7 @@ export default class GoogleMatcher {
if (!ret) {
try {
// eslint-disable-next-line no-await-in-loop
client = await this.context.getGoogleClient(contentBusId);
client = await context.getGoogleClient(contentBusId);
// eslint-disable-next-line no-await-in-loop
hierarchy = await client.getItemsFromId(id, roots);
if (hierarchy.length) {
Expand All @@ -166,44 +168,54 @@ export default class GoogleMatcher {
}
}

/**
* Test whether this class can handle an URL
*
* @param {URL} url url to match
* @param {Inventory} inventory
* @returns true if this class can handle the URL
*/
static match(url, inventory) {
return inventory.getHostType(url.hostname) === 'google' || url.hostname.match(/^.*\.google\.com$/);
}

/**
* Extract some data from a URL to store in the inventory.
*
* @param {import('../../index.js').AdminContext} context context
* @param {import('../support/AdminContext').AdminContext} context context
* @param {URL} url url to extract data from
* @param {import('../inventory.js').InventoryEntry} entry entry
* @returns object that contains additional entries to store in inventory
*/
static async extract(context, url, entry) {
async extract(context, url, entry) {
const match = url.pathname.match(/\/.*\/folders\/([^?/]+)$/);
if (match) {
if (!match) {
return;
}

// eslint-disable-next-line no-param-reassign
[, entry.gdriveId] = match;
if (!entry.contentBusId) {
return;
}

// do not search for custom users in org/sites that
// are not listed in env.HLX_CUSTOM_GOOGLE_USERS
if (!this.customUserProjects.some((project) => project.match(entry))) {
return;
}
Comment on lines +191 to +195
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Noticed during a full reindex that we were running an enormous number of custom-user checks for every Google-backed project, even though we won't include them when searching for Google documents.


const { code: codeBucket, content: contentBucket } = context.attributes.bucketMap;
const plugin = await getCachePlugin({
log: context.log,
contentBusId: entry.contentBusId,
readOnly: true,
codeBucket,
contentBucket,
}, 'google');
if (!plugin.key.startsWith('default/.helix-auth/')) {
// eslint-disable-next-line no-param-reassign
[, entry.gdriveId] = match;
// check for custom user
if (entry.contentBusId) {
const { code: codeBucket, content: contentBucket } = context.attributes.bucketMap;
const plugin = await getCachePlugin({
log: context.log,
contentBusId: entry.contentBusId,
readOnly: true,
codeBucket,
contentBucket,
}, 'google');
if (!plugin.key.startsWith('default/.helix-auth/')) {
// eslint-disable-next-line no-param-reassign
entry.customUser = true;
}
}
entry.customUser = true;
}
}

/**
* Test whether this class can handle an URL
*
* @param {URL} url url to match
* @param {Inventory} inventory
* @returns true if this class can handle the URL
*/
static match(url, inventory) {
return inventory.getHostType(url.hostname) === 'google' || url.hostname.match(/^.*\.google\.com$/);
}
}
49 changes: 24 additions & 25 deletions src/discover/matcher/sharepoint.js
Original file line number Diff line number Diff line change
Expand Up @@ -67,21 +67,18 @@ function stripAccessSpecifiers(pathname) {
* Matcher that filters inventory entries against known sharepoint sites.
*/
export default class SharepointMatcher {
constructor(context) {
this.context = context;
}

/**
* Returns a matcher for document URLs given as `/_layouts/15/Doc.aspx`
*
* @param {import('../support/AdminContext').AdminContext} context context
* @param {String[]} segs segments to use for site lookup
* @param {URL} url original URL
* @param {import('../inventory.js').InventoryEntry} candidate candidate entry
* that can be used to determine content bus ID and owner
* @returns matcher
*/
async documentMatcher(segs, url, candidate) {
const { attributes, env, log } = this.context;
async documentMatcher(context, segs, url, candidate) {
const { attributes, env, log } = context;

try {
const client = await getOneDriveClient({
Expand Down Expand Up @@ -114,12 +111,13 @@ export default class SharepointMatcher {
/**
* Returns a matcher for the given URL.
*
* @param {import('../support/AdminContext').AdminContext} context context
* @param {URL} url url to resolve
* @param {import('../inventory.js').Inventory} inventory inventory
* @returns resolved URL
*/
async getMatcher(url, inventory) {
const { log } = this.context;
async getMatcher(context, url, inventory) {
const { log } = context;
let { pathname } = url;
pathname = stripAccessSpecifiers(pathname);

Expand All @@ -146,7 +144,7 @@ export default class SharepointMatcher {
return () => false;
}
}
return this.documentMatcher(segs, url, candidate);
return this.documentMatcher(context, segs, url, candidate);
}

if (ALLITEMS_REGEX.test(pathname)) {
Expand Down Expand Up @@ -187,46 +185,47 @@ export default class SharepointMatcher {
* Find the inventory entries that have the given sharepoint document, spreadsheet
* or folder in their tree.
*
* @param {import('../support/AdminContext').AdminContext} context context
* @param {URL} url sharepoint document, spreadsheet or folder
* @param {Inventory} inventory inventory of entries
*/
async filter(url, inventory) {
async filter(context, url, inventory) {
const suffix = DEFENDER_DNS_SUFFIXES.find((s) => url.hostname.endsWith(s));
if (suffix) {
// eslint-disable-next-line no-param-reassign
url.hostname = url.hostname.substring(0, url.hostname.length - suffix.length);
}
const matcher = await this.getMatcher(url, inventory);
const matcher = await this.getMatcher(context, url, inventory);
return inventory.entries()
.filter(({ sharepointSite }) => sharepointSite && matcher(sharepointSite))
.sort(({ sharepointSite: site1, sharepointSite: site2 }) => site1.length - site2.length);
}

/**
* Test whether this class can handle an URL
*
* @param {URL} url url to match
* @param {Inventory} inventory
* @returns true if this class can handle the URL
*/
static match(url, inventory) {
return inventory.getHostType(url.hostname) === 'sharepoint'
|| DEFENDER_DNS_SUFFIXES.some((suffix) => url.hostname.endsWith(suffix));
}

/**
* Extract some data from a URL to store in the inventory.
*
* @param {URL} url url to extract data from
* @param entry entry to extract into
* @param {import('../inventory.js').InventoryEntry} entry entry to extract into
* @returns object that contains additional entries to store in inventory
*/
static async extract(context, url, entry) {
async extract(context, url, entry) {
let pathname = stripAccessSpecifiers(url.pathname);
if (ALLITEMS_REGEX.test(pathname)) {
pathname = url.searchParams.get('id');
}
// eslint-disable-next-line no-param-reassign
entry.sharepointSite = new URL(pathname, url).href;
}

/**
* Test whether this class can handle an URL
*
* @param {URL} url url to match
* @param {Inventory} inventory
* @returns true if this class can handle the URL
*/
static match(url, inventory) {
return inventory.getHostType(url.hostname) === 'sharepoint'
|| DEFENDER_DNS_SUFFIXES.some((suffix) => url.hostname.endsWith(suffix));
}
}
4 changes: 2 additions & 2 deletions src/discover/query.js
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,8 @@ export default async function query(context) {
if (!Matcher) {
return errorResponse(log, 404, `no matcher found for ${url}`);
}
const matcher = new Matcher(context);
entries = await matcher.filter(url, inventory);
const matcher = new Matcher(context.env);
entries = await matcher.filter(context, url, inventory);
}

const { originalSites, gdriveIds } = lookupHiddenForks(entries);
Expand Down
Loading