Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

properly categorize each url #19

Merged
merged 1 commit into from
Sep 14, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions src/crawler_utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ const CONSTS = require('./consts');
*/
exports.handleMaster = async (page, requestQueue, input, request) => {
const { searchBox, toggleFilterMenu, filterBtnsXp } = CONSTS.SELECTORS.SEARCH;
const { search } = request.userData;
const { search, label } = request.userData;

if (search) {
if (search && label === 'MASTER') {
// we are searching
log.debug('waiting for input box...');
const searchBxElem = await page.waitForSelector(searchBox, { visible: true });
Expand Down Expand Up @@ -86,7 +86,7 @@ exports.handleMaster = async (page, requestQueue, input, request) => {

const maxRequested = (input.maxResults && input.maxResults > 0) ? +input.maxResults : 99999;

await utils.loadVideosUrls(requestQueue, page, maxRequested, !!search);
await utils.loadVideosUrls(requestQueue, page, maxRequested, ['MASTER', 'SEARCH'].includes(label));

log.info('infinite scroll done...');
};
Expand Down Expand Up @@ -158,7 +158,14 @@ exports.handleDetail = async (page, request) => {
};

exports.hndlPptGoto = async ({ page, request }) => {
await puppeteer.blockRequests(page);
await puppeteer.blockRequests(page, {
extraUrlPatterns: [
'google-analytics',
'doubleclick.net',
'googletagmanager',
'/log_event',
],
});
return page.goto(request.url, { waitUntil: 'domcontentloaded' });
};

Expand Down
12 changes: 11 additions & 1 deletion src/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ Apify.main(async () => {
}

switch (request.userData.label) {
case 'CHANNEL':
case 'SEARCH':
case 'MASTER': {
await crawler.handleMaster(page, requestQueue, input, request);
break;
Expand All @@ -65,10 +67,18 @@ Apify.main(async () => {
// eslint-disable-next-line no-cond-assign
while (req = await parseUrls.fetchNextRequest()) {
// need to parse for requestsFromUrl first then categorize by path
const label = utils.categorizeUrl(req.url);
const pUrl = new URL(req.url);

if (label === 'CHANNEL' && !pUrl.pathname.includes('/videos')) {
pUrl.pathname = `${pUrl.pathname.split('/').filter((s) => s).join('/')}/videos`;
req.url = pUrl.toString();
}

await requestQueue.addRequest({
url: req.url,
userData: {
label: req.url.includes('/watch') ? 'DETAIL' : 'MASTER',
label,
},
});
}
Expand Down
28 changes: 28 additions & 0 deletions src/utility.js
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,34 @@ exports.getDataFromSelector = async (page, slctr, attrib) => {
return page.evaluate((el, key) => el[key], slctrElem, attrib);
};

/**
* @param {string} url
*/
exports.categorizeUrl = (url) => {
try {
const pUrl = new URL(url, 'https://www.youtube.com');

if (!pUrl.hostname.includes('youtube.com')) {
throw new Error('Invalid youtube url');
}

let label = 'MASTER';

if (pUrl.searchParams.get('v')) {
label = 'DETAIL';
} else if (pUrl.searchParams.get('search_query')) {
label = 'SEARCH';
} else if (pUrl.pathname.includes('/channel/') || pUrl.pathname.includes('/user/') || pUrl.pathname.includes('/c/')) {
label = 'CHANNEL';
}

return label;
} catch (e) {
log.exception(e, 'categorizeUrl', { url });
return null;
}
};

exports.unformatNumbers = (numStr) => {
const numberMatch = numStr.replace(/[^0-9,.]/ig, '');
if (numberMatch) {
Expand Down
14 changes: 12 additions & 2 deletions test/utility_spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ const moment = require('moment');

const utils = require('../src/utility');


describe('getRandBetween', () => {
const numTestCycles = 1000;
const includeList = [3, 4, 5, 6, 7];
Expand Down Expand Up @@ -54,6 +53,18 @@ describe('getRandClickPos', () => {
});
});

describe('categorizeUrl', () => {
    it('should categorize different start urls', () => {
        // Each entry pairs a start url with the label it must be given.
        const expectations = [
            ['', 'MASTER'],
            ['/watch?v=394u19u', 'DETAIL'],
            ['https://youtube.com/watch?v=394u19u', 'DETAIL'],
            ['/channel/asdrtsert/videos', 'CHANNEL'],
            ['https://www.youtube.com/user/asdrtsert/videos', 'CHANNEL'],
            ['https://www.youtube.com/c/asdrtsert', 'CHANNEL'],
            ['https://www.youtube.com/results?search_query=hello', 'SEARCH'],
        ];

        for (const [startUrl, expectedLabel] of expectations) {
            expect(utils.categorizeUrl(startUrl)).to.equal(expectedLabel);
        }
    });
});

describe('getCutoffDate', () => {
it('should return the correct duration for given date string', () => {
const timeNow = moment();
Expand Down Expand Up @@ -106,7 +117,6 @@ describe('isDateInputValid', () => {
assert(isValid('60') === false, '60 is invalid');
assert(isValid('36 # ago') === false, '36 # ago is invalid');
assert(isValid('120 minutes ago ##') === false, '120 minutes ago ## is invalid');

});
});

Expand Down