Skip to content

Commit

Permalink
Merge pull request #19 from bernardro/fixes
Browse files Browse the repository at this point in the history
properly categorize each url
  • Loading branch information
metalwarrior665 committed Sep 14, 2020
2 parents 90d247d + 22fcf96 commit 2c6dd79
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 7 deletions.
15 changes: 11 additions & 4 deletions src/crawler_utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ const CONSTS = require('./consts');
*/
exports.handleMaster = async (page, requestQueue, input, request) => {
const { searchBox, toggleFilterMenu, filterBtnsXp } = CONSTS.SELECTORS.SEARCH;
const { search } = request.userData;
const { search, label } = request.userData;

if (search) {
if (search && label === 'MASTER') {
// we are searching
log.debug('waiting for input box...');
const searchBxElem = await page.waitForSelector(searchBox, { visible: true });
Expand Down Expand Up @@ -86,7 +86,7 @@ exports.handleMaster = async (page, requestQueue, input, request) => {

const maxRequested = (input.maxResults && input.maxResults > 0) ? +input.maxResults : 99999;

await utils.loadVideosUrls(requestQueue, page, maxRequested, !!search);
await utils.loadVideosUrls(requestQueue, page, maxRequested, ['MASTER', 'SEARCH'].includes(label));

log.info('infinite scroll done...');
};
Expand Down Expand Up @@ -158,7 +158,14 @@ exports.handleDetail = async (page, request) => {
};

exports.hndlPptGoto = async ({ page, request }) => {
await puppeteer.blockRequests(page);
await puppeteer.blockRequests(page, {
extraUrlPatterns: [
'google-analytics',
'doubleclick.net',
'googletagmanager',
'/log_event',
],
});
return page.goto(request.url, { waitUntil: 'domcontentloaded' });
};

Expand Down
12 changes: 11 additions & 1 deletion src/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ Apify.main(async () => {
}

switch (request.userData.label) {
case 'CHANNEL':
case 'SEARCH':
case 'MASTER': {
await crawler.handleMaster(page, requestQueue, input, request);
break;
Expand All @@ -65,10 +67,18 @@ Apify.main(async () => {
// eslint-disable-next-line no-cond-assign
while (req = await parseUrls.fetchNextRequest()) {
// need to parse for requestsFromUrl first then categorize by path
const label = utils.categorizeUrl(req.url);
const pUrl = new URL(req.url);

if (label === 'CHANNEL' && !pUrl.pathname.includes('/videos')) {
pUrl.pathname = `${pUrl.pathname.split('/').filter((s) => s).join('/')}/videos`;
req.url = pUrl.toString();
}

await requestQueue.addRequest({
url: req.url,
userData: {
label: req.url.includes('/watch') ? 'DETAIL' : 'MASTER',
label,
},
});
}
Expand Down
28 changes: 28 additions & 0 deletions src/utility.js
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,34 @@ exports.getDataFromSelector = async (page, slctr, attrib) => {
return page.evaluate((el, key) => el[key], slctrElem, attrib);
};

/**
* @param {string} url
*/
exports.categorizeUrl = (url) => {
try {
const pUrl = new URL(url, 'https://www.youtube.com');

if (!pUrl.hostname.includes('youtube.com')) {
throw new Error('Invalid youtube url');
}

let label = 'MASTER';

if (pUrl.searchParams.get('v')) {
label = 'DETAIL';
} else if (pUrl.searchParams.get('search_query')) {
label = 'SEARCH';
} else if (pUrl.pathname.includes('/channel/') || pUrl.pathname.includes('/user/') || pUrl.pathname.includes('/c/')) {
label = 'CHANNEL';
}

return label;
} catch (e) {
log.exception(e, 'categorizeUrl', { url });
return null;
}
};

exports.unformatNumbers = (numStr) => {
const numberMatch = numStr.replace(/[^0-9,.]/ig, '');
if (numberMatch) {
Expand Down
14 changes: 12 additions & 2 deletions test/utility_spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ const moment = require('moment');

const utils = require('../src/utility');


describe('getRandBetween', () => {
const numTestCycles = 1000;
const includeList = [3, 4, 5, 6, 7];
Expand Down Expand Up @@ -54,6 +53,18 @@ describe('getRandClickPos', () => {
});
});

// Verifies that categorizeUrl maps each kind of start URL to its crawler label.
describe('categorizeUrl', () => {
    it('should categorize different start urls', () => {
        // [start url, expected label] pairs, checked in order.
        const expectedLabels = [
            ['', 'MASTER'],
            ['/watch?v=394u19u', 'DETAIL'],
            ['https://youtube.com/watch?v=394u19u', 'DETAIL'],
            ['/channel/asdrtsert/videos', 'CHANNEL'],
            ['https://www.youtube.com/user/asdrtsert/videos', 'CHANNEL'],
            ['https://www.youtube.com/c/asdrtsert', 'CHANNEL'],
            ['https://www.youtube.com/results?search_query=hello', 'SEARCH'],
        ];

        for (const [startUrl, expectedLabel] of expectedLabels) {
            expect(utils.categorizeUrl(startUrl)).to.equal(expectedLabel);
        }
    });
});

describe('getCutoffDate', () => {
it('should return the correct duration for given date string', () => {
const timeNow = moment();
Expand Down Expand Up @@ -106,7 +117,6 @@ describe('isDateInputValid', () => {
assert(isValid('60') === false, '60 is invalid');
assert(isValid('36 # ago') === false, '36 # ago is invalid');
assert(isValid('120 minutes ago ##') === false, '120 minutes ago ## is invalid');

});
});

Expand Down

0 comments on commit 2c6dd79

Please sign in to comment.