From 940fcc452ff4d06aad659eab6fadd6293368f6ef Mon Sep 17 00:00:00 2001 From: MQ Date: Mon, 5 May 2025 15:58:26 +0200 Subject: [PATCH 1/2] return also suggested results, add resultType to output dataset items --- .actor/actor.json | 12 +++++++++++- src/google-search/google-extractors-urls.ts | 16 +++++++++++----- src/types.ts | 3 +++ 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/.actor/actor.json b/.actor/actor.json index a80bfb3..1a4b180 100644 --- a/.actor/actor.json +++ b/.actor/actor.json @@ -16,10 +16,11 @@ "title": "Overview", "description": "An view showing just basic properties for simplicity.", "transformation": { - "flatten": ["metadata"], + "flatten": ["metadata", "searchResult"], "fields": [ "metadata.url", "metadata.title", + "searchResult.resultType", "markdown" ] }, @@ -34,6 +35,10 @@ "label": "Page title", "format": "text" }, + "searchResult.resultType": { + "label": "Result type", + "format": "text" + }, "text": { "label": "Extracted Markdown", "format": "text" @@ -49,6 +54,7 @@ "fields": [ "searchResult.title", "searchResult.description", + "searchResult.resultType", "searchResult.url" ] }, @@ -63,6 +69,10 @@ "label": "Title", "format": "text" }, + "searchResult.resultType": { + "label": "Result type", + "format": "text" + }, "searchResult.url": { "label": "URL", "format": "text" diff --git a/src/google-search/google-extractors-urls.ts b/src/google-search/google-extractors-urls.ts index 990f26a..8e1174b 100644 --- a/src/google-search/google-extractors-urls.ts +++ b/src/google-search/google-extractors-urls.ts @@ -64,7 +64,7 @@ const areTheResultsSuggestions = ($: CheerioAPI) => { /** * Extracts organic search results from the given Cheerio instance (source: @apify/google-search). */ -export const scrapeOrganicResults = ($: CheerioAPI) => { +export const scrapeOrganicResults = ($: CheerioAPI): OrganicResult[] => { const resultSelectors2023January = [ '.hlcw0c', // Top result with site links '.g.Ww4FFb', // General search results @@ -75,10 +75,16 @@ export const scrapeOrganicResults = ($: CheerioAPI) => { '.sATSHe', // another new selector in March 2025 ]; + const searchResults = extractResultsFromSelectors($, resultSelectors2023January); + const deduplicatedResults = deduplicateResults(searchResults); if (areTheResultsSuggestions($)) { - return []; + return deduplicatedResults.map((result) => ({ + ...result, + resultType: 'SUGGESTED', + })); } - - const searchResults = extractResultsFromSelectors($, resultSelectors2023January); - return deduplicateResults(searchResults); + return deduplicatedResults.map((result) => ({ + ...result, + resultType: 'ORGANIC', + })); }; diff --git a/src/types.ts b/src/types.ts index 152acdc..445d468 100644 --- a/src/types.ts +++ b/src/types.ts @@ -32,11 +32,14 @@ export type Input = { scrapingTool: ScrapingTool; }; +export type SearchResultType = 'ORGANIC' | 'SUGGESTED'; + export type OrganicResult = { description?: string; title?: string; rank?: number; url?: string; + resultType?: SearchResultType; }; export interface TimeMeasure { From 14ff4319db190b7a78930a48d7bd542e07d05e02 Mon Sep 17 00:00:00 2001 From: MQ Date: Tue, 6 May 2025 11:17:14 +0200 Subject: [PATCH 2/2] improve readability --- src/google-search/google-extractors-urls.ts | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/google-search/google-extractors-urls.ts b/src/google-search/google-extractors-urls.ts index 8e1174b..c7cf6b5 100644 --- a/src/google-search/google-extractors-urls.ts +++ b/src/google-search/google-extractors-urls.ts @@ -1,7 +1,7 @@ import type { CheerioAPI } from 'cheerio'; import type { Element } from 'domhandler'; -import type { OrganicResult } from '../types.js'; +import type { OrganicResult, SearchResultType } from '../types.js'; /** * Deduplicates search results based on their title and URL (source @apify/google-search). @@ -77,14 +77,12 @@ export const scrapeOrganicResults = ($: CheerioAPI): OrganicResult[] => { const searchResults = extractResultsFromSelectors($, resultSelectors2023January); const deduplicatedResults = deduplicateResults(searchResults); + let resultType: SearchResultType = 'ORGANIC'; if (areTheResultsSuggestions($)) { - return deduplicatedResults.map((result) => ({ - ...result, - resultType: 'SUGGESTED', - })); + resultType = 'SUGGESTED'; } return deduplicatedResults.map((result) => ({ ...result, - resultType: 'ORGANIC', + resultType, })); };