From fb9e3bcd9b4a5c1a00c71d28ef26c53299e1b8dc Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Tue, 18 Nov 2025 14:46:52 +0100 Subject: [PATCH 01/26] wip --- .../04_downloading_html.md | 24 +++++++++++++------ src/components/TestedExercise.jsx | 9 +++++++ 2 files changed, 26 insertions(+), 7 deletions(-) create mode 100644 src/components/TestedExercise.jsx diff --git a/sources/academy/webscraping/scraping_basics_python/04_downloading_html.md b/sources/academy/webscraping/scraping_basics_python/04_downloading_html.md index e3866cfcb2..e31f5526bf 100644 --- a/sources/academy/webscraping/scraping_basics_python/04_downloading_html.md +++ b/sources/academy/webscraping/scraping_basics_python/04_downloading_html.md @@ -5,6 +5,7 @@ description: Lesson about building a Python application for watching prices. Usi slug: /scraping-basics-python/downloading-html --- +import TestedExercise from '@site/src/components/TestedExercise'; import Exercises from '../scraping_basics/_exercises.mdx'; **In this lesson we'll start building a Python application for watching prices. As a first step, we'll use the HTTPX library to download HTML code of a product listing page.** @@ -150,14 +151,23 @@ https://www.aliexpress.com/w/wholesale-darth-vader.html
Solution - ```py - import httpx + + ```py + import httpx - url = "https://www.aliexpress.com/w/wholesale-darth-vader.html" - response = httpx.get(url) - response.raise_for_status() - print(response.text) - ``` + url = "https://www.aliexpress.com/w/wholesale-darth-vader.html" + response = httpx.get(url) + response.raise_for_status() + print(response.text) + ``` + + ```bats + @test "outputs the HTML with darth vader products" { + run uv python exercise.py + assert_output --partial 'Need more help to find the most popular darth vader?' + } + ``` +
diff --git a/src/components/TestedExercise.jsx b/src/components/TestedExercise.jsx new file mode 100644 index 0000000000..35b053d866 --- /dev/null +++ b/src/components/TestedExercise.jsx @@ -0,0 +1,9 @@ +export default function TestedExercise({ children }) { + const [exerciseCode, testCode] = children; + if (testCode.props.className !== 'language-bats') { + throw new Error('Exercise: Expected second child to be a Bats code block with tests'); + } + return exerciseCode; +} + +// TODO write docusaurus plugin to extract the exercises and tests from the MDX files From 819462b904a7bd2a96a4b73a189750b6a4147c3b Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 21 Nov 2025 17:34:54 +0100 Subject: [PATCH 02/26] feat: keep exercises as separate files, include them to Markdown --- .../04_downloading_html.md | 29 ++++--------------- .../exercises/scrape_lego.py | 6 ++++ .../exercises/test.bats | 8 +++++ src/components/TestedExercise.jsx | 9 ------ 4 files changed, 20 insertions(+), 32 deletions(-) create mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/scrape_lego.py create mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/test.bats delete mode 100644 src/components/TestedExercise.jsx diff --git a/sources/academy/webscraping/scraping_basics_python/04_downloading_html.md b/sources/academy/webscraping/scraping_basics_python/04_downloading_html.md index e31f5526bf..1a8ed59078 100644 --- a/sources/academy/webscraping/scraping_basics_python/04_downloading_html.md +++ b/sources/academy/webscraping/scraping_basics_python/04_downloading_html.md @@ -5,8 +5,9 @@ description: Lesson about building a Python application for watching prices. 
Usi slug: /scraping-basics-python/downloading-html --- -import TestedExercise from '@site/src/components/TestedExercise'; +import CodeBlock from '@theme/CodeBlock'; import Exercises from '../scraping_basics/_exercises.mdx'; +import LegoExercise from '!!raw-loader!roa-loader!./exercises/scrape_lego.py'; **In this lesson we'll start building a Python application for watching prices. As a first step, we'll use the HTTPX library to download HTML code of a product listing page.** @@ -140,35 +141,17 @@ Letting our program visibly crash on error is enough for our purposes. Now, let' -### Scrape AliExpress +### Scrape LEGO -Download HTML of a product listing page, but this time from a real world e-commerce website. For example this page with AliExpress search results: +Download HTML of a product listing page, but this time from a real world e-commerce website. For example this page with LEGO search results: ```text -https://www.aliexpress.com/w/wholesale-darth-vader.html +https://www.lego.com/themes/star-wars ```
Solution - - - ```py - import httpx - - url = "https://www.aliexpress.com/w/wholesale-darth-vader.html" - response = httpx.get(url) - response.raise_for_status() - print(response.text) - ``` - - ```bats - @test "outputs the HTML with darth vader products" { - run uv python exercise.py - assert_output --partial 'Need more help to find the most popular darth vader?' - } - ``` - - + {LegoExercise.code}
### Save downloaded HTML as a file diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/scrape_lego.py b/sources/academy/webscraping/scraping_basics_python/exercises/scrape_lego.py new file mode 100644 index 0000000000..57fabfc95c --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/scrape_lego.py @@ -0,0 +1,6 @@ +import httpx + +url = "https://www.lego.com/themes/star-wars" +response = httpx.get(url) +response.raise_for_status() +print(response.text) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats new file mode 100644 index 0000000000..0e6c2b5383 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats @@ -0,0 +1,8 @@ +setup() { + DIR=sources/academy/webscraping/scraping_basics_python/exercises +} + +@test "outputs the HTML with Star Wars products" { + run uv run --with httpx python "$DIR/scrape_lego.py" + [[ "$output" == *"Millennium Falcon"* ]] +} diff --git a/src/components/TestedExercise.jsx b/src/components/TestedExercise.jsx deleted file mode 100644 index 35b053d866..0000000000 --- a/src/components/TestedExercise.jsx +++ /dev/null @@ -1,9 +0,0 @@ -export default function TestedExercise({ children }) { - const [exerciseCode, testCode] = children; - if (testCode.props.className !== 'language-bats') { - throw new Error('Exercise: Expected second child to be a Bats code block with tests'); - } - return exerciseCode; -} - -// TODO write docusaurus plugin to extract the exercises and tests from the MDX files From 10a07430983a39f2f245c0581cb25d4d0d485711 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 24 Nov 2025 14:19:49 +0100 Subject: [PATCH 03/26] chore: implement testing of JavaScript exercises --- .../04_downloading_html.md | 21 ++++++------------- .../05_parsing_html.md | 19 +++-------------- .../exercises/scrape_f1academy_teams.mjs | 12 +++++++++++ 
.../exercises/scrape_lego.mjs | 8 +++++++ .../exercises/test.bats | 13 ++++++++++++ .../04_downloading_html.md | 2 +- .../exercises/scrape_lego.py | 2 +- 7 files changed, 44 insertions(+), 33 deletions(-) create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/scrape_f1academy_teams.mjs create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/scrape_lego.mjs create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats diff --git a/sources/academy/webscraping/scraping_basics_javascript/04_downloading_html.md b/sources/academy/webscraping/scraping_basics_javascript/04_downloading_html.md index dd5ebfb5b0..f96138ae88 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/04_downloading_html.md +++ b/sources/academy/webscraping/scraping_basics_javascript/04_downloading_html.md @@ -5,8 +5,10 @@ description: Lesson about building a Node.js application for watching prices. Us slug: /scraping-basics-javascript/downloading-html --- +import CodeBlock from '@theme/CodeBlock'; import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; import Exercises from '../scraping_basics/_exercises.mdx'; +import LegoExercise from '!!raw-loader!roa-loader!./exercises/scrape_lego.mjs'; @@ -184,28 +186,17 @@ Letting our program visibly crash on error is enough for our purposes. Now, let' -### Scrape AliExpress +### Scrape LEGO -Download HTML of a product listing page, but this time from a real world e-commerce website. For example this page with AliExpress search results: +Download HTML of a product listing page, but this time from a real world e-commerce website. For example this page with LEGO search results: ```text -https://www.aliexpress.com/w/wholesale-darth-vader.html +https://www.lego.com/en-us/themes/star-wars ```
Solution - - ```js - const url = "https://www.aliexpress.com/w/wholesale-darth-vader.html"; - const response = await fetch(url); - - if (response.ok) { - console.log(await response.text()); - } else { - throw new Error(`HTTP ${response.status}`); - } - ``` - + {LegoExercise.code}
### Save downloaded HTML as a file diff --git a/sources/academy/webscraping/scraping_basics_javascript/05_parsing_html.md b/sources/academy/webscraping/scraping_basics_javascript/05_parsing_html.md index 78604a16fa..50dc5c52c4 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/05_parsing_html.md +++ b/sources/academy/webscraping/scraping_basics_javascript/05_parsing_html.md @@ -5,8 +5,10 @@ description: Lesson about building a Node.js application for watching prices. Us slug: /scraping-basics-javascript/parsing-html --- +import CodeBlock from '@theme/CodeBlock'; import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; import Exercises from '../scraping_basics/_exercises.mdx'; +import F1AcademyTeamsExercise from '!!raw-loader!roa-loader!./exercises/scrape_f1academy_teams.mjs'; @@ -183,22 +185,7 @@ https://www.f1academy.com/Racing-Series/Teams
Solution - - ```js - import * as cheerio from 'cheerio'; - - const url = "https://www.f1academy.com/Racing-Series/Teams"; - const response = await fetch(url); - - if (response.ok) { - const html = await response.text(); - const $ = cheerio.load(html); - console.log($(".teams-driver-item").length); - } else { - throw new Error(`HTTP ${response.status}`); - } - ``` - + {F1AcademyTeamsExercise.code}
### Scrape F1 Academy drivers diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/scrape_f1academy_teams.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/scrape_f1academy_teams.mjs new file mode 100644 index 0000000000..cc075d8b05 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/scrape_f1academy_teams.mjs @@ -0,0 +1,12 @@ +import * as cheerio from 'cheerio'; + +const url = "https://www.f1academy.com/Racing-Series/Teams"; +const response = await fetch(url); + +if (response.ok) { + const html = await response.text(); + const $ = cheerio.load(html); + console.log($(".teams-driver-item").length); +} else { + throw new Error(`HTTP ${response.status}`); +} diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/scrape_lego.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/scrape_lego.mjs new file mode 100644 index 0000000000..131fd9c827 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/scrape_lego.mjs @@ -0,0 +1,8 @@ +const url = "https://www.lego.com/en-us/themes/star-wars"; +const response = await fetch(url); + +if (response.ok) { + console.log(await response.text()); +} else { + throw new Error(`HTTP ${response.status}`); +} diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats new file mode 100644 index 0000000000..18ad6acc1f --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats @@ -0,0 +1,13 @@ +setup() { + DIR=sources/academy/webscraping/scraping_basics_javascript/exercises +} + +@test "outputs the HTML with Star Wars products" { + run npx node "$DIR/scrape_lego.mjs" + [[ "$output" == *"Millennium Falcon"* ]] +} + +@test "outputs the number of F1 Academy teams" { + run npx --package=cheerio node "$DIR/scrape_f1academy_teams.mjs" + [[ "$output" == "6" ]] +} 
diff --git a/sources/academy/webscraping/scraping_basics_python/04_downloading_html.md b/sources/academy/webscraping/scraping_basics_python/04_downloading_html.md index 1a8ed59078..70803de44a 100644 --- a/sources/academy/webscraping/scraping_basics_python/04_downloading_html.md +++ b/sources/academy/webscraping/scraping_basics_python/04_downloading_html.md @@ -146,7 +146,7 @@ Letting our program visibly crash on error is enough for our purposes. Now, let' Download HTML of a product listing page, but this time from a real world e-commerce website. For example this page with LEGO search results: ```text -https://www.lego.com/themes/star-wars +https://www.lego.com/en-us/themes/star-wars ```
diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/scrape_lego.py b/sources/academy/webscraping/scraping_basics_python/exercises/scrape_lego.py index 57fabfc95c..09816a53d6 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/scrape_lego.py +++ b/sources/academy/webscraping/scraping_basics_python/exercises/scrape_lego.py @@ -1,6 +1,6 @@ import httpx -url = "https://www.lego.com/themes/star-wars" +url = "https://www.lego.com/en-us/themes/star-wars" response = httpx.get(url) response.raise_for_status() print(response.text) From 6d50689cd5b5506d08716cde25b239991b4ad66d Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 24 Nov 2025 14:21:33 +0100 Subject: [PATCH 04/26] refactor: use shorter names --- .../scraping_basics_javascript/04_downloading_html.md | 2 +- .../webscraping/scraping_basics_javascript/05_parsing_html.md | 2 +- .../{scrape_f1academy_teams.mjs => f1academy_teams.mjs} | 0 .../exercises/{scrape_lego.mjs => lego.mjs} | 0 .../scraping_basics_javascript/exercises/test.bats | 4 ++-- .../webscraping/scraping_basics_python/04_downloading_html.md | 2 +- .../exercises/{scrape_lego.py => lego.py} | 0 .../webscraping/scraping_basics_python/exercises/test.bats | 2 +- 8 files changed, 6 insertions(+), 6 deletions(-) rename sources/academy/webscraping/scraping_basics_javascript/exercises/{scrape_f1academy_teams.mjs => f1academy_teams.mjs} (100%) rename sources/academy/webscraping/scraping_basics_javascript/exercises/{scrape_lego.mjs => lego.mjs} (100%) rename sources/academy/webscraping/scraping_basics_python/exercises/{scrape_lego.py => lego.py} (100%) diff --git a/sources/academy/webscraping/scraping_basics_javascript/04_downloading_html.md b/sources/academy/webscraping/scraping_basics_javascript/04_downloading_html.md index f96138ae88..3956bfc1a6 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/04_downloading_html.md +++ 
b/sources/academy/webscraping/scraping_basics_javascript/04_downloading_html.md @@ -8,7 +8,7 @@ slug: /scraping-basics-javascript/downloading-html import CodeBlock from '@theme/CodeBlock'; import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; import Exercises from '../scraping_basics/_exercises.mdx'; -import LegoExercise from '!!raw-loader!roa-loader!./exercises/scrape_lego.mjs'; +import LegoExercise from '!!raw-loader!roa-loader!./exercises/lego.mjs'; diff --git a/sources/academy/webscraping/scraping_basics_javascript/05_parsing_html.md b/sources/academy/webscraping/scraping_basics_javascript/05_parsing_html.md index 50dc5c52c4..80789641a6 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/05_parsing_html.md +++ b/sources/academy/webscraping/scraping_basics_javascript/05_parsing_html.md @@ -8,7 +8,7 @@ slug: /scraping-basics-javascript/parsing-html import CodeBlock from '@theme/CodeBlock'; import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; import Exercises from '../scraping_basics/_exercises.mdx'; -import F1AcademyTeamsExercise from '!!raw-loader!roa-loader!./exercises/scrape_f1academy_teams.mjs'; +import F1AcademyTeamsExercise from '!!raw-loader!roa-loader!./exercises/f1academy_teams.mjs'; diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/scrape_f1academy_teams.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/f1academy_teams.mjs similarity index 100% rename from sources/academy/webscraping/scraping_basics_javascript/exercises/scrape_f1academy_teams.mjs rename to sources/academy/webscraping/scraping_basics_javascript/exercises/f1academy_teams.mjs diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/scrape_lego.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/lego.mjs similarity index 100% rename from sources/academy/webscraping/scraping_basics_javascript/exercises/scrape_lego.mjs rename 
to sources/academy/webscraping/scraping_basics_javascript/exercises/lego.mjs diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats index 18ad6acc1f..b9fa9b4af6 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats @@ -3,11 +3,11 @@ setup() { } @test "outputs the HTML with Star Wars products" { - run npx node "$DIR/scrape_lego.mjs" + run npx node "$DIR/lego.mjs" [[ "$output" == *"Millennium Falcon"* ]] } @test "outputs the number of F1 Academy teams" { - run npx --package=cheerio node "$DIR/scrape_f1academy_teams.mjs" + run npx --package=cheerio node "$DIR/f1academy_teams.mjs" [[ "$output" == "6" ]] } diff --git a/sources/academy/webscraping/scraping_basics_python/04_downloading_html.md b/sources/academy/webscraping/scraping_basics_python/04_downloading_html.md index 70803de44a..929bf9a6eb 100644 --- a/sources/academy/webscraping/scraping_basics_python/04_downloading_html.md +++ b/sources/academy/webscraping/scraping_basics_python/04_downloading_html.md @@ -7,7 +7,7 @@ slug: /scraping-basics-python/downloading-html import CodeBlock from '@theme/CodeBlock'; import Exercises from '../scraping_basics/_exercises.mdx'; -import LegoExercise from '!!raw-loader!roa-loader!./exercises/scrape_lego.py'; +import LegoExercise from '!!raw-loader!roa-loader!./exercises/lego.py'; **In this lesson we'll start building a Python application for watching prices. 
As a first step, we'll use the HTTPX library to download HTML code of a product listing page.** diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/scrape_lego.py b/sources/academy/webscraping/scraping_basics_python/exercises/lego.py similarity index 100% rename from sources/academy/webscraping/scraping_basics_python/exercises/scrape_lego.py rename to sources/academy/webscraping/scraping_basics_python/exercises/lego.py diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats index 0e6c2b5383..830a42644f 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats @@ -3,6 +3,6 @@ setup() { } @test "outputs the HTML with Star Wars products" { - run uv run --with httpx python "$DIR/scrape_lego.py" + run uv run --with httpx python "$DIR/lego.py" [[ "$output" == *"Millennium Falcon"* ]] } From 401cc929b417679edc9fa769e93b302465797ff0 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 24 Nov 2025 14:40:19 +0100 Subject: [PATCH 05/26] chore: add GitHub Action to run tests automatically --- .github/workflows/test-academy.yml | 31 ++++++++++++++++++++++++++++++ package-lock.json | 11 +++++++++++ package.json | 4 +++- 3 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/test-academy.yml diff --git a/.github/workflows/test-academy.yml b/.github/workflows/test-academy.yml new file mode 100644 index 0000000000..cf3170ee8b --- /dev/null +++ b/.github/workflows/test-academy.yml @@ -0,0 +1,31 @@ +name: Test Academy + +on: + push: + branches: [ honzajavorek/lint-py ] + schedule: + - cron: "0 3 * * 1" # at 3am UTC on Mondays + workflow_dispatch: # allows running this workflow manually from the Actions tab + +jobs: + test-exercises: + name: Test Academy Exercises + runs-on: ubuntu-latest + steps: + - name: Checkout Source code + 
uses: actions/checkout@v6 + + - name: Setup Node.js + uses: actions/setup-node@v6 + with: + cache: npm + cache-dependency-path: package-lock.json + + - name: Setup Python + uses: astral-sh/setup-uv@v7 + + - name: Install Bats + run: npm install --only=dev + + - name: Test + run: npm run test:academy diff --git a/package-lock.json b/package-lock.json index 9211df3e2b..7315d8d7fa 100644 --- a/package-lock.json +++ b/package-lock.json @@ -45,6 +45,7 @@ "@apify/tsconfig": "^0.1.0", "@types/react": "^19.0.0", "babel-plugin-styled-components": "^2.1.4", + "bats": "^1.13.0", "cross-env": "^10.0.0", "eslint": "^9.32.0", "eslint-plugin-react": "^7.37.5", @@ -9316,6 +9317,16 @@ "resolved": "https://registry.npmjs.org/batch/-/batch-0.6.1.tgz", "integrity": "sha512-x+VAiMRL6UPkx+kudNvxTl6hB2XNNCG2r+7wixVfIYwu/2HKRXimwQyaumLjMveWvT2Hkd/cAJw+QBMfJ/EKVw==" }, + "node_modules/bats": { + "version": "1.13.0", + "resolved": "https://registry.npmjs.org/bats/-/bats-1.13.0.tgz", + "integrity": "sha512-giSYKGTOcPZyJDbfbTtzAedLcNWdjCLbXYU3/MwPnjyvDXzu6Dgw8d2M+8jHhZXSmsCMSQqCp+YBsJ603UO4vQ==", + "dev": true, + "license": "MIT", + "bin": { + "bats": "bin/bats" + } + }, "node_modules/bcp-47-match": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/bcp-47-match/-/bcp-47-match-2.0.3.tgz", diff --git a/package.json b/package.json index cf6193d4f3..93cf8433e6 100644 --- a/package.json +++ b/package.json @@ -40,6 +40,7 @@ "lint:md:fix": "markdownlint '**/*.md' --fix", "lint:code": "eslint .", "lint:code:fix": "eslint . 
--fix", + "test:academy": "bats --print-output-on-failure -r .", "postinstall": "patch-package", "postbuild": "node ./scripts/joinLlmsFiles.mjs && node ./scripts/indentLlmsFile.mjs" }, @@ -48,6 +49,7 @@ "@apify/tsconfig": "^0.1.0", "@types/react": "^19.0.0", "babel-plugin-styled-components": "^2.1.4", + "bats": "^1.13.0", "cross-env": "^10.0.0", "eslint": "^9.32.0", "eslint-plugin-react": "^7.37.5", @@ -61,8 +63,8 @@ "typescript-eslint": "^8.38.0" }, "dependencies": { - "@apify/ui-library": "^1.97.2", "@apify/ui-icons": "^1.19.0", + "@apify/ui-library": "^1.97.2", "@docusaurus/core": "^3.8.1", "@docusaurus/faster": "^3.8.1", "@docusaurus/plugin-client-redirects": "^3.8.1", From e296be1cba32e253db8d620718e5cf1f60e50ba2 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 24 Nov 2025 14:42:32 +0100 Subject: [PATCH 06/26] chore: ouch, wrong branch --- .github/workflows/test-academy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-academy.yml b/.github/workflows/test-academy.yml index cf3170ee8b..dcb076fdb6 100644 --- a/.github/workflows/test-academy.yml +++ b/.github/workflows/test-academy.yml @@ -2,7 +2,7 @@ name: Test Academy on: push: - branches: [ honzajavorek/lint-py ] + branches: [ "honzajavorek/test-exercises" ] schedule: - cron: "0 3 * * 1" # at 3am UTC on Mondays workflow_dispatch: # allows running this workflow manually from the Actions tab From e673e9f3da3a94e8dce04d84a23f8c002b90539b Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 24 Nov 2025 14:45:21 +0100 Subject: [PATCH 07/26] chore: one does not simply npm install --- .github/workflows/test-academy.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test-academy.yml b/.github/workflows/test-academy.yml index dcb076fdb6..a879e1085c 100644 --- a/.github/workflows/test-academy.yml +++ b/.github/workflows/test-academy.yml @@ -25,7 +25,9 @@ jobs: uses: astral-sh/setup-uv@v7 - name: Install Bats - run: npm install 
--only=dev + run: | + corepack enable + npm install --only=dev - name: Test run: npm run test:academy From 846602c1893daa78437dbd93cca7a9ba6763c53c Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 24 Nov 2025 14:45:54 +0100 Subject: [PATCH 08/26] style: make linter happy --- .../scraping_basics_javascript/exercises/f1academy_teams.mjs | 4 ++-- .../webscraping/scraping_basics_javascript/exercises/lego.mjs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/f1academy_teams.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/f1academy_teams.mjs index cc075d8b05..1ffb67adae 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/f1academy_teams.mjs +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/f1academy_teams.mjs @@ -1,12 +1,12 @@ import * as cheerio from 'cheerio'; -const url = "https://www.f1academy.com/Racing-Series/Teams"; +const url = 'https://www.f1academy.com/Racing-Series/Teams'; const response = await fetch(url); if (response.ok) { const html = await response.text(); const $ = cheerio.load(html); - console.log($(".teams-driver-item").length); + console.log($('.teams-driver-item').length); } else { throw new Error(`HTTP ${response.status}`); } diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/lego.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/lego.mjs index 131fd9c827..50abb8de39 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/lego.mjs +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/lego.mjs @@ -1,4 +1,4 @@ -const url = "https://www.lego.com/en-us/themes/star-wars"; +const url = 'https://www.lego.com/en-us/themes/star-wars'; const response = await fetch(url); if (response.ok) { From a6746040205a5169abd5b646af1161b994f74dd6 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 24 Nov 2025 14:49:03 +0100 
Subject: [PATCH 09/26] chore: make sure there is no schedule until we merge this, add explanatory comment --- .github/workflows/test-academy.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-academy.yml b/.github/workflows/test-academy.yml index a879e1085c..acb537f855 100644 --- a/.github/workflows/test-academy.yml +++ b/.github/workflows/test-academy.yml @@ -1,11 +1,13 @@ name: Test Academy on: + # TODO, this is just temporary: push: branches: [ "honzajavorek/test-exercises" ] - schedule: - - cron: "0 3 * * 1" # at 3am UTC on Mondays - workflow_dispatch: # allows running this workflow manually from the Actions tab + + # schedule: + # - cron: "0 3 * * 1" # at 3am UTC on Mondays + # workflow_dispatch: # allows running this workflow manually from the Actions tab jobs: test-exercises: From a72090d819002dc8f1fb1364cfb5e16217a6b975 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 24 Nov 2025 14:54:04 +0100 Subject: [PATCH 10/26] refactor: simplify the tests --- .../scraping_basics_javascript/exercises/test.bats | 8 ++------ .../scraping_basics_python/exercises/test.bats | 6 +----- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats index b9fa9b4af6..dc01a1b245 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats @@ -1,13 +1,9 @@ -setup() { - DIR=sources/academy/webscraping/scraping_basics_javascript/exercises -} - @test "outputs the HTML with Star Wars products" { - run npx node "$DIR/lego.mjs" + run npx node "$BATS_TEST_DIRNAME/lego.mjs" [[ "$output" == *"Millennium Falcon"* ]] } @test "outputs the number of F1 Academy teams" { - run npx --package=cheerio node "$DIR/f1academy_teams.mjs" + run npx --package=cheerio node 
"$BATS_TEST_DIRNAME/f1academy_teams.mjs" [[ "$output" == "6" ]] } diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats index 830a42644f..ac09b3032a 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats @@ -1,8 +1,4 @@ -setup() { - DIR=sources/academy/webscraping/scraping_basics_python/exercises -} - @test "outputs the HTML with Star Wars products" { - run uv run --with httpx python "$DIR/lego.py" + run uv run --with httpx python "$BATS_TEST_DIRNAME/lego.py" [[ "$output" == *"Millennium Falcon"* ]] } From db19ee598c834e9c755166da43f20b65c9db6031 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 24 Nov 2025 18:18:15 +0100 Subject: [PATCH 11/26] docs: document lychee and academy testing --- CONTRIBUTING.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3136c31132..940073d2fc 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -335,6 +335,12 @@ Add languages by adding new folders at the appropriate path level. - Run `vale sync` to download styles - Configure exceptions in `accepts.txt` +### Testing + +- **Broken links**: [Periodic GitHub Action](.github/workflows/lychee.yml) checks broken links by [lychee](https://lychee.cli.rs/). If the Action fails, we manually fix the issues. + +- **Academy exercises**: At the end of each lesson in the academy courses, there are exercises that target real-world websites. Each exercise includes a solution, stored as a separate file containing executable code. These files are included in the docs using the `!!raw-loader` syntax. Each course has a [Bats](https://bats-core.readthedocs.io/) test file named `test.bats`. The tests run each solution as a standalone program and verify that it produces output matching the expected results. 
A [periodic GitHub Action](.github/workflows/test-academy.yml) runs all these tests using `npm run test:academy`. If the Action fails, we rework the exercises. + ## Pull request process 1. Follow [Conventional Commits](https://www.conventionalcommits.org/) From fc05b0d5196ffd9e48cd3920b5f7d67be60d4059 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Tue, 25 Nov 2025 10:27:31 +0100 Subject: [PATCH 12/26] refactor: make exercises testable --- .gitignore | 1 + .../05_parsing_html.md | 16 +-- .../06_locating_elements.md | 72 ++---------- .../07_extracting_data.md | 104 ++-------------- .../08_saving_data.md | 12 +- .../09_getting_links.md | 44 +------ .../scraping_basics_javascript/10_crawling.md | 73 +----------- .../11_scraping_variants.md | 93 +-------------- .../12_framework.md | 76 +----------- .../exercises/cnn_sports_shortest_article.mjs | 40 +++++++ .../exercises/crawlee_f1_drivers.mjs | 38 ++++++ .../exercises/crawlee_netflix_ratings.mjs | 32 +++++ .../exercises/f1academy_drivers.mjs | 13 ++ .../exercises/guardian_f1_authors.mjs | 36 ++++++ .../exercises/guardian_f1_links.mjs | 20 ++++ .../exercises/guardian_f1_titles.mjs | 18 +++ .../exercises/guardian_publish_dates.mjs | 28 +++++ .../exercises/npm_llm_packages.mjs | 61 ++++++++++ .../exercises/process_products_json.mjs | 8 ++ .../exercises/products.json | 12 ++ .../exercises/test.bats | 111 +++++++++++++++++- .../exercises/warehouse_units.mjs | 36 ++++++ .../exercises/warehouse_units_regex.mjs | 31 +++++ .../exercises/wikipedia_calling_codes.mjs | 36 ++++++ .../exercises/wikipedia_countries.mjs | 29 +++++ .../wikipedia_countries_single_selector.mjs | 19 +++ .../exercises/wikipedia_country_links.mjs | 20 ++++ .../scraping_basics_python/05_parsing_html.md | 29 +---- .../06_locating_elements.md | 54 ++------- .../07_extracting_data.md | 81 ++----------- .../scraping_basics_python/08_saving_data.md | 15 +-- .../09_getting_links.md | 39 +----- .../scraping_basics_python/10_crawling.md | 60 +--------- 
.../11_scraping_variants.md | 59 +--------- .../scraping_basics_python/12_framework.md | 86 +------------- .../exercises/cnn_sports_shortest_article.py | 32 +++++ .../exercises/crawlee_f1_drivers.py | 54 +++++++++ .../exercises/crawlee_netflix_ratings.py | 46 ++++++++ .../exercises/f1academy_drivers.py | 9 ++ .../exercises/f1academy_teams.py | 9 ++ .../exercises/guardian_f1_authors.py | 36 ++++++ .../exercises/guardian_f1_links.py | 15 +++ .../exercises/guardian_f1_titles.py | 11 ++ .../exercises/guardian_publish_dates.py | 23 ++++ .../exercises/process_products_json.py | 9 ++ .../exercises/products.json | 12 ++ .../exercises/python_jobs_database.py | 32 +++++ .../exercises/test.bats | 110 ++++++++++++++++- .../exercises/warehouse_units.py | 23 ++++ .../exercises/warehouse_units_regex.py | 19 +++ .../exercises/wikipedia_calling_codes.py | 32 +++++ .../exercises/wikipedia_countries.py | 17 +++ .../wikipedia_countries_single_selector.py | 13 ++ .../exercises/wikipedia_country_links.py | 15 +++ 54 files changed, 1183 insertions(+), 836 deletions(-) create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/cnn_sports_shortest_article.mjs create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_f1_drivers.mjs create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/f1academy_drivers.mjs create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_f1_authors.mjs create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_f1_links.mjs create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_f1_titles.mjs create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_publish_dates.mjs create mode 100644 
sources/academy/webscraping/scraping_basics_javascript/exercises/npm_llm_packages.mjs create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/process_products_json.mjs create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/products.json create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/warehouse_units.mjs create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/warehouse_units_regex.mjs create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_calling_codes.mjs create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries.mjs create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries_single_selector.mjs create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_country_links.mjs create mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/cnn_sports_shortest_article.py create mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/crawlee_f1_drivers.py create mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py create mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/f1academy_drivers.py create mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/f1academy_teams.py create mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/guardian_f1_authors.py create mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/guardian_f1_links.py create mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/guardian_f1_titles.py create mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/guardian_publish_dates.py create mode 100644 
sources/academy/webscraping/scraping_basics_python/exercises/process_products_json.py create mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/products.json create mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/python_jobs_database.py create mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/warehouse_units.py create mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/warehouse_units_regex.py create mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_calling_codes.py create mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries.py create mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries_single_selector.py create mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_country_links.py diff --git a/.gitignore b/.gitignore index 8fa90c3b3e..b16d993b57 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ codegen/*/generated/ codegen/*/go.sum .github/styles/Microsoft .github/styles/write-good +sources/academy/**/exercises/storage diff --git a/sources/academy/webscraping/scraping_basics_javascript/05_parsing_html.md b/sources/academy/webscraping/scraping_basics_javascript/05_parsing_html.md index 80789641a6..a1262cb9b5 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/05_parsing_html.md +++ b/sources/academy/webscraping/scraping_basics_javascript/05_parsing_html.md @@ -9,6 +9,7 @@ import CodeBlock from '@theme/CodeBlock'; import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; import Exercises from '../scraping_basics/_exercises.mdx'; import F1AcademyTeamsExercise from '!!raw-loader!roa-loader!./exercises/f1academy_teams.mjs'; +import F1AcademyDriversExercise from '!!raw-loader!roa-loader!./exercises/f1academy_drivers.mjs'; @@ -195,19 +196,6 @@ Use the same URL as in the previous exercise, 
but this time print a total count
Solution - ```js - import * as cheerio from 'cheerio'; - - const url = "https://www.f1academy.com/Racing-Series/Teams"; - const response = await fetch(url); - - if (response.ok) { - const html = await response.text(); - const $ = cheerio.load(html); - console.log($(".driver").length); - } else { - throw new Error(`HTTP ${response.status}`); - } - ``` + {F1AcademyDriversExercise.code}
diff --git a/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md b/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md index 8193597053..210b31f583 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md +++ b/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md @@ -5,8 +5,12 @@ description: Lesson about building a Node.js application for watching prices. Us slug: /scraping-basics-javascript/locating-elements --- +import CodeBlock from '@theme/CodeBlock'; import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; import Exercises from '../scraping_basics/_exercises.mdx'; +import WikipediaCountriesExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_countries.mjs'; +import WikipediaCountriesSingleSelectorExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_countries_single_selector.mjs'; +import GuardianF1TitlesExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_titles.mjs'; @@ -239,35 +243,7 @@ Djibouti
Solution - ```js - import * as cheerio from 'cheerio'; - - const url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa"; - const response = await fetch(url); - - if (response.ok) { - const html = await response.text(); - const $ = cheerio.load(html); - - for (const tableElement of $(".wikitable").toArray()) { - const $table = $(tableElement); - const $rows = $table.find("tr"); - - for (const rowElement of $rows.toArray()) { - const $row = $(rowElement); - const $cells = $row.find("td"); - - if ($cells.length > 0) { - const $thirdColumn = $($cells[2]); - const $link = $thirdColumn.find("a").first(); - console.log($link.text()); - } - } - } - } else { - throw new Error(`HTTP ${response.status}`); - } - ``` + {WikipediaCountriesExercise.code} Because some rows contain [table headers](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/th), we skip processing a row if `table_row.select("td")` doesn't find any [table data](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td) cells. @@ -289,25 +265,7 @@ You may want to check out the following pages:
Solution - ```js - import * as cheerio from 'cheerio'; - - const url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa"; - const response = await fetch(url); - - if (response.ok) { - const html = await response.text(); - const $ = cheerio.load(html); - - for (const element of $(".wikitable tr td:nth-child(3)").toArray()) { - const $nameCell = $(element); - const $link = $nameCell.find("a").first(); - console.log($link.text()); - } - } else { - throw new Error(`HTTP ${response.status}`); - } - ``` + {WikipediaCountriesSingleSelectorExercise.code}
@@ -331,22 +289,6 @@ Max Verstappen wins Canadian Grand Prix: F1 – as it happened
Solution - ```js - import * as cheerio from 'cheerio'; - - const url = "https://www.theguardian.com/sport/formulaone"; - const response = await fetch(url); - - if (response.ok) { - const html = await response.text(); - const $ = cheerio.load(html); - - for (const element of $("#maincontent ul li h3").toArray()) { - console.log($(element).text()); - } - } else { - throw new Error(`HTTP ${response.status}`); - } - ``` + {GuardianF1TitlesExercise.code}
diff --git a/sources/academy/webscraping/scraping_basics_javascript/07_extracting_data.md b/sources/academy/webscraping/scraping_basics_javascript/07_extracting_data.md index dae6bda605..6d09f851e4 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/07_extracting_data.md +++ b/sources/academy/webscraping/scraping_basics_javascript/07_extracting_data.md @@ -5,8 +5,12 @@ description: Lesson about building a Node.js application for watching prices. Us slug: /scraping-basics-javascript/extracting-data --- +import CodeBlock from '@theme/CodeBlock'; import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; import Exercises from '../scraping_basics/_exercises.mdx'; +import WarehouseUnitsExercise from '!!raw-loader!roa-loader!./exercises/warehouse_units.mjs'; +import WarehouseUnitsRegexExercise from '!!raw-loader!roa-loader!./exercises/warehouse_units_regex.mjs'; +import GuardianPublishDatesExercise from '!!raw-loader!roa-loader!./exercises/guardian_publish_dates.mjs'; @@ -240,41 +244,7 @@ Denon AH-C720 In-Ear Headphones | 236
Solution - ```js - import * as cheerio from 'cheerio'; - - function parseUnitsText(text) { - const count = text - .replace("In stock,", "") - .replace("Only", "") - .replace(" left", "") - .replace("units", "") - .trim(); - return count === "Sold out" ? 0 : parseInt(count); - } - - const url = "https://warehouse-theme-metal.myshopify.com/collections/sales"; - const response = await fetch(url); - - if (response.ok) { - const html = await response.text(); - const $ = cheerio.load(html); - - for (const element of $(".product-item").toArray()) { - const $productItem = $(element); - - const title = $productItem.find(".product-item__title"); - const title = $title.text().trim(); - - const unitsText = $productItem.find(".product-item__inventory").text(); - const unitsCount = parseUnitsText(unitsText); - - console.log(`${title} | ${unitsCount}`); - } - } else { - throw new Error(`HTTP ${response.status}`); - } - ``` + {WarehouseUnitsExercise.code} :::tip Conditional (ternary) operator @@ -291,39 +261,7 @@ Simplify the code from previous exercise. Use [regular expressions](https://deve
Solution - ```js - import * as cheerio from 'cheerio'; - - function parseUnitsText(text) { - const match = text.match(/\d+/); - if (match) { - return parseInt(match[0]); - } - return 0; - } - - const url = "https://warehouse-theme-metal.myshopify.com/collections/sales"; - const response = await fetch(url); - - if (response.ok) { - const html = await response.text(); - const $ = cheerio.load(html); - - for (const element of $(".product-item").toArray()) { - const $productItem = $(element); - - const $title = $productItem.find(".product-item__title"); - const title = $title.text().trim(); - - const unitsText = $productItem.find(".product-item__inventory").text(); - const unitsCount = parseUnitsText(unitsText); - - console.log(`${title} | ${unitsCount}`); - } - } else { - throw new Error(`HTTP ${response.status}`); - } - ``` + {WarehouseUnitsRegexExercise.code} :::tip Conditional (ternary) operator @@ -363,34 +301,6 @@ Hamilton reveals distress over ‘devastating’ groundhog accident at Canadian
Solution - ```js - import * as cheerio from 'cheerio'; - - const url = "https://www.theguardian.com/sport/formulaone"; - const response = await fetch(url); - - if (response.ok) { - const html = await response.text(); - const $ = cheerio.load(html); - - for (const element of $("#maincontent ul li").toArray()) { - const $article = $(element); - - const title = $article - .find("h3") - .text() - .trim(); - const dateText = $article - .find("time") - .attr("datetime") - .trim(); - const date = new Date(dateText); - - console.log(`${title} | ${date.toDateString()}`); - } - } else { - throw new Error(`HTTP ${response.status}`); - } - ``` + {GuardianPublishDatesExercise.code}
diff --git a/sources/academy/webscraping/scraping_basics_javascript/08_saving_data.md b/sources/academy/webscraping/scraping_basics_javascript/08_saving_data.md index bd960e9b5c..6a075b51d7 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/08_saving_data.md +++ b/sources/academy/webscraping/scraping_basics_javascript/08_saving_data.md @@ -5,7 +5,9 @@ description: Lesson about building a Node.js application for watching prices. Us slug: /scraping-basics-javascript/saving-data --- +import CodeBlock from '@theme/CodeBlock'; import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; +import ProcessProductsJsonExercise from '!!raw-loader!roa-loader!./exercises/process_products_json.mjs'; @@ -210,15 +212,7 @@ Write a new Node.js program that reads the `products.json` file we created in th
Solution - ```js - import { readFile } from "fs/promises"; - - const jsonData = await readFile("products.json"); - const data = JSON.parse(jsonData); - data - .filter(row => row.minPrice > 50000) - .forEach(row => console.log(row)); - ``` + {ProcessProductsJsonExercise.code}
diff --git a/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md b/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md index e923a8875d..3d4062fa81 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md +++ b/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md @@ -5,8 +5,11 @@ description: Lesson about building a Node.js application for watching prices. Us slug: /scraping-basics-javascript/getting-links --- +import CodeBlock from '@theme/CodeBlock'; import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; import Exercises from '../scraping_basics/_exercises.mdx'; +import WikipediaCountryLinksExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_country_links.mjs'; +import GuardianF1LinksExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_links.mjs'; @@ -342,26 +345,7 @@ https://en.wikipedia.org/wiki/Botswana
Solution - ```js - import * as cheerio from 'cheerio'; - - const listingURL = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa"; - const response = await fetch(listingURL); - - if (response.ok) { - const html = await response.text(); - const $ = cheerio.load(html); - - for (const element of $(".wikitable tr td:nth-child(3)").toArray()) { - const nameCell = $(element); - const link = nameCell.find("a").first(); - const url = new URL(link.attr("href"), listingURL).href; - console.log(url); - } - } else { - throw new Error(`HTTP ${response.status}`); - } - ``` + {WikipediaCountryLinksExercise.code}
@@ -386,25 +370,7 @@ https://www.theguardian.com/sport/article/2024/sep/02/max-verstappen-damns-his-u
Solution - ```js - import * as cheerio from 'cheerio'; - - const listingURL = "https://www.theguardian.com/sport/formulaone"; - const response = await fetch(listingURL); - - if (response.ok) { - const html = await response.text(); - const $ = cheerio.load(html); - - for (const element of $("#maincontent ul li").toArray()) { - const link = $(element).find("a").first(); - const url = new URL(link.attr("href"), listingURL).href; - console.log(url); - } - } else { - throw new Error(`HTTP ${response.status}`); - } - ``` + {GuardianF1LinksExercise.code} Note that some cards contain two links. One leads to the article, and one to the comments. If we selected all the links in the list by `#maincontent ul li a`, we would get incorrect output like this: diff --git a/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md b/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md index 926fd6d839..ee24ba86cc 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md +++ b/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md @@ -5,8 +5,11 @@ description: Lesson about building a Node.js application for watching prices. Us slug: /scraping-basics-javascript/crawling --- +import CodeBlock from '@theme/CodeBlock'; import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; import Exercises from '../scraping_basics/_exercises.mdx'; +import WikipediaCallingCodesExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_calling_codes.mjs'; +import GuardianF1AuthorsExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_authors.mjs'; @@ -237,43 +240,7 @@ Locating cells in tables is sometimes easier if you know how to [filter](https:/
Solution - ```js - import * as cheerio from 'cheerio'; - - async function download(url) { - const response = await fetch(url); - if (response.ok) { - const html = await response.text(); - return cheerio.load(html); - } else { - throw new Error(`HTTP ${response.status}`); - } - } - - const listingURL = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa"; - const $ = await download(listingURL); - - const $cells = $(".wikitable tr td:nth-child(3)"); - const promises = $cells.toArray().map(async element => { - const $nameCell = $(element); - const $link = $nameCell.find("a").first(); - const countryURL = new URL($link.attr("href"), listingURL).href; - - const $c = await download(countryURL); - const $label = $c("th.infobox-label") - .filter((i, element) => $c(element).text().trim() == "Calling code") - .first(); - const callingCode = $label - .parent() - .find("td.infobox-data") - .first() - .text() - .trim(); - - console.log(`${countryURL} ${callingCode || null}`); - }); - await Promise.all(promises); - ``` + {WikipediaCallingCodesExercise.code}
@@ -306,36 +273,6 @@ PA Media: Lewis Hamilton reveals lifelong battle with depression after school bu
Solution - ```js - import * as cheerio from 'cheerio'; - - async function download(url) { - const response = await fetch(url); - if (response.ok) { - const html = await response.text(); - return cheerio.load(html); - } else { - throw new Error(`HTTP ${response.status}`); - } - } - - const listingURL = "https://www.theguardian.com/sport/formulaone"; - const $ = await download(listingURL); - - const promises = $("#maincontent ul li").toArray().map(async element => { - const $item = $(element); - const $link = $item.find("a").first(); - const authorURL = new URL($link.attr("href"), listingURL).href; - - const $a = await download(authorURL); - const title = $a("h1").text().trim(); - - const author = $a('a[rel="author"]').text().trim(); - const address = $a('aside address').text().trim(); - - console.log(`${author || address || null}: ${title}`); - }); - await Promise.all(promises); - ``` + {GuardianF1AuthorsExercise.code}
diff --git a/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md index 2d4044240a..b2519dd0d9 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md +++ b/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md @@ -5,8 +5,11 @@ description: Lesson about building a Node.js application for watching prices. Us slug: /scraping-basics-javascript/scraping-variants --- +import CodeBlock from '@theme/CodeBlock'; import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; import Exercises from '../scraping_basics/_exercises.mdx'; +import NpmLlmPackagesExercise from '!!raw-loader!roa-loader!./exercises/npm_llm_packages.mjs'; +import CnnSportsShortestArticleExercise from '!!raw-loader!roa-loader!./exercises/cnn_sports_shortest_article.mjs'; @@ -386,61 +389,7 @@ Your output should look something like this: After inspecting the registry, you'll notice that packages with the keyword "LLM" have a dedicated URL. Also, changing the sorting dropdown results in a page with its own URL. We'll use that as our starting point, which saves us from having to scrape the whole registry and then filter by keyword or sort by the number of dependents. 
- ```js - import * as cheerio from 'cheerio'; - - async function download(url) { - const response = await fetch(url); - if (response.ok) { - const html = await response.text(); - return cheerio.load(html); - } else { - throw new Error(`HTTP ${response.status}`); - } - } - - const listingURL = "https://www.npmjs.com/search?page=0&q=keywords%3Allm&sortBy=dependent_count"; - const $ = await download(listingURL); - - const promises = $("section").toArray().map(async element => { - const $card = $(element); - - const details = $card - .children() - .first() - .children() - .last() - .text() - .split("•"); - const updatedText = details[2].trim(); - const dependents = parseInt(details[3].replace("dependents", "").trim()); - - if (updatedText.includes("years ago")) { - const yearsAgo = parseInt(updatedText.replace("years ago", "").trim()); - if (yearsAgo > 2) { - return null; - } - } - - const $link = $card.find("a").first(); - const name = $link.text().trim(); - const url = new URL($link.attr("href"), listingURL).href; - const description = $card.find("p").text().trim(); - - const downloadsText = $card - .children() - .last() - .text() - .replace(",", "") - .trim(); - const downloads = parseInt(downloadsText); - - return { name, url, description, dependents, downloads }; - }); - - const data = await Promise.all(promises); - console.log(data.filter(item => item !== null).splice(0, 5)); - ``` + {NpmLlmPackagesExercise.code} Since the HTML doesn't contain any descriptive classes, we must rely on its structure. We're using [`.children()`](https://cheerio.js.org/docs/api/classes/Cheerio#children) to carefully navigate the HTML element tree. @@ -463,38 +412,6 @@ At the time of writing, the shortest article on the CNN Sports homepage is [abou
Solution - ```js - import * as cheerio from 'cheerio'; - - async function download(url) { - const response = await fetch(url); - if (response.ok) { - const html = await response.text(); - return cheerio.load(html); - } else { - throw new Error(`HTTP ${response.status}`); - } - } - - const listingURL = "https://edition.cnn.com/sport"; - const $ = await download(listingURL); - - const promises = $(".layout__main .card").toArray().map(async element => { - const $link = $(element).find("a").first(); - const articleURL = new URL($link.attr("href"), listingURL).href; - - const $a = await download(articleURL); - const content = $a(".article__content").text().trim(); - - return { url: articleURL, length: content.length }; - }); - - const data = await Promise.all(promises); - const nonZeroData = data.filter(({ url, length }) => length > 0); - nonZeroData.sort((a, b) => a.length - b.length); - const shortestItem = nonZeroData[0]; - - console.log(shortestItem.url); - ``` + {CnnSportsShortestArticleExercise.code}
diff --git a/sources/academy/webscraping/scraping_basics_javascript/12_framework.md b/sources/academy/webscraping/scraping_basics_javascript/12_framework.md index e4d45aef47..0a8097eb27 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/12_framework.md +++ b/sources/academy/webscraping/scraping_basics_javascript/12_framework.md @@ -5,8 +5,11 @@ description: Lesson about building a Node.js application for watching prices. Us slug: /scraping-basics-javascript/framework --- +import CodeBlock from '@theme/CodeBlock'; import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; import Exercises from '../scraping_basics/_exercises.mdx'; +import CrawleeF1DriversExercise from '!!raw-loader!roa-loader!./exercises/crawlee_f1_drivers.mjs'; +import CrawleeNetflixRatingsExercise from '!!raw-loader!roa-loader!./exercises/crawlee_netflix_ratings.mjs'; @@ -422,42 +425,7 @@ If you export the dataset as JSON, it should look something like this:
Solution - ```js - import { CheerioCrawler } from 'crawlee'; - - const crawler = new CheerioCrawler({ - async requestHandler({ $, request, enqueueLinks, pushData }) { - if (request.label === 'DRIVER') { - const info = {}; - for (const itemElement of $('.common-driver-info li').toArray()) { - const name = $(itemElement).find('span').text().trim(); - const value = $(itemElement).find('h4').text().trim(); - info[name] = value; - } - const detail = {}; - for (const linkElement of $('.driver-detail--cta-group a').toArray()) { - const name = $(linkElement).find('p').text().trim(); - const value = $(linkElement).find('h2').text().trim(); - detail[name] = value; - }); - const [dobDay, dobMonth, dobYear] = info['DOB'].split("/"); - pushData({ - url: request.url, - name: $('h1').text().trim(), - team: detail['Team'], - nationality: info['Nationality'], - dob: `${dobYear}-${dobMonth}-${dobDay}`, - instagram_url: $(".common-social-share a[href*='instagram']").attr('href'), - }); - } else { - await enqueueLinks({ selector: '.teams-driver-item a', label: 'DRIVER' }); - } - }, - }); - - await crawler.run(['https://www.f1academy.com/Racing-Series/Drivers']); - await crawler.exportData('dataset.json'); - ``` + {CrawleeF1DriversExercise.code}
@@ -516,40 +484,6 @@ When navigating to the first IMDb search result, you might find it helpful to kn
Solution - ```js - import { CheerioCrawler, Request } from 'crawlee'; - import { escape } from 'node:querystring'; - - const crawler = new CheerioCrawler({ - async requestHandler({ $, request, enqueueLinks, pushData, addRequests }) { - if (request.label === 'IMDB') { - // handle IMDB film page - pushData({ - url: request.url, - title: $('h1').text().trim(), - rating: $("[data-testid='hero-rating-bar__aggregate-rating__score']").first().text().trim(), - }); - } else if (request.label === 'IMDB_SEARCH') { - // handle IMDB search results - await enqueueLinks({ selector: '.find-result-item a', label: 'IMDB', limit: 1 }); - - } else if (request.label === 'NETFLIX') { - // handle Netflix table - const $buttons = $('[data-uia="top10-table-row-title"] button'); - const requests = $buttons.toArray().map(buttonElement => { - const name = $(buttonElement).text().trim(); - const imdbSearchUrl = `https://www.imdb.com/find/?q=${escape(name)}&s=tt&ttype=ft`; - return new Request({ url: imdbSearchUrl, label: 'IMDB_SEARCH' }); - }); - await addRequests($requests.get()); - } else { - throw new Error(`Unexpected request label: ${request.label}`); - } - }, - }); - - await crawler.run(['https://www.netflix.com/tudum/top10']); - await crawler.exportData('dataset.json'); - ``` + {CrawleeNetflixRatingsExercise.code}
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/cnn_sports_shortest_article.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/cnn_sports_shortest_article.mjs new file mode 100644 index 0000000000..800a103440 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/cnn_sports_shortest_article.mjs @@ -0,0 +1,40 @@ +import * as cheerio from 'cheerio'; + +async function download(url) { + const response = await fetch(url); + if (!response.ok) { + throw new Error(`HTTP ${response.status}`); + } + const html = await response.text(); + return cheerio.load(html); +} + +const listingUrl = 'https://edition.cnn.com/sport'; +const $ = await download(listingUrl); + +const results = await Promise.all( + $('.layout__main .card').toArray().map(async element => { + const $element = $(element); + const $link = $element.find('a').first(); + if (!$link.length) { + return null; + } + + const articleUrl = new URL($link.attr('href'), listingUrl).href; + const $article = await download(articleUrl); + const content = $article('.article__content').text().trim(); + + if (!content) { + return null; + } + + return { url: articleUrl, length: content.length }; + }) +); + +const nonEmpty = results.filter(item => item && item.length > 0); +nonEmpty.sort((a, b) => a.length - b.length); + +if (nonEmpty.length > 0) { + console.log(nonEmpty[0].url); +} diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_f1_drivers.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_f1_drivers.mjs new file mode 100644 index 0000000000..da7d2d5518 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_f1_drivers.mjs @@ -0,0 +1,38 @@ +import { CheerioCrawler } from 'crawlee'; + +const crawler = new CheerioCrawler({ + async requestHandler({ $, request, enqueueLinks, pushData }) { + if (request.label === 'DRIVER') { + const info = {}; + for (const 
itemElement of $('.common-driver-info li').toArray()) { + const name = $(itemElement).find('span').text().trim(); + const value = $(itemElement).find('h4').text().trim(); + info[name] = value; + } + + const detail = {}; + for (const linkElement of $('.driver-detail--cta-group a').toArray()) { + const name = $(linkElement).find('p').text().trim(); + const value = $(linkElement).find('h2').text().trim(); + detail[name] = value; + } + + const dob = info.DOB ?? ''; + const [dobDay = '', dobMonth = '', dobYear = ''] = dob.split('/'); + + await pushData({ + url: request.url, + name: $('h1').text().trim(), + team: detail.Team, + nationality: info.Nationality, + dob: dobYear && dobMonth && dobDay ? `${dobYear}-${dobMonth}-${dobDay}` : null, + instagram_url: $(".common-social-share a[href*='instagram']").attr('href') ?? null, + }); + } else { + await enqueueLinks({ selector: '.teams-driver-item a', label: 'DRIVER' }); + } + }, +}); + +await crawler.run(['https://www.f1academy.com/Racing-Series/Drivers']); +await crawler.exportData('dataset.json'); diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs new file mode 100644 index 0000000000..0d2c9d662c --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs @@ -0,0 +1,32 @@ +import { CheerioCrawler, Request } from 'crawlee'; +import { escape } from 'node:querystring'; + +const crawler = new CheerioCrawler({ + async requestHandler({ $, request, enqueueLinks, pushData, addRequests }) { + if (request.label === 'IMDB') { + const title = $('h1').text().trim(); + const rating = $("[data-testid='hero-rating-bar__aggregate-rating__score']").first().text().trim(); + if (title && rating) { + await pushData({ + url: request.url, + title, + rating, + }); + } + } else if (request.label === 'IMDB_SEARCH') { + await enqueueLinks({ selector: 
'.find-result-item a', label: 'IMDB', limit: 1 }); + } else if (request.label === 'NETFLIX') { + const requests = $("[data-uia='top10-table-row-title'] button").toArray().map(buttonElement => { + const name = $(buttonElement).text().trim(); + const imdbSearchUrl = `https://www.imdb.com/find/?q=${escape(name)}&s=tt&ttype=ft`; + return new Request({ url: imdbSearchUrl, label: 'IMDB_SEARCH' }); + }); + await addRequests(requests); + } else { + throw new Error(`Unexpected request label: ${request.label}`); + } + }, +}); + +await crawler.run(['https://www.netflix.com/tudum/top10']); +await crawler.exportData('dataset.json'); diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/f1academy_drivers.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/f1academy_drivers.mjs new file mode 100644 index 0000000000..c3dee6eb8a --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/f1academy_drivers.mjs @@ -0,0 +1,13 @@ +import * as cheerio from 'cheerio'; + +const url = 'https://www.f1academy.com/Racing-Series/Teams'; +const response = await fetch(url); + +if (!response.ok) { + throw new Error(`HTTP ${response.status}`); +} + +const html = await response.text(); +const $ = cheerio.load(html); + +console.log($('.driver').length); diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_f1_authors.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_f1_authors.mjs new file mode 100644 index 0000000000..6e9be2fae1 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_f1_authors.mjs @@ -0,0 +1,36 @@ +import * as cheerio from 'cheerio'; + +async function download(url) { + const response = await fetch(url); + if (!response.ok) { + throw new Error(`HTTP ${response.status}`); + } + const html = await response.text(); + return cheerio.load(html); +} + +const listingUrl = 'https://www.theguardian.com/sport/formulaone'; 
+const $ = await download(listingUrl); + +const promises = $('#maincontent ul li').toArray().map(async element => { + const $item = $(element); + const $link = $item.find('a').first(); + if (!$link.length) { + return; + } + + const articleUrl = new URL($link.attr('href'), listingUrl).href; + const $article = await download(articleUrl); + + const title = $article('h1').text().trim(); + if (!title) { + return; + } + + const author = $article('a[rel="author"]').first().text().trim(); + const attribution = author || $article('aside address').first().text().trim() || 'null'; + + console.log(`${attribution}: ${title}`); +}); + +await Promise.all(promises); diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_f1_links.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_f1_links.mjs new file mode 100644 index 0000000000..5a38d7dcee --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_f1_links.mjs @@ -0,0 +1,20 @@ +import * as cheerio from 'cheerio'; + +const listingUrl = 'https://www.theguardian.com/sport/formulaone'; +const response = await fetch(listingUrl); + +if (!response.ok) { + throw new Error(`HTTP ${response.status}`); +} + +const html = await response.text(); +const $ = cheerio.load(html); + +for (const element of $('#maincontent ul li').toArray()) { + const $item = $(element); + const $link = $item.find('a').first(); + if ($link.length) { + const url = new URL($link.attr('href'), listingUrl).href; + console.log(url); + } +} diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_f1_titles.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_f1_titles.mjs new file mode 100644 index 0000000000..33486b9cd0 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_f1_titles.mjs @@ -0,0 +1,18 @@ +import * as cheerio from 'cheerio'; + +const url = 
'https://www.theguardian.com/sport/formulaone'; +const response = await fetch(url); + +if (!response.ok) { + throw new Error(`HTTP ${response.status}`); +} + +const html = await response.text(); +const $ = cheerio.load(html); + +for (const element of $('#maincontent ul li h3').toArray()) { + const title = $(element).text().trim(); + if (title) { + console.log(title); + } +} diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_publish_dates.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_publish_dates.mjs new file mode 100644 index 0000000000..ba6d13ba9e --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_publish_dates.mjs @@ -0,0 +1,28 @@ +import * as cheerio from 'cheerio'; + +const url = 'https://www.theguardian.com/sport/formulaone'; +const response = await fetch(url); + +if (!response.ok) { + throw new Error(`HTTP ${response.status}`); +} + +const html = await response.text(); +const $ = cheerio.load(html); + +for (const element of $('#maincontent ul li').toArray()) { + const $article = $(element); + const title = $article.find('h3').text().trim(); + const dateAttr = $article.find('time').attr('datetime'); + + if (!title || !dateAttr) { + continue; + } + + const date = new Date(dateAttr.trim()); + if (Number.isNaN(date.getTime())) { + continue; + } + + console.log(`${title} | ${date.toDateString()}`); +} diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/npm_llm_packages.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/npm_llm_packages.mjs new file mode 100644 index 0000000000..f399ff241a --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/npm_llm_packages.mjs @@ -0,0 +1,61 @@ +import * as cheerio from 'cheerio'; + +async function download(url) { + const response = await fetch(url); + if (!response.ok) { + throw new Error(`HTTP ${response.status}`); + } + const html = await 
response.text(); + return cheerio.load(html); +} + +function parseNumber(text) { + return Number.parseInt(text.replace(/[^0-9]/g, ''), 10); +} + +const listingUrl = 'https://www.npmjs.com/search?page=0&q=keywords%3Allm&sortBy=dependent_count'; +const $ = await download(listingUrl); + +const promises = $('section').toArray().map(async element => { + const $card = $(element); + const $link = $card.find('a').first(); + if (!$link.length) { + return null; + } + + const details = $card + .children() + .first() + .children() + .last() + .text() + .split('•') + .map(item => item.trim()); + + const updatedText = details[2] ?? ''; + const dependentsText = details[3] ?? ''; + const dependents = parseNumber(dependentsText); + + if (updatedText.includes('years ago')) { + const yearsAgo = parseNumber(updatedText); + if (Number.isFinite(yearsAgo) && yearsAgo > 2) { + return null; + } + } + + const name = $link.text().trim(); + const url = new URL($link.attr('href'), listingUrl).href; + const description = $card.find('p').text().trim(); + + const downloadsText = $card + .children() + .last() + .text() + .trim(); + const downloads = parseNumber(downloadsText); + + return { name, url, description, dependents, downloads }; +}); + +const data = (await Promise.all(promises)).filter(item => item); +console.log(data.slice(0, 5)); diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/process_products_json.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/process_products_json.mjs new file mode 100644 index 0000000000..13a68efa5e --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/process_products_json.mjs @@ -0,0 +1,8 @@ +import { readFile } from 'node:fs/promises'; + +const jsonData = await readFile('products.json', 'utf8'); +const data = JSON.parse(jsonData); + +data + .filter(row => row.minPrice > 50000) + .forEach(row => console.log(row)); diff --git 
a/sources/academy/webscraping/scraping_basics_javascript/exercises/products.json b/sources/academy/webscraping/scraping_basics_javascript/exercises/products.json new file mode 100644 index 0000000000..8e067ca9f2 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/products.json @@ -0,0 +1,12 @@ +[ + { + "title": "Premium Speakers", + "minPrice": 75000, + "price": 75000 + }, + { + "title": "Budget Headphones", + "minPrice": 25000, + "price": 25000 + } +] diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats index dc01a1b245..aae0b8ecab 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats @@ -1,9 +1,112 @@ +setup() { + cd "$BATS_TEST_DIRNAME" +} + @test "outputs the HTML with Star Wars products" { - run npx node "$BATS_TEST_DIRNAME/lego.mjs" + run npx --yes node lego.mjs [[ "$output" == *"Millennium Falcon"* ]] } -@test "outputs the number of F1 Academy teams" { - run npx --package=cheerio node "$BATS_TEST_DIRNAME/f1academy_teams.mjs" - [[ "$output" == "6" ]] +@test "counts the number of F1 Academy teams" { + run npx --yes --package=cheerio node f1academy_teams.mjs + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "counts the number of F1 Academy drivers" { + run npx --yes --package=cheerio node f1academy_drivers.mjs + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "lists African countries" { + run npx --yes --package=cheerio node wikipedia_countries.mjs + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "lists African countries with a single selector" { + run npx --yes --package=cheerio node wikipedia_countries_single_selector.mjs + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "lists Guardian F1 article titles" { + run npx --yes --package=cheerio node guardian_f1_titles.mjs + (( status == 0 )) + 
[[ -n "$output" ]] +} + +@test "prints warehouse stock counts" { + run npx --yes --package=cheerio node warehouse_units.mjs + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "prints warehouse stock counts using regex" { + run npx --yes --package=cheerio node warehouse_units_regex.mjs + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "prints Guardian F1 titles with publish dates" { + run npx --yes --package=cheerio node guardian_publish_dates.mjs + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "filters products from JSON" { + run npx --yes node process_products_json.mjs + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "lists Wikipedia country links" { + run npx --yes --package=cheerio node wikipedia_country_links.mjs + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "lists Guardian F1 article links" { + run npx --yes --package=cheerio node guardian_f1_links.mjs + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "prints Wikipedia calling codes" { + run npx --yes --package=cheerio node wikipedia_calling_codes.mjs + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "lists Guardian F1 authors" { + run npx --yes --package=cheerio node guardian_f1_authors.mjs + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "lists npm LLM packages" { + run npx --yes --package=cheerio node npm_llm_packages.mjs + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "finds the shortest CNN sports article" { + run npx --yes --package=cheerio node cnn_sports_shortest_article.mjs + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "scrapes F1 Academy driver details with Crawlee" { + run npx --yes --package=crawlee --package=cheerio node crawlee_f1_drivers.mjs + (( status == 0 )) + [[ -n "$output" || -f dataset.json ]] + rm -f dataset.json +} + +@test "scrapes Netflix ratings with Crawlee" { + run npx --yes --package=crawlee --package=cheerio node crawlee_netflix_ratings.mjs + (( status == 0 )) + [[ -n "$output" || -f dataset.json ]] + rm -f dataset.json } diff --git 
a/sources/academy/webscraping/scraping_basics_javascript/exercises/warehouse_units.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/warehouse_units.mjs new file mode 100644 index 0000000000..d28745259a --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/warehouse_units.mjs @@ -0,0 +1,36 @@ +import * as cheerio from 'cheerio'; + +function parseUnitsText(text) { + const count = text + .replace('In stock,', '') + .replace('Only', '') + .replace(' left', '') + .replace('units', '') + .trim(); + if (count === 'Sold out') { + return 0; + } + return Number.parseInt(count, 10); +} + +const url = 'https://warehouse-theme-metal.myshopify.com/collections/sales'; +const response = await fetch(url); + +if (!response.ok) { + throw new Error(`HTTP ${response.status}`); +} + +const html = await response.text(); +const $ = cheerio.load(html); + +for (const element of $('.product-item').toArray()) { + const $productItem = $(element); + + const $title = $productItem.find('.product-item__title'); + const title = $title.text().trim(); + + const unitsText = $productItem.find('.product-item__inventory').text(); + const unitsCount = parseUnitsText(unitsText); + + console.log(`${title} | ${unitsCount}`); +} diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/warehouse_units_regex.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/warehouse_units_regex.mjs new file mode 100644 index 0000000000..91ab01f234 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/warehouse_units_regex.mjs @@ -0,0 +1,31 @@ +import * as cheerio from 'cheerio'; + +function parseUnitsText(text) { + const match = text.match(/\d+/); + if (match) { + return Number.parseInt(match[0], 10); + } + return 0; +} + +const url = 'https://warehouse-theme-metal.myshopify.com/collections/sales'; +const response = await fetch(url); + +if (!response.ok) { + throw new Error(`HTTP ${response.status}`); 
+} + +const html = await response.text(); +const $ = cheerio.load(html); + +for (const element of $('.product-item').toArray()) { + const $productItem = $(element); + + const $title = $productItem.find('.product-item__title'); + const title = $title.text().trim(); + + const unitsText = $productItem.find('.product-item__inventory').text(); + const unitsCount = parseUnitsText(unitsText); + + console.log(`${title} | ${unitsCount}`); +} diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_calling_codes.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_calling_codes.mjs new file mode 100644 index 0000000000..ec15940234 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_calling_codes.mjs @@ -0,0 +1,36 @@ +import * as cheerio from 'cheerio'; + +async function download(url) { + const response = await fetch(url); + if (!response.ok) { + throw new Error(`HTTP ${response.status}`); + } + const html = await response.text(); + return cheerio.load(html); +} + +const listingUrl = 'https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa'; +const $ = await download(listingUrl); + +const cells = $('.wikitable tr td:nth-child(3)'); +const promises = cells.toArray().map(async element => { + const $nameCell = $(element); + const $link = $nameCell.find('a').first(); + if (!$link.length) { + return; + } + + const countryUrl = new URL($link.attr('href'), listingUrl).href; + const $country = await download(countryUrl); + const $label = $country('th.infobox-label') + .filter((_, el) => $country(el).text().trim() === 'Calling code') + .first(); + + const callingCode = $label.length + ? 
$label.parent().find('td.infobox-data').first().text().trim() + : ''; + + console.log(`${countryUrl} ${callingCode || null}`); +}); + +await Promise.all(promises); diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries.mjs new file mode 100644 index 0000000000..fd9a7f2fb9 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries.mjs @@ -0,0 +1,29 @@ +import * as cheerio from 'cheerio'; + +const url = 'https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa'; +const response = await fetch(url); + +if (!response.ok) { + throw new Error(`HTTP ${response.status}`); +} + +const html = await response.text(); +const $ = cheerio.load(html); + +for (const tableElement of $('.wikitable').toArray()) { + const $table = $(tableElement); + const rows = $table.find('tr'); + + for (const rowElement of rows.toArray()) { + const $row = $(rowElement); + const cells = $row.find('td'); + + if (cells.length > 0) { + const $thirdColumn = $(cells[2]); + const $link = $thirdColumn.find('a').first(); + if ($link.length) { + console.log($link.text()); + } + } + } +} diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries_single_selector.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries_single_selector.mjs new file mode 100644 index 0000000000..06f54d0686 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries_single_selector.mjs @@ -0,0 +1,19 @@ +import * as cheerio from 'cheerio'; + +const url = 'https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa'; +const response = await fetch(url); + +if (!response.ok) { + throw new Error(`HTTP ${response.status}`); +} + +const html = await response.text(); +const $ = 
cheerio.load(html); + +for (const element of $('.wikitable tr td:nth-child(3)').toArray()) { + const $nameCell = $(element); + const $link = $nameCell.find('a').first(); + if ($link.length) { + console.log($link.text()); + } +} diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_country_links.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_country_links.mjs new file mode 100644 index 0000000000..53c95f8d4d --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_country_links.mjs @@ -0,0 +1,20 @@ +import * as cheerio from 'cheerio'; + +const listingUrl = 'https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa'; +const response = await fetch(listingUrl); + +if (!response.ok) { + throw new Error(`HTTP ${response.status}`); +} + +const html = await response.text(); +const $ = cheerio.load(html); + +for (const element of $('.wikitable tr td:nth-child(3)').toArray()) { + const $nameCell = $(element); + const $link = $nameCell.find('a').first(); + if ($link.length) { + const url = new URL($link.attr('href'), listingUrl).href; + console.log(url); + } +} diff --git a/sources/academy/webscraping/scraping_basics_python/05_parsing_html.md b/sources/academy/webscraping/scraping_basics_python/05_parsing_html.md index dbfa52cb9a..117b694217 100644 --- a/sources/academy/webscraping/scraping_basics_python/05_parsing_html.md +++ b/sources/academy/webscraping/scraping_basics_python/05_parsing_html.md @@ -5,7 +5,10 @@ description: Lesson about building a Python application for watching prices. 
Usi slug: /scraping-basics-python/parsing-html --- +import CodeBlock from '@theme/CodeBlock'; import Exercises from '../scraping_basics/_exercises.mdx'; +import F1AcademyTeamsExercise from '!!raw-loader!roa-loader!./exercises/f1academy_teams.py'; +import F1AcademyDriversExercise from '!!raw-loader!roa-loader!./exercises/f1academy_drivers.py'; **In this lesson we'll look for products in the downloaded HTML. We'll use BeautifulSoup to turn the HTML into objects which we can work with in our Python program.** @@ -131,18 +134,7 @@ https://www.f1academy.com/Racing-Series/Teams
Solution - ```py - import httpx - from bs4 import BeautifulSoup - - url = "https://www.f1academy.com/Racing-Series/Teams" - response = httpx.get(url) - response.raise_for_status() - - html_code = response.text - soup = BeautifulSoup(html_code, "html.parser") - print(len(soup.select(".teams-driver-item"))) - ``` + {F1AcademyTeamsExercise.code}
@@ -153,17 +145,6 @@ Use the same URL as in the previous exercise, but this time print a total count
Solution - ```py - import httpx - from bs4 import BeautifulSoup - - url = "https://www.f1academy.com/Racing-Series/Teams" - response = httpx.get(url) - response.raise_for_status() - - html_code = response.text - soup = BeautifulSoup(html_code, "html.parser") - print(len(soup.select(".driver"))) - ``` + {F1AcademyDriversExercise.code}
diff --git a/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md b/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md index 0708dc071e..2fa83587e4 100644 --- a/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md +++ b/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md @@ -5,7 +5,11 @@ description: Lesson about building a Python application for watching prices. Usi slug: /scraping-basics-python/locating-elements --- +import CodeBlock from '@theme/CodeBlock'; import Exercises from '../scraping_basics/_exercises.mdx'; +import WikipediaCountriesExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_countries.py'; +import WikipediaCountriesSingleSelectorExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_countries_single_selector.py'; +import GuardianF1TitlesExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_titles.py'; **In this lesson we'll locate product data in the downloaded HTML. We'll use BeautifulSoup to find those HTML elements which contain details about each product, such as title or price.** @@ -244,25 +248,7 @@ Djibouti
Solution - ```py - import httpx - from bs4 import BeautifulSoup - - url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa" - response = httpx.get(url) - response.raise_for_status() - - html_code = response.text - soup = BeautifulSoup(html_code, "html.parser") - - for table in soup.select(".wikitable"): - for row in table.select("tr"): - cells = row.select("td") - if cells: - third_column = cells[2] - title_link = third_column.select_one("a") - print(title_link.text) - ``` + {WikipediaCountriesExercise.code} Because some rows contain [table headers](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/th), we skip processing a row if `table_row.select("td")` doesn't find any [table data](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td) cells. @@ -284,20 +270,7 @@ You may want to check out the following pages:
Solution - ```py - import httpx - from bs4 import BeautifulSoup - - url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa" - response = httpx.get(url) - response.raise_for_status() - - html_code = response.text - soup = BeautifulSoup(html_code, "html.parser") - - for name_cell in soup.select(".wikitable tr td:nth-child(3)"): - print(name_cell.select_one("a").text) - ``` + {WikipediaCountriesSingleSelectorExercise.code}
@@ -321,19 +294,6 @@ Max Verstappen wins Canadian Grand Prix: F1 – as it happened
Solution - ```py - import httpx - from bs4 import BeautifulSoup - - url = "https://www.theguardian.com/sport/formulaone" - response = httpx.get(url) - response.raise_for_status() - - html_code = response.text - soup = BeautifulSoup(html_code, "html.parser") - - for title in soup.select("#maincontent ul li h3"): - print(title.text) - ``` + {GuardianF1TitlesExercise.code}
diff --git a/sources/academy/webscraping/scraping_basics_python/07_extracting_data.md b/sources/academy/webscraping/scraping_basics_python/07_extracting_data.md index eb49b7ce69..a50ac3db33 100644 --- a/sources/academy/webscraping/scraping_basics_python/07_extracting_data.md +++ b/sources/academy/webscraping/scraping_basics_python/07_extracting_data.md @@ -5,7 +5,11 @@ description: Lesson about building a Python application for watching prices. Usi slug: /scraping-basics-python/extracting-data --- +import CodeBlock from '@theme/CodeBlock'; import Exercises from '../scraping_basics/_exercises.mdx'; +import WarehouseUnitsExercise from '!!raw-loader!roa-loader!./exercises/warehouse_units.py'; +import WarehouseUnitsRegexExercise from '!!raw-loader!roa-loader!./exercises/warehouse_units_regex.py'; +import GuardianPublishDatesExercise from '!!raw-loader!roa-loader!./exercises/guardian_publish_dates.py'; **In this lesson we'll finish extracting product data from the downloaded HTML. With help of basic string manipulation we'll focus on cleaning and correctly representing the product price.** @@ -241,37 +245,7 @@ Denon AH-C720 In-Ear Headphones | 236
Solution - ```py - import httpx - from bs4 import BeautifulSoup - - url = "https://warehouse-theme-metal.myshopify.com/collections/sales" - response = httpx.get(url) - response.raise_for_status() - - html_code = response.text - soup = BeautifulSoup(html_code, "html.parser") - - for product in soup.select(".product-item"): - title = product.select_one(".product-item__title").text.strip() - - units_text = ( - product - .select_one(".product-item__inventory") - .text - .removeprefix("In stock,") - .removeprefix("Only") - .removesuffix(" left") - .removesuffix("units") - .strip() - ) - if "Sold out" in units_text: - units = 0 - else: - units = int(units_text) - - print(title, units, sep=" | ") - ``` + {WarehouseUnitsExercise.code}
@@ -282,29 +256,7 @@ Simplify the code from previous exercise. Use [regular expressions](https://docs
Solution - ```py - import re - import httpx - from bs4 import BeautifulSoup - - url = "https://warehouse-theme-metal.myshopify.com/collections/sales" - response = httpx.get(url) - response.raise_for_status() - - html_code = response.text - soup = BeautifulSoup(html_code, "html.parser") - - for product in soup.select(".product-item"): - title = product.select_one(".product-item__title").text.strip() - - units_text = product.select_one(".product-item__inventory").text - if re_match := re.search(r"\d+", units_text): - units = int(re_match.group()) - else: - units = 0 - - print(title, units, sep=" | ") - ``` + {WarehouseUnitsRegexExercise.code}
@@ -338,25 +290,6 @@ Hamilton reveals distress over ‘devastating’ groundhog accident at Canadian
Solution - ```py - import httpx - from bs4 import BeautifulSoup - from datetime import datetime - - url = "https://www.theguardian.com/sport/formulaone" - response = httpx.get(url) - response.raise_for_status() - - html_code = response.text - soup = BeautifulSoup(html_code, "html.parser") - - for article in soup.select("#maincontent ul li"): - title = article.select_one("h3").text.strip() - - date_iso = article.select_one("time")["datetime"].strip() - date = datetime.fromisoformat(date_iso) - - print(title, date.strftime('%a %b %d %Y'), sep=" | ") - ``` + {GuardianPublishDatesExercise.code}
diff --git a/sources/academy/webscraping/scraping_basics_python/08_saving_data.md b/sources/academy/webscraping/scraping_basics_python/08_saving_data.md index a0d6d94743..4d5f31bf5f 100644 --- a/sources/academy/webscraping/scraping_basics_python/08_saving_data.md +++ b/sources/academy/webscraping/scraping_basics_python/08_saving_data.md @@ -5,6 +5,9 @@ description: Lesson about building a Python application for watching prices. Usi slug: /scraping-basics-python/saving-data --- +import CodeBlock from '@theme/CodeBlock'; +import ProcessProductsJsonExercise from '!!raw-loader!roa-loader!./exercises/process_products_json.py'; + **In this lesson, we'll save the data we scraped in the popular formats, such as CSV or JSON. We'll use Python's standard library to export the files.** --- @@ -191,17 +194,7 @@ Write a new Python program that reads the `products.json` file we created in thi
Solution - ```py - import json - from pprint import pp - - with open("products.json", "r") as file: - products = json.load(file) - - for product in products: - if int(product["min_price"]) > 500: - pp(product) - ``` + {ProcessProductsJsonExercise.code}
diff --git a/sources/academy/webscraping/scraping_basics_python/09_getting_links.md b/sources/academy/webscraping/scraping_basics_python/09_getting_links.md index 883ba050f3..3d1bfd91d1 100644 --- a/sources/academy/webscraping/scraping_basics_python/09_getting_links.md +++ b/sources/academy/webscraping/scraping_basics_python/09_getting_links.md @@ -5,7 +5,10 @@ description: Lesson about building a Python application for watching prices. Usi slug: /scraping-basics-python/getting-links --- +import CodeBlock from '@theme/CodeBlock'; import Exercises from '../scraping_basics/_exercises.mdx'; +import WikipediaCountryLinksExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_country_links.py'; +import GuardianF1LinksExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_links.py'; **In this lesson, we'll locate and extract links to individual product pages. We'll use BeautifulSoup to find the relevant bits of HTML.** @@ -345,23 +348,7 @@ https://en.wikipedia.org/wiki/Botswana
Solution - ```py - import httpx - from bs4 import BeautifulSoup - from urllib.parse import urljoin - - listing_url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa" - response = httpx.get(listing_url) - response.raise_for_status() - - html_code = response.text - soup = BeautifulSoup(html_code, "html.parser") - - for name_cell in soup.select(".wikitable tr td:nth-child(3)"): - link = name_cell.select_one("a") - url = urljoin(listing_url, link["href"]) - print(url) - ``` + {WikipediaCountryLinksExercise.code}
@@ -386,23 +373,7 @@ https://www.theguardian.com/sport/article/2024/sep/02/max-verstappen-damns-his-u
Solution - ```py - import httpx - from bs4 import BeautifulSoup - from urllib.parse import urljoin - - listing_url = "https://www.theguardian.com/sport/formulaone" - response = httpx.get(listing_url) - response.raise_for_status() - - html_code = response.text - soup = BeautifulSoup(html_code, "html.parser") - - for item in soup.select("#maincontent ul li"): - link = item.select_one("a") - url = urljoin(listing_url, link["href"]) - print(url) - ``` + {GuardianF1LinksExercise.code} Note that some cards contain two links. One leads to the article, and one to the comments. If we selected all the links in the list by `#maincontent ul li a`, we would get incorrect output like this: diff --git a/sources/academy/webscraping/scraping_basics_python/10_crawling.md b/sources/academy/webscraping/scraping_basics_python/10_crawling.md index 836dadad3a..6d0b554405 100644 --- a/sources/academy/webscraping/scraping_basics_python/10_crawling.md +++ b/sources/academy/webscraping/scraping_basics_python/10_crawling.md @@ -5,7 +5,10 @@ description: Lesson about building a Python application for watching prices. Usi slug: /scraping-basics-python/crawling --- +import CodeBlock from '@theme/CodeBlock'; import Exercises from '../scraping_basics/_exercises.mdx'; +import WikipediaCallingCodesExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_calling_codes.py'; +import GuardianF1AuthorsExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_authors.py'; **In this lesson, we'll follow links to individual product pages. We'll use HTTPX to download them and BeautifulSoup to process them.** @@ -210,32 +213,7 @@ Locating cells in tables is sometimes easier if you know how to [navigate up](ht
Solution - ```py - import httpx - from bs4 import BeautifulSoup - from urllib.parse import urljoin - - def download(url): - response = httpx.get(url) - response.raise_for_status() - return BeautifulSoup(response.text, "html.parser") - - def parse_calling_code(soup): - for label in soup.select("th.infobox-label"): - if label.text.strip() == "Calling code": - data = label.parent.select_one("td.infobox-data") - return data.text.strip() - return None - - listing_url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa" - listing_soup = download(listing_url) - for name_cell in listing_soup.select(".wikitable tr td:nth-child(3)"): - link = name_cell.select_one("a") - country_url = urljoin(listing_url, link["href"]) - country_soup = download(country_url) - calling_code = parse_calling_code(country_soup) - print(country_url, calling_code) - ``` + {WikipediaCallingCodesExercise.code}
@@ -268,34 +246,6 @@ PA Media: Lewis Hamilton reveals lifelong battle with depression after school bu
Solution - ```py - import httpx - from bs4 import BeautifulSoup - from urllib.parse import urljoin - - def download(url): - response = httpx.get(url) - response.raise_for_status() - return BeautifulSoup(response.text, "html.parser") - - def parse_author(article_soup): - link = article_soup.select_one('a[rel="author"]') - if link: - return link.text.strip() - address = article_soup.select_one('aside address') - if address: - return address.text.strip() - return None - - listing_url = "https://www.theguardian.com/sport/formulaone" - listing_soup = download(listing_url) - for item in listing_soup.select("#maincontent ul li"): - link = item.select_one("a") - article_url = urljoin(listing_url, link["href"]) - article_soup = download(article_url) - title = article_soup.select_one("h1").text.strip() - author = parse_author(article_soup) - print(f"{author}: {title}") - ``` + {GuardianF1AuthorsExercise.code}
diff --git a/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md index e47affbaec..463a217874 100644 --- a/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md +++ b/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md @@ -5,7 +5,10 @@ description: Lesson about building a Python application for watching prices. Usi slug: /scraping-basics-python/scraping-variants --- +import CodeBlock from '@theme/CodeBlock'; import Exercises from '../scraping_basics/_exercises.mdx'; +import PythonJobsDatabaseExercise from '!!raw-loader!roa-loader!./exercises/python_jobs_database.py'; +import CnnSportsShortestArticleExercise from '!!raw-loader!roa-loader!./exercises/cnn_sports_shortest_article.py'; **In this lesson, we'll scrape the product detail pages to represent each product variant as a separate item in our dataset.** @@ -342,33 +345,7 @@ You can find everything you need for working with dates and times in Python's [` After inspecting the job board, you'll notice that job postings tagged as "Database" have a dedicated URL. We'll use that as our starting point, which saves us from having to scrape and check the tags manually. 
- ```py - from pprint import pp - import httpx - from bs4 import BeautifulSoup - from urllib.parse import urljoin - from datetime import datetime, date, timedelta - - today = date.today() - jobs_url = "https://www.python.org/jobs/type/database/" - response = httpx.get(jobs_url) - response.raise_for_status() - soup = BeautifulSoup(response.text, "html.parser") - - for job in soup.select(".list-recent-jobs li"): - link = job.select_one(".listing-company-name a") - - time = job.select_one(".listing-posted time") - posted_at = datetime.fromisoformat(time["datetime"]) - posted_on = posted_at.date() - posted_ago = today - posted_on - - if posted_ago <= timedelta(days=60): - title = link.text.strip() - company = list(job.select_one(".listing-company-name").stripped_strings)[-1] - url = urljoin(jobs_url, link["href"]) - pp({"title": title, "company": company, "url": url, "posted_on": posted_on}) - ``` + {PythonJobsDatabaseExercise.code}
@@ -387,32 +364,6 @@ At the time of writing, the shortest article on the CNN Sports homepage is [abou
Solution - ```py - import httpx - from bs4 import BeautifulSoup - from urllib.parse import urljoin - - def download(url): - response = httpx.get(url) - response.raise_for_status() - return BeautifulSoup(response.text, "html.parser") - - listing_url = "https://edition.cnn.com/sport" - listing_soup = download(listing_url) - - data = [] - for card in listing_soup.select(".layout__main .card"): - link = card.select_one(".container__link") - article_url = urljoin(listing_url, link["href"]) - article_soup = download(article_url) - if content := article_soup.select_one(".article__content"): - length = len(content.get_text()) - data.append((length, article_url)) - - data.sort() - shortest_item = data[0] - item_url = shortest_item[1] - print(item_url) - ``` + {CnnSportsShortestArticleExercise.code}
diff --git a/sources/academy/webscraping/scraping_basics_python/12_framework.md b/sources/academy/webscraping/scraping_basics_python/12_framework.md index 6f8861785d..579d7e9ced 100644 --- a/sources/academy/webscraping/scraping_basics_python/12_framework.md +++ b/sources/academy/webscraping/scraping_basics_python/12_framework.md @@ -5,7 +5,10 @@ description: Lesson about building a Python application for watching prices. Usi slug: /scraping-basics-python/framework --- +import CodeBlock from '@theme/CodeBlock'; import Exercises from '../scraping_basics/_exercises.mdx'; +import CrawleeF1DriversExercise from '!!raw-loader!roa-loader!./exercises/crawlee_f1_drivers.py'; +import CrawleeNetflixRatingsExercise from '!!raw-loader!roa-loader!./exercises/crawlee_netflix_ratings.py'; **In this lesson, we'll rework our application for watching prices so that it builds on top of a scraping framework. We'll use Crawlee to make the program simpler, faster, and more robust.** @@ -463,48 +466,7 @@ If you export the dataset as JSON, it should look something like this:
Solution - ```py - import asyncio - from datetime import datetime - - from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext - - async def main(): - crawler = BeautifulSoupCrawler() - - @crawler.router.default_handler - async def handle_listing(context: BeautifulSoupCrawlingContext): - await context.enqueue_links(selector=".teams-driver-item a", label="DRIVER") - - @crawler.router.handler("DRIVER") - async def handle_driver(context: BeautifulSoupCrawlingContext): - info = {} - for row in context.soup.select(".common-driver-info li"): - name = row.select_one("span").text.strip() - value = row.select_one("h4").text.strip() - info[name] = value - - detail = {} - for row in context.soup.select(".driver-detail--cta-group a"): - name = row.select_one("p").text.strip() - value = row.select_one("h2").text.strip() - detail[name] = value - - await context.push_data({ - "url": context.request.url, - "name": context.soup.select_one("h1").text.strip(), - "team": detail["Team"], - "nationality": info["Nationality"], - "dob": datetime.strptime(info["DOB"], "%d/%m/%Y").date(), - "instagram_url": context.soup.select_one(".common-social-share a[href*='instagram']").get("href"), - }) - - await crawler.run(["https://www.f1academy.com/Racing-Series/Drivers"]) - await crawler.export_data_json(path='dataset.json', ensure_ascii=False, indent=2) - - if __name__ == '__main__': - asyncio.run(main()) - ``` + {CrawleeF1DriversExercise.code}
@@ -564,44 +526,6 @@ When navigating to the first IMDb search result, you might find it helpful to kn
Solution - ```py - import asyncio - from urllib.parse import quote_plus - - from crawlee import Request - from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext - - async def main(): - crawler = BeautifulSoupCrawler() - - @crawler.router.default_handler - async def handle_netflix_table(context: BeautifulSoupCrawlingContext): - requests = [] - for name_cell in context.soup.select('[data-uia="top10-table-row-title"] button'): - name = name_cell.text.strip() - imdb_search_url = f"https://www.imdb.com/find/?q={quote_plus(name)}&s=tt&ttype=ft" - requests.append(Request.from_url(imdb_search_url, label="IMDB_SEARCH")) - await context.add_requests(requests) - - @crawler.router.handler("IMDB_SEARCH") - async def handle_imdb_search(context: BeautifulSoupCrawlingContext): - await context.enqueue_links(selector=".find-result-item a", label="IMDB", limit=1) - - @crawler.router.handler("IMDB") - async def handle_imdb(context: BeautifulSoupCrawlingContext): - rating_selector = "[data-testid='hero-rating-bar__aggregate-rating__score']" - rating_text = context.soup.select_one(rating_selector).text.strip() - await context.push_data({ - "url": context.request.url, - "title": context.soup.select_one("h1").text.strip(), - "rating": rating_text, - }) - - await crawler.run(["https://www.netflix.com/tudum/top10"]) - await crawler.export_data_json(path='dataset.json', ensure_ascii=False, indent=2) - - if __name__ == '__main__': - asyncio.run(main()) - ``` + {CrawleeNetflixRatingsExercise.code}
diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/cnn_sports_shortest_article.py b/sources/academy/webscraping/scraping_basics_python/exercises/cnn_sports_shortest_article.py new file mode 100644 index 0000000000..bf8c03f07b --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/cnn_sports_shortest_article.py @@ -0,0 +1,32 @@ +import httpx +from bs4 import BeautifulSoup +from urllib.parse import urljoin + + +def download(url: str) -> BeautifulSoup: + response = httpx.get(url) + response.raise_for_status() + return BeautifulSoup(response.text, "html.parser") + + +listing_url = "https://edition.cnn.com/sport" +listing_soup = download(listing_url) + +results: list[tuple[int, str]] = [] +for card in listing_soup.select('.layout__main .card'): + link = card.select_one('.container__link') + if not link or 'href' not in link.attrs: + continue + + article_url = urljoin(listing_url, link['href']) + article_soup = download(article_url) + content = article_soup.select_one('.article__content') + + if not content: + continue + + results.append((len(content.get_text()), article_url)) + +results.sort() +if results: + print(results[0][1]) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_f1_drivers.py b/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_f1_drivers.py new file mode 100644 index 0000000000..83a854ab14 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_f1_drivers.py @@ -0,0 +1,54 @@ +import asyncio +from datetime import datetime + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +def format_dob(value: str) -> str | None: + try: + return datetime.strptime(value, "%d/%m/%Y").date().isoformat() + except ValueError: + return None + + +async def main() -> None: + crawler = BeautifulSoupCrawler() + + @crawler.router.default_handler + async def handle_listing(context: BeautifulSoupCrawlingContext) -> 
None: + await context.enqueue_links(selector=".teams-driver-item a", label="DRIVER") + + @crawler.router.handler("DRIVER") + async def handle_driver(context: BeautifulSoupCrawlingContext) -> None: + info: dict[str, str] = {} + for row in context.soup.select(".common-driver-info li"): + name = row.select_one("span").text.strip() + value = row.select_one("h4").text.strip() + info[name] = value + + detail: dict[str, str] = {} + for row in context.soup.select(".driver-detail--cta-group a"): + name = row.select_one("p").text.strip() + value = row.select_one("h2").text.strip() + detail[name] = value + + title_tag = context.soup.select_one("h1") + instagram_link = context.soup.select_one(".common-social-share a[href*='instagram']") + + await context.push_data( + { + "url": context.request.url, + "name": title_tag.text.strip() if title_tag else None, + "team": detail.get("Team"), + "nationality": info.get("Nationality"), + "dob": format_dob(info.get("DOB", "")), + "instagram_url": instagram_link.get("href") if instagram_link else None, + } + ) + + await crawler.run(["https://www.f1academy.com/Racing-Series/Drivers"]) + await crawler.export_data_json(path="dataset.json", ensure_ascii=False, indent=2) # type: ignore[attr-defined] + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py b/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py new file mode 100644 index 0000000000..548c54435a --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py @@ -0,0 +1,46 @@ +import asyncio +from urllib.parse import quote_plus + +from crawlee import Request +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + +async def main() -> None: + crawler = BeautifulSoupCrawler() + + @crawler.router.default_handler + async def handle_netflix_table(context: BeautifulSoupCrawlingContext) 
-> None: + requests: list[Request] = [] + for name_cell in context.soup.select('[data-uia="top10-table-row-title"] button'): + name = name_cell.text.strip() + imdb_search_url = ( + f"https://www.imdb.com/find/?q={quote_plus(name)}&s=tt&ttype=ft" + ) + requests.append(Request.from_url(imdb_search_url, label="IMDB_SEARCH")) + await context.add_requests(requests) + + @crawler.router.handler("IMDB_SEARCH") + async def handle_imdb_search(context: BeautifulSoupCrawlingContext) -> None: + await context.enqueue_links(selector=".find-result-item a", label="IMDB", limit=1) + + @crawler.router.handler("IMDB") + async def handle_imdb(context: BeautifulSoupCrawlingContext) -> None: + rating_element = context.soup.select_one( + "[data-testid='hero-rating-bar__aggregate-rating__score']" + ) + title_element = context.soup.select_one("h1") + if rating_element and title_element: + await context.push_data( + { + "url": context.request.url, + "title": title_element.text.strip(), + "rating": rating_element.text.strip(), + } + ) + + await crawler.run(["https://www.netflix.com/tudum/top10"]) + await crawler.export_data_json(path="dataset.json", ensure_ascii=False, indent=2) # type: ignore[attr-defined] + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/f1academy_drivers.py b/sources/academy/webscraping/scraping_basics_python/exercises/f1academy_drivers.py new file mode 100644 index 0000000000..ec11e4ab82 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/f1academy_drivers.py @@ -0,0 +1,9 @@ +import httpx +from bs4 import BeautifulSoup + +url = "https://www.f1academy.com/Racing-Series/Teams" +response = httpx.get(url) +response.raise_for_status() + +soup = BeautifulSoup(response.text, "html.parser") +print(len(soup.select('.driver'))) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/f1academy_teams.py 
b/sources/academy/webscraping/scraping_basics_python/exercises/f1academy_teams.py new file mode 100644 index 0000000000..d9b53941ac --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/f1academy_teams.py @@ -0,0 +1,9 @@ +import httpx +from bs4 import BeautifulSoup + +url = "https://www.f1academy.com/Racing-Series/Teams" +response = httpx.get(url) +response.raise_for_status() + +soup = BeautifulSoup(response.text, "html.parser") +print(len(soup.select('.teams-driver-item'))) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/guardian_f1_authors.py b/sources/academy/webscraping/scraping_basics_python/exercises/guardian_f1_authors.py new file mode 100644 index 0000000000..4f1bc2664a --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/guardian_f1_authors.py @@ -0,0 +1,36 @@ +import httpx +from bs4 import BeautifulSoup +from urllib.parse import urljoin + + +def download(url: str) -> BeautifulSoup: + response = httpx.get(url) + response.raise_for_status() + return BeautifulSoup(response.text, "html.parser") + + +def parse_author(article_soup: BeautifulSoup) -> str | None: + link = article_soup.select_one('a[rel="author"]') + if link: + return link.text.strip() + address = article_soup.select_one('aside address') + if address: + return address.text.strip() + return None + + +listing_url = "https://www.theguardian.com/sport/formulaone" +listing_soup = download(listing_url) + +for item in listing_soup.select('#maincontent ul li'): + link = item.select_one('a') + if not link or 'href' not in link.attrs: + continue + + article_url = urljoin(listing_url, link['href']) + article_soup = download(article_url) + + title = article_soup.select_one('h1').text.strip() + author = parse_author(article_soup) + + print(f"{author}: {title}") diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/guardian_f1_links.py 
b/sources/academy/webscraping/scraping_basics_python/exercises/guardian_f1_links.py new file mode 100644 index 0000000000..71cec59bd7 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/guardian_f1_links.py @@ -0,0 +1,15 @@ +import httpx +from bs4 import BeautifulSoup +from urllib.parse import urljoin + +listing_url = "https://www.theguardian.com/sport/formulaone" +response = httpx.get(listing_url) +response.raise_for_status() + +soup = BeautifulSoup(response.text, "html.parser") + +for item in soup.select('#maincontent ul li'): + link = item.select_one('a') + if link and 'href' in link.attrs: + url = urljoin(listing_url, link['href']) + print(url) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/guardian_f1_titles.py b/sources/academy/webscraping/scraping_basics_python/exercises/guardian_f1_titles.py new file mode 100644 index 0000000000..b90d0f48c0 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/guardian_f1_titles.py @@ -0,0 +1,11 @@ +import httpx +from bs4 import BeautifulSoup + +url = "https://www.theguardian.com/sport/formulaone" +response = httpx.get(url) +response.raise_for_status() + +soup = BeautifulSoup(response.text, "html.parser") + +for title in soup.select("#maincontent ul li h3"): + print(title.text) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/guardian_publish_dates.py b/sources/academy/webscraping/scraping_basics_python/exercises/guardian_publish_dates.py new file mode 100644 index 0000000000..b60c291b52 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/guardian_publish_dates.py @@ -0,0 +1,23 @@ +from datetime import datetime + +import httpx +from bs4 import BeautifulSoup + +url = "https://www.theguardian.com/sport/formulaone" +response = httpx.get(url) +response.raise_for_status() + +soup = BeautifulSoup(response.text, "html.parser") + +for article in soup.select('#maincontent ul li'): + title_tag = 
article.select_one('h3') + time_tag = article.select_one('time') + + if not title_tag or not time_tag or 'datetime' not in time_tag.attrs: + continue + + title = title_tag.text.strip() + date_iso = time_tag['datetime'].strip() + date = datetime.fromisoformat(date_iso) + + print(f"{title} | {date.strftime('%a %b %d %Y')}") diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/process_products_json.py b/sources/academy/webscraping/scraping_basics_python/exercises/process_products_json.py new file mode 100644 index 0000000000..934d347b8a --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/process_products_json.py @@ -0,0 +1,9 @@ +import json +from pprint import pp + +with open('products.json', 'r', encoding='utf-8') as file: + products = json.load(file) + +for product in products: + if int(product['min_price']) > 500: + pp(product) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/products.json b/sources/academy/webscraping/scraping_basics_python/exercises/products.json new file mode 100644 index 0000000000..d4adea213c --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/products.json @@ -0,0 +1,12 @@ +[ + { + "title": "Premium Speakers", + "min_price": "750", + "price": "750" + }, + { + "title": "Budget Headphones", + "min_price": "250", + "price": "250" + } +] diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/python_jobs_database.py b/sources/academy/webscraping/scraping_basics_python/exercises/python_jobs_database.py new file mode 100644 index 0000000000..9659ace119 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/python_jobs_database.py @@ -0,0 +1,32 @@ +from datetime import datetime, date, timedelta +from pprint import pp + +import httpx +from bs4 import BeautifulSoup +from urllib.parse import urljoin + + +today = date.today() +jobs_url = "https://www.python.org/jobs/type/database/" +response = 
httpx.get(jobs_url) +response.raise_for_status() + +soup = BeautifulSoup(response.text, "html.parser") + +for job in soup.select('.list-recent-jobs li'): + link = job.select_one('.listing-company-name a') + if not link: + continue + + time_tag = job.select_one('.listing-posted time') + if not time_tag or 'datetime' not in time_tag.attrs: + continue + + posted_at = datetime.fromisoformat(time_tag['datetime']) + posted_on = posted_at.date() + + if today - posted_on <= timedelta(days=60): + title = link.text.strip() + company = list(job.select_one('.listing-company-name').stripped_strings)[-1] + url = urljoin(jobs_url, link['href']) + pp({"title": title, "company": company, "url": url, "posted_on": posted_on}) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats index ac09b3032a..d829cd798e 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats @@ -1,4 +1,112 @@ +setup() { + cd "$BATS_TEST_DIRNAME" +} + @test "outputs the HTML with Star Wars products" { - run uv run --with httpx python "$BATS_TEST_DIRNAME/lego.py" + run uv run --with httpx python lego.py [[ "$output" == *"Millennium Falcon"* ]] } + +@test "counts the number of F1 Academy teams" { + run uv run --with httpx --with beautifulsoup4 python f1academy_teams.py + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "counts the number of F1 Academy drivers" { + run uv run --with httpx --with beautifulsoup4 python f1academy_drivers.py + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "lists African countries" { + run uv run --with httpx --with beautifulsoup4 python wikipedia_countries.py + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "lists African countries with a single selector" { + run uv run --with httpx --with beautifulsoup4 python wikipedia_countries_single_selector.py + (( status == 0 )) + 
[[ -n "$output" ]] +} + +@test "lists Guardian F1 article titles" { + run uv run --with httpx --with beautifulsoup4 python guardian_f1_titles.py + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "prints warehouse stock counts" { + run uv run --with httpx --with beautifulsoup4 python warehouse_units.py + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "prints warehouse stock counts using regex" { + run uv run --with httpx --with beautifulsoup4 python warehouse_units_regex.py + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "prints Guardian F1 titles with publish dates" { + run uv run --with httpx --with beautifulsoup4 python guardian_publish_dates.py + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "filters products from JSON" { + run uv run python process_products_json.py + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "lists Wikipedia country links" { + run uv run --with httpx --with beautifulsoup4 python wikipedia_country_links.py + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "lists Guardian F1 article links" { + run uv run --with httpx --with beautifulsoup4 python guardian_f1_links.py + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "prints Wikipedia calling codes" { + run uv run --with httpx --with beautifulsoup4 python wikipedia_calling_codes.py + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "lists Guardian F1 authors" { + run uv run --with httpx --with beautifulsoup4 python guardian_f1_authors.py + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "lists Python database jobs" { + run uv run --with httpx --with beautifulsoup4 python python_jobs_database.py + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "finds the shortest CNN sports article" { + run uv run --with httpx --with beautifulsoup4 python cnn_sports_shortest_article.py + (( status == 0 )) + [[ -n "$output" ]] +} + +@test "scrapes F1 Academy driver details with Crawlee" { + run uv run --with httpx --with beautifulsoup4 --with crawlee python 
crawlee_f1_drivers.py + (( status == 0 )) + [[ -n "$output" || -f dataset.json ]] + rm -f dataset.json +} + +@test "scrapes Netflix ratings with Crawlee" { + run uv run --with httpx --with beautifulsoup4 --with crawlee python crawlee_netflix_ratings.py + (( status == 0 )) + [[ -n "$output" || -f dataset.json ]] + rm -f dataset.json +} diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/warehouse_units.py b/sources/academy/webscraping/scraping_basics_python/exercises/warehouse_units.py new file mode 100644 index 0000000000..23ce38b17e --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/warehouse_units.py @@ -0,0 +1,23 @@ +import httpx +from bs4 import BeautifulSoup + +url = "https://warehouse-theme-metal.myshopify.com/collections/sales" +response = httpx.get(url) +response.raise_for_status() + +soup = BeautifulSoup(response.text, "html.parser") + +for product in soup.select('.product-item'): + title = product.select_one('.product-item__title').text.strip() + + units_text = ( + product.select_one('.product-item__inventory').text + .removeprefix('In stock,') + .removeprefix('Only') + .removesuffix(' left') + .removesuffix('units') + .strip() + ) + units = 0 if 'Sold out' in units_text else int(units_text) + + print(f"{title} | {units}") diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/warehouse_units_regex.py b/sources/academy/webscraping/scraping_basics_python/exercises/warehouse_units_regex.py new file mode 100644 index 0000000000..7aba32a604 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/warehouse_units_regex.py @@ -0,0 +1,19 @@ +import re + +import httpx +from bs4 import BeautifulSoup + +url = "https://warehouse-theme-metal.myshopify.com/collections/sales" +response = httpx.get(url) +response.raise_for_status() + +soup = BeautifulSoup(response.text, "html.parser") + +for product in soup.select('.product-item'): + title = 
product.select_one('.product-item__title').text.strip() + + units_text = product.select_one('.product-item__inventory').text + match = re.search(r"\d+", units_text) + units = int(match.group()) if match else 0 + + print(f"{title} | {units}") diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_calling_codes.py b/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_calling_codes.py new file mode 100644 index 0000000000..4d424d6dc1 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_calling_codes.py @@ -0,0 +1,32 @@ +import httpx +from bs4 import BeautifulSoup +from urllib.parse import urljoin + + +def download(url: str) -> BeautifulSoup: + response = httpx.get(url) + response.raise_for_status() + return BeautifulSoup(response.text, "html.parser") + + +def parse_calling_code(soup: BeautifulSoup) -> str | None: + for label in soup.select('th.infobox-label'): + if label.text.strip() == 'Calling code': + cell = label.parent.select_one('td.infobox-data') + return cell.text.strip() if cell else None + return None + + +listing_url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa" +listing_soup = download(listing_url) + +for name_cell in listing_soup.select('.wikitable tr td:nth-child(3)'): + link = name_cell.select_one('a') + if not link or 'href' not in link.attrs: + continue + + country_url = urljoin(listing_url, link['href']) + country_soup = download(country_url) + calling_code = parse_calling_code(country_soup) + + print(country_url, calling_code) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries.py b/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries.py new file mode 100644 index 0000000000..0d4769ccbb --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries.py @@ -0,0 +1,17 @@ +import httpx +from bs4 import 
BeautifulSoup + +url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa" +response = httpx.get(url) +response.raise_for_status() + +soup = BeautifulSoup(response.text, "html.parser") + +for table in soup.select(".wikitable"): + for row in table.select("tr"): + cells = row.select("td") + if cells: + third_column = cells[2] + link = third_column.select_one("a") + if link: + print(link.text) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries_single_selector.py b/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries_single_selector.py new file mode 100644 index 0000000000..1fc4a6b268 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries_single_selector.py @@ -0,0 +1,13 @@ +import httpx +from bs4 import BeautifulSoup + +url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa" +response = httpx.get(url) +response.raise_for_status() + +soup = BeautifulSoup(response.text, "html.parser") + +for name_cell in soup.select(".wikitable tr td:nth-child(3)"): + link = name_cell.select_one("a") + if link: + print(link.text) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_country_links.py b/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_country_links.py new file mode 100644 index 0000000000..f435016e45 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_country_links.py @@ -0,0 +1,15 @@ +import httpx +from bs4 import BeautifulSoup +from urllib.parse import urljoin + +listing_url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa" +response = httpx.get(listing_url) +response.raise_for_status() + +soup = BeautifulSoup(response.text, "html.parser") + +for name_cell in soup.select('.wikitable tr td:nth-child(3)'): + link = name_cell.select_one('a') 
+ if link and 'href' in link.attrs: + url = urljoin(listing_url, link['href']) + print(url) From 6936de91dd1b1bdff367557ced803dd467fe971f Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Tue, 25 Nov 2025 11:22:02 +0100 Subject: [PATCH 13/26] fix: avoid the yes option, fix crawlee installation, improve readability of uv options --- .../exercises/test.bats | 39 +++++++++---------- .../exercises/test.bats | 36 ++++++++--------- 2 files changed, 36 insertions(+), 39 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats index aae0b8ecab..c4d7598e19 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats @@ -1,112 +1,111 @@ setup() { cd "$BATS_TEST_DIRNAME" + export npm_config_yes=true } @test "outputs the HTML with Star Wars products" { - run npx --yes node lego.mjs + run npx node lego.mjs [[ "$output" == *"Millennium Falcon"* ]] } @test "counts the number of F1 Academy teams" { - run npx --yes --package=cheerio node f1academy_teams.mjs + run npx --package=cheerio node f1academy_teams.mjs (( status == 0 )) [[ -n "$output" ]] } @test "counts the number of F1 Academy drivers" { - run npx --yes --package=cheerio node f1academy_drivers.mjs + run npx --package=cheerio node f1academy_drivers.mjs (( status == 0 )) [[ -n "$output" ]] } @test "lists African countries" { - run npx --yes --package=cheerio node wikipedia_countries.mjs + run npx --package=cheerio node wikipedia_countries.mjs (( status == 0 )) [[ -n "$output" ]] } @test "lists African countries with a single selector" { - run npx --yes --package=cheerio node wikipedia_countries_single_selector.mjs + run npx --package=cheerio node wikipedia_countries_single_selector.mjs (( status == 0 )) [[ -n "$output" ]] } @test "lists Guardian F1 article titles" { - run npx --yes --package=cheerio 
node guardian_f1_titles.mjs + run npx --package=cheerio node guardian_f1_titles.mjs (( status == 0 )) [[ -n "$output" ]] } @test "prints warehouse stock counts" { - run npx --yes --package=cheerio node warehouse_units.mjs + run npx --package=cheerio node warehouse_units.mjs (( status == 0 )) [[ -n "$output" ]] } @test "prints warehouse stock counts using regex" { - run npx --yes --package=cheerio node warehouse_units_regex.mjs + run npx --package=cheerio node warehouse_units_regex.mjs (( status == 0 )) [[ -n "$output" ]] } @test "prints Guardian F1 titles with publish dates" { - run npx --yes --package=cheerio node guardian_publish_dates.mjs + run npx --package=cheerio node guardian_publish_dates.mjs (( status == 0 )) [[ -n "$output" ]] } @test "filters products from JSON" { - run npx --yes node process_products_json.mjs + run npx node process_products_json.mjs (( status == 0 )) [[ -n "$output" ]] } @test "lists Wikipedia country links" { - run npx --yes --package=cheerio node wikipedia_country_links.mjs + run npx --package=cheerio node wikipedia_country_links.mjs (( status == 0 )) [[ -n "$output" ]] } @test "lists Guardian F1 article links" { - run npx --yes --package=cheerio node guardian_f1_links.mjs + run npx --package=cheerio node guardian_f1_links.mjs (( status == 0 )) [[ -n "$output" ]] } @test "prints Wikipedia calling codes" { - run npx --yes --package=cheerio node wikipedia_calling_codes.mjs + run npx --package=cheerio node wikipedia_calling_codes.mjs (( status == 0 )) [[ -n "$output" ]] } @test "lists Guardian F1 authors" { - run npx --yes --package=cheerio node guardian_f1_authors.mjs + run npx --package=cheerio node guardian_f1_authors.mjs (( status == 0 )) [[ -n "$output" ]] } @test "lists npm LLM packages" { - run npx --yes --package=cheerio node npm_llm_packages.mjs + run npx --package=cheerio node npm_llm_packages.mjs (( status == 0 )) [[ -n "$output" ]] } @test "finds the shortest CNN sports article" { - run npx --yes --package=cheerio node 
cnn_sports_shortest_article.mjs + run npx --package=cheerio node cnn_sports_shortest_article.mjs (( status == 0 )) [[ -n "$output" ]] } @test "scrapes F1 Academy driver details with Crawlee" { - run npx --yes --package=crawlee --package=cheerio node crawlee_f1_drivers.mjs - (( status == 0 )) + run npx --package=crawlee node crawlee_f1_drivers.mjs [[ -n "$output" || -f dataset.json ]] rm -f dataset.json } @test "scrapes Netflix ratings with Crawlee" { - run npx --yes --package=crawlee --package=cheerio node crawlee_netflix_ratings.mjs - (( status == 0 )) + run npx --package=crawlee node crawlee_netflix_ratings.mjs [[ -n "$output" || -f dataset.json ]] rm -f dataset.json } diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats index d829cd798e..dfefde9dc1 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats @@ -3,54 +3,54 @@ setup() { } @test "outputs the HTML with Star Wars products" { - run uv run --with httpx python lego.py + run uv run --with=httpx python lego.py [[ "$output" == *"Millennium Falcon"* ]] } @test "counts the number of F1 Academy teams" { - run uv run --with httpx --with beautifulsoup4 python f1academy_teams.py + run uv run --with=httpx --with=beautifulsoup4 python f1academy_teams.py (( status == 0 )) [[ -n "$output" ]] } @test "counts the number of F1 Academy drivers" { - run uv run --with httpx --with beautifulsoup4 python f1academy_drivers.py + run uv run --with=httpx --with=beautifulsoup4 python f1academy_drivers.py (( status == 0 )) [[ -n "$output" ]] } @test "lists African countries" { - run uv run --with httpx --with beautifulsoup4 python wikipedia_countries.py + run uv run --with=httpx --with=beautifulsoup4 python wikipedia_countries.py (( status == 0 )) [[ -n "$output" ]] } @test "lists African countries with a single selector" { - 
run uv run --with httpx --with beautifulsoup4 python wikipedia_countries_single_selector.py + run uv run --with=httpx --with=beautifulsoup4 python wikipedia_countries_single_selector.py (( status == 0 )) [[ -n "$output" ]] } @test "lists Guardian F1 article titles" { - run uv run --with httpx --with beautifulsoup4 python guardian_f1_titles.py + run uv run --with=httpx --with=beautifulsoup4 python guardian_f1_titles.py (( status == 0 )) [[ -n "$output" ]] } @test "prints warehouse stock counts" { - run uv run --with httpx --with beautifulsoup4 python warehouse_units.py + run uv run --with=httpx --with=beautifulsoup4 python warehouse_units.py (( status == 0 )) [[ -n "$output" ]] } @test "prints warehouse stock counts using regex" { - run uv run --with httpx --with beautifulsoup4 python warehouse_units_regex.py + run uv run --with=httpx --with=beautifulsoup4 python warehouse_units_regex.py (( status == 0 )) [[ -n "$output" ]] } @test "prints Guardian F1 titles with publish dates" { - run uv run --with httpx --with beautifulsoup4 python guardian_publish_dates.py + run uv run --with=httpx --with=beautifulsoup4 python guardian_publish_dates.py (( status == 0 )) [[ -n "$output" ]] } @@ -62,51 +62,49 @@ setup() { } @test "lists Wikipedia country links" { - run uv run --with httpx --with beautifulsoup4 python wikipedia_country_links.py + run uv run --with=httpx --with=beautifulsoup4 python wikipedia_country_links.py (( status == 0 )) [[ -n "$output" ]] } @test "lists Guardian F1 article links" { - run uv run --with httpx --with beautifulsoup4 python guardian_f1_links.py + run uv run --with=httpx --with=beautifulsoup4 python guardian_f1_links.py (( status == 0 )) [[ -n "$output" ]] } @test "prints Wikipedia calling codes" { - run uv run --with httpx --with beautifulsoup4 python wikipedia_calling_codes.py + run uv run --with=httpx --with=beautifulsoup4 python wikipedia_calling_codes.py (( status == 0 )) [[ -n "$output" ]] } @test "lists Guardian F1 authors" { - run uv run 
--with httpx --with beautifulsoup4 python guardian_f1_authors.py + run uv run --with=httpx --with=beautifulsoup4 python guardian_f1_authors.py (( status == 0 )) [[ -n "$output" ]] } @test "lists Python database jobs" { - run uv run --with httpx --with beautifulsoup4 python python_jobs_database.py + run uv run --with=httpx --with=beautifulsoup4 python python_jobs_database.py (( status == 0 )) [[ -n "$output" ]] } @test "finds the shortest CNN sports article" { - run uv run --with httpx --with beautifulsoup4 python cnn_sports_shortest_article.py + run uv run --with=httpx --with=beautifulsoup4 python cnn_sports_shortest_article.py (( status == 0 )) [[ -n "$output" ]] } @test "scrapes F1 Academy driver details with Crawlee" { - run uv run --with httpx --with beautifulsoup4 --with crawlee python crawlee_f1_drivers.py - (( status == 0 )) + run uv run --with=crawlee[beautifulsoup] python crawlee_f1_drivers.py [[ -n "$output" || -f dataset.json ]] rm -f dataset.json } @test "scrapes Netflix ratings with Crawlee" { - run uv run --with httpx --with beautifulsoup4 --with crawlee python crawlee_netflix_ratings.py - (( status == 0 )) + run uv run --with=crawlee[beautifulsoup] python crawlee_netflix_ratings.py [[ -n "$output" || -f dataset.json ]] rm -f dataset.json } From cd5deaa180ec1a774e5a38d44b1932f4ba3d3eb7 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Tue, 25 Nov 2025 12:13:45 +0100 Subject: [PATCH 14/26] chore: make the tests more meaningful --- .gitignore | 1 + .../exercises/test.bats | 70 +++++++++++-------- 2 files changed, 43 insertions(+), 28 deletions(-) diff --git a/.gitignore b/.gitignore index b16d993b57..d76fad186b 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,4 @@ codegen/*/go.sum .github/styles/Microsoft .github/styles/write-good sources/academy/**/exercises/storage +sources/academy/**/exercises/dataset.json diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats 
b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats index c4d7598e19..1a0ae8fa89 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats @@ -3,6 +3,15 @@ setup() { export npm_config_yes=true } +retry_run() { + for attempt in 1 2 3; do + run "$@" + (( status == 0 )) && return 0 + sleep 1 + done + return "$status" +} + @test "outputs the HTML with Star Wars products" { run npx node lego.mjs [[ "$output" == *"Millennium Falcon"* ]] @@ -10,80 +19,84 @@ setup() { @test "counts the number of F1 Academy teams" { run npx --package=cheerio node f1academy_teams.mjs - (( status == 0 )) - [[ -n "$output" ]] + [[ "$output" == "6" ]] } @test "counts the number of F1 Academy drivers" { run npx --package=cheerio node f1academy_drivers.mjs - (( status == 0 )) - [[ -n "$output" ]] + [[ "$output" == "18" ]] } @test "lists African countries" { run npx --package=cheerio node wikipedia_countries.mjs - (( status == 0 )) - [[ -n "$output" ]] + [[ "$output" == *$'Comoros\nDemocratic Republic of the Congo\n'* ]] + [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "lists African countries with a single selector" { run npx --package=cheerio node wikipedia_countries_single_selector.mjs - (( status == 0 )) - [[ -n "$output" ]] + [[ "$output" == *$'Comoros\nDemocratic Republic of the Congo\n'* ]] + [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "lists Guardian F1 article titles" { run npx --package=cheerio node guardian_f1_titles.mjs - (( status == 0 )) - [[ -n "$output" ]] + [[ "$output" == *' F1 '* ]] + [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "prints warehouse stock counts" { run npx --package=cheerio node warehouse_units.mjs - (( status == 0 )) - [[ -n "$output" ]] + [[ "$output" == *$'JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672\n'* ]] + [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 77\n'* ]] + [[ $(echo "$output" | wc -l) -gt 5 
]] } @test "prints warehouse stock counts using regex" { run npx --package=cheerio node warehouse_units_regex.mjs - (( status == 0 )) - [[ -n "$output" ]] + [[ "$output" == *$'JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672\n'* ]] + [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 77\n'* ]] + [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "prints Guardian F1 titles with publish dates" { run npx --package=cheerio node guardian_publish_dates.mjs - (( status == 0 )) - [[ -n "$output" ]] + [[ "$output" == *' F1 '* ]] + [[ "$output" == *' | Sun '* ]] # has info about date (articles published on Sunday are very likely) + [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "filters products from JSON" { run npx node process_products_json.mjs - (( status == 0 )) - [[ -n "$output" ]] + [[ "$output" == "{ title: 'Premium Speakers', minPrice: 75000, price: 75000 }" ]] } @test "lists Wikipedia country links" { run npx --package=cheerio node wikipedia_country_links.mjs - (( status == 0 )) - [[ -n "$output" ]] + [[ "$output" == *$'https://en.wikipedia.org/wiki/Algeria\nhttps://en.wikipedia.org/wiki/Angola\n'* ]] + [[ "$output" == *$'https://en.wikipedia.org/wiki/R%C3%A9union\n'* ]] + [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "lists Guardian F1 article links" { run npx --package=cheerio node guardian_f1_links.mjs - (( status == 0 )) - [[ -n "$output" ]] + [[ "$output" == *'https://www.theguardian.com/sport/'* ]] + [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "prints Wikipedia calling codes" { run npx --package=cheerio node wikipedia_calling_codes.mjs - (( status == 0 )) - [[ -n "$output" ]] + [[ "$output" == *$'https://en.wikipedia.org/wiki/Comoros +269\n'* ]] + [[ "$output" == *$'https://en.wikipedia.org/wiki/Sahrawi_Arab_Democratic_Republic null\n'* ]] + [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "lists Guardian F1 authors" { run npx --package=cheerio node guardian_f1_authors.mjs - (( status == 0 )) - [[ -n "$output" ]] + [[ "$output" == *' F1 '* ]] + [[ 
"$output" == *'Giles Richards: '* ]] # writes most of them (we'll have to change this if they fire'him) + [[ "$output" == *'Guardian sport: '* || "$output" == *'PM Media: '* ]] + [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "lists npm LLM packages" { @@ -94,18 +107,19 @@ setup() { @test "finds the shortest CNN sports article" { run npx --package=cheerio node cnn_sports_shortest_article.mjs - (( status == 0 )) - [[ -n "$output" ]] + [[ "$output" == 'https://edition.cnn.com/'* ]] } @test "scrapes F1 Academy driver details with Crawlee" { run npx --package=crawlee node crawlee_f1_drivers.mjs + (( status == 0 )) [[ -n "$output" || -f dataset.json ]] rm -f dataset.json } @test "scrapes Netflix ratings with Crawlee" { run npx --package=crawlee node crawlee_netflix_ratings.mjs + (( status == 0 )) [[ -n "$output" || -f dataset.json ]] rm -f dataset.json } From 857d3a330749e2ec7fe6f7f9ff6f0d398f6a7e7d Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Tue, 25 Nov 2025 12:57:42 +0100 Subject: [PATCH 15/26] chore: improve the JS test suite --- .gitignore | 2 + .../exercises/test.bats | 56 ++++++++++++++----- 2 files changed, 45 insertions(+), 13 deletions(-) diff --git a/.gitignore b/.gitignore index d76fad186b..995f278d94 100644 --- a/.gitignore +++ b/.gitignore @@ -29,4 +29,6 @@ codegen/*/go.sum .github/styles/Microsoft .github/styles/write-good sources/academy/**/exercises/storage +sources/academy/**/exercises/node_modules +sources/academy/**/exercises/package*.json sources/academy/**/exercises/dataset.json diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats index 1a0ae8fa89..abb47539c3 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats @@ -1,52 +1,63 @@ -setup() { +setup_file() { cd "$BATS_TEST_DIRNAME" export npm_config_yes=true } -retry_run() 
{ - for attempt in 1 2 3; do - run "$@" - (( status == 0 )) && return 0 - sleep 1 - done - return "$status" +teardown_file() { + rm -rf storage node_modules package.json package-lock.json dataset.json } +# retry_run() { +# for attempt in 1 2 3; do +# run "$@" +# (( status == 0 )) && return 0 +# sleep 1 +# done +# return "$status" +# } + @test "outputs the HTML with Star Wars products" { run npx node lego.mjs + [[ "$output" == *"Millennium Falcon"* ]] } @test "counts the number of F1 Academy teams" { run npx --package=cheerio node f1academy_teams.mjs + [[ "$output" == "6" ]] } @test "counts the number of F1 Academy drivers" { run npx --package=cheerio node f1academy_drivers.mjs + [[ "$output" == "18" ]] } @test "lists African countries" { run npx --package=cheerio node wikipedia_countries.mjs + [[ "$output" == *$'Comoros\nDemocratic Republic of the Congo\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "lists African countries with a single selector" { run npx --package=cheerio node wikipedia_countries_single_selector.mjs + [[ "$output" == *$'Comoros\nDemocratic Republic of the Congo\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "lists Guardian F1 article titles" { run npx --package=cheerio node guardian_f1_titles.mjs + [[ "$output" == *' F1 '* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "prints warehouse stock counts" { run npx --package=cheerio node warehouse_units.mjs + [[ "$output" == *$'JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672\n'* ]] [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 77\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] @@ -54,6 +65,7 @@ retry_run() { @test "prints warehouse stock counts using regex" { run npx --package=cheerio node warehouse_units_regex.mjs + [[ "$output" == *$'JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672\n'* ]] [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 77\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] @@ -61,18 +73,21 @@ retry_run() { @test "prints Guardian F1 
titles with publish dates" { run npx --package=cheerio node guardian_publish_dates.mjs + [[ "$output" == *' F1 '* ]] - [[ "$output" == *' | Sun '* ]] # has info about date (articles published on Sunday are very likely) + [[ "$output" == *' | Sun '* ]] # has info about date, Sundays are very likely [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "filters products from JSON" { run npx node process_products_json.mjs + [[ "$output" == "{ title: 'Premium Speakers', minPrice: 75000, price: 75000 }" ]] } @test "lists Wikipedia country links" { run npx --package=cheerio node wikipedia_country_links.mjs + [[ "$output" == *$'https://en.wikipedia.org/wiki/Algeria\nhttps://en.wikipedia.org/wiki/Angola\n'* ]] [[ "$output" == *$'https://en.wikipedia.org/wiki/R%C3%A9union\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] @@ -80,12 +95,14 @@ retry_run() { @test "lists Guardian F1 article links" { run npx --package=cheerio node guardian_f1_links.mjs + [[ "$output" == *'https://www.theguardian.com/sport/'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "prints Wikipedia calling codes" { run npx --package=cheerio node wikipedia_calling_codes.mjs + [[ "$output" == *$'https://en.wikipedia.org/wiki/Comoros +269\n'* ]] [[ "$output" == *$'https://en.wikipedia.org/wiki/Sahrawi_Arab_Democratic_Republic null\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] @@ -93,6 +110,7 @@ retry_run() { @test "lists Guardian F1 authors" { run npx --package=cheerio node guardian_f1_authors.mjs + [[ "$output" == *' F1 '* ]] [[ "$output" == *'Giles Richards: '* ]] # writes most of them (we'll have to change this if they fire'him) [[ "$output" == *'Guardian sport: '* || "$output" == *'PM Media: '* ]] @@ -101,25 +119,37 @@ retry_run() { @test "lists npm LLM packages" { run npx --package=cheerio node npm_llm_packages.mjs + (( status == 0 )) [[ -n "$output" ]] } @test "finds the shortest CNN sports article" { run npx --package=cheerio node cnn_sports_shortest_article.mjs + [[ "$output" == 'https://edition.cnn.com/'* ]] } 
@test "scrapes F1 Academy driver details with Crawlee" { - run npx --package=crawlee node crawlee_f1_drivers.mjs + npm init --yes + npm install crawlee + run node crawlee_f1_drivers.mjs + (( status == 0 )) [[ -n "$output" || -f dataset.json ]] - rm -f dataset.json + [[ $(cat dataset.json | jq '. | length') == "18" ]] + [[ $(cat dataset.json | jq -c '.[0] | keys') == '["dob","instagram_url","name","nationality","team","url"]' ]] + [[ $(cat dataset.json | jq '.[].url') == *"https://www.f1academy.com/Racing-Series/Drivers/"* ]] } @test "scrapes Netflix ratings with Crawlee" { - run npx --package=crawlee node crawlee_netflix_ratings.mjs + npm init --yes + npm install crawlee + run node crawlee_netflix_ratings.mjs + (( status == 0 )) [[ -n "$output" || -f dataset.json ]] - rm -f dataset.json + [[ $(cat dataset.json | jq '. | length') == "10" ]] + [[ $(cat dataset.json | jq -c '.[0] | keys') == '["url","title","rating"]' ]] + [[ $(cat dataset.json | jq '.[].url') == *"https://www.imdb.com/title/"* ]] } From 9901553dd5cbcd5216407595265b370b2f6e4166 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Tue, 25 Nov 2025 12:59:04 +0100 Subject: [PATCH 16/26] style: make the code linter happy --- .../exercises/cnn_sports_shortest_article.mjs | 6 +++--- .../exercises/crawlee_netflix_ratings.mjs | 5 +++-- .../exercises/guardian_f1_authors.mjs | 2 +- .../exercises/npm_llm_packages.mjs | 6 +++--- .../exercises/process_products_json.mjs | 4 ++-- .../exercises/wikipedia_calling_codes.mjs | 5 +++-- 6 files changed, 15 insertions(+), 13 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/cnn_sports_shortest_article.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/cnn_sports_shortest_article.mjs index 800a103440..c9e0bad89a 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/cnn_sports_shortest_article.mjs +++ 
b/sources/academy/webscraping/scraping_basics_javascript/exercises/cnn_sports_shortest_article.mjs @@ -13,7 +13,7 @@ const listingUrl = 'https://edition.cnn.com/sport'; const $ = await download(listingUrl); const results = await Promise.all( - $('.layout__main .card').toArray().map(async element => { + $('.layout__main .card').toArray().map(async (element) => { const $element = $(element); const $link = $element.find('a').first(); if (!$link.length) { @@ -29,10 +29,10 @@ const results = await Promise.all( } return { url: articleUrl, length: content.length }; - }) + }), ); -const nonEmpty = results.filter(item => item && item.length > 0); +const nonEmpty = results.filter((item) => item && item.length > 0); nonEmpty.sort((a, b) => a.length - b.length); if (nonEmpty.length > 0) { diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs index 0d2c9d662c..6a1a756328 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs @@ -1,6 +1,7 @@ -import { CheerioCrawler, Request } from 'crawlee'; import { escape } from 'node:querystring'; +import { CheerioCrawler, Request } from 'crawlee'; + const crawler = new CheerioCrawler({ async requestHandler({ $, request, enqueueLinks, pushData, addRequests }) { if (request.label === 'IMDB') { @@ -16,7 +17,7 @@ const crawler = new CheerioCrawler({ } else if (request.label === 'IMDB_SEARCH') { await enqueueLinks({ selector: '.find-result-item a', label: 'IMDB', limit: 1 }); } else if (request.label === 'NETFLIX') { - const requests = $("[data-uia='top10-table-row-title'] button").toArray().map(buttonElement => { + const requests = $("[data-uia='top10-table-row-title'] button").toArray().map((buttonElement) => { const name = 
$(buttonElement).text().trim(); const imdbSearchUrl = `https://www.imdb.com/find/?q=${escape(name)}&s=tt&ttype=ft`; return new Request({ url: imdbSearchUrl, label: 'IMDB_SEARCH' }); diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_f1_authors.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_f1_authors.mjs index 6e9be2fae1..ebdc0ecff5 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_f1_authors.mjs +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_f1_authors.mjs @@ -12,7 +12,7 @@ async function download(url) { const listingUrl = 'https://www.theguardian.com/sport/formulaone'; const $ = await download(listingUrl); -const promises = $('#maincontent ul li').toArray().map(async element => { +const promises = $('#maincontent ul li').toArray().map(async (element) => { const $item = $(element); const $link = $item.find('a').first(); if (!$link.length) { diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/npm_llm_packages.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/npm_llm_packages.mjs index f399ff241a..f52a885057 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/npm_llm_packages.mjs +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/npm_llm_packages.mjs @@ -16,7 +16,7 @@ function parseNumber(text) { const listingUrl = 'https://www.npmjs.com/search?page=0&q=keywords%3Allm&sortBy=dependent_count'; const $ = await download(listingUrl); -const promises = $('section').toArray().map(async element => { +const promises = $('section').toArray().map(async (element) => { const $card = $(element); const $link = $card.find('a').first(); if (!$link.length) { @@ -30,7 +30,7 @@ const promises = $('section').toArray().map(async element => { .last() .text() .split('•') - .map(item => item.trim()); + .map((item) => item.trim()); const updatedText = details[2] 
?? ''; const dependentsText = details[3] ?? ''; @@ -57,5 +57,5 @@ const promises = $('section').toArray().map(async element => { return { name, url, description, dependents, downloads }; }); -const data = (await Promise.all(promises)).filter(item => item); +const data = (await Promise.all(promises)).filter((item) => item); console.log(data.slice(0, 5)); diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/process_products_json.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/process_products_json.mjs index 13a68efa5e..a7c951090a 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/process_products_json.mjs +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/process_products_json.mjs @@ -4,5 +4,5 @@ const jsonData = await readFile('products.json', 'utf8'); const data = JSON.parse(jsonData); data - .filter(row => row.minPrice > 50000) - .forEach(row => console.log(row)); + .filter((row) => row.minPrice > 50000) + .forEach((row) => console.log(row)); diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_calling_codes.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_calling_codes.mjs index ec15940234..02443b1ba8 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_calling_codes.mjs +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_calling_codes.mjs @@ -13,7 +13,7 @@ const listingUrl = 'https://en.wikipedia.org/wiki/List_of_sovereign_states_and_d const $ = await download(listingUrl); const cells = $('.wikitable tr td:nth-child(3)'); -const promises = cells.toArray().map(async element => { +const promises = cells.toArray().map(async (element) => { const $nameCell = $(element); const $link = $nameCell.find('a').first(); if (!$link.length) { @@ -27,7 +27,8 @@ const promises = cells.toArray().map(async element => { .first(); const callingCode = 
$label.length - ? $label.parent().find('td.infobox-data').first().text().trim() + ? $label.parent().find('td.infobox-data').first().text() +.trim() : ''; console.log(`${countryUrl} ${callingCode || null}`); From c0569063f6d6012cca5cf07d152e8c012e760f90 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Tue, 25 Nov 2025 13:03:41 +0100 Subject: [PATCH 17/26] style: condense and fix the solutions markup --- .../scraping_basics_javascript/05_parsing_html.md | 2 -- .../scraping_basics_javascript/06_locating_elements.md | 5 ----- .../scraping_basics_javascript/07_extracting_data.md | 4 ---- .../scraping_basics_javascript/08_saving_data.md | 2 -- .../scraping_basics_javascript/09_getting_links.md | 3 --- .../scraping_basics_javascript/10_crawling.md | 4 ---- .../scraping_basics_javascript/11_scraping_variants.md | 2 -- .../scraping_basics_javascript/12_framework.md | 4 ---- .../scraping_basics_python/05_parsing_html.md | 4 ---- .../scraping_basics_python/06_locating_elements.md | 5 ----- .../scraping_basics_python/07_extracting_data.md | 10 ++-------- .../webscraping/scraping_basics_python/10_crawling.md | 4 ---- .../scraping_basics_python/11_scraping_variants.md | 2 -- .../webscraping/scraping_basics_python/12_framework.md | 4 ---- 14 files changed, 2 insertions(+), 53 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript/05_parsing_html.md b/sources/academy/webscraping/scraping_basics_javascript/05_parsing_html.md index a1262cb9b5..2b88caa15d 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/05_parsing_html.md +++ b/sources/academy/webscraping/scraping_basics_javascript/05_parsing_html.md @@ -195,7 +195,5 @@ Use the same URL as in the previous exercise, but this time print a total count
Solution - {F1AcademyDriversExercise.code} -
diff --git a/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md b/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md index 210b31f583..d6666fcbd1 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md +++ b/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md @@ -242,7 +242,6 @@ Djibouti
Solution - {WikipediaCountriesExercise.code} Because some rows contain [table headers](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/th), we skip processing a row if `table_row.select("td")` doesn't find any [table data](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td) cells. @@ -264,9 +263,7 @@ You may want to check out the following pages:
Solution - {WikipediaCountriesSingleSelectorExercise.code} -
### Scrape F1 news @@ -288,7 +285,5 @@ Max Verstappen wins Canadian Grand Prix: F1 – as it happened
Solution - {GuardianF1TitlesExercise.code} -
diff --git a/sources/academy/webscraping/scraping_basics_javascript/07_extracting_data.md b/sources/academy/webscraping/scraping_basics_javascript/07_extracting_data.md index 6d09f851e4..74b440b4bc 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/07_extracting_data.md +++ b/sources/academy/webscraping/scraping_basics_javascript/07_extracting_data.md @@ -243,7 +243,6 @@ Denon AH-C720 In-Ear Headphones | 236
Solution - {WarehouseUnitsExercise.code} :::tip Conditional (ternary) operator @@ -260,7 +259,6 @@ Simplify the code from previous exercise. Use [regular expressions](https://deve
Solution - {WarehouseUnitsRegexExercise.code} :::tip Conditional (ternary) operator @@ -300,7 +298,5 @@ Hamilton reveals distress over ‘devastating’ groundhog accident at Canadian
Solution - {GuardianPublishDatesExercise.code} -
diff --git a/sources/academy/webscraping/scraping_basics_javascript/08_saving_data.md b/sources/academy/webscraping/scraping_basics_javascript/08_saving_data.md index 6a075b51d7..6a312d9570 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/08_saving_data.md +++ b/sources/academy/webscraping/scraping_basics_javascript/08_saving_data.md @@ -211,9 +211,7 @@ Write a new Node.js program that reads the `products.json` file we created in th
Solution - {ProcessProductsJsonExercise.code} -
### Process your CSV diff --git a/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md b/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md index 3d4062fa81..8670a0536e 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md +++ b/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md @@ -344,9 +344,7 @@ https://en.wikipedia.org/wiki/Botswana
Solution - {WikipediaCountryLinksExercise.code} -
### Scrape links to F1 news @@ -369,7 +367,6 @@ https://www.theguardian.com/sport/article/2024/sep/02/max-verstappen-damns-his-u
Solution - {GuardianF1LinksExercise.code} Note that some cards contain two links. One leads to the article, and one to the comments. If we selected all the links in the list by `#maincontent ul li a`, we would get incorrect output like this: diff --git a/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md b/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md index ee24ba86cc..7fb737c293 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md +++ b/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md @@ -239,9 +239,7 @@ Locating cells in tables is sometimes easier if you know how to [filter](https:/
Solution - {WikipediaCallingCodesExercise.code} -
### Scrape authors of F1 news articles @@ -272,7 +270,5 @@ PA Media: Lewis Hamilton reveals lifelong battle with depression after school bu
Solution - {GuardianF1AuthorsExercise.code} -
diff --git a/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md index b2519dd0d9..3a85eec446 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md +++ b/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md @@ -411,7 +411,5 @@ At the time of writing, the shortest article on the CNN Sports homepage is [abou
Solution - {CnnSportsShortestArticleExercise.code} -
diff --git a/sources/academy/webscraping/scraping_basics_javascript/12_framework.md b/sources/academy/webscraping/scraping_basics_javascript/12_framework.md index 0a8097eb27..b2e86624b1 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/12_framework.md +++ b/sources/academy/webscraping/scraping_basics_javascript/12_framework.md @@ -424,9 +424,7 @@ If you export the dataset as JSON, it should look something like this:
Solution - {CrawleeF1DriversExercise.code} -
### Use Crawlee to find the ratings of the most popular Netflix films @@ -483,7 +481,5 @@ When navigating to the first IMDb search result, you might find it helpful to kn
Solution - {CrawleeNetflixRatingsExercise.code} -
diff --git a/sources/academy/webscraping/scraping_basics_python/05_parsing_html.md b/sources/academy/webscraping/scraping_basics_python/05_parsing_html.md index 117b694217..dfa99ebe23 100644 --- a/sources/academy/webscraping/scraping_basics_python/05_parsing_html.md +++ b/sources/academy/webscraping/scraping_basics_python/05_parsing_html.md @@ -133,9 +133,7 @@ https://www.f1academy.com/Racing-Series/Teams
Solution - {F1AcademyTeamsExercise.code} -
### Scrape F1 Academy drivers @@ -144,7 +142,5 @@ Use the same URL as in the previous exercise, but this time print a total count
Solution - {F1AcademyDriversExercise.code} -
diff --git a/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md b/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md index 2fa83587e4..5dce9bc0e6 100644 --- a/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md +++ b/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md @@ -247,7 +247,6 @@ Djibouti
Solution - {WikipediaCountriesExercise.code} Because some rows contain [table headers](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/th), we skip processing a row if `table_row.select("td")` doesn't find any [table data](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td) cells. @@ -269,9 +268,7 @@ You may want to check out the following pages:
Solution - {WikipediaCountriesSingleSelectorExercise.code} -
### Scrape F1 news @@ -293,7 +290,5 @@ Max Verstappen wins Canadian Grand Prix: F1 – as it happened
Solution - {GuardianF1TitlesExercise.code} -
diff --git a/sources/academy/webscraping/scraping_basics_python/07_extracting_data.md b/sources/academy/webscraping/scraping_basics_python/07_extracting_data.md index a50ac3db33..ab0c86e589 100644 --- a/sources/academy/webscraping/scraping_basics_python/07_extracting_data.md +++ b/sources/academy/webscraping/scraping_basics_python/07_extracting_data.md @@ -244,9 +244,7 @@ Denon AH-C720 In-Ear Headphones | 236
Solution - {WarehouseUnitsExercise.code} -
### Use regular expressions @@ -255,9 +253,7 @@ Simplify the code from previous exercise. Use [regular expressions](https://docs
Solution - - {WarehouseUnitsRegexExercise.code} - + {WarehouseUnitsRegexExercise.code}
### Scrape publish dates of F1 news @@ -289,7 +285,5 @@ Hamilton reveals distress over ‘devastating’ groundhog accident at Canadian
Solution - - {GuardianPublishDatesExercise.code} - + {GuardianPublishDatesExercise.code}
diff --git a/sources/academy/webscraping/scraping_basics_python/10_crawling.md b/sources/academy/webscraping/scraping_basics_python/10_crawling.md index 6d0b554405..893683792b 100644 --- a/sources/academy/webscraping/scraping_basics_python/10_crawling.md +++ b/sources/academy/webscraping/scraping_basics_python/10_crawling.md @@ -212,9 +212,7 @@ Locating cells in tables is sometimes easier if you know how to [navigate up](ht
Solution - {WikipediaCallingCodesExercise.code} -
### Scrape authors of F1 news articles @@ -245,7 +243,5 @@ PA Media: Lewis Hamilton reveals lifelong battle with depression after school bu
Solution - {GuardianF1AuthorsExercise.code} -
diff --git a/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md index 463a217874..33d4720a1f 100644 --- a/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md +++ b/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md @@ -363,7 +363,5 @@ At the time of writing, the shortest article on the CNN Sports homepage is [abou
Solution - {CnnSportsShortestArticleExercise.code} -
diff --git a/sources/academy/webscraping/scraping_basics_python/12_framework.md b/sources/academy/webscraping/scraping_basics_python/12_framework.md index 579d7e9ced..174c4f7b93 100644 --- a/sources/academy/webscraping/scraping_basics_python/12_framework.md +++ b/sources/academy/webscraping/scraping_basics_python/12_framework.md @@ -465,9 +465,7 @@ If you export the dataset as JSON, it should look something like this:
Solution - {CrawleeF1DriversExercise.code} -
### Use Crawlee to find the ratings of the most popular Netflix films @@ -525,7 +523,5 @@ When navigating to the first IMDb search result, you might find it helpful to kn
Solution - {CrawleeNetflixRatingsExercise.code} -
From ddb9a1bd9eb144bb831d7253165721e9606262e1 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Tue, 25 Nov 2025 13:07:18 +0100 Subject: [PATCH 18/26] chore: setup and teardown for Python --- .../scraping_basics_python/exercises/test.bats | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats index dfefde9dc1..5a509b5301 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats @@ -1,7 +1,20 @@ -setup() { +setup_file() { cd "$BATS_TEST_DIRNAME" } +teardown_file() { + rm -rf storage dataset.json +} + +# retry_run() { +# for attempt in 1 2 3; do +# run "$@" +# (( status == 0 )) && return 0 +# sleep 1 +# done +# return "$status" +# } + @test "outputs the HTML with Star Wars products" { run uv run --with=httpx python lego.py [[ "$output" == *"Millennium Falcon"* ]] From 87b813b2a6ac50c13fc98b7183c09656e311d5e2 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Tue, 25 Nov 2025 14:32:58 +0100 Subject: [PATCH 19/26] chore: fix the JS test suite not to rely on npx --package --- .../exercises/products.json | 12 ---- .../exercises/test.bats | 59 +++++++++---------- 2 files changed, 27 insertions(+), 44 deletions(-) delete mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/products.json diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/products.json b/sources/academy/webscraping/scraping_basics_javascript/exercises/products.json deleted file mode 100644 index 8e067ca9f2..0000000000 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/products.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "title": "Premium Speakers", - "minPrice": 75000, - "price": 75000 - }, - { - "title": "Budget Headphones", - "minPrice": 25000, - "price": 25000 - } 
-] diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats index abb47539c3..a6176bd530 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats @@ -1,62 +1,59 @@ setup_file() { cd "$BATS_TEST_DIRNAME" - export npm_config_yes=true + npm init --yes + npm install cheerio crawlee +} + +teardown() { + rm -rf products.json storage dataset.json } teardown_file() { - rm -rf storage node_modules package.json package-lock.json dataset.json + rm -rf node_modules package.json package-lock.json } -# retry_run() { -# for attempt in 1 2 3; do -# run "$@" -# (( status == 0 )) && return 0 -# sleep 1 -# done -# return "$status" -# } @test "outputs the HTML with Star Wars products" { - run npx node lego.mjs + run node lego.mjs [[ "$output" == *"Millennium Falcon"* ]] } @test "counts the number of F1 Academy teams" { - run npx --package=cheerio node f1academy_teams.mjs + run node f1academy_teams.mjs [[ "$output" == "6" ]] } @test "counts the number of F1 Academy drivers" { - run npx --package=cheerio node f1academy_drivers.mjs + run node f1academy_drivers.mjs [[ "$output" == "18" ]] } @test "lists African countries" { - run npx --package=cheerio node wikipedia_countries.mjs + run node wikipedia_countries.mjs [[ "$output" == *$'Comoros\nDemocratic Republic of the Congo\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "lists African countries with a single selector" { - run npx --package=cheerio node wikipedia_countries_single_selector.mjs + run node wikipedia_countries_single_selector.mjs [[ "$output" == *$'Comoros\nDemocratic Republic of the Congo\n'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "lists Guardian F1 article titles" { - run npx --package=cheerio node guardian_f1_titles.mjs + run node guardian_f1_titles.mjs [[ "$output" == *' F1 '* ]] [[ $(echo 
"$output" | wc -l) -gt 5 ]] } @test "prints warehouse stock counts" { - run npx --package=cheerio node warehouse_units.mjs + run node warehouse_units.mjs [[ "$output" == *$'JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672\n'* ]] [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 77\n'* ]] @@ -64,7 +61,7 @@ teardown_file() { } @test "prints warehouse stock counts using regex" { - run npx --package=cheerio node warehouse_units_regex.mjs + run node warehouse_units_regex.mjs [[ "$output" == *$'JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672\n'* ]] [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 77\n'* ]] @@ -72,7 +69,7 @@ teardown_file() { } @test "prints Guardian F1 titles with publish dates" { - run npx --package=cheerio node guardian_publish_dates.mjs + run node guardian_publish_dates.mjs [[ "$output" == *' F1 '* ]] [[ "$output" == *' | Sun '* ]] # has info about date, Sundays are very likely @@ -80,13 +77,15 @@ teardown_file() { } @test "filters products from JSON" { - run npx node process_products_json.mjs + echo '[{"title":"Premium Speakers","minPrice":75000,"price":75000},{"title":"Budget Headphones","minPrice":25000,"price":25000}]' > products.json + + run node process_products_json.mjs [[ "$output" == "{ title: 'Premium Speakers', minPrice: 75000, price: 75000 }" ]] } @test "lists Wikipedia country links" { - run npx --package=cheerio node wikipedia_country_links.mjs + run node wikipedia_country_links.mjs [[ "$output" == *$'https://en.wikipedia.org/wiki/Algeria\nhttps://en.wikipedia.org/wiki/Angola\n'* ]] [[ "$output" == *$'https://en.wikipedia.org/wiki/R%C3%A9union\n'* ]] @@ -94,14 +93,14 @@ teardown_file() { } @test "lists Guardian F1 article links" { - run npx --package=cheerio node guardian_f1_links.mjs + run node guardian_f1_links.mjs [[ "$output" == *'https://www.theguardian.com/sport/'* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "prints Wikipedia calling codes" { - run npx --package=cheerio node 
wikipedia_calling_codes.mjs + run node wikipedia_calling_codes.mjs [[ "$output" == *$'https://en.wikipedia.org/wiki/Comoros +269\n'* ]] [[ "$output" == *$'https://en.wikipedia.org/wiki/Sahrawi_Arab_Democratic_Republic null\n'* ]] @@ -109,7 +108,7 @@ teardown_file() { } @test "lists Guardian F1 authors" { - run npx --package=cheerio node guardian_f1_authors.mjs + run node guardian_f1_authors.mjs [[ "$output" == *' F1 '* ]] [[ "$output" == *'Giles Richards: '* ]] # writes most of them (we'll have to change this if they fire'him) @@ -118,37 +117,33 @@ teardown_file() { } @test "lists npm LLM packages" { - run npx --package=cheerio node npm_llm_packages.mjs + run node npm_llm_packages.mjs (( status == 0 )) [[ -n "$output" ]] } @test "finds the shortest CNN sports article" { - run npx --package=cheerio node cnn_sports_shortest_article.mjs + run node cnn_sports_shortest_article.mjs [[ "$output" == 'https://edition.cnn.com/'* ]] } @test "scrapes F1 Academy driver details with Crawlee" { - npm init --yes - npm install crawlee run node crawlee_f1_drivers.mjs (( status == 0 )) - [[ -n "$output" || -f dataset.json ]] + [[ -f dataset.json ]] [[ $(cat dataset.json | jq '. | length') == "18" ]] [[ $(cat dataset.json | jq -c '.[0] | keys') == '["dob","instagram_url","name","nationality","team","url"]' ]] [[ $(cat dataset.json | jq '.[].url') == *"https://www.f1academy.com/Racing-Series/Drivers/"* ]] } @test "scrapes Netflix ratings with Crawlee" { - npm init --yes - npm install crawlee run node crawlee_netflix_ratings.mjs (( status == 0 )) - [[ -n "$output" || -f dataset.json ]] + [[ -f dataset.json ]] [[ $(cat dataset.json | jq '. 
| length') == "10" ]] [[ $(cat dataset.json | jq -c '.[0] | keys') == '["url","title","rating"]' ]] [[ $(cat dataset.json | jq '.[].url') == *"https://www.imdb.com/title/"* ]] From cdc8023c775e02b8dd2c4826e5981d538e0fd052 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Tue, 25 Nov 2025 14:54:07 +0100 Subject: [PATCH 20/26] fix: improve the Python test suite and fix solutions using Crawlee (see https://github.com/apify/apify-docs/issues/2112 ) --- .../exercises/crawlee_f1_drivers.py | 2 +- .../exercises/crawlee_netflix_ratings.py | 2 +- .../exercises/products.json | 12 -- .../exercises/test.bats | 112 +++++++++++------- 4 files changed, 69 insertions(+), 59 deletions(-) delete mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/products.json diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_f1_drivers.py b/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_f1_drivers.py index 83a854ab14..2f7cef895c 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_f1_drivers.py +++ b/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_f1_drivers.py @@ -47,7 +47,7 @@ async def handle_driver(context: BeautifulSoupCrawlingContext) -> None: ) await crawler.run(["https://www.f1academy.com/Racing-Series/Drivers"]) - await crawler.export_data_json(path="dataset.json", ensure_ascii=False, indent=2) # type: ignore[attr-defined] + await crawler.export_data("dataset.json") if __name__ == "__main__": diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py b/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py index 548c54435a..b7f2000b37 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py +++ b/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py @@ -39,7 +39,7 @@ async def handle_imdb(context: 
BeautifulSoupCrawlingContext) -> None: ) await crawler.run(["https://www.netflix.com/tudum/top10"]) - await crawler.export_data_json(path="dataset.json", ensure_ascii=False, indent=2) # type: ignore[attr-defined] + await crawler.export_data("dataset.json") if __name__ == "__main__": diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/products.json b/sources/academy/webscraping/scraping_basics_python/exercises/products.json deleted file mode 100644 index d4adea213c..0000000000 --- a/sources/academy/webscraping/scraping_basics_python/exercises/products.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "title": "Premium Speakers", - "min_price": "750", - "price": "750" - }, - { - "title": "Budget Headphones", - "min_price": "250", - "price": "250" - } -] diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats index 5a509b5301..f44719b4f3 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats @@ -2,122 +2,144 @@ setup_file() { cd "$BATS_TEST_DIRNAME" } -teardown_file() { - rm -rf storage dataset.json +teardown() { + rm -rf products.json storage dataset.json } -# retry_run() { -# for attempt in 1 2 3; do -# run "$@" -# (( status == 0 )) && return 0 -# sleep 1 -# done -# return "$status" -# } - @test "outputs the HTML with Star Wars products" { run uv run --with=httpx python lego.py + [[ "$output" == *"Millennium Falcon"* ]] } @test "counts the number of F1 Academy teams" { run uv run --with=httpx --with=beautifulsoup4 python f1academy_teams.py - (( status == 0 )) - [[ -n "$output" ]] + + [[ "$output" == "6" ]] } @test "counts the number of F1 Academy drivers" { run uv run --with=httpx --with=beautifulsoup4 python f1academy_drivers.py - (( status == 0 )) - [[ -n "$output" ]] + + [[ "$output" == "18" ]] } @test "lists African countries" { run uv run 
--with=httpx --with=beautifulsoup4 python wikipedia_countries.py - (( status == 0 )) - [[ -n "$output" ]] + + [[ "$output" == *$'Comoros\nDemocratic Republic of the Congo\n'* ]] + [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "lists African countries with a single selector" { run uv run --with=httpx --with=beautifulsoup4 python wikipedia_countries_single_selector.py - (( status == 0 )) - [[ -n "$output" ]] + + [[ "$output" == *$'Comoros\nDemocratic Republic of the Congo\n'* ]] + [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "lists Guardian F1 article titles" { run uv run --with=httpx --with=beautifulsoup4 python guardian_f1_titles.py - (( status == 0 )) - [[ -n "$output" ]] + + [[ "$output" == *' F1 '* ]] + [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "prints warehouse stock counts" { run uv run --with=httpx --with=beautifulsoup4 python warehouse_units.py - (( status == 0 )) - [[ -n "$output" ]] + + [[ "$output" == *$'JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672\n'* ]] + [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 77\n'* ]] + [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "prints warehouse stock counts using regex" { run uv run --with=httpx --with=beautifulsoup4 python warehouse_units_regex.py - (( status == 0 )) - [[ -n "$output" ]] + + [[ "$output" == *$'JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672\n'* ]] + [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 77\n'* ]] + [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "prints Guardian F1 titles with publish dates" { run uv run --with=httpx --with=beautifulsoup4 python guardian_publish_dates.py - (( status == 0 )) - [[ -n "$output" ]] + + [[ "$output" == *' F1 '* ]] + [[ "$output" == *' | Sun '* ]] # has info about date, Sundays are very likely + [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "filters products from JSON" { + echo '[{"title":"Premium Speakers","minPrice":75000,"price":75000},{"title":"Budget Headphones","minPrice":25000,"price":25000}]' > products.json + 
run uv run python process_products_json.py - (( status == 0 )) - [[ -n "$output" ]] + + [[ "$output" == "{ title: 'Premium Speakers', minPrice: 75000, price: 75000 }" ]] } @test "lists Wikipedia country links" { run uv run --with=httpx --with=beautifulsoup4 python wikipedia_country_links.py - (( status == 0 )) - [[ -n "$output" ]] + + [[ "$output" == *$'https://en.wikipedia.org/wiki/Algeria\nhttps://en.wikipedia.org/wiki/Angola\n'* ]] + [[ "$output" == *$'https://en.wikipedia.org/wiki/R%C3%A9union\n'* ]] + [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "lists Guardian F1 article links" { run uv run --with=httpx --with=beautifulsoup4 python guardian_f1_links.py - (( status == 0 )) - [[ -n "$output" ]] + + [[ "$output" == *'https://www.theguardian.com/sport/'* ]] + [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "prints Wikipedia calling codes" { run uv run --with=httpx --with=beautifulsoup4 python wikipedia_calling_codes.py - (( status == 0 )) - [[ -n "$output" ]] + + [[ "$output" == *$'https://en.wikipedia.org/wiki/Comoros +269\n'* ]] + [[ "$output" == *$'https://en.wikipedia.org/wiki/Sahrawi_Arab_Democratic_Republic None\n'* ]] + [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "lists Guardian F1 authors" { run uv run --with=httpx --with=beautifulsoup4 python guardian_f1_authors.py - (( status == 0 )) - [[ -n "$output" ]] + + [[ "$output" == *' F1 '* ]] + [[ "$output" == *'Giles Richards: '* ]] # writes most of them (we'll have to change this if they fire'him) + [[ "$output" == *'Guardian sport: '* || "$output" == *'PM Media: '* ]] + [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "lists Python database jobs" { run uv run --with=httpx --with=beautifulsoup4 python python_jobs_database.py - (( status == 0 )) - [[ -n "$output" ]] + + [[ "$output" == *"'title': '"* ]] + [[ "$output" == *"'company': '"* ]] + [[ "$output" == *"'url': 'https://www.python.org/jobs/"* ]] + [[ "$output" == *"'posted_on': datetime.date("* ]] } @test "finds the shortest CNN sports article" { run uv
run --with=httpx --with=beautifulsoup4 python cnn_sports_shortest_article.py - (( status == 0 )) - [[ -n "$output" ]] + + [[ "$output" == 'https://edition.cnn.com/'* ]] } @test "scrapes F1 Academy driver details with Crawlee" { run uv run --with=crawlee[beautifulsoup] python crawlee_f1_drivers.py - [[ -n "$output" || -f dataset.json ]] - rm -f dataset.json + + (( status == 0 )) + [[ -f dataset.json ]] + [[ $(cat dataset.json | jq '. | length') == "18" ]] + [[ $(cat dataset.json | jq -c '.[0] | keys') == '["dob","instagram_url","name","nationality","team","url"]' ]] + [[ $(cat dataset.json | jq '.[].url') == *"https://www.f1academy.com/Racing-Series/Drivers/"* ]] } @test "scrapes Netflix ratings with Crawlee" { run uv run --with=crawlee[beautifulsoup] python crawlee_netflix_ratings.py - [[ -n "$output" || -f dataset.json ]] - rm -f dataset.json + + (( status == 0 )) + [[ -f dataset.json ]] + [[ $(cat dataset.json | jq '. | length') == "10" ]] + [[ $(cat dataset.json | jq -c '.[0] | keys') == '["url","title","rating"]' ]] + [[ $(cat dataset.json | jq '.[].url') == *"https://www.imdb.com/title/"* ]] } From d006397066773f5e57c3ab2e5e357736a63baf95 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Tue, 25 Nov 2025 15:35:47 +0100 Subject: [PATCH 21/26] style: fix markup --- .../webscraping/scraping_basics_python/08_saving_data.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_python/08_saving_data.md b/sources/academy/webscraping/scraping_basics_python/08_saving_data.md index 4d5f31bf5f..ef8854e633 100644 --- a/sources/academy/webscraping/scraping_basics_python/08_saving_data.md +++ b/sources/academy/webscraping/scraping_basics_python/08_saving_data.md @@ -193,9 +193,7 @@ Write a new Python program that reads the `products.json` file we created in thi
Solution - - {ProcessProductsJsonExercise.code} - + {ProcessProductsJsonExercise.code}
### Process your CSV From dfe9b2817b9b79807d41f6aa10b5ace391ab53bc Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Tue, 25 Nov 2025 15:36:36 +0100 Subject: [PATCH 22/26] chore: fix typo --- .../webscraping/scraping_basics_javascript/exercises/test.bats | 2 +- .../webscraping/scraping_basics_python/exercises/test.bats | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats index a6176bd530..8894b755bd 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats @@ -111,7 +111,7 @@ teardown_file() { run node guardian_f1_authors.mjs [[ "$output" == *' F1 '* ]] - [[ "$output" == *'Giles Richards: '* ]] # writes most of them (we'll have to change this if they fire'him) + [[ "$output" == *'Giles Richards: '* ]] # writes most of them (we'll have to change this if they fire him) [[ "$output" == *'Guardian sport: '* || "$output" == *'PM Media: '* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats index f44719b4f3..66113ad61f 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats @@ -104,7 +104,7 @@ teardown() { run uv run --with=httpx --with=beautifulsoup4 python guardian_f1_authors.py [[ "$output" == *' F1 '* ]] - [[ "$output" == *'Giles Richards: '* ]] # writes most of them (we'll have to change this if they fire'him) + [[ "$output" == *'Giles Richards: '* ]] # writes most of them (we'll have to change this if they fire him) [[ "$output" == *'Guardian sport: '* || "$output" == *'PM Media: '* ]] [[ $(echo "$output" | wc -l) -gt 5 ]] } From 
b324b206ccd8de398f25d8cb4e9959e70856d35a Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Tue, 25 Nov 2025 15:37:07 +0100 Subject: [PATCH 23/26] chore: enable only as a cron --- .github/workflows/test-academy.yml | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/.github/workflows/test-academy.yml b/.github/workflows/test-academy.yml index acb537f855..b801d4183f 100644 --- a/.github/workflows/test-academy.yml +++ b/.github/workflows/test-academy.yml @@ -1,13 +1,9 @@ name: Test Academy on: - # TODO, this is just temporary: - push: - branches: [ "honzajavorek/test-exercises" ] - - # schedule: - # - cron: "0 3 * * 1" # at 3am UTC on Mondays - # workflow_dispatch: # allows running this workflow manually from the Actions tab + schedule: + - cron: "0 3 * * 1" # at 3am UTC on Mondays + workflow_dispatch: # allows running this workflow manually from the Actions tab jobs: test-exercises: From d4e06432d015ad99a5f195f8d80cadd2bf9f368e Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Tue, 25 Nov 2025 15:38:57 +0100 Subject: [PATCH 24/26] chore: run monthly --- .github/workflows/test-academy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-academy.yml b/.github/workflows/test-academy.yml index b801d4183f..a17e78f114 100644 --- a/.github/workflows/test-academy.yml +++ b/.github/workflows/test-academy.yml @@ -2,7 +2,7 @@ name: Test Academy on: schedule: - - cron: "0 3 * * 1" # at 3am UTC on Mondays + - cron: "0 3 1 * *" # at 3am UTC on 1st day of month workflow_dispatch: # allows running this workflow manually from the Actions tab jobs: From 118a55065eb6993fcad36815d31aa72ef73bd767 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Tue, 25 Nov 2025 15:45:52 +0100 Subject: [PATCH 25/26] style: fix markup --- .../webscraping/scraping_basics_python/09_getting_links.md | 7 ++----- .../scraping_basics_python/11_scraping_variants.md | 1 - 2 files changed, 2 insertions(+), 6 deletions(-) diff --git 
a/sources/academy/webscraping/scraping_basics_python/09_getting_links.md b/sources/academy/webscraping/scraping_basics_python/09_getting_links.md index 3d1bfd91d1..ea5a79a915 100644 --- a/sources/academy/webscraping/scraping_basics_python/09_getting_links.md +++ b/sources/academy/webscraping/scraping_basics_python/09_getting_links.md @@ -347,9 +347,7 @@ https://en.wikipedia.org/wiki/Botswana
Solution - - {WikipediaCountryLinksExercise.code} - + {WikipediaCountryLinksExercise.code}
### Scrape links to F1 news @@ -372,8 +370,7 @@ https://www.theguardian.com/sport/article/2024/sep/02/max-verstappen-damns-his-u
Solution - - {GuardianF1LinksExercise.code} + {GuardianF1LinksExercise.code} Note that some cards contain two links. One leads to the article, and one to the comments. If we selected all the links in the list by `#maincontent ul li a`, we would get incorrect output like this: diff --git a/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md index 33d4720a1f..e654ee34eb 100644 --- a/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md +++ b/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md @@ -346,7 +346,6 @@ You can find everything you need for working with dates and times in Python's [` After inspecting the job board, you'll notice that job postings tagged as "Database" have a dedicated URL. We'll use that as our starting point, which saves us from having to scrape and check the tags manually. {PythonJobsDatabaseExercise.code} -
### Find the shortest CNN article which made it to the Sports homepage From 8c875d153b4c3ae5b7699d8141b2b811b92b3555 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Tue, 25 Nov 2025 16:00:49 +0100 Subject: [PATCH 26/26] fix: address bugbot comments --- .../exercises/crawlee_netflix_ratings.mjs | 4 +--- .../scraping_basics_javascript/exercises/test.bats | 2 +- .../webscraping/scraping_basics_python/exercises/test.bats | 6 +++--- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs index 6a1a756328..19da811bc3 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs @@ -16,15 +16,13 @@ const crawler = new CheerioCrawler({ } } else if (request.label === 'IMDB_SEARCH') { await enqueueLinks({ selector: '.find-result-item a', label: 'IMDB', limit: 1 }); - } else if (request.label === 'NETFLIX') { + } else { const requests = $("[data-uia='top10-table-row-title'] button").toArray().map((buttonElement) => { const name = $(buttonElement).text().trim(); const imdbSearchUrl = `https://www.imdb.com/find/?q=${escape(name)}&s=tt&ttype=ft`; return new Request({ url: imdbSearchUrl, label: 'IMDB_SEARCH' }); }); await addRequests(requests); - } else { - throw new Error(`Unexpected request label: ${request.label}`); } }, }); diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats index 8894b755bd..618b64cd14 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats @@ -145,6 +145,6 @@ teardown_file() { (( status == 0 )) [[ -f 
dataset.json ]] [[ $(cat dataset.json | jq '. | length') == "10" ]] - [[ $(cat dataset.json | jq -c '.[0] | keys') == '["url","title","rating"]' ]] + [[ $(cat dataset.json | jq -c '.[0] | keys') == '["rating","title","url"]' ]] [[ $(cat dataset.json | jq '.[].url') == *"https://www.imdb.com/title/"* ]] } diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats index 66113ad61f..2de3db35f3 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats @@ -70,11 +70,11 @@ teardown() { } @test "filters products from JSON" { - echo '[{"title":"Premium Speakers","minPrice":75000,"price":75000},{"title":"Budget Headphones","minPrice":25000,"price":25000}]' > products.json + echo '[{"title":"Premium Speakers","min_price":75000,"price":75000},{"title":"Budget Headphones","min_price":25000,"price":25000}]' > products.json run uv run python process_products_json.py - [[ "$output" == "{ title: 'Premium Speakers', minPrice: 75000, price: 75000 }" ]] + [[ "$output" == "{'title': 'Premium Speakers', 'min_price': 75000, 'price': 75000}" ]] } @test "lists Wikipedia country links" { @@ -140,6 +140,6 @@ teardown() { (( status == 0 )) [[ -f dataset.json ]] [[ $(cat dataset.json | jq '. | length') == "10" ]] - [[ $(cat dataset.json | jq -c '.[0] | keys') == '["url","title","rating"]' ]] + [[ $(cat dataset.json | jq -c '.[0] | keys') == '["rating","title","url"]' ]] [[ $(cat dataset.json | jq '.[].url') == *"https://www.imdb.com/title/"* ]] }