diff --git a/.github/workflows/test-academy.yml b/.github/workflows/test-academy.yml
new file mode 100644
index 0000000000..a17e78f114
--- /dev/null
+++ b/.github/workflows/test-academy.yml
@@ -0,0 +1,31 @@
+name: Test Academy
+
+on:
+ schedule:
+ - cron: "0 3 1 * *" # at 3am UTC on 1st day of month
+ workflow_dispatch: # allows running this workflow manually from the Actions tab
+
+jobs:
+ test-exercises:
+ name: Test Academy Exercises
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout Source code
+ uses: actions/checkout@v6
+
+ - name: Setup Node.js
+ uses: actions/setup-node@v6
+ with:
+ cache: npm
+ cache-dependency-path: package-lock.json
+
+ - name: Setup Python
+ uses: astral-sh/setup-uv@v7
+
+ - name: Install Bats
+ run: |
+ corepack enable
+ npm install --only=dev
+
+ - name: Test
+ run: npm run test:academy
diff --git a/.gitignore b/.gitignore
index 8fa90c3b3e..995f278d94 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,3 +28,7 @@ codegen/*/generated/
codegen/*/go.sum
.github/styles/Microsoft
.github/styles/write-good
+sources/academy/**/exercises/storage
+sources/academy/**/exercises/node_modules
+sources/academy/**/exercises/package*.json
+sources/academy/**/exercises/dataset.json
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 3136c31132..940073d2fc 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -335,6 +335,12 @@ Add languages by adding new folders at the appropriate path level.
- Run `vale sync` to download styles
- Configure exceptions in `accepts.txt`
+### Testing
+
+- **Broken links**: A [periodic GitHub Action](.github/workflows/lychee.yml) checks for broken links using [lychee](https://lychee.cli.rs/). If the Action fails, we fix the issues manually.
+
+- **Academy exercises**: Each lesson in the Academy courses ends with exercises that target real-world websites. Each exercise includes a solution, stored as a separate file of executable code, which is embedded in the docs using the `!!raw-loader` syntax. Each course has a [Bats](https://bats-core.readthedocs.io/) test file named `test.bats`. The tests run each solution as a standalone program and verify that it produces the expected output (see the example below). A [periodic GitHub Action](.github/workflows/test-academy.yml) runs all these tests using `npm run test:academy`. If the Action fails, we rework the exercises.
+
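+  For illustration, a test for a hypothetical `example.mjs` solution could look like this:
+
+  ```bash
+  @test "prints the expected product title" {
+    run node example.mjs
+
+    [[ "$output" == *"Expected product title"* ]]
+  }
+  ```
+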
## Pull request process
1. Follow [Conventional Commits](https://www.conventionalcommits.org/)
diff --git a/package-lock.json b/package-lock.json
index 9211df3e2b..7315d8d7fa 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -45,6 +45,7 @@
"@apify/tsconfig": "^0.1.0",
"@types/react": "^19.0.0",
"babel-plugin-styled-components": "^2.1.4",
+ "bats": "^1.13.0",
"cross-env": "^10.0.0",
"eslint": "^9.32.0",
"eslint-plugin-react": "^7.37.5",
@@ -9316,6 +9317,16 @@
"resolved": "https://registry.npmjs.org/batch/-/batch-0.6.1.tgz",
"integrity": "sha512-x+VAiMRL6UPkx+kudNvxTl6hB2XNNCG2r+7wixVfIYwu/2HKRXimwQyaumLjMveWvT2Hkd/cAJw+QBMfJ/EKVw=="
},
+ "node_modules/bats": {
+ "version": "1.13.0",
+ "resolved": "https://registry.npmjs.org/bats/-/bats-1.13.0.tgz",
+ "integrity": "sha512-giSYKGTOcPZyJDbfbTtzAedLcNWdjCLbXYU3/MwPnjyvDXzu6Dgw8d2M+8jHhZXSmsCMSQqCp+YBsJ603UO4vQ==",
+ "dev": true,
+ "license": "MIT",
+ "bin": {
+ "bats": "bin/bats"
+ }
+ },
"node_modules/bcp-47-match": {
"version": "2.0.3",
"resolved": "https://registry.npmjs.org/bcp-47-match/-/bcp-47-match-2.0.3.tgz",
diff --git a/package.json b/package.json
index cf6193d4f3..93cf8433e6 100644
--- a/package.json
+++ b/package.json
@@ -40,6 +40,7 @@
"lint:md:fix": "markdownlint '**/*.md' --fix",
"lint:code": "eslint .",
"lint:code:fix": "eslint . --fix",
+ "test:academy": "bats --print-output-on-failure -r .",
"postinstall": "patch-package",
"postbuild": "node ./scripts/joinLlmsFiles.mjs && node ./scripts/indentLlmsFile.mjs"
},
@@ -48,6 +49,7 @@
"@apify/tsconfig": "^0.1.0",
"@types/react": "^19.0.0",
"babel-plugin-styled-components": "^2.1.4",
+ "bats": "^1.13.0",
"cross-env": "^10.0.0",
"eslint": "^9.32.0",
"eslint-plugin-react": "^7.37.5",
@@ -61,8 +63,8 @@
"typescript-eslint": "^8.38.0"
},
"dependencies": {
- "@apify/ui-library": "^1.97.2",
"@apify/ui-icons": "^1.19.0",
+ "@apify/ui-library": "^1.97.2",
"@docusaurus/core": "^3.8.1",
"@docusaurus/faster": "^3.8.1",
"@docusaurus/plugin-client-redirects": "^3.8.1",
diff --git a/sources/academy/webscraping/scraping_basics_javascript/04_downloading_html.md b/sources/academy/webscraping/scraping_basics_javascript/04_downloading_html.md
index dd5ebfb5b0..3956bfc1a6 100644
--- a/sources/academy/webscraping/scraping_basics_javascript/04_downloading_html.md
+++ b/sources/academy/webscraping/scraping_basics_javascript/04_downloading_html.md
@@ -5,8 +5,10 @@ description: Lesson about building a Node.js application for watching prices. Us
slug: /scraping-basics-javascript/downloading-html
---
+import CodeBlock from '@theme/CodeBlock';
import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition';
import Exercises from '../scraping_basics/_exercises.mdx';
+import LegoExercise from '!!raw-loader!roa-loader!./exercises/lego.mjs';
@@ -184,28 +186,17 @@ Letting our program visibly crash on error is enough for our purposes. Now, let'
-### Scrape AliExpress
+### Scrape LEGO
-Download HTML of a product listing page, but this time from a real world e-commerce website. For example this page with AliExpress search results:
+Download HTML of a product listing page, but this time from a real-world e-commerce website. For example, this page listing LEGO Star Wars products:
```text
-https://www.aliexpress.com/w/wholesale-darth-vader.html
+https://www.lego.com/en-us/themes/star-wars
```
Solution
-
- ```js
- const url = "https://www.aliexpress.com/w/wholesale-darth-vader.html";
- const response = await fetch(url);
-
- if (response.ok) {
- console.log(await response.text());
- } else {
- throw new Error(`HTTP ${response.status}`);
- }
- ```
-
+ {LegoExercise.code}
### Save downloaded HTML as a file
diff --git a/sources/academy/webscraping/scraping_basics_javascript/05_parsing_html.md b/sources/academy/webscraping/scraping_basics_javascript/05_parsing_html.md
index 78604a16fa..2b88caa15d 100644
--- a/sources/academy/webscraping/scraping_basics_javascript/05_parsing_html.md
+++ b/sources/academy/webscraping/scraping_basics_javascript/05_parsing_html.md
@@ -5,8 +5,11 @@ description: Lesson about building a Node.js application for watching prices. Us
slug: /scraping-basics-javascript/parsing-html
---
+import CodeBlock from '@theme/CodeBlock';
import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition';
import Exercises from '../scraping_basics/_exercises.mdx';
+import F1AcademyTeamsExercise from '!!raw-loader!roa-loader!./exercises/f1academy_teams.mjs';
+import F1AcademyDriversExercise from '!!raw-loader!roa-loader!./exercises/f1academy_drivers.mjs';
@@ -183,22 +186,7 @@ https://www.f1academy.com/Racing-Series/Teams
Solution
-
- ```js
- import * as cheerio from 'cheerio';
-
- const url = "https://www.f1academy.com/Racing-Series/Teams";
- const response = await fetch(url);
-
- if (response.ok) {
- const html = await response.text();
- const $ = cheerio.load(html);
- console.log($(".teams-driver-item").length);
- } else {
- throw new Error(`HTTP ${response.status}`);
- }
- ```
-
+ {F1AcademyTeamsExercise.code}
### Scrape F1 Academy drivers
@@ -207,20 +195,5 @@ Use the same URL as in the previous exercise, but this time print a total count
Solution
-
- ```js
- import * as cheerio from 'cheerio';
-
- const url = "https://www.f1academy.com/Racing-Series/Teams";
- const response = await fetch(url);
-
- if (response.ok) {
- const html = await response.text();
- const $ = cheerio.load(html);
- console.log($(".driver").length);
- } else {
- throw new Error(`HTTP ${response.status}`);
- }
- ```
-
+ {F1AcademyDriversExercise.code}
diff --git a/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md b/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md
index 8193597053..d6666fcbd1 100644
--- a/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md
+++ b/sources/academy/webscraping/scraping_basics_javascript/06_locating_elements.md
@@ -5,8 +5,12 @@ description: Lesson about building a Node.js application for watching prices. Us
slug: /scraping-basics-javascript/locating-elements
---
+import CodeBlock from '@theme/CodeBlock';
import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition';
import Exercises from '../scraping_basics/_exercises.mdx';
+import WikipediaCountriesExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_countries.mjs';
+import WikipediaCountriesSingleSelectorExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_countries_single_selector.mjs';
+import GuardianF1TitlesExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_titles.mjs';
@@ -238,36 +242,7 @@ Djibouti
Solution
-
- ```js
- import * as cheerio from 'cheerio';
-
- const url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa";
- const response = await fetch(url);
-
- if (response.ok) {
- const html = await response.text();
- const $ = cheerio.load(html);
-
- for (const tableElement of $(".wikitable").toArray()) {
- const $table = $(tableElement);
- const $rows = $table.find("tr");
-
- for (const rowElement of $rows.toArray()) {
- const $row = $(rowElement);
- const $cells = $row.find("td");
-
- if ($cells.length > 0) {
- const $thirdColumn = $($cells[2]);
- const $link = $thirdColumn.find("a").first();
- console.log($link.text());
- }
- }
- }
- } else {
- throw new Error(`HTTP ${response.status}`);
- }
- ```
+ {WikipediaCountriesExercise.code}
Because some rows contain [table headers](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/th), we skip processing a row if `table_row.select("td")` doesn't find any [table data](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td) cells.
@@ -288,27 +263,7 @@ You may want to check out the following pages:
Solution
-
- ```js
- import * as cheerio from 'cheerio';
-
- const url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa";
- const response = await fetch(url);
-
- if (response.ok) {
- const html = await response.text();
- const $ = cheerio.load(html);
-
- for (const element of $(".wikitable tr td:nth-child(3)").toArray()) {
- const $nameCell = $(element);
- const $link = $nameCell.find("a").first();
- console.log($link.text());
- }
- } else {
- throw new Error(`HTTP ${response.status}`);
- }
- ```
-
+ {WikipediaCountriesSingleSelectorExercise.code}
### Scrape F1 news
@@ -330,23 +285,5 @@ Max Verstappen wins Canadian Grand Prix: F1 – as it happened
Solution
-
- ```js
- import * as cheerio from 'cheerio';
-
- const url = "https://www.theguardian.com/sport/formulaone";
- const response = await fetch(url);
-
- if (response.ok) {
- const html = await response.text();
- const $ = cheerio.load(html);
-
- for (const element of $("#maincontent ul li h3").toArray()) {
- console.log($(element).text());
- }
- } else {
- throw new Error(`HTTP ${response.status}`);
- }
- ```
-
+ {GuardianF1TitlesExercise.code}
diff --git a/sources/academy/webscraping/scraping_basics_javascript/07_extracting_data.md b/sources/academy/webscraping/scraping_basics_javascript/07_extracting_data.md
index dae6bda605..74b440b4bc 100644
--- a/sources/academy/webscraping/scraping_basics_javascript/07_extracting_data.md
+++ b/sources/academy/webscraping/scraping_basics_javascript/07_extracting_data.md
@@ -5,8 +5,12 @@ description: Lesson about building a Node.js application for watching prices. Us
slug: /scraping-basics-javascript/extracting-data
---
+import CodeBlock from '@theme/CodeBlock';
import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition';
import Exercises from '../scraping_basics/_exercises.mdx';
+import WarehouseUnitsExercise from '!!raw-loader!roa-loader!./exercises/warehouse_units.mjs';
+import WarehouseUnitsRegexExercise from '!!raw-loader!roa-loader!./exercises/warehouse_units_regex.mjs';
+import GuardianPublishDatesExercise from '!!raw-loader!roa-loader!./exercises/guardian_publish_dates.mjs';
@@ -239,42 +243,7 @@ Denon AH-C720 In-Ear Headphones | 236
Solution
-
- ```js
- import * as cheerio from 'cheerio';
-
- function parseUnitsText(text) {
- const count = text
- .replace("In stock,", "")
- .replace("Only", "")
- .replace(" left", "")
- .replace("units", "")
- .trim();
- return count === "Sold out" ? 0 : parseInt(count);
- }
-
- const url = "https://warehouse-theme-metal.myshopify.com/collections/sales";
- const response = await fetch(url);
-
- if (response.ok) {
- const html = await response.text();
- const $ = cheerio.load(html);
-
- for (const element of $(".product-item").toArray()) {
- const $productItem = $(element);
-
- const title = $productItem.find(".product-item__title");
- const title = $title.text().trim();
-
- const unitsText = $productItem.find(".product-item__inventory").text();
- const unitsCount = parseUnitsText(unitsText);
-
- console.log(`${title} | ${unitsCount}`);
- }
- } else {
- throw new Error(`HTTP ${response.status}`);
- }
- ```
+ {WarehouseUnitsExercise.code}
:::tip Conditional (ternary) operator
@@ -290,40 +259,7 @@ Simplify the code from previous exercise. Use [regular expressions](https://deve
Solution
-
- ```js
- import * as cheerio from 'cheerio';
-
- function parseUnitsText(text) {
- const match = text.match(/\d+/);
- if (match) {
- return parseInt(match[0]);
- }
- return 0;
- }
-
- const url = "https://warehouse-theme-metal.myshopify.com/collections/sales";
- const response = await fetch(url);
-
- if (response.ok) {
- const html = await response.text();
- const $ = cheerio.load(html);
-
- for (const element of $(".product-item").toArray()) {
- const $productItem = $(element);
-
- const $title = $productItem.find(".product-item__title");
- const title = $title.text().trim();
-
- const unitsText = $productItem.find(".product-item__inventory").text();
- const unitsCount = parseUnitsText(unitsText);
-
- console.log(`${title} | ${unitsCount}`);
- }
- } else {
- throw new Error(`HTTP ${response.status}`);
- }
- ```
+ {WarehouseUnitsRegexExercise.code}
:::tip Conditional (ternary) operator
@@ -362,35 +298,5 @@ Hamilton reveals distress over ‘devastating’ groundhog accident at Canadian
Solution
-
- ```js
- import * as cheerio from 'cheerio';
-
- const url = "https://www.theguardian.com/sport/formulaone";
- const response = await fetch(url);
-
- if (response.ok) {
- const html = await response.text();
- const $ = cheerio.load(html);
-
- for (const element of $("#maincontent ul li").toArray()) {
- const $article = $(element);
-
- const title = $article
- .find("h3")
- .text()
- .trim();
- const dateText = $article
- .find("time")
- .attr("datetime")
- .trim();
- const date = new Date(dateText);
-
- console.log(`${title} | ${date.toDateString()}`);
- }
- } else {
- throw new Error(`HTTP ${response.status}`);
- }
- ```
-
+ {GuardianPublishDatesExercise.code}
diff --git a/sources/academy/webscraping/scraping_basics_javascript/08_saving_data.md b/sources/academy/webscraping/scraping_basics_javascript/08_saving_data.md
index bd960e9b5c..6a312d9570 100644
--- a/sources/academy/webscraping/scraping_basics_javascript/08_saving_data.md
+++ b/sources/academy/webscraping/scraping_basics_javascript/08_saving_data.md
@@ -5,7 +5,9 @@ description: Lesson about building a Node.js application for watching prices. Us
slug: /scraping-basics-javascript/saving-data
---
+import CodeBlock from '@theme/CodeBlock';
import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition';
+import ProcessProductsJsonExercise from '!!raw-loader!roa-loader!./exercises/process_products_json.mjs';
@@ -209,17 +211,7 @@ Write a new Node.js program that reads the `products.json` file we created in th
Solution
-
- ```js
- import { readFile } from "fs/promises";
-
- const jsonData = await readFile("products.json");
- const data = JSON.parse(jsonData);
- data
- .filter(row => row.minPrice > 50000)
- .forEach(row => console.log(row));
- ```
-
+ {ProcessProductsJsonExercise.code}
### Process your CSV
diff --git a/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md b/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md
index e923a8875d..8670a0536e 100644
--- a/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md
+++ b/sources/academy/webscraping/scraping_basics_javascript/09_getting_links.md
@@ -5,8 +5,11 @@ description: Lesson about building a Node.js application for watching prices. Us
slug: /scraping-basics-javascript/getting-links
---
+import CodeBlock from '@theme/CodeBlock';
import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition';
import Exercises from '../scraping_basics/_exercises.mdx';
+import WikipediaCountryLinksExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_country_links.mjs';
+import GuardianF1LinksExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_links.mjs';
@@ -341,28 +344,7 @@ https://en.wikipedia.org/wiki/Botswana
Solution
-
- ```js
- import * as cheerio from 'cheerio';
-
- const listingURL = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa";
- const response = await fetch(listingURL);
-
- if (response.ok) {
- const html = await response.text();
- const $ = cheerio.load(html);
-
- for (const element of $(".wikitable tr td:nth-child(3)").toArray()) {
- const nameCell = $(element);
- const link = nameCell.find("a").first();
- const url = new URL(link.attr("href"), listingURL).href;
- console.log(url);
- }
- } else {
- throw new Error(`HTTP ${response.status}`);
- }
- ```
-
+ {WikipediaCountryLinksExercise.code}
### Scrape links to F1 news
@@ -385,26 +367,7 @@ https://www.theguardian.com/sport/article/2024/sep/02/max-verstappen-damns-his-u
Solution
-
- ```js
- import * as cheerio from 'cheerio';
-
- const listingURL = "https://www.theguardian.com/sport/formulaone";
- const response = await fetch(listingURL);
-
- if (response.ok) {
- const html = await response.text();
- const $ = cheerio.load(html);
-
- for (const element of $("#maincontent ul li").toArray()) {
- const link = $(element).find("a").first();
- const url = new URL(link.attr("href"), listingURL).href;
- console.log(url);
- }
- } else {
- throw new Error(`HTTP ${response.status}`);
- }
- ```
+ {GuardianF1LinksExercise.code}
Note that some cards contain two links. One leads to the article, and one to the comments. If we selected all the links in the list by `#maincontent ul li a`, we would get incorrect output like this:
diff --git a/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md b/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md
index 926fd6d839..7fb737c293 100644
--- a/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md
+++ b/sources/academy/webscraping/scraping_basics_javascript/10_crawling.md
@@ -5,8 +5,11 @@ description: Lesson about building a Node.js application for watching prices. Us
slug: /scraping-basics-javascript/crawling
---
+import CodeBlock from '@theme/CodeBlock';
import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition';
import Exercises from '../scraping_basics/_exercises.mdx';
+import WikipediaCallingCodesExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_calling_codes.mjs';
+import GuardianF1AuthorsExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_authors.mjs';
@@ -236,45 +239,7 @@ Locating cells in tables is sometimes easier if you know how to [filter](https:/
Solution
-
- ```js
- import * as cheerio from 'cheerio';
-
- async function download(url) {
- const response = await fetch(url);
- if (response.ok) {
- const html = await response.text();
- return cheerio.load(html);
- } else {
- throw new Error(`HTTP ${response.status}`);
- }
- }
-
- const listingURL = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa";
- const $ = await download(listingURL);
-
- const $cells = $(".wikitable tr td:nth-child(3)");
- const promises = $cells.toArray().map(async element => {
- const $nameCell = $(element);
- const $link = $nameCell.find("a").first();
- const countryURL = new URL($link.attr("href"), listingURL).href;
-
- const $c = await download(countryURL);
- const $label = $c("th.infobox-label")
- .filter((i, element) => $c(element).text().trim() == "Calling code")
- .first();
- const callingCode = $label
- .parent()
- .find("td.infobox-data")
- .first()
- .text()
- .trim();
-
- console.log(`${countryURL} ${callingCode || null}`);
- });
- await Promise.all(promises);
- ```
-
+ {WikipediaCallingCodesExercise.code}
### Scrape authors of F1 news articles
@@ -305,37 +270,5 @@ PA Media: Lewis Hamilton reveals lifelong battle with depression after school bu
Solution
-
- ```js
- import * as cheerio from 'cheerio';
-
- async function download(url) {
- const response = await fetch(url);
- if (response.ok) {
- const html = await response.text();
- return cheerio.load(html);
- } else {
- throw new Error(`HTTP ${response.status}`);
- }
- }
-
- const listingURL = "https://www.theguardian.com/sport/formulaone";
- const $ = await download(listingURL);
-
- const promises = $("#maincontent ul li").toArray().map(async element => {
- const $item = $(element);
- const $link = $item.find("a").first();
- const authorURL = new URL($link.attr("href"), listingURL).href;
-
- const $a = await download(authorURL);
- const title = $a("h1").text().trim();
-
- const author = $a('a[rel="author"]').text().trim();
- const address = $a('aside address').text().trim();
-
- console.log(`${author || address || null}: ${title}`);
- });
- await Promise.all(promises);
- ```
-
+ {GuardianF1AuthorsExercise.code}
diff --git a/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md
index 2d4044240a..3a85eec446 100644
--- a/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md
+++ b/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md
@@ -5,8 +5,11 @@ description: Lesson about building a Node.js application for watching prices. Us
slug: /scraping-basics-javascript/scraping-variants
---
+import CodeBlock from '@theme/CodeBlock';
import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition';
import Exercises from '../scraping_basics/_exercises.mdx';
+import NpmLlmPackagesExercise from '!!raw-loader!roa-loader!./exercises/npm_llm_packages.mjs';
+import CnnSportsShortestArticleExercise from '!!raw-loader!roa-loader!./exercises/cnn_sports_shortest_article.mjs';
@@ -386,61 +389,7 @@ Your output should look something like this:
After inspecting the registry, you'll notice that packages with the keyword "LLM" have a dedicated URL. Also, changing the sorting dropdown results in a page with its own URL. We'll use that as our starting point, which saves us from having to scrape the whole registry and then filter by keyword or sort by the number of dependents.
- ```js
- import * as cheerio from 'cheerio';
-
- async function download(url) {
- const response = await fetch(url);
- if (response.ok) {
- const html = await response.text();
- return cheerio.load(html);
- } else {
- throw new Error(`HTTP ${response.status}`);
- }
- }
-
- const listingURL = "https://www.npmjs.com/search?page=0&q=keywords%3Allm&sortBy=dependent_count";
- const $ = await download(listingURL);
-
- const promises = $("section").toArray().map(async element => {
- const $card = $(element);
-
- const details = $card
- .children()
- .first()
- .children()
- .last()
- .text()
- .split("•");
- const updatedText = details[2].trim();
- const dependents = parseInt(details[3].replace("dependents", "").trim());
-
- if (updatedText.includes("years ago")) {
- const yearsAgo = parseInt(updatedText.replace("years ago", "").trim());
- if (yearsAgo > 2) {
- return null;
- }
- }
-
- const $link = $card.find("a").first();
- const name = $link.text().trim();
- const url = new URL($link.attr("href"), listingURL).href;
- const description = $card.find("p").text().trim();
-
- const downloadsText = $card
- .children()
- .last()
- .text()
- .replace(",", "")
- .trim();
- const downloads = parseInt(downloadsText);
-
- return { name, url, description, dependents, downloads };
- });
-
- const data = await Promise.all(promises);
- console.log(data.filter(item => item !== null).splice(0, 5));
- ```
+ {NpmLlmPackagesExercise.code}
Since the HTML doesn't contain any descriptive classes, we must rely on its structure. We're using [`.children()`](https://cheerio.js.org/docs/api/classes/Cheerio#children) to carefully navigate the HTML element tree.
@@ -462,39 +411,5 @@ At the time of writing, the shortest article on the CNN Sports homepage is [abou
Solution
-
- ```js
- import * as cheerio from 'cheerio';
-
- async function download(url) {
- const response = await fetch(url);
- if (response.ok) {
- const html = await response.text();
- return cheerio.load(html);
- } else {
- throw new Error(`HTTP ${response.status}`);
- }
- }
-
- const listingURL = "https://edition.cnn.com/sport";
- const $ = await download(listingURL);
-
- const promises = $(".layout__main .card").toArray().map(async element => {
- const $link = $(element).find("a").first();
- const articleURL = new URL($link.attr("href"), listingURL).href;
-
- const $a = await download(articleURL);
- const content = $a(".article__content").text().trim();
-
- return { url: articleURL, length: content.length };
- });
-
- const data = await Promise.all(promises);
- const nonZeroData = data.filter(({ url, length }) => length > 0);
- nonZeroData.sort((a, b) => a.length - b.length);
- const shortestItem = nonZeroData[0];
-
- console.log(shortestItem.url);
- ```
-
+ {CnnSportsShortestArticleExercise.code}
diff --git a/sources/academy/webscraping/scraping_basics_javascript/12_framework.md b/sources/academy/webscraping/scraping_basics_javascript/12_framework.md
index e4d45aef47..b2e86624b1 100644
--- a/sources/academy/webscraping/scraping_basics_javascript/12_framework.md
+++ b/sources/academy/webscraping/scraping_basics_javascript/12_framework.md
@@ -5,8 +5,11 @@ description: Lesson about building a Node.js application for watching prices. Us
slug: /scraping-basics-javascript/framework
---
+import CodeBlock from '@theme/CodeBlock';
import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition';
import Exercises from '../scraping_basics/_exercises.mdx';
+import CrawleeF1DriversExercise from '!!raw-loader!roa-loader!./exercises/crawlee_f1_drivers.mjs';
+import CrawleeNetflixRatingsExercise from '!!raw-loader!roa-loader!./exercises/crawlee_netflix_ratings.mjs';
@@ -421,44 +424,7 @@ If you export the dataset as JSON, it should look something like this:
Solution
-
- ```js
- import { CheerioCrawler } from 'crawlee';
-
- const crawler = new CheerioCrawler({
- async requestHandler({ $, request, enqueueLinks, pushData }) {
- if (request.label === 'DRIVER') {
- const info = {};
- for (const itemElement of $('.common-driver-info li').toArray()) {
- const name = $(itemElement).find('span').text().trim();
- const value = $(itemElement).find('h4').text().trim();
- info[name] = value;
- }
- const detail = {};
- for (const linkElement of $('.driver-detail--cta-group a').toArray()) {
- const name = $(linkElement).find('p').text().trim();
- const value = $(linkElement).find('h2').text().trim();
- detail[name] = value;
- });
- const [dobDay, dobMonth, dobYear] = info['DOB'].split("/");
- pushData({
- url: request.url,
- name: $('h1').text().trim(),
- team: detail['Team'],
- nationality: info['Nationality'],
- dob: `${dobYear}-${dobMonth}-${dobDay}`,
- instagram_url: $(".common-social-share a[href*='instagram']").attr('href'),
- });
- } else {
- await enqueueLinks({ selector: '.teams-driver-item a', label: 'DRIVER' });
- }
- },
- });
-
- await crawler.run(['https://www.f1academy.com/Racing-Series/Drivers']);
- await crawler.exportData('dataset.json');
- ```
-
+ {CrawleeF1DriversExercise.code}
### Use Crawlee to find the ratings of the most popular Netflix films
@@ -515,41 +481,5 @@ When navigating to the first IMDb search result, you might find it helpful to kn
Solution
-
- ```js
- import { CheerioCrawler, Request } from 'crawlee';
- import { escape } from 'node:querystring';
-
- const crawler = new CheerioCrawler({
- async requestHandler({ $, request, enqueueLinks, pushData, addRequests }) {
- if (request.label === 'IMDB') {
- // handle IMDB film page
- pushData({
- url: request.url,
- title: $('h1').text().trim(),
- rating: $("[data-testid='hero-rating-bar__aggregate-rating__score']").first().text().trim(),
- });
- } else if (request.label === 'IMDB_SEARCH') {
- // handle IMDB search results
- await enqueueLinks({ selector: '.find-result-item a', label: 'IMDB', limit: 1 });
-
- } else if (request.label === 'NETFLIX') {
- // handle Netflix table
- const $buttons = $('[data-uia="top10-table-row-title"] button');
- const requests = $buttons.toArray().map(buttonElement => {
- const name = $(buttonElement).text().trim();
- const imdbSearchUrl = `https://www.imdb.com/find/?q=${escape(name)}&s=tt&ttype=ft`;
- return new Request({ url: imdbSearchUrl, label: 'IMDB_SEARCH' });
- });
- await addRequests($requests.get());
- } else {
- throw new Error(`Unexpected request label: ${request.label}`);
- }
- },
- });
-
- await crawler.run(['https://www.netflix.com/tudum/top10']);
- await crawler.exportData('dataset.json');
- ```
-
+ {CrawleeNetflixRatingsExercise.code}
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/cnn_sports_shortest_article.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/cnn_sports_shortest_article.mjs
new file mode 100644
index 0000000000..c9e0bad89a
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/cnn_sports_shortest_article.mjs
@@ -0,0 +1,40 @@
+import * as cheerio from 'cheerio';
+
+async function download(url) {
+ const response = await fetch(url);
+ if (!response.ok) {
+ throw new Error(`HTTP ${response.status}`);
+ }
+ const html = await response.text();
+ return cheerio.load(html);
+}
+
+const listingUrl = 'https://edition.cnn.com/sport';
+const $ = await download(listingUrl);
+
+const results = await Promise.all(
+ $('.layout__main .card').toArray().map(async (element) => {
+ const $element = $(element);
+ const $link = $element.find('a').first();
+ if (!$link.length) {
+ return null;
+ }
+
+ const articleUrl = new URL($link.attr('href'), listingUrl).href;
+ const $article = await download(articleUrl);
+ const content = $article('.article__content').text().trim();
+
+ if (!content) {
+ return null;
+ }
+
+ return { url: articleUrl, length: content.length };
+ }),
+);
+
+const nonEmpty = results.filter((item) => item && item.length > 0);
+nonEmpty.sort((a, b) => a.length - b.length);
+
+if (nonEmpty.length > 0) {
+ console.log(nonEmpty[0].url);
+}
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_f1_drivers.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_f1_drivers.mjs
new file mode 100644
index 0000000000..da7d2d5518
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_f1_drivers.mjs
@@ -0,0 +1,38 @@
+import { CheerioCrawler } from 'crawlee';
+
+const crawler = new CheerioCrawler({
+ async requestHandler({ $, request, enqueueLinks, pushData }) {
+ if (request.label === 'DRIVER') {
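+      // Handle a driver detail page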
+ const info = {};
+ for (const itemElement of $('.common-driver-info li').toArray()) {
+ const name = $(itemElement).find('span').text().trim();
+ const value = $(itemElement).find('h4').text().trim();
+ info[name] = value;
+ }
+
+ const detail = {};
+ for (const linkElement of $('.driver-detail--cta-group a').toArray()) {
+ const name = $(linkElement).find('p').text().trim();
+ const value = $(linkElement).find('h2').text().trim();
+ detail[name] = value;
+ }
+
+ const dob = info.DOB ?? '';
+ const [dobDay = '', dobMonth = '', dobYear = ''] = dob.split('/');
+
+ await pushData({
+ url: request.url,
+ name: $('h1').text().trim(),
+ team: detail.Team,
+ nationality: info.Nationality,
+ dob: dobYear && dobMonth && dobDay ? `${dobYear}-${dobMonth}-${dobDay}` : null,
+ instagram_url: $(".common-social-share a[href*='instagram']").attr('href') ?? null,
+ });
+ } else {
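+      // Handle the drivers listing page: enqueue links to individual driver pages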
+ await enqueueLinks({ selector: '.teams-driver-item a', label: 'DRIVER' });
+ }
+ },
+});
+
+await crawler.run(['https://www.f1academy.com/Racing-Series/Drivers']);
+await crawler.exportData('dataset.json');
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs
new file mode 100644
index 0000000000..19da811bc3
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs
@@ -0,0 +1,31 @@
+import { escape } from 'node:querystring';
+
+import { CheerioCrawler, Request } from 'crawlee';
+
+const crawler = new CheerioCrawler({
+ async requestHandler({ $, request, enqueueLinks, pushData, addRequests }) {
+ if (request.label === 'IMDB') {
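+      // Handle an IMDb film page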
+ const title = $('h1').text().trim();
+ const rating = $("[data-testid='hero-rating-bar__aggregate-rating__score']").first().text().trim();
+ if (title && rating) {
+ await pushData({
+ url: request.url,
+ title,
+ rating,
+ });
+ }
+ } else if (request.label === 'IMDB_SEARCH') {
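+      // Handle IMDb search results (follow only the first hit)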
+ await enqueueLinks({ selector: '.find-result-item a', label: 'IMDB', limit: 1 });
+ } else {
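+      // Handle the Netflix Top 10 table (the initial request has no label)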
+ const requests = $("[data-uia='top10-table-row-title'] button").toArray().map((buttonElement) => {
+ const name = $(buttonElement).text().trim();
+ const imdbSearchUrl = `https://www.imdb.com/find/?q=${escape(name)}&s=tt&ttype=ft`;
+ return new Request({ url: imdbSearchUrl, label: 'IMDB_SEARCH' });
+ });
+ await addRequests(requests);
+ }
+ },
+});
+
+await crawler.run(['https://www.netflix.com/tudum/top10']);
+await crawler.exportData('dataset.json');
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/f1academy_drivers.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/f1academy_drivers.mjs
new file mode 100644
index 0000000000..c3dee6eb8a
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/f1academy_drivers.mjs
@@ -0,0 +1,13 @@
+import * as cheerio from 'cheerio';
+
+const url = 'https://www.f1academy.com/Racing-Series/Teams';
+const response = await fetch(url);
+
+if (!response.ok) {
+ throw new Error(`HTTP ${response.status}`);
+}
+
+const html = await response.text();
+const $ = cheerio.load(html);
+
+console.log($('.driver').length);
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/f1academy_teams.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/f1academy_teams.mjs
new file mode 100644
index 0000000000..1ffb67adae
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/f1academy_teams.mjs
@@ -0,0 +1,12 @@
+import * as cheerio from 'cheerio';
+
+const url = 'https://www.f1academy.com/Racing-Series/Teams';
+const response = await fetch(url);
+
+if (response.ok) {
+ const html = await response.text();
+ const $ = cheerio.load(html);
+ console.log($('.teams-driver-item').length);
+} else {
+ throw new Error(`HTTP ${response.status}`);
+}
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_f1_authors.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_f1_authors.mjs
new file mode 100644
index 0000000000..ebdc0ecff5
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_f1_authors.mjs
@@ -0,0 +1,36 @@
+import * as cheerio from 'cheerio';
+
+async function download(url) {
+ const response = await fetch(url);
+ if (!response.ok) {
+ throw new Error(`HTTP ${response.status}`);
+ }
+ const html = await response.text();
+ return cheerio.load(html);
+}
+
+const listingUrl = 'https://www.theguardian.com/sport/formulaone';
+const $ = await download(listingUrl);
+
+const promises = $('#maincontent ul li').toArray().map(async (element) => {
+ const $item = $(element);
+ const $link = $item.find('a').first();
+ if (!$link.length) {
+ return;
+ }
+
+ const articleUrl = new URL($link.attr('href'), listingUrl).href;
+ const $article = await download(articleUrl);
+
+ const title = $article('h1').text().trim();
+ if (!title) {
+ return;
+ }
+
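+  // Some articles credit an agency instead of an author; fall back to the <address> byline, then to "null"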
+ const author = $article('a[rel="author"]').first().text().trim();
+ const attribution = author || $article('aside address').first().text().trim() || 'null';
+
+ console.log(`${attribution}: ${title}`);
+});
+
+await Promise.all(promises);
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_f1_links.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_f1_links.mjs
new file mode 100644
index 0000000000..5a38d7dcee
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_f1_links.mjs
@@ -0,0 +1,20 @@
+import * as cheerio from 'cheerio';
+
+const listingUrl = 'https://www.theguardian.com/sport/formulaone';
+const response = await fetch(listingUrl);
+
+if (!response.ok) {
+ throw new Error(`HTTP ${response.status}`);
+}
+
+const html = await response.text();
+const $ = cheerio.load(html);
+
+for (const element of $('#maincontent ul li').toArray()) {
+ const $item = $(element);
+ const $link = $item.find('a').first();
+ if ($link.length) {
+ const url = new URL($link.attr('href'), listingUrl).href;
+ console.log(url);
+ }
+}
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_f1_titles.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_f1_titles.mjs
new file mode 100644
index 0000000000..33486b9cd0
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_f1_titles.mjs
@@ -0,0 +1,18 @@
+import * as cheerio from 'cheerio';
+
+const url = 'https://www.theguardian.com/sport/formulaone';
+const response = await fetch(url);
+
+if (!response.ok) {
+ throw new Error(`HTTP ${response.status}`);
+}
+
+const html = await response.text();
+const $ = cheerio.load(html);
+
+for (const element of $('#maincontent ul li h3').toArray()) {
+ const title = $(element).text().trim();
+ if (title) {
+ console.log(title);
+ }
+}
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_publish_dates.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_publish_dates.mjs
new file mode 100644
index 0000000000..ba6d13ba9e
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/guardian_publish_dates.mjs
@@ -0,0 +1,28 @@
+import * as cheerio from 'cheerio';
+
+const url = 'https://www.theguardian.com/sport/formulaone';
+const response = await fetch(url);
+
+if (!response.ok) {
+ throw new Error(`HTTP ${response.status}`);
+}
+
+const html = await response.text();
+const $ = cheerio.load(html);
+
+for (const element of $('#maincontent ul li').toArray()) {
+ const $article = $(element);
+ const title = $article.find('h3').text().trim();
+ const dateAttr = $article.find('time').attr('datetime');
+
+ if (!title || !dateAttr) {
+ continue;
+ }
+
+ const date = new Date(dateAttr.trim());
+ if (Number.isNaN(date.getTime())) {
+ continue;
+ }
+
+ console.log(`${title} | ${date.toDateString()}`);
+}
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/lego.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/lego.mjs
new file mode 100644
index 0000000000..50abb8de39
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/lego.mjs
@@ -0,0 +1,8 @@
+const url = 'https://www.lego.com/en-us/themes/star-wars';
+const response = await fetch(url);
+
+if (response.ok) {
+ console.log(await response.text());
+} else {
+ throw new Error(`HTTP ${response.status}`);
+}
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/npm_llm_packages.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/npm_llm_packages.mjs
new file mode 100644
index 0000000000..f52a885057
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/npm_llm_packages.mjs
@@ -0,0 +1,61 @@
+import * as cheerio from 'cheerio';
+
+async function download(url) {
+ const response = await fetch(url);
+ if (!response.ok) {
+ throw new Error(`HTTP ${response.status}`);
+ }
+ const html = await response.text();
+ return cheerio.load(html);
+}
+
+function parseNumber(text) {
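+  // Keep only the digits (e.g. "1,234 dependents" -> 1234) before parsing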
+ return Number.parseInt(text.replace(/[^0-9]/g, ''), 10);
+}
+
+const listingUrl = 'https://www.npmjs.com/search?page=0&q=keywords%3Allm&sortBy=dependent_count';
+const $ = await download(listingUrl);
+
+const promises = $('section').toArray().map(async (element) => {
+ const $card = $(element);
+ const $link = $card.find('a').first();
+ if (!$link.length) {
+ return null;
+ }
+
+ const details = $card
+ .children()
+ .first()
+ .children()
+ .last()
+ .text()
+ .split('•')
+ .map((item) => item.trim());
+
+ const updatedText = details[2] ?? '';
+ const dependentsText = details[3] ?? '';
+ const dependents = parseNumber(dependentsText);
+
+ if (updatedText.includes('years ago')) {
+ const yearsAgo = parseNumber(updatedText);
+ if (Number.isFinite(yearsAgo) && yearsAgo > 2) {
+ return null;
+ }
+ }
+
+ const name = $link.text().trim();
+ const url = new URL($link.attr('href'), listingUrl).href;
+ const description = $card.find('p').text().trim();
+
+ const downloadsText = $card
+ .children()
+ .last()
+ .text()
+ .trim();
+ const downloads = parseNumber(downloadsText);
+
+ return { name, url, description, dependents, downloads };
+});
+
+const data = (await Promise.all(promises)).filter((item) => item);
+console.log(data.slice(0, 5));
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/process_products_json.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/process_products_json.mjs
new file mode 100644
index 0000000000..a7c951090a
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/process_products_json.mjs
@@ -0,0 +1,8 @@
+import { readFile } from 'node:fs/promises';
+
+const jsonData = await readFile('products.json', 'utf8');
+const data = JSON.parse(jsonData);
+
+data
+ .filter((row) => row.minPrice > 50000)
+ .forEach((row) => console.log(row));
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats
new file mode 100644
index 0000000000..618b64cd14
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats
@@ -0,0 +1,150 @@
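+# Install the solutions' dependencies once per file; teardowns remove generated files again.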
+setup_file() {
+ cd "$BATS_TEST_DIRNAME"
+ npm init --yes
+ npm install cheerio crawlee
+}
+
+teardown() {
+ rm -rf products.json storage dataset.json
+}
+
+teardown_file() {
+ rm -rf node_modules package.json package-lock.json
+}
+
+
+@test "outputs the HTML with Star Wars products" {
+ run node lego.mjs
+
+ [[ "$output" == *"Millennium Falcon"* ]]
+}
+
+@test "counts the number of F1 Academy teams" {
+ run node f1academy_teams.mjs
+
+ [[ "$output" == "6" ]]
+}
+
+@test "counts the number of F1 Academy drivers" {
+ run node f1academy_drivers.mjs
+
+ [[ "$output" == "18" ]]
+}
+
+@test "lists African countries" {
+ run node wikipedia_countries.mjs
+
+ [[ "$output" == *$'Comoros\nDemocratic Republic of the Congo\n'* ]]
+ [[ $(echo "$output" | wc -l) -gt 5 ]]
+}
+
+@test "lists African countries with a single selector" {
+ run node wikipedia_countries_single_selector.mjs
+
+ [[ "$output" == *$'Comoros\nDemocratic Republic of the Congo\n'* ]]
+ [[ $(echo "$output" | wc -l) -gt 5 ]]
+}
+
+@test "lists Guardian F1 article titles" {
+ run node guardian_f1_titles.mjs
+
+ [[ "$output" == *' F1 '* ]]
+ [[ $(echo "$output" | wc -l) -gt 5 ]]
+}
+
+@test "prints warehouse stock counts" {
+ run node warehouse_units.mjs
+
+ [[ "$output" == *$'JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672\n'* ]]
+ [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 77\n'* ]]
+ [[ $(echo "$output" | wc -l) -gt 5 ]]
+}
+
+@test "prints warehouse stock counts using regex" {
+ run node warehouse_units_regex.mjs
+
+ [[ "$output" == *$'JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672\n'* ]]
+ [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 77\n'* ]]
+ [[ $(echo "$output" | wc -l) -gt 5 ]]
+}
+
+@test "prints Guardian F1 titles with publish dates" {
+ run node guardian_publish_dates.mjs
+
+ [[ "$output" == *' F1 '* ]]
+ [[ "$output" == *' | Sun '* ]] # has info about date, Sundays are very likely
+ [[ $(echo "$output" | wc -l) -gt 5 ]]
+}
+
+@test "filters products from JSON" {
+ echo '[{"title":"Premium Speakers","minPrice":75000,"price":75000},{"title":"Budget Headphones","minPrice":25000,"price":25000}]' > products.json
+
+ run node process_products_json.mjs
+
+ [[ "$output" == "{ title: 'Premium Speakers', minPrice: 75000, price: 75000 }" ]]
+}
+
+@test "lists Wikipedia country links" {
+ run node wikipedia_country_links.mjs
+
+ [[ "$output" == *$'https://en.wikipedia.org/wiki/Algeria\nhttps://en.wikipedia.org/wiki/Angola\n'* ]]
+ [[ "$output" == *$'https://en.wikipedia.org/wiki/R%C3%A9union\n'* ]]
+ [[ $(echo "$output" | wc -l) -gt 5 ]]
+}
+
+@test "lists Guardian F1 article links" {
+ run node guardian_f1_links.mjs
+
+ [[ "$output" == *'https://www.theguardian.com/sport/'* ]]
+ [[ $(echo "$output" | wc -l) -gt 5 ]]
+}
+
+@test "prints Wikipedia calling codes" {
+ run node wikipedia_calling_codes.mjs
+
+ [[ "$output" == *$'https://en.wikipedia.org/wiki/Comoros +269\n'* ]]
+ [[ "$output" == *$'https://en.wikipedia.org/wiki/Sahrawi_Arab_Democratic_Republic null\n'* ]]
+ [[ $(echo "$output" | wc -l) -gt 5 ]]
+}
+
+@test "lists Guardian F1 authors" {
+ run node guardian_f1_authors.mjs
+
+ [[ "$output" == *' F1 '* ]]
+ [[ "$output" == *'Giles Richards: '* ]] # writes most of them (we'll have to change this if they fire him)
+ [[ "$output" == *'Guardian sport: '* || "$output" == *'PM Media: '* ]]
+ [[ $(echo "$output" | wc -l) -gt 5 ]]
+}
+
+@test "lists npm LLM packages" {
+ run node npm_llm_packages.mjs
+
+ (( status == 0 ))
+ [[ -n "$output" ]]
+}
+
+@test "finds the shortest CNN sports article" {
+ run node cnn_sports_shortest_article.mjs
+
+ [[ "$output" == 'https://edition.cnn.com/'* ]]
+}
+
+@test "scrapes F1 Academy driver details with Crawlee" {
+ run node crawlee_f1_drivers.mjs
+
+ (( status == 0 ))
+ [[ -f dataset.json ]]
+ [[ $(cat dataset.json | jq '. | length') == "18" ]]
+ [[ $(cat dataset.json | jq -c '.[0] | keys') == '["dob","instagram_url","name","nationality","team","url"]' ]]
+ [[ $(cat dataset.json | jq '.[].url') == *"https://www.f1academy.com/Racing-Series/Drivers/"* ]]
+}
+
+@test "scrapes Netflix ratings with Crawlee" {
+ run node crawlee_netflix_ratings.mjs
+
+ (( status == 0 ))
+ [[ -f dataset.json ]]
+ [[ $(cat dataset.json | jq '. | length') == "10" ]]
+ [[ $(cat dataset.json | jq -c '.[0] | keys') == '["rating","title","url"]' ]]
+ [[ $(cat dataset.json | jq '.[].url') == *"https://www.imdb.com/title/"* ]]
+}
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/warehouse_units.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/warehouse_units.mjs
new file mode 100644
index 0000000000..d28745259a
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/warehouse_units.mjs
@@ -0,0 +1,36 @@
+import * as cheerio from 'cheerio';
+
+function parseUnitsText(text) {
+ const count = text
+ .replace('In stock,', '')
+ .replace('Only', '')
+ .replace(' left', '')
+ .replace('units', '')
+ .trim();
+ if (count === 'Sold out') {
+ return 0;
+ }
+ return Number.parseInt(count, 10);
+}
+
+const url = 'https://warehouse-theme-metal.myshopify.com/collections/sales';
+const response = await fetch(url);
+
+if (!response.ok) {
+ throw new Error(`HTTP ${response.status}`);
+}
+
+const html = await response.text();
+const $ = cheerio.load(html);
+
+for (const element of $('.product-item').toArray()) {
+ const $productItem = $(element);
+
+ const $title = $productItem.find('.product-item__title');
+ const title = $title.text().trim();
+
+ const unitsText = $productItem.find('.product-item__inventory').text();
+ const unitsCount = parseUnitsText(unitsText);
+
+ console.log(`${title} | ${unitsCount}`);
+}
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/warehouse_units_regex.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/warehouse_units_regex.mjs
new file mode 100644
index 0000000000..91ab01f234
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/warehouse_units_regex.mjs
@@ -0,0 +1,31 @@
+import * as cheerio from 'cheerio';
+
+function parseUnitsText(text) {
+ const match = text.match(/\d+/);
+ if (match) {
+ return Number.parseInt(match[0], 10);
+ }
+ return 0;
+}
+
+const url = 'https://warehouse-theme-metal.myshopify.com/collections/sales';
+const response = await fetch(url);
+
+if (!response.ok) {
+ throw new Error(`HTTP ${response.status}`);
+}
+
+const html = await response.text();
+const $ = cheerio.load(html);
+
+for (const element of $('.product-item').toArray()) {
+ const $productItem = $(element);
+
+ const $title = $productItem.find('.product-item__title');
+ const title = $title.text().trim();
+
+ const unitsText = $productItem.find('.product-item__inventory').text();
+ const unitsCount = parseUnitsText(unitsText);
+
+ console.log(`${title} | ${unitsCount}`);
+}
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_calling_codes.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_calling_codes.mjs
new file mode 100644
index 0000000000..02443b1ba8
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_calling_codes.mjs
@@ -0,0 +1,37 @@
+import * as cheerio from 'cheerio';
+
+async function download(url) {
+ const response = await fetch(url);
+ if (!response.ok) {
+ throw new Error(`HTTP ${response.status}`);
+ }
+ const html = await response.text();
+ return cheerio.load(html);
+}
+
+const listingUrl = 'https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa';
+const $ = await download(listingUrl);
+
+const cells = $('.wikitable tr td:nth-child(3)');
+const promises = cells.toArray().map(async (element) => {
+ const $nameCell = $(element);
+ const $link = $nameCell.find('a').first();
+ if (!$link.length) {
+ return;
+ }
+
+ const countryUrl = new URL($link.attr('href'), listingUrl).href;
+ const $country = await download(countryUrl);
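+  // Find the infobox row whose label reads "Calling code"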
+ const $label = $country('th.infobox-label')
+ .filter((_, el) => $country(el).text().trim() === 'Calling code')
+ .first();
+
+ const callingCode = $label.length
+ ? $label.parent().find('td.infobox-data').first().text()
+      .trim()
+ : '';
+
+ console.log(`${countryUrl} ${callingCode || null}`);
+});
+
+await Promise.all(promises);
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries.mjs
new file mode 100644
index 0000000000..fd9a7f2fb9
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries.mjs
@@ -0,0 +1,29 @@
+import * as cheerio from 'cheerio';
+
+const url = 'https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa';
+const response = await fetch(url);
+
+if (!response.ok) {
+ throw new Error(`HTTP ${response.status}`);
+}
+
+const html = await response.text();
+const $ = cheerio.load(html);
+
+for (const tableElement of $('.wikitable').toArray()) {
+ const $table = $(tableElement);
+ const rows = $table.find('tr');
+
+ for (const rowElement of rows.toArray()) {
+ const $row = $(rowElement);
+ const cells = $row.find('td');
+
+ if (cells.length > 0) {
+ const $thirdColumn = $(cells[2]);
+ const $link = $thirdColumn.find('a').first();
+ if ($link.length) {
+ console.log($link.text());
+ }
+ }
+ }
+}
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries_single_selector.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries_single_selector.mjs
new file mode 100644
index 0000000000..06f54d0686
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_countries_single_selector.mjs
@@ -0,0 +1,19 @@
+import * as cheerio from 'cheerio';
+
+const url = 'https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa';
+const response = await fetch(url);
+
+if (!response.ok) {
+ throw new Error(`HTTP ${response.status}`);
+}
+
+const html = await response.text();
+const $ = cheerio.load(html);
+
+for (const element of $('.wikitable tr td:nth-child(3)').toArray()) {
+ const $nameCell = $(element);
+ const $link = $nameCell.find('a').first();
+ if ($link.length) {
+ console.log($link.text());
+ }
+}
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_country_links.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_country_links.mjs
new file mode 100644
index 0000000000..53c95f8d4d
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/wikipedia_country_links.mjs
@@ -0,0 +1,20 @@
+import * as cheerio from 'cheerio';
+
+const listingUrl = 'https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa';
+const response = await fetch(listingUrl);
+
+if (!response.ok) {
+ throw new Error(`HTTP ${response.status}`);
+}
+
+const html = await response.text();
+const $ = cheerio.load(html);
+
+for (const element of $('.wikitable tr td:nth-child(3)').toArray()) {
+ const $nameCell = $(element);
+ const $link = $nameCell.find('a').first();
+ if ($link.length) {
+ const url = new URL($link.attr('href'), listingUrl).href;
+ console.log(url);
+ }
+}
diff --git a/sources/academy/webscraping/scraping_basics_python/04_downloading_html.md b/sources/academy/webscraping/scraping_basics_python/04_downloading_html.md
index e3866cfcb2..929bf9a6eb 100644
--- a/sources/academy/webscraping/scraping_basics_python/04_downloading_html.md
+++ b/sources/academy/webscraping/scraping_basics_python/04_downloading_html.md
@@ -5,7 +5,9 @@ description: Lesson about building a Python application for watching prices. Usi
slug: /scraping-basics-python/downloading-html
---
+import CodeBlock from '@theme/CodeBlock';
import Exercises from '../scraping_basics/_exercises.mdx';
+import LegoExercise from '!!raw-loader!roa-loader!./exercises/lego.py';
**In this lesson we'll start building a Python application for watching prices. As a first step, we'll use the HTTPX library to download HTML code of a product listing page.**
@@ -139,26 +141,17 @@ Letting our program visibly crash on error is enough for our purposes. Now, let'
-### Scrape AliExpress
+### Scrape LEGO
-Download HTML of a product listing page, but this time from a real world e-commerce website. For example this page with AliExpress search results:
+Download HTML of a product listing page, but this time from a real-world e-commerce website. For example, this page listing LEGO Star Wars products:
```text
-https://www.aliexpress.com/w/wholesale-darth-vader.html
+https://www.lego.com/en-us/themes/star-wars
```
Solution
-
- ```py
- import httpx
-
- url = "https://www.aliexpress.com/w/wholesale-darth-vader.html"
- response = httpx.get(url)
- response.raise_for_status()
- print(response.text)
- ```
-
+ {LegoExercise.code}
### Save downloaded HTML as a file
diff --git a/sources/academy/webscraping/scraping_basics_python/05_parsing_html.md b/sources/academy/webscraping/scraping_basics_python/05_parsing_html.md
index dbfa52cb9a..dfa99ebe23 100644
--- a/sources/academy/webscraping/scraping_basics_python/05_parsing_html.md
+++ b/sources/academy/webscraping/scraping_basics_python/05_parsing_html.md
@@ -5,7 +5,10 @@ description: Lesson about building a Python application for watching prices. Usi
slug: /scraping-basics-python/parsing-html
---
+import CodeBlock from '@theme/CodeBlock';
import Exercises from '../scraping_basics/_exercises.mdx';
+import F1AcademyTeamsExercise from '!!raw-loader!roa-loader!./exercises/f1academy_teams.py';
+import F1AcademyDriversExercise from '!!raw-loader!roa-loader!./exercises/f1academy_drivers.py';
**In this lesson we'll look for products in the downloaded HTML. We'll use BeautifulSoup to turn the HTML into objects which we can work with in our Python program.**
@@ -130,20 +133,7 @@ https://www.f1academy.com/Racing-Series/Teams
Solution
-
- ```py
- import httpx
- from bs4 import BeautifulSoup
-
- url = "https://www.f1academy.com/Racing-Series/Teams"
- response = httpx.get(url)
- response.raise_for_status()
-
- html_code = response.text
- soup = BeautifulSoup(html_code, "html.parser")
- print(len(soup.select(".teams-driver-item")))
- ```
-
+ <CodeBlock language="py">{F1AcademyTeamsExercise.code}</CodeBlock>
### Scrape F1 Academy drivers
@@ -152,18 +142,5 @@ Use the same URL as in the previous exercise, but this time print a total count
Solution
-
- ```py
- import httpx
- from bs4 import BeautifulSoup
-
- url = "https://www.f1academy.com/Racing-Series/Teams"
- response = httpx.get(url)
- response.raise_for_status()
-
- html_code = response.text
- soup = BeautifulSoup(html_code, "html.parser")
- print(len(soup.select(".driver")))
- ```
-
+ <CodeBlock language="py">{F1AcademyDriversExercise.code}</CodeBlock>
diff --git a/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md b/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md
index 0708dc071e..5dce9bc0e6 100644
--- a/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md
+++ b/sources/academy/webscraping/scraping_basics_python/06_locating_elements.md
@@ -5,7 +5,11 @@ description: Lesson about building a Python application for watching prices. Usi
slug: /scraping-basics-python/locating-elements
---
+import CodeBlock from '@theme/CodeBlock';
import Exercises from '../scraping_basics/_exercises.mdx';
+import WikipediaCountriesExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_countries.py';
+import WikipediaCountriesSingleSelectorExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_countries_single_selector.py';
+import GuardianF1TitlesExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_titles.py';
**In this lesson we'll locate product data in the downloaded HTML. We'll use BeautifulSoup to find those HTML elements which contain details about each product, such as title or price.**
@@ -243,26 +247,7 @@ Djibouti
Solution
-
- ```py
- import httpx
- from bs4 import BeautifulSoup
-
- url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa"
- response = httpx.get(url)
- response.raise_for_status()
-
- html_code = response.text
- soup = BeautifulSoup(html_code, "html.parser")
-
- for table in soup.select(".wikitable"):
- for row in table.select("tr"):
- cells = row.select("td")
- if cells:
- third_column = cells[2]
- title_link = third_column.select_one("a")
- print(title_link.text)
- ```
+ <CodeBlock language="py">{WikipediaCountriesExercise.code}</CodeBlock>
Because some rows contain [table headers](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/th), we skip processing a row if `row.select("td")` doesn't find any [table data](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td) cells.
@@ -283,22 +268,7 @@ You may want to check out the following pages:
Solution
-
- ```py
- import httpx
- from bs4 import BeautifulSoup
-
- url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa"
- response = httpx.get(url)
- response.raise_for_status()
-
- html_code = response.text
- soup = BeautifulSoup(html_code, "html.parser")
-
- for name_cell in soup.select(".wikitable tr td:nth-child(3)"):
- print(name_cell.select_one("a").text)
- ```
-
+ <CodeBlock language="py">{WikipediaCountriesSingleSelectorExercise.code}</CodeBlock>
### Scrape F1 news
@@ -320,20 +290,5 @@ Max Verstappen wins Canadian Grand Prix: F1 – as it happened
Solution
-
- ```py
- import httpx
- from bs4 import BeautifulSoup
-
- url = "https://www.theguardian.com/sport/formulaone"
- response = httpx.get(url)
- response.raise_for_status()
-
- html_code = response.text
- soup = BeautifulSoup(html_code, "html.parser")
-
- for title in soup.select("#maincontent ul li h3"):
- print(title.text)
- ```
-
+ <CodeBlock language="py">{GuardianF1TitlesExercise.code}</CodeBlock>
diff --git a/sources/academy/webscraping/scraping_basics_python/07_extracting_data.md b/sources/academy/webscraping/scraping_basics_python/07_extracting_data.md
index eb49b7ce69..ab0c86e589 100644
--- a/sources/academy/webscraping/scraping_basics_python/07_extracting_data.md
+++ b/sources/academy/webscraping/scraping_basics_python/07_extracting_data.md
@@ -5,7 +5,11 @@ description: Lesson about building a Python application for watching prices. Usi
slug: /scraping-basics-python/extracting-data
---
+import CodeBlock from '@theme/CodeBlock';
import Exercises from '../scraping_basics/_exercises.mdx';
+import WarehouseUnitsExercise from '!!raw-loader!roa-loader!./exercises/warehouse_units.py';
+import WarehouseUnitsRegexExercise from '!!raw-loader!roa-loader!./exercises/warehouse_units_regex.py';
+import GuardianPublishDatesExercise from '!!raw-loader!roa-loader!./exercises/guardian_publish_dates.py';
**In this lesson we'll finish extracting product data from the downloaded HTML. With help of basic string manipulation we'll focus on cleaning and correctly representing the product price.**
@@ -240,39 +244,7 @@ Denon AH-C720 In-Ear Headphones | 236
Solution
-
- ```py
- import httpx
- from bs4 import BeautifulSoup
-
- url = "https://warehouse-theme-metal.myshopify.com/collections/sales"
- response = httpx.get(url)
- response.raise_for_status()
-
- html_code = response.text
- soup = BeautifulSoup(html_code, "html.parser")
-
- for product in soup.select(".product-item"):
- title = product.select_one(".product-item__title").text.strip()
-
- units_text = (
- product
- .select_one(".product-item__inventory")
- .text
- .removeprefix("In stock,")
- .removeprefix("Only")
- .removesuffix(" left")
- .removesuffix("units")
- .strip()
- )
- if "Sold out" in units_text:
- units = 0
- else:
- units = int(units_text)
-
- print(title, units, sep=" | ")
- ```
-
+ <CodeBlock language="py">{WarehouseUnitsExercise.code}</CodeBlock>
### Use regular expressions
@@ -281,31 +253,7 @@ Simplify the code from previous exercise. Use [regular expressions](https://docs
Solution
-
- ```py
- import re
- import httpx
- from bs4 import BeautifulSoup
-
- url = "https://warehouse-theme-metal.myshopify.com/collections/sales"
- response = httpx.get(url)
- response.raise_for_status()
-
- html_code = response.text
- soup = BeautifulSoup(html_code, "html.parser")
-
- for product in soup.select(".product-item"):
- title = product.select_one(".product-item__title").text.strip()
-
- units_text = product.select_one(".product-item__inventory").text
- if re_match := re.search(r"\d+", units_text):
- units = int(re_match.group())
- else:
- units = 0
-
- print(title, units, sep=" | ")
- ```
-
+ <CodeBlock language="py">{WarehouseUnitsRegexExercise.code}</CodeBlock>
### Scrape publish dates of F1 news
@@ -337,26 +285,5 @@ Hamilton reveals distress over ‘devastating’ groundhog accident at Canadian
Solution
-
- ```py
- import httpx
- from bs4 import BeautifulSoup
- from datetime import datetime
-
- url = "https://www.theguardian.com/sport/formulaone"
- response = httpx.get(url)
- response.raise_for_status()
-
- html_code = response.text
- soup = BeautifulSoup(html_code, "html.parser")
-
- for article in soup.select("#maincontent ul li"):
- title = article.select_one("h3").text.strip()
-
- date_iso = article.select_one("time")["datetime"].strip()
- date = datetime.fromisoformat(date_iso)
-
- print(title, date.strftime('%a %b %d %Y'), sep=" | ")
- ```
-
+ <CodeBlock language="py">{GuardianPublishDatesExercise.code}</CodeBlock>
diff --git a/sources/academy/webscraping/scraping_basics_python/08_saving_data.md b/sources/academy/webscraping/scraping_basics_python/08_saving_data.md
index a0d6d94743..ef8854e633 100644
--- a/sources/academy/webscraping/scraping_basics_python/08_saving_data.md
+++ b/sources/academy/webscraping/scraping_basics_python/08_saving_data.md
@@ -5,6 +5,9 @@ description: Lesson about building a Python application for watching prices. Usi
slug: /scraping-basics-python/saving-data
---
+import CodeBlock from '@theme/CodeBlock';
+import ProcessProductsJsonExercise from '!!raw-loader!roa-loader!./exercises/process_products_json.py';
+
**In this lesson, we'll save the data we scraped in the popular formats, such as CSV or JSON. We'll use Python's standard library to export the files.**
---
@@ -190,19 +193,7 @@ Write a new Python program that reads the `products.json` file we created in thi
Solution
-
- ```py
- import json
- from pprint import pp
-
- with open("products.json", "r") as file:
- products = json.load(file)
-
- for product in products:
- if int(product["min_price"]) > 500:
- pp(product)
- ```
-
+ <CodeBlock language="py">{ProcessProductsJsonExercise.code}</CodeBlock>
### Process your CSV
diff --git a/sources/academy/webscraping/scraping_basics_python/09_getting_links.md b/sources/academy/webscraping/scraping_basics_python/09_getting_links.md
index 883ba050f3..ea5a79a915 100644
--- a/sources/academy/webscraping/scraping_basics_python/09_getting_links.md
+++ b/sources/academy/webscraping/scraping_basics_python/09_getting_links.md
@@ -5,7 +5,10 @@ description: Lesson about building a Python application for watching prices. Usi
slug: /scraping-basics-python/getting-links
---
+import CodeBlock from '@theme/CodeBlock';
import Exercises from '../scraping_basics/_exercises.mdx';
+import WikipediaCountryLinksExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_country_links.py';
+import GuardianF1LinksExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_links.py';
**In this lesson, we'll locate and extract links to individual product pages. We'll use BeautifulSoup to find the relevant bits of HTML.**
@@ -344,25 +347,7 @@ https://en.wikipedia.org/wiki/Botswana
Solution
-
- ```py
- import httpx
- from bs4 import BeautifulSoup
- from urllib.parse import urljoin
-
- listing_url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa"
- response = httpx.get(listing_url)
- response.raise_for_status()
-
- html_code = response.text
- soup = BeautifulSoup(html_code, "html.parser")
-
- for name_cell in soup.select(".wikitable tr td:nth-child(3)"):
- link = name_cell.select_one("a")
- url = urljoin(listing_url, link["href"])
- print(url)
- ```
-
+ <CodeBlock language="py">{WikipediaCountryLinksExercise.code}</CodeBlock>
### Scrape links to F1 news
@@ -385,24 +370,7 @@ https://www.theguardian.com/sport/article/2024/sep/02/max-verstappen-damns-his-u
Solution
-
- ```py
- import httpx
- from bs4 import BeautifulSoup
- from urllib.parse import urljoin
-
- listing_url = "https://www.theguardian.com/sport/formulaone"
- response = httpx.get(listing_url)
- response.raise_for_status()
-
- html_code = response.text
- soup = BeautifulSoup(html_code, "html.parser")
-
- for item in soup.select("#maincontent ul li"):
- link = item.select_one("a")
- url = urljoin(listing_url, link["href"])
- print(url)
- ```
+ <CodeBlock language="py">{GuardianF1LinksExercise.code}</CodeBlock>
Note that some cards contain two links. One leads to the article, and one to the comments. If we selected all the links in the list by `#maincontent ul li a`, we would get incorrect output like this:
diff --git a/sources/academy/webscraping/scraping_basics_python/10_crawling.md b/sources/academy/webscraping/scraping_basics_python/10_crawling.md
index 836dadad3a..893683792b 100644
--- a/sources/academy/webscraping/scraping_basics_python/10_crawling.md
+++ b/sources/academy/webscraping/scraping_basics_python/10_crawling.md
@@ -5,7 +5,10 @@ description: Lesson about building a Python application for watching prices. Usi
slug: /scraping-basics-python/crawling
---
+import CodeBlock from '@theme/CodeBlock';
import Exercises from '../scraping_basics/_exercises.mdx';
+import WikipediaCallingCodesExercise from '!!raw-loader!roa-loader!./exercises/wikipedia_calling_codes.py';
+import GuardianF1AuthorsExercise from '!!raw-loader!roa-loader!./exercises/guardian_f1_authors.py';
**In this lesson, we'll follow links to individual product pages. We'll use HTTPX to download them and BeautifulSoup to process them.**
@@ -209,34 +212,7 @@ Locating cells in tables is sometimes easier if you know how to [navigate up](ht
Solution
-
- ```py
- import httpx
- from bs4 import BeautifulSoup
- from urllib.parse import urljoin
-
- def download(url):
- response = httpx.get(url)
- response.raise_for_status()
- return BeautifulSoup(response.text, "html.parser")
-
- def parse_calling_code(soup):
- for label in soup.select("th.infobox-label"):
- if label.text.strip() == "Calling code":
- data = label.parent.select_one("td.infobox-data")
- return data.text.strip()
- return None
-
- listing_url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa"
- listing_soup = download(listing_url)
- for name_cell in listing_soup.select(".wikitable tr td:nth-child(3)"):
- link = name_cell.select_one("a")
- country_url = urljoin(listing_url, link["href"])
- country_soup = download(country_url)
- calling_code = parse_calling_code(country_soup)
- print(country_url, calling_code)
- ```
-
+ <CodeBlock language="py">{WikipediaCallingCodesExercise.code}</CodeBlock>
### Scrape authors of F1 news articles
@@ -267,35 +243,5 @@ PA Media: Lewis Hamilton reveals lifelong battle with depression after school bu
Solution
-
- ```py
- import httpx
- from bs4 import BeautifulSoup
- from urllib.parse import urljoin
-
- def download(url):
- response = httpx.get(url)
- response.raise_for_status()
- return BeautifulSoup(response.text, "html.parser")
-
- def parse_author(article_soup):
- link = article_soup.select_one('a[rel="author"]')
- if link:
- return link.text.strip()
- address = article_soup.select_one('aside address')
- if address:
- return address.text.strip()
- return None
-
- listing_url = "https://www.theguardian.com/sport/formulaone"
- listing_soup = download(listing_url)
- for item in listing_soup.select("#maincontent ul li"):
- link = item.select_one("a")
- article_url = urljoin(listing_url, link["href"])
- article_soup = download(article_url)
- title = article_soup.select_one("h1").text.strip()
- author = parse_author(article_soup)
- print(f"{author}: {title}")
- ```
-
+ <CodeBlock language="py">{GuardianF1AuthorsExercise.code}</CodeBlock>
diff --git a/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md
index e47affbaec..e654ee34eb 100644
--- a/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md
+++ b/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md
@@ -5,7 +5,10 @@ description: Lesson about building a Python application for watching prices. Usi
slug: /scraping-basics-python/scraping-variants
---
+import CodeBlock from '@theme/CodeBlock';
import Exercises from '../scraping_basics/_exercises.mdx';
+import PythonJobsDatabaseExercise from '!!raw-loader!roa-loader!./exercises/python_jobs_database.py';
+import CnnSportsShortestArticleExercise from '!!raw-loader!roa-loader!./exercises/cnn_sports_shortest_article.py';
**In this lesson, we'll scrape the product detail pages to represent each product variant as a separate item in our dataset.**
@@ -342,34 +345,7 @@ You can find everything you need for working with dates and times in Python's [`
After inspecting the job board, you'll notice that job postings tagged as "Database" have a dedicated URL. We'll use that as our starting point, which saves us from having to scrape and check the tags manually.
- ```py
- from pprint import pp
- import httpx
- from bs4 import BeautifulSoup
- from urllib.parse import urljoin
- from datetime import datetime, date, timedelta
-
- today = date.today()
- jobs_url = "https://www.python.org/jobs/type/database/"
- response = httpx.get(jobs_url)
- response.raise_for_status()
- soup = BeautifulSoup(response.text, "html.parser")
-
- for job in soup.select(".list-recent-jobs li"):
- link = job.select_one(".listing-company-name a")
-
- time = job.select_one(".listing-posted time")
- posted_at = datetime.fromisoformat(time["datetime"])
- posted_on = posted_at.date()
- posted_ago = today - posted_on
-
- if posted_ago <= timedelta(days=60):
- title = link.text.strip()
- company = list(job.select_one(".listing-company-name").stripped_strings)[-1]
- url = urljoin(jobs_url, link["href"])
- pp({"title": title, "company": company, "url": url, "posted_on": posted_on})
- ```
-
+ <CodeBlock language="py">{PythonJobsDatabaseExercise.code}</CodeBlock>
### Find the shortest CNN article which made it to the Sports homepage
@@ -386,33 +362,5 @@ At the time of writing, the shortest article on the CNN Sports homepage is [abou
Solution
-
- ```py
- import httpx
- from bs4 import BeautifulSoup
- from urllib.parse import urljoin
-
- def download(url):
- response = httpx.get(url)
- response.raise_for_status()
- return BeautifulSoup(response.text, "html.parser")
-
- listing_url = "https://edition.cnn.com/sport"
- listing_soup = download(listing_url)
-
- data = []
- for card in listing_soup.select(".layout__main .card"):
- link = card.select_one(".container__link")
- article_url = urljoin(listing_url, link["href"])
- article_soup = download(article_url)
- if content := article_soup.select_one(".article__content"):
- length = len(content.get_text())
- data.append((length, article_url))
-
- data.sort()
- shortest_item = data[0]
- item_url = shortest_item[1]
- print(item_url)
- ```
-
+ <CodeBlock language="py">{CnnSportsShortestArticleExercise.code}</CodeBlock>
diff --git a/sources/academy/webscraping/scraping_basics_python/12_framework.md b/sources/academy/webscraping/scraping_basics_python/12_framework.md
index 6f8861785d..174c4f7b93 100644
--- a/sources/academy/webscraping/scraping_basics_python/12_framework.md
+++ b/sources/academy/webscraping/scraping_basics_python/12_framework.md
@@ -5,7 +5,10 @@ description: Lesson about building a Python application for watching prices. Usi
slug: /scraping-basics-python/framework
---
+import CodeBlock from '@theme/CodeBlock';
import Exercises from '../scraping_basics/_exercises.mdx';
+import CrawleeF1DriversExercise from '!!raw-loader!roa-loader!./exercises/crawlee_f1_drivers.py';
+import CrawleeNetflixRatingsExercise from '!!raw-loader!roa-loader!./exercises/crawlee_netflix_ratings.py';
**In this lesson, we'll rework our application for watching prices so that it builds on top of a scraping framework. We'll use Crawlee to make the program simpler, faster, and more robust.**
@@ -462,50 +465,7 @@ If you export the dataset as JSON, it should look something like this:
Solution
-
- ```py
- import asyncio
- from datetime import datetime
-
- from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
-
- async def main():
- crawler = BeautifulSoupCrawler()
-
- @crawler.router.default_handler
- async def handle_listing(context: BeautifulSoupCrawlingContext):
- await context.enqueue_links(selector=".teams-driver-item a", label="DRIVER")
-
- @crawler.router.handler("DRIVER")
- async def handle_driver(context: BeautifulSoupCrawlingContext):
- info = {}
- for row in context.soup.select(".common-driver-info li"):
- name = row.select_one("span").text.strip()
- value = row.select_one("h4").text.strip()
- info[name] = value
-
- detail = {}
- for row in context.soup.select(".driver-detail--cta-group a"):
- name = row.select_one("p").text.strip()
- value = row.select_one("h2").text.strip()
- detail[name] = value
-
- await context.push_data({
- "url": context.request.url,
- "name": context.soup.select_one("h1").text.strip(),
- "team": detail["Team"],
- "nationality": info["Nationality"],
- "dob": datetime.strptime(info["DOB"], "%d/%m/%Y").date(),
- "instagram_url": context.soup.select_one(".common-social-share a[href*='instagram']").get("href"),
- })
-
- await crawler.run(["https://www.f1academy.com/Racing-Series/Drivers"])
- await crawler.export_data_json(path='dataset.json', ensure_ascii=False, indent=2)
-
- if __name__ == '__main__':
- asyncio.run(main())
- ```
-
+ <CodeBlock language="py">{CrawleeF1DriversExercise.code}</CodeBlock>
### Use Crawlee to find the ratings of the most popular Netflix films
@@ -563,45 +523,5 @@ When navigating to the first IMDb search result, you might find it helpful to kn
Solution
-
- ```py
- import asyncio
- from urllib.parse import quote_plus
-
- from crawlee import Request
- from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
-
- async def main():
- crawler = BeautifulSoupCrawler()
-
- @crawler.router.default_handler
- async def handle_netflix_table(context: BeautifulSoupCrawlingContext):
- requests = []
- for name_cell in context.soup.select('[data-uia="top10-table-row-title"] button'):
- name = name_cell.text.strip()
- imdb_search_url = f"https://www.imdb.com/find/?q={quote_plus(name)}&s=tt&ttype=ft"
- requests.append(Request.from_url(imdb_search_url, label="IMDB_SEARCH"))
- await context.add_requests(requests)
-
- @crawler.router.handler("IMDB_SEARCH")
- async def handle_imdb_search(context: BeautifulSoupCrawlingContext):
- await context.enqueue_links(selector=".find-result-item a", label="IMDB", limit=1)
-
- @crawler.router.handler("IMDB")
- async def handle_imdb(context: BeautifulSoupCrawlingContext):
- rating_selector = "[data-testid='hero-rating-bar__aggregate-rating__score']"
- rating_text = context.soup.select_one(rating_selector).text.strip()
- await context.push_data({
- "url": context.request.url,
- "title": context.soup.select_one("h1").text.strip(),
- "rating": rating_text,
- })
-
- await crawler.run(["https://www.netflix.com/tudum/top10"])
- await crawler.export_data_json(path='dataset.json', ensure_ascii=False, indent=2)
-
- if __name__ == '__main__':
- asyncio.run(main())
- ```
-
+ <CodeBlock language="py">{CrawleeNetflixRatingsExercise.code}</CodeBlock>
diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/cnn_sports_shortest_article.py b/sources/academy/webscraping/scraping_basics_python/exercises/cnn_sports_shortest_article.py
new file mode 100644
index 0000000000..bf8c03f07b
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_python/exercises/cnn_sports_shortest_article.py
@@ -0,0 +1,32 @@
+import httpx
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+
+
+def download(url: str) -> BeautifulSoup:
+ response = httpx.get(url)
+ response.raise_for_status()
+ return BeautifulSoup(response.text, "html.parser")
+
+
+listing_url = "https://edition.cnn.com/sport"
+listing_soup = download(listing_url)
+
+results: list[tuple[int, str]] = []
+for card in listing_soup.select('.layout__main .card'):
+ link = card.select_one('.container__link')
+ if not link or 'href' not in link.attrs:
+ continue
+
+ article_url = urljoin(listing_url, link['href'])
+ article_soup = download(article_url)
+ content = article_soup.select_one('.article__content')
+
+ if not content:
+ continue
+
+ results.append((len(content.get_text()), article_url))
+
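+# Tuples sort by their first element, so the shortest article ends up first.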
+results.sort()
+if results:
+ print(results[0][1])
diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_f1_drivers.py b/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_f1_drivers.py
new file mode 100644
index 0000000000..2f7cef895c
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_f1_drivers.py
@@ -0,0 +1,54 @@
+import asyncio
+from datetime import datetime
+
+from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
+
+
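+# Parse a DD/MM/YYYY date string, returning None if the value doesn't match that format.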
+def format_dob(value: str) -> str | None:
+ try:
+ return datetime.strptime(value, "%d/%m/%Y").date().isoformat()
+ except ValueError:
+ return None
+
+
+async def main() -> None:
+ crawler = BeautifulSoupCrawler()
+
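+ # The listing page handler enqueues a request for each driver's detail page.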
+ @crawler.router.default_handler
+ async def handle_listing(context: BeautifulSoupCrawlingContext) -> None:
+ await context.enqueue_links(selector=".teams-driver-item a", label="DRIVER")
+
+ @crawler.router.handler("DRIVER")
+ async def handle_driver(context: BeautifulSoupCrawlingContext) -> None:
+ info: dict[str, str] = {}
+ for row in context.soup.select(".common-driver-info li"):
+ name = row.select_one("span").text.strip()
+ value = row.select_one("h4").text.strip()
+ info[name] = value
+
+ detail: dict[str, str] = {}
+ for row in context.soup.select(".driver-detail--cta-group a"):
+ name = row.select_one("p").text.strip()
+ value = row.select_one("h2").text.strip()
+ detail[name] = value
+
+ title_tag = context.soup.select_one("h1")
+ instagram_link = context.soup.select_one(".common-social-share a[href*='instagram']")
+
+ await context.push_data(
+ {
+ "url": context.request.url,
+ "name": title_tag.text.strip() if title_tag else None,
+ "team": detail.get("Team"),
+ "nationality": info.get("Nationality"),
+ "dob": format_dob(info.get("DOB", "")),
+ "instagram_url": instagram_link.get("href") if instagram_link else None,
+ }
+ )
+
+ await crawler.run(["https://www.f1academy.com/Racing-Series/Drivers"])
+ await crawler.export_data("dataset.json")
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py b/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py
new file mode 100644
index 0000000000..b7f2000b37
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py
@@ -0,0 +1,46 @@
+import asyncio
+from urllib.parse import quote_plus
+
+from crawlee import Request
+from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
+
+
+async def main() -> None:
+ crawler = BeautifulSoupCrawler()
+
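+ # The default handler reads the Netflix Top 10 table and queues an IMDb title search for each film name.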
+ @crawler.router.default_handler
+ async def handle_netflix_table(context: BeautifulSoupCrawlingContext) -> None:
+ requests: list[Request] = []
+ for name_cell in context.soup.select('[data-uia="top10-table-row-title"] button'):
+ name = name_cell.text.strip()
+ imdb_search_url = (
+ f"https://www.imdb.com/find/?q={quote_plus(name)}&s=tt&ttype=ft"
+ )
+ requests.append(Request.from_url(imdb_search_url, label="IMDB_SEARCH"))
+ await context.add_requests(requests)
+
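+ # Follow only the first search result to its IMDb detail page.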
+ @crawler.router.handler("IMDB_SEARCH")
+ async def handle_imdb_search(context: BeautifulSoupCrawlingContext) -> None:
+ await context.enqueue_links(selector=".find-result-item a", label="IMDB", limit=1)
+
+ @crawler.router.handler("IMDB")
+ async def handle_imdb(context: BeautifulSoupCrawlingContext) -> None:
+ rating_element = context.soup.select_one(
+ "[data-testid='hero-rating-bar__aggregate-rating__score']"
+ )
+ title_element = context.soup.select_one("h1")
+ if rating_element and title_element:
+ await context.push_data(
+ {
+ "url": context.request.url,
+ "title": title_element.text.strip(),
+ "rating": rating_element.text.strip(),
+ }
+ )
+
+ await crawler.run(["https://www.netflix.com/tudum/top10"])
+ await crawler.export_data("dataset.json")
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/f1academy_drivers.py b/sources/academy/webscraping/scraping_basics_python/exercises/f1academy_drivers.py
new file mode 100644
index 0000000000..ec11e4ab82
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_python/exercises/f1academy_drivers.py
@@ -0,0 +1,9 @@
+import httpx
+from bs4 import BeautifulSoup
+
+url = "https://www.f1academy.com/Racing-Series/Teams"
+response = httpx.get(url)
+response.raise_for_status()
+
+soup = BeautifulSoup(response.text, "html.parser")
+print(len(soup.select('.driver')))
diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/f1academy_teams.py b/sources/academy/webscraping/scraping_basics_python/exercises/f1academy_teams.py
new file mode 100644
index 0000000000..d9b53941ac
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_python/exercises/f1academy_teams.py
@@ -0,0 +1,9 @@
+import httpx
+from bs4 import BeautifulSoup
+
+url = "https://www.f1academy.com/Racing-Series/Teams"
+response = httpx.get(url)
+response.raise_for_status()
+
+soup = BeautifulSoup(response.text, "html.parser")
+print(len(soup.select('.teams-driver-item')))
diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/guardian_f1_authors.py b/sources/academy/webscraping/scraping_basics_python/exercises/guardian_f1_authors.py
new file mode 100644
index 0000000000..4f1bc2664a
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_python/exercises/guardian_f1_authors.py
@@ -0,0 +1,36 @@
+import httpx
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+
+
+def download(url: str) -> BeautifulSoup:
+ response = httpx.get(url)
+ response.raise_for_status()
+ return BeautifulSoup(response.text, "html.parser")
+
+
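+# Prefer the byline link (rel="author"); fall back to the <address> element, or None when neither is present.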
+def parse_author(article_soup: BeautifulSoup) -> str | None:
+ link = article_soup.select_one('a[rel="author"]')
+ if link:
+ return link.text.strip()
+ address = article_soup.select_one('aside address')
+ if address:
+ return address.text.strip()
+ return None
+
+
+listing_url = "https://www.theguardian.com/sport/formulaone"
+listing_soup = download(listing_url)
+
+for item in listing_soup.select('#maincontent ul li'):
+ link = item.select_one('a')
+ if not link or 'href' not in link.attrs:
+ continue
+
+ article_url = urljoin(listing_url, link['href'])
+ article_soup = download(article_url)
+
+ title = article_soup.select_one('h1').text.strip()
+ author = parse_author(article_soup)
+
+ print(f"{author}: {title}")
diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/guardian_f1_links.py b/sources/academy/webscraping/scraping_basics_python/exercises/guardian_f1_links.py
new file mode 100644
index 0000000000..71cec59bd7
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_python/exercises/guardian_f1_links.py
@@ -0,0 +1,15 @@
+import httpx
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+
+listing_url = "https://www.theguardian.com/sport/formulaone"
+response = httpx.get(listing_url)
+response.raise_for_status()
+
+soup = BeautifulSoup(response.text, "html.parser")
+
+for item in soup.select('#maincontent ul li'):
+ link = item.select_one('a')
+ if link and 'href' in link.attrs:
+ url = urljoin(listing_url, link['href'])
+ print(url)
diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/guardian_f1_titles.py b/sources/academy/webscraping/scraping_basics_python/exercises/guardian_f1_titles.py
new file mode 100644
index 0000000000..b90d0f48c0
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_python/exercises/guardian_f1_titles.py
@@ -0,0 +1,11 @@
+import httpx
+from bs4 import BeautifulSoup
+
+url = "https://www.theguardian.com/sport/formulaone"
+response = httpx.get(url)
+response.raise_for_status()
+
+soup = BeautifulSoup(response.text, "html.parser")
+
+for title in soup.select("#maincontent ul li h3"):
+ print(title.text)
diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/guardian_publish_dates.py b/sources/academy/webscraping/scraping_basics_python/exercises/guardian_publish_dates.py
new file mode 100644
index 0000000000..b60c291b52
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_python/exercises/guardian_publish_dates.py
@@ -0,0 +1,23 @@
+from datetime import datetime
+
+import httpx
+from bs4 import BeautifulSoup
+
+url = "https://www.theguardian.com/sport/formulaone"
+response = httpx.get(url)
+response.raise_for_status()
+
+soup = BeautifulSoup(response.text, "html.parser")
+
+for article in soup.select('#maincontent ul li'):
+ title_tag = article.select_one('h3')
+ time_tag = article.select_one('time')
+
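+ # Skip list items that lack a headline or a machine-readable publish date.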
+ if not title_tag or not time_tag or 'datetime' not in time_tag.attrs:
+ continue
+
+ title = title_tag.text.strip()
+ date_iso = time_tag['datetime'].strip()
+ date = datetime.fromisoformat(date_iso)
+
+ print(f"{title} | {date.strftime('%a %b %d %Y')}")
diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/lego.py b/sources/academy/webscraping/scraping_basics_python/exercises/lego.py
new file mode 100644
index 0000000000..09816a53d6
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_python/exercises/lego.py
@@ -0,0 +1,6 @@
+import httpx
+
+url = "https://www.lego.com/en-us/themes/star-wars"
+response = httpx.get(url)
+response.raise_for_status()
+print(response.text)
diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/process_products_json.py b/sources/academy/webscraping/scraping_basics_python/exercises/process_products_json.py
new file mode 100644
index 0000000000..934d347b8a
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_python/exercises/process_products_json.py
@@ -0,0 +1,9 @@
+import json
+from pprint import pp
+
+with open('products.json', 'r', encoding='utf-8') as file:
+ products = json.load(file)
+
+for product in products:
+ if int(product['min_price']) > 500:
+ pp(product)
diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/python_jobs_database.py b/sources/academy/webscraping/scraping_basics_python/exercises/python_jobs_database.py
new file mode 100644
index 0000000000..9659ace119
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_python/exercises/python_jobs_database.py
@@ -0,0 +1,32 @@
+from datetime import datetime, date, timedelta
+from pprint import pp
+
+import httpx
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+
+
+today = date.today()
+jobs_url = "https://www.python.org/jobs/type/database/"
+response = httpx.get(jobs_url)
+response.raise_for_status()
+
+soup = BeautifulSoup(response.text, "html.parser")
+
+for job in soup.select('.list-recent-jobs li'):
+ link = job.select_one('.listing-company-name a')
+ if not link:
+ continue
+
+ time_tag = job.select_one('.listing-posted time')
+ if not time_tag or 'datetime' not in time_tag.attrs:
+ continue
+
+ posted_at = datetime.fromisoformat(time_tag['datetime'])
+ posted_on = posted_at.date()
+
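+ # Keep only jobs posted within the last 60 days.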
+ if today - posted_on <= timedelta(days=60):
+ title = link.text.strip()
+ company = list(job.select_one('.listing-company-name').stripped_strings)[-1]
+ url = urljoin(jobs_url, link['href'])
+ pp({"title": title, "company": company, "url": url, "posted_on": posted_on})
diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats
new file mode 100644
index 0000000000..2de3db35f3
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats
@@ -0,0 +1,145 @@
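+# Solutions are executed from the exercises directory; files they create (products.json, dataset.json, storage/) are removed after each test.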
+setup_file() {
+ cd "$BATS_TEST_DIRNAME"
+}
+
+teardown() {
+ rm -rf products.json storage dataset.json
+}
+
+@test "outputs the HTML with Star Wars products" {
+ run uv run --with=httpx python lego.py
+
+ [[ "$output" == *"Millennium Falcon"* ]]
+}
+
+@test "counts the number of F1 Academy teams" {
+ run uv run --with=httpx --with=beautifulsoup4 python f1academy_teams.py
+
+ [[ "$output" == "6" ]]
+}
+
+@test "counts the number of F1 Academy drivers" {
+ run uv run --with=httpx --with=beautifulsoup4 python f1academy_drivers.py
+
+ [[ "$output" == "18" ]]
+}
+
+@test "lists African countries" {
+ run uv run --with=httpx --with=beautifulsoup4 python wikipedia_countries.py
+
+ [[ "$output" == *$'Comoros\nDemocratic Republic of the Congo\n'* ]]
+ [[ $(echo "$output" | wc -l) -gt 5 ]]
+}
+
+@test "lists African countries with a single selector" {
+ run uv run --with=httpx --with=beautifulsoup4 python wikipedia_countries_single_selector.py
+
+ [[ "$output" == *$'Comoros\nDemocratic Republic of the Congo\n'* ]]
+ [[ $(echo "$output" | wc -l) -gt 5 ]]
+}
+
+@test "lists Guardian F1 article titles" {
+ run uv run --with=httpx --with=beautifulsoup4 python guardian_f1_titles.py
+
+ [[ "$output" == *' F1 '* ]]
+ [[ $(echo "$output" | wc -l) -gt 5 ]]
+}
+
+@test "prints warehouse stock counts" {
+ run uv run --with=httpx --with=beautifulsoup4 python warehouse_units.py
+
+ [[ "$output" == *$'JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672\n'* ]]
+ [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 77\n'* ]]
+ [[ $(echo "$output" | wc -l) -gt 5 ]]
+}
+
+@test "prints warehouse stock counts using regex" {
+ run uv run --with=httpx --with=beautifulsoup4 python warehouse_units_regex.py
+
+ [[ "$output" == *$'JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672\n'* ]]
+ [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 77\n'* ]]
+ [[ $(echo "$output" | wc -l) -gt 5 ]]
+}
+
+@test "prints Guardian F1 titles with publish dates" {
+ run uv run --with=httpx --with=beautifulsoup4 python guardian_publish_dates.py
+
+ [[ "$output" == *' F1 '* ]]
+ [[ "$output" == *' | Sun '* ]] # has info about date, Sundays are very likely
+ [[ $(echo "$output" | wc -l) -gt 5 ]]
+}
+
+@test "filters products from JSON" {
+ echo '[{"title":"Premium Speakers","min_price":75000,"price":75000},{"title":"Budget Headphones","min_price":25000,"price":25000}]' > products.json
+
+ run uv run python process_products_json.py
+
+ [[ "$output" == "{'title': 'Premium Speakers', 'min_price': 75000, 'price': 75000}" ]]
+}
+
+@test "lists Wikipedia country links" {
+ run uv run --with=httpx --with=beautifulsoup4 python wikipedia_country_links.py
+
+ [[ "$output" == *$'https://en.wikipedia.org/wiki/Algeria\nhttps://en.wikipedia.org/wiki/Angola\n'* ]]
+ [[ "$output" == *$'https://en.wikipedia.org/wiki/R%C3%A9union\n'* ]]
+ [[ $(echo "$output" | wc -l) -gt 5 ]]
+}
+
+@test "lists Guardian F1 article links" {
+ run uv run --with=httpx --with=beautifulsoup4 python guardian_f1_links.py
+
+ [[ "$output" == *'https://www.theguardian.com/sport/'* ]]
+ [[ $(echo "$output" | wc -l) -gt 5 ]]
+}
+
+@test "prints Wikipedia calling codes" {
+ run uv run --with=httpx --with=beautifulsoup4 python wikipedia_calling_codes.py
+
+ [[ "$output" == *$'https://en.wikipedia.org/wiki/Comoros +269\n'* ]]
+ [[ "$output" == *$'https://en.wikipedia.org/wiki/Sahrawi_Arab_Democratic_Republic null\n'* ]]
+ [[ $(echo "$output" | wc -l) -gt 5 ]]
+}
+
+@test "lists Guardian F1 authors" {
+ run uv run --with=httpx --with=beautifulsoup4 python guardian_f1_authors.py
+
+ [[ "$output" == *' F1 '* ]]
+ [[ "$output" == *'Giles Richards: '* ]] # writes most of them (we'll have to change this if they fire him)
+ [[ "$output" == *'Guardian sport: '* || "$output" == *'PM Media: '* ]]
+ [[ $(echo "$output" | wc -l) -gt 5 ]]
+}
+
+@test "lists Python database jobs" {
+ run uv run --with=httpx --with=beautifulsoup4 python python_jobs_database.py
+
+ [[ "$output" == *"'title': '"* ]]
+ [[ "$output" == *"'company': '"* ]]
+ [[ "$output" == *"'url': 'https://www.python.org/jobs/"* ]]
+ [[ "$output" == *"'posted_on': datetime.date("* ]]
+}
+
+@test "finds the shortest CNN sports article" {
+ run uv run --with=httpx --with=beautifulsoup4 python cnn_sports_shortest_article.py
+
+ [[ "$output" == 'https://edition.cnn.com/'* ]]
+}
+
+@test "scrapes F1 Academy driver details with Crawlee" {
+ run uv run --with='crawlee[beautifulsoup]' python crawlee_f1_drivers.py
+
+ (( status == 0 ))
+ [[ -f dataset.json ]]
+ [[ $(cat dataset.json | jq '. | length') == "18" ]]
+ [[ $(cat dataset.json | jq -c '.[0] | keys') == '["dob","instagram_url","name","nationality","team","url"]' ]]
+ [[ $(cat dataset.json | jq '.[].url') == *"https://www.f1academy.com/Racing-Series/Drivers/"* ]]
+}
+
+@test "scrapes Netflix ratings with Crawlee" {
+ run uv run --with='crawlee[beautifulsoup]' python crawlee_netflix_ratings.py
+
+ (( status == 0 ))
+ [[ -f dataset.json ]]
+ [[ $(cat dataset.json | jq '. | length') == "10" ]]
+ [[ $(cat dataset.json | jq -c '.[0] | keys') == '["rating","title","url"]' ]]
+ [[ $(cat dataset.json | jq '.[].url') == *"https://www.imdb.com/title/"* ]]
+}
diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/warehouse_units.py b/sources/academy/webscraping/scraping_basics_python/exercises/warehouse_units.py
new file mode 100644
index 0000000000..23ce38b17e
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_python/exercises/warehouse_units.py
@@ -0,0 +1,23 @@
+import httpx
+from bs4 import BeautifulSoup
+
+url = "https://warehouse-theme-metal.myshopify.com/collections/sales"
+response = httpx.get(url)
+response.raise_for_status()
+
+soup = BeautifulSoup(response.text, "html.parser")
+
+for product in soup.select('.product-item'):
+ title = product.select_one('.product-item__title').text.strip()
+
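+ # Strip the "In stock," / "Only" prefix and the " left" / "units" suffix to isolate the number.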
+ units_text = (
+ product.select_one('.product-item__inventory').text
+ .removeprefix('In stock,')
+ .removeprefix('Only')
+ .removesuffix(' left')
+ .removesuffix('units')
+ .strip()
+ )
+ units = 0 if 'Sold out' in units_text else int(units_text)
+
+ print(f"{title} | {units}")
diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/warehouse_units_regex.py b/sources/academy/webscraping/scraping_basics_python/exercises/warehouse_units_regex.py
new file mode 100644
index 0000000000..7aba32a604
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_python/exercises/warehouse_units_regex.py
@@ -0,0 +1,19 @@
+import re
+
+import httpx
+from bs4 import BeautifulSoup
+
+url = "https://warehouse-theme-metal.myshopify.com/collections/sales"
+response = httpx.get(url)
+response.raise_for_status()
+
+soup = BeautifulSoup(response.text, "html.parser")
+
+for product in soup.select('.product-item'):
+ title = product.select_one('.product-item__title').text.strip()
+
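+ # Take the first number in the inventory text; default to 0 when there is none (e.g. "Sold out").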
+ units_text = product.select_one('.product-item__inventory').text
+ match = re.search(r"\d+", units_text)
+ units = int(match.group()) if match else 0
+
+ print(f"{title} | {units}")
diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_calling_codes.py b/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_calling_codes.py
new file mode 100644
index 0000000000..4d424d6dc1
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_calling_codes.py
@@ -0,0 +1,32 @@
+import httpx
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+
+
+def download(url: str) -> BeautifulSoup:
+ response = httpx.get(url)
+ response.raise_for_status()
+ return BeautifulSoup(response.text, "html.parser")
+
+
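+# Search the infobox labels for "Calling code" and return the text of the matching data cell, or None.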
+def parse_calling_code(soup: BeautifulSoup) -> str | None:
+ for label in soup.select('th.infobox-label'):
+ if label.text.strip() == 'Calling code':
+ cell = label.parent.select_one('td.infobox-data')
+ return cell.text.strip() if cell else None
+ return None
+
+
+listing_url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa"
+listing_soup = download(listing_url)
+
+for name_cell in listing_soup.select('.wikitable tr td:nth-child(3)'):
+ link = name_cell.select_one('a')
+ if not link or 'href' not in link.attrs:
+ continue
+
+ country_url = urljoin(listing_url, link['href'])
+ country_soup = download(country_url)
+ calling_code = parse_calling_code(country_soup)
+
+ print(country_url, calling_code)
diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries.py b/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries.py
new file mode 100644
index 0000000000..0d4769ccbb
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries.py
@@ -0,0 +1,17 @@
+import httpx
+from bs4 import BeautifulSoup
+
+url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa"
+response = httpx.get(url)
+response.raise_for_status()
+
+soup = BeautifulSoup(response.text, "html.parser")
+
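+# The third cell of each table row holds the linked country name; header-only rows have no <td> cells and are skipped.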
+for table in soup.select(".wikitable"):
+ for row in table.select("tr"):
+ cells = row.select("td")
+ if cells:
+ third_column = cells[2]
+ link = third_column.select_one("a")
+ if link:
+ print(link.text)
diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries_single_selector.py b/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries_single_selector.py
new file mode 100644
index 0000000000..1fc4a6b268
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_countries_single_selector.py
@@ -0,0 +1,13 @@
+import httpx
+from bs4 import BeautifulSoup
+
+url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa"
+response = httpx.get(url)
+response.raise_for_status()
+
+soup = BeautifulSoup(response.text, "html.parser")
+
+for name_cell in soup.select(".wikitable tr td:nth-child(3)"):
+ link = name_cell.select_one("a")
+ if link:
+ print(link.text)
diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_country_links.py b/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_country_links.py
new file mode 100644
index 0000000000..f435016e45
--- /dev/null
+++ b/sources/academy/webscraping/scraping_basics_python/exercises/wikipedia_country_links.py
@@ -0,0 +1,15 @@
+import httpx
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+
+listing_url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa"
+response = httpx.get(listing_url)
+response.raise_for_status()
+
+soup = BeautifulSoup(response.text, "html.parser")
+
+for name_cell in soup.select('.wikitable tr td:nth-child(3)'):
+ link = name_cell.select_one('a')
+ if link and 'href' in link.attrs:
+ url = urljoin(listing_url, link['href'])
+ print(url)