diff --git a/repomix-output.xml b/repomix-output.xml new file mode 100644 index 0000000000..04cfcaa9ab --- /dev/null +++ b/repomix-output.xml @@ -0,0 +1,28987 @@ +This file is a merged representation of the entire codebase, combined into a single document by Repomix. + + +This section contains a summary of this file. + + +This file contains a packed representation of the entire repository's contents. +It is designed to be easily consumable by AI systems for analysis, code review, +or other automated processes. + + + +The content is organized as follows: +1. This summary section +2. Repository information +3. Directory structure +4. Repository files, each consisting of: + - File path as an attribute + - Full contents of the file + + + +- This file should be treated as read-only. Any changes should be made to the + original repository files, not this packed version. +- When processing this file, use the file path to distinguish + between different files in the repository. +- Be aware that this file may contain sensitive information. Handle it with + the same level of security as you would the original repository. + + + +- Some files may have been excluded based on .gitignore rules and Repomix's configuration +- Binary files are not included in this packed representation. Please refer to the Repository Structure section for a complete list of file paths, including binary files +- Files matching patterns in .gitignore are excluded +- Files matching default ignore patterns are excluded +- Files are sorted by Git change count (files with more changes are at the bottom) + + + + + + + + + +glossary/ + concepts/ + css_selectors.md + dynamic_pages.md + html_elements.md + http_cookies.md + http_headers.md + index.md + querying_css_selectors.md + robot_process_automation.md + tools/ + apify_cli.md + edit_this_cookie.md + index.md + insomnia.md + modheader.md + postman.md + proxyman.md + quick_javascript_switcher.md + switchyomega.md + user_agent_switcher.md + glossary.md +platform/ + deploying_your_code/ + deploying.md + docker_file.md + index.md + input_schema.md + inputs_outputs.md + output_schema.md + expert_scraping_with_apify/ + solutions/ + handling_migrations.md + index.md + integrating_webhooks.md + managing_source.md + rotating_proxies.md + saving_stats.md + using_api_and_client.md + using_storage_creating_tasks.md + actors_webhooks.md + apify_api_and_client.md + bypassing_anti_scraping.md + index.md + managing_source_code.md + migrations_maintaining_state.md + saving_useful_stats.md + tasks_and_storage.md + get_most_of_actors/ + actor_basics/ + _category_.yaml + actor_description_seo_description.md + actors-and-emojis.md + how-to-create-actor-readme.md + importance-of-actor-url.md + name-your-actor.md + interact_with_users/ + _category_.yaml + emails_to_actor_users.md + issues_tab.md + your_store_bio.md + product_optimization/ + _category_.yaml + actor_bundles.md + how_to_create_a_great_input_schema.md + promote_your_actor/ + _category_.yaml + blogs_and_blog_resources.md + parasite_seo.md + product_hunt.md + seo.md + social_media.md + video_tutorials.md + webinars.md + store_basics/ + _category_.yaml + actor_success_stories.md + how_actor_monetization_works.md + how_store_works.md + how_to_build_actors.md + ideas_page_and_its_use.md + index.md + monetizing_your_actor.md + getting_started/ + actors.md + apify_api.md + apify_client.md + creating_actors.md + index.md + inputs_outputs.md + apify_platform.md + running_a_web_server.md +tutorials/ + api/ + index.md + retry_failed_requests.md + 
run_actor_and_retrieve_data_via_api.md + apify_scrapers/ + cheerio_scraper.md + getting_started.md + index.md + puppeteer_scraper.md + web_scraper.md + node_js/ + add_external_libraries_web_scraper.md + analyzing_pages_and_fixing_errors.md + apify_free_google_serp_api.md + avoid_eacces_error_in_actor_builds.md + block_requests_puppeteer.md + caching_responses_in_puppeteer.js + caching_responses_in_puppeteer.md + choosing_the_right_scraper.md + dealing_with_dynamic_pages.js + dealing_with_dynamic_pages.md + debugging_web_scraper.md + filter_blocked_requests_using_sessions.md + handle_blocked_requests_puppeteer.md + how_to_fix_target_closed.md + how_to_save_screenshots_puppeteer.md + index.md + js_in_html.md + multiple-runs-scrape.md + optimizing_scrapers.md + processing_multiple_pages_web_scraper.md + request_labels_in_apify_actors.md + scraping_from_sitemaps.js + scraping_from_sitemaps.md + scraping_shadow_doms.md + scraping_urls_list_from_google_sheets.md + submitting_form_with_file_attachment.md + submitting_forms_on_aspx_pages.md + using_proxy_to_intercept_requests_puppeteer.md + waiting_for_dynamic_content.md + when_to_use_puppeteer_scraper.md + php/ + index.md + using_apify_from_php.md + python/ + index.md + process_data_using_python.md + scrape_data_python.md + tutorials/ + index.md +webscraping/ + advanced_web_scraping/ + crawling/ + crawling-sitemaps.md + crawling-with-search.md + sitemaps-vs-search.md + index.md + tips_and_tricks_robustness.md + anti_scraping/ + mitigation/ + cloudflare_challenge.md + generating_fingerprints.md + index.md + using_proxies.md + techniques/ + browser_challenges.md + captchas.md + fingerprinting.md + firewalls.md + geolocation.md + index.md + rate_limiting.md + index.md + api_scraping/ + general_api_scraping/ + cookies_headers_tokens.md + handling_pagination.md + index.md + locating_and_learning.md + graphql_scraping/ + custom_queries.md + index.md + introspection.md + modifying_variables.md + index.md + puppeteer_playwright/ + common_use_cases/ + downloading_files.md + index.md + logging_into_a_website.md + paginating_through_results.md + scraping_iframes.md + submitting_a_form_with_a_file_attachment.md + executing_scripts/ + extracting_data.md + index.md + injecting_code.md + page/ + index.md + interacting_with_a_page.md + page_methods.md + waiting.md + browser_contexts.md + browser.md + index.md + proxies.md + reading_intercepting_requests.md + scraping_basics_javascript/ + challenge/ + index.md + initializing_and_setting_up.md + modularity.md + scraping_amazon.md + crawling/ + exporting_data.md + filtering_links.md + finding_links.js + finding_links.md + first_crawl.md + headless_browser.md + index.md + pro_scraping.md + recap_extraction_basics.md + relative_urls.md + scraping_the_data.md + data_extraction/ + browser_devtools.md + computer_preparation.md + devtools_continued.md + index.md + node_continued.md + node_js_scraper.md + project_setup.md + save_to_csv.md + using_devtools.md + best_practices.md + index.md + introduction.md + scraping_basics_python/ + _exercises.mdx + 01_devtools_inspecting.md + 02_devtools_locating_elements.md + 03_devtools_extracting_data.md + 04_downloading_html.md + 05_parsing_html.md + 06_locating_elements.md + 07_extracting_data.md + 08_saving_data.md + 09_getting_links.md + 10_crawling.md + 11_scraping_variants.md + 12_framework.md + 13_platform.md + index.md + typescript/ + enums.md + index.md + installation.md + interfaces.md + mini_project.md + type_aliases.md + unknown_and_type_assertions.md + 
using_types_continued.md + using_types.md + watch_mode_and_tsconfig.md +homepage_content.json +index.mdx +sidebars.js + + + +This section contains the contents of the repository's files. + + +--- +title: CSS selectors +description: Learn about CSS selectors. What they are, their types, why they are important for web scraping and how to use them in browser Console with JavaScript. +sidebar_position: 8.4 +slug: /concepts/css-selectors +--- + +CSS selectors are patterns used to select [HTML elements](./html_elements.md) on a web page. They are used in combination with CSS styles to change the appearance of web pages, and also in JavaScript to access and manipulate the elements on a web page. + +> Querying of CSS selectors with JavaScript is done using [query selector functions](./querying_css_selectors.md). + +## Common types of CSS selectors + +Some of the most common types of CSS selectors are: + +### Element selector + +This is used to select elements by their tag name. For example, to select all `
<p>` elements, you would use the `p` selector.

```js
const paragraphs = document.querySelectorAll('p');
```

### Class selector

This is used to select elements by their class attribute. For example, to select all elements with the class of `highlight`, you would use the `.highlight` selector.

```js
const highlightedElements = document.querySelectorAll('.highlight');
```

### ID selector

This is used to select an element by its `id` attribute. For example, to select an element with the id of `header`, you would use the `#header` selector.

```js
const header = document.querySelector('#header');
```

### Attribute selector

This is used to select elements based on the value of an attribute. For example, to select all elements with the attribute `data-custom` whose value is `yes`, you would use the `[data-custom="yes"]` selector.

```js
const customElements = document.querySelectorAll('[data-custom="yes"]');
```

### Chaining selectors

You can also chain multiple selectors together to select elements more precisely. For example, to select a `
<p>` element that also has the class `highlight`, you would use the `p.highlight` selector.

```js
const highlightedParagraph = document.querySelector('p.highlight');
```

## CSS selectors in web scraping

CSS selectors are important for web scraping because they allow you to target specific elements on a web page and extract their data. When scraping a web page, you typically want to extract specific pieces of information from the page, such as text, images, or links. CSS selectors allow you to locate these elements on the page, so you can extract the data that you need.

For example, if you wanted to scrape a list of all the titles of blog posts on a website, you could use a CSS selector to select all the elements that contain the title text. Once you have selected these elements, you can extract the text from them and use it for your scraping project.

Additionally, when web scraping, it is important to understand the structure of the website, and CSS selectors can help you navigate it. With them, you can select specific elements and their children, siblings, or parent elements. This allows you to extract data that is nested within other elements, or to navigate through the page structure to find the data you need.

## Resources

- Find all the available CSS selectors and their syntax on the [MDN CSS Selectors page](https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_Selectors).


---
title: Dynamic pages
description: Understand what makes a page dynamic, and how a page being dynamic might change your approach when writing a scraper for it.
sidebar_position: 8.3
slug: /concepts/dynamic-pages
---

# Dynamic pages and single-page applications (SPAs) {#dynamic-pages}

**Understand what makes a page dynamic, and how a page being dynamic might change your approach when writing a scraper for it.**

---

Oftentimes, web pages load additional information dynamically, long after their main body is loaded in the browser. A subset of dynamic pages takes this approach further and loads all of its content dynamically. This style of building websites is called a single-page application (SPA), and it's widespread thanks to popular JavaScript libraries such as [React](https://react.dev/) or [Vue](https://vuejs.org/).

As you progress in your scraping journey, you'll quickly realize that different websites load their content and populate their pages with data in different ways. Some pages are rendered entirely on the server, some retrieve the data dynamically, and some use a combination of both those methods.

## How page loading works {#about-page-loading}

The process of loading a page involves three main events, each with a designated name:

1. `DOMContentLoaded` - The initial HTML document is loaded, which contains the HTML as it was rendered on the website's server. It also includes all of the JavaScript which will be run in the next step.
2. `load` - The page's JavaScript is executed.
3. `networkidle` - Network [XHR/Fetch requests](https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest) are sent and loaded, and data from these requests is populated onto the page. Many websites load essential data this way. These requests might be sent upon certain page events as well (not just the first load), such as scrolling or clicking.

Now that we have a solid understanding of the different stages of page loading, and the order they happen in, we can fully understand what a dynamic page is.
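For example, here's a minimal sketch of how a scraper can explicitly wait for each of these stages. It assumes [Playwright](https://playwright.dev/), one of the browser automation libraries covered later in the academy, where the event names above map directly to load states:

```js
import { chromium } from 'playwright';

const browser = await chromium.launch();
const page = await browser.newPage();

// 1. Wait only for the initial HTML document to be loaded.
await page.goto('https://example.com', { waitUntil: 'domcontentloaded' });

// 2. Wait until the page's JavaScript has been executed (the "load" event).
await page.waitForLoadState('load');

// 3. Wait until no network requests have fired for 500 ms, meaning any
//    dynamically requested XHR/Fetch data has (most likely) arrived.
await page.waitForLoadState('networkidle');

await browser.close();
```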
+ +## What is dynamic content {#what-is-dynamic-content} + +Dynamic content is any content that is rendered **after** the `DOMContentLoaded` event, which means any content loaded by JavaScript during the `load` event, or after any network XHR/Fetch requests have been made. + +Sometimes, it can be quite obvious when content is dynamically being rendered. For example, take a look at this gif: + + + + +![Image](https://blog.apify.com/content/images/2022/02/dynamicLoading-1--1--2.gif) + +Here, it's very clear that new content is being generated. As we scroll down the Twitter feed, we can see the scroll bar jumping back up, signifying that more elements have been created using JavaScript. + +Other times, it's less obvious though. Content can appear to be static (non-dynamic) when it is not, or even sometimes the other way around. + + + +--- +title: HTML elements +description: Learn about HTML elements. What they are, their types and how to work with them in a browser environment using JavaScript. +sidebar_position: 8.6 +slug: /concepts/html-elements +--- + +An HTML element is a building block of an HTML document. It is used to represent a piece of content on a web page, such as text, images, or videos. Each element is defined by a tag, which is a set of characters enclosed in angle brackets, such as `
<p>`, `<img>`, or `<div>`. For example, here's a paragraph element:

```html
<p>This is a paragraph of text.</p>
```

You can also add **attributes** to an element to provide additional information or to control how the element behaves. For example, the `src` attribute is used to specify the source of an image, like this:

```html
<img src="image.jpg" alt="A description of the image">
```

In JavaScript, you can use the **DOM** (Document Object Model) to interact with elements on a web page. For example, you can use the [`querySelector()` method](./querying_css_selectors.md) to select an element by its [CSS selector](./css_selectors.md), like this:

```js
const myElement = document.querySelector('#myId');
```

You can also use the `getElementById()` method to select an element by its `id`, like this:

```js
const myElement = document.getElementById('myId');
```

You can also use the `getElementsByTagName()` method to select all elements of a certain type, like this:

```js
const myElements = document.getElementsByTagName('p');
```

Once you have selected an element, you can use JavaScript to change its content, style, or behavior, as shown in the sketch below.

In summary, an HTML element is a building block of a web page. It is defined by a **tag** with **attributes**, which provide additional information or control how the element behaves. You can use the **DOM** (Document Object Model) to interact with elements on a web page.
+ + +--- +title: HTTP cookies +description: Learn a bit about what cookies are, and how they are utilized in scrapers to appear logged-in, view specific data, or even avoid blocking. +sidebar_position: 8.2 +slug: /concepts/http-cookies +--- + +# HTTP cookies {#cookies} + +**Learn a bit about what cookies are, and how they are utilized in scrapers to appear logged-in, view specific data, or even avoid blocking.** + +--- + +HTTP cookies are small pieces of data sent by the server to the user's web browser, which are typically stored by the browser and used to send later requests to the same server. Cookies are usually represented as a string (if used together with a plain HTTP request) and sent with the request under the **Cookie** [header](./http_headers.md). + +## Most common uses of cookies in crawlers {#uses-in-crawlers} + +1. To make the website show data to you as if you were a logged-in user. +2. To make the website show location-specific data (works for websites where you could set a zip code or country directly on the page, but unfortunately doesn't work for some location-based ads). +3. To make the website less suspicious of the crawler and let the crawler's traffic blend in with regular user traffic. + +For local testing, we recommend using the [**EditThisCookie**](https://chrome.google.com/webstore/detail/fngmhnnpilhplaeedifhccceomclgfbg) Chrome extension. + + + +--- +title: HTTP headers +description: Understand what HTTP headers are, what they're used for, and three of the biggest differences between HTTP/1.1 and HTTP/2 headers. +sidebar_position: 8.1 +slug: /concepts/http-headers +--- + +# HTTP headers {#headers} + +**Understand what HTTP headers are, what they're used for, and three of the biggest differences between HTTP/1.1 and HTTP/2 headers.** + +--- + +[HTTP headers](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers) let the client and the server pass additional information with an HTTP request or response. Headers are represented by an object where the keys are header names. Headers can also contain certain authentication tokens. + +In general, there are 4 different paths you'll find yourself on when scraping a website and dealing with headers: + +## No headers {#no-headers} + +For some websites, you won't need to worry about modifying headers at all, as there are no checks or verifications in place. + +## Some default headers required {#needs-default-headers} + +Some websites will require certain default browser headers to work properly, such as **User-Agent** (though, this header is becoming more obsolete, as there are more sophisticated ways to detect and block a suspicious user). + +Another example of such a "default" header is **Referer**. Some e-commerce websites might share the same platform, and data is loaded through XMLHttpRequests to that platform, which would not know which data to return without knowing which exact website is requesting it. + +## Custom headers required {#needs-custom-headers} + +A custom header is a non-standard HTTP header used for a specific website. For example, an imaginary website of **cool-stuff.com** might have a header with the name **X_Cool_Stuff_Token** which is required for every single request to a product page. + +Dealing with cases like these usually isn't difficult, but can sometimes be tedious. + +## Very specific headers required {#needs-specific-headers} + +The most challenging websites to scrape are the ones that require a full set of site-specific headers to be included with the request. 
For example, not only would they potentially require proper **User-Agent** and **Referer** headers mentioned above, but also **Accept**, **Accept-Language**, **Accept-Encoding**, etc. with specific values. + +Another big one to mention is the **Cookie** header. We cover this in more detail within the [cookies](./http_cookies.md) lesson. + +You could use Chrome DevTools to inspect request headers, and [Insomnia](../tools/insomnia.md) or [Postman](../tools/postman.md) to test how the website behaves with or without specific headers. + +## HTTP/1.1 vs HTTP/2 headers {#http1-vs-http2} + +HTTP/1.1 and HTTP/2 headers have several differences. Here are the three key differences that you should be aware of: + +1. HTTP/2 headers do not include status messages. They only contain status codes. +2. Certain headers are no longer used in HTTP/2 (such as **Connection** along with a few others related to it like **Keep-Alive**). In HTTP/2, connection-specific headers are prohibited. While some browsers will ignore them, Safari and other Webkit-based browsers will outright reject any response that contains them. Easy to do by accident, and a big problem. +3. While HTTP/1.1 headers are case-insensitive and could be sent by the browsers with capitalized letters (e.g. **Accept-Encoding**, **Cache-Control**, **User-Agent**), HTTP/2 headers must be lower-cased (e.g. **accept-encoding**, **cache-control**, **user-agent**). + +> To learn more about the difference between HTTP/1.1 and HTTP/2 headers, check out [this](https://httptoolkit.com/blog/translating-http-2-into-http-1/) article + + + +--- +title: Concepts +description: Learn about some common yet tricky concepts and terms that are used frequently within the academy, as well as in the world of scraper development. +sidebar_position: 18 +category: glossary +slug: /concepts +--- + +# Concepts 🤔 {#concepts} + +**Learn about some common yet tricky concepts and terms that are used frequently within the academy, as well as in the world of scraper development.** + +--- + +You'll see some terms and concepts frequently repeated throughout various courses in the academy. Many of these concepts are common, and even fundamental in the scraping world, which makes it necessary to explain them to our course-takers; however it would be inconvenient for our readers to explain these terms each time they appear in a lesson. + +Because of this slight dilemma, and because there are no outside resources which compile all of these concepts into an educational and digestible form, we've decided to do just that. Welcome to the **Concepts** section of the Apify Academy's **Glossary**! + +> It's important to note that there is no specific order to these concepts. All of them range in their relevance and importance to your every day scraping endeavors. + + + +--- +title: Querying elements +description: Learn how to query DOM elements using CSS selectors with the document.querySelector() and document.querySelectorAll() functions. +sidebar_position: 8.5 +slug: /concepts/querying-css-selectors +--- + +`document.querySelector()` and `document.querySelectorAll()` are JavaScript functions that allow you to select elements on a web page using [CSS selectors](./css_selectors.md). + +`document.querySelector()` is used to select the first element that matches the provided [CSS selector](./css_selectors.md). It returns the first matching element or null if no matching element is found. 
+ +Here's an example of how you can use it: + +```js +const firstButton = document.querySelector('button'); +``` + +This will select the first button element on the page and store it in the variable **firstButton**. + +`document.querySelectorAll()` is used to select all elements that match the provided CSS selector. It returns a `NodeList` (a collection of elements) that can be accessed and manipulated like an array. + +Here's an example of how you can use it: + +```js +const buttons = document.querySelectorAll('button'); +``` + +This will select all button elements on the page and store them in the variable "buttons". + +Both functions can be used to access and manipulate the elements in the web page. Here's an example on how you can use it to extract the text of all buttons. + +```js +const buttons = document.querySelectorAll('button'); +const buttonTexts = buttons.forEach((button) => button.textContent); +``` + +It's important to note that when using `querySelectorAll()` in a browser environment, it returns a live `NodeList`, which means that if the DOM changes, the NodeList will also change. + + + +--- +title: Robotic process automation +description: Learn the basics of robotic process automation. Make your processes on the web and other software more efficient by automating repetitive tasks. +sidebar_position: 8.7 +slug: /concepts/robotic-process-automation +--- + +# What is robotic process automation (RPA)? {#what-is-robotic-process-automation-rpa} + +**Learn the basics of robotic process automation. Make your processes on the web and other software more efficient by automating repetitive tasks.** + +--- + +RPA allows you to create software (also known as **bots**), which can imitate your digital actions. You can program bots to perform repetitive tasks faster, more reliably and more accurately than humans. Plus, they can do these tasks all day, every day. + +## What can I use RPA for? {#what-can-i-use-rpa-for} + +You can [use](https://apify.com/use-cases/rpa) RPA to automate any repetitive task you perform using software. The tasks can range from [analyzing content](https://apify.com/jakubbalada/content-checker) to monitoring web pages for changes (such as changes in your competitors' pricing). + +Other use cases for RPA include filling forms or [uploading files](https://apify.com/lukaskrivka/google-sheets) while you get on with more important tasks. And it's not just simple tasks you can automate. How about [processing your invoices](https://apify.com/katerinahronik/toggl-invoice-download) or posting content across several marketing channels at once? + +## How does RPA work? {#how-does-rpa-work} + +In a traditional automation workflow, you + +1. Break a repetitive process down into [manageable chunks](https://kissflow.com/workflow/workflow-automation/an-8-step-checklist-to-get-your-workflow-ready-for-automation/), e.g. open website => log into website => click button "X" => download section "Y", etc. +2. Program a bot that does each of those chunks. +3. Execute the chunks of code in the right order (or in parallel). + +With the advance of [machine learning](https://en.wikipedia.org/wiki/Machine_learning), it is becoming possible to [record](https://www.nice.com/info/rpa-guide/process-recorder-function-in-rpa/) your workflows and analyze which can be automated. However, this technology is still not perfected and at times can even be less practical than the manual process. + +## Is RPA the same as web scraping? 
{#is-rpa-the-same-as-web-scraping}

While [web scraping](../../webscraping/scraping_basics_javascript/index.md) is a kind of RPA, it focuses on extracting structured data. RPA covers the other tasks you can automate in a browser - everything except extracting information.

## Additional resources {#additional-resources}

An easy-to-follow [video](https://www.youtube.com/watch?v=9URSbTOE4YI) on what RPA is.

To learn about RPA in plain English, check out [this](https://enterprisersproject.com/article/2019/5/rpa-robotic-process-automation-how-explain) article.

[This](https://www.cio.com/article/227908/what-is-rpa-robotic-process-automation-explained.html) article explains what RPA is and discusses both its advantages and disadvantages.

You might also like to check out this article on [12 Steps to Automate Workflows](https://quandarycg.com/automating-workflows/).


---
title: The Apify CLI
description: Learn about, install, and log into the Apify CLI - your best friend for interacting with the Apify platform via your terminal.
sidebar_position: 9.1
slug: /tools/apify-cli
---

# The Apify CLI {#the-apify-cli}

**Learn about, install, and log into the Apify CLI - your best friend for interacting with the Apify platform via your terminal.**

---

The [Apify CLI](/cli) helps you create, develop, build and run Apify Actors, and manage the Apify cloud platform from any computer. It can be used to automatically generate the boilerplate for different types of projects, initialize projects, remotely call Actors on the platform, and run your own projects.

## Installing {#installing}

To install the Apify CLI, you'll first need npm, which comes preinstalled with Node.js. If you haven't yet installed Node, learn how to do that [here](../../webscraping/scraping_basics_javascript/data_extraction/computer_preparation.md). Additionally, make sure you've got an Apify account, as you will need to log in to the CLI to gain access to its full potential.

Open up a terminal instance and run the following command:

```shell
npm i -g apify-cli
```

This will install the CLI via npm.

## Logging in {#logging-in}

After the CLI has finished installing, navigate to the [Apify Console](https://console.apify.com?asrc=developers_portal) and click on **Settings**. Then, within your account settings, click **Integrations**. The page should look like this:

![Integrations tab on the Apify platform](./images/settings-integrations.jpg)

> We've censored out the **User ID** in the image because it is private information which should not be shared with anyone who is not trusted. The same goes for your **Personal API Token**.

Copy the **Personal API Token** and return to your terminal, entering this command:

```shell
apify login -t YOUR_TOKEN_HERE
```

If you see a log which looks like this, you're in!

```text
Success: You are logged in to Apify as YOUR_USERNAME!
```


---
title: EditThisCookie
description: Learn how to add, delete, and modify different cookies in your browser for testing purposes using the EditThisCookie Chrome extension.
sidebar_position: 9.7
slug: /tools/edit-this-cookie
---

# What's EditThisCookie? {#what-is-it}

**Learn how to add, delete, and modify different cookies in your browser for testing purposes using the EditThisCookie Chrome extension.**

---

**EditThisCookie** is a Chrome extension to manage your browser's cookies.
It can be added through the [Chrome Web Store](https://chrome.google.com/webstore/category/extensions). After adding it to Chrome, you'll see a button with a delicious cookie icon next to any other Chrome extensions you might have installed. Clicking on it will open a pop-up window with a list of all saved cookies associated with the currently opened page domain. + +![EditThisCookie popup](./images/edit-this-cookie-popup.png) + +## Functionalities {#functions} + +At the top of the popup, there is a row of buttons. From left to right, here is an explanation for each one: + +### Delete all cookies + +Clicking this button will remove all cookies associated with the current domain. For example, if you're logged into your Apify account and delete all the cookies, the website will ask you to log in again. + +### Reset + +A refresh button. + +### Add a new cookie + +Manually add a new cookie for the current domain. + +### Import cookies + +Allows you to add cookies in bulk. For example, if you have saved some cookies inside your crawler, or someone provided you with some cookies for the purpose of testing a certain website in your browser, they can be imported and automatically applied with this button. + +### Export cookies + +Copies an array of cookies associated with the current domain to the clipboard. The cookies can then be later inspected, added to your crawler, or imported by someone else using EditThisCookie. + +### Search + +Allows you to filter through cookies by name. + +### Options + +Will open a new browser tab with a bunch of EditThisCookie options. The options page allows you to tweak a few settings such as changing the export format, but you will most likely never need to change anything there. + +![EditThisCookie options](./images/edit-this-cookie-options.png) + + + +--- +title: Tools +description: Discover a variety of tools that can be used to enhance the scraper development process, or even unlock doors to new scraping possibilities. +sidebar_position: 17 +category: glossary +slug: /tools +--- + +# Tools 🔧 {#tools} + +**Discover a variety of tools that can be used to enhance the scraper development process, or even unlock doors to new scraping possibilities.** + +--- + +Here at Apify, we've found many tools, some quite popular and well-known and some niche, which can aid any developer in their scraper development process. We've compiled some of our favorite developer tools into this short section. Each tool featured here serves a specific purpose, if not multiple purposes, which are directly relevant to Web Scraping and Web Automation. + +In any lesson in the academy where a tool which was not already discussed in the course is being used, a short lesson about the tool will be featured in the **Tools** section right here in the Apify Academy's **Glossary** and referenced with a link within the lesson. + + + +--- +title: Insomnia +description: Learn about Insomnia, a valuable tool for testing requests and proxies when building scalable web scrapers. +sidebar_position: 9.2 +slug: /tools/insomnia +--- + +# What is Insomnia {#what-is-insomnia} + +**Learn about Insomnia, a valuable tool for testing requests and proxies when building scalable web scrapers.** + +--- + +Despite its name, the [Insomnia](https://insomnia.rest/download) desktop application has absolutely nothing to do with having a lack of sleep. Rather, it is a tool to build and test APIs. 
If you've already read about [Postman](./postman.md), you already know what Insomnia can be used for, as they both practically do the same exact things. +While Insomnia shares similarities with Postman, such as the ability to send requests with specific headers, cookies, and payloads, it has a few notable differences. One key difference is Insomnia's feature to display the entire request timeline. + +Insomnia can be downloaded from its [official website](https://insomnia.rest/download), and its features can be read about in the [official documentation](https://docs.insomnia.rest/). + +## The Insomnia interface {#insomnia-interface} + +After opening the app, you'll first need to create a new request. After creating the request, you'll see an interface that looks like this: + +![Insomnia interface](./images/insomnia-interface.jpg) + +Let's break down the main sections: + +### List of requests + +You can configure multiple requests with a custom payload, headers, cookies, parameters, etc. They are automatically saved in the list of requests until deleted. + +### Address bar + +The place where you select the type of request to send (**GET**, **POST**, **PUT**, **DELETE**, etc.), specify the URI of the request and send the request with the **Send** button. + +### Request options + +Here, you can add a request payload, specify authorization parameters, add query parameters, and attach headers to the request. + +### Response + +Where the response body is displayed after the request has been sent. Like in Postman, the request can be viewed in preview mode, pretty-printed, or in its raw form. This section also has the **Headers** and **Cookies** tabs, which respectively show the request headers and cookies. + +## Request timeline {#request-timeline} + +The one feature of Insomnia that separates it from Postman is the **Timeline**. + +![Request timeline](./images/insomnia-timeline.jpg) + +This feature allows you to see information about the request that is not present in the response body. + +## Using proxies in Insomnia {#using-proxies} + +In order to use a proxy, you need to specify the proxy's parameters in Insomnia's preferences. In preferences, scroll down to the **HTTP Network Proxy** section under the **General** tab and specify the full proxy URL there: + +![Configuring a proxy](./images/insomnia-proxy.png) + +## Managing the cookies cache {#managing-cookies-cache} + +Insomnia keeps the cookies for the requests you have already sent before. This might result in you receiving a different response within your scraper from what you're receiving in Insomnia, as a necessary cookie is not present in the request sent by the scraper. To check whether or not some cookies associated with a certain request have been cached, click on the **Cookies** button at the top of the list of requests: + +![Click on the "Cookies" button](./images/insomnia-cookies.png) + +This will bring up the **Manage cookies** window, where all cached cookies can be viewed, edited, or deleted. + +![The "Manage Cookies" tab](./images/insomnia-manage-cookies.jpg) + +## Postman or Insomnia {#postman-or-insomnia} + +The application you choose to use is completely up to your personal preference, and will not affect your development workflow. If viewing timelines of the requests you send is important to you, then you should go with Insomnia; however, if that doesn't matter, choose the one that has the most intuitive interface for you. 
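Once a proxy checks out in Insomnia, you'll usually want to plug the same proxy into your scraper. Here's a minimal sketch of doing that, assuming the [got-scraping](https://github.com/apify/got-scraping) package; the proxy URL below is a placeholder for the one you verified in Insomnia's **HTTP Network Proxy** settings:

```js
import { gotScraping } from 'got-scraping';

// Placeholder proxy URL - substitute the credentials you tested in Insomnia.
const response = await gotScraping({
    url: 'https://example.com',
    proxyUrl: 'http://user:password@proxy.example.com:8000',
});

console.log(response.statusCode);
```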
+ + + +--- +title: ModHeader +description: Discover a super useful Chrome extension called ModHeader, which allows you to modify your browser's HTTP request headers. +sidebar_position: 9.5 +slug: /tools/modheader +--- + +# What is ModHeader? {#what-is-modheader} + +**Discover a super useful Chrome extension called ModHeader, which allows you to modify your browser's HTTP request headers.** + +--- + +If you read about [Postman](./postman.md), you might remember that you can use it to modify request headers before sending a request. This is great, but the main problem is that Postman can only make static requests - meaning, it is unable to load JavaScript or any [dynamic content](../concepts/dynamic_pages.md). + +[ModHeader](https://chrome.google.com/webstore/detail/idgpnmonknjnojddfkpgkljpfnnfcklj) is a Chrome extension which can be used to modify the HTTP headers of the requests you make with your browser. This means that, for example, if your scraper using a headless browser Puppeteer is being blocked due to an improper **User-Agent** header, you can use ModHeader to test the target website and quickly solve the issue. + +## The ModHeader interface {#interface} + +After you install the ModHeader extension, you should see it pinned in Chrome's task bar. When you click it, you'll see an interface like this pop up: + +![Modheader's interface](./images/modheader.jpg) + +Here, you can add headers, remove headers, and even save multiple collections of headers that you can toggle between (which are called **Profiles** within the extension itself). + +## Use cases {#use-cases} + +When scraping dynamic websites, sometimes some specific headers are required to access certain pages. The most popularly required headers are generally `User-Agent` and `referer`. ModHeader, and other tools like it, make it easy to test requests to these websites right in your browser before writing logic for your scraper. + + + +--- +title: Postman +description: Learn about Postman, a valuable tool for testing requests and proxies when building scalable web scrapers. +sidebar_position: 9.3 +slug: /tools/postman +--- + +# What is Postman? {#what-is-postman} + +**Learn about Postman, a valuable tool for testing requests and proxies when building scalable web scrapers.** + +--- + +[Postman](https://www.postman.com/) is a powerful collaboration platform for API development and testing. For scraping use-cases, it's mainly used to test requests and proxies (such as checking the response body of a raw request, without loading any additional resources such as JavaScript or CSS). This tool can do much more than that, but we will not be discussing all of its capabilities here. Postman allows us to test requests with cookies, headers, and payloads so that we can be entirely sure what the response looks like for a request URL we plan to eventually use in a scraper. + +The desktop app can be downloaded from its [official download page](https://www.postman.com/downloads/), or the web app can be used with a signup - no download required. If this is your first time working with a tool like Postman, we recommend checking out their [Getting Started guide](https://learning.postman.com/docs/introduction/overview/). 
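Once a request behaves the way you expect in Postman, carrying it over into scraper code is straightforward. Here's a rough sketch using the built-in `fetch` of Node.js 18+; every URL, header, and cookie value below is a placeholder for the values you verified in Postman:

```js
// Placeholder request mirroring one tested in Postman.
const response = await fetch('https://example.com/api/data', {
    headers: {
        'User-Agent': 'Mozilla/5.0 ...',
        'Referer': 'https://example.com/',
        'Cookie': 'session=abc123',
    },
});

console.log(response.status);
console.log(await response.text());
```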
## Understanding the interface {#understanding-the-interface}

![A basic outline of Postman's interface](./images/postman-interface.png)

The following four sections are essential to get familiar with Postman:

### Tabs

Multiple test endpoints/requests can be opened at one time, each of which will be held within its own tab.

### Address bar

The section in which you select the type of request to send, the URL of the request, and of course, send the request with the **Send Request** button.

### Request options

This is a very useful section where you can view and edit structured query parameters, as well as specify any authorization parameters, headers, or payloads.

### Response

After sending a request, the response's body will be found here, along with its cookies and headers. The response body can be viewed in various formats - **Pretty-Print**, **Raw**, or **Preview**.

## Using and testing proxies {#using-proxies}

In order to use a proxy, the proxy's server and configuration must be provided in the **Proxy** tab in Postman settings.

![Proxy configuration in Postman settings](./images/postman-proxy.png)

After configuring a proxy, the next request sent will attempt to use it. To switch off the proxy, its details don't need to be deleted - just un-tick the **Add a custom proxy configuration** option in settings to disable it.

## Managing the cookies cache {#managing-cookies}

Postman keeps a cache of the cookies from all previous responses of a certain domain, which can be a blessing, but also a curse. Sometimes, you might notice that a request is going through just fine with Postman, but that your scraper is being blocked.

More often than not in these cases, the reason is that the endpoint being reached requires a valid `cookie` header to be present when sending the request, and because of Postman's cache, it is sending a valid cookie within each request's headers, while your scraper is not. Another reason this may happen is that you are sending Postman requests without a proxy (using your local IP address), while your scraper is using a proxy that could potentially be getting blocked.

To check whether any cookies associated with a certain request are cached in Postman, click on the **Cookies** button in any opened request tab:

![Button to view the cached cookies](./images/postman-cookies-button.png)

Clicking on this button opens a **MANAGE COOKIES** window, where a list of all cached cookies per domain can be seen. If we had been previously sending multiple requests to **https://github.com/apify**, within this window we would be able to find cached cookies associated with github.com. Cookies can also be edited (to update some specific values), or deleted (to send a "clean" request without any cached data) here.

![Managing cookies in Postman with the "MANAGE COOKIES" window](./images/postman-manage-cookies.png)

### Some alternatives to Postman {#alternatives}

- [Hoppscotch](https://hoppscotch.io/)
- [Insomnia](./insomnia.md)
- [Testfully](https://testfully.io/)


---
title: Proxyman
description: Learn about Proxyman, a tool for viewing all network requests that are coming through your system. Filter by response type, by a keyword, or by application.
sidebar_position: 9.4
slug: /tools/proxyman
---

# What's Proxyman? {#what-is-proxyman}

**Learn about Proxyman, a tool for viewing all network requests that are coming through your system.
Filter by response type, by a keyword, or by application.** + +--- + +Though the name sounds very similar to [Postman](./postman.md), [**Proxyman**](https://proxyman.io/) is used for a different purpose. Rather than for manually sending and analyzing the responses of requests, Proxyman is a tool for macOS that allows you to view and analyze the HTTP/HTTPS requests that are going through your device. This is done by routing all of your requests through a proxy, which intercepts them and allows you to view data about them. Because it's just a proxy, the HTTP/HTTPS requests going through iOS devices, Android devices, and even iOS simulators can also be viewed with Proxyman. + +If you've already gone through the [**Locating and learning** lesson](../../webscraping/api_scraping/general_api_scraping/locating_and_learning.md) in the **API scraping** section, you can think of Proxyman as an advanced Network Tab, where you can see requests that you sometimes can't see in regular browser DevTools. + +## The basics {#the-basics} + +Though the application offers a whole lot of advanced features, there are only a few main features you'll be utilizing when using Proxyman for scraper development purposes. Let's open up Proxyman and take a look at some of the basic features: + +### Apps + +The **Apps** tab allows you to both view all of the applications on your machine which are sending requests, as well as filter requests based on application. + +![Apps tab in Proxyman](./images/proxyman-apps-tab.png) + +### Results + +Let's open up Safari and visit **apify.com**, then check back in Proxyman to see all of the requests Safari has made when visiting the website. + +![Results in Proxyman](./images/proxyman-results.jpg) + +We can see all of the requests related to us visiting **apify.com**. Then, by clicking a request, we can see a whole lot of information about it. The most important information for you, however, will usually be the request and response **headers** and **body**. + +![View a request](./images/proxyman-view-request.jpg) + +### Filtering + +Sometimes, there can be hundreds (or even thousands) of requests that appear in the list. Rather than spending your time rooting through all of them, you can use the plethora of filtering methods that Proxyman offers to find exactly what you are looking for. + +![Filter requests with the filter options](./images/proxyman-filter.png) + +## Alternatives {#alternatives} + +Since Proxyman is only available for macOS, it's only appropriate to list some alternatives to it that are accessible to our Windows and Linux friends: + +- [Burp Suite](https://portswigger.net/burp) +- [Charles Proxy](https://www.charlesproxy.com/documentation/installation/) +- [Fiddler](https://www.telerik.com/fiddler) + + + +--- +title: Quick JavaScript Switcher +description: Discover a handy tool for disabling JavaScript on a certain page to determine how it should be scraped. Great for detecting SPAs. +sidebar_position: 9.9 +slug: /tools/quick-javascript-switcher +--- + +# Quick JavaScript Switcher + +**Discover a handy tool for disabling JavaScript on a certain page to determine how it should be scraped. Great for detecting SPAs.** + +--- + +**Quick JavaScript Switcher** is a Chrome extension that allows you to switch on/off the JavaScript for the current page with one click. It can be added to your browser via the [Chrome Web Store](https://chrome.google.com/webstore/category/extensions). 
After adding it to Chrome, you'll see its respective button next to any other Chrome extensions you might have installed. + +If JavaScript is enabled - clicking the button will switch it off and reload the page. The next click will re-enable JavaScript and refresh the page. This extension is useful for checking whether a certain website will work without JavaScript (and thus could be parsed without using a browser with a plain HTTP request) or not. + +![JavaScript toggled on (enabled)](./images/js-on.png) + +![JavaScript toggled off (disabled)](./images/js-off.png) + + + +--- +title: SwitchyOmega +description: Discover SwitchyOmega, a Chrome extension to manage and switch between proxies, which is extremely useful when testing proxies for a scraper. +sidebar_position: 9.6 +slug: /tools/switchyomega +--- + +# What is SwitchyOmega? {#what-is-switchyomega} + +**Discover SwitchyOmega, a Chrome extension to manage and switch between proxies, which is extremely useful when testing proxies for a scraper.** + +--- + +SwitchyOmega is a Chrome extension for managing and switching between proxies which can be added in the [Chrome Webstore](https://chrome.google.com/webstore/detail/padekgcemlokbadohgkifijomclgjgif). + +After adding it to Chrome, you can see the SwitchyOmega icon somewhere amongst all your other Chrome extension icons. Clicking on it will display a menu, where you can select various different connection profiles, as well as open the extension's options. + +![The SwitchyOmega interface](./images/switchyomega.png) + +## Options {#options} + +The options page has the following: + +- General settings/interface settings (which you can keep to their default values). +- A list of proxy profiles (separate profiles can be added for different proxy groups, or for different countries for the residential proxy group, etc). +- The **New profile** button +- The main section, which shows the selected settings sub-section or selected proxy profile connection settings. + +![SwitchyOmega options page](./images/switchyomega-options.png) + +## Adding a new proxy {#adding-a-new-proxy} + +After clicking on **New profile**, you'll be greeted with a **New profile** popup, where you can give the profile a name and select the type of profile you'd like to create. To add a proxy profile, select the respective option and click **Create**. + +![Adding a proxy profile](./images/switchyomega-proxy-profile.png) + +Then, you need to fill in the proxy settings: + +![Adding proxy settings](./images/switchyomega-proxy-settings.png) + +If the proxy requires authentication, click on the lock icon and fill in the details within the popup. + +![Authenticating a proxy](./images/switchyomega-auth.png) + +Don't forget to click on **Apply changes** within the left-hand side menu under **Actions**! + +## Selecting proxy profiles {#selecting-profiles} + +And that's it! All of your proxy profiles will appear in the menu. When one is chosen, the page you are currently on will be reloaded using the selected proxy profile. + +![SwitchyOmega menu](./images/switchyomega-menu.png) + + + +--- +title: User-Agent Switcher +description: Learn how to switch your User-Agent header to different values in order to monitor how a certain site responds to the changes. 
+sidebar_position: 9.8 +slug: /tools/user-agent-switcher +--- + +# User-Agent Switcher + +**Learn how to switch your User-Agent header to different values in order to monitor how a certain site responds to the changes.** + +--- + +**User-Agent Switcher** is a Chrome extension that allows you to quickly change your **User-Agent** and see how a certain website would behave with different user agents. After adding it to Chrome, you'll see a **Chrome UA Spoofer** button in the extension icons area. Clicking on it will open up a list of various **User-Agent** groups. + +![User-Agent Switcher groups](./images/user-agent-switcher-groups.png) + +Clicking on a group will display a list of possible User-Agents to set. + +![Default available Internet Explorer agents](./images/user-agent-switcher-agents.png) + +After setting the **User-Agent**, the page will be refreshed. + +## Configuration + +The extension configuration page allows you to edit the **User-Agent** list in case you want to add a specific User-Agent that isn't already provided. You can find some other options, but most likely you will never need to modify those. + +![User-Agent Switcher configuration page](./images/user-agent-switcher-config.png) + + + +--- +title: Why a glossary? +description: Browse important web scraping concepts, tools and topics in succinct articles explaining common web development terms in a web scraping and automation context. +sidebar_position: 16 +category: glossary +slug: /glossary +--- + +# Why a glossary? {#why-a-glossary} + +**Browse important web scraping concepts, tools and topics in succinct articles explaining common web development terms in a web scraping and automation context.** + +--- + +Web scraping comes with a lot of terms that are specific to the area. Some of them are tools and libraries, like [Playwright](../webscraping/puppeteer_playwright/index.md) or Insomnia. Others are general topics that have a special place in web scraping, like headless browsers or browser fingerprints. And some topics are related to all web development, but play a special role in web scraping, such as HTTP headers and cookies. + +When writing the academy, we very early on realized that we needed a place to reference these terms, but quickly found out that the usual tutorials and guides available all over the web weren't the most ideal. The explanations were too broad and generic and did not fit the web scraping context. With the **Apify Academy** glossary, we aim to provide you with short articles and lessons that provide the necessary web scraping context for specific terms, then link to other parts of the web for further in-depth reading. + + + +--- +title: Deploying +description: Push local code to the platform, or create a new Actor on the console and integrate it with a Git repo to optionally automatically rebuild any new changes. +sidebar_position: 5 +slug: /deploying-your-code/deploying +--- + +# Deploying {#deploying} + +**Push local code to the platform, or create a new Actor on the console and integrate it with a Git repo to optionally automatically rebuild any new changes.** + +--- + +Once you've **actorified** your code, there are two ways to deploy it to the Apify platform. You can either push the code directly from your local machine onto the platform, or you can create a blank Actor in the web interface, and then integrate its source code with a GitHub repository. 
+ +## With a Git repository {#with-git-repository} + +Before we deploy our project onto the Apify platform, let's ensure that we've pushed the changes we made in the last 3 lessons into our remote GitHub repository. + +> The benefit of using this method is that any time you push to the Git repo, the code on the platform is also updated and the Actor is automatically rebuilt. Also, you don't have to use a GitHub repository - you can use GitLab or any other service you'd like. + +### Creating the Actor + +Before anything can be integrated, we've gotta create a new Actor. Let's head over to our [Apify Console](https://console.apify.com?asrc=developers_portal), navigate to the **Development** subsection and click on the **Develop new** button, then select the **Empty** template. + +![Create new button](../getting_started/images/develop-new-actor.png) + +### Changing source code location {#change-source-code} + +In the **Source** tab on the new Actor's page, we'll click the dropdown menu under **Source code** and select **Git repository**. By default, this is set to **Web IDE**. + +![Select source code location](../expert_scraping_with_apify/images/select-source-location.png) + +Now we'll paste the link to our GitHub repository into the **Git URL** text field and click **Save**. + +### Adding the webhook to the repository {#adding-repo-webhook} + +The final step is to click on **API** in the top right corner of our Actor's page: + +![API button](../expert_scraping_with_apify/images/api-button.jpg) + +And scroll through all of the links until we find the **Build Actor** API endpoint. Now we'll copy this endpoint's URL, head back over to our GitHub repository and navigate to **Settings > Webhooks > Add webhook**. The final thing to do is to paste the URL and save the webhook. + +![Adding a webhook to your GitHub repo](../../../platform/actors/development/deployment/images/ci-github-integration.png) + +That's it! The Actor should now pull its source code from the repo and automatically build. + +## Without a GitHub repository (using the Apify CLI) {#with-apify-cli} + +> If you don't yet have the Apify CLI, learn how to install it and log in by following along with [this brief lesson](../../glossary/tools/apify_cli.md) about it. + +If you're logged in to the Apify CLI, the `apify push` command can be used to push the code straight onto the Apify platform from your local machine (no GitHub repository required), where it will automatically be built for you. Prior to running this command, make sure that you have an **.actor/actor.json** file at the root of the project. If you don't already have one, you can use `apify init .` to automatically generate one for you. + +One important thing to note is that you can use a `.gitignore` file to exclude files from being pushed. When you use `apify push` without a `.gitignore`, the full folder contents will be pushed, meaning that even the **storage** and **node_modules** will be pushed. These files are unnecessary to push, as they are both generated on the platform. + +> The `apify push` command should only really be used for quickly pushing and testing Actors on the platform during development. If you are ready to make your Actor public, use a Git repository instead, as you will reap the benefits of using Git and others will be able to contribute to the project. + +## Deployed! {#deployed} + +Great! Once you've pushed your Actor to the platform, you should see it in the list of Actors under the **Actors** tab. 
If you used `apify push`, you'll have access to the **multifile editor** (discussed [here](../getting_started/creating_actors.md)).

![Deployed Actor on the Apify platform](./images/actor-page.jpg)

The next step is to test your Actor and experiment with the vast amount of features the platform has to offer.

## Wrap up {#next}

That's it! In this short section, you've learned how to take your code written in any programming language and turn it into a usable Actor that can run on the Apify platform! The next step is to start looking into the [paid Actors](/platform/actors/publishing) program, which allows you to monetize your work.


---
title: Dockerfile
description: Understand how to write a Dockerfile (Docker image blueprint) for your project so that it can be run within a Docker container on the Apify platform.
sidebar_position: 4
slug: /deploying-your-code/docker-file
---

import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# Dockerfile {#dockerfile}

**Understand how to write a Dockerfile (Docker image blueprint) for your project so that it can be run within a Docker container on the Apify platform.**

---

The **Dockerfile** is a file which gives the Apify platform (or Docker, more specifically) instructions on how to create an environment for your code to run in. Every Actor must have a Dockerfile, as Actors run in Docker containers.

> Actors on the platform are always run in Docker containers; however, they can also be run in local Docker containers. This is not common practice though, as it requires more setup and a deeper understanding of Docker. For testing, it's best to run the Actor on the local OS (this requires you to have the underlying runtime installed, such as Node.js, Python, Rust, Go, etc.).

## Base images {#base-images}

If your project doesn't already contain a Dockerfile, don't worry! Apify offers [many base images](/sdk/js/docs/guides/docker-images) that are optimized for building and running Actors on the platform, which can be found [here](https://hub.docker.com/u/apify). When using a language for which Apify doesn't provide a base image, [Docker Hub](https://hub.docker.com/) provides a ton of free Docker images for most use-cases, upon which you can create your own images.

> Tip: You can see all of Apify's Docker images [on DockerHub](https://hub.docker.com/u/apify).

At the base level, each Docker image contains a base operating system and usually also a programming language runtime (such as Node.js or Python). You can also find images with preinstalled libraries or install them yourself during the build step.

Once you find the base image you need, you can add it as the initial `FROM` statement:

```Dockerfile
FROM apify/actor-node:16
```

> For syntax highlighting in your Dockerfiles, download the [**Docker** VSCode extension](https://code.visualstudio.com/docs/containers/overview#_installation).

## Writing the file {#writing-the-file}

The rest of the Dockerfile is about copying the source code from the local filesystem into the container's filesystem, installing libraries, and setting the `CMD` instruction (which, if not set, falls back to the one defined by the parent image).

> If you are not using a base image from Apify, then you should specify how to launch the source code of your Actor with the `CMD` instruction.
+
+Here's the Dockerfile for our Node.js example project's Actor:
+
+
+
+
+```Dockerfile
+# First, specify the base Docker image.
+FROM apify/actor-node:16
+
+# Second, copy just package.json and package-lock.json since they are the only files
+# that affect npm install in the next step
+COPY package*.json ./
+
+# Install npm packages, skip optional and development dependencies to keep the
+# image small. Avoid logging too much and print the dependency tree for debugging
+RUN npm --quiet set progress=false \
+    && npm install --only=prod --no-optional \
+    && echo "Installed npm packages:" \
+    && (npm list --all || true) \
+    && echo "Node.js version:" \
+    && node --version \
+    && echo "npm version:" \
+    && npm --version
+
+# Next, copy the remaining files and directories with the source code.
+# Since we do this after npm install, quick build will be really fast
+# for simple source file changes.
+COPY . ./
+
+```
+
+
+
+
+```Dockerfile
+# First, specify the base Docker image.
+# You can also use any other image from Docker Hub.
+FROM apify/actor-python:3.9
+
+# Second, copy just requirements.txt into the Actor image,
+# since it should be the only file that affects "pip install" in the next step,
+# in order to speed up the build
+COPY requirements.txt ./
+
+# Install the packages specified in requirements.txt,
+# Print the installed Python version, pip version
+# and all installed packages with their versions for debugging
+RUN echo "Python version:" \
+    && python --version \
+    && echo "Pip version:" \
+    && pip --version \
+    && echo "Installing dependencies from requirements.txt:" \
+    && pip install -r requirements.txt \
+    && echo "All installed Python packages:" \
+    && pip freeze
+
+# Next, copy the remaining files and directories with the source code.
+# Since we do this after installing the dependencies, quick build will be really fast
+# for most source file changes.
+COPY . ./
+
+# Specify how to launch the source code of your Actor.
+# By default, the main.py file is run
+CMD python3 main.py
+
+```
+
+
+
+
+## Examples {#examples}
+
+The examples above show how to deploy Actors written in Node.js or Python, but you can use any language. For inspiration, here are a few examples for other languages: Go, Rust, Julia.
+
+
+
+
+```Dockerfile
+FROM golang:1.17.1-alpine
+
+WORKDIR /app
+COPY . .
+
+RUN go mod download
+
+RUN go build -o /example-actor
+CMD ["/example-actor"]
+
+```
+
+
+
+
+```Dockerfile
+# Image with prebuilt Rust. We use the newest 1.* version
+# https://hub.docker.com/_/rust
+FROM rust:1
+
+# We copy only the package setup so we can cache building all dependencies
+COPY Cargo* ./
+
+# We need to have a dummy main.rs file to be able to build
+RUN mkdir src && echo "fn main() {}" > src/main.rs
+
+# Build dependencies only
+# Since we do this before copying the rest of the files,
+# the dependencies will be cached by Docker, allowing fast
+# build times for new code changes
+RUN cargo build --release
+
+# Delete the dummy main.rs
+RUN rm -rf src
+
+# Copy the rest of the files
+COPY . ./
+
+# Build the source files
+RUN cargo build --release
+
+CMD ["./target/release/actor-example"]
+
+```
+
+
+
+
+```Dockerfile
+FROM julia:1.7.1-alpine
+
+WORKDIR /app
+COPY . .
+
+RUN julia install.jl
+
+CMD ["julia", "main.jl"]
+
+```
+
+
+
+
+## Next up {#next}
+
+In the [next lesson](./deploying.md), we'll push our code directly to the Apify platform, or create and integrate a new Actor on the Apify platform with our project's GitHub repository.
+
+
+
+---
+title: Deploying your code
+description: In this course, learn how to take an existing project of yours and deploy it to the Apify platform as an Actor.
+sidebar_position: 9
+category: apify platform
+slug: /deploying-your-code
+---
+
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# Deploying your code to Apify {#deploying}
+
+**In this course, learn how to take an existing project of yours and deploy it to the Apify platform as an Actor.**
+
+---
+
+This section will discuss how to use your newfound knowledge of the Apify platform and Actors from the [**Getting started**](../getting_started/index.md) section to deploy your existing project's code to the Apify platform as an Actor.
+Any program running in a Docker container can become an Apify Actor.
+
+![The deployment workflow](../../images/deployment-workflow.png)
+
+Apify provides detailed guidance on how to deploy Node.js and Python programs as Actors, but beyond that, you're not limited in which programming language you choose for your scraper.
+
+![Supported languages](../../images/supported-languages.jpg)
+
+Here are a few examples of Actors in other languages:
+
+- [Rust Actor](https://apify.com/lukaskrivka/rust-actor-example)
+- [Go Actor](https://apify.com/jirimoravcik/go-actor-example)
+- [Julia Actor](https://apify.com/jirimoravcik/julia-actor-example)
+
+## The "actorification" workflow {#workflow}
+
+Follow these four main steps to turn a piece of code into an Actor:
+
+1. Handle [accepting inputs and writing outputs](./inputs_outputs.md).
+2. Create an [input schema](./input_schema.md) **(optional)**.
+3. Add a [Dockerfile](./docker_file.md).
+4. [Deploy](./deploying.md) to the Apify platform!
+
+## Our example project
+
+For this section, we'll be turning this example project into an Actor:
+
+
+
+
+```js
+// index.js
+const addAllNumbers = (...nums) => nums.reduce((total, curr) => total + curr, 0);
+
+console.log(addAllNumbers(1, 2, 3, 4)); // -> 10
+```
+
+
+
+
+```py
+# index.py
+def add_all_numbers(nums):
+    total = 0
+
+    for num in nums:
+        total += num
+
+    return total
+
+print(add_all_numbers([1, 2, 3, 4])) # -> 10
+
+```
+
+
+
+
+> For all lessons in this section, we'll have examples for both Node.js and Python so that you can follow along in either language.
+
+
+
+## Next up {#next}
+
+In the [next lesson](./inputs_outputs.md), we'll learn how to accept input into our Actor, as well as deliver output.
+
+
+
+---
+title: Input schema
+description: Learn how to generate a user interface on the platform for your Actor's input with a single file - the INPUT_SCHEMA.json file.
+sidebar_position: 2
+slug: /deploying-your-code/input-schema
+---
+
+# Input schema {#input-schema}
+
+**Learn how to generate a user interface on the platform for your Actor's input with a single file - the INPUT_SCHEMA.json file.**
+
+---
+
+Though writing an [input schema](/platform/actors/development/actor-definition/input-schema) for an Actor is not a required step, it is a highly recommended one. The Apify platform will read the **INPUT_SCHEMA.json** file within the root of your project and generate a user interface for entering input into your Actor, which makes it significantly easier for non-developers (and even developers) to configure and understand the inputs your Actor can receive. Because of this, we'll be writing an input schema for our example Actor.
+ +> Without an input schema, the users of our Actor will have to provide the input in JSON format, which can be problematic for those who are not familiar with JSON. + +## Schema title & description {#title-and-description} + +In the root of our project, we'll create a file named **INPUT_SCHEMA.json** and start writing the first part of the schema. + +```json +{ + "title": "Adding Actor input", + "description": "Add all values in list of numbers with an arbitrary length.", + "type": "object", + "schemaVersion": 1 +} +``` + +The **title** and **description** describe what the input schema is for, and a bit about what the Actor itself does. + +## Properties {#properties} + +In order to define all of the properties our Actor is expecting, we must include them within an object with a key of **properties**. + +```json +{ + "title": "Adding Actor input", + "description": "Add all values in list of numbers with an arbitrary length.", + "type": "object", + "schemaVersion": 1, + "properties": { + "numbers": { + "title": "Number list", + "description": "The list of numbers to add up." + } + } +} +``` + +Each property's key corresponds to the name we're expecting within our code, while the **title** and **description** are what the user will see when configuring input on the platform. + +## Property types & editor types {#property-types} + +Within our new **numbers** property, there are two more fields we must specify. Firstly, we must let the platform know that we're expecting an array of numbers with the **type** field. Then, we should also instruct Apify on which UI component to render for this input property. In our case, we have an array of numbers, which means we should use the **json** editor type that we discovered in the ["array" section](/platform/actors/development/actor-definition/input-schema/specification/v1#array) of the input schema documentation. We could also use **stringList**, but then we'd have to parse out the numbers from the strings. + +```json +{ + "title": "Adding Actor input", + "description": "Add all values in list of numbers with an arbitrary length.", + "type": "object", + "schemaVersion": 1, + "properties": { + "numbers": { + "title": "Number list", + "description": "The list of numbers to add up.", + "type": "array", + "editor": "json" + } + } +} +``` + +## Required fields {#required-fields} + +The great thing about building an input schema is that it will automatically validate your inputs based on their type, maximum value, minimum value, etc. Sometimes, you want to ensure that the user will always provide input for certain fields, as they are crucial to the Actor's run. This can be done by using the **required** field and passing in the names of the fields you'd like to require. + +```json +{ + "title": "Adding Actor input", + "description": "Add all values in list of numbers with an arbitrary length.", + "type": "object", + "schemaVersion": 1, + "properties": { + "numbers": { + "title": "Number list", + "description": "The list of numbers to add up.", + "type": "array", + "editor": "json" + } + }, + "required": ["numbers"] +} +``` + +For our case, we've made the **numbers** field required, as it is crucial to our Actor's run. 
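+
+One optional nicety - not required for our Actor - is prefilling the field with example values via the **prefill** property from the input schema specification (you'll also see it used in schemas later in this course), so users see a working input right away. A sketch of just the **numbers** property with a hypothetical prefill value:
+
+```json
+{
+    "numbers": {
+        "title": "Number list",
+        "description": "The list of numbers to add up.",
+        "type": "array",
+        "editor": "json",
+        "prefill": [1, 2, 3, 4]
+    }
+}
+```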
+
+## Final thoughts {#final-thoughts}
+
+Here is what the input schema we wrote will render on the platform:
+
+![Rendered UI from input schema](./images/rendered-ui.png)
+
+Later on, we'll be building more complex input schemas, as well as discussing how to write quality input schemas that allow the user to understand the Actor and not become overwhelmed.
+
+You're not expected to memorize all of the fields that properties can take or the different editor types available, which is why it's always good to reference the [input schema documentation](/platform/actors/development/actor-definition/input-schema) when writing a schema.
+
+## Next up {#next}
+
+In the [next lesson](/platform/actors/development/actor-definition/dataset-schema), we'll learn how to generate an appealing Overview table to display our Actor's results in real time, so users can get immediate feedback about the data being extracted.
+
+
+
+---
+title: Inputs & outputs
+description: Learn to accept input into your Actor, do something with it, and then return output. Actors can be written in any language, so this concept is language agnostic.
+sidebar_position: 1
+slug: /deploying-your-code/inputs-outputs
+---
+
+# Inputs & outputs {#inputs-outputs}
+
+**Learn to accept input into your Actor, do something with it, and then return output. Actors can be written in any language, so this concept is language agnostic.**
+
+---
+
+Most projects expect some sort of input to run on, and most produce some sort of output once they've finished running. Apify provides a convenient way to handle inputs and deliver outputs.
+
+An important thing to understand regarding inputs and outputs is that they are read/written differently depending on where the Actor is running:
+
+- If your Actor is running locally, the inputs/outputs are usually provided in the filesystem, and environment variables are injected either by you, the developer, or by the Apify CLI by running the project with the `apify run` command.
+
+- While running in a Docker container on the platform, environment variables are automatically injected, and inputs & outputs are provided and modified using Apify's REST API.
+
+## A bit about storage {#about-storage}
+
+You can read/write your inputs/outputs to the [key-value store](/platform/storage/key-value-store) or to the [dataset](/platform/storage/dataset). The key-value store can be used to store any sort of unorganized/unrelated data in any format, while the data pushed to a dataset typically resembles a table with columns (fields) and rows (items). Each Actor's run is allocated both a default dataset and a default key-value store.
+
+When running locally, these storages are accessible through the **storage** folder within your project's root directory, while on the platform they are accessible via Apify's API.
+
+## Accepting input {#accepting-input}
+
+There are multiple ways to accept input into your project; which one you choose depends on the language your project is written in. If you are using Node.js for your repo's code, you can use the [`apify`](https://www.npmjs.com/package/apify) package. Otherwise, you can use the environment variables that Apify automatically sets up for you to write utility functions which read the Actor's input and return it.
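+
+A few of those environment variables will come up repeatedly. Here's a minimal Node.js sketch of the ones also used in this lesson's Python examples below:
+
+```js
+// '1' only when the Actor is running on the Apify platform.
+console.log(process.env.APIFY_IS_AT_HOME);
+
+// IDs of the run's default storages, used when calling the API directly.
+console.log(process.env.APIFY_DEFAULT_KEY_VALUE_STORE_ID);
+console.log(process.env.APIFY_DEFAULT_DATASET_ID);
+```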
+
+### Accepting input with the Apify SDK
+
+Since we're using Node.js, let's install the `apify` package by running the following command:
+
+```shell
+npm install apify
+```
+
+Now, let's import `Actor` from `apify` and use the `Actor.getInput()` function to grab our input.
+
+```js
+// index.js
+import { Actor } from 'apify';
+
+// We must initialize and exit the Actor. The rest of our code
+// goes in between these two.
+await Actor.init();
+
+const input = await Actor.getInput();
+console.log(input);
+
+await Actor.exit();
+```
+
+If we run this right now, we'll see **null** in our terminal - this is because we never provided any test input, which should live in the default key-value store. The `Actor.getInput()` function has detected that there is no **storage** folder and generated one for us.
+
+![Default key-value store filepath](./images/filepath.jpg)
+
+We'll now add an **INPUT.json** file within **storage/key_value_stores/default** to match what we're expecting in our code.
+
+```json
+{
+    "numbers": [5, 5, 5, 5]
+}
+```
+
+Then we can add our example project code from earlier. It will grab the input and use it to generate a solution which is logged to the console.
+
+```js
+// index.js
+import { Actor } from 'apify';
+
+await Actor.init();
+
+const { numbers } = await Actor.getInput();
+
+const addAllNumbers = (...nums) => nums.reduce((total, curr) => total + curr, 0);
+
+const solution = addAllNumbers(...numbers);
+
+console.log(solution);
+
+await Actor.exit();
+```
+
+Cool! When we run `node index.js`, we see **20**.
+
+### Accepting input without the Apify SDK
+
+Alternatively, when writing in a language other than JavaScript, we can create our own `get_input()` function which utilizes the Apify API when the Actor is running on the platform. For this example, we are using the [Apify Client](../getting_started/apify_client.md) for Python to access the API.
+
+```py
+# index.py
+from apify_client import ApifyClient
+from os import environ
+import json
+
+client = ApifyClient(token='YOUR_TOKEN')
+
+# If being run on the platform, the "APIFY_IS_AT_HOME" environment variable
+# will be "1". Otherwise, it will be undefined/None
+def is_on_apify():
+    return 'APIFY_IS_AT_HOME' in environ
+
+# Get the input
+def get_input():
+    if not is_on_apify():
+        with open('./apify_storage/key_value_stores/default/INPUT.json') as actor_input:
+            return json.load(actor_input)
+
+    kv_store = client.key_value_store(environ.get('APIFY_DEFAULT_KEY_VALUE_STORE_ID'))
+    return kv_store.get_record('INPUT')['value']
+
+def add_all_numbers(nums):
+    total = 0
+
+    for num in nums:
+        total += num
+
+    return total
+
+actor_input = get_input()['numbers']
+
+solution = add_all_numbers(actor_input)
+
+print(solution)
+```
+
+> For a better understanding of the API endpoints for reading and modifying key-value stores, check the [official API reference](/api/v2#/reference/key-value-stores).
+
+## Writing output {#writing-output}
+
+Similarly to reading input, you can write the Actor's output either by using the Apify SDK in Node.js or by manually writing a utility function to do so.
+
+### Writing output with the Apify SDK
+
+In the SDK, we can write to the dataset with the `Actor.pushData()` function. Let's go ahead and write the solution of the `addAllNumbers()` function to the default dataset using this function:
+
+```js
+// index.js
+
+// This is our example project code from earlier.
+// We will use the Apify input as its input.
+
+import { Actor } from 'apify';
+
+await Actor.init();
+
+const { numbers } = await Actor.getInput();
+
+const addAllNumbers = (...nums) => nums.reduce((total, curr) => total + curr, 0);
+
+const solution = addAllNumbers(...numbers);
+
+// And save its output to the default dataset
+await Actor.pushData({ solution });
+
+await Actor.exit();
+```
+
+### Writing output without the Apify SDK
+
+Just as with the custom `get_input()` utility function, you can write a custom `set_output()` function as well if you cannot use the Apify SDK.
+
+> You can read and write your output anywhere; however, it is standard practice to use a folder named **storage**.
+
+```py
+# index.py
+from apify_client import ApifyClient
+from os import environ
+import json
+
+client = ApifyClient(token='YOUR_TOKEN')
+
+def is_on_apify():
+    return 'APIFY_IS_AT_HOME' in environ
+
+def get_input():
+    if not is_on_apify():
+        with open('./apify_storage/key_value_stores/default/INPUT.json') as actor_input:
+            return json.load(actor_input)
+
+    kv_store = client.key_value_store(environ.get('APIFY_DEFAULT_KEY_VALUE_STORE_ID'))
+    return kv_store.get_record('INPUT')['value']
+
+# Push the solution to the dataset
+def set_output(data):
+    if not is_on_apify():
+        with open('./apify_storage/datasets/default/solution.json', 'w') as output:
+            return output.write(json.dumps(data, indent=2))
+
+    dataset = client.dataset(environ.get('APIFY_DEFAULT_DATASET_ID'))
+    # push_items accepts a dict or a list of dicts to append to the dataset
+    dataset.push_items([data])
+
+def add_all_numbers(nums):
+    total = 0
+
+    for num in nums:
+        total += num
+
+    return total
+
+actor_input = get_input()['numbers']
+
+solution = add_all_numbers(actor_input)
+
+set_output({ 'solution': solution })
+```
+
+## Testing locally {#testing-locally}
+
+Since we've changed our code quite a bit by wrapping it in the Apify SDK to accept inputs and return outputs, we should definitely test it locally before worrying about pushing it to the Apify platform.
+
+After running our script, there should be a single item in the default dataset that looks like this:
+
+```json
+{
+    "solution": 20
+}
+```
+
+## Next up {#next}
+
+That's it! We've now added all of the files and code necessary to convert our software into an Actor. In the [next lesson](./input_schema.md), we'll be learning how to generate a user interface for our Actor's input so that users don't have to provide the input in raw JSON format.
+
+
+
+---
+title: Dataset schema
+description: Learn how to generate an appealing Overview table interface to preview your Actor results in real time on the Apify platform.
+sidebar_position: 3
+slug: /deploying-your-code/dataset-schema
+---
+
+# Dataset schema
+
+**Learn how to generate an appealing Overview table interface to preview your Actor results in real time on the Apify platform.**
+
+---
+
+The dataset schema generates an interface that enables users to instantly preview their Actor results in real time.
+
+![Dataset Schema](../../../platform/actors/development/actor_definition/images/output-schema-example.png)
+
+In this quick tutorial, you will learn how to set up an output tab for your own Actor.
+
+## Implementation
+
+Firstly, create a `.actor` folder in the root of your Actor's source code. Then, create an `actor.json` file in this folder, so that you end up with `.actor/actor.json`.
+
+![.actor/actor.json](./images/actor-json-example.webp)
+
+Next, copy-paste the following template code into your `actor.json` file.
+
+```json
+{
+    "actorSpecification": 1,
+    "name": "___ENTER_ACTOR_NAME____",
+    "title": "___ENTER_ACTOR_TITLE____",
+    "version": "1.0.0",
+    "storages": {
+        "dataset": {
+            "actorSpecification": 1,
+            "views": {
+                "overview": {
+                    "title": "Overview",
+                    "transformation": {
+                        "fields": [
+                            "___EXAMPLE_NUMERIC_FIELD___",
+                            "___EXAMPLE_PICTURE_URL_FIELD___",
+                            "___EXAMPLE_LINK_URL_FIELD___",
+                            "___EXAMPLE_TEXT_FIELD___",
+                            "___EXAMPLE_BOOLEAN_FIELD___"
+                        ]
+                    },
+                    "display": {
+                        "component": "table",
+                        "properties": {
+                            "___EXAMPLE_NUMERIC_FIELD___": {
+                                "label": "ID",
+                                "format": "number"
+                            },
+                            "___EXAMPLE_PICTURE_URL_FIELD___": {
+                                "format": "image"
+                            },
+                            "___EXAMPLE_LINK_URL_FIELD___": {
+                                "label": "Clickable link",
+                                "format": "link"
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+```
+
+To configure the dataset schema, replace the fields in the template with the fields relevant to your Actor.
+
+For reference, you can use the [Zappos Scraper source code](https://github.com/PerVillalva/zappos-scraper-actor/blob/main/.actor/actor.json) as an example of how the final implementation of the output tab should look in a live Actor.
+
+```json
+{
+    "actorSpecification": 1,
+    "name": "zappos-scraper",
+    "title": "Zappos Scraper",
+    "description": "",
+    "version": "1.0.0",
+    "storages": {
+        "dataset": {
+            "actorSpecification": 1,
+            "title": "Zappos.com Dataset",
+            "description": "",
+            "views": {
+                "products": {
+                    "title": "Overview",
+                    "description": "It can take about one minute until the first results are available.",
+                    "transformation": {
+                        "fields": [
+                            "imgUrl",
+                            "brand",
+                            "name",
+                            "SKU",
+                            "inStock",
+                            "onSale",
+                            "price",
+                            "url"
+                        ]
+                    },
+                    "display": {
+                        "component": "table",
+                        "properties": {
+                            "imgUrl": {
+                                "label": "Product image",
+                                "format": "image"
+                            },
+                            "url": {
+                                "label": "Link",
+                                "format": "link"
+                            },
+                            "brand": {
+                                "format": "text"
+                            },
+                            "name": {
+                                "format": "text"
+                            },
+                            "SKU": {
+                                "format": "text"
+                            },
+                            "inStock": {
+                                "format": "boolean"
+                            },
+                            "onSale": {
+                                "format": "boolean"
+                            },
+                            "price": {
+                                "format": "text"
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+```
+
+Note that the fields specified in the dataset schema should match the object keys of your resulting dataset.
+
+Also, if your desired label has the same name as the defined object key, then you don't need to specify a label name. The schema will, by default, show a capitalized version of the key and even split camel case into separate words and capitalize all of them.
+
+The matching object for the Zappos Scraper shown in the example above will look something like this:
+
+```js
+const results = {
+    url: request.loadedUrl,
+    imgUrl: $('#stage button[data-media="image"] img[itemprop="image"]').attr('src'),
+    brand: $('span[itemprop="brand"]').text().trim(),
+    name: $('meta[itemprop="name"]').attr('content'),
+    SKU: $('*[itemprop~="sku"]').text().trim(),
+    inStock: !request.url.includes('oosRedirected=true'),
+    onSale: !$('div[itemprop="offers"]').text().includes('OFF'),
+    price: $('span[itemprop="price"]').text(),
+};
+```
+
+## Final result {#final-result}
+
+Great! Now that everything is set up, it's time to run the Actor and admire its brand-new output tab.
+
+> Need some extra guidance? Visit the [dataset schema documentation](/platform/actors/development/actor-definition/dataset-schema) for more detailed information about how to implement this feature.
+
+A few seconds after running the Actor, you should see its results displayed in the `Overview` table.
+
+![Output table overview](./images/output-schema-final-example.webp)
+
+## Next up {#next}
+
+In the [next lesson](./docker_file.md), we'll learn about a very important file that is required for our project to run on the Apify platform - the Dockerfile.
+
+
+
+---
+title: V - Handling migrations
+description: Get real-world experience of maintaining a stateful object stored in memory, which will be persisted through migrations and even graceful aborts.
+sidebar_position: 5
+slug: /expert-scraping-with-apify/solutions/handling-migrations
+---
+
+# Handling migrations {#handling-migrations}
+
+**Get real-world experience of maintaining a stateful object stored in memory, which will be persisted through migrations and even graceful aborts.**
+
+---
+
+Let's first head into our **demo-actor** and create a new file named **asinTracker.js** in the **src** folder. Within this file, we are going to build a utility class which will allow us to store, modify, persist, and log our tracked ASIN data.
+
+Here's the skeleton of our class:
+
+```js
+// asinTracker.js
+class ASINTracker {
+    constructor() {
+        this.state = {};
+
+        // Log the state to the console every ten
+        // seconds
+        setInterval(() => console.log(this.state), 10000);
+    }
+
+    // Add an offer to the ASIN's offer count
+    // If the ASIN doesn't exist yet, initialize its count to 0
+    incrementASIN(asin) {
+        if (this.state[asin] === undefined) {
+            this.state[asin] = 0;
+            return;
+        }
+
+        this.state[asin] += 1;
+    }
+}
+
+// It is only a utility class, so we will immediately
+// create an instance of it and export that. We only
+// need one instance for our use case.
+export default new ASINTracker();
+```
+
+Multiple techniques exist for storing data in memory; however, this is the most modular way, as all state-persistence and modification logic will be held in this file.
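+
+To make that initialize-to-zero behavior concrete, here's a quick sketch of how the singleton behaves (the ASIN below is a made-up placeholder):
+
+```js
+import tracker from './asinTracker';
+
+// The first call for an ASIN only initializes its count to 0;
+// each subsequent call adds 1.
+tracker.incrementASIN('B000000000'); // state -> { B000000000: 0 }
+tracker.incrementASIN('B000000000'); // state -> { B000000000: 1 }
+tracker.incrementASIN('B000000000'); // state -> { B000000000: 2 }
+```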
+
+Here is our updated **routes.js** file, which is now utilizing this utility class to track the number of offers for each product ASIN:
+
+```js
+// routes.js
+import { createCheerioRouter } from '@crawlee/cheerio';
+import { BASE_URL, OFFERS_URL, labels } from './constants';
+import tracker from './asinTracker';
+import { dataset } from './main.js';
+
+export const router = createCheerioRouter();
+
+router.addHandler(labels.START, async ({ $, crawler, request }) => {
+    const { keyword } = request.userData;
+
+    const products = $('div > div[data-asin]:not([data-asin=""])');
+
+    for (const product of products) {
+        const element = $(product);
+        const titleElement = $(element.find('.a-text-normal[href]'));
+
+        const url = `${BASE_URL}${titleElement.attr('href')}`;
+
+        // For each product, add it to the ASIN tracker
+        // and initialize its collected offers count to 0
+        tracker.incrementASIN(element.attr('data-asin'));
+
+        await crawler.addRequests([{
+            url,
+            label: labels.PRODUCT,
+            userData: {
+                data: {
+                    title: titleElement.first().text().trim(),
+                    asin: element.attr('data-asin'),
+                    itemUrl: url,
+                    keyword,
+                },
+            },
+        }]);
+    }
+});
+
+router.addHandler(labels.PRODUCT, async ({ $, crawler, request }) => {
+    const { data } = request.userData;
+
+    const element = $('div#productDescription');
+
+    await crawler.addRequests([{
+        url: OFFERS_URL(data.asin),
+        label: labels.OFFERS,
+        userData: {
+            data: {
+                ...data,
+                description: element.text().trim(),
+            },
+        },
+    }]);
+});
+
+router.addHandler(labels.OFFERS, async ({ $, request }) => {
+    const { data } = request.userData;
+
+    const { asin } = data;
+
+    for (const offer of $('#aod-offer')) {
+        // For each offer, add 1 to the ASIN's
+        // offer count
+        tracker.incrementASIN(asin);
+
+        const element = $(offer);
+
+        await dataset.pushData({
+            ...data,
+            sellerName: element.find('div[id*="soldBy"] a[aria-label]').text().trim(),
+            offer: element.find('.a-price .a-offscreen').text().trim(),
+        });
+    }
+});
+```
+
+## Persisting state {#persisting-state}
+
+The **persistState** event is automatically fired (by default) every 60 seconds by the Apify SDK while the Actor is running, and is also fired when the **migrating** event occurs.
+
+In order to persist our ASIN tracker object, let's use the `Actor.on` function to listen for the **persistState** event and store it in the key-value store each time it is emitted.
+
+```js
+// asinTracker.js
+import { Actor } from 'apify';
+// We've updated our constants.js file to include the name
+// of this new key in the key-value store
+import { ASIN_TRACKER } from './constants';
+
+class ASINTracker {
+    constructor() {
+        this.state = {};
+
+        Actor.on('persistState', async () => {
+            await Actor.setValue(ASIN_TRACKER, this.state);
+        });
+
+        setInterval(() => console.log(this.state), 10000);
+    }
+
+    incrementASIN(asin) {
+        if (this.state[asin] === undefined) {
+            this.state[asin] = 0;
+            return;
+        }
+
+        this.state[asin] += 1;
+    }
+}
+
+export default new ASINTracker();
+```
+
+## Handling resurrections {#handling-resurrections}
+
+Great! Now our state will be persisted every 60 seconds in the key-value store. However, we're not done. Let's say that the Actor migrates and is resurrected. We never actually update the `state` variable of our `ASINTracker` class with the state stored in the key-value store, so as our code currently stands, we still don't support state-persistence on migrations.
+
+In order to fix this, let's create a method called `initialize` which will be called at the very beginning of the Actor's run, and which will check the key-value store for a previous state under the key **ASIN-TRACKER**. If a previous state does live there, then it will update the class's `state` variable with the value read from the key-value store:
+
+```js
+// asinTracker.js
+import { Actor } from 'apify';
+import { ASIN_TRACKER } from './constants';
+
+class ASINTracker {
+    constructor() {
+        this.state = {};
+
+        Actor.on('persistState', async () => {
+            await Actor.setValue(ASIN_TRACKER, this.state);
+        });
+
+        setInterval(() => console.log(this.state), 10000);
+    }
+
+    async initialize() {
+        // Read the data from the key-value store. If it
+        // doesn't exist, it will be undefined
+        const data = await Actor.getValue(ASIN_TRACKER);
+
+        // If the data does exist, replace the current state
+        // (initialized as an empty object) with the data
+        if (data) this.state = data;
+    }
+
+    incrementASIN(asin) {
+        if (this.state[asin] === undefined) {
+            this.state[asin] = 0;
+            return;
+        }
+
+        this.state[asin] += 1;
+    }
+}
+
+export default new ASINTracker();
+```
+
+We'll now call this method at the top level of the **main.js** file to ensure it runs as soon as the Actor starts up (right after `Actor.init()`):
+
+```js
+// main.js
+
+// ...
+import tracker from './asinTracker';
+
+// The Actor.init() function should be executed before
+// the tracker's initialization
+await Actor.init();
+
+await tracker.initialize();
+// ...
+```
+
+That's everything! Now, even if the Actor migrates (or is gracefully aborted and then resurrected), this `state` object will always be persisted.
+
+## Quiz answers 📝 {#quiz-answers}
+
+**Q: Actors have an option in the Settings tab to Restart on error. Would you use this feature for regular Actors? When would you use this feature?**
+
+**A:** It's not advisable to use this option by default. If an Actor fails, there's usually a reason, which needs to be thought through first - meaning that the failure edge case should be handled properly when the Actor is resurrected, and the state should be persisted beforehand.
+
+**Q: Migrations happen randomly, but by [aborting gracefully](/platform/actors/running/runs-and-builds#aborting-runs), you can simulate a similar situation. Try this out on the platform and observe what happens. What changes occur, and what remains the same for the restarted Actor's run?**
+
+**A:** After aborting or throwing an error mid-process, the run picks up right where it left off upon resurrection.
+
+**Q: Why don't you (usually) need to add any special migration handling code for a standard crawling/scraping Actor? Are there any features in Crawlee or Apify SDK that handle this under the hood?**
+
+**A:** Because the Apify SDK handles all of the migration handling code for us. If you want to add custom migration-handling code, you can use `Actor.events` to listen for the `migrating` or `persistState` events and save the current state in the key-value store (or elsewhere).
+
+**Q: How can you intercept the migration event? How much time do you have after this event happens and before the Actor migrates?**
+
+**A:** By using the `Actor.on` function. You have a maximum of a few seconds before shutdown after the `migrating` event has been fired.
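+
+For illustration, here's a minimal sketch of intercepting the event - the `state` object and the **STATE** key below are placeholders for whatever your Actor actually tracks:
+
+```js
+import { Actor } from 'apify';
+
+await Actor.init();
+
+const state = { itemsScraped: 0 }; // hypothetical run state
+
+// Persist the state the moment the platform announces a migration -
+// only a few seconds remain before the process is shut down.
+Actor.on('migrating', async () => {
+    await Actor.setValue('STATE', state);
+});
+```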
+
+**Q: When would you persist data to the default key-value store instead of to a named key-value store?**
+
+**A:** Persisting data to the default key-value store would help when handling an Actor's run state or when storing metadata about the run (such as results, miscellaneous files, or logs). Using a named key-value store allows you to persist data at the account level and work with it across multiple Actor runs.
+
+## Wrap up {#wrap-up}
+
+In this activity, we learned how to persist custom values on an interval, as well as after Actor migrations, by using the `persistState` event and the key-value store. With this knowledge, you can safely increase your Actor's performance by storing data in variables and then pushing it to the dataset periodically or at the end of the Actor's run, as opposed to pushing data immediately after it's been collected.
+
+One important thing to note is that this workflow can be used to replace the usage of `userData` to pass data between requests, as it allows for the creation of a "global store" which all requests have access to at any time.
+
+
+
+---
+title: Solutions
+description: View all of the solutions for all of the activities and tasks of this course. Please try to complete each task on your own before reading the solution!
+sidebar_position: 6.7
+slug: /expert-scraping-with-apify/solutions
+---
+
+# Solutions
+
+**View all of the solutions for all of the activities and tasks of this course. Please try to complete each task on your own before reading the solution!**
+
+---
+
+The final section of each lesson in this course will be a task which you, as the course-taker, are expected to complete before moving on to the next lesson. Completing and understanding each task plays an important role in your ability to continue through the course.
+
+If you ever get stuck, or if you feel like your solution could be more optimal, you can always refer to the **Solutions** section of the course. Each solution will have all of the code and explanations needed to understand it.
+
+**Please** try to do each task **on your own** prior to checking out the solution!
+
+
+
+---
+title: I - Integrating webhooks
+description: Learn how to integrate webhooks into your Actors. Webhooks are a super powerful tool, and can be used to do almost anything!
+sidebar_position: 1
+slug: /expert-scraping-with-apify/solutions/integrating-webhooks
+---
+
+# Integrating webhooks {#integrating-webhooks}
+
+**Learn how to integrate webhooks into your Actors. Webhooks are a super powerful tool, and can be used to do almost anything!**
+
+---
+
+In this lesson, we'll be writing a new Actor and integrating it with our beloved Amazon scraping Actor. First, we'll navigate to the same directory where our **demo-actor** folder lives, and run `apify create filter-actor` _(once again, you can name the Actor whatever you want, but for this lesson, we'll be calling the new Actor **filter-actor**)_. When prompted for which type of boilerplate to start out with, select **Empty**.
+
+![Selecting an empty template to start with](./images/select-empty.jpg)
+
+Cool! Now, we're ready to get started.
+
+## Building the new Actor {#building-the-new-actor}
+
+First of all, we should clear out any of the boilerplate code within **main.js** to get a clean slate:
+
+```js
+// main.js
+import { Actor } from 'apify';
+
+await Actor.init();
+
+// ...
+
+await Actor.exit();
+```
+
+We'll be passing the ID of the Amazon Actor's default dataset along to the new Actor, so we can expect that as an input:
+
+```js
+const { datasetId } = await Actor.getInput();
+const dataset = await Actor.openDataset(datasetId);
+// ...
+```
+
+> Tip: You will need to use the `forceCloud` option - `Actor.openDataset(datasetId, { forceCloud: true });` - to open a dataset from platform storage while running the Actor locally.
+
+Next, we'll grab hold of the dataset's items with the `dataset.getData()` function:
+
+```js
+const { items } = await dataset.getData();
+```
+
+While several methods can achieve the goal output of this Actor, using [`Array.reduce()`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/reduce) is the most concise approach:
+
+```js
+const filtered = items.reduce((acc, curr) => {
+    // Grab the price of the item matching our current
+    // item's ASIN in the map. If it doesn't exist, set
+    // "prevPrice" to null
+    const prevPrice = acc?.[curr.asin] ? +acc[curr.asin].offer.slice(1) : null;
+
+    // Grab the price of our current offer
+    const price = +curr.offer.slice(1);
+
+    // If the item doesn't yet exist in the map, add it.
+    // Or, if the current offer's price is less than the
+    // saved one, replace the saved one
+    if (!acc[curr.asin] || prevPrice > price) acc[curr.asin] = curr;
+
+    // Return the map
+    return acc;
+}, {});
+```
+
+The result should be an array, so we can take the map we just created and push an array of its values to the Actor's default dataset:
+
+```js
+await Actor.pushData(Object.values(filtered));
+```
+
+Our final code looks like this:
+
+```js
+import { Actor } from 'apify';
+
+await Actor.init();
+
+const { datasetId } = await Actor.getInput();
+const dataset = await Actor.openDataset(datasetId);
+
+const { items } = await dataset.getData();
+
+const filtered = items.reduce((acc, curr) => {
+    const prevPrice = acc?.[curr.asin] ? +acc[curr.asin].offer.slice(1) : null;
+    const price = +curr.offer.slice(1);
+
+    if (!acc[curr.asin] || prevPrice > price) acc[curr.asin] = curr;
+
+    return acc;
+}, {});
+
+await Actor.pushData(Object.values(filtered));
+
+await Actor.exit();
+```
+
+Cool! But **wait**, don't forget to configure the **INPUT_SCHEMA.json** file as well! This step isn't strictly necessary, as we'll be calling the Actor through Apify's API within a webhook, but it's still good to get into the habit of writing quality input schemas that describe the input values your Actors are expecting.
+
+```json
+{
+    "title": "Amazon Filter Actor",
+    "type": "object",
+    "schemaVersion": 1,
+    "properties": {
+        "datasetId": {
+            "title": "Dataset ID",
+            "type": "string",
+            "description": "Enter the ID of the dataset.",
+            "editor": "textfield"
+        }
+    },
+    "required": ["datasetId"]
+}
+```
+
+Now we're done, and we can push it up to the Apify platform with the `apify push` command.
+
+## Setting up the webhook {#setting-up-the-webhook}
+
+Since we'll be calling the Actor via the [Apify API](/academy/api/run-actor-and-retrieve-data-via-api), we'll need to grab hold of the ID of the Actor we just created and pushed to the platform. The ID is always accessible through the **Settings** page of the Actor.
+
+![Actor ID in Actor settings](./images/actor-settings.jpg)
+
+With this `actorId` and our `token`, which is retrievable through **Settings > Integrations** on the Apify Console, we can construct a link which will call the Actor:
+
+```text
+https://api.apify.com/v2/acts/Yk1bieximsduYDydP/runs?token=YOUR_TOKEN_HERE
+```
+
+We can also use our username and the name of the Actor like this:
+
+```text
+https://api.apify.com/v2/acts/USERNAME~filter-actor/runs?token=YOUR_TOKEN_HERE
+```
+
+Whichever one you choose is entirely up to your preference.
+
+Next, within the Amazon scraping Actor, we will click the **Integrations** tab and choose **Webhook**, then fill out the details to look like this:
+
+![Configuring a webhook](./images/adding-webhook.jpg)
+
+We have chosen to run the webhook once the Actor has succeeded, which means that its default dataset will surely be populated. Since the filtering Actor is expecting the default dataset ID of the Amazon Actor, we use the `resource` variable to grab hold of the `defaultDatasetId`.
+
+Click **Save**, then run the Amazon **demo-actor** again.
+
+## Making sure it worked {#checking-the-webhook}
+
+If everything worked, then at the end of the **demo-actor**'s run, we should see this within the **Integrations** tab:
+
+![Webhook succeeded](./images/webhook-succeeded.png)
+
+Additionally, we should be able to see that our **filter-actor** was run, and have access to its dataset:
+
+![Dataset preview](./images/dataset-preview.png)
+
+## Quiz answers 📝 {#quiz-answers}
+
+**Q: How do you allocate more CPU for an Actor's run?**
+
+**A:** On the platform, more memory can be allocated in the Actor's input configuration, and the default allocated CPU can be changed in the Actor's **Settings** tab. When running locally, you can use the **APIFY_MEMORY_MBYTES** environment variable to set the allocated memory, which in turn determines the CPU share - 4 GB is equal to 1 CPU core on the Apify platform.
+
+**Q: Within itself, can you get the exact time that an Actor was started?**
+
+**A:** Yes. The time the Actor was started can be retrieved through the `startedAt` property from the `Actor.getEnv()` function, or directly from `process.env.APIFY_STARTED_AT`.
+
+**Q: What are the types of default storages connected to an Actor's run?**
+
+**A:** Every Actor's run is given a default key-value store and a default dataset. The default key-value store holds the `INPUT` and `OUTPUT` keys by default. A default request queue is also created for the run.
+
+**Q: Can you change the allocated memory of an Actor while it's running?**
+
+**A:** Not while it's running. You'd need to abort the run and start a new one. However, there is an option to gracefully ("soft") abort an Actor and then resurrect the run with a different memory configuration.
+
+**Q: How can you run an Actor with Puppeteer on the Apify platform with headless mode set to `false`?**
+
+**A:** This can be done by using the `actor-node-puppeteer-chrome` Docker image and making sure that `launchContext.launchOptions.headless` in `PuppeteerCrawlerOptions` is set to `false`.
+
+## Wrap up {#wrap-up}
+
+See that?! Integrating webhooks is a piece of cake on the Apify platform! You'll soon discover that the platform abstracts away a lot of complex things and allows you to focus on what's most important - developing and releasing Actors.
+
+
+
+---
+title: II - Managing source
+description: View in-depth answers for all three of the quiz questions that were provided in the corresponding lesson about managing source code.
+sidebar_position: 2
+slug: /expert-scraping-with-apify/solutions/managing-source
+---
+
+# Managing source
+
+**View in-depth answers for all three of the quiz questions that were provided in the corresponding lesson about managing source code.**
+
+---
+
+In the lesson corresponding to this solution, we discussed an extremely important topic: source code management. Though we solved the task right in the lesson, we've still included the quiz answers here.
+
+## Quiz answers {#quiz-answers}
+
+**Q: Do you have to rebuild an Actor each time the source code is changed?**
+
+**A:** Yes. It needs to be built into an image, saved in a registry, and then run in a container.
+
+**Q: In Git, what is the difference between pushing changes and making a pull request?**
+
+**A:** Pushing uploads the commits from your local branch to the corresponding remote branch. Changes are usually pushed to a branch parallel to the one you eventually want to merge them into.
+
+When creating a pull request, the code is meant to be reviewed, or at least pass all the test suites, before being merged into the target branch.
+
+**Q: Based on your knowledge and experience, is the `apify push` command worth using (in your opinion)?**
+
+**A:** The `apify push` command can sometimes be useful when testing ideas; however, it is generally better to use the GitHub integration rather than pushing directly to the platform.
+
+
+
+---
+title: VI - Rotating proxies/sessions
+description: Learn firsthand how to rotate proxies and sessions in order to avoid the majority of the most common anti-scraping protections.
+sidebar_position: 6
+slug: /expert-scraping-with-apify/solutions/rotating-proxies
+---
+
+# Rotating proxies/sessions {#rotating-proxy-sessions}
+
+**Learn firsthand how to rotate proxies and sessions in order to avoid the majority of the most common anti-scraping protections.**
+
+---
+
+If you take a look at our current code for the Amazon scraping Actor, you might notice this snippet:
+
+```js
+const proxyConfiguration = await Actor.createProxyConfiguration({
+    groups: ['RESIDENTIAL'],
+});
+```
+
+We didn't provide much explanation for this initially, as it was not directly relevant to the lesson at hand. When you [create a **ProxyConfiguration**](../../../webscraping/anti_scraping/mitigation/using_proxies.md) and pass it to a crawler, Crawlee will make the crawler automatically rotate through the proxies. This entire time, we've been using the **RESIDENTIAL** proxy group to avoid being blocked by Amazon.
+
+> Go ahead and try commenting out the proxy configuration code and then running the scraper. What happens?
+
+In order to rotate sessions, we must utilize the [**SessionPool**](https://crawlee.dev/api/core/class/SessionPool), which we've already been using by setting the **useSessionPool** option in our crawler's configuration to **true**. The SessionPool advances the concept of proxy rotation by tying proxies to user-like sessions and rotating those instead. In addition to a proxy, each user-like session has cookies attached to it (and potentially a browser fingerprint as well).
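+
+To see how sessions tie to proxies, here's a small sketch using Crawlee's `proxyConfiguration.newUrl()` function (the session IDs below are arbitrary placeholders):
+
+```js
+import { Actor } from 'apify';
+
+await Actor.init();
+
+const proxyConfiguration = await Actor.createProxyConfiguration({
+    groups: ['RESIDENTIAL'],
+});
+
+// The same session ID resolves to the same proxy URL, so a user-like
+// session keeps its IP address until the session is retired.
+console.log(await proxyConfiguration.newUrl('session_1'));
+console.log(await proxyConfiguration.newUrl('session_1')); // identical URL
+console.log(await proxyConfiguration.newUrl('session_2')); // a different one
+
+await Actor.exit();
+```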
+
+## Configuring SessionPool {#configuring-session-pool}
+
+Let's go ahead and add a **sessionPoolOptions** key to our crawler's configuration so that we can modify the default settings:
+
+```js
+const crawler = new CheerioCrawler({
+    requestList,
+    requestQueue,
+    proxyConfiguration,
+    useSessionPool: true,
+    // This is where our session pool
+    // configuration lives
+    sessionPoolOptions: {
+        // We can add options for each
+        // session created by the session
+        // pool here
+        sessionOptions: {
+
+        },
+    },
+    maxConcurrency: 50,
+    // ...
+});
+```
+
+Now, we'll use the **maxUsageCount** key to force each session to be thrown away after 5 uses and **maxErrorScore** to trash a session once it receives an error.
+
+```js
+const crawler = new CheerioCrawler({
+    requestList,
+    requestQueue,
+    proxyConfiguration,
+    useSessionPool: true,
+    sessionPoolOptions: {
+        sessionOptions: {
+            maxUsageCount: 5,
+            maxErrorScore: 1,
+        },
+    },
+    maxConcurrency: 50,
+    // ...
+});
+```
+
+And that's it! We've successfully configured the session pool to match the task's requirements.
+
+## Limiting proxy location {#limiting-proxy-location}
+
+The final requirement was to use proxies only from the US. Back in our **ProxyConfiguration**, we need to add the **countryCode** key and set it to **US**:
+
+```js
+const proxyConfiguration = await Actor.createProxyConfiguration({
+    groups: ['RESIDENTIAL'],
+    countryCode: 'US',
+});
+```
+
+## Quiz answers {#quiz-answers}
+
+**Q: What are the different types of proxies that Apify proxy offers? What are the main differences between them?**
+
+**A:** Datacenter, residential, and Google SERP proxies, each with sub-groups. Datacenter proxies are fast and cheap but have a higher chance of being blocked on certain sites in comparison to residential proxies, which are IP addresses located in homes and offices around the world. Google SERP proxies are specifically for Google.
+
+**Q: Which proxy groups do users get on the free plan? Can they access the proxy from their computer?**
+
+**A:** All users have access to the **BUYPROXIES94952**, **GOOGLE_SERP**, and **RESIDENTIAL** groups. Free users cannot access the proxy from outside the Apify platform (paying users can).
+
+**Q: How can you prevent an error from occurring if one of the proxy groups that a user has is removed? What are the best practices for these scenarios?**
+
+**A:** By making the proxy the scraper uses configurable by the user through the Actor's input. That way, they can switch proxies if the Actor stops working due to proxy-related issues. It can also be done by using the **AUTO** proxy instead of specific groups.
+
+**Q: Does it make sense to rotate proxies when you are logged into a website?**
+
+**A:** No, because most websites tie an IP address to a session. If you start making requests with cookies used with a different IP address, the website might see it as unusual activity and either block the scraper or automatically log out.
+
+**Q: Construct a proxy URL that will select proxies only from the US.**
+
+**A:** `http://country-US:@proxy.apify.com:8000`
+
+**Q: What do you need to do to rotate a proxy (one proxy usually has one IP)? How does this differ for CheerioCrawler and PuppeteerCrawler?**
+
+**A:** Making a new request with the proxy endpoint above will automatically rotate it. Sessions can also be used to automatically do this. While proxy rotation is fairly straightforward for Cheerio, it's more complex in Puppeteer, as you have to retire the browser each time a new proxy is rotated in.
The SessionPool will automatically retire a browser when a session is retired. Sessions can be manually retired with `session.retire()`.
+
+**Q: Name a few different ways a website can prevent you from scraping it.**
+
+**A:** IP detection and rate-limiting, browser/fingerprint detection, user behavior tracking, etc.
+
+## Wrap up {#wrap-up}
+
+In this solution, you learned one of the most important concepts in web scraping - proxy/session rotation. With your newfound knowledge of the SessionPool, you'll be (practically) unstoppable!
+
+
+
+---
+title: VII - Saving run stats
+description: Implement the saving of general statistics about an Actor's run, as well as adding request-specific statistics to dataset items.
+sidebar_position: 7
+slug: /expert-scraping-with-apify/solutions/saving-stats
+---
+
+# Saving run stats {#saving-stats}
+
+**Implement the saving of general statistics about an Actor's run, as well as adding request-specific statistics to dataset items.**
+
+---
+
+The code in this solution will be similar to what we already did in the **Handling migrations** solution; however, we'll be storing and logging different data. First, let's create a new file called **Stats.js** and write a utility class for storing our run stats:
+
+```js
+import { Actor } from 'apify';
+
+class Stats {
+    constructor() {
+        this.state = {
+            errors: {},
+            totalSaved: 0,
+        };
+    }
+
+    async initialize() {
+        const data = await Actor.getValue('STATS');
+
+        if (data) this.state = data;
+
+        Actor.on('persistState', async () => {
+            await Actor.setValue('STATS', this.state);
+        });
+
+        setInterval(() => console.log(this.state), 10000);
+    }
+
+    addError(url, errorMessage) {
+        if (!this.state.errors?.[url]) this.state.errors[url] = [];
+        this.state.errors[url].push(errorMessage);
+    }
+
+    success() {
+        this.state.totalSaved += 1;
+    }
+}
+
+export default new Stats();
+```
+
+Cool, very similar to the **ASINTracker** class we wrote earlier. We'll now import **Stats** into our **main.js** file and initialize it along with the ASIN tracker:
+
+```js
+// ...
+import Stats from './Stats.js';
+
+await Actor.init();
+await asinTracker.initialize();
+await Stats.initialize();
+// ...
+```
+
+## Tracking errors {#tracking-errors}
+
+In order to keep track of errors, we must write a new function within the crawler's configuration called **errorHandler**. This function receives the crawling context (including the **Request** object, as well as information about the session and proxy which were used for the request), along with the **Error** which occurred.
+
+```js
+const crawler = new CheerioCrawler({
+    proxyConfiguration,
+    useSessionPool: true,
+    sessionPoolOptions: {
+        persistStateKey: 'AMAZON-SESSIONS',
+        sessionOptions: {
+            maxUsageCount: 5,
+            maxErrorScore: 1,
+        },
+    },
+    maxConcurrency: 50,
+    requestHandler: router,
+    // Handle all failed requests
+    errorHandler: async ({ request }, error) => {
+        // Add an error for this url to our error tracker
+        Stats.addError(request.url, error?.message);
+    },
+});
+```
+
+## Tracking total saved {#tracking-total-saved}
+
+Now, we'll increment our **totalSaved** count for every offer added to the dataset.
+
+```js
+router.addHandler(labels.OFFERS, async ({ $, request }) => {
+    const { data } = request.userData;
+
+    const { asin } = data;
+
+    for (const offer of $('#aod-offer')) {
+        tracker.incrementASIN(asin);
+        // Add 1 to totalSaved for every offer
+        Stats.success();
+
+        const element = $(offer);
+
+        await dataset.pushData({
+            ...data,
+            sellerName: element.find('div[id*="soldBy"] a[aria-label]').text().trim(),
+            offer: element.find('.a-price .a-offscreen').text().trim(),
+        });
+    }
+});
+```
+
+## Saving stats with dataset items {#saving-stats-with-dataset-items}
+
+Still in the **OFFERS** handler, we need to add a few extra keys to the items which are pushed to the dataset. Luckily, all of the data required by the task is accessible in the context object.
+
+```js
+router.addHandler(labels.OFFERS, async ({ $, request, crawler }) => {
+    const { data } = request.userData;
+
+    const { asin } = data;
+
+    for (const offer of $('#aod-offer')) {
+        tracker.incrementASIN(asin);
+        // Add 1 to totalSaved for every offer
+        Stats.success();
+
+        const element = $(offer);
+
+        await dataset.pushData({
+            ...data,
+            sellerName: element.find('div[id*="soldBy"] a[aria-label]').text().trim(),
+            offer: element.find('.a-price .a-offscreen').text().trim(),
+            // Store the handledAt date or current date if that is undefined
+            dateHandled: request.handledAt || new Date().toISOString(),
+            // Access the number of retries on the request object
+            numberOfRetries: request.retryCount,
+            // Grab the number of pending requests from the crawler's request queue
+            currentPendingRequests: (await crawler.requestQueue.getInfo()).pendingRequestCount,
+        });
+    }
+});
+```
+
+## Quiz answers {#quiz-answers}
+
+**Q: Why might you want to store statistics about an Actor's run (or a specific request)?**
+
+**A:** If certain types of requests are error-prone, you might want to save stats about the run to look at them later to either eliminate or better handle the errors. Things like **dateHandled** can be generally useful information.
+
+**Q: In our Amazon scraper, we are trying to store the number of retries of a request once its data is pushed to the dataset. Where would you get this information? Where would you store it?**
+
+**A:** This information is available directly on the request object under the property **retryCount**, and can be stored on each dataset item as it is pushed.
+
+**Q: What is the difference between the `failedRequestHandler` and `errorHandler`?**
+
+**A:** `failedRequestHandler` runs after a request has failed and reached its `maxRetries` count. `errorHandler` runs on every failure and retry.
+
+
+
+---
+title: IV - Using the Apify API & JavaScript client
+description: Learn how to interact with the Apify API directly through the well-documented RESTful routes, or by using the proprietary Apify JavaScript client.
+sidebar_position: 4
+slug: /expert-scraping-with-apify/solutions/using-api-and-client
+---
+
+# Using the Apify API & JavaScript client {#using-api-and-client}
+
+**Learn how to interact with the Apify API directly through the well-documented RESTful routes, or by using the proprietary Apify JavaScript client.**
+
+---
+
+Since we need to create another Actor, we'll once again use the `apify create` command and start from an empty template.
+
+![Selecting an empty template to start with](./images/select-empty.jpg)
+
+This time, let's call our project **actor-caller**.
+
+Let's also set up some boilerplate, grabbing our inputs and creating a constant variable for the task:
+
+```js
+import { Actor } from 'apify';
+import axios from 'axios';
+
+await Actor.init();
+
+const { useClient, memory, fields, maxItems } = await Actor.getInput();
+
+const TASK = 'YOUR_USERNAME~demo-actor-task';
+
+// our future code will go here
+
+await Actor.exit();
+```
+
+## Calling a task via JavaScript client {#calling-a-task-via-client}
+
+When using the `apify-client` package, you can create a new client instance by using `new ApifyClient()`. Within the Apify SDK, however, it is not necessary to even install the `apify-client` package, as the `Actor.newClient()` function is available for use.
+
+We'll start by creating a function called `withClient()` and creating a new client, then calling the task (the task input stays empty here, while the memory is passed as a run option):
+
+```js
+const withClient = async () => {
+    const client = Actor.newClient();
+    const task = client.task(TASK);
+
+    const { id } = await task.call(undefined, { memory });
+};
+```
+
+After the task has run, we'll grab hold of its dataset, then attempt to download the items, plugging in our `maxItems` and `fields` inputs. Then, once the data has been downloaded, we'll push it to the default key-value store under a key named **OUTPUT.csv**.
+
+```js
+const withClient = async () => {
+    const client = Actor.newClient();
+    const task = client.task(TASK);
+
+    const { id } = await task.call(undefined, { memory });
+
+    const dataset = client.run(id).dataset();
+
+    const items = await dataset.downloadItems('csv', {
+        limit: maxItems,
+        fields,
+    });
+
+    // If the content type is anything other than JSON, it must
+    // be specified within the third options parameter
+    return Actor.setValue('OUTPUT', items, { contentType: 'text/csv' });
+};
+```
+
+## Calling a task via API {#calling-a-task-via-api}
+
+First, we'll create a function named `withAPI` (right under the `withClient()` function) and declare a new variable which represents the API endpoint to run our task:
+
+```js
+const withAPI = async () => {
+    const uri = `https://api.apify.com/v2/actor-tasks/${TASK}/run-sync-get-dataset-items?`;
+};
+```
+
+To add the query parameters to the URL, we could create a super long string literal, plugging in all of our input values; however, there is a much better way: [`URLSearchParams`](https://nodejs.org/api/url.html#new-urlsearchparams). By using `URLSearchParams`, we can add the query parameters in an object:
+
+```js
+const withAPI = async () => {
+    const uri = `https://api.apify.com/v2/actor-tasks/${TASK}/run-sync-get-dataset-items?`;
+    const url = new URL(uri);
+
+    url.search = new URLSearchParams({
+        memory,
+        format: 'csv',
+        limit: maxItems,
+        fields: fields.join(','),
+        token: process.env.APIFY_TOKEN,
+    });
+};
+```
+
+Finally, let's make a `POST` request to our endpoint. You can use any library you want, but in this example, we'll use [`axios`](https://www.npmjs.com/package/axios). Don't forget to run `npm install axios` if you're going to use this package too!
+
+```js
+const withAPI = async () => {
+    const uri = `https://api.apify.com/v2/actor-tasks/${TASK}/run-sync-get-dataset-items?`;
+    const url = new URL(uri);
+
+    url.search = new URLSearchParams({
+        memory,
+        format: 'csv',
+        limit: maxItems,
+        fields: fields.join(','),
+        token: process.env.APIFY_TOKEN,
+    });
+
+    const { data } = await axios.post(url.toString());
+
+    return Actor.setValue('OUTPUT', data, { contentType: 'text/csv' });
+};
+```
+
+## Finalizing the Actor {#finalizing-the-actor}
+
+Now, since we've written both of these functions, all we have to do is write a conditional statement based on the boolean value of `useClient`:
+
+```js
+if (useClient) await withClient();
+else await withAPI();
+```
+
+And before we push to the platform, let's not forget to write an input schema in the **INPUT_SCHEMA.json** file:
+
+```json
+{
+    "title": "Actor Caller",
+    "type": "object",
+    "schemaVersion": 1,
+    "properties": {
+        "memory": {
+            "title": "Memory",
+            "type": "integer",
+            "description": "Select memory in megabytes.",
+            "default": 4096,
+            "maximum": 32768,
+            "unit": "MB"
+        },
+        "useClient": {
+            "title": "Use client?",
+            "type": "boolean",
+            "description": "Specifies whether the Apify JS client or the pure Apify API should be used.",
+            "default": true
+        },
+        "fields": {
+            "title": "Fields",
+            "type": "array",
+            "description": "Enter the dataset fields to export to CSV.",
+            "prefill": ["title", "url", "price"],
+            "editor": "stringList"
+        },
+        "maxItems": {
+            "title": "Max items",
+            "type": "integer",
+            "description": "Fill in the maximum number of items to export.",
+            "default": 10
+        }
+    },
+    "required": ["useClient", "memory", "fields", "maxItems"]
+}
+```
+
+## Final code {#final-code}
+
+To ensure we're on the same page, here is what the final code looks like:
+
+```js
+import { Actor } from 'apify';
+import axios from 'axios';
+
+await Actor.init();
+
+const { useClient, memory, fields, maxItems } = await Actor.getInput();
+
+const TASK = 'YOUR_USERNAME~demo-actor-task';
+
+const withClient = async () => {
+    const client = Actor.newClient();
+    const task = client.task(TASK);
+
+    // Run options such as memory belong in the second parameter
+    const { id } = await task.call(undefined, { memory });
+
+    const dataset = client.run(id).dataset();
+
+    const items = await dataset.downloadItems('csv', {
+        limit: maxItems,
+        fields,
+    });
+
+    return Actor.setValue('OUTPUT', items, { contentType: 'text/csv' });
+};
+
+const withAPI = async () => {
+    const uri = `https://api.apify.com/v2/actor-tasks/${TASK}/run-sync-get-dataset-items?`;
+    const url = new URL(uri);
+
+    url.search = new URLSearchParams({
+        memory,
+        format: 'csv',
+        limit: maxItems,
+        fields: fields.join(','),
+        token: process.env.APIFY_TOKEN,
+    });
+
+    const { data } = await axios.post(url.toString());
+
+    return Actor.setValue('OUTPUT', data, { contentType: 'text/csv' });
+};
+
+if (useClient) {
+    await withClient();
+} else {
+    await withAPI();
+}
+
+await Actor.exit();
+```
+
+## Quiz answers 📝 {#quiz-answers}
+
+**Q: What is the relationship between the Apify API and Apify client? Are there any significant differences?**
+
+**A:** The Apify client mimics the Apify API, so there aren't any major differences. The client is super handy, as it manages the API calls for you (parsing, error handling, retries, etc.) and even adds convenience functions.
+
+The one main difference is that the Apify client automatically uses [**exponential backoff**](/api/client/js/docs#retries-with-exponential-backoff) to deal with errors, as the sketch below illustrates.
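+
+For instance, here's a minimal sketch of what tuning that retry behavior might look like with the standalone [`apify-client`](https://www.npmjs.com/package/apify-client) package. Both options are optional, and the values below are purely illustrative, not recommendations:
+
+```js
+import { ApifyClient } from 'apify-client';
+
+const client = new ApifyClient({
+    token: process.env.APIFY_TOKEN,
+    // Maximum number of retries for a failed API call
+    maxRetries: 8,
+    // Base delay; the wait grows exponentially with each retried attempt
+    minDelayBetweenRetriesMillis: 500,
+});
+```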
+
+**Q: How do you pass input when running an Actor or task via API?**
+
+**A:** The input should be passed into the **body** of the request when running an Actor or task via API.
+
+**Q: Do you need to install the `apify-client` npm package when already using the `apify` package?**
+
+**A:** No. The Apify client is available right in the SDK with the `Actor.newClient()` function.
+
+## Wrap up {#wrap-up}
+
+That's it! Now, if you want to go above and beyond, you should create a GitHub repository for this Actor, integrate it with a new one on the Apify platform, and test if it works there as well (with multiple input configurations).
+
+
+
+---
+title: III - Using storage & creating tasks
+description: Get quiz answers and explanations for the lesson about using storage and creating tasks on the Apify platform.
+sidebar_position: 3
+slug: /expert-scraping-with-apify/solutions/using-storage-creating-tasks
+---
+
+# Using storage & creating tasks {#using-storage-creating-tasks}
+
+## Quiz answers 📝 {#quiz-answers}
+
+**Q: What is the relationship between Actors and tasks?**
+
+**A:** Tasks are pre-configured runs of Actors: an Actor's configuration can be saved as a task so that the Actor doesn't have to be configured manually every single time.
+
+**Q: What are the differences between default (unnamed) and named storage? Which one would you use for everyday usage?**
+
+**A:** Unnamed storage is persisted for only 7 days, while named storage is persisted indefinitely. For everyday usage, it is best to use default unnamed storages unless the data should explicitly be persisted for more than 7 days.
+
+> With named storages, it's easier to verify that you're using the correct store, as they can be referred to by name rather than by an ID.
+
+**Q: What is data retention, and how does it work for all types of storages (default and named)?**
+
+**A:** Default/unnamed storages expire after 7 days unless otherwise specified. Named storages are retained indefinitely.
+
+## Wrap up {#wrap-up}
+
+You've learned how to use the different storage options available on Apify, the two different types of storage, as well as how to create tasks for Actors.
+
+
+
+---
+title: I - Webhooks & advanced Actor overview
+description: Learn more advanced details about Actors, how they work, and the default configurations they can take. Also, learn how to integrate your Actor with webhooks.
+sidebar_position: 6.1
+slug: /expert-scraping-with-apify/actors-webhooks
+---
+
+# Webhooks & advanced Actor overview {#webhooks-and-advanced-actors}
+
+**Learn more advanced details about Actors, how they work, and the default configurations they can take. Also, learn how to integrate your Actor with webhooks.**
+
+---
+
+Thus far, you've run Actors on the platform and written an Actor of your own, which you published to the platform yourself using the Apify CLI; therefore, it's fair to say that you are becoming more familiar and comfortable with the concept of **Actors**. Within this lesson, we'll take a more in-depth look at Actors and what they can do.
+
+## Advanced Actor overview {#advanced-actors}
+
+In this course, we'll be working out of the Amazon scraper project from the **Web scraping for beginners** course. If you haven't already built that project, you can do it in three short lessons [here](../../webscraping/scraping_basics_javascript/challenge/index.md). We've made a few small modifications to the project with the Apify SDK, but 99% of the code is still the same.
+
+Take another look at the files within your Amazon scraper project. You'll notice that there is a **Dockerfile**. Every single Actor has a Dockerfile, which defines the Actor's **image** and tells Docker how to spin up a container on the Apify platform that can successfully run the Actor's code. In this sense, the Apify platform is a serverless environment that runs many such Docker containers. For a deeper understanding of Actor Dockerfiles, refer to the [Apify Actor Dockerfile docs](/sdk/js/docs/guides/docker-images#example-dockerfile).
+
+## Webhooks {#webhooks}
+
+Webhooks are a powerful tool that can be used for just about anything. You can set up actions to be taken when an Actor reaches a certain state (started, failed, succeeded, etc.). These actions usually take the form of an API call (generally a POST request).
+
+## Learning 🧠 {#learning}
+
+Prior to moving forward, please read over these resources:
+
+- Read about [running Actors, handling Actor inputs, memory and CPU](/platform/actors/running).
+- Learn about [Actor webhooks](/platform/integrations/webhooks), which we will implement in the next lesson.
+- Learn [how to run Actors](/academy/api/run-actor-and-retrieve-data-via-api) using Apify's REST API.
+
+## Knowledge check 📝 {#quiz}
+
+1. How do you allocate more CPU for an Actor's run?
+2. From within its own run, can you get the exact time that an Actor was started?
+3. What are the types of default storages connected to an Actor's run?
+4. Can you change the allocated memory of an Actor while it's running?
+5. How can you run an Actor with Puppeteer on the Apify platform with headless mode set to `false`?
+
+## Our task {#our-task}
+
+In this task, we'll be building on top of what we already created in the [Web scraping for beginners](/academy/web-scraping-for-beginners/challenge) course's final challenge, so keep those files safe!
+
+Once our Amazon Actor has completed its run, we will, rather than sending an email to ourselves, call an Actor through a webhook. The Actor called will be a new Actor that we will create together, which will take the dataset ID as input, then filter through all of the results and return only the cheapest one for each product. All of the results of the Actor will be pushed to its default dataset.
+
+[**Solution**](./solutions/integrating_webhooks.md)
+
+## Next up {#next}
+
+This course's [next lesson](./managing_source_code.md) is brief, but discusses a very important topic: managing your code and storing it in a safe place.
+
+
+
+---
+title: IV - Apify API & client
+description: Gain an in-depth understanding of the two main ways of programmatically interacting with the Apify platform - through the API, and through a client.
+sidebar_position: 6.4
+slug: /expert-scraping-with-apify/apify-api-and-client
+---
+
+# Apify API & client {#api-and-client}
+
+**Gain an in-depth understanding of the two main ways of programmatically interacting with the Apify platform - through the API, and through a client.**
+
+---
+
+You can use one of the two main ways to programmatically interact with the Apify platform: by directly using [Apify's RESTful API](/api/v2), or by using the [JavaScript](/api/client/js) and [Python](/api/client/python) API clients. In the next two lessons, we'll be focusing on the API and the JavaScript client.
+
+> Apify's API and JavaScript API client allow us to do anything a regular user can do when interacting with the platform's web interface, only programmatically.
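+
+To get a feel for the difference, here's a hedged sketch of the same operation - fetching a dataset's items - done both ways. The dataset ID is a placeholder, and the snippet assumes the `axios` and `apify-client` packages are installed:
+
+```js
+import axios from 'axios';
+import { ApifyClient } from 'apify-client';
+
+const token = process.env.APIFY_TOKEN;
+const datasetId = 'YOUR_DATASET_ID'; // placeholder - any dataset you own
+
+// 1. Directly through the RESTful API
+const { data } = await axios.get(
+    `https://api.apify.com/v2/datasets/${datasetId}/items?token=${token}`,
+);
+console.log(data);
+
+// 2. Through the JavaScript client, which wraps the same endpoint
+const client = new ApifyClient({ token });
+const { items } = await client.dataset(datasetId).listItems();
+console.log(items);
+```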
+
+## Learning 🧠 {#learning}
+
+- Scroll through the [Apify API docs](/api/v2) (there's a whole lot there, so you're not expected to memorize everything).
+- Read about the Apify client in [Apify's docs](/api/client/js). It can also be seen on [GitHub](https://github.com/apify/apify-client-js) and [npm](https://www.npmjs.com/package/apify-client).
+- Learn about the [`Actor.newClient()`](/sdk/js/reference/class/Actor#newClient) function in the Apify SDK.
+- Skim through [this article](https://help.apify.com/en/articles/2868670-how-to-pass-data-from-web-scraper-to-another-actor) about API integration (this article is old but still relevant).
+
+## Knowledge check 📝 {#quiz}
+
+1. What is the relationship between the Apify API and the Apify client? Are there any significant differences?
+2. How do you pass input when running an Actor or task via API?
+3. Do you need to install the `apify-client` npm package when already using the `apify` package?
+
+## Our task
+
+We'll be creating another new Actor, which will have two jobs:
+
+1. Programmatically call the task for the Amazon Actor.
+2. Export its results into CSV format under a new key called **OUTPUT.csv** in the default key-value store.
+
+Though it's a bit unintuitive, this is a perfect activity for learning how to use both the Apify API and the Apify JavaScript client.
+
+The new Actor should take the following input values, which will be mapped to parameters in the API calls:
+
+```json
+{
+    // How much memory to allocate to the Amazon Actor
+    // Must be a power of 2
+    "memory": 4096,
+
+    // Whether to use the JavaScript client to make the
+    // call, or to use the API
+    "useClient": false,
+
+    // The fields in each item to return back. All other
+    // fields should be omitted
+    "fields": ["title", "itemUrl", "offer"],
+
+    // The maximum number of items to return back
+    "maxItems": 10
+}
+```
+
+[**Solution**](./solutions/using_api_and_client.md)
+
+## Next up {#next}
+
+[Lesson VI](./migrations_maintaining_state.md) will teach us everything we need to know about migrations and how to handle them properly to avoid losing any state, thereby increasing the reliability of our `demo-actor` Amazon scraper.
+
+
+
+---
+title: VI - Bypassing anti-scraping methods
+description: Learn about bypassing anti-scraping methods using proxies and proxy/session rotation together with Crawlee and the Apify SDK.
+sidebar_position: 6.6
+slug: /expert-scraping-with-apify/bypassing-anti-scraping
+---
+
+# Bypassing anti-scraping methods {#bypassing-anti-scraping-methods}
+
+**Learn about bypassing anti-scraping methods using proxies and proxy/session rotation together with Crawlee and the Apify SDK.**
+
+---
+
+Effectively bypassing anti-scraping software is one of the most crucial, but also one of the most difficult skills to master. The different types of [anti-scraping protections](../../webscraping/anti_scraping/index.md) can vary a lot across the web. Some websites aren't even protected at all, some require only moderate IP rotation, and some cannot be scraped without using advanced techniques and workarounds. Additionally, because the web is evolving, anti-scraping techniques are also evolving and becoming more advanced.
+
+It is generally quite difficult to recognize the anti-scraping protections a page may have when first inspecting it, so it is important to thoroughly investigate a site prior to writing any lines of code, as anti-scraping measures can significantly change your approach as well as complicate the development process of an Actor.
As your skills expand, you will be able to spot anti-scraping measures quicker, and better evaluate the complexity of a new project. + +You might have already noticed that we've been using the **RESIDENTIAL** proxy group in the `proxyConfiguration` within our Amazon scraping Actor. But what does that mean? This is a proxy group from [Apify Proxy](https://apify.com/proxy) which has been preventing us from being blocked by Amazon this entire time. We'll be learning more about proxies and Apify Proxy in this lesson. + +## Learning 🧠 {#learning} + +- Skim [this page](https://apify.com/proxy) for a general idea of Apify Proxy. +- Give the [proxy documentation](/platform/proxy) a solid readover (feel free to skip most of the examples). +- Check out the [anti-scraping guide](../../webscraping/anti_scraping/index.md). +- Gain a solid understanding of the [SessionPool](https://crawlee.dev/api/core/class/SessionPool). +- Look at a few Actors on the [Apify store](https://apify.com/store). How are they utilizing proxies? + +## Knowledge check 📝 {#quiz} + +1. What are the different types of proxies that Apify proxy offers? What are the main differences between them? +2. Which proxy groups do users get on the free plan? Can they access the proxy from their computer? +3. How can you prevent an error from occurring if one of the proxy groups that a user has is removed? What are the best practices for these scenarios? +4. Does it make sense to rotate proxies when you are logged into a website? +5. Construct a proxy URL that will select proxies **only from the US**. +6. What do you need to do to rotate a proxy (one proxy usually has one IP)? How does this differ for CheerioCrawler and PuppeteerCrawler? +7. Name a few different ways how a website can prevent you from scraping it. + +## Our task + +This time, we're going to build a trivial proxy-session manager for our Amazon scraping Actor. A session should be used a maximum of 5 times before being rotated; however, if a request fails, the IP should be rotated immediately. + +Additionally, the proxies used by our scraper should now only be from the US. + +[**Solution**](./solutions/rotating_proxies.md) + +## Next up {#next} + +Up [next](./saving_useful_stats.md), we'll be learning about how to save useful stats about our run, which becomes more and more useful as a project scales. + + + +--- +title: Expert scraping with Apify +description: After learning the basics of Actors and Apify, learn to develop pro-level scrapers on the Apify platform with this advanced course. +sidebar_position: 12 +category: apify platform +slug: /expert-scraping-with-apify +--- + +# Expert scraping with Apify {#expert-scraping} + +**After learning the basics of Actors and Apify, learn to develop pro-level scrapers on the Apify platform with this advanced course.** + +--- + +This course will teach you the nitty gritty of what it takes to build pro-level scrapers with Apify. We recommend that you've at least looked through all of the other courses in the academy prior to taking this one. + +## Preparations {#preparations} + +Before developing a pro-level Apify scraper, there are some important things you should have at least a bit of knowledge about (knowing the basics of each is enough to continue through this section), as well as some things that you should have installed on your system. 
+
+> If you've already gone through the [Web scraping for beginners course](../../webscraping/scraping_basics_javascript/index.md) and the first courses of the [Apify platform category](../apify_platform.md), you will be more than well equipped to continue on with the lessons in this course.
+
+
+
+### Crawlee, Apify SDK, and the Apify CLI {#crawlee-apify-sdk-and-cli}
+
+If you're feeling ambitious, you don't need to have any prior experience with Crawlee to get started with this course; however, at least 5–10 minutes of exposure is recommended. If you haven't yet tried out Crawlee, you can refer to [this lesson](../../webscraping/scraping_basics_javascript/crawling/pro_scraping.md) in the **Web scraping for beginners** course (and ideally follow along). To familiarize yourself with the Apify SDK, you can refer to the [Apify Platform](../apify_platform.md) category.
+
+The Apify CLI will play a core role in the running and testing of the Actor you will build, so if you haven't gotten it installed already, please refer to [this short lesson](../../glossary/tools/apify_cli.md).
+
+### Git {#git}
+
+In one of the later lessons, we'll be learning how to integrate our Actor on the Apify platform with a GitHub repository. For this, you'll need to understand at least the basics of [Git](https://git-scm.com/docs). Here's a [great tutorial](https://product.hubspot.com/blog/git-and-github-tutorial-for-beginners) to help you get started with Git.
+
+### Docker {#docker}
+
+Docker is a massive topic on its own, but don't be worried! We only expect you to know and understand the very basics of it, which you can learn from [this short article](https://docs.docker.com/guides/docker-overview/) (a 10-minute read).
+
+### The basics of Actors {#actor-basics}
+
+Part of this course will be learning more in-depth about Actors; however, some basic knowledge is already assumed. If you haven't yet gone through the [Actors](../getting_started/actors.md) lesson of the **Apify platform** course, it's highly recommended to at least give it a glance before moving forward.
+
+## First up {#first}
+
+[First up](./actors_webhooks.md), we'll be learning in-depth about integrating Actors with each other using webhooks.
+
+> Each lesson will have a short _(and optional)_ quiz that you can take at home to test your skills and knowledge related to the lesson's content. Some questions have straight factual answers, but some others can have varying opinionated answers.
+
+
+
+---
+title: II - Managing source code
+description: Learn how to manage your Actor's source code more efficiently by integrating it with a GitHub repository. This is standard on the Apify platform.
+sidebar_position: 6.2
+slug: /expert-scraping-with-apify/managing-source-code
+---
+
+# Managing source code {#managing-source-code}
+
+**Learn how to manage your Actor's source code more efficiently by integrating it with a GitHub repository. This is standard on the Apify platform.**
+
+---
+
+In this brief lesson, we'll discuss how to better manage an Actor's source code. Up 'til now, you've been developing your scripts locally, and then pushing the code directly to the Actor on the Apify platform; however, there is a much more optimal (and standard) way.
+
+## Learning 🧠 {#learning}
+
+Thus far, every time we've updated our code on the Apify platform, we've used the `apify push` CLI command; however, this can be problematic for a few reasons - mainly because if someone else wants to modify or maintain your code, they don't have access to it, as it lives only on your local machine.
+
+If you're not yet familiar with Git, please get familiar with it through the [Git documentation](https://git-scm.com/docs), then take a quick moment to read about [GitHub integration](/platform/integrations/github) in the Apify docs.
+
+Also, try to explore the **Multifile editor** in one of the Actors you developed in the previous lessons before moving forward.
+
+## Knowledge check 📝 {#quiz}
+
+1. Do you have to rebuild an Actor each time the source code is changed?
+2. In Git, what is the difference between **pushing** changes and making a **pull request**?
+3. Based on your knowledge and experience, is the `apify push` command worth using (in your opinion)?
+
+[**Answers**](./solutions/managing_source.md)
+
+## Our task {#our-task}
+
+First, we must initialize a GitHub repository (you can use GitLab if you like, but this lesson's examples will be using GitHub). Then, after pushing our main Amazon Actor's code to the repo, we must switch its source code to use the content of the GitHub repository instead.
+
+## Integrating GitHub source code {#integrating-github}
+
+First, let's create a repository. This can be done [in a number of ways](https://kbroman.org/github_tutorial/pages/init.html), but in this lesson, we'll do it by creating the remote repository on GitHub's website:
+
+![Create a new GitHub repo](./images/github-new-repo.png)
+
+Then, we'll run the commands it tells us in our terminal (while within the **demo-actor** directory) to initialize the repository locally, and then push all of the files to the remote one.
+
+After you've created your repo, navigate on the Apify platform to the Actor we called **demo-actor**. In the **Source** tab, click the dropdown menu under **Source code** and select **Git repository**. By default, this is set to **Web IDE**, which is what we've been using so far.
+
+![Select source code location](./images/select-source-location.png)
+
+Then, go ahead and paste the link to your repository into the **Git URL** text field and click **Save**.
+
+The final step is to click on **API** in the top right corner of your Actor's page:
+
+![API button](./images/api-button.jpg)
+
+And scroll through all of the links until you find the **Build Actor** API endpoint. Copy this endpoint's URL, then head back over to your GitHub repository and navigate to **Settings > Webhooks > Add webhook**. The final thing to do is to paste the URL and save the webhook.
+
+![Adding a webhook to your GitHub repo](../../../platform/actors/development/deployment/images/ci-github-integration.png)
+
+And you're done! 🎉
+
+## Quick chat about code management {#code-management}
+
+This was a bit of overhead, but the good news is that you don't ever have to configure this stuff again for this Actor. Now, every time the content of your **main**/**master** branch changes, the Actor on the Apify platform will rebuild based on the newest code.
+
+Think of it as combining two steps into one! Normally, you'd have to do a `git push` from your terminal in order to get the newest code onto GitHub, then run `apify push` to push it to the platform.
+
+It's also important to know that GitHub/GitLab repository integration is standard practice.
As projects grow and the number of contributors and maintainers increases, it only makes sense to have a GitHub repository integrated with the project's Actor. For the remainder of this course, all Actors created will be integrated with a GitHub repository.
+
+## Next up {#next}
+
+[Next up](./tasks_and_storage.md), you'll learn about the different ways to store scraped data, as well as how to utilize a cool feature to run pre-configured Actors.
+
+
+
+---
+title: V - Migrations & maintaining state
+description: Learn about what Actor migrations are and how to handle them properly so that the state is not lost and runs can safely be resurrected.
+sidebar_position: 6.5
+slug: /expert-scraping-with-apify/migrations-maintaining-state
+---
+
+# Migrations & maintaining state {#migrations-maintaining-state}
+
+**Learn about what Actor migrations are and how to handle them properly so that the state is not lost and runs can safely be resurrected.**
+
+---
+
+We already know that Actors are Docker containers that can be run on any server. This means that they can be allocated anywhere there is space available, making them very efficient. Unfortunately, there is one big caveat: Actors move - a lot. When an Actor moves, it is called a **migration**.
+
+On migration, the process inside of an Actor is completely restarted and everything in its memory is wiped, meaning that any values stored within variables or classes are lost.
+
+When a migration happens, you want to do a so-called "state transition", which means saving any data you care about so the Actor can continue right where it left off before the migration.
+
+## Learning 🧠 {#learning}
+
+Read this [article](/platform/actors/development/builds-and-runs/state-persistence) on migrations and dealing with state transitions.
+
+Before moving forward, read about Actor [events](/sdk/js/docs/upgrading/upgrading-to-v3#events) and how to listen for them.
+
+## Knowledge check 📝 {#quiz}
+
+1. Actors have an option in the **Settings** tab to **Restart on error**. Would you use this feature for regular Actors? When would you use this feature?
+2. Migrations happen randomly, but by [aborting **gracefully**](/platform/actors/running/runs-and-builds#aborting-runs), you can simulate a similar situation. Try this out on the platform and observe what happens. What changes occur, and what remains the same for the restarted Actor's run?
+3. Why don't you (usually) need to add any special migration handling code for a standard crawling/scraping Actor? Are there any features in the Crawlee/Apify SDK that handle this under the hood?
+4. How can you intercept the migration event? How much time do you have after this event happens and before the Actor migrates?
+5. When would you persist data to the default key-value store instead of to a named key-value store?
+
+## Our task
+
+Once again returning to our Amazon **demo-actor**, let's say that we need to store an object in memory (as a variable) containing all of the scraped ASINs as keys and the number of offers scraped from each ASIN as values. The object should follow this format:
+
+```json
+{
+    "B079ZJ1BPR": 3,
+    "B07D4R4258": 21
+}
+```
+
+Every 10 seconds, we should log the most up-to-date version of this object to the console. Additionally, the object should be able to survive Actor migrations, which means that even if the Actor were to migrate, its data would not be lost upon resurrection.
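+
+Before peeking at the solution, here's a minimal sketch of the persistence pattern involved, using the Apify SDK's events. The **ASIN-TRACKER** key name is an arbitrary choice for illustration:
+
+```js
+import { Actor } from 'apify';
+
+await Actor.init();
+
+// Restore the object if the run was resurrected after a migration
+const tracker = (await Actor.getValue('ASIN-TRACKER')) ?? {};
+
+const persistTracker = async () => Actor.setValue('ASIN-TRACKER', tracker);
+
+// Save the object right before the platform migrates (or aborts) the Actor
+Actor.on('migrating', persistTracker);
+Actor.on('aborting', persistTracker);
+
+// Log the most up-to-date version every 10 seconds
+setInterval(() => console.info(tracker), 10_000);
+```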
+
+[**Solution**](./solutions/handling_migrations.md)
+
+## Next up {#next}
+
+You might have already noticed that we've been using the **RESIDENTIAL** proxy group in the `proxyConfiguration` within our Amazon scraping Actor. But what does that mean? Learn why we've used this group, about proxies, and about avoiding anti-scraping measures in the [next lesson](./bypassing_anti_scraping.md).
+
+
+
+---
+title: VII - Saving useful run statistics
+description: Understand how to save statistics about an Actor's run, what types of statistics you can save, and why you might want to save them for a large-scale scraper.
+sidebar_position: 6.7
+slug: /expert-scraping-with-apify/saving-useful-stats
+---
+
+# Saving useful run statistics {#savings-useful-run-statistics}
+
+**Understand how to save statistics about an Actor's run, what types of statistics you can save, and why you might want to save them for a large-scale scraper.**
+
+---
+
+Using Crawlee and the Apify SDK, we are now able to collect and format data coming directly from websites and save it into a key-value store or dataset. This is great, but sometimes we want to store some extra data about the run itself, or about each request. We might want to store some extra general run information separately from our results, or potentially include statistics about each request within its corresponding dataset item.
+
+The types of values that are saved are totally up to you, but the most common are error scores, the total number of saved items, the number of request retries, the number of captchas hit, etc. Storing these values is not always necessary, but they can be valuable when debugging and maintaining an Actor. As your projects scale, this will become more and more useful and important.
+
+## Learning 🧠 {#learning}
+
+Before moving on, give these valuable resources a quick lookover:
+
+- Refamiliarize yourself with the various data available on the [Request object](https://crawlee.dev/api/core/class/Request).
+- Learn about the [`failedRequestHandler` function](https://crawlee.dev/api/browser-crawler/interface/BrowserCrawlerOptions#failedRequestHandler).
+- Understand how to use the [`errorHandler`](https://crawlee.dev/api/browser-crawler/interface/BrowserCrawlerOptions#errorHandler) function to handle request failures.
+- Ensure you are comfortable using [key-value stores](/sdk/js/docs/guides/result-storage#key-value-store) and [datasets](/sdk/js/docs/guides/result-storage#dataset), and understand the differences between the two storage types.
+
+## Knowledge check 📝 {#quiz}
+
+1. Why might you want to store statistics about an Actor's run (or a specific request)?
+2. In our Amazon scraper, we are trying to store the number of retries of a request once its data is pushed to the dataset. Where would you get this information? Where would you store it?
+3. What is the difference between the `failedRequestHandler` and `errorHandler`? (The sketch below gives a hint.)
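+
+To make the distinction between the two handlers concrete, here's a minimal hedged sketch of where each one fits into a crawler's configuration (the logging is purely illustrative):
+
+```js
+import { CheerioCrawler } from 'crawlee';
+
+const crawler = new CheerioCrawler({
+    requestHandler: async ({ request }) => {
+        // ... scraping logic lives here ...
+    },
+    // Runs on every error, right before the request is retried
+    errorHandler: async ({ request }, error) => {
+        console.warn(`Retrying ${request.url}: ${error.message}`);
+    },
+    // Runs only once a request has exhausted all of its retries
+    failedRequestHandler: async ({ request }, error) => {
+        console.error(`Giving up on ${request.url}: ${error.message}`);
+    },
+});
+```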
+
+## Our task
+
+In our Amazon Actor, each dataset result must now have the following extra keys:
+
+```json
+{
+    "dateHandled": "date-here", // the date + time at which the request was handled
+    "numberOfRetries": 4, // the number of retries of the request before running successfully
+    "currentPendingRequests": 24 // the current number of requests left pending in the request queue
+}
+```
+
+Also, an object including these values should be persisted in the key-value store during the run and logged to the console every 10 seconds:
+
+```json
+{
+    "errors": { // all of the errors for every request path
+        "some-site.com/products/123": [
+            "error1",
+            "error2"
+        ]
+    },
+    "totalSaved": 43 // total number of saved items throughout the entire run
+}
+```
+
+[**Solution**](./solutions/saving_stats.md)
+
+## Wrap up
+
+Wow, you've learned a whole lot in this course, so give yourself the pat on the back that you deserve! If you were able to follow along with this course, that means that you're officially an **Apify pro**, and that you're equipped with all of the knowledge and tools you need to build awesome, scalable web scrapers, either for your own personal projects or for the Apify platform.
+
+Congratulations! 🎉
+
+
+
+---
+title: III - Tasks & storage
+description: Understand how to save the configurations for Actors with Actor tasks. Also, learn about storage and the different types Apify offers.
+sidebar_position: 6.3
+slug: /expert-scraping-with-apify/tasks-and-storage
+---
+
+# Tasks & storage {#tasks-and-storage}
+
+**Understand how to save the configurations for Actors with Actor tasks. Also, learn about storage and the different types Apify offers.**
+
+---
+
+Tasks and storage are very different things; however, they are also tied together in many ways. **Tasks** run Actors, Actors return data, and data is stored in different types of **Storages**.
+
+## Tasks {#tasks}
+
+Tasks are a very useful feature that allows us to save pre-configured inputs for Actors. This means that, rather than configuring the Actor every time or saving screenshots of various different Actor configurations, you can store the configurations right in your Apify account and run the Actor with them at will.
+
+## Storage {#storage}
+
+Storage allows us to save persistent data for further processing. As you'll learn, there are two main storage options on the Apify platform, as well as two main storage types (**named** and **unnamed**) with one big difference between them.
+
+## Learning 🧠 {#learning}
+
+- Check out [the docs about Actor tasks](/platform/actors/running/tasks).
+- Read about the [two main storage options](/platform/storage/dataset) on the Apify platform.
+- Understand the [crucial differences between named and unnamed storages](/platform/storage/usage#named-and-unnamed-storages).
+- Learn about the [`Dataset`](/sdk/js/reference/class/Dataset) and [`KeyValueStore`](/sdk/js/reference/class/KeyValueStore) objects in the Apify SDK.
+
+## Knowledge check 📝 {#quiz}
+
+1. What is the relationship between Actors and tasks?
+2. What are the differences between default (unnamed) and named storage? Which one would you use for everyday usage? (The sketch after this list shows both in code.)
+3. What is data retention, and how does it work for all types of storages (default and named)?
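+
+Here's a brief sketch of how the two storage types differ in code when using the Apify SDK - the storage names below are arbitrary examples:
+
+```js
+import { Actor } from 'apify';
+
+await Actor.init();
+
+// Default (unnamed) storages - tied to this run, expire after the retention period
+const defaultDataset = await Actor.openDataset();
+const defaultStore = await Actor.openKeyValueStore();
+
+// Named storages - persisted indefinitely and easier to recognize at a glance
+const namedDataset = await Actor.openDataset('amazon-offers');
+const namedStore = await Actor.openKeyValueStore('amazon-run-stats');
+
+await Actor.exit();
+```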
+ +[**Solution**](./solutions/using_storage_creating_tasks.md) + +## Next up {#next} + +The [next lesson](./apify_api_and_client.md) is very exciting, as it will unlock the ability to seamlessly integrate your Apify Actors into your own external projects and applications with the Apify API. + + + +label: Actor basics +position: 2 + + + +--- +title: Actor description & SEO description +description: Learn about Actor description and meta description. Where to set them and best practices for both content and length. +sidebar_position: 3 +category: apify platform +slug: /actor-marketing-playbook/actor-basics/actor-description +--- + +Learn about Actor description and meta description. Where to set them and best practices for both content and length. + +--- + +## What is an Actor description? + +First impressions are important, especially when it comes to tools. Actor descriptions are the first connection potential users have with your Actor. You can set two kinds of descriptions: _regular description_ (in Apify Store) and _SEO description_ (on Google search), along with their respective names: regular name and SEO name. + +:::tip + +You can change descriptions and names as many times as you want. + +::: + +## Regular description vs. SEO description + +| | Actor description & name | SEO description & name | +|---|---|---| +| Name length | 40-50 characters | 40-50 characters | +| Description length | 300 characters | 145-155 characters | +| Visibility | Visible on Store | Visible on Google | + +### Description & Actor name + +Actor description is what users see on the Actor's web page in Apify Store, along with the Actor's name and URL. When creating an Actor description, a “warm” visitor experience is prioritized (more on that later). + +![actor name & description](images/actor-description-name.png) + +Actor description is also present in Apify Console and across Apify Store. + +![actor description in store](images/actor-description-store.png) + +### SEO description & SEO name + +Actor SEO description is a tool description visible on Google. It is shorter and SEO-optimized (keywords matter here). When creating the SEO description, a “cold” visitor experience is prioritized. + +![seo description](images/seo_description.png) + +Usually the way the potential user interacts with both these descriptions goes like this: SEO first, regular description second. Is there any benefit in them being different? + +### Is there any benefit in the description and meta description being different? + +Different descriptions give you a chance to target different stages of user acquisition. And make sure the acquisition takes place. + +_SEO description (and SEO name)_ is targeting a “cold” potential user who knows nothing about your tool yet and just came across it on Google search. They’re searching to solve a problem or use case. The goal of the meta description is to convince that visitor to click on your tool's page among other similar search results on Google. While it's shorter, SEO description is also the space to search-engine-optimize your language to the max to attract the most matching search intent. + +_Description (and name)_ is targeting a “warm” potential user who is already curious about your tool. They have clicked on the tool's page and have a few seconds to understand how complex the tool is and what it can do for them. Here you can forget SEO optimization and speak directly to the user. 
The regular description also has a longer character limit, which means you can expand on your Actor’s features.
+
+Learn more about search intent here: [SEO](/academy/actor-marketing-playbook/promote-your-actor/seo)
+
+## Where can Actor descriptions be set?
+
+Both descriptions can be found and edited in the rightmost **Publication** tab → **Display information**. This has to be done separately for each Actor.
+
+:::note
+
+Setting the SEO description and SEO name is optional. If not set, the description will just be duplicated.
+
+:::
+
+![changing seo name](images/changing__SEO_name.png)
+
+![changing actor name and seo name](images/changing_Actor_name_and_SEO_name.png)
+
+Actor description specifically can also be quick-edited in this pop-up on the Actor's page in Apify Console. Open the **Actor's page**, then click on **…** in the top right corner, and choose ✎ **Edit name or description**. Then set the URL in the **Unique name** ✎ field and click **Save**.
+
+![changing actor description](images/change_Actor_description.png)
+
+## Tips and recommendations on how to write descriptions
+
+When writing a description, less is more. You only have a few seconds to capture attention and communicate what your Actor can do. To make the most of that time, follow these guidelines used by Apify (these apply to both types of descriptions):
+
+### Use variations and experiment 🔄
+
+- _SEO name vs. regular name_:
+    - name: Airbnb Scraper
+    - SEO name: Airbnb Data Scraper
+- _Keywords on the web page_:
+Include variations, e.g. Airbnb API, Airbnb data, Airbnb data scraper, Airbnb rentals, Airbnb listings + - No-code scraping tool to extract Airbnb data: host info, prices, dates, location, and reviews. + - Scrape Airbnb listings without official Airbnb API! +- _Scraping/automation process variations_:
+Use terms, e.g. crawl, crawler, scraping tool, finder, scraper, data extraction tool, extract data, get data
+    - Scrape XYZ data, scraped data, data scraper, data crawler.
+
+### Choose how to start your sentences 📝
+
+- _Noun-first (descriptive)_:
+    - Data extraction tool to extract Airbnb data: host info, prices, dates, location, and reviews.
+- _Imperative-first (motivating)_:
+    - Try a free web scraping tool to extract Airbnb data: host info, prices, dates, location, and reviews.
+
+
+### Keep it short and SEO-focused ✂️
+
+- _Be concise and direct_: clearly state what your Actor does. Avoid unnecessary fluff and boilerplate text.
+    - ✅ Scrapes job listings from Indeed and gathers...
+    - ❌ This Actor scrapes job listings from Indeed in order to gather...
+- _Optimize for search engines_: include popular keywords related to your Actor’s functionality that users might search for.
+    - ✅ This Indeed scraper helps you collect job data efficiently. Use the tool to gather...
+    - ❌ This tool will search through job listings on Indeed and offers you...
+
+
+### List the data your Actor works with 📝
+
+- Data extraction tool to extract Airbnb data: host info, prices, dates, location, and reviews.
+- Get hashtags, usernames, mentions, URLs, comments, images, likes, locations without the official Instagram API.
+
+### Use keywords or the language of the target website 🗣️
+
+- Extract data from hundreds of Airbnb home rentals in seconds.
+- Extract data from chosen TikToks. Just add a TikTok URL and get TikTok video and profile data: URLs, numbers of shares, followers, hashtags, hearts, video, and music metadata.
+- Scrape Booking with this hotels scraper and get data about accommodation on Booking.com.
+
+### Highlight your strong suits 🌟
+
+- Ease of use, no coding, user-friendly:
+    - Easy scraping tool to extract Airbnb data.
+- Fast and scalable:
+    - Scrape whole cities or extract data from hundreds of Airbnb rentals in seconds.
+- Free (only if the $5 free credits can cover a trial run):
+    - Try a free scraping tool to extract Airbnb data: host info, prices, dates, location, and reviews.
+    - Extract host information, locations, availability, stars, reviews, images, and host/guest details for free.
+- Available platform features (various formats, API, integrations, scheduling):
+    - Export scraped data in formats like HTML, JSON, and Excel.
+- Additional tips:
+    - Avoid ending lists with etc.
+    - Consider adding relevant emojis for visual appeal.
+
+### Break it down 🔠
+
+Descriptions typically fit into 2-3 sentences. Don't try to jam everything into one.
+
+Examples:
+
+1. Scrape whole cities or extract data from hundreds of Airbnb rentals in seconds.
+1. Extract host information, addresses, locations, prices, availability, stars, reviews, images, and host/guest details.
+1. Export scraped data, run the scraper via API, schedule and monitor runs, or integrate with other tools.
+
+## FAQ
+
+#### Can the Actor's meta description and description be the same?
+
+Yes, they can, as long as the shared text fits the shorter SEO length (under 150 characters). But they can also be different - there's no harm in that.
+
+#### How different can description and meta description be?
+
+They can be vastly different and target different angles of your Actor. You can experiment by setting up different SEO descriptions for a period of time and seeing if the click-through rate rises.
+
+#### I set a custom SEO description but Google doesn't show it
+
+Sometimes Google picks up a part of the README as the SEO description. It's heavily dependent on the search query. What you see on Google might therefore look different from the SEO description you set. It's all a part of how Google customizes search results.
+
+ + +--- +title: Actors & emojis +description: Discover how emojis can boost your Actors by grabbing attention, simplifying navigation, and enhancing clarity. Improve user experience and engagement on Apify Store. +sidebar_position: 5 +category: apify platform +slug: /actor-marketing-playbook/actor-basics/actors-and-emojis +--- + +Using emojis in Actors is a science on its own. Learn how emojis enhance the user experience in Actors by grabbing attention, simplifying navigation, and making information clearer. + +## On the use of emojis in Actors + +We started using emojis in Actors for several reasons. First, tech today often uses emojis to make things look more user-friendly. Second, people don’t read as much as we’d like. You only have a few seconds to grab their attention, and text alone can feel overwhelming. Third, we don’t have many opportunities or space to explain things about Actors, and we want to avoid users needing to open extra tabs or pages. Clarity should come instantly, so we turned to emojis. + +When evaluating a new tool, those first 5 seconds are critical. That’s why we use emojis extensively with our Actors. They’re part of the Actor SEO title and description to help the tool stand out in Google search results, although Google doesn't always display them. In READMEs, they serve as shortcuts to different sections and help users quickly understand the type of data they’ll get. In complex input schemas, we rely on emojis to guide users and help them navigate the tool more efficiently. + +## Emoji science + +Believe it or not, there’s a science to emoji usage. When we use emojis in Actors and related content, we tap into the brain's iconic and working memory. Iconic memory holds information for less than a second - this is unconscious processing, where attributes like color, size, and location are instantly recognized. This part is where emojis guide the person's attention in the sea of text. They signify that something important is here. Emojis help with that immediate first impression and create a sense of clarity. + +After that, the brain shifts to working memory, where it combines information into visual chunks. Since we can only hold about 3-4 chunks at once, emojis help reinforce key points, thus reducing cognitive load. Consistent emoji use across the Actor ecosystem ensures users can quickly connect information without getting overwhelmed. + +As an example of this whole process, first, the user notices the emojis used in the field titles (pre-attentive processing). They learn to associate the emojis with those titles (attentive processing). Later, when they encounter the same emojis in a README section, they’ll make the connection, making it easier to navigate without drowning in a sea of text. + +## Caveats to emojis + +1. Don't overuse them, and don’t rely on emojis for critical information. Emojis should support the text, not replace key explanations or instructions. They're a crutch for concise copywriting, not a universal solution. +2. Use them consistently. Choose one and stick with it across all content: descriptions, parts of input schema, mentions in README, blog posts, etc. +3. Some emojis have multiple meanings, so choose the safest one. It could be general internet knowledge or cultural differences, so make sure the ones you choose won’t confuse or offend users in other markets. +4. Some emojis don’t render well on Windows or older devices. Try to choose ones that display correctly on Mac, Windows, and mobile platforms. 
Besides, emoji-heavy content can be harder for screen readers and accessibility tools to interpret. Make sure the information is still clear without the emojis.
+5. It's okay not to use them.
+
+
+
+---
+title: How to create an Actor README
+description: Learn how to write a comprehensive README to help users better navigate, understand and run public Actors in Apify Store.
+sidebar_position: 3
+category: apify platform
+slug: /actor-marketing-playbook/actor-basics/how-to-create-an-actor-readme
+---
+
+**Learn how to write a comprehensive README to help users better navigate, understand and run public Actors in Apify Store.**
+
+---
+
+## What's a README in the Apify sense?
+
+At Apify, when we talk about a README, we don’t mean a guide mainly aimed at developers that explains what a project is, how to set it up, or how to contribute to it. At least, not in its traditional sense.
+
+You could argue our notion of README is closer to this [one described on GitHub](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-readmes):
+
+README files typically include information on:
+
+- What the project does
+- Why the project is useful
+- How users can get started with the project
+- Where users can get help with your project
+
+We mean all of this and even more. At Apify, when we talk about READMEs, we refer to the public Actor detail page on Apify Store. Specifically, its first tab. The README exists in the same form both on the web and in Console. So what is it for?
+
+Before we dive in, a little disclaimer: you don't need your Apify README to fulfill all its purposes. Technically, you could even publish an Actor with just a single word in the README. But you'd be missing out if you did that.
+
+Your Actor’s README has at least four functions:
+
+1. _SEO_ - If your README is well-structured and includes important keywords — both in headings and across the text — it has a high chance of being noticed and promoted by Google. Organic search brings the most motivated type of potential users. If you win this game, you've won most of the SEO game.
+2. _First impression_ - Your README is one of the first points of contact with a potential user. If you come across as convincing, clear, and reassuring, it could be the factor that makes a user try your Actor for their task.
+3. _Extended instructions_ - The README is also the space that explains specific complex input settings: for example, special input formatting, anything coding-related, or extended functionality. Of course, you could put all of that in a blog post as well, but the README should be their first point of contact.
+4. _Support_ - Your users come back to the README when they face issues. So use it as a space to link to tutorials for when they run into trouble, describe common troubleshooting techniques, share tricks, or warn them about known bugs.
+
+## README elements theory
+
+These are the most important elements of the README. This structure is also not to be followed to a “t”. Of course, what you want to say to your potential users and how you want to promote your Actor will differ case by case. These are just the most common practices we have for our Actor READMEs. Note that the headings are written with SEO in mind, which is why you see certain keywords repeated over and over.
+
+Aim for sections 1–6 below and try to include at least 300 words. You can move the sections around to some extent if it makes sense, e.g.
3 might come after 6. Consider using emojis as bullet points or otherwise trying to break up the text.
+
+### Intro and features
+
+What is [Actor]?
+
+- Explain in two or three sentences what the Actor does and the easiest way to try it. Mention briefly what kind of data it can extract and any other tangible goal the tool can achieve. Describe the input in one sentence. Highlight the most important words in bold.
+
+What can this [Actor] do?
+
+- List the main features of this tool. List multiple ways of input if applicable. List platform advantages. If it's a bundle, mention the steps that the Actor will do for you, mention specific obstacles this tool is able to overcome, and say upfront how many results you can get for free.
+
+:::tip Remember the Apify platform!
+
+Your Actor + the Apify platform. They come as a package. Don't forget to flaunt all the advantages that the platform gives to your solution.
+
+:::
+
+Imagine if there was a solution that is identical to yours but without the platform advantages such as monitoring, access to API, scheduling, possibility of integrations, proxy rotation. Now, if that tool suddenly gained all those advantages, it would surely make a selling point out of it. This is how you should be thinking about your tool — as a solution boosted by the Apify platform. Don't ever forget that advantage.
+
+What data can [Actor] extract?
+
+What data can you extract from [target website]?
+
+- Create a table that represents the main data points that the Actor can extract. You don't have to list every single one, just list the most understandable and relatable ones.
+
+Depending on the complexity of your Actor, you might include one or all three of these sections. It will also depend on what your Actor does. If your Actor has simple input but does a lot of steps for the user under the hood (like a bundle would), you might like to include the "What can this Actor do?" section. If your Actor extracts data, it makes sense to include a section with a table.
+
+### Tutorial section
+
+This could be a simple step-by-step list or a paragraph with a link to a tutorial on a blog.
+
+A step-by-step section is reassuring for the user, and it can be a section optimized for Google.
+
+How do I use [Actor] to scrape website data?
+
+### Pricing
+
+How much will it cost to scrape [target site]?
+
+How much will scraping [target site] cost?
+
+Is scraping [target site] free?
+
+How much does it cost to extract [target site] data?
+
+Web scraping can be very unpredictable because there are a lot of elements involved in order for the process to be successful: the complexity of the website, proxies, cookies, etc. This is why it's important to set the pricing and scraping volume expectations for your users.
+
+You might think the top part of the Actor detail page already indicates pricing. But this paragraph can still be useful. First of all, cost-related questions can show up on Google if they are SEO-optimized. Second, you can use this space to inform and reassure the user about the pricing, give more details about it, or entice them with the promise of very scalable scraping.
+
+- If it's a consumption pricing model (only consumed CUs), you can use this space to set expectations and explain what it means to pay for Compute Units. Similarly, if it's a rental Actor, you can also use this paragraph to set expectations. Talk about the average amount of data that can be scraped for a given price. Make it easy for users to imagine how much they will pay for a given dataset.
This will also make it easier for them to compare your solution with others on the market price-wise and value-wise.
+- If it's price per result, you can extrapolate how many results a user can get on a free plan and also entice them with a larger plan and how many thousands of results they can get with that.
+- If it's a bundle that consists of a couple of Actors that are priced differently, you can use this section to talk about the difference between all the Actors involved and how that will affect the final price of a run.
+
+In any case, on top of setting expectations and reassuring users, this paragraph can get into Google. If somebody is Googling "How much does it cost to scrape [website]", they might come across this part of your README and it will lead them from Google search directly to your Actor's detail page. So you don't want to miss that opportunity.
+
+![readme example](images/readme.png)
+
+### Input and output examples
+
+This is what people click on the most in the table of contents of the README. After they are done scrolling through the first part of the README, users are interested in how difficult the input is, what it looks like, and what kind of information they can expect.
+
+**Input**: often a screenshot of the input schema. This is also a way for people to see the platform even before they create an account.
+
+**Output**: can be shown as a screenshot if your output schema looks like something you would want to promote to users. You can also just include a JSON example containing a few objects. Even better if there's continuity between the input example and output example.
+
+If your datasets come out too complex and you want to save your users some scrolling, you can also show multiple output examples: one for reviews, one for contact details, one for ads, etc.
+
+### Other Actors
+
+Don't forget to promote your other Actors. While our system for Actor recommendation works (you can see related Actors at the bottom of the README), it only works within the same category or for Actors with similar names. It won't recommend a completely different Actor from the same creator. So make sure to interconnect your work by taking the initiative yourself. You can mention your other Actors in a list or as a table.
+
+### FAQ, disclaimers, and support
+
+The FAQ is a section where you can keep all the secondary questions that might still come up.
+
+Here are just a few things we usually push to the FAQ section.
+
+- disclaimers and legality
+- comparison table between your Actor and similar solutions
+- information about the official API and how the scraper is a stand-in for it (SEO)
+- questions brought up by the users
+- tips on how best to use the Actor
+- troubleshooting and mentioning known bugs
+- mentioning the Issues tab and highlighting that you're open to feedback and collecting it
+- mentioning being open to creating a custom solution based on the current one and showing a way to contact you
+- interlinking
+- mentioning the possibility of transferring data using an API — API tab
+- possibility for integrations
+- use cases for the data scraped, success stories exemplifying the use of data
+
+## Format of the README
+
+### Markdown
+
+The README has to be written in Markdown. The most important elements are H2 and H3 headings, links to pages, links to images, and tables. For specific formatting, you can try using basic HTML. That will also work. CSS won’t.
+
+### HTML use
+
+You can mix HTML with Markdown interchangeably.
Either will display in the Actor README on the Apify platform. That gives you more freedom to use HTML when needed. Remember, don't try CSS.
+
+### Tone of the README
+
+Apify Store has many Actors in its stock, and it's only growing. The advantage of an Actor is that it can be anything, as versatile or complex as needed - from a single URL type of input to complex features that give the user customized control over the input parameters. There are Actors that are intended for users who aren't familiar with coding and don't have any experience with it. Ideally, the README should reflect the level of skill needed to use the Actor.
+
+The tone of the README should make it immediately obvious who the tool is aimed at. If your tool's input includes glob patterns or requires looking for selectors, that should be immediately visible from the README. Before the user even tries the tool. Trying to simplify this information using simple words with ChatGPT can be misleading to the user. You will attract the wrong audience, and they will end up churning or asking you too many questions.
+
+And vice versa. If your target audience is people with little to no coding skills, who just prefer point-and-click solutions, this should be visible from the README. Speak in regular terms, and avoid code blocks or complex information at the beginning unless it's absolutely necessary. This means that, when people land on your Actor detail page, they will have their expectations set from the get-go.
+
+### Length of a README
+
+When working on improving a README, we regularly look at heatmaps that show us where our website visitors spend most of their time. From our experience, most first-time visitors don't scroll past the first 25% of a README. That means that the first quarter of the README is where you want to focus most of your attention if you're trying to persuade the page visitor to try your Actor.
+
+From the point of view of acquisition, the first few sections should make it immediately obvious what the tool is about, how hard it is to use, and who it is created for. This is why, in Apify's READMEs, you can see our first few paragraphs are built in such a way as to explain these things and reassure the visitors that anyone can use these tools.
+
+From the point of view of retention, it doesn't mean you can't have long or complex READMEs or not care for the information beyond the 25% mark. Since the README is also intended to be used as a backup when something goes wrong or the user needs more guidance, your users will come back to it multiple times.
+
+### Images and videos
+
+As for using screenshots and GIFs, put them in some sort of image hosting. Your own GitHub repository would be best because you have full control over it. Name the images with SEO in mind and try to keep them compressed but of good enough quality. You don't want an image or GIF to take too long to load.
+
+One trick is not only to add images but also to make them clickable. For some reason, people like clicking on images - at least they try to, judging by the heatmaps. You can lead the screenshot clicks towards a signup page, which is possible with Markdown.
+
+If your screenshot seems too big or occupies too much space, you can make it smaller by using HTML.
+
+To embed a YouTube video, all you have to do is include its URL. No further formatting is needed; the thumbnail will render itself on the README page.
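+
+For example, here's a short Markdown sketch of both tricks - a clickable screenshot that leads to the sign-up page, plus a YouTube URL on its own line. The image URL and video ID are placeholders:
+
+```markdown
+[![Actor input screenshot](https://raw.githubusercontent.com/your-account/your-actor/main/images/input.png)](https://console.apify.com/sign-up)
+
+https://www.youtube.com/watch?v=YOUR_VIDEO_ID
+```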
+ +:::tip Try Carbon for code + +If you want to add snippets of code anywhere in your README, you can use [Carbon](https://github.com/carbon-app/carbon). + +::: + +If you need quick Markdown guidance, check out [https://www.markdownguide.org/cheat-sheet/](https://www.markdownguide.org/cheat-sheet/). + + +## README and SEO + +Your README is your landing page. + +If there were only one thing to remember about READMEs on Apify Store, it would be this. A README on Apify Store is not just dry instructions on how to use your Actor. It has much more potential than that. + +In the eyes of Google, your Actor's detail page, aka the README, is a full-fledged landing page containing all the most important information to be found and understood by users. + +Of course, that all only counts if your README is both well formatted and rich in keywords. We'll talk about that part later on. + +What makes a good README? + +A good README has to balance what you want your page visitors to know, what your users will turn to when they run into trouble, and what Google registers when it's indexing pages and deciding which ones deserve to rank higher. + +### Table of contents + +The H1 of your page is the Actor name, so you don't have to set that up. Don't add more H1s. README headings should be H2 or H3. H2 headings will make up the table of contents on the right. So if you don't want the table to be too crowded, keep the H2s to the basics and push all the longer phrases and questions to H3s. H3s will stay hidden in the accordion in the default state until the visitor hovers their cursor over it. H4 headings can also be included, of course, but they won't show up as a part of the table of contents. + +### Keyword opportunities + +Do SEO research for keywords and see how they can fit organically into the text. Prioritize the H2s and H3s, then the regular text. Add new keyword-heavy paragraphs if you see an opportunity. + +The easiest sections to include keywords in are, for example: + +- API, as in Instagram API +- data, as in extract Instagram data +- Python, as in extract data in Python +- scrape, as in how to scrape X +- scraping, as in scraping X + +Now, could every H2 just say exactly what it is about, without SEO? Of course. You don't have to optimize your H2s and H3s, and you're free to call them simply Features, How it works, Pricing, Support, etc., or not even have many H2s at all and keep it all as one page. + +However, the H2s and H3s are what sometimes get into Google Search results. If you're familiar with the People Also Ask section, that's the best place to match your H2s. They can also get highlighted in the Sitelinks of Google Search results. + +Any part of your README can make it onto Google pages: the intro sentence describing what your Actor is about, a video, a random question. Each one can become a good candidate for those prime Google pages. That's why it's important to structure and write your README with SEO in mind. + +### Importance of including a video + +If your page has a video, it has a better chance of ranking higher in Google. + +## README and input schema + +The README should serve as a fallback for your users if something isn't immediately obvious in the input schema. There's also only so much space in the input schema and the tooltips, so naturally, if you want to provide more details about something, e.g. input, formatting, or expectations, you should put it in the README and refer to it from the relevant place in the input schema.
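+
+For example, a field description in the input schema can link straight to the relevant README section. This is a sketch of one field from a schema's `properties` object; the field name and URL are illustrative:
+
+```json
+{
+  "startUrls": {
+    "title": "Start URLs",
+    "type": "array",
+    "editor": "requestListSources",
+    "description": "Accepted URL formats are explained in <a href='https://apify.com/your-name/your-actor#input' target='_blank'>the README</a>."
+  }
+}
+```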
+ +Learn about [How to create a great input schema](/academy/actor-marketing-playbook/product-optimization/how-to-create-a-great-input-schema) + +## Readme elements template + +1. What does (Actor name) do? + - in 1–2 sentences describe what the Actor does and what it does not do + - consider adding keywords like API, e.g. Instagram API + - always have a link to the target website in this section +2. Why use (Actor name)? or Why scrape (target site)? + - how it can be beneficial for the user + - business use cases + - link to a success story, a business use case, or a blog post +3. How to scrape (target site) + - link to "How to…" blogs, if one exists (or suggest one if it doesn't) + - add a video tutorial or GIF from an ideal Actor run + +:::tip Embedding YouTube videos + +For better user experience, Apify Console automatically renders every YouTube URL as an embedded video player. Simply add a separate line with the URL of your YouTube video. + +::: + +- Consider adding a short numbered tutorial, as Google will sometimes pick these up as rich snippets. Remember that this might be in search results, so you can repeat the name of the Actor and give a link, e.g. + +4. Is it legal to scrape (target site)? + - This can be used as boilerplate text for the legal section, but you should use your own judgment and also customize it with the site name. + + > Our scrapers are ethical and do not extract any private user data, such as email addresses, gender, or location. They only extract what the user has chosen to share publicly. We therefore believe that our scrapers, when used for ethical purposes by Apify users, are safe. However, you should be aware that your results could contain personal data. Personal data is protected by the GDPR in the European Union and by other regulations around the world. You should not scrape personal data unless you have a legitimate reason to do so. If you're unsure whether your reason is legitimate, consult your lawyers. You can also read our blog post on the legality of web scraping. + > +5. Input + - Each Actor detail page has an input tab, so you just need to refer to that. If you like, you can add a screenshot showing the user what the input fields will look like. + - This is an example of how to refer to the input tab: + + > Twitter Scraper has the following input options. Click on the input tab for more information. + > +6. Output + - Mention “You can download the dataset extracted by (Actor name) in various formats such as JSON, HTML, CSV, or Excel.” + - Add a simplified JSON dataset example, like here: https://apify.com/compass/crawler-google-places#output-example +7. Tips or Advanced options section + - Share any tips on how to best run the Actor, such as how to limit compute unit usage, get more accurate results, or improve speed. + +If you want some general tips on how to make a GitHub README that stands out, check out these guides. Not everything in there will be suitable for an Apify Actor README, so you should cherry-pick what you like and use your imagination. + +## Resources + +[Build a Stunning README For Your GitHub Profile](https://towardsdatascience.com/build-a-stunning-readme-for-your-github-profile-9b80434fe5d7) + +[How to Create a Beautiful README for Your GitHub Profile](https://yushi95.medium.com/how-to-create-a-beautiful-readme-for-your-github-profile-36957caa711c) + + + +--- +title: Importance of Actor URL +description: Learn how to set your Actor’s URL (technical name) and name effectively when creating it on Apify.
Follow best practices to optimize your Actor’s web presence and ensure it stands out on Apify Store. +sidebar_position: 2 +category: apify platform +slug: /actor-marketing-playbook/actor-basics/importance-of-actor-url +--- + +**The Actor URL (or technical name, as we call it) is the page URL of the Actor shown on the web. When you're creating an Actor, you can set the URL yourself along with the Actor name. Here are best practices on how to do it well.** + +![actor url example](images/what-is-actor-url.png) + +--- + +## Why is the Actor URL so important? + +The Actor URL plays a crucial role in SEO. Google doesn't just read the Actor's name or README; it also analyzes the URL. The _URL is one of the first signals to Google about the content of your page_: whether it's a product listing, a tool, a blog post, a landing page for a specific offering, or something else entirely. Therefore, it's important to know how to use this shorthand to your advantage and clearly communicate to Google what your page offers. + +:::tip Choose the URL carefully + +This part of the manual is only applicable to new Actors. _Once set, existing Actor URLs shouldn't change_. + +::: + +## How to choose a URL + +The right naming can propel or hinder the success of the Actor on Google Search. Just as naming your Actor is important, so is choosing its URL. The only difference is, once set, the URL is intended to be permanent (more on this [later](/academy/actor-marketing-playbook/actor-basics/importance-of-actor-url)). What's the formula for the best Actor URL? + +### Brainstorming + +What does your Actor do? Does it scrape, find, extract, automate, connect? Think of these when you are looking for a name. You might already have a code name in mind, but it’s essential to ensure it stands out and is distinct from similar names—both on Google and on Apify Store. + +### Matching URL and name + +The easiest way is to make sure the Actor name and the technical name match. As in TikTok Scraper (tiktok-scraper) or Facebook Data Extractor (facebook-data-extractor). But they can also be different. + +### SEO + +The name should reflect not only what the Actor does (or what website it targets), but also what words people use when they search for it. This is why it's also important to do SEO research to see which keywords work best for the topic. Ideally, the URL should include a keyword that has low complexity (low competition) but high traffic (high demand). + +Learn more about SEO research and the best tools for it here: [SEO](/academy/actor-marketing-playbook/promote-your-actor/seo) + +### Inspiration in Apify Store + +Explore Store URLs of similar Actors. But avoid naming your Actor too similarly to what already exists, for these two reasons: + +1. There’s evidence that new URLs that are similar to existing ones can have drastically different levels of success. The first URL might thrive while a similar one published later struggles to gain traction. For example, _onedev/pentagon-scraper_ was published first and has almost 100x the traction of _justanotherdev/pentagon-scraper_. It will be very hard for the latter to beat the former. The reason for this is that Google operates on a “first come, first served” basis, and once that's set, it is very hard to make Google change its ways and pay attention to new pages with a similar name. +2. As Apify Store is growing, it's important to differentiate yourself from the competition. A different URL is just one more way to do that.
If a person is doing research on Apify Store, they will be less likely to get confused between two tools with the same name. + +### Length of URL + +Ideally, keep it under four words. As in _Facebook Data Extractor_ (_facebook-data-extractor_), not (_facebook-data-meta-online-extractor-light_). If the name is long and you're trying to match it with your URL, keep only the most essential words for the URL. + +### Variations + +It can be a long-tail keyword with the tool type in it: scraper, finder, extractor. But you can also consider keywords that include terms like API, data, and even variations of the website name. Check out what keywords competitors outside of Apify Store are using for similar tools. + +### Nouns and adjectives + +One last tip on this topic is to _avoid adjectives and verbs_. Your page is about a tool, so keep it to nouns. Anything regarding what the tool does (scrape, automate, import) and what it's like (fast, light, best) can be expressed in the Actor's name, not the Actor's URL. Adding an adjective or verb like that either does nothing for SEO or might even damage the SEO chances of the page. + +## Why you shouldn’t change your Actor URL + +:::tip Don't change the URL + +There's only one rule about the Actor URL: don't change it. The Actor's name, however, can be changed without any problems. + +::: + +Once set, the page URL should not be changed, for two important reasons: + +- Google dislikes changes to URLs. Once your Actor has built up keyword associations and familiarity with Google, regaining that standing after a URL change can be challenging. You will have to start from scratch. +- Current integrations will break for your Actor's users. Keeping the URL stable is essential for maintaining functionality. + +If you absolutely have to change the URL, you will have to communicate that fact to your users. + +💡 Learn more about the easiest ways to communicate with your users: [Emails to Actor users](/academy/actor-marketing-playbook/interact-with-users/emails-to-actor-users) + +## How and where to set the Actor URL + +In Apify Console, open the **Actor's page**, then click on **…** in the top right corner, and choose ✎ **Edit name or description**. Then set the URL in the **Unique name** ✎ field and click **Save**. + +![set actor url in console](images/how-and-where-to-set-the-actor-url-console.png) + +![set the actor url](images/how-and-where-to-set-the-actor-url.png) + + +## FAQ + +#### Can the Actor URL be different from the Actor name? + +Yes. While they can be the same, they don’t have to be. For the best user experience, keeping them identical is recommended, but you can experiment with the Actor's name. Just avoid changing the Actor URL. + +#### Can I change a very fresh Actor URL? + +Yes, but act quickly. It takes Google a few days to start recognizing your page. For this reason, if you really have to, _it is best to change the Actor's URL in the first few days_, before you build a steady user base and rapport with Google. + +#### How long does it take Google to pick up on the new URL? + +Google reindexes Apify web pages almost every day. It might take anywhere from 3 to 7 days for it to pick up a new URL. Or it might happen within a day. + +#### Can I use a technical name identical to another Actor's? + +Yes, you can. But it will most likely lower your chances of being noticed by Google. + +#### Does changing my Apify account name affect the Actor URL? + +Yes. If you're changing from _justanotherdev/pentagon-scraper_ to _dev/pentagon-scraper_, it counts as a new page.
Essentially, the consequences are the same as after changing the technical name of the Actor. + + + +--- +title: Name your Actor +description: Learn Apify’s standards for naming Actors and how to choose the right name for your scraping and automation tools and maximize visibility on Apify Store. +sidebar_position: 1 +category: apify platform +slug: /actor-marketing-playbook/actor-basics/name-your-actor +--- + +**Apify's standards for Actor naming. Learn how to choose the right name for scraping and automation Actors and how to optimize your Actor for search engines.** + +--- + +Naming your Actor can be tricky, especially after you’ve worked hard on it. To help people find your Actor and make it stand out, we’ve set some naming guidelines. These will help your Actor rank better on Google and keep things consistent on [Apify Store](https://apify.com/store). + +Ideally, you should choose a name that clearly shows what your Actor does and includes keywords people might use to search for it. + +## Parts of Actor naming + +Your Actor's name consists of four parts: the actual name, SEO name, URL, and GitHub repository name. + +- Actor name (name shown in Apify Store), e.g. _Booking Scraper_. + - Actor SEO name (name shown on Google Search, optional), e.g. _Booking.com Hotel Data Scraper_. + - If the SEO name is not set, the Actor name will be the default name shown on Google. +- Actor URL (technical name), e.g. _booking-scraper_. + - More on it on the [Importance of Actor URL](/academy/actor-marketing-playbook/actor-basics/importance-of-actor-url) page. +- GitHub repository name (best to keep it similar to the other ones, for convenience), e.g. _actor-booking-scraper_. + +## Actor name + +The Actor name provides a human-readable name. The name is the most important real estate from an SEO standpoint. It should exactly match the most likely search query that potential users of your Actor will use. At the same time, it should give your Actor a clear name for people who will use it every day. + +:::tip + +Your Actor's name should be _40–50 characters_ long. You can change your Actor name freely in Apify Console. + +::: + +### Actor name vs. SEO name + +There's an option to step away from your Actor's name for the sake of search engine optimization — the Actor SEO name. The Actor name and Actor SEO name serve different purposes: + +- _Actor name_: this is the name visible in Apify Store and Console. It should be easy for users to understand and quickly show what your Actor does. It’s about attracting users who browse the Store. + + ![actor name example](images/actor-name.png) + +- _Actor SEO name_: this is the name that appears in search engine results. It should include keywords people might search for to find your Actor. It’s about improving visibility on search engines and encouraging users to click on your link. + + ![actor seo name example](images/actor-seo-name.png) + +For example: + +- _Actor name_: YouTube Scraper +- _Actor SEO name_: YouTube data extraction tool for video analysis + +Here, the SEO name uses extra keywords to help people find it through search engines, while the Actor name is simpler and easier for users to understand and find on Apify Store. + +💡 When creating the SEO name, focus on using relevant keywords that potential users might search for. It should still match what your Actor does. More about SEO name and description: [Actor description and SEO description] + +### Actor name vs. technical name
+ +The Actor name and technical name (or URL) have different uses: + +- _Actor name_: this is the name users see on Apify Store and Console. It’s designed to be user-friendly and should make the Actor's purpose clear to anyone browsing or searching for it. +- _Technical name_: this is a simplified, URL-friendly version used in technical contexts like API calls and scripts. This name should be concise and easily readable. Once set, it should not be changed, as that can affect existing integrations and cause broken links. + +For example: + +- _Actor name_: Google Search Scraper +- _Technical name_: google-search-scraper + +The Actor name is user-friendly and descriptive, while the technical name is a clean, URL-compatible version. Note that the technical name does not include spaces or special characters to ensure it functions properly in technical contexts. + +:::important + +This is important for SEO! Once set, the technical name should not be changed. Make sure you finalize this name early in development. More on why here: [Importance of Actor URL](/academy/actor-marketing-playbook/actor-basics/importance-of-actor-url) + +::: + +## Best practices for naming + +### Brainstorming + +What does your Actor do? Does it scrape, find, extract, automate, connect, or upload? When choosing a name, ensure it stands out and is distinct from similar names both on Google and on Apify Store. + +- _Use nouns and variations_: use nouns like “scraper”, “extractor”, “downloader”, “checker”, or “API” to describe what your Actor does. You can also include terms like API, data, or variations of the website name. +- _Include key features_: mention unique features or benefits to highlight what sets your Actor apart. +- _Check for uniqueness_: ensure your name isn’t too similar to existing Actors to avoid confusion and help with SEO. + +### Match name and URL + +The simplest approach is to make all names match. For example, TikTok Ads Scraper (tiktok-ads-scraper) or Facebook Data Extractor (facebook-data-extractor). However, variations are acceptable. + +### Name length + +Keep the name concise, ideally fewer than four words. For instance, Facebook Data Extractor is preferable to Facebook Meta Data Extractor Light. + +### Check Apify Store for inspiration + +Look at the names of similar Actors on Apify Store, but avoid naming your Actor too similarly. By choosing a unique name, you can stand out from the competition. This will also reduce confusion and help users easily distinguish your Actor. + +### Keep SEO in mind + +Even though you can set a different variation for the SEO name specifically, consider doing a bit of research when setting the regular name as well. The name should reflect what the Actor does and the keywords people use when searching for it. If the keywords you find sound too robotic, save them for the SEO name. But if they sound like something you'd search for, they're good candidates for a name. + +You can also check the keywords competitors use for similar tools outside Apify Store. + +### Occasionally experiment + +You can test and refine your SEO assumptions by occasionally changing the SEO name. This allows you to track how changes to names affect search rankings and user engagement. Changing the regular name is not forbidden but still less desirable, since it can confuse your existing users and also affect SEO. + +## Naming examples + +### Scraping Actors + +✅: + +- Technical name (Actor's name in the [Apify Console](https://console.apify.com/)): `${domain}-scraper`, e.g. youtube-scraper. +- Actor name: `${Domain} Scraper`, e.g. YouTube Scraper.
+- Name of the GitHub repository: `actor-${domain}-scraper`, e.g. actor-youtube-scraper. + +❌: + +- Technical name: `the-scraper-of-${domain}`, e.g. the-scraper-of-youtube. +- Actor name: `The Scraper of ${Domain}`, e.g. The Scraper of YouTube. +- GitHub repository: `actor-the-scraper-of-${domain}`, e.g. actor-the-scraper-of-youtube. + +If your Actor only caters to a specific service on a domain (and you don't plan on extending it), add the service to the Actor's name. + +For example: + +- Technical name: `${domain}-${service}-scraper`, e.g. google-search-scraper. +- Actor name: `${Domain} ${Service} Scraper`, e.g. [Google Search Scraper](https://apify.com/apify/google-search-scraper). +- GitHub repository: `actor-${domain}-${service}-scraper`, e.g. actor-google-search-scraper. + +### Non-scraping Actors + +Naming for non-scraping Actors is more liberal. Being creative and considering SEO and user experience is a good place to start. Think about what your users will type into a search engine when looking for your Actor. What is your Actor's function? + +Below are examples for the [Google Sheets](https://apify.com/lukaskrivka/google-sheets) Actor. + +✅: + +- Technical name: google-sheets. +- Actor name: Google Sheets Import & Export. +- GitHub repository: actor-google-sheets. + +❌: + +- Technical name: import-to-and-export-from-google-sheets. +- Actor name: Actor for Importing to and Exporting from Google Sheets. +- GitHub repository: actor-for-import-and-export-google-sheets. + +:::warning Renaming your Actor + +You may rename your Actor freely, except when it comes to the Actor URL. Remember to read [Importance of Actor URL](/academy/actor-marketing-playbook/actor-basics/importance-of-actor-url) to find out why! + +::: + + + +label: Interact with users +position: 4 + + + +--- +title: Emails to Actor users +description: Email communication is a key tool to keep users engaged and satisfied. Learn when and how to email your users effectively to build loyalty and strengthen relationships with this practical guide. +sidebar_position: 1 +category: apify platform +slug: /actor-marketing-playbook/interact-with-users/emails-to-actor-users +--- + +**Getting users is one thing, but keeping them is another. While emailing your users might not seem like a typical marketing task, any seasoned marketer will tell you it’s essential. It’s much easier to keep your current users happy and engaged than to find new ones. This guide will help you understand when and how to email your users effectively.** + +--- + +## Whom and where to email + +You can email the audience of a specific Actor directly from Apify Console. Go to **Actors > Emails > Compose new +**. From there, select the Actor whose users you want to email, write a subject line, and craft your message. An automatic signature will be added to the end of your email. + +## How to write a good email + +Emails can include text, formatting, images, GIFs, and links. Here are four main rules for crafting effective emails: + +1. Don’t email users without a clear purpose. +2. Keep your message concise and friendly. +3. Make the subject line direct and to the point. Consider adding an emoji to give users a hint about the email’s content. +4. Use formatting to your advantage. Console emails support Markdown, so use bold, italics, and lists to highlight important details. + +Additional tips: + +- Show, don’t tell — use screenshots with arrows to illustrate your points. +- If you’re asking users to take action, include a direct link to what you're referring to.
+- Provide alternatives if it suits the situation. +- Always send a preview to yourself before sending the email to all your users. + +## When to email users + +Our general policy is to avoid spamming users with unnecessary emails. We contact them only if there's a valid reason. Here’s a list of common good reasons to contact the users of an Actor: + +### 1. Introducing a new feature of the Actor + +A new filter, faster scraping, changes in the input or output schema, a new integration, etc. + +>✉️ 🏙️ Introducing Deep city search for Tripadvisor scrapers +> +>Hi, +> +>Tired of Tripadvisor's 3000 hotels-per-search limit? We've got your back. Say hello to our latest baked-in feature: Deep city search. Now, to get all results from a country-wide search, you just need to set Max search results above 3000 and watch the magic happen. +> +>A bit of context: while Tripadvisor never limited the search for restaurants or attractions, hotel search was a different case; it always capped at 3000. Our smart search is designed to overcome that limit by including every city within your chosen location. We scrape hotels from each one, ensuring no hidden gems slip through the cracks. This feature is available for [Tripadvisor Scraper](https://console.apify.com/actors/dbEyMBriog95Fv8CW/console) and [Tripadvisor Hotels Scraper](https://console.apify.com/actors/qx7G70MC4WBE273SM/console). +> +>So get ready for an unbeatable hotel-hunting experience. Give it a spin, and let us know what you think! + +Introduce and explain the features, add a screenshot of a feature if it will show in the input schema, and ask for feedback. + +### 2. Actor adapting to the changes of the website it scrapes + +A common situation in web scraping that's out of your control. + +>✉️ 📣 Output changes for Facebook Ads Scraper +> +>Hi, +> +>We've got some news regarding your favorite Actor – [Facebook Ads Scraper](https://console.apify.com/actors/JJghSZmShuco4j9gJ/console). Recently, Facebook Ads have changed their data format. To keep our Actor running smoothly, we'll be adapting to these changes by slightly tweaking the Actor Output. Don't worry; it's a breeze! Some of the output data might just appear under new titles. +> +>This change will take place on October 10; please make sure to remap your integrations accordingly. +> +>Need a hand or have questions? Our support team is just one friendly message away. + +Inform users about the reason for the changes and how they impact them and the Actor + give them a date when the change takes effect. + +### 3. Actor changing its payment model (from rental to pay-per-result, for example) + +Email 1 (before the change, warning about deprecation). + +>✉️ 🛎 Changes to Booking Scraper +> +>Hi, +> +>We’ve got news regarding the Booking scraper you have been using. This change will happen in two steps: +> +>1. On September 22, we will deprecate it, i.e., new users will not be able to find it in Store. You will still be able to use it though. +>2. At the end of October, we will unpublish this Actor, and from that point on, you will not be able to use it anymore. +> +>Please use this time to change your integrations to our new [Booking Scraper](https://apify.com/voyager/booking-scraper). +> +>That’s it! If you have any questions or need more information, don’t hesitate to reach out. + +Warn the users about the deprecation and future unpublishing + add extra information about related Actors if applicable + give them steps and the date when the change takes effect.
+ +Email 2 (after the change, warning about unpublishing) + +>✉️ **📢 Deprecated Booking Scraper will stop working as announced 📢** +> +>Hi, +> +>Just a heads-up: today, the deprecated [Booking Scraper](https://console.apify.com/actors/5T5NTHWpvetjeRo3i/console) you have been using will be completely unpublished as announced, and you will not be able to use it anymore. +> +>If you want to continue to scrape Booking.com, make sure to switch to the [latest Actor version](https://apify.com/voyager/booking-scraper). +> +>For any assistance or questions, don't hesitate to reach out to our support team. + +Remind users to switch to the Actor with a new model. + +### 4. After a major issue + +Actor downtime, performance issues, Actor directly influenced by platform hiccups. + +>✉️ **🛠️ Update on Google Maps Scraper: fixed and ready to go** +> +>Hi, +> +>We've got a quick update on the Google Maps Scraper for you. If you've been running the Actor this week, you might have noticed some hiccups — scraping was failing for certain places, causing retries and overall slowness. +> +>We apologize for any inconvenience this may have caused you. The **good news is those performance issues are now resolved**. So feel free to resurrect any affected runs using the "latest" build; it should work like a charm now. +> +>Need a hand or have questions? Feel free to reply to this email. + +Apologize to users and/or let them know you're working on it or that everything is fixed now. This approach helps maintain trust and reassures users that you're addressing the situation. + +:::tip + +It might be an obvious tip, but if you're not great at emails, just write a short draft and ask ChatGPT to polish it. Play with the style until you find the one that suits you. You can even create templates for each situation. If ChatGPT is being too wordy, you can ask it to write at a 9th- or 10th-grade level, and it will use simpler words and sentences. + +::: + +## Emails vs. newsletters + +While sending an email is usually a quick way to address immediate needs or support for your users, newsletters can be a great way to keep everyone in the loop on a regular basis. Instead of reaching out every time something small happens, newsletters let you bundle updates together. + +Unless it's urgent, it’s better to wait until you have 2 or 3 pieces of news and share them all at once. Even if those updates span different Actors, it’s perfectly fine to send one newsletter to all relevant users. + +Here are a few things you can include in your newsletter: + +- updates or new features for your Actors or Actor-to-Actor integrations +- an invitation to a live webinar or tutorial session +- asking your users to upvote your Actor, leave a review, or give it a star +- a quick feedback request after introducing new features +- spotlighting a helpful blog post or guide you wrote or found +- sharing success stories or use cases from other users +- announcing a promotion or a limited-time discount +- links to your latest YouTube videos or tutorials + +Newsletters are a great way to keep your users engaged without overwhelming them. Plus, it's an opportunity to build a more personal connection by showing them you’re actively working to improve the tools they rely on. + +## Emailing a separate user + +There may be times when you need to reach out to a specific user — whether it’s to address a unique situation, ask a question that doesn’t fit the public forum of the **Issues tab**, or explore a collaboration opportunity.
While there isn’t a quick way to do this through Apify Console just yet, you can ensure users can contact you by **adding your email or other contact info to your Store bio**. This makes it easy for them to reach out directly. + +✍🏻 Learn best practices on how to use your Store bio to connect with your users: [Your Store bio](/academy/actor-marketing-playbook/interact-with-users/your-store-bio). + + + +--- +title: Issues tab +description: Learn how the Issues tab can help you improve your Actor, engage with users, and build a reliable, user-friendly solution. +sidebar_position: 2 +category: apify platform +slug: /actor-marketing-playbook/interact-with-users/issues-tab +--- + +**Once you publish your Actor in Apify Store, it opens the door to new users, feedback, and… issue reports. Users can create issues and add comments after trying your Actor. But why is this space so important?** + +--- + +## What is the Issues tab? + +The Issues tab is a dedicated section on your Actor’s page where signed-in users can report problems, share feedback, ask questions, and have conversations with you. You can manage each issue thread individually, and the whole thread is visible to everyone. The tab is divided into three categories: **Open**, **Closed**, and **All**, and it shows how long ago each response was posted. While only signed-in users can post and reply, all visitors can see the interactions, giving your page a transparent and welcoming vibe. + +:::note Keep active + +On the web, your average 🕑 **Response time** is calculated and shown in your Actor Metrics. The purpose of this metric is to make it easy for potential users to see how active you are and how well-maintained the Actor is. + +::: + +You can view all the issues related to your Actors by going to **Actors** > [**Issues**](https://console.apify.com/actors?tab=issues) in Apify Console. Users can get automatic updates on their reported issues or subscribe to issues they are interested in, so they stay informed about any responses. When users report an issue, they’re encouraged to share their run, which helps you get the full context and solve the problem more efficiently. Note that shared runs aren’t visible on the public Actor page. + +## What is the Issues tab for? + +The tab is a series of conversations between you and your users. There are existing systems like GitHub for that. So why create a separate system like the Issues tab? Since the Issues tab exists both in private space (Console) and public space (the Actor's page on the web), it can fulfill two different sets of purposes. + +### Issues tab in Apify Console + +Originally, the Issues tab was only available in Apify Console, and its main goals were: + +- Convenience: a single space to hold the communication between you and your users. +- Unity and efficiency: make sure multiple users don't submit the same issue through multiple channels or multiple times. +- Transparency: make sure users have their issues addressed publicly and professionally. You can’t delete issues, only close them, so there's a clear record of what's been resolved and how. +- Quality of service and innovation: make sure the Actor gets fixed and continuously improved, and users get the quality scraping services they pay for. + +### Issues tab on the web + +Now that the Issues tab is public and on the web, it also serves other goals: + +- Credibility: new users can check how active and reliable you are by looking at the issues and your average 🕑 **Response time** even before trying your Actor.
It also sets expectations for when users can expect a response from you. +- Collaboration: developers can learn from each other’s support styles, which motivates everyone to maintain good interactions and keep up good quality work. +- SEO boost: every issue now generates its own URL, potentially driving more keyword traffic to your Actor's page. + +## Example of a well-managed Issues tab + +Check out how the team behind the **Apollo.io leads scraper** manages their [Issues tab](https://apify.com/curious_coder/apollo-io-scraper/issues/open) for a great example of professional responses and quick problem-solving. + +Note that this Actor is a rental, so users expect a high-quality service. + +![issues tab example](images/issues-tab-example.png) + +:::warning + +Once your Actor is public, you’re required to have an Issues tab. + +::: + +## SEO for the Issues tab + +Yes, you read that right! The public Issues tab can boost your search engine visibility. Each issue now has its own URL, which means every report could help your Actor rank for relevant keywords. + +When we made the tab public, we took inspiration from StackOverflow’s SEO strategy. Even though StackOverflow started as a Q&A forum, its strong SEO has been key to its success. Similarly, your Actor’s Issues tab can help bring in more traffic, with each question and answer potentially generating more visibility. This makes it easier for users to find solutions quickly. + +## Tips for handling Actor issues + +1. _Don’t stay silent_ + + Respond quickly, even if it’s just a short note. If an issue takes weeks to resolve, keep the user in the loop. A quick update prevents frustration and shows the user (and others following it) that you’re actively working on solving the issue. + +2. _Encourage search to avoid duplication_ + + Save time by encouraging users to search for existing issues before submitting new ones. If a similar issue exists, they can follow that thread for updates instead of creating a new one. + +3. _Encourage reporters to be specific_ + + The more context, the better! Ask users to share details about their run, which helps you diagnose issues faster. If needed, remind them that runs are shared privately, so sensitive data won’t be exposed. + +4. _Use screenshots and links_ + + The same goes for your side. Screenshots and links to specific runs make your answers much clearer. It’s easier to walk the user through a solution if they can see what you’re referencing. + +5. _Structure issue reporting_ + + As you get more experienced, you’ll notice common types of issues: bugs, feature requests, questions, reports, misc. This way, you can prioritize and respond faster based on the category. + +6. _Have ready answers for common categories_ + + Once you recognize recurring types of issues, have pre-prepared responses. For example, if it’s a bug report, you might already have a troubleshooting guide you can link to, or if it’s a feature request, you can figure out the development timeline. + +7. _Be polite and precise_ + + Politeness goes a long way! Make sure your responses are respectful and straight to the point. It helps to keep things professional, even if the issue seems minor. + + +Further reading: [Best practices for using GitHub Issues](https://rewind.com/blog/best-practices-for-using-github-issues/) + + + +--- +title: Your Store bio +description: Your Apify Store bio is all about helping you promote your tools & skills.
+sidebar_position: 3 +category: apify platform +slug: /actor-marketing-playbook/interact-with-users/your-store-bio +--- + +## Your Apify Store bio and Store “README” + +To help our community showcase their talents and projects, we introduced public profile pages for developers. On a dedicated page, you can showcase your contact info, a summary of important Actor metrics (like total users, response time, and success rates), and all of your public Actors. We took inspiration from freelance platforms. + +This space is all about helping you shine and promote your tools and skills. Here’s how you can use it to your advantage: + +- Share your contact email, website, GitHub, X (Twitter), LinkedIn, or Discord handles. +- Summarize what you’ve been doing in Apify Store, your main skills, big achievements, and any relevant experience. +- Offer more ways for people to connect with you, such as links for booking a meeting, discounts, a subscription option for your email newsletter, or your YouTube channel or blog. + - You can even add a Linktree to keep things neat. +- Highlight your other tools on different platforms. +- Get creative by adding banners and GIFs to give your profile some personality. + +Everything is neatly available under a single URL, making it easy to share. + +Need some inspiration? Check out examples of how others are using their Store bio and README. You can set yours up by heading to **Settings > Account > Profile**. + +- [https://apify.com/anchor](https://apify.com/anchor) +- [https://apify.com/jupri](https://apify.com/jupri) +- [https://apify.com/apidojo](https://apify.com/apidojo) +- [https://apify.com/curious_coder](https://apify.com/curious_coder) +- [https://apify.com/epctex](https://apify.com/epctex) +- [https://apify.com/microworlds](https://apify.com/microworlds) + + + +label: Product optimization +position: 5 + + + +--- +title: Actor bundles +description: Learn what an Actor bundle is, explore existing examples, and discover how to promote them. +sidebar_position: 2 +category: apify platform +slug: /actor-marketing-playbook/product-optimization/actor-bundles +--- + +**Learn what an Actor bundle is, explore existing examples, and discover how to promote them.** + +--- + +## What is an Actor bundle? + +If an Actor is an example of web automation software, what is an Actor bundle? An Actor bundle is basically a chain of multiple Actors unified by a common use case. Bundles can include both scrapers and automation tools, and they are usually designed to achieve an overarching goal related to scraping or automation. + +The concept of an Actor bundle originated from frequent customer requests for comprehensive tools. For example, someone would ask for a Twitter scraper that also performs additional tasks, or for a way to find all profiles of the same public figure across multiple social media platforms without needing to use each platform separately. + +For example, consider a bundle that scrapes company reviews from multiple platforms, such as Glassdoor, LinkedIn, and Indeed. Typically, you would need to use several different scrapers and then consolidate the results. But this bundle would do it all in one run, once provided with the name of the company. Or consider a bundle that scrapes all posts and comments of a given profile, and then produces a sentiment score for each scraped comment. + +The main advantage of an Actor bundle is its ease of use.
The user inputs a keyword or a URL, and the Actor triggers all the necessary Actors sequentially to achieve the desired result. The user is not expected to use each Actor separately and then process and filter the results themselves. + +### Examples of bundles + +🔍 [Social Media Finder](https://apify.com/tri_angle/social-media-finder) searches for profiles on 13 social media sites when provided with just a (nick)name. + +🍝 [Restaurant Review Aggregator](https://apify.com/tri_angle/restaurant-review-aggregator) gets restaurant reviews from Google Maps, DoorDash, Uber Eats, Yelp, Tripadvisor, and Facebook in one place. + +🤔 [Social Media Sentiment Analysis Tool](https://apify.com/tri_angle/social-media-sentiment-analysis-tool) not only collects comments from Facebook, Instagram, and TikTok but also performs sentiment analysis on them. It unites post scrapers, comment scrapers, and a text analysis tool. + +🦾 [Website Content Crawler + Pinecone bundle](https://apify.com/tri_angle/wcc-pinecone-integration) scrapes a website and stores the data in a Pinecone database to build and improve your own AI chatbot assistant. + +🤖 [Pinecone GPT Chatbot](https://apify.com/tri_angle/pinecone-gpt-chatbot) combines OpenAI's GPT models with Pinecone's vector database, which simplifies creating a GPT chatbot. + +As you can see, they vary in complexity and range. + +--- + +## Caveats + +### Pricing model + +Since bundles are still relatively experimental, profitability is not guaranteed and will depend heavily on the complexity of the bundle. + +However, if you have a solid idea for a bundle, don’t hesitate to reach out. Prepare your case, write to our support team, and we’ll help determine if it’s worth it. + +### Specifics of bundle promotion + +First of all, when playing with the idea of creating a bundle, always check the keyword potential. Sometimes, there are true keyword gems just waiting to be discovered, with high search volume and little competition. + +However, bundles may face the challenge of being "top-of-the-funnel" solutions. People might not search for them directly because they don't have a specific keyword in mind. For instance, someone is more likely to search for an Instagram comment scraper than to imagine a bundle that scrapes comments from 10 different platforms, including Instagram. + +Additionally, Google tends to favor tools with rather focused descriptions. If your tool offers multiple functions, it can send mixed signals that may conflict with each other rather than accumulate. + +Sometimes, even though a bundle can be a very innovative tool product-wise, it can be hard to market from an SEO perspective and match the search intent. + +In such cases, you may need to try different marketing and promotion strategies. Once you’ve exhausted every angle of SEO research, be prepared to explore non-organic marketing channels like Product Hunt, email campaigns, community engagement, Reddit, other social media, your existing customer base, word-of-mouth promotion, etc. + +Remember, bundles originated as customized solutions for specific use cases; they were not primarily designed to be easily found. + +This is also an opportunity to tell a story rather than just presenting a tool. Consider writing a blog post about how you created this tool, recording a video, or hosting a live webinar. If you go this route, it’s important to emphasize how the tool was created and what a technical feat it represents. + +That said, don’t abandon SEO entirely.
You can still capture some SEO value by referencing the bundle in the READMEs of the individual Actors that comprise it. For example, if a bundle collects reviews from multiple platforms, potential users are likely to search for review scrapers for each specific platform—Google Maps reviews scraper, Tripadvisor reviews scraper, Booking reviews scraper, etc. These keywords may not lead directly to your review scraping bundle, but they can guide users to the individual scrapers, where you can then present the bundle as a more comprehensive solution. + +--- + +## Resources + +Learn more about Actor bundles: https://blog.apify.com/apify-power-actors/ + + + +--- +title: How to create a great input schema +description: Optimizing your input schema. Learn to design and refine your input schema with best practices for a better user experience. +sidebar_position: 1 +category: apify platform +slug: /actor-marketing-playbook/product-optimization/how-to-create-a-great-input-schema +--- + +Optimizing your input schema. Learn to design and refine your input schema with best practices for a better user experience. + +--- + +## What is an input schema? + +So you've succeeded: your user has 1. found your Actor on Google, 2. explored the Actor's landing page, 3. decided to try it, and 4. created an Apify account. Now they’re on your Actor's page in Apify Console. The SEO fight is over. What’s next? + +Your user is finally one-on-one with your Actor — specifically, its input schema. This is the moment when they try your Actor and decide whether to stick with it. The input schema is your representative here, and you want it to work in your favor. + +Technically, the input schema is a `JSON` object with various field types supported by the Apify platform, designed to simplify the use of the Actor. Based on the input schema you define, the Apify platform automatically generates a _user interface_ for your Actor. + +Of course, you can create an Actor without setting up an elaborate input schema. If your Actor is designed for users who don't need a good interface (e.g. they’ll use a JSON object and call it via API), you can skip this guide. But most users engage with Actors in Manual mode, aka the Actor interface. So, if your Actor is complex or you’re targeting regular users who need an intuitive interface, it's essential to consider their experience. + +In this article, _we’ll refer to the input schema as the user interface_ of your Actor and focus exclusively on it. + +:::tip Understand input schemas + +To fully understand the recommendations in this blog post, you’ll first need to familiarize yourself with the [technical aspects of the input schema](https://docs.apify.com/platform/actors/development/actor-definition/input-schema). This context is essential to make good use of the insights shared here. + +::: + +## The importance of a good input schema + +Facing the Apify platform for the first time can feel intimidating. You only have a few seconds before a user assesses how easy your Actor is to use. + +If something goes wrong or is unclear with the input, an ideal user will first turn to the tooltips in the input schema. Next, they might check the README or tutorials, and finally, they’ll reach out to you through the **Issues** tab. However, many users won’t go through all these steps — they may simply get overwhelmed and abandon the tool altogether. + +A well-designed input schema is all about managing user expectations, reducing cognitive load, and preventing frustration.
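+
+For a sense of what we're working with, here's a minimal sketch of an input schema (all field names and values are illustrative; see the docs linked above for the full specification). It previews several elements discussed below: a prefill, a toggle, and a section:
+
+```json
+{
+  "title": "Input schema for Example Scraper",
+  "type": "object",
+  "schemaVersion": 1,
+  "properties": {
+    "startUrls": {
+      "title": "Start URLs",
+      "type": "array",
+      "editor": "requestListSources",
+      "description": "Add the URLs you want to scrape.",
+      "prefill": [{ "url": "https://example.com" }]
+    },
+    "maxResults": {
+      "title": "Max results",
+      "type": "integer",
+      "description": "Limit the number of results so the default run stays fast and cheap.",
+      "prefill": 10
+    },
+    "skipClosedPlaces": {
+      "title": "Skip closed places",
+      "type": "boolean",
+      "description": "Filter out places marked as closed.",
+      "default": false,
+      "sectionCaption": "Filters"
+    }
+  },
+  "required": ["startUrls"]
+}
+```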
+Ideally, a good input schema, as your first line of interaction, should: + +- Make the tool as easy to use as possible +- Reduce the user’s cognitive load and make them feel confident about using and paying for it +- Give users enough information and control to figure things out on their own +- Save you time on support by providing clear guidance +- Prevent incorrect or harmful tool usage, like overcharges or scraping personal information by default + +### Reasons to rework an input schema + +- Your Actor is complex and has many input fields +- Your Actor offers multiple ways to set up input (by URL, search, profile, etc.) +- You’re adding new features to your Actor +- Certain uses of the Actor have caveats that need to be communicated immediately +- Users frequently ask questions about specific fields + +👀 The input schema can be formatted using basic HTML. + +## Most important elements of the input schema + +You can see the full list of elements and their technical characteristics in the [Docs](https://docs.apify.com/academy/deploying-your-code/input-schema): titles, tooltips, toggles, prefills, etc. That's not what this guide is about. It's not enough to just create an input schema; you should ideally aim to place and word its elements to the user's advantage: to alleviate the user's cognitive load and make getting to know and using your tool as smooth as possible. + +Unfortunately, when it comes to UX, there's only so much you can achieve armed with HTML alone. So here are the best elements to focus on, along with some best practices for using them effectively: + +- **`description` at the top** + - As the first thing users see, the description needs to provide crucial information and a sense of reassurance if things go wrong. Key points to mention: the easiest way to try the Actor, links to a guide, and any disclaimers or other similar Actors to try. + + ![Input schema description example](images/description-sshot.png) + + - Descriptions can include multiple paragraphs. If you're adding a link, it’s best to use the `target="_blank"` attribute so your user doesn’t lose the original Actor page when clicking. +- **`title` of the field (regular bold text)** + - This is the default way to name a field. + - Keep it brief. The user’s flow should be 1. title → 2. tooltip → 3. link in the tooltip. Ideally, the title alone should provide enough clarity. However, avoid overloading the title with too much information. Instead, make the title as concise as possible, expand details in the tooltip, and include a link in the tooltip for full instructions. + + ![Input schema input example](images/title-sshot.png) + +- **`prefill`, the default input** + - This is your chance to show rather than tell. + - Keep the **prefilled number** low. Set it to 0 if it's irrelevant for a default run. + - Make the **prefilled text** example simple and easy to remember. + - If your Actor accepts various URL formats, add a few different **prefilled URLs** to show that possibility. + - Use the **prefilled date** format that the user is expected to follow. This way, they can learn the correct format without needing to check the tooltip. + - There’s also a type of field that looks like a prefill but isn’t — usually a `default` field. It’s not counted as actual input but serves as a mock input to show users what to type or paste. It is gray and disappears after clicking on it. Use this to your advantage. +- **toggle** + - The toggle is a boolean field. A boolean field represents a yes/no choice.
+ - So how would you word this toggle: **Skip closed places** or **Scrape open places only**? And should the toggle be enabled or disabled by default? + + ![Input schema toggle example](images/toggle-sshot.png) + + - You have to consider this when you're choosing how to word the toggle button and which choice to set up as the default. If you're making this more complex than needed (e.g. by using negation as the ‘yes’ choice), you're increasing your user's cognitive load. You might also cause them to receive far less, or far more, data than they need from a default run. + - In our example, we assume the default user wants to scrape all places but still have the option to filter out closed ones. However, they have to make that choice consciously, so we keep the toggle disabled by default. If the toggle were enabled by default, users might not notice it, leading them to think the tool isn't working properly when it returns fewer results than expected. +- **sections or `sectionCaption` (BIG bold text) and `sectionDescription`** + - A section looks like a wrapped toggle list. + + ![Input schema sections example](images/sections-sshot.png) + + - It is useful to section off non-default ways of input or extra features. If your tool is complex, don't leave all fields in the first section. Just group them by topic and section them off (see the screenshot above ⬆️). + - You can add a description to every section. Use `sectionDescription` only if you need to provide extra information about the section (see the screenshot below ⬇️). + - Sometimes `sectionDescription` is used as a space for disclaimers, so the user is informed of the risks from the outset instead of having to click on the tooltip. + + ![Input schema section description example](images/section-description-sshot.png) + +- tooltips or the `description` of the title + - To see the tooltip's text, the user needs to click on the `?` icon. + - This is your space to explain the title and what's going to happen in that field: any terminology, referrals to other fields of the tool, examples that don't fit the prefill, or caveats can be detailed here. Using HTML, you can add links, line breaks, code, and other regular formatting here. Use this space to add links to relevant guides, video tutorials, screenshots, issues, or README parts if needed. + - Wording in titles vs. tooltips: titles are usually nouns. They have a neutral tone and simply state what content the field accepts (**Usernames**). + - Tooltips to those titles are usually verbs in the imperative that tell the user what to do (_Add, enter, use_). + - This division is not set in stone, but the reason the tooltip uses an imperative verb is that, if the user is clicking on the tooltip, we assume they are looking for clarification or instructions on what to do. + + ![Input schema tooltips example](images/tooltips-sshot.png) + +- emojis (visual component) + - Use them to attract attention or as visual shortcuts. Use emojis consistently to invoke the user's iconic memory. The visual language should match across the whole input schema (and README) so the user can understand what section or field is referred to without reading the whole title. + - Don't overload the schema with emojis. They attract attention, so you need to use them sparingly. + +:::tip + +Read more on the use of emojis: [Actors and emojis] + +::: + +## Example of an improved input schema + +1. A well-used `description` space.
The description briefly introduces the possible scraping options, the visual language (sections represented by emojis), the easiest way to try the tool, and a link to a tutorial in case of issues. The description isn't too long, uses varied formatting, and looks reassuring. +2. The main section is introduced and visually separated from the rest. This is the space for the user to try the first run before they discover the other options. +3. The title says right away that this field refers to multiple other fields, not only the first section. +4. The `prefill` is a small number (so if users run the tool with default settings, it doesn't take too long and isn't expensive for them) and uses the language of the target website (not results or posts, but _videos_). +5. The tooltip expands with more details and refers to the other sections it applies to using matching emojis. +6. Section names are short. Sections are grouped by content type. +7. The more technical parameters lack emojis. They are formatted this way to attract less attention and to visually inform the user that this section is the most optional to set. +8. The visual language is unified across the whole input schema. Emojis are used as a shortcut for the user to understand what section or field is referred to without actually reading the whole title. + +![Input schema example](images/improved-input-schema-example.png) + +### Example of a worse input schema + +The version above was the improved input schema. Here's what this tool's input schema looked like before: + +1. A brief and dry description, with little value for the user, easy to miss. Most likely, the user already knows this info because what this Actor does is described in the Actor SEO description, description, and README. +2. The field title is wordy and reads a bit techie: it uses terminology that's not the most accurate for the target website (_posts_) and limiting terms (_max_). The field applies to scraping by hashtags (the field above) and by profile (the section below). That's an easy detail to miss. +3. The prefilled number is too high. If the user runs the Actor with default settings, they might spend a lot of money, and it will take some time. Users often just leave if an Actor takes a long time to complete on the first try. +4. The tooltip simply reiterates what is said in the title. This could have been avoided if the language of the title weren't so complex. +5. Merging two possible input types into one (profiles and URLs) can cause confusion. It's verbose and reminds the user about an unrelated field (hashtags). +6. This section refers to profiles but is separate. The user has to make an extra effort to scrape profiles: they have to move across three sections (Max posts from section 1, the Profiles input from section 2, and the Date sorting filters from section 3). +7. The proxy and browser section invites users to explore it even though it's not needed for a default run. It's more technical to set up and can give the impression that you need to know how to configure it for the tool to work. + +![Input schema example](images/worse-input-schema.png) + +## Best practices + +1. Keep it short. Don’t rely too much on text; most users prefer to read as little as possible. +2. Use formatting to your advantage (bold, italic, underline), links, and breaks to highlight key points. +3. Use specific terminology (e.g., posts, images, tweets) from the target website instead of generic terms like "results" or "pages." +4. Group related items for clarity and ease of use. +5.
5. Use emojis as shortcuts and visual anchors to guide attention.
6. Avoid technical jargon — keep the language simple.
7. Minimize cognitive load wherever possible.

## Signs and tools for improving input schema

- _User feedback_. If users are asking obvious things, complaining, or consistently making silly mistakes with input, take notes. Feedback from users can help you understand their experience and identify areas for improvement.
- _High churn rates_. If your users are trying your tool but quickly abandoning it, this is a sign they are having difficulties with your schema.
- _Input Schema Viewer_. Write your base schema in any code editor, then copy the file and paste it into the [**Input Schema Viewer**](https://console.apify.com/actors/UHTe5Bcb4OUEkeahZ/source). This tool helps you visualize your input schema before you add it to your Actor and build it. Seeing right away how your edits look in Apify Console will make editing the fields in code easier.

## Resources

- Basics of input schema: [https://docs.apify.com/academy/deploying-your-code/input-schema](https://docs.apify.com/academy/deploying-your-code/input-schema)
- Specifications of input schema: [https://docs.apify.com/platform/actors/development/actor-definition/input-schema](https://docs.apify.com/platform/actors/development/actor-definition/input-schema)



label: Promote your Actor
position: 3



---
title: Blogs and blog resources
description: Blogs are still a powerful way to promote your Actors and build authority. By sharing expertise, engaging users, and driving organic traffic, blogging remains a key strategy to complement social media, SEO, and other platforms in growing your audience.
sidebar_position: 5
category: apify platform
slug: /actor-marketing-playbook/promote-your-actor/blogs-and-blog-resources
---

**Blogs remain a powerful tool for promoting your Actors and establishing authority in the field. With social media, SEO, and other platforms, you might wonder if blogging is still relevant. The answer is a big yes. Writing blog posts can help you engage your users, share expertise, and drive organic traffic to your Actor.**

## Why blogs still matter

1. SEO. Blog posts are great for boosting your Actor’s search engine ranking. Well-written content with relevant keywords can attract users searching for web scraping or automation solutions. For example, a blog post about “how to scrape social media profiles” could drive people to your Actor who might not otherwise find it on Google.
2. Establishing authority. When you write thoughtful, well-researched blog posts, you position yourself as an expert in your niche. This builds trust and makes it more likely users will adopt your Actors.
3. Long-form content. Blogs give you the space to explain the value of your Actor in depth. This is especially useful for complex tools that need more context than what can fit into a README or product description.
4. Driving traffic. Blog posts can be shared across social media, linked in webinars, and included in your Actor’s README. This creates multiple avenues for potential users to discover your Actor.

## Good topics for blog posts

1. Problem-solving guides. Write about the specific problems your Actor solves. For example, if you’ve created an Actor that scrapes e-commerce reviews, write a post titled "How to automate e-commerce review scraping in 5 minutes". Focus on the pain points your tool alleviates.
2. Actor use cases.
Show real-world examples of how your Actor can be applied. These can be case studies or hypothetical scenarios like "Using web scraping to track competitor pricing."
3. Tutorials and step-by-step guides. Tutorials showing how to use your Actor or similar tools are always helpful. Step-by-step guides make it easier for beginners to start using your Actor with minimal hassle.
4. Trends. If you’ve noticed emerging trends in web scraping or automation, write about them. Tie your Actor into these trends to highlight its relevance.
5. Feature announcements or updates. Have you recently added new features to your Actor? Write a blog post explaining how these features work and what makes them valuable.

🪄 These days, blog posts always need to be written with SEO in mind. Yeah, it's annoying to use keywords, but think of it this way: even the most interesting customer story with amazing programming insights won't have the impact you want if nobody can find it. Do try to optimize your posts with relevant keywords and phrases — across text, structure, and even images — to ensure they reach your target audience.

---

## Factors to consider when writing a blog

1. Audience. Know your target audience. Are they developers, small business owners, or data analysts? Tailor your writing to match their technical level and needs.
2. SEO. Incorporate relevant keywords naturally throughout your post. Don’t overstuff your content, but make sure it ranks for search queries like "web scraping tools", "automation solutions", or "how to scrape LinkedIn profiles". Remember to include keywords in H2 and H3 headings.
3. Clarity and simplicity. Avoid jargon, especially if your target audience includes non-technical users. Use simple language to explain how your Actor works and why it’s beneficial.
4. Visuals. Include screenshots, GIFs, or even videos to demonstrate your Actor’s functionality. Visual content makes your blog more engaging and easier to follow.
5. Call to action (CTA). Always end your blog with a clear CTA. Whether it’s "try our Actor today" or "download the demo", guide your readers to the next step.
6. Engage with comments. If readers leave comments or questions, engage with them. Answer their queries and use the feedback to improve both your blog and Actor.

---

## Best places to publish blogs

There are a variety of platforms where you can publish your blog posts to reach the right audience:

1. [Dev.to](http://dev.to/): It's a developer-friendly platform where technical content gets a lot of visibility, and a great place to publish how-to guides, tutorials, and technical breakdowns of your Actor.
2. Medium: Allows you to reach a broader, less technical audience. It’s also good for writing about general topics like automation trends or how to improve data scraping practices.
3. ScrapeDiary: Run by Apify, [scrapediary.com](http://scrapediary.com) is a blog specifically geared toward Apify community devs and web scraping topics. Publishing here is a great way to reach users already interested in scraping and automation. Contact us if you want to publish a blog post there.
4. Personal blogs or company websites. If you have your own blog or a company site, post there. It’s the most direct way to control your content and engage your established audience.

---

## Not-so-obvious SEO tips for blog posts

Everybody knows you should include keywords wherever it looks natural.
Some people know the structure of the blog post should be hierarchical and follow an H1 - H2 - H3 - H4 structure with only one possible H1. Here are some unobvious SEO tips for writing a blog post that can help boost its visibility and ranking potential: + +### 1. Keep URL length concise and strategic + +Optimal length. Keep your URL short and descriptive. URLs between 50-60 characters perform best, so aim for 3-4 words. Avoid unnecessary words like "and", "of", or long prepositions. + +Include keywords. Ensure your primary keyword is naturally integrated into the URL. This signals relevance to both users and search engines. + +Avoid dates. Don’t include dates or numbers in the URL to keep the content evergreen, as dates can make the post seem outdated over time. + +### 2. Feature a video at the top of the post + +Engagement boost. Videos significantly increase the time users spend on a page, positively influencing SEO rankings. Blog posts with videos in them generally do better SEO-wise. + +Thumbnail optimization. Use an optimized thumbnail with a clear title and engaging image to increase click-through rates. + +### 3. Alt text for images with a keyword focus + +Descriptive alt text. Include a short, descriptive alt text for every image with one or two keywords where it makes sense. This also improves accessibility. + +Optimize file names. Name your images with SEO-friendly keywords before uploading (e.g., "web-scraping-tools.png" rather than "IMG12345_screenshot1.png"). This helps search engines understand the content of your images. + +File format and size. Use web-optimized formats like WebP or compressed JPEGs/PNGs to ensure fast page loading, which is a key SEO factor. + +Lazy loading images. Use lazy loading to only load images when the user scrolls to them, reducing initial page load times, which can help your SEO ranking. + +### 4. Interlinking for better user experience and SEO + +Internal links. Use contextual links to other relevant blog posts or product pages on your site. This not only helps with SEO but also keeps users engaged longer on your site, reducing bounce rates. + +Anchor text. When linking internally, use keyword-rich anchor text that describes what users will find on the linked page. + +Content depth. By interlinking, you can show Google that your site has a strong internal structure and is a hub of related, authoritative content. + +### 5. Target the 'People Also Ask' section of Google results with an FAQ + +Answer common questions. Including an FAQ section that answers questions people search for can help you rank in the "People Also Ask" section of Google. Research questions that come up in this feature related to your topic and address them in your content. + +Provide clear, concise answers to the FAQs, typically between 40-60 words, since these match the format used in "People Also Ask". + +Don't bother using FAQ schema. Google doesn't react to those anymore unless you’re a .gov or .edu domain. + +### 6. Optimize for readability and structure + +Short paragraphs and subheadings. Make your blog post easy to scan by using short paragraphs and meaningful subheadings that contain keywords. + +Bullet points and lists. Include bullet points and numbered lists to break up content and make it more digestible. Search engines prioritize well-structured content. + +Readability tools. Use tools like Hemingway Editor or Grammarly to improve readability. Content that is easy to read tends to rank higher, as it keeps readers engaged. 
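
To tie the image tips above together, here's roughly what a single blog image might look like in HTML, with an SEO-friendly file name, short keyword-focused alt text, and lazy loading (the file name and alt text are made-up examples):

```html
<!-- Descriptive WebP file name, short alt text with a keyword, lazy loading -->
<img
  src="/images/web-scraping-tools-comparison.webp"
  alt="Comparison of web scraping tools"
  loading="lazy"
/>
```

Most blogging platforms generate this markup for you, but it's worth checking that your alt text and file names survive the upload.
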
+ +## Referring to blogs in your Actor’s ecosystem + +To drive traffic to your blog and keep users engaged, reference your blog posts across various touchpoints: + +1. README. Add links to your blog posts in your Actor’s README. If you’ve written a tutorial or feature guide, include it under a "Further reading" section. +2. Input schema. Use your input schema to link to blog posts. For instance, if a certain field in your Actor has complex configurations, add a link to a blog post that explains how to use it. +3. YouTube videos. If you’ve created tutorial videos about your Actor, link them in your blog and vice versa. Cross-promoting these assets will increase your overall engagement. +4. Webinars and live streams. Mention your blog posts during webinars, especially if you’re covering a topic that’s closely related. Include the links in follow-up emails after the event. +5. Social media. Share your blog posts on Twitter, LinkedIn, or other social platforms. Include snippets or key takeaways to entice users to click through. + +🔄 Remember, you can always turn your blog into a video and vice versa. You can also use parts of blog posts for social media promotion. + +## Additional tips for blog success + +1. Consistency. Regular posting helps build an audience and makes sure you keep at it. Try to stick to a consistent schedule, whether it’s weekly, bi-weekly, or monthly. As Woody Allen said, “80 percent of success is showing up”. +2. Guest blogging. Reach out to other blogs or platforms like [Dev.to](http://dev.to/) for guest blogging opportunities. This helps you tap into new audiences. +3. Repurpose content. Once you’ve written a blog post, repurpose it. Turn it into a YouTube video, break it down into social media posts, or use it as the base for a webinar. +4. Monitor performance. Use analytics to track how your blog is performing. Are people reading it? Is it driving traffic to your Actor? What keywords is it ranking for? Who are your competitors? Use this data to refine your content strategy. + + + +--- +title: Parasite SEO +description: Explore parasite SEO, a unique strategy that leverages third-party sites to boost rankings and drive traffic to your tools. +sidebar_position: 3 +category: apify platform +slug: /actor-marketing-playbook/promote-your-actor/parasite-seo +--- + +**Do you want to attract more users to your Actors? Consider parasite SEO, a non-conventional method of ranking that leverages third-party sites.** + +--- + +Here’s a full definition, from Authority Hackers: + +> Parasite SEO involves publishing a quality piece of content on an established, high-authority external site to rank on search engines. This gives you the benefit of the host’s high traffic, boosting your chances for leads and successful conversions. These high DR websites have a lot of authority and trust in the eyes of Google +> + +As you can see, you’re leveraging the existing authority of a third-party site where you can publish content promoting your Actors, and the content should rank better and faster as you publish it on an established site. + +You can do parasite SEO for free, but you can also pay for guest posts on high-authority sites to post your articles promoting the Actors. + +Let’s keep things simple and practical for this guide, so you can start immediately. We will cover only the free options, which should give you enough exposure to get started. 
If you want to learn more, we recommend the following reading about parasite SEO:

- [Authority Hackers](https://www.authorityhacker.com/parasite-seo/)
- [Ahrefs](https://ahrefs.com/blog/parasite-seo/)

In this guide, we will cover the following sites that you can use for parasite SEO for free:

- Medium
- LinkedIn
- Reddit
- Quora

## Medium

You probably know [Medium](https://medium.com/). But you might not know that Google quite likes Medium, and you have a good chance of ranking high in Google with articles you publish there.

1. You need a Medium account. It’s free and easy to create.
2. Now, you need to do keyword research. Go to [Ahrefs Free Keyword Generator](https://ahrefs.com/keyword-generator/?country=us), enter your main keyword (e.g. Airbnb scraper), and check which keyword has the highest search volume.
3. Search for that keyword in Google. Use incognito mode and a US VPN if you can. Analyze the results and check what type of content you need to create. Is it a how-to guide on how to create an Airbnb scraper? Or is it a list of the best Airbnb scrapers? Or perhaps it’s a review or just a landing page.
4. Now, you should have a good idea of the article you have to write. Write the article and try to mimic the structure of the first results.
5. Once you’re done with the article, don’t forget to include a few calls to action linking to your Actor on Apify Store. Don’t be too pushy, but mention all the benefits of your Actor.
6. Publish the article. Make sure your title and URL have the main keyword and that the main keyword is also in the first paragraph of the article. Also, try to use relevant tags for your Actor.

## LinkedIn Pulse

LinkedIn Pulse is similar to Medium, so we won’t go into too much detail. The entire process is the same as with Medium; only the way you publish the article differs.

[Here is a full guide](https://www.linkedin.com/pulse/how-publish-content-linkedin-pulse-hamza-sarfraz/) for publishing your article on LinkedIn Pulse.

## Reddit

1. You need a Reddit account that you can use to comment in relevant subreddits.
2. Go to Google and perform this search: `site:reddit.com <keyword>`, where you replace `<keyword>` with the main topic of your Actor.
3. Now, make a list of the relevant Reddit threads that Google gives you. For an Airbnb scraper, this might be a good thread: [Has anybody have an latest Airbnb scraper code?](https://www.reddit.com/r/webscraping/comments/m650ol/has_anybody_have_an_latest_airbnb_scraper_code/)
4. To prioritize threads from the list, you can check the traffic they get from Google in [Ahrefs Traffic Checker](https://ahrefs.com/traffic-checker). Just paste the URL, and the tool will give you a traffic estimate. You can use this number to prioritize your list. If the volume exceeds 10, the thread usually has some traffic potential.
5. Now, the last step is to craft a helpful comment that also promotes your Actor. Try to do that subtly. People on Reddit usually don’t like people who promote their stuff, but you should be fine if you’re being genuinely helpful.

## Quora

Quora is similar to Reddit, so again we won’t go into too much detail. The entire process is the same. You just have to use a different search phrase in Google, which is `site:quora.com <keyword>`.



---
title: Product Hunt
description: Boost your Actor’s visibility by launching it on Product Hunt, a top platform for tech innovations. Attract early adopters, developers, and businesses while showcasing your tool’s value through visuals or demos.
+sidebar_position: 4 +category: apify platform +slug: /actor-marketing-playbook/promote-your-actor/product-hunt +--- + +Product Hunt is one of the best platforms for introducing new tools, especially in the tech community. It attracts a crowd of early adopters, startup enthusiasts, and developers eager to discover the latest innovations. Even [Apify itself](https://www.producthunt.com/products/apify) was on PH. + +If you're looking to build awareness and generate short-term traffic, Product Hunt can be a powerful tool in your marketing strategy. It's a chance to attract a wide audience, including developers, startups, and businesses looking for automation. If your Actor solves a common problem, automates a tedious process, or enhances productivity, it's a perfect candidate for Product Hunt. + +Product Hunt is also great for tools with a strong visual component or demo potential. If you can show the value of your Actor in action, you’re more likely to grab attention and drive engagement. + +--- + +## How to promote your Actor on Product Hunt + +### Create a compelling launch + +Launching your Actor on Product Hunt requires thoughtful planning. Start by creating a product page that clearly explains what your Actor does and why it’s valuable. You’ll need: + +- _A catchy tagline_. Keep it short and to the point. Think of something that captures your Actor's essence in just a few words. +- _Eye-catching visuals_. Screenshots, GIFs, or short videos that demonstrate your Actor in action are essential. Show users what they’ll get, how it works, and why it’s awesome. +- _Concise description_. Write a brief description of what your Actor does, who it’s for, and the problem it solves. Use plain language to appeal to a wide audience, even if they aren’t developers. +- _Demo video_. A short video that shows how your Actor works in a real-life scenario will resonate with potential users. + +Once your page is set up, you’ll need to choose the right day to launch. Product Hunt is most active on weekdays, with Tuesday and Wednesday being the most popular launch days. Avoid launching on weekends or holidays when traffic is lower. + +### Build momentum before launch + +Start building awareness before your launch day. This is where your social media channels and community engagement come into play. Share teasers about your upcoming Product Hunt launch on Twitter (X), Discord, LinkedIn, and even StackOverflow, where other developers might take an interest. Highlight key features or the problems your Actor solves. + +If you have a mailing list, give your subscribers a heads-up about your launch date. Encourage them to visit Product Hunt and support your launch by upvoting and commenting. This pre-launch activity helps create early momentum on launch day. + +### Timing your launch + +The timing of your Product Hunt launch matters a lot. Since Product Hunt operates on a daily ranking system, getting in early gives your product more time to gain votes. Aim to launch between 12:01 AM and 2:00 AM PST, as this will give your product a full day to collect upvotes. + +Once you’ve launched, be ready to engage with the community throughout the day. Respond to comments, answer questions, and thank users for their support. Product Hunt users appreciate creators who are active and communicative, and this can help drive more visibility for your Actor. + +### Engage with your audience + +The first few hours after your launch are crucial for gaining traction. 
Engage with users who comment on your product page, answer any questions, and address any concerns they might have. The more interaction you generate, the more likely you are to climb the daily rankings.

Be transparent and friendly in your responses. If users point out potential improvements or bugs, acknowledge them and make a commitment to improve your Actor. Product Hunt users are often open to giving feedback, and this can help you iterate on your product quickly.

If possible, have team members or collaborators available to help respond to comments. The more responsive and helpful you are, the better the overall experience will be for users checking out your Actor.

:::tip Leverage Apify

You can also give a shoutout to Apify; this way, your Actor will also be shown to the Apify community on Product Hunt: [https://www.producthunt.com/stories/introducing-shoutouts](https://www.producthunt.com/stories/introducing-shoutouts)

:::

## Expectations and results

Launching on Product Hunt can provide a massive spike in short-term traffic and visibility. However, it’s important to manage your expectations. Not every launch will result in hundreds of upvotes or immediate sales. Here’s what you can realistically expect:

- _Short-term traffic boost_. Your Actor might see a surge in visitors, especially on the day of the launch. If your Actor resonates with users, this traffic may extend for a few more days.
- _Potential long-term benefits_. While the short-term traffic is exciting, the long-term value lies in the relationships you build with early users. Some of them may convert into paying customers or become advocates for your Actor.
- _SEO boost_. Product Hunt is a high-authority site with a [domain rating](https://help.ahrefs.com/en/articles/1409408-what-is-domain-rating-dr) of 91. Having your product listed can provide an SEO boost and help your Actor's page rank higher in search engines.
- _User feedback_. Product Hunt is a great place to gather feedback. Users may point out bugs, request features, or suggest improvements.

## Tricks for a successful launch

1. _Leverage your network_. Ask friends, colleagues, and early users to support your launch. Ask the Apify community. Ask your users. Encourage them to upvote, comment, and share your product on social media.
2. _Prepare for feedback_. Product Hunt users can be critical, but this is an opportunity to gather valuable insights. Be open to suggestions and use them to improve your Actor.
3. _Use a consistent brand voice_. Make sure your messaging is consistent across all platforms when you're responding to comments and promoting your launch on social media.
4. _Offer a special launch deal_. Incentivize users to try your Actor by offering a discount or exclusive access for Product Hunt users. This can drive early adoption and build momentum.

## Caveats to Product Hunt promotion

- _Not every Actor is a good fit_. Product Hunt is best for tools with broad appeal or innovative features. If your Actor is highly specialized or niche, it may not perform as well.
- _High competition_. Product Hunt is a popular platform, and your Actor will be competing with many other launches. A strong marketing strategy is essential to stand out.
- _Short-term focus_. While the traffic spike is great, Product Hunt tends to focus on short-term visibility. To maintain long-term growth, you’ll need to continue promoting your Actor through other channels.
+ + + +--- +title: SEO +description: Learn how to optimize your content to rank higher on search engines like Google and Bing, attract more users, and drive long-term traffic - all for free. +sidebar_position: 1 +category: apify platform +slug: /actor-marketing-playbook/promote-your-actor/seo +--- + +SEO means optimizing your content to rank high for your target queries in search engines such as Google, Bing, etc. SEO is a great way to get more users for your Actors. It’s also free, and it can bring you traffic for years. This guide will give you a simple framework to rank better for your targeted queries. + +## Search intent + +Matching the search intent of potential users is super important when creating your Actor's README. The information you include should directly address the problems or needs that led users to search for a solution like yours. For example: + +- _User goals_: What are users trying to accomplish? +- _Pain points_: What challenges are they facing? +- _Specific use cases_: How might they use your Actor? + +Make sure your README demonstrates how your Actor aligns with the search intent. This alignment helps users quickly recognize your Actor's value and helps Google understand your Actor and rank you better. + +_Example:_ + +Let’s say you want to create a “YouTube Hashtag Scraper” Actor. After you search YouTube HashTag Scraper, you see that most people searching for it want to extract hashtags from YouTube videos, not download videos using a certain hashtag. + +## Keyword research + +Keyword research is a very important part of your SEO success. Without that, you won’t know which keywords you should target with your Actor, and you might be leaving traffic on the table by not targeting all the angles or targeting the wrong one. + +We will do keyword research with free tools, but if you want to take this seriously, we highly recommend [Ahrefs](https://ahrefs.com/). + +### Google autocomplete suggestions + +Start by typing your Actor's main function or purpose into Google. As you type, Google will suggest popular search terms. These suggestions are based on common user queries and can provide insight into what your potential users are searching for. + +_Example:_ + +Let's say you've created an Actor for scraping product reviews. Type "product review scraper" into Google and note the suggestions: + +- product review scraper free +- product review scraper amazon +- product review scraper python +- product review scraper api + +These suggestions reveal potential features or use cases to highlight in your README. + +### Alphabet soup method + +This technique is similar to the previous one, but it involves adding each letter of the alphabet after your main keyword to discover more specific and long-tail keywords. + +_Example_: + +Continue with "product review scraper" and add each letter of the alphabet: + +- product review scraper a (autocomplete might suggest "api") +- product review scraper b (might suggest "best") +- product review scraper c (might suggest "chrome extension") + +...and so on through the alphabet. + +### People Also Ask + +Search for your Actor's main function or purpose on Google. Scroll down to find the "People Also Ask" section, which contains related questions. + +_Example_: + +For a "product review scraper" Actor: + +- How do I scrape product reviews? +- Is it legal to scrape product reviews? +- What is the best tool for scraping reviews? +- How can I automate product review collection? + +Now, you can expand the “People Also Ask” questions. 
Click on each question to reveal the answer and generate more related questions you can use in your README.

### Google Keyword Planner

Another way to collect more keywords is to use the official Google Keyword Planner. Go to [Google Keyword Planner](https://ads.google.com/home/tools/keyword-planner/) and open the tool. You need a Google Ads account, so just create one for free if you don’t have one already.

After you’re in the tool, click on “Discover new keywords”, make sure you’re in the “Start with keywords” tab, enter your Actor's main function or purpose, and then select the United States as the region and English as the language. Click “Get results” to see keywords related to your Actor.

Write them down.

### Ahrefs Keyword Generator

Go to [Ahrefs Keyword Generator](https://ahrefs.com/keyword-generator), enter your Actor's main function or purpose, and click “Find keywords.” You should see a list of keywords related to your Actor.

Write them down.

## What to do with the keywords

First, remove any duplicates that you might have on your list. You can use an online tool [like this one](https://dedupelist.com/) for that.

After that, we need to get search volumes for your keywords. Put all your keywords in a spreadsheet, with one column being the keyword and the second one being the search volume.

Go to the [Keyword Tool](https://backlinko.com/tools/keyword), enter each keyword, and write down its search volume. You will also see other related keywords, so you might as well write them down if you don’t have them on your list yet.

At the end, you should have a list of keywords together with their search volumes that you can use to prioritize them, name your Actor, choose its URL, and so on.

### Headings

If it makes sense, consider using the keywords that have the biggest search volume and are the most relevant to your Actor as H2 headings in your README.

Put the most relevant keyword at the beginning of the heading when possible. Also, remember to use a clear hierarchy: the main features are H2, sub-features are H3, etc.

### Content

When putting keywords in your Actor’s README, it's important to maintain a natural, informative tone. Your primary goal should be to create valuable, easily understandable content for your users.

Aim to use your most important keyword in the first paragraph of your README. This helps both search engines and users quickly understand what your Actor does. But avoid forcing keywords where they don't fit naturally.

In the rest of your content, use the keywords you gathered earlier where they make sense, and include them naturally in your README.

If there are relevant questions in your keyword list, you can always cover them within an “FAQ” section of your Actor.

Remember that while including keywords is important, always prioritize readability and user experience. Your content should flow naturally and provide real value to the reader.

## Learn more about SEO

If you want to learn more about SEO, these two free courses will get you started:

- [SEO Course for Beginners](https://ahrefs.com/academy/seo-training-course) by Ahrefs
- [SEO Courses](https://www.semrush.com/academy/courses/seo/) by Semrush

The [Ahrefs YouTube channel](https://www.youtube.com/@AhrefsCom/featured) is also a great resource. You can start with [this video](https://www.youtube.com/watch?v=xsVTqzratPs).
+ + + +--- +title: Social media +description: Leverage social media to connect with users and grow your Actor’s audience. Learn how to showcase features, engage with users, and avoid common pitfalls. +sidebar_position: 2 +category: apify platform +slug: /actor-marketing-playbook/promote-your-actor/social-media +--- + +**Social media is a powerful way to connect with your Actor users and potential users. Whether your tool focuses on web scraping or automation, social platforms can help you showcase its features, answer user questions, and grow your audience. This guide will show you how to use social media effectively, what to share, and how to avoid common mistakes along the way.** + +Now, before we start listing social media platforms, it might be important to acknowledge something. + +Developers are notorious for not using social media that much. Or they use social media exclusively in the context of their own interests: that won’t find them new users, but rather colleagues or collaborators. + +That's a good start, and maybe it's enough. A developer that can also “do” social media is a unicorn. These are super rare. And if you want to really promote your Actor, you'll need to become that unicorn. Before we start, you need to understand the benefits of this activity. + +--- + +## Why be active on social media + +Engaging with your users on social media offers a lot of benefits beyond just promoting your Actor. Let’s look at some of the main reasons why being active online can be a game-changer for your Actor’s success: + +1. Social platforms make it easy to gather real-time feedback and also provide support in real-time. You can quickly learn what users love, what they struggle with, and what features they’d like to see. This can guide your Actor’s future development. It also allows you to build trust and credibility with your audience. +2. Shot in the dark: social media exposes your Actor to new users who might not find you through search engines alone. A shared post or retweet can dramatically expand your reach, helping you grow your user base. +3. Consistent activity on social platforms creates more backlinks to your Actor’s page, which can improve its search engine ranking and drive organic traffic. + +## Where to engage: Choosing the right platforms + +Choosing the right platforms is key to reaching your target audience. Here's a breakdown of the best places for developers to promote their web scraping and automation tools: + +- _Discord_: We started with an easy one. Create a community around your Actor to engage with users directly. Offering quick support and discussing the features of your Actor in a real-time chat setting can lead to deeper user engagement. + + :::tip Use Apify's Discord + + You can also promote your tools through [Apify's Discord](https://discord.com/invite/crawlee-apify-801163717915574323). + + ::: + +- _Twitter (X)_: Good for short updates, feature announcements, and quick interactions with users. The tech community on Twitter is very active, which makes it a great spot for sharing tips and getting noticed. +- _Reddit_: In theory, subreddits like r/webscraping, r/automation, and r/programming allow you to share expertise, engage in discussions, and present your Actor as a solution. However, in reality, you have to be quite careful with promotion there. Be very mindful of subreddit rules to avoid spamming or over-promoting. For Reddit, personal stories on how you built the tool + a roadblock you might be facing right now are the safest formula. 
If a tool is already finished and perfected, it will be treated as promotional content. But if you're asking for advice - now that's a community activity. +- _TikTok_: Might not be an obvious choice, but that’s where most young people spend time. They discuss a myriad of topics, laugh at the newest memes, and create trends that take weeks to get to Reels and Shorts. If you want to create educational, fun, short video content (and be among the first to talk about web scraping), this is your place for experiments and taking algorithm guesses. +- _YouTube_: Ideal for tutorials and demos. A visual walk-through of how to use your Actor can attract users who prefer watching videos to reading tutorials or READMEs. It's also good for Shorts and short, funny content. +- _StackOverflow_: While not a traditional social media platform, StackOverflow is a great space to answer technical questions and demonstrate your expertise. Offering help related to web scraping or automation can build credibility, and you can subtly mention your Actor if it directly solves the issue (as long as it adheres to community guidelines). +- _LinkedIn_: If your Actor solves problems for professionals or automates business tasks, LinkedIn is the place to explain how your tool provides value to an industry or business. + +--- + +## Best practices for promoting your Actor on social media + +Now that you know where to engage and why it’s important, here are some best practices to help you make the most of social media: + +1. _Offer value beyond promotion_: If you look around, you'll see that the golden rule of social media these days is to educate and entertain. Focus on sharing useful information related to your Actor. Post tips on automation, web scraping techniques, or industry insights that can help your audience. When you do promote your Actor, users will see it as part of a valuable exchange, not just an ad. Besides, constantly posting promotional content turns anybody off. +2. _Post consistently_: The most important rule for social media is to show up. Whether it’s a weekly post about new features or daily tips for using your Actor more effectively, maintaining a regular posting schedule keeps your audience connected. +3. _Visuals matter_: Screenshots, GIFs, and short videos can explain more than text ever could. Show users how your Actor works, the results it scrapes, or how automation saves time. +4. _Widen your reach_: Web scraping is a niche topic. Find ways to talk about it more widely. If you stumble upon ways to relate it to wider topics: news, science, research, even politics and art, use it. Or you can go more technical and talk about various libraries and languages you can use to build it. +5. _Use relevant hashtags_: Hashtags like #webscraping, #automation, #programming, and #IT help you reach a wider audience on platforms like Twitter and TikTok. Stick to a few relevant hashtags per post to avoid clutter. +6. _Engage actively_: Social media is a two-way street. Reply to comments, thank users for sharing your content, create stitches, and answer questions. Building relationships with your users helps foster loyalty and builds a sense of community around your Actor. +7. _Use polls and Q&As_: Interactive content like polls or Q&A sessions can drive engagement. Ask users what features they’d like to see next or run a live Q&A to answer questions about using your Actor. These tools encourage participation and provide valuable insights. +8. _Collaborate with other creators_. 
## Caveats to social media engagement

1. _Over-promotion_: Constantly pushing your Actor without offering value can turn users away. Balance your promotional content with educational posts, interesting links, or insights into the development process. Users are more likely to engage when they feel like they’re learning something, rather than just being sold to.
2. _Handling negative feedback_: Social media is a public forum, and not all feedback will be positive. Be prepared to address user concerns or criticism professionally. Responding kindly (or funnily) to criticism shows you’re committed to improving your tool and addressing users' needs.
3. _Managing multiple platforms_: Social media management can be time-consuming, especially if you’re active on multiple platforms. Focus on one or two platforms that matter most to your audience instead of spreading yourself too thin.
4. _Algorithm changes_: Social media platforms often tweak their algorithms, which can impact your content’s visibility. Stay updated on these changes, and adjust your strategy accordingly. If a post doesn’t perform well, experiment with different formats (videos, visuals, polls) to see what resonates with your audience.
5. _Privacy and compliance_: It's very important to be mindful of sharing user data or results, especially if your Actor handles sensitive information. Make sure your posts comply with privacy laws and don’t inadvertently expose any personal data.

## For inspiration

It's sometimes hard to think of a good reason to scream into the void that is social media. Here are 23 scenarios where you might use social media to promote your Actor or your work:

1. _Funny interaction with a user_: Share a humorous tweet or post about a quirky question or feedback from a user that highlights your Actor’s unique features.
2. _Roadblock story_: Post about a challenging bug you encountered while developing your Actor and how you solved it, including a screenshot or snippet of code.
3. _Success story_: Share a post detailing how a user’s feedback led to a new feature in your Actor and thank them for their suggestion.
4. _Tutorial video_: Create and share a short video demonstrating how to use a specific feature of your Actor effectively.
5. _Before-and-after example_: Post a visual comparison showing the impact of your Actor’s automation on a task or process.
6. _Feature announcement_: Announce a new feature or update in your Actor with a brief description and a call-to-action for users to try it out.
7. _User testimonial_: Share a positive review or testimonial from a user who benefited from your Actor, including their quote and a link to your tool.
8. _Live Q&A_: Host a live Q&A session on a platform like Twitter or Reddit, answering questions about your Actor and its capabilities.
9. _Behind-the-scenes look_: Post a behind-the-scenes photo or video of your development process or team working on your Actor.
10. _Debugging tip_: Share a tip or trick related to debugging or troubleshooting common issues with web scraping or automation.
11. _Integration highlight_: Post about how your Actor integrates with other popular tools or platforms, showcasing its versatility. Don't forget to tag them.
12. _Case study_: Share a case study or success story showing how a business or individual used your Actor to achieve specific results.
13. _Commentary on a news piece_: Offer your perspective on a recent news story related to technology, scraping, or automation.
If possible, explain how it relates to your Actor.
14. _User-generated content_: Share content created by your users, such as screenshots or examples of how they’re using your Actor.
15. _Memes_: Post a relevant meme about the challenges of web scraping or automation.
16. _Milestone celebration_: Announce and celebrate reaching a milestone, such as a certain number of users or downloads for your Actor.
17. _Quick tip_: Share a short, useful tip or hack related to using your Actor more efficiently.
18. _Throwback post_: Share a throwback post about the early development stages of your Actor, including any challenges or milestones you achieved.
19. _Collaboration announcement_: Announce a new collaboration with another developer or tool, explaining how it enhances your Actor’s functionality.
20. _Community shout-out_: Give a shout-out to a user or community member who has been particularly supportive or helpful.
21. _Demo invitation_: Invite your followers to a live demo or webinar where you’ll showcase your Actor and answer questions.
22. _Feedback request_: Ask your audience for feedback on a recent update or feature release, and encourage them to share their thoughts.
23. _Book or resource recommendation_: Share a recommendation for a book or resource that helped you in developing your Actor, and explain its relevance.



---
title: Video tutorials
description: Use video tutorials to demonstrate features, offer tutorials, and connect with users in real time, building trust and driving interest in your tools.
sidebar_position: 6
category: apify platform
slug: /actor-marketing-playbook/promote-your-actor/video-tutorials
---

**Videos and live streams are powerful tools for connecting with users and potential users, especially when promoting your Actors. You can use them to demonstrate functionality, provide tutorials, or engage with your audience in real time.**

---

## Why videos and live streams matter

1. _Visual engagement_. Videos allow you to show rather than just tell. Demonstrating how your Actor works or solving a problem in real time makes the content more engaging and easier to understand. For complex tools, visual explanations can be much more effective than text alone.
2. _Enhanced communication_. Live streams offer a unique opportunity for direct interaction. You can answer questions, address concerns, and gather immediate feedback from your audience, creating a more dynamic and personal connection.
3. _Increased reach_. Platforms like YouTube and TikTok have massive user bases, giving you access to a broad audience. Videos can also be shared across various social media channels, extending your reach even further.

Learn more about the rules of live streams in our next section: [Webinars](/academy/actor-marketing-playbook/promote-your-actor/webinars)

## Optimizing videos for SEO

1. _Keywords and titles_. Use relevant keywords in your video titles and descriptions. For instance, if your Actor is a web scraping tool, include terms like “web scraping tutorial” or “how to use web scraping tools” to help users find your content.
2. _Engaging thumbnails_. Create eye-catching thumbnails that accurately represent the content of your video. Thumbnails are often the first thing users see, so make sure they are visually appealing and relevant.
3. _Transcriptions and captions_. Adding transcripts and captions to your videos improves accessibility and can enhance SEO.
They allow search engines to index your content more effectively and help users who prefer reading or have hearing impairments. + +## YouTube vs. TikTok + +1. _YouTube_. YouTube is an excellent platform for longer, detailed videos. Create a channel dedicated to your Actors and regularly upload content such as tutorials, feature walkthroughs, and industry insights. Utilize YouTube’s SEO features by optimizing video descriptions, tags, and titles with relevant keywords. Engage with your audience through comments and encourage them to subscribe for updates. Collaborating with other YouTubers or influencers in the tech space can also help grow your channel. +2. _TikTok_. TikTok is ideal for short, engaging videos. Use it to share quick tips, demo snippets, or behind-the-scenes content about your Actors. The platform’s algorithm favors high engagement, so create catchy content that encourages viewers to interact. Use trending hashtags and participate in challenges relevant to your niche to increase visibility. Consistency is key, so post regularly and monitor which types of content resonate most with your audience. + +## Growing your channels + +1. _Regular content_. Consistently upload content to keep your audience engaged and attract new viewers. Create a content calendar to plan and maintain a regular posting schedule. +2. _Cross-promotion_. Share your videos across your social media channels, blogs, and newsletters. This cross-promotion helps drive traffic to your videos and increases your reach. +3. _Engage with your audience_. Respond to comments and feedback on your videos. Engaging with viewers builds a community around your content and encourages ongoing interaction. +4. _Analyze performance_. Use analytics tools provided by YouTube and TikTok to track the performance of your videos. Monitor metrics like watch time, engagement rates, and viewer demographics to refine your content strategy. + +--- + +## Where to mention videos across your Actor ecosystem + +1. _README_: include links to your videos in your Actor’s README file. For example, if you have a tutorial video, mention it in a "How to scrape X" or "Resources" section to guide users. +2. _Input schema_: if your Actor’s input schema includes complex fields, link to a video that explains how to configure these fields. This can be especially helpful for users who prefer visual guides. +3. _Social media_: share your videos on platforms like Twitter, LinkedIn, and Facebook. Use engaging snippets or highlights to attract users to watch the full video. +4. _Blog posts_: embed videos in your blog posts for a richer user experience. If you write a tutorial or feature update, include a video to provide additional context. +5. _Webinars and live streams_: mention your videos during webinars or live streams. If you’re covering a topic related to a video you’ve posted, refer to it as a supplemental resource. + + + +--- +title: Webinars +description: Webinars and live streams are powerful tools to showcase your Actor’s features. Learn how to plan, host, and maximize the impact of your webinar. +sidebar_position: 7 +category: apify platform +slug: /actor-marketing-playbook/promote-your-actor/webinars +--- + +Webinars and live streams are a fantastic way to connect with your audience, showcase your Actor's capabilities, and gather feedback from users. Though the term webinar might sound outdated these days, the concept of a live video tutorial is alive and well in the world of marketing and promotion. 
Whether you're introducing a new feature, answering questions, or walking through a common use case, a live event can create more personal engagement, boost user trust, and open the door for valuable two-way communication.

But how do you get started? Here's a friendly guide on where to host, how to prepare, and what to do before, during, and after your webinar.

---

## Why host a live stream?

Here are a few reasons why live streams are ideal for promoting your Actor:

- _Demo_. You can show your Actor in action and highlight its most powerful features. You can tell a story about how you built it. You can also show how your Actor interacts with other tools and platforms and what its best uses are. A live demo lets users see immediately how your tool solves their problems.
- _Building trust and rapport_. Interacting directly with your users builds trust and rapport. Even just showing up with your face and voice is a chance to let your users meet you and get a feel for the team behind the Actor.
- _Live Q&A_. Users often have questions that can be hard to fully address in documentation, README, or tutorials. A live session allows for Q&A, so you can explain complex features and demonstrate how to overcome common issues.
- _Tutorial or training_. If you don't have time for complex graphics, this is an easy replacement for a video tutorial until you do. Remember that some platforms (YouTube) give you the option of publishing the webinar after it's over, so you can reuse it later in other content or as a standalone guide. Also, if you’ve noticed users struggling with particular features, a webinar is a great way to teach them directly.

Webinars help build a community around your Actor and turn one-time users into loyal advocates.

## Where to host your webinar or live stream

It all goes back to where you have, or would like to have, your audience and whether you want the webinar to stay available on the web later.

1. Social media:
    1. _YouTube_: ideal for reaching a broad audience. It’s free and easy to set up. You can also make recordings available for future viewing.
    2. _TikTok_: likewise ideal for reaching a broad audience, free and easy to set up. However, the live video will disappear once the broadcast has ended. TikTok does allow you to save your livestreams, and while you won't be able to republish them to the platform (we assume your live stream will be longer than 10 minutes), you can re-upload them elsewhere later.
    3. _Twitch_: Known for gaming, Twitch has become a space for tech demos, coding live streams, and webinars. If your target audience enjoys an interactive and casual format, Twitch might be a good fit.
    4. _LinkedIn_: If your audience is more professional, LinkedIn Live could be a good fit to present your Actor. Once a stream is complete, it will remain on the feed of your LinkedIn Page or profile as a video that was ‘previously recorded live’.
    5. _Facebook_: Not recommended.
2. General platforms:
    1. _Zoom_ or _Google Meet_: More personal, these are great for smaller webinars where you might want closer interaction. They also give you control over who attends.

Pick a platform where your users are most likely to hang out. If your audience is primarily tech-savvy, YouTube or Twitch could work. If your Actor serves businesses, LinkedIn might be the best spot.

## Webinar/live stream prep

### Promote your webinar and get your users

If you have an email list of users or potential users, send an email blast with a friendly invite.
Include details about what you’ll cover and how they can benefit from attending.

- Social media promotion on Twitter (X), LinkedIn, or other platforms. Highlight what people will learn and any special features you’ll be demonstrating. Do it a few times: two weeks before the webinar, one week before, a day before, and on the day of the event. Don't forget to announce it on Apify’s Discord. These are places where your potential audience is likely hanging out. Let them know you’re hosting an event and what they can expect.
- Use every piece of real estate on Apify Store and Actor pages. Add a banner or notification to your Actor’s page (top of the README): this can be a great way to notify people who are already looking at your Actor. A simple “join us for a live demo on DATE” message works well. Add something like that to your Store bio and its README. Mention it in the description at the top of your Actor's input schema.

:::tip Use UTM tags

When creating a link to the webinar, you can add different UTM tags for each place where you will insert the link. That way, you can later learn which space brought the most webinar sign-ups (see the example links at the end of this section).

:::

- Collaborate with other developers. If you can team up with someone in the Apify community, you’ll double your reach. Cross-promotion can bring in users from both sides.

---

### Plan the content

Think carefully about what you’ll cover. Focus on what’s most relevant for your audience:

- _Decide on your content_. What will you cover? A demo? A deep dive into Actor configurations? Create a flow and timeline to keep yourself organized.
- _Prepare visuals_. Slides, product demos, and examples are helpful to explain complex ideas clearly.
- _Feature highlights_. Demonstrate the key features of your Actor. Walk users through common use cases and be ready to show live examples.
- _Input schema_. If your Actor has a complex input schema, spend time explaining how to use it effectively. Highlight tips that will save users time and frustration. You can incorporate your knowledge from the issues tab.
- _Q&A session_. Leave time for questions at the end. Make sure to keep this flexible, as it’s often where users will engage the most.

Don't forget to add an intro with an agenda and an outro with your contact details.

:::tip Consider timezones

When deciding when to run the webinar, focus on the timezone of the majority of your users.

:::

### Prepare technically

Test your setup before going live. Here’s what to focus on:

- _Stable internet connection_. This one’s obvious but essential. Test your stream quality ahead of time.
- _Test the Actor live_. If you're demoing your Actor, ensure it works smoothly. Avoid running scripts that take too long or have potential bugs during the live session.
- _Audio quality_. People are far more likely to tolerate a blurry video than bad audio. Use a good-quality microphone to ensure you’re heard clearly.
- _Screen sharing_. If you’re doing a live demo, make sure you know how to seamlessly switch between windows and share your screen effectively.
- _Backup plan_. Have a backup plan in case something goes wrong. This could be as simple as a recorded version of your presentation to share if things go south during the live session.
- _Make it interactive_. Consider using polls or a live Q&A session to keep the audience engaged. Maybe have a support person assisting with that side of things while you're speaking.
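
For illustration, here's what the same sign-up link might look like with different UTM tags for each placement (the domain and tag values below are made up; `utm_source`, `utm_medium`, and `utm_campaign` are the standard analytics parameters):

```text
# README banner
https://example.com/webinar?utm_source=apify-readme&utm_medium=banner&utm_campaign=actor-webinar

# Input schema description
https://example.com/webinar?utm_source=input-schema&utm_medium=link&utm_campaign=actor-webinar

# Discord announcement
https://example.com/webinar?utm_source=discord&utm_medium=social&utm_campaign=actor-webinar
```

Your analytics tool can then group sign-ups by `utm_source`, showing which placement performed best.
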
## Best practices during the live stream

When the time comes, here’s how to make the most of your webinar or live stream:

- _Start with an introduction_. Begin with a brief introduction of yourself, the Actor you’re showcasing, and what attendees can expect to learn. This sets expectations and gives context. It's also best if you have a slide that shows the agenda.
- _Try to stay on time_. Stick to the agenda. Users appreciate when events run on schedule.
- _Show a live demo_. Walk through a live demo of your Actor. Show it solving a problem from start to finish.
- _Explain as you go_. Be mindful that some people might be unfamiliar with technical terms or processes. Try to explain things simply and offer helpful tips as you demonstrate, but don't go off on a tangent.
- _Invite questions and engage your audience_. Encourage users to ask questions throughout the session. This creates a more conversational tone and helps you address their concerns in real time. You can also ask a simple question or run a poll to get the chat going. Try to direct the Q&A into one place so you don't have to switch tabs. Throughout the presentation, pause for questions and make sure you're addressing any confusion in real time.
- _Wrap up with a clear call to action_. Whether it’s to try your Actor, leave a review, or sign up for a future live stream, finish with a clear CTA. Let them know the next step to take.

These practices work for a simple tutorial walkthrough, and if you run a code-along session, they apply there as well.

## After the live session

Once your live session wraps up, there are still ways to benefit from it:

- _Make it public and share the recording_. Not everyone who wanted to attend will have been able to make it. Send a recording to all attendees whose emails you have and make it publicly available on your channels (emails, README, social media, etc.). Upload the recorded session to YouTube and your Actor’s documentation. If it's on YouTube, you can also ask Apify's video team to add it to their Community playlist. Make it easy for people to revisit the content or share it with others.
- _Follow up with attendees, thank them, and ask for feedback_. Send a follow-up email thanking people for attending. Include a link to the recording, additional resources, and ways to get in touch if they have more questions. Share any special offers or discount codes if relevant. If you don’t have the attendees' emails, include a link in your newsletter and publish it on your channels. Ask for feedback on what they liked and what could be improved. This can guide your next webinar or help fine-tune your Actor.
- _Answer lingering questions_. If any questions didn’t get answered live, take the time to address them in the follow-up email.
- _Create a blog post or article_. Summarize the key points of your webinar in a written format. This can boost your SEO and help users find answers in the future.
- _Review your performance_. Analyze the data from your webinar, if available. How many people attended? Which platform brought the most sign-ups? How many questions did you receive? Were there any technical difficulties? This helps refine your approach for future events.
- _Share snippets from the webinar or interesting takeaways on social media_. Encourage people to watch the recording and let them know when you’ll be hosting another event.
+ + + +label: Store basics +position: 1 + + + +--- +title: Actor success stories +description: Learn about developers who successfully make passive income from their Actors. +sidebar_position: 5 +category: apify platform +slug: /actor-marketing-playbook/store-basics/actor-success-stories +--- + +_Web scraping freelance financial freedom with microworlds._ + +Discover how Caleb David, founder of `microworlds`, achieved financial freedom through freelance web scraping. His journey showcases how mastering the craft with tools like Crawlee and creating a Twitter scraper transformed his career. See the full story [here](https://blog.apify.com/web-scraping-freelance-financial-freedom/) and learn from his success. + +https://apify.com/microworlds + +_Web scraping for freelance success – insights from Tugkan._ + +In this success story, our first community dev Tugkan shares how his journey into freelancing via Apify changed his life. Learn about his process, challenges, and how his paid Actors have brought him financial rewards and freedom. Check out his story [here](https://apify.com/success-stories/paid-actor-journey-apify-freelancer-tugkan) for inspiration. + +https://apify.com/epctex + + +Interested in sharing your story? Reach out to our marketing team at [marketing@apify.com](mailto:marketing@apify.com) for a case study to showcase your journey. + + + +--- +title: How Actor monetization works +description: Discover how to share your tools and explore monetization options to earn from your automation expertise. +sidebar_position: 3 +category: apify platform +slug: /actor-marketing-playbook/store-basics/how-actor-monetization-works +--- + +**You can turn your web scrapers into a source of income by publishing them on Apify Store. Learn how it's done and what monetization options you have.** + +--- + +## Monetizing your Actor + +Monetizing your Actor on the Apify platform involves several key steps: + +1. _Development_: create and refine your Actor. +2. _Testing_: ensure your Actor works reliably. +3. _Publication & monetization_: publish your Actor and set up its monetization model. +4. _Promotion_: attract users to your Actor. + +--- + +## Monetization models + +### Rental pricing model + +![rental model example](images/rental-model.png) + +- _How it works_: you offer a free trial period and set a monthly fee. Users on Apify paid plans can continue using the Actor after the trial. You earn 80% of the monthly rental fees. +- _Example_: you set a 7-day free trial and $30/month rental. If 3 users start using your Actor: + - 1st user on a paid plan pays $30 after the trial (you earn $24). + - 2nd user starts their trial but pays next month. + - 3rd user on a free plan finishes the trial without upgrading to a paid plan and can’t use the Actor further. + +Learn more about the rental pricing model in our [documentation](/platform/actors/publishing/monetize#rental-pricing-model). + +### Pay-per-result pricing model + +![pay per result model example](images/ppr-model.png) + +- _How it works_: you charge users based on the number of results your Actor generates. You earn 80% of the revenue minus platform usage costs. +- _Profit calculation_: `profit = (0.8 * revenue) - platform usage costs` +- _Cost breakdown_: + - Compute unit: $0.4 per CU + - Residential proxies: $13 per GB + - SERPs proxy: $3 per 1,000 SERPs + - Data transfer (external): $0.20 per GB + - Dataset storage: $1 per 1,000 GB-hours +- _Example_: you set a price of $1 per 1,000 results. 
Two users generate 50,000 and 20,000 results, paying $50 and $20, respectively. If the platform usage costs are $5 and $2, your profit is _0.8 * 70 - 7 = $49_.
+
+Learn more about the pay-per-result pricing model in our [documentation](/platform/actors/publishing/monetize#pay-per-result-pricing-model).
+
+### Pay-per-event pricing model
+
+![pay per event model example](images/ppe-model.png)
+
+- _How it works_: you charge users based on specific events triggered programmatically by your Actor's code. You earn 80% of the revenue minus platform usage costs.
+- _Profit calculation_: `profit = (0.8 * revenue) - platform usage costs`
+- _Event cost example_: you set the following events for your Actor:
+  - `Actor start per 1 GB of memory` at $0.005
+  - `Pages scraped` at $0.002
+  - `Page opened with residential proxy` at $0.002 - this is on top of `Pages scraped`
+  - `Page opened with a browser` at $0.002 - this is on top of `Pages scraped`
+- _Example_:
+  - User A:
+    - Started the Actor 10 times = $0.05
+    - Scraped 1,000 pages = $2.00
+    - 500 of those were scraped using a residential proxy = $1.00
+    - 300 of those were scraped using a browser = $0.60
+    - This comes to $3.65 of total revenue
+  - User B:
+    - Started the Actor 5 times = $0.025
+    - Scraped 500 pages = $1.00
+    - 200 of those were scraped using a residential proxy = $0.40
+    - 100 of those were scraped using a browser = $0.20
+    - This comes to $1.625 of total revenue
+  - If platform usage costs are $0.365 for user A and $0.162 for user B, your profit is _0.8 * (3.65 + 1.625) - (0.365 + 0.162) = $3.69_
+
+Learn more about the pay-per-event pricing model in our [documentation](/platform/actors/publishing/monetize#pay-per-event-pricing-model).
+
+## Setting up monetization
+
+1. _Go to your Actor page_: navigate to the **Publication** tab and open the **Monetization** section.
+2. _Fill in billing details_: set up your payment details for payouts.
+3. _Choose your pricing model_: use the monetization wizard to select your model and set fees.
+
+### Changing monetization
+
+Adjustments to monetization settings take 14 days to take effect and can be made once per month.
+
+### Tracking and promotion
+
+- _Track profit_: review payout invoices and statistics in Apify Console (**Monitoring** tab).
+- _Promote your Actor_: optimize your Actor’s description for SEO, share it on social media, and consider creating tutorials or articles to attract users.
+
+## Marketing tips for defining the price for your Actor
+
+It's up to you to set the pricing, of course. It can be as high or low as you wish; you can even make your Actor free. But if you're aiming for a successful, popular Actor, here are a few directions:
+
+### Do market research outside Apify Store
+
+The easiest way to understand your tool's value is to look around. Are there similar tools on the market? What do they offer, and how much do they charge? What added value does your tool provide compared to theirs? What features can your tool borrow from theirs in the future?
+
+Try competitor tools yourself (to assess the value and the quality they provide), check their SEO (to see how much traffic they get), and note ballpark figures. Think about what your Actor can do that competitors might be missing.
+
+Also, remember that your Actor is a package deal with the Apify platform, so all the platform's features automatically transfer onto your Actor and its value. Scheduling, monitoring runs, ways of exporting data, proxies, and integrations can all add value to your Actor (on top of its own functionalities).
Be sure to factor this into your tool's value proposition and communicate it to potential users.
+
+### Do research in Apify Store
+
+Apify Store is like any other marketplace, so take a look at your competition there. Are you the first in your lane, or are there other similar tools? What makes yours stand out? Remember, your README is your first impression — communicate your tool's benefits clearly and offer something unique. Competing with other developers is great, but collaborations can drive even better results 😉
+
+Learn more about what makes a good README here: [How to create an Actor README](/academy/actor-marketing-playbook/actor-basics/how-to-create-an-actor-readme)
+
+### Rental, pay-per-result (PPR), or pay-per-event (PPE)
+
+Rental pricing is technically easier: you set the rental fee, and the user covers their CU usage, so all you have to define is how much you want to charge the users. With pay-per-result, you’ll need to include both CU usage and your margin, so you have to calculate how much the average run will cost the user and then define how much you want to charge on top.
+
+To figure out the average cost per run for users, just run a few test runs and look at the statistics in the Actor [**Analytics**](https://console.apify.com/actors?tab=analytics) tab.
+
+From an average user's perspective, pay-per-result is often easier to grasp — $25 for a thousand pages, $5 for a thousand videos, $1 for a thousand images, etc. It gives users a clearer idea of what they’re paying for and allows them to estimate faster. But rental pricing has its fans, too — if your tool provides high value, users will come.
+
+Pay-per-event (PPE) lets you define pricing for individual events. You can charge for specific events directly from your Actor by calling our PPE charging API (see the sketch at the end of this section). The most common events will likely be Actor start, dataset item, external API calls, etc. PPE is great for users who want to optimize their costs and who value transparency. PPE is also a fairer pricing model for integration and AI-driven use cases, where dataset-based pricing doesn’t make sense.
+
+### Adapt when needed
+
+Don’t be afraid to experiment with pricing, especially at the start. You can monitor your results in the dashboard and adjust if necessary.
+
+Keep an eye on SEO as well. If you monitor the volume of the keywords your Actor is targeting, as well as how well your Actor's page is ranking for those keywords, you can estimate the number of people who actually end up trying your tool (aka the conversion rate). If your keywords are getting volume but conversions are lower than expected, it might point to a few issues. It could be due to your pricing, a verbose README, or a complex input. If users are bouncing right away, it makes sense to check your pricing against your closest competitors to see where adjustments might help.
+
+### Summary & a basic plan
+
+Pick a pricing model, run some tests, and calculate your preliminary costs (**Analytics** tab in Console).
+
+Then check your costs against similar solutions in the Store and on the market (try Google search or other marketplaces), and set a price that gives you some margin.
+
+It’s also normal to adjust pricing as you get more demand. For context, most prices on Apify Store range between $1 and $10 per 1,000 results.
+
+Example of useful pricing estimates from the **Analytics** tab:
+
+![example of pricing estimates in analytics tab](images/analytisc-example.png)
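+
+If you go the PPE route, the charging itself happens inside your Actor's code. Here's a minimal sketch of what that can look like with the JavaScript Actor SDK, assuming an event named `page-scraped` has been defined in your Actor's monetization setup (the event name here is purely illustrative):
+
+```js
+import { Actor } from 'apify';
+
+await Actor.init();
+
+// ...scrape a page here...
+
+// Charge the user for one occurrence of the 'page-scraped' event.
+// The event name must match one defined in your PPE pricing configuration.
+await Actor.charge({ eventName: 'page-scraped' });
+
+await Actor.exit();
+```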
+
+:::tip Use emails!
+
+📫 Don't forget to set up an email sequence to warn and remind your users about pricing changes. Learn more about emailing your users here: [Emails to Actor users]
+
+:::
+
+## Resources
+
+- Learn about [incentives behind monetization](https://apify.com/partners/actor-developers)
+- Detailed guide to [setting up monetization models](https://docs.apify.com/academy/actor-marketing-playbook/monetizing-your-actor)
+- Guide to [publishing Actors](https://docs.apify.com/platform/actors/publishing)
+- Watch our webinar on how to [build, publish and monetize Actors](https://www.youtube.com/watch?v=4nxStxC1BJM)
+- Read a blog post from our CEO on the [reasoning behind monetizing Actors](https://blog.apify.com/make-regular-passive-income-developing-web-automation-actors-b0392278d085/)
+- Learn about the [Creator plan](https://apify.com/pricing/creator-plan), which allows you to create and freely test your own Actors for $1
+
+---
+title: How Apify Store works
+description: Learn how to create and publish your own Actor, and join a growing community of innovators in scraping and web automation.
+sidebar_position: 1
+category: apify platform
+slug: /actor-marketing-playbook/store-basics/how-store-works
+---
+
+**Out of the 3,000+ Actors on the [Apify Store](https://apify.com/store) marketplace, hundreds were created by developers just like you. Let's get acquainted with the concept of Apify Store and what it takes to publish an Actor there.**
+
+---
+
+## What are Actors (and why they're called that)?
+
+[Actors](https://apify.com/actors) are serverless cloud applications that run on the Apify platform, capable of performing various computing tasks on the web, such as crawling websites or sending automated emails. They are developed by independent developers all over the world, and _you can be one of them_.
+
+The term "Actor" is used because, like human actors, these programs follow a script. This naming convention unifies both web scraping and web automation solutions, including AI agents, under a single term. Actors can range in complexity and function, targeting different websites or performing multiple tasks, which makes the umbrella term very useful.
+
+## What is Apify Store?
+
+[Apify Store](https://apify.com/store) is a public library of Actors that is constantly growing and evolving. It's basically the publicly visible (and searchable) part of the Apify platform. Of the 3,000+ Actors currently available, most are created and maintained by the community. Actors that consistently perform well remain on Apify Store, while those reported as malfunctioning or under maintenance are eventually removed. This keeps the tools in our ecosystem reliable, effective, and competitive.
+
+### Types of Actors
+
+- _Web scraping Actors_: for instance, [Twitter (X) Scraper](https://apify.com/apidojo/twitter-user-scraper) extracts data from Twitter.
+- _Automation Actors_: for example, [Content Checker](https://apify.com/jakubbalada/content-checker) monitors website content for changes and emails you once a change occurs.
+- _Bundles_: chains of multiple Actors united by a common data point or target website. For example, [Restaurant Review Aggregator](https://apify.com/tri_angle/restaurant-review-aggregator) can scrape reviews from six platforms at once.
+
+Learn more about bundles here: [Actor bundles](/academy/actor-marketing-playbook/product-optimization/actor-bundles)
+
+## Public and private Actors
+
+Actors on Apify Store can be public or private:
+
+- _Private Actors_: these are only accessible to you in Apify Console. You can use them without exposing them to the web. However, you can still share the results they produce.
+- _Public Actors_: these are available to everyone on Apify Store. You can choose to make them free or set a price. By publishing your web scrapers and automation solutions, you can attract users and generate income.
+
+## How Actor monetization works (briefly)
+
+You can monetize your Actors using four different pricing models:
+
+- _Pay for usage_: charge based on how much the Actor is used.
+- _Pay per result_: the price is based on the number of results produced, with the first few free.
+- _Pay per event_: the price is based on specific events triggered by the Actor.
+- _Monthly billing_: set a fixed monthly rental rate for using the Actor.
+
+For detailed information on which pricing model might work for your Actor, refer to [How Actor monetization works](/academy/actor-marketing-playbook/store-basics/how-actor-monetization-works).
+
+## Actor ownership on Store
+
+Actors are either created and maintained by Apify or by members of the community:
+
+- _Maintained by Apify_: created and supported by the Apify team.
+- _Maintained by Community_: created and managed by independent developers from the community.
+
+To see who maintains an Actor, check the upper-right corner of the Actor's page.
+
+When it comes to managing Actors on Apify, it’s important that every potential community developer understands the differences between Apify-maintained and Community-maintained Actors. Here’s what you need to know to navigate the platform effectively and ensure your work stands out.
+
+### Community-maintained Actors
+
+✨ _Features and functionality_: offers a broader range of use cases and features, often tailored to specific needs. Great for exploring unique or niche applications.
+
+🧑‍💻 _Ownership_: created and maintained by independent developers like you.
+
+🛠 _Maintenance_: you’re responsible for all updates, bug fixes, and ongoing maintenance. Apify hosts your Actor but does not manage its code.
+
+👷‍♀️ _Reliability and testing_: it’s up to you to ensure your Actor’s reliability and performance.
+
+☝️ _Support and issues_: Apify does not provide direct support for Community-maintained Actors. You must manage issues through the Issues tab, where you handle user queries and problems yourself.
+
+✍️ _Documentation_: you’re responsible for creating and maintaining documentation for your Actor. Make sure it’s clear and helpful for users.
+
+:::tip Test your Actor!
+
+For the best results, make sure your Actor is well-documented and thoroughly tested. Engage with users through the Issues tab to address any problems promptly. By maintaining high standards and being proactive, you’ll enhance your Actor’s reputation and usability in Apify Store.
+
+:::
+
+## Importance of Actor testing and reliability
+
+It's essential to test your Actors and make sure they work as intended. Apify runs its own checks, and you should do the same on your side.
+
+Apify runs automated tests daily to ensure all Actors on Apify Store are functional and reliable. These tests check _if an Actor can successfully run with its default input within 5 minutes_.
If an Actor fails for three consecutive days, it’s labeled as under maintenance, and the developer is notified. Continuous failures for another 28 days lead to deprecation.
+
+To restore an Actor's health, developers should fix and rebuild it. The testing system will automatically recognize the changes within 24 hours. If your Actor requires longer run times or authentication, contact support to explain why it should be excluded from the tests. For more control, you can implement your own tests using the Actor Testing tool available on Apify Store.
+
+### Actor metrics and reliability score
+
+On the right panel of each Actor on Store, you can see a list of Actor metrics.
+
+Actor metrics such as the number of monthly users, star ratings, success rates, response times, creation dates, and recent modifications collectively offer insights into an Actor's reliability. Basically, they serve as a _shorthand for potential users to assess your Actor's reliability_ before even trying it out.
+
+A high number of monthly users indicates widespread trust and effective performance, while a high star rating reflects user satisfaction. A success rate nearing 100% demonstrates consistent performance. Short response times show a commitment to addressing issues promptly. A recent creation date suggests modern features and ongoing development, while recent modifications point to active maintenance and continuous improvements. Together, these metrics provide a comprehensive view of an Actor’s reliability and quality.
+
+### Reporting issues in Actors
+
+Each Actor has an **Issues** tab in Apify Console and on the web. Here, users can open an issue (ticket) and engage in discussions with the Actor's creator, platform admins, and other users. The tab is ideal for asking questions, requesting new features, or providing feedback.
+
+Since the **Issues** tab is public, the level of activity — or lack thereof — can be observed by potential users and may serve as an indicator of the Actor's reliability. A well-maintained Issues tab with prompt responses suggests an active and dependable Actor.
+
+Learn more about how to handle the [Issues tab](/academy/actor-marketing-playbook/interact-with-users/issues-tab).
+
+## Resources
+
+- Best practices on setting up [testing for your Actor](https://docs.apify.com/platform/actors/publishing/test)
+- What are Apify-maintained and [Community-maintained Actors](https://help.apify.com/en/articles/6999799-what-are-apify-maintained-and-community-maintained-actors)? On ownership, maintenance, features, and support
+- Step-by-step guide on how to [publish your Actor](https://docs.apify.com/platform/actors/publishing)
+- Watch our webinar on how to [build, publish and monetize Actors](https://www.youtube.com/watch?v=4nxStxC1BJM)
+- Detailed [guide on pricing models](https://docs.apify.com/platform/actors/running/actors-in-store) for Actors in Store
+
+---
+title: How to build Actors
+description: Learn how to create web scrapers and automation tools on Apify. Use universal scrapers for quick setup, code templates for a head start, or SDKs and libraries for full control.
+sidebar_position: 2
+category: apify platform
+slug: /actor-marketing-playbook/store-basics/how-to-build-actors
+---
+
+At Apify, we try to make building web scraping and automation straightforward.
You can customize our universal scrapers with JavaScript for quick tweaks, use our code templates for a rapid setup in JavaScript, TypeScript, or Python, or build from scratch with our JavaScript and Python SDKs or our Crawlee libraries for Node.js and Python for ultimate flexibility and control. This guide offers a quick overview of our tools to help you find the right fit for your needs.
+
+## Three ways to build Actors
+
+1. [Our universal scrapers](https://apify.com/store/scrapers/universal-web-scrapers) — customize our boilerplate tools to your needs with a bit of JavaScript and setup.
+2. [Our code templates](https://apify.com/templates) for web scraping projects — for a quick project setup to save you development time (includes JavaScript, TypeScript, and Python templates).
+3. Open-source libraries and SDKs
+   1. [JavaScript SDK](https://docs.apify.com/sdk/js/) & [Python SDK](https://docs.apify.com/sdk/python/) — for creating your own solution from scratch on the Apify platform using our free development kits. Involves more coding but offers infinite flexibility.
+   2. [Crawlee](https://crawlee.dev/) and [Crawlee for Python](https://crawlee.dev/python) — for creating your own solutions from scratch using our free web automation libraries. Involves even more coding but offers infinite flexibility. There’s also no need to host these on the platform.
+
+## Universal scrapers & what are they for
+
+[Universal scrapers](https://apify.com/scrapers/universal-web-scrapers) were built to provide an intuitive UI plus configuration that will help you start extracting data as quickly as possible. Usually, you just provide a [simple JavaScript function](https://docs.apify.com/tutorials/apify-scrapers/getting-started#the-page-function) and set up one or two parameters, and you're good to go.
+
+Since scraping and automation come in various forms, we decided to build not just one, but _six_ scrapers. This way, you can always pick the right tool for the job. Let's take a look at each tool and its advantages and disadvantages.
+
+| Scraper | Technology | Advantages | Disadvantages | Best for |
+| --- | --- | --- | --- | --- |
+| 🌐 Web Scraper | Headless Chrome browser | Simple, fully JavaScript-rendered pages | Executes only client-side JavaScript | Websites with heavy client-side JavaScript |
+| 👐 Puppeteer Scraper | Headless Chrome browser | Powerful Puppeteer functions, executes both server-side and client-side JavaScript | More complex | Advanced scraping with client/server-side JS |
+| 🎭 Playwright Scraper | Cross-browser support with the Playwright library | Cross-browser support, executes both server-side and client-side JavaScript | More complex | Cross-browser scraping with advanced features |
+| 🍩 Cheerio Scraper | HTTP requests + Cheerio parser (jQuery-like, for servers) | Simple, fast, cost-effective | Pages may not be fully rendered (lacks JavaScript rendering), executes only server-side JavaScript | High-speed, cost-effective scraping |
+| ⚠️ JSDOM Scraper | JSDOM library (browser-like DOM API) | Handles client-side JavaScript, faster than full-browser solutions, ideal for light scripting | Not for heavy dynamic JavaScript, executes server-side code only, depends on pre-installed NPM modules | Speedy scraping with light client-side JS |
+| 🍲 BeautifulSoup Scraper | Python-based, HTTP requests + BeautifulSoup parser | Python-based, supports recursive crawling and URL lists | No full-featured web browser, not suitable for dynamic JavaScript-rendered pages | Python users needing simple, recursive crawling |
+
+### How do I choose the right universal web scraper to start with?
+
+🎯 Decision points:
+
+- Use 🌐 [Web Scraper](https://apify.com/apify/web-scraper) if you need simplicity with full browser capabilities and client-side JavaScript rendering.
+- Use 🍩 [Cheerio Scraper](https://apify.com/apify/cheerio-scraper) for fast, cost-effective scraping of static pages with simple server-side JavaScript execution.
+- Use 🎭 [Playwright Scraper](https://apify.com/apify/playwright-scraper) when cross-browser compatibility is crucial.
+- Use 👐 [Puppeteer Scraper](https://apify.com/apify/puppeteer-scraper) for advanced, powerful scraping where you need both client-side and server-side JavaScript handling.
+- Use ⚠️ [JSDOM Scraper](https://apify.com/apify/jsdom-scraper) for lightweight, speedy scraping with minimal client-side JavaScript requirements.
+- Use 🍲 [BeautifulSoup Scraper](https://apify.com/apify/beautifulsoup-scraper) for Python-based scraping, especially with recursive crawling and processing URL lists.
+
+To make it easier, here's a short questionnaire that guides you on selecting the best scraper based on your specific use case:
+
+<details>
+<summary>Questionnaire</summary>
+
+1. Is the website content rendered with a lot of client-side JavaScript?
+   - Yes:
+     - Do you need full browser capabilities?
+       - Yes: use Web Scraper or Playwright Scraper
+       - No, but I still want advanced features: use Puppeteer Scraper
+   - No:
+     - Do you prioritize speed and cost-effectiveness?
+       - Yes: use Cheerio Scraper
+       - No: use JSDOM Scraper
+2. Do you need cross-browser support for scraping?
+   - Yes: use Playwright Scraper
+   - No: continue to the next step.
+3. Is your preferred scripting language Python?
+   - Yes: use BeautifulSoup Scraper
+   - No: continue to the next step.
+4. Are you dealing with static pages or lightweight client-side JavaScript?
+   - Static pages: use Cheerio Scraper or BeautifulSoup Scraper
+   - Light client-side JavaScript:
+     - Do you want a balance between speed and client-side JavaScript handling?
+       - Yes: use JSDOM Scraper
+       - No: use Web Scraper or Puppeteer Scraper
+5. Do you need to support recursive crawling or process lists of URLs?
+   - Yes, and I prefer Python: use BeautifulSoup Scraper
+   - Yes, and I prefer JavaScript: use Web Scraper or Cheerio Scraper
+   - No: choose based on the other criteria above.
+
+This should help you navigate through the options and choose the right scraper based on the website’s complexity, your scripting language preference, and your need for speed or advanced features.
+
+</details>
+
+📚 Resources:
+
+- How to use [Web Scraper](https://www.youtube.com/watch?v=5kcaHAuGxmY) to scrape any website
+- How to use [Beautiful Soup](https://www.youtube.com/watch?v=1KqLLuIW6MA) to scrape the web
+- Learn about our $1/month [Creator plan](https://apify.com/pricing/creator-plan) that encourages devs to build Actors based on universal scrapers
+
+## Web scraping code templates
+
+Similar to our universal scrapers, our [code templates](https://apify.com/templates) also provide a quick start for developing web scrapers, automation scripts, and testing tools. Built on popular libraries like BeautifulSoup for Python or Playwright for JavaScript, they save time on setup, allowing you to focus on customization. Though they require more coding than universal scrapers, they're ideal for those who want a flexible foundation while still needing room to tailor their solutions.
+
+| Code template | Supported libraries | Purpose | Pros | Cons |
+| --- | --- | --- | --- | --- |
+| 🐍 Python | Requests, BeautifulSoup, Scrapy, Selenium, Playwright | Creating scrapers, automation, and testing tools | Simplifies setup, supports major Python libraries | Requires more manual coding (than universal scrapers), may be restrictive for complex tasks |
+| ☕️ JavaScript | Playwright, Selenium, Cheerio, Cypress, LangChain | Creating scrapers, automation, and testing tools | Eases development with pre-set configurations, flexibility with JavaScript and TypeScript | Requires more manual coding (than universal scrapers), may be restrictive for tasks needing full control |
+
+📚 Resources:
+
+- [How to build a scraper](https://www.youtube.com/watch?v=u-i-Korzf8w) using a web scraper template.
+
+## Toolkits and libraries
+
+### Apify JavaScript and Python SDKs
+
+The [Apify SDKs](https://docs.apify.com/sdk/js/) are designed for developers who want to interact directly with the Apify platform. They let you perform tasks like saving data in Apify datasets, running Apify Actors, and accessing the key-value store. Ideal for those who are familiar with [Node.js](https://docs.apify.com/sdk/js/) or [Python](https://docs.apify.com/sdk/python/), the SDKs provide the tools needed to develop software specifically for the Apify platform, offering complete freedom and flexibility within their respective ecosystems.
+
+- _Best for_: interacting with the Apify platform (e.g., saving data, running Actors, etc.)
+- _Pros_: full control over platform-specific operations, integrates seamlessly with Apify services
+- _Cons_: requires writing boilerplate code, higher complexity with more room for errors
+
+### Crawlee
+
+[Crawlee](https://crawlee.dev/) (for both Node.js and [Python](https://crawlee.dev/python)) is a powerful web scraping library that focuses on tasks like extracting data from web pages, automating browser interactions, and managing complex scraping workflows. Unlike the Apify SDK, Crawlee does not require the Apify platform and can be used independently for web scraping tasks. It handles complex operations like concurrency management, auto-scaling, and request queuing, allowing you to concentrate on the actual scraping tasks.
+
+- _Best for_: web scraping and automation (e.g., scraping paragraphs, automating clicks)
+- _Pros_: full flexibility in web scraping tasks, does not require the Apify platform, leverages the JavaScript ecosystem
+- _Cons_: requires more setup and coding, higher chance of mistakes with complex operations
+
+### Combining Apify SDK and Crawlee
+
+While these tools are distinct, they can be combined. For example, you can use Crawlee to scrape data from a page and then use the Apify SDK to save that data in an Apify dataset. This integration allows developers to make use of the strengths of both tools while working within the Apify ecosystem.
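+
+As a rough sketch of how such a combination can look inside a single Actor (an illustrative example, not the only way to wire the two together):
+
+```js
+import { Actor } from 'apify';
+import { CheerioCrawler } from 'crawlee';
+
+await Actor.init();
+
+const crawler = new CheerioCrawler({
+    async requestHandler({ request, $ }) {
+        // Crawlee handles the crawling and parsing...
+        const title = $('title').text();
+        // ...and the Apify SDK saves the result to the run's default dataset.
+        await Actor.pushData({ url: request.url, title });
+    },
+});
+
+await crawler.run(['https://crawlee.dev']);
+await Actor.exit();
+```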
+
+📚 Resources:
+
+- Introduction to [Crawlee](https://www.youtube.com/watch?v=g1Ll9OlFwEQ)
+- Crawlee [blog](https://crawlee.dev/blog)
+- Webinar on scraping with [Crawlee 101](https://www.youtube.com/watch?v=iAk1mb3v5iI): how to create scrapers in JavaScript and TypeScript
+- Step-by-step video guide: [building an Amazon Scraper](https://www.youtube.com/watch?v=yTRHomGg9uQ) in Node.js with Crawlee
+- Webinar on how to use [Crawlee Python](https://www.youtube.com/watch?v=ip8Ii0eLfRY)
+- Introduction to Apify's [Python SDK](https://www.youtube.com/watch?v=C8DmvJQS3jk)
+
+## Code templates vs. universal scrapers vs. libraries
+
+Basically, the choice here depends on how much flexibility you need and how much coding you're willing to do. More flexibility → more coding.
+
+[Universal scrapers](https://apify.com/scrapers/universal-web-scrapers) are simple to set up but are less flexible and configurable. Our [libraries](https://crawlee.dev/), on the other hand, enable the development of a standard [Node.js](https://nodejs.org/) or Python application, so be prepared to write a little more code. The reward for that is almost infinite flexibility.
+
+[Code templates](https://apify.com/templates) are sort of a middle ground between scrapers and libraries. But since they are built on libraries, they still sit closer to the more-coding end of the spectrum: they only give you starter code to begin with. Please take this into account when choosing the way to build your scraper, and if in doubt — just ask us, and we'll help you out.
+
+## Switching sides: how to transfer an existing solution from another platform
+
+You can also take advantage of the Apify platform's features without having to modify your existing scraping or automation solutions.
+
+### Integrating Scrapy spiders
+
+The Apify platform fully supports Scrapy spiders. By [deploying your existing Scrapy code to Apify](https://apify.com/run-scrapy-in-cloud), you can take advantage of features like scheduling, monitoring, scaling, and API access, all without needing to modify your original spider. This process is made easy with the [Apify CLI](https://docs.apify.com/cli/), which allows you to convert your Scrapy spider into an Apify Actor with just a few commands (see the sketch below). Once deployed, your spider can run in the cloud, offering a reliable and scalable solution for your web scraping needs.
+
+Additionally, you can monetize your spiders by [publishing them as Actors](https://apify.com/partners/actor-developers) on Apify Store, potentially earning passive income from your work while benefiting from the platform’s extensive features.
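+
+A rough sketch of that flow, assuming you have the Apify CLI installed and are inside an existing Scrapy project (the exact prompts may differ between CLI versions):
+
+```shell
+# Wrap the Scrapy project as an Actor (the CLI should detect scrapy.cfg)
+apify init
+
+# Build and deploy the new Actor to the Apify platform
+apify push
+```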
+
+### ScrapingBee, ScrapingAnt, ScraperAPI
+
+To make the transition from these platforms easier, we've also created [SuperScraper API](https://apify.com/apify/super-scraper-api). SuperScraper API is an open-source REST API for scraping websites: you pass it a URL and receive the rendered HTML content in return. This service functions as a cost-effective alternative to other scraping services like ScrapingBee, ScrapingAnt, and ScraperAPI. It supports dynamic content rendering with a headless browser, can use various proxies to avoid blocking, and offers features such as capturing screenshots of web pages. Thanks to its scalable nature, it is ideal for large-scale scraping tasks.
+
+To use SuperScraper API, you deploy it with an Apify API token and access it via HTTP requests. The API supports multiple parameters for fine-tuning your scraping tasks, including options for rendering JavaScript, waiting for specific elements, and handling cookies and proxies. It also allows for custom data extraction rules and JavaScript execution on the scraped pages. Pricing is based on actual usage, which can be cheaper or more expensive than competitors, depending on the configuration.
+
+📚 Resources:
+
+- [How to integrate Scrapy projects](https://docs.apify.com/cli/docs/integrating-scrapy)
+- Scrapy monitoring: how to [manage your Scrapy spider on Apify](https://blog.apify.com/scrapy-monitoring-spidermon/)
+- Run ScrapingBee, ScraperAPI, and ScrapingAnt on Apify — [SuperScraper API Tutorial](https://www.youtube.com/watch?v=YKs-I-2K1Rg)
+
+## General resources
+
+- Creating your Actor: [Actor sources](https://docs.apify.com/academy/getting-started/creating-actors)
+- Use it, build it or buy it? [Choosing the right solution on Apify](https://help.apify.com/en/articles/3024655-choosing-the-right-solution)
+- How to programmatically retrieve data with the [Apify API](https://www.youtube.com/watch?v=ViYYDHSBAKM&t=0s)
+- Improved way to [build your scrapers from a Git repo](https://www.youtube.com/watch?v=8QJetr-BYdQ)
+- Webinar on [how to build and monetize Actors](https://www.youtube.com/watch?v=4nxStxC1BJM) on Apify Store
+- 6 things you should know before buying or [building a web scraper](https://blog.apify.com/6-things-to-know-about-web-scraping/)
+- For a comprehensive guide on creating your first Actor, visit the [Apify Academy](https://docs.apify.com/academy).
+ + +--- +title: Ideas page and its use +description: Learn where you can draw inspiration for your Actors. +sidebar_position: 4 +category: apify platform +slug: /actor-marketing-playbook/store-basics/ideas-page +--- + +So you want to build an Actor and publish it on Apify Store. Where should you start? How can you make people want to use it? + +To generate new Actor ideas, you can draw from your experience. You can also use SEO tools to discover relevant search terms and explore sites related to web scraping, automation, or integrations. But for direct inspiration straight from Apify, check out our Actor [Ideas page](https://apify.com/ideas) to see what data extraction tools are trending in the Apify community. Let's see how you can both use and contribute to this valuable resource. + +--- + +## What's the Ideas page? + +The [Ideas page](https://apify.com/ideas) is where users can submit and explore potential projects for Actors, including scrapers, integrations, and automations. It serves as a collaborative space for proposing new tool ideas and finding inspiration for building and developing web scraping and automation solutions. + +## How you, as a developer, can use the Ideas page + +Got an innovative Actor idea or unsure what to build next? The Apify Ideas page is your go-to destination for submitting, developing, and claiming Actor concepts. If you're a developer ready to build an Actor using the Apify Ideas page, here’s how you can get involved: + +1. _Browse the Ideas page_
+ Check out the [Ideas page](https://apify.com/ideas) to find ideas that interest you. Look for ideas that align with your skills and the kind of Actor you want to build. +2. _Select an idea_
+ Once you’ve found a promising idea, review the details and requirements provided. If you see an idea you want to develop, make sure to check its current status. If it’s marked as **Open to develop**, you’re good to go. +3. _Develop your Actor_
+ Start building your Actor based on the idea. You don’t need to notify Apify about your development process. Focus on creating a functional and well-documented tool. +4. _Prepare for launch_
+ Once your Actor is ready, ensure it meets all quality standards and has a comprehensive README. This documentation should include installation instructions, usage details, and any other relevant information. +5. _Publish your Actor_
+ Deploy your Actor on Apify Store. Make sure it’s live and accessible for users. +6. _Claim your idea_
+ After your Actor is published, email [ideas@apify.com](mailto:ideas@apify.com) with the URL of your Actor and the original idea. This will allow us to tag the idea as Completed and link it to your new Actor, giving you credit and visibility. +7. _Monitor and optimize_
+ Make sure to monitor your Actor’s performance and user feedback. Use this information to make improvements and keep your Actor up to date. + +By following these steps, you’ll be able to contribute to the community while also gaining recognition for your work. + +## Criteria for claiming an idea + +To claim an idea, ensure that: + +1. Your Actor is functional. +2. Your README contains relevant information. +3. Your Actor closely aligns with the original idea. + +## Giving back to the Ideas page + +The Ideas page at Apify offers a variety of concepts for scrapers, integrations, and automations, and is a great place to find inspiration or solutions. It’s also a platform where you can contribute your own ideas to drive innovation and growth in our community. + +1. _Submit your Ideas_
+ Got a great Actor concept? Share it with us through the [Ideas form](https://apify.typeform.com/to/BNON8poB#source=ideas). Provide clear details about what your tool should do and how it should work. +2. _Engage with the community_
+ Upvote ideas you find intriguing. The more support an idea receives, the more likely it is to catch a developer’s eye and move forward. +3. _Don’t forget to claim your idea_
+   Once your Actor is up and running, claim your idea by emailing [ideas@apify.com](mailto:ideas@apify.com) with your Actor's URL and the original idea. We’ll mark your idea as **Completed** and link it to your Actor - a signal to other developers that this tool already exists on Apify Store.
+
+## Multiple developers for one idea
+
+No problem! Apify Store can host multiple Actors with similar functions. However, we go by the “first come, first served” rule, so the first developer to claim an idea will receive the **Completed** tag and a link from the Ideas page.
+
+Remember that Apify Store is just like any other marketplace. We believe that competition helps developers thrive and improve their code, especially when there are similar scrapers on the horizon! You can still build the Actor, but try to be imaginative when it comes to its set of features.
+ + +--- +title: Actor marketing playbook +description: Learn how to optimize and monetize your Actors on Apify Store by sharing them with other platform users. +sidebar_position: 10 +category: apify platform +slug: /actor-marketing-playbook +--- + +**Learn how to optimize and monetize your Actors on Apify Store by sharing them with other platform users.** + +--- + +import Card from '@site/src/components/Card'; +import CardGrid from '@site/src/components/CardGrid'; + +[Apify Store](https://apify.com/store) is a marketplace featuring thousands of ready-made automation tools called Actors. As a developer, you can publish your own Actors and generate revenue through our [monetization program](https://apify.com/partners/actor-developers). + +To help you succeed, we've created a comprehensive Actor marketing playbook. You'll learn how to: + +- Optimize your Actor's visibility on Apify Store +- Create compelling descriptions and documentation +- Build your developer brand +- Promote your work to potential customers +- Analyze performance metrics +- Engage with the Apify community + +## Apify Store basics + + + + + + + + + +## Actor basics + + + + + + + + + +## Promoting your Actor + + + + + + + + + + + +## Interacting with users + + + + + + + +## Product optimization + + + + + +
+Ready to grow your presence on the Apify platform? Check out our guide to [publishing your first Actor](/platform/actors/publishing). +
+ + +--- +title: Monetizing your Actor +description: Learn how you can monetize your web scraping and automation projects by publishing Actors to users in Apify Store. +sidebar_position: 5 +slug: /get-most-of-actors/monetizing-your-actor +unlisted: true +--- + +**Learn how you can monetize your web scraping and automation projects by publishing Actors to users in Apify Store.** + +--- + +When you publish your Actor on the Apify platform, you have the option to make it a _Paid Actor_ and earn revenue from users who benefit from your tool. You can choose between two pricing models: + +- Rental +- Pay-per-result + +## Rental pricing model + +With the rental model, you can specify a free trial period and a monthly rental price. After the trial, users with an [Apify paid plan](https://apify.com/pricing) can continue using your Actor by paying the monthly fee. You can receive 80% of the total rental fees collected each month. + +
+<details>
+<summary>Example - rental pricing model</summary>
+
+You make your Actor rental with a 7-day free trial and then $30/month. During the first calendar month, three users start to use your Actor:
+
+1. The first user, on an Apify paid plan, starts the free trial on the 15th
+2. The second user, on an Apify paid plan, starts the free trial on the 25th
+3. The third user, on an Apify free plan, starts the free trial on the 20th
+
+The first user pays their first rent seven days after starting the free trial, i.e., on the 22nd. The second user only starts paying the rent next month. The third user is on an Apify free plan, so after the free trial ends on the 27th, they are not charged and cannot use the Actor further until they get a paid plan. Your profit is computed only from the first user. They were charged $30, so 80% of this goes to you, i.e., _0.8 * 30 = $24_.
+</details>
+
+## Pay-per-result pricing model
+
+In this model, you set a price per 1,000 results. Users are charged based on the number of results your Actor produces. Your profit is calculated as 80% of the revenue minus the platform usage costs. The formula is:
+
+`(0.8 * revenue) - costs = profit`
+
+### Pay-per-result unit pricing for cost computation
+
+| Service | Unit price |
+|:--------------------------------|:---------------------------|
+| Compute unit | **$0.4** / CU |
+| Residential proxies | **$13** / GB |
+| SERPs proxy | **$3** / 1,000 SERPs |
+| Data transfer - external | **$0.20** / GB |
+| Data transfer - internal | **$0.05** / GB |
+| Dataset - timed storage | **$1.00** / 1,000 GB-hours |
+| Dataset - reads | **$0.0004** / 1,000 reads |
+| Dataset - writes | **$0.005** / 1,000 writes |
+| Key-value store - timed storage | **$1.00** / 1,000 GB-hours |
+| Key-value store - reads | **$0.005** / 1,000 reads |
+| Key-value store - writes | **$0.05** / 1,000 writes |
+| Key-value store - lists | **$0.05** / 1,000 lists |
+| Request queue - timed storage | **$4.00** / 1,000 GB-hours |
+| Request queue - reads | **$0.004** / 1,000 reads |
+| Request queue - writes | **$0.02** / 1,000 writes |
+
+Only revenue & costs for Apify customers on paid plans are taken into consideration when computing your profit. Users on free plans are not reflected there, although you can see statistics about the potential revenue from users currently on free plans in Actor Insights in the Apify Console.
+
+:::note What are Gigabyte-hours?
+
+Gigabyte-hours (GB-hours) are a unit of measurement used to quantify data storage and processing capacity over time. To calculate GB-hours, multiply the amount of data in gigabytes by the number of hours it's stored or processed.
+
+For example, if you host 50 GB of data for 30 days:
+
+- Convert days to hours: _30 * 24 = 720_
+- Multiply data size by hours: _50 * 720 = 36,000_
+
+This means that storing 50 GB of data for 30 days results in 36,000 GB-hours.
+:::
+
+Read more about Actors in the Store and the different pricing models from the perspective of your users in the [Store documentation](https://docs.apify.com/platform/actors/running/actors-in-store).
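+
+As a quick sanity check of the formula and the GB-hours math above, here's a tiny illustrative snippet (the numbers are made up, not a pricing recommendation):
+
+```js
+// profit = (0.8 * revenue) - costs
+const profit = (revenue, costs) => 0.8 * revenue - costs;
+
+// E.g., $100 of revenue with $15 of platform usage costs:
+console.log(profit(100, 15)); // 65
+
+// GB-hours, as in the note above: 50 GB stored for 30 days
+const gbHours = 50 * 30 * 24; // 36,000 GB-hours
+// Dataset timed storage at $1.00 / 1,000 GB-hours:
+console.log(gbHours * (1.0 / 1000)); // $36
+```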
+
+<details>
+<summary>Example - pay-per-result pricing model</summary>
+
+You make your Actor pay-per-result and set the price at $1/1,000 results. During the first month, two users on Apify paid plans use your Actor to get 50,000 and 20,000 results, costing them $50 and $20, respectively. Let's say the underlying platform usage for the first user is $5 and for the second $2. A third user, this time on an Apify free plan, uses the Actor to get 5,000 results, with underlying platform usage of $0.5.
+
+Your profit is computed only from the first two users, since they are on Apify paid plans. The revenue from the first user is $50 and from the second $20, i.e., the total revenue is $70. The total underlying cost is _$5 + $2 = $7_. Since your profit is 80% of the revenue minus the cost, it comes to _0.8 * 70 - 7 = $49_.
+</details>
+
+### Best practices for pay-per-result Actors
+
+To ensure profitable operation:
+
+- Set memory limits in your [`actor.json`](https://docs.apify.com/platform/actors/development/actor-definition/actor-json) file to control platform usage costs
+- Implement the `ACTOR_MAX_PAID_DATASET_ITEMS` check to prevent excess result generation (see the sketch below)
+- Test your Actor with various result volumes to determine optimal pricing
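+
+Here's a minimal sketch of what that check can look like in a JavaScript Actor. It assumes the platform passes the cap via the `ACTOR_MAX_PAID_DATASET_ITEMS` environment variable mentioned above; the scraping loop itself is a placeholder:
+
+```js
+import { Actor } from 'apify';
+
+await Actor.init();
+
+// Results pushed beyond this cap won't be paid for, so stop early.
+const maxPaidItems = Number(process.env.ACTOR_MAX_PAID_DATASET_ITEMS) || Infinity;
+
+const scrapedItems = [{ id: 1 }, { id: 2 }]; // placeholder for your real scraping loop
+let pushed = 0;
+
+for (const item of scrapedItems) {
+    if (pushed >= maxPaidItems) break; // don't generate results nobody pays for
+    await Actor.pushData(item);
+    pushed += 1;
+}
+
+await Actor.exit();
+```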
+
+## Setting up monetization
+
+Navigate to your [Actor page](https://console.apify.com/actors?tab=my) in Apify Console, choose the Actor that you want to monetize, and select the **Publication** tab.
+![Monetization section](./images/monetization-section.png)
+Open the **Monetization** section and complete your billing and payment details.
+![Set up monetization](./images/monetize_actor_set_up_monetization.png)
+Follow the monetization wizard to configure your pricing model.
+![Monetization wizard](./images/monetization_wizard.png)
+
+### Changing monetization
+
+You can change the monetization setting of your Actor by using the same wizard as for the setup, in the **Monetization** section of your Actor's **Publication** tab. Any changes made to an already published Actor will take _14 days_ to come into effect, so that the users of your Actor have time to prepare.
+
+:::important Frequency of monetization adjustments
+
+Be aware that you can change the monetization setting of each Actor only once per month. For further information & guidelines, please refer to our [Terms & Conditions](https://apify.com/store-terms-and-conditions).
+
+:::
+
+## Payouts & analytics
+
+Payout invoices are generated automatically on the 14th of each month. Review your invoice in the **Settings > Payout** section within one week. If it's not approved by the 20th, the system will auto-approve it on the 21st.
+
+Track your Actor's performance through:
+
+- The payout section for financial records
+- Actor analytics for usage statistics
+
+  ![Actor analytics](./images/actor_analytics.png)
+
+- Individual Actor Insights for detailed performance metrics
+
+  ![Actor insights](./images/actor-insights.png)
+
+## Promoting your Actor
+
+Create SEO-optimized descriptions and README files to improve search engine visibility. Share your Actor on multiple channels:
+
+- Post on Reddit, Quora, and social media platforms
+- Create tutorial videos demonstrating key features
+- Publish articles about your Actor on relevant websites
+- Consider creating a product showcase on platforms like Product Hunt
+
+Remember to tag Apify in your social media posts for additional exposure. Effective promotion can significantly impact your Actor's success, often making the difference between Actors with many paid users and those with few to none.
+
+Learn more about promoting your Actor from [Apify's Marketing Playbook](https://apify.notion.site/3fdc9fd4c8164649a2024c9ca7a2d0da?v=6d262c0b026d49bfa45771cd71f8c9ab).
+
+---
+title: Actors
+description: What is an Actor? How do we create them? Learn the basics of what Actors are, how they work, and try out an Actor yourself right on the Apify platform!
+sidebar_position: 1
+slug: /getting-started/actors
+---
+
+# Actors {#actors}
+
+**What is an Actor? How do we create them? Learn the basics of what Actors are, how they work, and try out an Actor yourself right on the Apify platform!**
+
+---
+
+After you've followed the **Getting started** lesson, you're almost ready to start creating some Actors! But before we get into that, let's discuss what an Actor is, and a bit about how they work.
+
+## What's an Actor? {#what-is-an-actor}
+
+When you deploy your script to the Apify platform, it is then called an **Actor**, which is a [serverless microservice](https://www.datadoghq.com/knowledge-center/serverless-architecture/serverless-microservices/#:~:text=Serverless%20microservices%20are%20cloud-based,suited%20for%20microservice-based%20architectures.) that accepts an input and produces an output. Actors can run for a few seconds, hours, or even indefinitely. An Actor can perform anything from a basic action such as filling out a web form or sending an email, to complex operations such as crawling an entire website and removing duplicates from a large dataset.
+
+Once an Actor has been pushed to the Apify platform, it can be shared with the world through [Apify Store](https://apify.com/store), and even monetized after going public.
+
+> Though the majority of Actors currently on the Apify platform are scrapers, crawlers, or automation software, Actors are not limited to scraping. They can be any program running in a Docker container.
+
+## Actors on the Apify platform {#actors-on-platform}
+
+For a quick-and-dirty understanding of what a published Actor looks like, and how it works, let's run an SEO audit of **apify.com** using the [SEO audit Actor](https://apify.com/misceres/seo-audit-tool).
+
+On the front page of the Actor, click the green **Try for free** button. If you're logged into the Apify account which you created during the [**Getting started**](./index.md) lesson, you'll be taken to Apify Console and greeted with a page that looks like this:
+
+![Actor configuration](./images/seo-actor-config.png)
+
+This is where we can provide input to the Actor. The defaults here are just fine, so we'll leave it as is and click the green **Start** button to run it. While the Actor is running, you'll see it log some information about itself.
+
+![Actor logs](./images/actor-logs.jpg)
+
+After the Actor has completed its run (you'll know this when you see **SEO audit for apify.com finished.** in the logs), the results of the run can be viewed by clicking the **Results** tab, then subsequently the **View in another tab** option under **Export**.
+
+## The "Actors" tab {#actors-tab}
+
+While still on the platform, click on the tab with the **< >** icon which says **Actors**. This tab is your one-stop shop for seeing which Actors you've used recently, and which ones you've developed yourself. You will be frequently using this tab when developing and testing on the Apify platform.
+
+![The "Actors" tab on the Apify platform](./images/actors-tab.jpg)
+
+Now that you know the basics of what Actors are and how to use them, it's time to develop **an Actor of your own**!
+
+## Next up {#next}
+
+Get ready, because in the [next lesson](./creating_actors.md), you'll be writing your very own Actor!
+
+---
+title: Apify API
+description: Learn how to use the Apify API to programmatically call your Actors, retrieve data stored on the platform, view Actor logs, and more!
+sidebar_position: 4
+slug: /getting-started/apify-api
+---
+
+# The Apify API {#the-apify-api}
+
+**Learn how to use the Apify API to programmatically call your Actors, retrieve data stored on the platform, view Actor logs, and more!**
+
+---
+
+[Apify's API](/api/v2#/reference) is your ticket to the Apify platform without even needing to access the [Apify Console](https://console.apify.com?asrc=developers_portal) web interface. The API is organized around RESTful HTTP endpoints.
+
+In this lesson, we'll be learning how to use the Apify API to call an Actor and view its results. We'll be using the Actor we created in the previous lesson, so if you haven't already gotten that one set up, go ahead and do that before moving forward if you'd like to follow along.
+
+## Finding your endpoint {#finding-your-endpoint}
+
+Within one of your Actors on the [Apify Console](https://console.apify.com?asrc=developers_portal) (we'll use the **adding-actor** from the previous lesson), click on the **API** button in the top right-hand corner:
+
+![The "API" button on an Actor's page on the Apify Console](./images/api-tab.jpg)
+
+You should see a long list of API endpoints that you can copy and paste elsewhere, or even test right within the **API** modal. Go ahead and copy the endpoint labeled **Run Actor synchronously and get dataset items**. It should look something like this:
+
+```text
+https://api.apify.com/v2/acts/YOUR_USERNAME~adding-actor/run-sync?token=YOUR_TOKEN
+```
+
+> In this lesson, we'll only be focusing on this one endpoint, as it is the most popularly used one; however, don't let this limit your curiosity! Take a look at the other endpoints in the **API** window to learn about everything you can do to your Actor programmatically.
+
+Now, let's move over to our favorite HTTP client (in this lesson we'll use [Insomnia](../../glossary/tools/insomnia.md) to prepare and send the request).
+
+## Providing input {#providing-input}
+
+Our **adding-actor** takes in two input values (`num1` and `num2`). When using the Actor on the platform, you provide these fields either through the UI generated by the **INPUT_SCHEMA.json**, or directly in JSON format. When providing input while making an API call to run an Actor, the input must be provided in the **body** of the POST request as a JSON object.
+
+![Providing input](./images/provide-input.jpg)
+
+## Parameters {#parameters}
+
+Let's say we want to run our **adding-actor** via API and view its results in CSV format at the end. We'll achieve this by passing the **format** parameter with a value of **csv** to change the output format:
+
+```text
+https://api.apify.com/v2/acts/YOUR_USERNAME~adding-actor/run-sync-get-dataset-items?token=YOUR_TOKEN_HERE&format=csv
+```
+
+Additional parameters can be passed to this endpoint. You can learn about them [here](/api/v2#/reference/actors/run-actor-synchronously-and-get-dataset-items/run-actor-synchronously-with-input-and-get-dataset-items).
+
+> Network components can record visited URLs, so it's more secure to send the token as an HTTP header, not as a parameter. The header should look like `Authorization: Bearer YOUR_TOKEN`. Popular HTTP clients, such as [Postman](../../glossary/tools/postman.md) or [Insomnia](../../glossary/tools/insomnia.md), provide a convenient way to configure the Authorization header for all your API requests.
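+
+For illustration, here's what that same call might look like with the token sent as a header instead of a URL parameter (a sketch using Node.js 18+'s built-in `fetch`; the username and token are placeholders):
+
+```js
+const response = await fetch(
+    'https://api.apify.com/v2/acts/YOUR_USERNAME~adding-actor/run-sync-get-dataset-items?format=csv',
+    {
+        method: 'POST',
+        headers: {
+            'Content-Type': 'application/json',
+            // Token sent as a header rather than in the URL
+            'Authorization': 'Bearer YOUR_TOKEN',
+        },
+        body: JSON.stringify({ num1: 1, num2: 8 }),
+    },
+);
+
+console.log(await response.text()); // the dataset items in CSV format
+```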
+
+## Sending the request {#sending-the-request}
+
+If you're not using an HTTP client, you can send the request through your terminal with this command:
+
+```curl
+curl -d '{"num1":1, "num2":8}' -H "Content-Type: application/json" -X POST "https://api.apify.com/v2/acts/YOUR_USERNAME~adding-actor/run-sync-get-dataset-items?token=YOUR_TOKEN_HERE&format=csv"
+```
+
+Here's the response we got:
+
+![API response](./images/api-csv-response.png)
+
+And there it is! The Actor was run with our inputs of **num1** and **num2**, then the dataset results were returned back to us in CSV format.
+
+## Apify API's many features {#api-many-features}
+
+What we've done in this lesson only scratches the surface of what the Apify API can do. Right from Insomnia, or from any HTTP client, you can [manage datasets](/api/v2#/reference/datasets/dataset/get-dataset) and [key-value stores](/api/v2#/reference/key-value-stores/key-collection/get-dataset), [add to request queues](/api/v2#/reference/request-queues/queue-collection/add-request), [update Actors](/api/v2#/reference/actors/actor-object/update-actor), and much more! Basically, whatever you can do on the platform's web interface, you can also do through the API.
+
+## Next up {#next}
+
+[Next up](./apify_client.md), we'll be learning about how to use Apify's JavaScript and Python clients to interact with the API right within our code.
+
+---
+title: Apify client
+description: Interact with the Apify API in your code by using the apify-client package, which is available for both JavaScript and Python.
+sidebar_position: 5
+slug: /getting-started/apify-client
+---
+
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# Apify client {#apify-client}
+
+**Interact with the Apify API in your code by using the apify-client package, which is available for both JavaScript and Python.**
+
+---
+
+Now that you've gotten your toes wet with interacting with the Apify API through raw HTTP requests, you're ready to become familiar with the **Apify client**, which is a package available for both JavaScript and Python that allows you to interact with the API in your code without explicitly needing to make any GET or POST requests.
+
+This lesson will provide code examples for both Node.js and Python, so regardless of the language you are using, you can follow along!
+
+## Examples {#examples}
+
+You can access `apify-client` examples on the Console Actor detail page. Click the **API** button and then the **API Client** dropdown button.
+
+![API button](./images/api-button.png)
+
+## Installing and importing {#installing-and-importing}
+
+If you are going to use the client in Node.js, use this command within one of your projects to install the package through npm:
+
+```shell
+npm install apify-client
+```
+
+In Python, you can install it from PyPI with this command:
+
+```shell
+pip install apify-client
+```
+
+After installing the package, let's make a file named **client** and import the Apify client like so:
+
+```js
+// client.js
+import { ApifyClient } from 'apify-client';
+```
+
+```py
+# client.py
+from apify_client import ApifyClient
+```
+
+## Running an Actor {#running-an-actor}
+
+In the last lesson, we ran the **adding-actor** and retrieved its dataset items. That's exactly what we're going to do now; however, by using the Apify client instead.

Before we can use the client, though, we must create a new instance of the `ApifyClient` class and pass it our API token from the [**Integrations** page](https://console.apify.com/account?tab=integrations&asrc=developers_portal) on the Apify Console:

```js
const client = new ApifyClient({
    token: 'YOUR_TOKEN',
});
```

```py
client = ApifyClient(token='YOUR_TOKEN')
```

> If you are planning on publishing your code to a public GitHub/GitLab repository or anywhere else online, be sure to set your API token as an environment variable, and never hardcode it directly into your script.

Now that we've got our instance, we can point to an Actor using the [`client.actor()`](/api/client/js/reference/class/ApifyClient#actor) function, then call the Actor with some input using the [`.call()`](/api/client/js/reference/class/ApifyClient#actor) function, whose first parameter is the input for the Actor.

```js
const run = await client.actor('YOUR_USERNAME/adding-actor').call({
    num1: 4,
    num2: 2,
});
```

```py
run = client.actor('YOUR_USERNAME/adding-actor').call(run_input={
    'num1': 4,
    'num2': 2
})
```

> Learn more about the `.call()` function [here](/api/client/js/reference/class/ApifyClient#actor).

## Downloading dataset items {#downloading-dataset-items}

Once an Actor's run has completed, it will return a **run info** object that looks something like this:

![Run info object](./images/run-info.jpg)

The `run` variable we created in the last section points to the **run info** object of the run we created with the `.call()` function, which means that through this variable, we can access the run's `defaultDatasetId`. This ID can then be passed into the `client.dataset()` function.

```js
const dataset = client.dataset(run.defaultDatasetId);
```

```py
dataset = client.dataset(run['defaultDatasetId'])
```

Finally, we can download the items in the dataset by using the **list items** function, then log them to the console.

```js
const { items } = await dataset.listItems();

console.log(items);
```

```py
items = dataset.list_items().items

print(items)
```

The final code for running the Actor and fetching its dataset items looks like this:

```js
// client.js
import { ApifyClient } from 'apify-client';

const client = new ApifyClient({
    token: 'YOUR_TOKEN',
});

const run = await client.actor('YOUR_USERNAME/adding-actor').call({
    num1: 4,
    num2: 2,
});

const dataset = client.dataset(run.defaultDatasetId);

const { items } = await dataset.listItems();

console.log(items);
```

```py
# client.py
from apify_client import ApifyClient

client = ApifyClient(token='YOUR_TOKEN')

run = client.actor('YOUR_USERNAME/adding-actor').call(run_input={
    'num1': 4,
    'num2': 2
})

dataset = client.dataset(run['defaultDatasetId'])

items = dataset.list_items().items

print(items)
```

## Updating an Actor {#updating-actor}

If you check the **Settings** tab within your **adding-actor**, you'll notice that the default memory being allocated to the Actor is **2048 MB**. This is overkill considering that the Actor is only adding two numbers together; **256 MB** would be much more reasonable. Also, we can safely say that the run should never take more than 20 seconds (even this is a generous number), so the default timeout of 3600 seconds is also overkill.
+ +Let's change these two Actor settings via the Apify client using the [`actor.update()`](/api/client/js/reference/class/ActorClient#update) function. This function will call the **update Actor** endpoint, which can take `defaultRunOptions` as an input property. You can find the shape of the `defaultRunOptions` in the [API documentation](/api/v2#/reference/actors/actor-object/update-actor). Perfect! + +First, we'll create a pointer to our Actor, similar to before (except this time, we won't be using `.call()` at the end): + + + + +```js +const actor = client.actor('YOUR_USERNAME/adding-actor'); +``` + + + + +```py +actor = client.actor('YOUR_USERNAME/adding-actor') + +``` + + + + +Then, we'll call the `.update()` method on the `actor` variable we created and pass in our new **default run options**: + + + + +```js +await actor.update({ + defaultRunOptions: { + build: 'latest', + memoryMbytes: 256, + timeoutSecs: 20, + }, +}); +``` + + + + +```py +actor.update(default_run_build='latest', default_run_memory_mbytes=256, default_run_timeout_secs=20) + +``` + + + + +After running the code, go back to the **Settings** page of **adding-actor**. If your default options now look like this, then it worked!: + +![New run defaults](./images/new-defaults.jpg) + +## Overview {#overview} + +You can do so much more with the Apify client than running Actors, updating Actors, and downloading dataset items. The purpose of this lesson was to get you comfortable using the client in your own projects, as it's the absolute best developer tool for integrating the Apify platform with an external system. + +For a more in-depth understanding of the Apify API client, give these a quick lookover: + +- [API client for JavaScript](/api/client/js) +- [API client for Python](/api/client/python) + +## Next up {#next} + +Now that you're familiar and a bit more comfortable with the Apify platform, you're ready to start deploying your code to Apify! In the [next section](../deploying_your_code/index.md), you'll learn how to take any project written in any programming language and turn it into an Actor. + + + +--- +title: Creating Actors +description: Build and run your very first Actor directly in Apify Console from a template. This lesson provides hands-on experience with building and running Actors. +sidebar_position: 2 +slug: /getting-started/creating-actors +--- + +# Creating Actors {#creating-actors} + +**This lesson offers hands-on experience in building and running Actors in Apify Console using a template. By the end of it, you will be able to build and run your first Actor using an Actor template.** + +--- + +You can create an Actor in several ways. You can create one from your own source code hosted in a Git repository or in your local machine, for example. But in this tutorial, we'll focus on the easiest method: selecting an Actor code template. We don't need to install any special software, and everything can be done directly in Apify Console using an Apify account. + +## Choose the source {#choose-the-source} + +Once you're in Apify Console, go to [Development](https://console.apify.com/actors/development/my-actors), and click on the **Develop new** button in the top right-hand corner. + +![Develop an Actor button](./images/develop-new-actor.png) + +You'll be presented with a page featuring two ways to get started with a new Actor. + +1. Creating an Actor from existing source code (using Git providers or pushing the code from your local machine using Apify CLI) +2. 
Creating an Actor from a code template

| Existing source code | Code templates |
|:---------------------------------------------------------------------------------:|:--------------------------------------------------------------------------------:|
| ![Create an Actor from source code](./images/create-actor-from-source-code.png) | ![Create an Actor from code templates](./images/create-actor-from-templates.png) |

## Creating Actor from existing source code {#existing-source-code}

If you already have your code hosted by a Git provider, you can use it to create an Actor by linking the repository. If you use GitHub, you can use our [GitHub integration](/platform/integrations/github) to create an Actor from your public or private repository. You can also use GitLab, Bitbucket, or other Git providers or external repositories.

![Create an Actor from Git repository](./images/create-actor-git.png)

You can also push your existing code from your local machine using the [Apify CLI](/cli). This is useful when you develop your code locally and then want to push it to the Apify Console to run it as an Actor in the cloud. For this option, you'll need the [Apify CLI installed](/cli/docs/installation) on your machine. By clicking on the **Push your code using the Apify command-line interface (CLI)** button, you will be presented with instructions on how to push your code to the Apify Console.

![Push your code using the Apify CLI](./images/create-actor-cli.png)

## Creating Actor from code template {#code-template}

Python, JavaScript, and TypeScript have several template options that you can use.

> You can select one from the list on this page, or you can browse all the templates in the template library by clicking on the **View all templates** button in the right corner.

For example, let's choose the **Start with JavaScript** template and click on the template card.

![JavaScript template card](./images/create-actor-template-javascript-card.png)

You will end up on a template detail page where you can see all the important information about the template: its description, included features, used technologies, and the use case of this template. More importantly, there is a code preview and also instructions for how the code works.

![JavaScript template detail page](./images/create-actor-template-detail-page.png)

### Using the template in the Web IDE {#web-ide}

By clicking the **Use this template** button, you will create the Actor in Apify Console and be moved to the **Code** tab with the [Web IDE](/platform/actors/development/quick-start/web-ide), where you can see the code of the template and start editing it.

> The Web IDE is a great tool for developing your Actor directly in Apify Console without the need to install or use any other software.

![Web IDE](./images/create-actor-web-ide.png)

### Using the template locally {#local}

If you want to use the template locally, you can again use our [Apify CLI](/cli) to download the template to your local machine.

> Creating an Actor from a template locally is a great option if you want to develop your code using your local environment and IDE and then push the final solution back to the Apify Console.

When you click on the **Use locally** button, you'll be presented with instructions on how to create an Actor from this template in your local environment.

With the Apify CLI installed, you can run the following commands in your terminal:

```shell
apify create my-actor -t getting_started_node
```

```shell
cd my-actor
apify run
```

![Use the template locally](./images/create-actor-template-locally.png)

## Start with scraping single page {#scraping-single-page}

This template is a great starting point for web scraping, as it extracts data from a single website. It uses [Axios](https://axios-http.com/docs/intro) for downloading the page content and [Cheerio](https://cheerio.js.org/) for parsing the downloaded HTML.

Let's see what's inside the **Start with JavaScript** template. The main logic of the template lives in the `src/main.js` file.

```js
// Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/js/).
import { Actor } from 'apify';
// Axios - Promise based HTTP client for the browser and node.js (Read more at https://axios-http.com/docs/intro).
import axios from 'axios';
// Cheerio - The fast, flexible & elegant library for parsing and manipulating HTML and XML (Read more at https://cheerio.js.org/).
import * as cheerio from 'cheerio';

// The init() call configures the Actor for its environment. It's recommended to start every Actor with an init().
await Actor.init();

// Structure of input is defined in input_schema.json
const input = await Actor.getInput();
const { url } = input;

// Fetch the HTML content of the page.
const response = await axios.get(url);

// Parse the downloaded HTML with Cheerio to enable data extraction.
const $ = cheerio.load(response.data);

// Extract all headings from the page (tag name and text).
const headings = [];
$('h1, h2, h3, h4, h5, h6').each((i, element) => {
    const headingObject = {
        level: $(element).prop('tagName').toLowerCase(),
        text: $(element).text(),
    };
    console.log('Extracted heading', headingObject);
    headings.push(headingObject);
});

// Save headings to Dataset - a table-like storage.
await Actor.pushData(headings);

// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit().
await Actor.exit();
```

The Actor takes the `url` from the input and then:

1. Sends a request to the URL.
2. Downloads the page's HTML content.
3. Extracts headings (H1 - H6) from the page.
4. Stores the extracted data.

The extracted data is stored in the [Dataset](/platform/storage/dataset), where you can preview it and download it. We'll show how to do that later in the [Run the Actor](#run-the-actor) section.

> Feel free to play around with the code and add some more features to it. For example, you can extract all the links from the page, extract all the images, or completely change the logic of this template. Keep in mind that this template uses the [input schema](/academy/deploying-your-code/input-schema) defined in the `.actor/input_schema.json` file and linked from `.actor/actor.json`. If you want to change the input schema, you need to change it in those files as well. Learn more about Actor input and output [on the next page](/academy/getting-started/inputs-outputs).

## Build the Actor 🧱 {#build-an-actor}

In order to run the Actor, you need to [build](/platform/actors/development/builds-and-runs/builds) it first. Click on the **Build** button at the bottom of the page or the **Build now** button right under the code editor.
+ +![Build the Actor](./images/build-actor.png) + +After you've clicked the **Build** button, it'll take around 5–10 seconds to complete the build. You'll know it's finished when you see a green **Start** button. + +![Start button](./images/start.png) + +## Fill the input {#fill-input} + +And now we are ready to run the Actor. But before we do that, let's give the Actor some input by going to the `Input` tab. + +The input tab is where you can provide the Actor with some meaningful input. In this case, we'll be providing the Actor with a URL to scrape. For now, we'll use the prefilled value of [Apify website](https://apify.com/) (`https://apify.com/`). + +You can change the website you want to extract the data from by changing the URL in the input field. + +![Input tab](./images/actor-input-tab.png) + +## Run the Actor {#run-the-actor} + +Once you have provided the Actor with some URL you want to extract the data from, click **Start** button and wait a few seconds. You should see the Actor run logs in the **Last run** tab. + +![Actor run logs](./images/actor-run.png) + +After the Actor finishes, you can preview or download the extracted data by clicking on the **Export X results** button. + +![Export results](./images/actor-run-dataset.png) + +And that's it! You've just created your first Actor and extracted data from a website 🎉. + +## Getting stuck? Check out the tips 💡 {#get-help-with-tips} + +If you ever get stuck, you can always click on the **Tips** button in the top right corner of the page. It will show you a list of tips that are relevant to the Actor development. + +![Tips](./images/actor-tips.png) + +## Next up {#next} + +We've created an Actor, but how can we give it more complex inputs and make it do stuff based on these inputs? This is exactly what we'll be discussing in the [next lesson](./inputs_outputs.md)'s activity. + + + +--- +title: Getting started +description: Get started with the Apify platform by creating an account and learning about the Apify Console, which is where all Apify Actors are born! +sidebar_position: 8 +category: apify platform +slug: /getting-started +--- + +# Getting started {#getting-started} + +**Get started with the Apify platform by creating an account and learning about the Apify Console, which is where all Apify Actors are born!** + +--- + +Your gateway to the Apify platform is your Apify account. The great thing about creating an account is that we support integration with both Google and GitHub, which takes only about 30 seconds! + +1. Create your account on the [sign up](https://console.apify.com/sign-up?asrc=developers_portal) page. +2. Check your email, you should have a verification email with a link. Click it! +3. Done! 👍 + +## Getting to know the platform {#getting-to-know-the-platform} + +Now that you have an account, you have access to the [Apify Console](https://console.apify.com?asrc=developers_portal), which is a wonderful place where you utilize all of the features the platform has to offer, as well as manage and test your own projects. + +## Next up {#next} + +In our next lesson, we'll learn about something super exciting - **Actors**. Actors are the living and breathing core of the Apify platform and are an extremely powerful concept. What are you waiting for? Let's jump [right into the next lesson](./actors.md)! + + + +--- +title: Inputs & outputs +description: Create an Actor from scratch which takes an input, processes that input, and then outputs a result that can be used elsewhere. 
sidebar_position: 3
slug: /getting-started/inputs-outputs
---

# Inputs & outputs {#inputs-outputs}

**Create an Actor from scratch which takes an input, processes that input, and then outputs a result that can be used elsewhere.**

---

Actors, like any other programs, take inputs and generate outputs. The Apify platform has a way to specify what inputs an Actor expects, and a way to temporarily or permanently store its results.

In this lesson, we'll demonstrate inputs and outputs by building an Actor which takes two numbers as input, adds them up, and then outputs the result.

## Accept input into an Actor {#accept-input}

Let's first create another new Actor using the same template as before. Feel free to refer to the [previous lesson](./creating_actors.md) for a refresher on how to do this.

Replace all of the code in **main.js** with this code snippet:

```js
import { Actor } from 'apify';

await Actor.init();

// Grab our numbers from the input
const { num1, num2 } = await Actor.getInput();

// Calculate the solution
const solution = num1 + num2;

// Push the solution to the dataset
await Actor.pushData({ solution });

await Actor.exit();
```

Then, replace everything in **INPUT_SCHEMA.json** with this:

> This step isn't necessary, as the Actor will still be able to take input in JSON format without it; however, we are providing the content for this Actor's input schema in this lesson, as it will give the Apify platform a blueprint off of which it can generate a nice UI for your inputs, as well as validate their values.

```json
{
    "title": "Number adder",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "num1": {
            "title": "1st Number",
            "type": "integer",
            "description": "First number.",
            "editor": "number"
        },
        "num2": {
            "title": "2nd Number",
            "type": "integer",
            "description": "Second number.",
            "editor": "number"
        }
    },
    "required": ["num1", "num2"]
}
```

> If you're interested in learning more about how the code works, and what the **INPUT_SCHEMA.json** means, read about [inputs](/sdk/js/docs/examples/accept-user-input) and [adding data to a dataset](/sdk/js/docs/examples/add-data-to-dataset) in the Apify SDK documentation, and refer to the [input schema docs](/platform/actors/development/actor-definition/input-schema/specification/v1#integer).

Finally, **Save** and **Build** the Actor just as you did in the previous lesson.

## Configuring an Actor with inputs {#configuring}

If you scroll down a bit, you'll find the **Developer console** located under the multifile editor. By default, after running a build, the **Last build** tab will be selected, where you can see all of the logs related to building the Actor. Inputs can be configured within the **Input** tab.

![Configuring inputs](./images/configure-inputs.jpg)

Enter any two numbers you'd like, then press **Start**. The Actor's run should complete almost immediately.

## View Actor results {#view-results}

Since we've pushed the result into the default dataset, both the data and some info about it can be viewed by clicking this box, which will take you to the results tab:

![Result box](./images/result-box.png)

On the results tab, there are a whole lot of options for which format to view/download the data in. Keep the default of **JSON** selected, and click on **Preview**.

![Dataset preview](./images/dataset-preview.png)

There's our solution! Did it work for you as well?
Now, we can download the data right from the results tab to be used elsewhere, or even programmatically retrieve it by using [Apify's API](/api/v2) (we'll be discussing how to do this in the next lesson).

It's important to note that the default dataset of the Actor, which we pushed our solution to, will be retained for 7 days. If we wanted the data to be retained indefinitely, we'd have to use a named dataset. For more information about named vs. unnamed storages, read a bit about [data retention on the Apify platform](/platform/storage/usage#data-retention).

## Next up {#next}

In the [next lesson](./apify_api.md)'s fun activity, you'll learn how to call the Actor we created in this lesson programmatically using one of Apify's most powerful tools - the Apify API.

---
title: Introduction to Apify platform
description: Learn all about the Apify platform, all of the tools it offers, and how it can improve your overall development experience.
sidebar_position: 7
category: apify platform
slug: /apify-platform
---

# Introduction to the Apify platform {#about-the-platform}

**Learn all about the Apify platform, all of the tools it offers, and how it can improve your overall development experience.**

---

The [Apify platform](https://apify.com) was built to serve large-scale and high-performance web scraping and automation needs. It provides easy access to compute instances ([Actors](./getting_started/actors.md)), convenient request and result storages, proxies, scheduling, webhooks and more - all accessible through the **Console** web interface, [Apify's API](/api/v2), or our [JavaScript](/api/client/js) and [Python](/api/client/python) API clients.

## Category outline {#this-category}

In this category, you'll learn how to become an Apify platform developer from the ground up. From creating your first account to developing Actors, this is your one-stop shop for understanding how the platform works, and how to work with it.

## First up {#first}

We'll start off this category light, by showing you how to create an Apify account and get everything ready for development with the platform. [Let's go!](./getting_started/index.md)

---
title: Running a web server on the Apify platform
description: A web server running in an Actor can act as a communication channel with the outside world. Learn how to set one up with Node.js.
sidebar_position: 11
category: apify platform
slug: /running-a-web-server
---

# Running a web server on the Apify platform

**A web server running in an Actor can act as a communication channel with the outside world. Learn how to set one up with Node.js.**

---

Sometimes, an Actor needs a channel for communication with other systems (or humans). This channel might be used to receive commands, to provide info about progress, or both. To implement this, we will run an HTTP web server inside the Actor that will provide:

- An API to receive commands.
- An HTML page displaying output data.

Running a web server in an Actor is a piece of cake! Each Actor run is available at a unique URL (container URL) which always takes the form `https://CONTAINER-KEY.runs.apify.net`. This URL is available in the [**Actor run** object](/api/v2#/reference/actor-runs/run-object-and-its-storages/get-run) returned by the Apify API, as well as in the Apify Console.

If you start a web server on the port defined by the **APIFY_CONTAINER_PORT** environment variable (the default value is **4321**), the container URL becomes available and gets displayed in the **Live View** tab in the Actor run console.

For more details, see [the documentation](/platform/actors/development/programming-interface/container-web-server).

## Building the Actor {#building-the-actor}

Let's try to build the following Actor:

- The Actor will provide an API to receive URLs to be processed.
- For each URL, the Actor will create a screenshot.
- The screenshot will be stored in the key-value store.
- The Actor will provide a web page displaying thumbnails linked to screenshots and an HTML form to submit new URLs.

To achieve this, we will use the following technologies:

- The [Express.js](https://expressjs.com) framework to create the server.
- [Puppeteer](https://pptr.dev) to grab screenshots.
- The [Apify SDK](/sdk/js) to access Apify storages to store the screenshots.

Our server needs two paths:

- `/` - The index path will display a page with a form to submit a new URL and the thumbnails of processed URLs.
- `/add-url` - Will provide an API to add new URLs using an HTTP POST request.

First, we'll import `express` and create an Express.js app. Then, we'll add some middleware that will allow us to receive form submissions.

```js
import { Actor } from 'apify';
import express from 'express';

await Actor.init();

const app = express();

app.use(express.json());
app.use(express.urlencoded({ extended: true }));
```

Now we need to read the following environment variables:

- **APIFY_CONTAINER_PORT** contains the port number where we must start the server.
- **APIFY_CONTAINER_URL** contains the URL under which we can access the container.
- **APIFY_DEFAULT_KEY_VALUE_STORE_ID** is the ID of the default key-value store of this Actor where we can store screenshots.

```js
const {
    APIFY_CONTAINER_PORT,
    APIFY_CONTAINER_URL,
    APIFY_DEFAULT_KEY_VALUE_STORE_ID,
} = process.env;
```

Next, we'll create an array of the processed URLs where the **n**th URL has its screenshot stored under the key **n**.jpg in the key-value store.

```js
const processedUrls = [];
```

After that, the index route is ready to be defined.

```js
app.get('/', (req, res) => {
    let listItems = '';

    // For each of the processed URLs
    processedUrls.forEach((url, index) => {
        const imageUrl = `https://api.apify.com/v2/key-value-stores/${APIFY_DEFAULT_KEY_VALUE_STORE_ID}/records/${index}.jpg`;

        // Display the screenshots below the form
        listItems += `
<li>
            <a href="${imageUrl}" target="_blank">
                <img src="${imageUrl}" width="300px" />
                <br />
                ${url}
            </a>
        </li>`;
    });

    const pageHtml = `<html>
<head><title>Example</title></head>
<body>
    <form method="POST" action="${APIFY_CONTAINER_URL}/add-url">
        URL: <input type="text" name="url" placeholder="https://example.com" />
        <input type="submit" value="Add" />
    </form>
    <hr />
    <ul>${listItems}</ul>
</body>
</html>`;

    res.send(pageHtml);
});
```

And then a second path that receives the new URL submitted using the HTML form; after the URL is processed, it redirects the user back to the root path.

```js
import { launchPuppeteer } from 'crawlee';

app.post('/add-url', async (req, res) => {
    const { url } = req.body;
    console.log(`Got new URL: ${url}`);

    // Start a Chrome browser and open a new page ...
    const browser = await launchPuppeteer();
    const page = await browser.newPage();

    // ... go to our URL and grab a screenshot ...
    await page.goto(url);
    const screenshot = await page.screenshot({ type: 'jpeg' });

    // ... close the browser ...
    await page.close();
    await browser.close();

    // ... save the screenshot to the key-value store and add the URL to processedUrls.
    await Actor.setValue(`${processedUrls.length}.jpg`, screenshot, { contentType: 'image/jpeg' });
    processedUrls.push(url);

    res.redirect('/');
});
```

And finally, we need to start the web server.

```js
// Start the web server!
app.listen(APIFY_CONTAINER_PORT, () => {
    console.log(`Application is listening at URL ${APIFY_CONTAINER_URL}.`);
});
```

### Final code {#final-code}

```js
import { Actor } from 'apify';
import { launchPuppeteer } from 'crawlee';
import express from 'express';

await Actor.init();

const app = express();

app.use(express.json());
app.use(express.urlencoded({ extended: true }));

const {
    APIFY_CONTAINER_PORT,
    APIFY_CONTAINER_URL,
    APIFY_DEFAULT_KEY_VALUE_STORE_ID,
} = process.env;

const processedUrls = [];

app.get('/', (req, res) => {
    let listItems = '';

    // For each of the processed URLs
    processedUrls.forEach((url, index) => {
        const imageUrl = `https://api.apify.com/v2/key-value-stores/${APIFY_DEFAULT_KEY_VALUE_STORE_ID}/records/${index}.jpg`;

        // Display the screenshots below the form
        listItems += `
<li>
            <a href="${imageUrl}" target="_blank">
                <img src="${imageUrl}" width="300px" />
                <br />
                ${url}
            </a>
        </li>`;
    });

    const pageHtml = `<html>
<head><title>Example</title></head>
<body>
    <form method="POST" action="${APIFY_CONTAINER_URL}/add-url">
        URL: <input type="text" name="url" placeholder="https://example.com" />
        <input type="submit" value="Add" />
    </form>
    <hr />
    <ul>${listItems}</ul>
</body>
</html>`;

    res.send(pageHtml);
});

app.post('/add-url', async (req, res) => {
    const { url } = req.body;
    console.log(`Got new URL: ${url}`);

    // Start a Chrome browser and open a new page ...
    const browser = await launchPuppeteer();
    const page = await browser.newPage();

    // ... go to our URL and grab a screenshot ...
    await page.goto(url);
    const screenshot = await page.screenshot({ type: 'jpeg' });

    // ... close the browser ...
    await page.close();
    await browser.close();

    // ... save the screenshot to the key-value store and add the URL to processedUrls.
    await Actor.setValue(`${processedUrls.length}.jpg`, screenshot, { contentType: 'image/jpeg' });
    processedUrls.push(url);

    res.redirect('/');
});

app.listen(APIFY_CONTAINER_PORT, () => {
    console.log(`Application is listening at URL ${APIFY_CONTAINER_URL}.`);
});
```

When we deploy and run this Actor on the Apify platform, we can open the **Live View** tab in the Actor run console and submit a URL to the Actor through the form. After the URL is successfully submitted, it appears in the Actor log.

With that, we're done! And our application works like a charm :)

The complete code of this Actor is available [here](https://apify.com/apify/example-web-server). You can run it there or copy it to your account.
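
By the way, you don't need the HTML form to exercise the `/add-url` API. Since the server accepts form-encoded bodies, you can submit a URL straight from your terminal. A sketch, assuming your run's container URL:

```shell
curl -d "url=https://example.com" -X POST "https://CONTAINER-KEY.runs.apify.net/add-url"
```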

---
title: API tutorials
description: A collection of various tutorials explaining how to interact with the Apify platform programmatically using its API.
sidebar_position: 20
category: tutorials
slug: /api
---

# API Tutorials 💻📚

**A collection of various tutorials explaining how to interact with the Apify platform programmatically using its API.**

---

This section explains how you can run [Apify Actors](/platform/actors) using Apify's [API](/api/v2), retrieve their results, and integrate them into your own product and workflows. You can do this using a raw HTTP client, or you can benefit from using one of our API clients for:

- [JavaScript](/api/client/js/)
- [Python](/api/client/python)

---
title: How to retry failed requests
description: Learn how to resurrect your run by retrying only failed requests
sidebar_position: 6
slug: /api/retry-failed-requests
---

**Learn how to re-scrape only failed requests in your run.**

---

Requests of a scraper can fail for many reasons. The most common causes are different page layouts or proxy blocking issues ([check here for how to effectively analyze errors](https://docs.apify.com/academy/node-js/analyzing-pages-and-fixing-errors)). Both [Apify](https://apify.com) and [Crawlee](https://crawlee.dev/) allow you to restart your scraper run from the point where it ended, but there is no native functionality to re-scrape only the failed requests. Usually, you also want to first analyze the problem, update the code, and build it before trying again.

If you attempt to restart an already finished run, it will likely finish immediately, because all the requests in the [request queue](https://crawlee.dev/docs/guides/request-storage) are marked as handled. You need to update the failed requests in the queue to be marked as pending again.

An additional complication is that the [Request](https://crawlee.dev/api/core/class/Request) object doesn't have anything like an `isFailed` property. We have to approximate it using other fields. Fortunately, we can use the `errorMessages` and `retryCount` properties to identify failed requests. Unless the user has explicitly overridden these properties, we can identify failed requests as those with more `errorMessages` than their `retryCount`. That happens because the last error, which doesn't cause a retry anymore, is still added to `errorMessages`.

A simplified code example can look like this:

```ts
// The code for a Crawlee-only project is similar, but it uses a different API.
import { Actor } from 'apify';

const REQUEST_QUEUE_ID = 'pFCvCasdvsyvyZdfD'; // Replace with your valid request queue ID

const allRequests = [];
let exclusiveStartId = null;

// List all requests from the queue. We have to do it in a loop because the request queue listing is paginated.
for (;;) {
    const { items: requests } = await Actor.apifyClient
        .requestQueue(REQUEST_QUEUE_ID)
        .listRequests({ exclusiveStartId, limit: 1000 });
    allRequests.push(...requests);

    // If we didn't get the full 1,000 requests, we have all of them and can finish the loop
    if (requests.length < 1000) {
        break;
    }

    // Otherwise, we need to set the exclusiveStartId to the last request ID to get the next batch
    exclusiveStartId = requests[requests.length - 1].id;
}

console.log(`Loaded ${allRequests.length} requests from the queue`);

// Now we filter out the failed requests
const failedRequests = allRequests.filter((request) => (request.errorMessages?.length || 0) > (request.retryCount || 0));

// We need to update them one by one to restore their pristine state
for (const request of failedRequests) {
    request.retryCount = 0;
    request.errorMessages = [];
    // This tells the request queue to handle it again
    request.handledAt = null;
    await Actor.apifyClient.requestQueue(REQUEST_QUEUE_ID).updateRequest(request);
}

// And now we can resurrect our scraper again; it will only process the failed requests.
```

## Resurrect automatically with a free public Actor {#resurrect-automatically-with-a-free-public-actor}

Fortunately, you don't need to implement this code into your workflow. [Apify Store](https://apify.com/store) provides the [Rebirth Failed Requests](https://apify.com/lukaskrivka/rebirth-failed-requests) Actor (which is [open-source](https://github.com/metalwarrior665/rebirth-failed-requests)) that does this and more. The Actor can automatically scan multiple runs of your Actors based on filters like `date started`. It can also automatically resurrect the runs after renewing the failed requests. That means you can finish your scrape in a final successful state with a single click of the Run button.

---
title: Run Actor and retrieve data via API
description: Learn how to run an Actor/task via the Apify API, wait for the job to finish, and retrieve its output data. Your key to integrating Actors with your projects.
sidebar_position: 6
slug: /api/run-actor-and-retrieve-data-via-api
---

**Learn how to run an Actor/task via the Apify API, wait for the job to finish, and retrieve its output data. Your key to integrating Actors with your projects.**

---

import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

The most popular way of [integrating](https://help.apify.com/en/collections/1669769-integrations) the Apify platform with an external project or application is by programmatically running an [Actor](/platform/actors) or [task](/platform/actors/running/tasks), waiting for it to complete its run, then collecting its data and using it within the project. Follow this tutorial to get an idea of how to approach this; it isn't as complicated as it sounds!

> Remember to check out our [API documentation](/api/v2) with examples in different languages and a live API console. We also recommend testing the API with a desktop client like [Postman](https://www.postman.com/) or [Insomnia](https://insomnia.rest).

Apify API offers two ways of interacting with it:

- [Synchronously](#synchronous-flow)
- [Asynchronously](#asynchronous-flow)

If the Actor being run via API takes 5 minutes or less to complete a typical run, it should be called **synchronously**. Otherwise (if a typical run takes longer than 5 minutes), it should be called **asynchronously**.

## Run an Actor or task {#run-an-actor-or-task}

> If you are unsure about the differences between an Actor and a task, you can read about them in the [tasks](/platform/actors/running/tasks) documentation. In brief, tasks are pre-configured inputs for Actors.

The API endpoints and usage (for both sync and async) for [Actors](/api/v2#tag/ActorsRun-collection/operation/act_runs_post) and [tasks](/api/v2#/reference/actor-tasks/run-collection/run-task) are essentially the same.

To run, or **call**, an Actor/task, you will need a few things:

- The name or ID of the Actor/task. The name looks like `username~actorName` or `username~taskName`. The ID can be retrieved on the **Settings** page of the Actor/task.

- Your [API token](/platform/integrations), which you can find on the **Integrations** page in [Apify Console](https://console.apify.com/account?tab=integrations) (do not share it with anyone!).

- Possibly an input, which is passed in JSON format as the request's **body**.

- Some other optional settings, if you'd like to change the default values (such as allocated memory or the build).

The URL of the [POST request](https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/POST) to run an Actor looks like this:

```cURL
https://api.apify.com/v2/acts/ACTOR_NAME_OR_ID/runs?token=YOUR_TOKEN
```

For tasks, we can switch the path from **acts** to **actor-tasks** and keep the rest the same:

```cURL
https://api.apify.com/v2/actor-tasks/TASK_NAME_OR_ID/runs?token=YOUR_TOKEN
```

If we send a correct POST request to one of these endpoints, the Actor or task will start just as if we had pressed the **Start** button on the Actor's page in the [Apify Console](https://console.apify.com).

### Additional settings {#additional-settings}

We can also add settings for the Actor (which will override the default settings) as additional query parameters. For example, if we wanted to change how much memory the Actor's run should be allocated and which build to run, we could add the `memory` and `build` parameters separated by `&`.

```cURL
https://api.apify.com/v2/acts/ACTOR_NAME_OR_ID/runs?token=YOUR_TOKEN&memory=8192&build=beta
```

This works in almost exactly the same way for both Actors and tasks; however, for tasks, there is no reason to specify a [`build`](/platform/actors/development/builds-and-runs/builds) parameter, as a task is already tied to one specific Actor build, which cannot be changed with query parameters.

### Input JSON {#input-json}

Most Actors would not be much use if input could not be passed into them to change their behavior. Additionally, even though tasks already have specified input configurations, it is handy to be able to overwrite task inputs through the **body** of the POST request.

> The input can technically be any [JSON object](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/JSON), and will vary depending on the Actor being run. Ensure that you are familiar with the Actor's input schema while writing the body of the request.
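
For illustration, a minimal request body might override only a couple of fields and leave everything else at its defaults. The field names below are hypothetical; the real ones come from the target Actor's input schema:

```json
{
    "startUrls": [{ "url": "https://example.com" }],
    "maxPagesPerCrawl": 10
}
```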
+ +Good Actors have reasonable defaults for most input fields, so if you want to run one of the major Actors from [Apify Store](https://apify.com/store), you usually do not need to provide all possible fields. + +Via API, let's quickly try to run [Web Scraper](https://apify.com/apify/web-scraper), which is the most popular Actor on the Apify Store at the moment. The full input with all possible fields is [pretty long and ugly](https://apify.com/apify/web-scraper?section=example-run), so we will not show it here. Because it has default values for most fields, we can provide a JSON input containing only the fields we'd like to customize. We will send a POST request to the endpoint below and add the JSON as the **body** of the request: + +```cURL +https://api.apify.com/v2/acts/apify~web-scraper/runs?token=YOUR_TOKEN +``` + +Here is how it looks in [Postman](https://www.postman.com/): + +![Run an Actor via API in Postman](./images/run-actor-postman.png) + +If we press **Send**, it will immediately return some info about the run. The `status` will be either `READY` (which means that it is waiting to be allocated on a server) or `RUNNING` (99% of cases). + +![Actor run info in Postman](./images/run-info-postman.png) + +We will later use this **run info** JSON to retrieve the run's output data. This info about the run can also be retrieved with another call to the [**Get run**](https://apify.com/docs/api/v2#/reference/actors/run-object/get-run) endpoint. + +## JavaScript and Python client {#javascript-and-python-client} + +If you are using JavaScript or Python, we highly recommend using the Apify API client ([JavaScript](https://docs.apify.com/api/client/js/), [Python](https://docs.apify.com/api/client/python/)) instead of the raw HTTP API. The client implements smart polling and exponential backoff, which makes calling Actors and getting results efficient. + +You can skip most of this tutorial by following this code example that calls Google Search Results Scraper and logs its results: + + + + +```js +import { ApifyClient } from 'apify-client'; + +const client = new ApifyClient({ token: 'YOUR_API_TOKEN' }); + +const input = { queries: 'Food in NYC' }; + +// Run the Actor and wait for it to finish +// .call method waits infinitely long using smart polling +// Get back the run API object +const run = await client.actor('apify/google-search-scraper').call(input); + +// Fetch and print Actor results from the run's dataset (if any) +const { items } = await client.dataset(run.defaultDatasetId).listItems(); +items.forEach((item) => { + console.dir(item); +}); +``` + + + + +```py +from apify_client import ApifyClient +client = ApifyClient(token='YOUR_API_TOKEN') + +run_input = { + "queries": "Food in NYC", +} + +# Run the Actor and wait for it to finish +# .call method waits infinitely long using smart polling +# Get back the run API object +run = client.actor("apify/google-search-scraper").call(run_input=run_input) + +# Fetch and print Actor results from the run's dataset (if there are any) +for item in client.dataset(run["defaultDatasetId"]).iterate_items(): + print(item) +``` + + + + +By using our client, you don't need to worry about choosing between synchronous or asynchronous flow. But if you don't want your code to wait during `.call` (potentially for hours), continue reading below about how to implement webhooks. 
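
If you don't want `.call()` to block indefinitely, both clients also let you cap how long they wait. Here is a sketch using the JavaScript client's `waitSecs` option (verify the exact option name against the client reference for your client version):

```js
// Wait at most 120 seconds; if the run is still in progress after that,
// the returned run object will still have the status 'RUNNING'.
const run = await client.actor('apify/google-search-scraper').call(input, { waitSecs: 120 });
```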

## Synchronous flow {#synchronous-flow}

If each of your runs lasts less than 5 minutes, you can use a single [synchronous endpoint](https://usergrid.apache.org/docs/introduction/async-vs-sync.html#synchronous). When running **synchronously**, the connection will be held for _up to_ 5 minutes.

If your synchronous run exceeds the 5-minute time limit, the response will be a run object containing information about the run and the status of `RUNNING`. If that happens, you need to restart the run [asynchronously](#asynchronous-flow) and [wait for the run to finish](#wait-for-the-run-to-finish).

### Synchronous runs with dataset output {#synchronous-runs-with-dataset-output}

Most Actor runs will store their data in the default [dataset](/platform/storage/dataset). The Apify API provides **run-sync-get-dataset-items** endpoints for [Actors](/api/v2#/reference/actors/run-actor-synchronously-and-get-dataset-items/run-actor-synchronously-with-input-and-get-dataset-items) and [tasks](/api/v2#/reference/actor-tasks/run-task-synchronously-and-get-dataset-items/run-task-synchronously-and-get-dataset-items-(post)), which allow you to run an Actor and receive the items from the default dataset once the run has finished.

Here is a Node.js example of calling a task via the API and logging the dataset items to the console:

```js
// Use your favorite HTTP client
import got from 'got';

// Specify your API token
// (find it at https://console.apify.com/account#/integrations)
const myToken = '';

// Start apify/google-search-scraper Actor
// and pass some queries into the JSON body
const response = await got({
    url: `https://api.apify.com/v2/acts/apify~google-search-scraper/run-sync-get-dataset-items?token=${myToken}`,
    method: 'POST',
    json: {
        queries: 'web scraping\nweb crawling',
    },
    responseType: 'json',
});

const items = response.body;

// Log each non-promoted search result for both queries
items.forEach((item) => {
    const { nonPromotedSearchResults } = item;
    nonPromotedSearchResults.forEach((result) => {
        const { title, url, description } = result;
        console.log(`${title}: ${url} --- ${description}`);
    });
});
```

### Synchronous runs with key-value store output {#synchronous-runs-with-key-value-store-output}

[Key-value stores](/platform/storage/key-value-store) are useful for storing files like images, HTML snapshots, or JSON data. The Apify API provides **run-sync** endpoints for [Actors](/api/v2#/reference/actors/run-actor-synchronously/with-input) and [tasks](/api/v2#/reference/actor-tasks/run-task-synchronously/run-task-synchronously), which allow you to run a specific task and receive the output. By default, they return the `OUTPUT` record from the default key-value store.

> For more detailed information, check the [API reference](/api/v2#/reference/actors/run-actor-synchronously-and-get-dataset-items/run-actor-synchronously-with-input-and-get-dataset-items).

## Asynchronous flow {#asynchronous-flow}

For runs longer than 5 minutes, the process consists of three steps:

- [Run the Actor or task](#run-an-actor-or-task)
- [Wait for the run to finish](#wait-for-the-run-to-finish)
- [Collect the data](#collect-the-data)

### Wait for the run to finish {#wait-for-the-run-to-finish}

There may be cases where we need to run the Actor and go away. But in any kind of integration, we are usually interested in its output. We have three basic options for how to wait for the Actor or task to finish.

- [`waitForFinish` parameter](#waitforfinish-parameter)
- [Webhooks](#webhooks)
- [Polling](#polling)

#### `waitForFinish` parameter {#waitforfinish-parameter}

This solution is quite similar to the synchronous flow. To make the POST request wait, add the `waitForFinish` parameter. It can have a value from `0` to `60`, which is the maximum time in seconds to wait (the max value for `waitForFinish` is 1 minute). Knowing this, we can extend the example URL like this:

```cURL
https://api.apify.com/v2/acts/apify~web-scraper/runs?token=YOUR_TOKEN&waitForFinish=60
```

You can also use the `waitForFinish` parameter with the [**GET Run** endpoint](/api/v2#/reference/actors/run-object/get-run) to implement a smarter [polling](#polling) system.

Once again, the final response will be the **run info object**; however, now its status should be `SUCCEEDED` or `FAILED`. If the run exceeds the `waitForFinish` duration, the status will still be `RUNNING`.

#### Webhooks {#webhooks}

If you have a server, [webhooks](/platform/integrations/webhooks) are the most elegant and flexible solution for integrations with Apify. You can set up a webhook for any Actor or task, and that webhook will send a POST request to your server after an [event](/platform/integrations/webhooks/events) has occurred.

Usually, this event is a successfully finished run, but you can also set a different webhook for failed runs, etc.

![Webhook example](./images/webhook.png)

The webhook will send you a pretty complicated [JSON object](/platform/integrations/webhooks/actions), but usually, you would only be interested in the `resource` object within the response, which is like the **run info** JSON from the previous sections. We can leave the payload template as is for our example, since it is all we need.

Once your server receives this request from the webhook, you know that the event happened, and you can ask for the complete data.

> Don't forget to respond to the webhook with a **200** status code! Otherwise, it will ping you again.

#### Polling {#polling}

What if you don't have a server, and the run you'd like to do is much too long to use a synchronous call? In cases like these, periodic **polling** of the run's status is the solution.

When we run the Actor with the [usual API call](#run-an-actor-or-task) shown above, we will get back a response with the **run info** object. From this JSON object, we can then extract the ID of the Actor run that we just started from the `id` field. Then, we can set an interval that will poll the Apify API (let's say every 5 seconds) by calling the [**Get run**](https://apify.com/docs/api/v2#/reference/actors/run-object/get-run) endpoint to retrieve the run's status.

Replace the `RUN_ID` in the following URL with the ID you extracted earlier:

```cURL
https://api.apify.com/v2/acts/ACTOR_NAME_OR_ID/runs/RUN_ID
```

Once a status of `SUCCEEDED` or `FAILED` has been received, we know the run has finished and can cancel the interval and finally [collect the data](#collect-the-data).

### Collecting the data {#collect-the-data}

Unless you used the [synchronous call](#synchronous-flow) mentioned above, you will have to make one additional request to the API to retrieve the data.

The **run info** JSON also contains the IDs of the default [dataset](/platform/storage/dataset) and [key-value store](/platform/storage/key-value-store) that are allocated separately for each run, which is usually everything you need.
The fields are called `defaultDatasetId` and `defaultKeyValueStoreId`. + +#### Retrieving a dataset {#retrieve-a-dataset} + +> If you are scraping products, or any list of items with similar fields, the [dataset](/platform/storage/dataset) should be your storage of choice. Don't forget though, that dataset items are immutable. This means that you can only add to the dataset, and not change the content that is already inside it. + +To retrieve the data from a dataset, send a GET request to the [**Get items**](/api/v2#/reference/datasets/item-collection/get-items) endpoint and pass the `defaultDatasetId` into the URL. For a GET request to the default dataset, no token is needed. + +```cURL +https://api.apify.com/v2/datasets/DATASET_ID/items +``` + +By default, it will return the data in JSON format with some metadata. The actual data are in the `items` array. + +You can use plenty of additional parameters, to learn more about them, visit our API reference [documentation](/api/v2#/reference/datasets/item-collection/get-items). We will only mention that you can pass a `format` parameter that transforms the response into popular formats like CSV, XML, Excel, RSS, etc. + +The items are paginated, which means you can ask only for a subset of the data. Specify this using the `limit` and `offset` parameters. This endpoint has a limit of 250,000 items that it can return per request. To retrieve more, you will need to send more requests incrementing the `offset` parameter. + +```cURL +https://api.apify.com/v2/datasets/DATASET_ID/items?format=csv&offset=250000 +``` + +#### Retrieving a key-value store {#retrieve-a-key-value-store} + +> [Key-value stores](/platform/storage/key-value-store) are mainly useful if you have a single output or any kind of files that cannot be [stringified](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/JSON/stringify) (such as images or PDFs). + +When you want to retrieve something from a key-value store, the `defaultKeyValueStoreId` is _not_ enough. You also need to know the name (or **key**) of the record you want to retrieve. + +If you have a single output JSON, the convention is to return it as a record named `OUTPUT` to the default key-value store. To retrieve the record's content, call the [**Get record**](/api/v2#/reference/key-value-stores/record/get-record) endpoint. + +```cURL +https://api.apify.com/v2/key-value-stores/STORE_ID/records/RECORD_KEY +``` + +If you don't know the keys (names) of the records in advance, you can retrieve just the keys with the [**List keys**](https://apify.com/docs/api/v2#/reference/key-value-stores/key-collection/get-list-of-keys) endpoint. + +Keep in mind that you can get a maximum of 1000 keys per request, so you will need to paginate over the keys using the `exclusiveStartKey` parameter if you have more than 1000 keys. To do this, after each call, take the last record key and provide it as the `exclusiveStartKey` parameter. You can do this until you get 0 keys back. + +```cURL +https://api.apify.com/v2/key-value-stores/STORE_ID/keys?exclusiveStartKey=myLastRecordKey +``` + + + +--- +title: Scraping with Cheerio Scraper +menuTitle: Cheerio Scraper +description: Learn how to scrape a website using Apify's Cheerio Scraper. Build an Actor's page function, extract information from a web page and download your data. 
+externalSourceUrl: https://raw.githubusercontent.com/apify/actor-scraper/master/docs/build/cheerio-scraper-tutorial.md +sidebar_position: 3 +slug: /apify-scrapers/cheerio-scraper +--- + +[//]: # (TODO: Should be updated) + +# + +This scraping tutorial will go into the nitty gritty details of extracting data from **https://apify.com/store** +using **Cheerio Scraper** ([apify/cheerio-scraper](https://apify.com/apify/cheerio-scraper)). If you arrived here from the [Getting started with Apify scrapers](/academy/apify-scrapers/getting-started), +tutorial, great! You are ready to continue where we left off. If you haven't seen the Getting started yet, +check it out, it will help you learn about Apify and scraping in general and set you up for this tutorial, +because this one builds on topics and code examples discussed there. + +## Getting to know our tools + +In the [Getting started with Apify scrapers](/academy/apify-scrapers/getting-started) tutorial, we've confirmed that the scraper works as expected, +so now it's time to add more data to the results. + +To do that, we'll be using the [Cheerio](https://github.com/cheeriojs/cheerio) library. This may not sound familiar, +so let's try again. Does [jQuery](https://jquery.com/) ring a bell? If it does you're in luck, +because Cheerio is like jQuery that doesn't need an actual browser to run. Everything else is the same. +All the functions you already know are there and even the familiar `$` is used. If you still have no idea what either +of those are, don't worry. We'll walk you through using them step by step. + +> [Check out the Cheerio docs](https://github.com/cheeriojs/cheerio) to learn more about it. + +Now that's out of the way, let's open one of the Actor detail pages in the Store, for example the +**Web Scraper** ([apify/web-scraper](https://apify.com/apify/web-scraper)) page, and use our DevTools-Fu to scrape some data. + +> If you're wondering why we're using Web Scraper as an example instead of Cheerio Scraper, +it's only because we didn't want to triple the number of screenshots we needed to make. Lazy developers! + +## Building our Page function + +Before we start, let's do a quick recap of the data we chose to scrape: + + 1. **URL** - The URL that goes directly to the Actor's detail page. + 2. **Unique identifier** - Such as **apify/web-scraper**. + 3. **Title** - The title visible in the Actor's detail page. + 4. **Description** - The Actor's description. + 5. **Last modification date** - When the Actor was last modified. + 6. **Number of runs** - How many times the Actor was run. + +![$1](https://raw.githubusercontent.com/apify/actor-scraper/master/docs/img/scraping-practice.webp) + +We've already scraped numbers 1 and 2 in the [Getting started with Apify scrapers](/academy/apify-scrapers/getting-started) +tutorial, so let's get to the next one on the list: title. + +### Title + +![$1](https://raw.githubusercontent.com/apify/actor-scraper/master/docs/img/title.webp) + +By using the element selector tool, we find out that the title is there under an `
<h1>` tag, as titles should be.
Maybe surprisingly, we find that there are actually two `<h1>` tags on the detail page. This should get us thinking.
Is there any parent element that includes our `<h1>` tag, but not the other ones? Yes, there is! A `<header>`
element that we can use to select only the heading we're interested in.

> Remember that you can press CTRL+F (CMD+F) in the Elements tab of DevTools to open the search bar where you can quickly search for elements using
> their selectors. And always make sure to use the DevTools to verify your scraping process and assumptions. It's faster than changing the crawler
> code all the time.

To get the title we need to find it using a `header h1` selector, which selects all `<h1>` elements that have a `<header>` ancestor.
And as we already know, there's only one.

```js
// Using Cheerio.
async function pageFunction(context) {
    const { $ } = context;
    // ... rest of your code can come here
    return {
        title: $('header h1').text(),
    };
}
```

### Description

Getting the Actor's description is a little more involved, but still pretty straightforward. We cannot search for a `
<span>` tag, because there's a lot of them in the page. We need to narrow our search down a little. Using the DevTools we find that the Actor description is nested within
the `<header>` element too, same as the title. Moreover, the actual description is nested inside a `<span>` tag with a class `actor-description`.

![Actor description](https://raw.githubusercontent.com/apify/actor-scraper/master/docs/img/description.webp)

```js
async function pageFunction(context) {
    const { $ } = context;
    // ... rest of your code can come here
    return {
        title: $('header h1').text(),
        description: $('header span.actor-description').text(),
    };
}
```

### Modified date

The DevTools tell us that the `modifiedDate` can be found in a `