In [2]:
# Stage 1: Basics of Web Scraping
import requests
from bs4 import BeautifulSoup

url = 'http://quotes.toscrape.com/'
response = requests.get(url)

# Check status
print(response.status_code)

# Parse the HTML
soup = BeautifulSoup(response.text, 'lxml')

# Get all quotes
quotes = soup.find_all('span', class_='text')

for i, quote in enumerate(quotes, 1):
    print(f"{i}. {quote.text}")



200
1. “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
2. “It is our choices, Harry, that show what we truly are, far more than our abilities.”
3. “There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
4. “The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
5. “Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”
6. “Try not to become a man of success. Rather become a man of value.”
7. “It is better to be hated for what you are than to be loved for what you are not.”
8. “I have not failed. I've just found 10,000 ways that won't work.”
9. “A woman is like a tea bag; you never know how strong it is until it's in hot water.”
10. “A day without sunshine is like, you know, night.”


In [3]:
for page in range(1, 4):
    URL = f"http://quotes.toscrape.com/page/{page}/"
    response = requests.get(URL)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract quotes as before

quotes = soup.find_all('span', class_='text')
authors = soup.find_all('small', class_='author')

for quote, author in zip(quotes, authors):
    print(f"{quote.text} - {author.text}")

“I love you without knowing how, or when, or from where. I love you simply, without problems or pride: I love you in this way because I do not know any other way of loving but this, in which there is no I or you, so intimate that your hand upon my chest is my hand, so intimate that when I fall asleep your eyes close.” - Pablo Neruda
“For every minute you are angry you lose sixty seconds of happiness.” - Ralph Waldo Emerson
“If you judge people, you have no time to love them.” - Mother Teresa
“Anyone who thinks sitting in church can make you a Christian must also think that sitting in a garage can make you a car.” - Garrison Keillor
“Beauty is in the eye of the beholder and it may be necessary from time to time to give a stupid or misinformed beholder a black eye.” - Jim Henson
“Today you are You, that is truer than true. There is no one alive who is Youer than You.” - Dr. Seuss
“If you want your children to be intelligent, read them fairy tales. If you want them to be more intelligent,

In [4]:
import pandas as pd

authors = soup.find_all('small', class_='author')

data = {
    'Quote': [q.text for q in quotes],
    'Author': [a.text for a in authors]
}

df = pd.DataFrame(data)
df.to_csv('quotes.csv', index=False)
df.head()



Unnamed: 0,Quote,Author
0,"“I love you without knowing how, or when, or f...",Pablo Neruda
1,“For every minute you are angry you lose sixty...,Ralph Waldo Emerson
2,"“If you judge people, you have no time to love...",Mother Teresa
3,“Anyone who thinks sitting in church can make ...,Garrison Keillor
4,“Beauty is in the eye of the beholder and it m...,Jim Henson


In [5]:
#!pip install selenium webdriver-manager beautifulsoup4


In [6]:
# Step 1: Install dependencies (run this first)
# !apt-get update > /dev/null
# !apt install chromium-chromedriver > /dev/null
# !pip install selenium beautifulsoup4 > /dev/null



In [7]:
# Step 2: Setup Selenium with headless Chrome (new version compatible)
# from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.chrome.options import Options
# from bs4 import BeautifulSoup
# import time

# # Chrome options
# chrome_options = Options()
# chrome_options.add_argument("--headless")
# chrome_options.add_argument("--no-sandbox")
# chrome_options.add_argument("--disable-dev-shm-usage")

# # ChromeDriver path for Colab
# chrome_path = "/usr/bin/chromedriver"
# service = Service(executable_path=chrome_path)

# # Start the browser
# driver = webdriver.Chrome(service=service, options=chrome_options)

# # Open the target page
# driver.get("http://quotes.toscrape.com/js")
# time.sleep(3)  # Wait for JS to load

# # Get page source and parse it
# html = driver.page_source
# soup = BeautifulSoup(html, "html.parser")

# # Extract quotes
# quotes = soup.find_all('span', class_='text')
# for quote in quotes:
#     print(quote.text)

# driver.quit()



In [8]:
# ✅ Allowed:

# Public sites with no login

# Personal projects or educational scraping

# APIs offered by the site

# 🚫 Avoid:

# Ignoring robots.txt

# Scraping sensitive/private data

# Spamming sites with too many requests

# https://example.com/robots.txt



In [9]:
# !pip install requests beautifulsoup4 lxml


In [10]:
# ✅ Part 2: Navigating Pages (Pagination)
import requests
from bs4 import BeautifulSoup

base_url = 'http://quotes.toscrape.com'
page_url = '/page/1/'
all_quotes = []

while page_url:
    print(f"Scraping: {base_url + page_url}")
    res = requests.get(base_url + page_url)
    soup = BeautifulSoup(res.text, 'lxml')

    quotes = soup.find_all('span', class_='text')
    authors = soup.find_all('small', class_='author')

    for quote, author in zip(quotes, authors):
        all_quotes.append({'Quote': quote.text, 'Author': author.text})

    next_btn = soup.find('li', class_='next')
    page_url = next_btn.a['href'] if next_btn else None

# Save to CSV
import pandas as pd
df = pd.DataFrame(all_quotes)
df.to_csv('all_quotes.csv', index=False)
df.head()


Scraping: http://quotes.toscrape.com/page/1/
Scraping: http://quotes.toscrape.com/page/2/
Scraping: http://quotes.toscrape.com/page/3/
Scraping: http://quotes.toscrape.com/page/4/
Scraping: http://quotes.toscrape.com/page/5/
Scraping: http://quotes.toscrape.com/page/6/
Scraping: http://quotes.toscrape.com/page/7/
Scraping: http://quotes.toscrape.com/page/8/
Scraping: http://quotes.toscrape.com/page/9/
Scraping: http://quotes.toscrape.com/page/10/


Unnamed: 0,Quote,Author
0,“The world as we have created it is a process ...,Albert Einstein
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling
2,“There are only two ways to live your life. On...,Albert Einstein
3,"“The person, be it gentleman or lady, who has ...",Jane Austen
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe


In [11]:
# 🧩 Part 3: Extracting Other HTML Elements
all_quotes = []

while page_url:
    res = requests.get(base_url + page_url)
    soup = BeautifulSoup(res.text, 'lxml')
    quote_blocks = soup.find_all('div', class_='quote')

    for block in quote_blocks:
        quote = block.find('span', class_='text').text
        author = block.find('small', class_='author').text
        tags = [tag.text for tag in block.find_all('a', class_='tag')]

        all_quotes.append({
            'Quote': quote,
            'Author': author,
            'Tags': ', '.join(tags)
        })

    next_btn = soup.find('li', class_='next')
    page_url = next_btn.a['href'] if next_btn else None

df = pd.DataFrame(all_quotes)
df.to_csv('all2_quotes.csv', index=False)
df.head()

In [12]:
# ✅ Part 4: Adding Headers (User-Agent Spoofing)

# Websites often block bots that don’t send proper headers (like browsers do). A key header is the User-Agent.
# Here’s how you do that:

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}

url = 'http://quotes.toscrape.com/page/1/'
response = requests.get(url, headers=headers)

soup = BeautifulSoup(response.text, 'lxml')
quotes = soup.find_all('span', class_='text')

for quote in quotes:
    print(quote.text)


“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
“It is our choices, Harry, that show what we truly are, far more than our abilities.”
“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”
“Try not to become a man of success. Rather become a man of value.”
“It is better to be hated for what you are than to be loved for what you are not.”
“I have not failed. I've just found 10,000 ways that won't work.”
“A woman is like a tea bag; you never know how strong it is until it's in hot water.”
“A day without sunshine is like, you know, night.”


In [13]:
# ✅ Part 5: Using requests.Session()
# If a site uses cookies or login sessions, use Session() to persist them.

session = requests.Session()
session.headers.update(headers)

url = 'http://quotes.toscrape.com/page/1/'
res = session.get(url)
soup = BeautifulSoup(res.text, 'lxml')

In [14]:
# ✅ Part 6: Error Handling with try-except
# Make your scraper robust against failures (e.g. 404, no Internet, broken HTML).

try:
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()  # raise error for 4xx/5xx
    soup = BeautifulSoup(response.text, 'lxml')
except requests.exceptions.HTTPError as errh:
    print("HTTP Error:", errh)
except requests.exceptions.ConnectionError as errc:
    print("Connection Error:", errc)
except requests.exceptions.Timeout as errt:
    print("Timeout Error:", errt)
except requests.exceptions.RequestException as err:
    print("Request Failed:", err)


In [15]:
# ✅ Part 7: Save to JSON, Excel
# You’ll need this for freelance deliverables.

# Save to JSON
import json
with open('quotes.json', 'w') as f:
    json.dump(all_quotes, f, indent=4)

# Save to Excel
df.to_excel('quotes.xlsx', index=False)


In [16]:
# ✅ BONUS: Respectful Scraping with Delay
# Always pause between requests to avoid getting blocked:

import time

time.sleep(2)  # Pause 2 seconds between requests


In [17]:
# 🚀 Stage 3: Scraping JavaScript-Rendered Sites with Selenium in Google Colab
# Some websites load content via JavaScript, meaning requests and BeautifulSoup can’t see the data. You need a real browser for that
#  — and that’s where Selenium comes in.

#  Part 1: Setup Selenium in Google Colab (Headless Chrome)
# Google Colab doesn’t support GUI-based Chrome, so we use headless mode.

# Install packages
!apt-get update # just in case
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install selenium

# Set up environment
import sys
sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')


0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [Connected to cloud.r-project.org (3.171.85.123)] [Con                                                                               Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
                                                                               Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
0% [2 InRelease 128 kB/128 kB 100%] [Connected to cloud.r-project.org (3.171.850% [Waiting for headers] [Waiting for headers] [Connected to r2u.stat.illinois.                                                                               Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
0% [Waiting for headers] [Connected to r2u.stat.illinois.edu (192.17.190.167)]                                                                                Get:5 http://archive.ubuntu.com/ubuntu jamm

In [1]:
!apt-get update -y > /dev/null
!apt install -y chromium-chromedriver > /dev/null
!pip install -U selenium beautifulsoup4 > /dev/null


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)




In [2]:
# ✅ Part 2: Basic Selenium Browser Script

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

# Headless Chrome setup
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--no-sandbox")

driver = webdriver.Chrome(options=chrome_options)

# Load page
driver.get("http://quotes.toscrape.com/js/")  # JS-based version

# Wait for JavaScript to load
time.sleep(2)

# Use BeautifulSoup on rendered HTML
soup = BeautifulSoup(driver.page_source, 'lxml')
quotes = soup.find_all("span", class_="text")

for quote in quotes:
    print(quote.text)

driver.quit()


“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
“It is our choices, Harry, that show what we truly are, far more than our abilities.”
“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”
“Try not to become a man of success. Rather become a man of value.”
“It is better to be hated for what you are than to be loved for what you are not.”
“I have not failed. I've just found 10,000 ways that won't work.”
“A woman is like a tea bag; you never know how strong it is until it's in hot water.”
“A day without sunshine is like, you know, night.”


In [3]:
# ✅ Part 3: Wait for Elements to Load (Dynamic Sites)
# Sites like LinkedIn, Airbnb, Upwork may take time to load content. Use WebDriverWait to wait until the content appears:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver.get("http://quotes.toscrape.com/js/")

# Wait until quote block is visible
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, "quote"))
)

soup = BeautifulSoup(driver.page_source, 'lxml')
quotes = soup.find_all("span", class_="text")

for quote in quotes:
    print(quote.text)

driver.quit()




MaxRetryError: HTTPConnectionPool(host='localhost', port=53295): Max retries exceeded with url: /session/56a138673b1f3b4930300c3d4a9e4742/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7999c1285350>: Failed to establish a new connection: [Errno 111] Connection refused'))

In [4]:
# ✅ Part 5: BONUS – Take Screenshots (for Clients)

driver.get("http://quotes.toscrape.com/js/")
driver.save_screenshot("quotes_page.png")




MaxRetryError: HTTPConnectionPool(host='localhost', port=53295): Max retries exceeded with url: /session/56a138673b1f3b4930300c3d4a9e4742/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7999c11b8850>: Failed to establish a new connection: [Errno 111] Connection refused'))

In [6]:
# 🧪 Challenge Task (Optional for Practice)
# Scrape quotes from all pages of the JS-based version of the site:

# Click the "Next" button using Selenium

# Keep scraping until the "Next" button is no longer available

# If you want, I’ll help you code that too.

In [7]:
# Excellent! Now let’s dive into Stage 4, where you learn how to automate login,
# fill forms, and interact with dropdowns, search bars, and more — essential skills for freelance gigs and client work.

# 🚪 Stage 4: Login & Form Automation with Selenium
# ✅ Part 1: Automate Login to a Website

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time

# Headless Chrome for Colab
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--no-sandbox")

driver = webdriver.Chrome(options=chrome_options)

# Go to login page
driver.get("http://quotes.toscrape.com/login")

# Fill in the username and password
driver.find_element(By.ID, "username").send_keys("admin")
driver.find_element(By.ID, "password").send_keys("admin")

# Submit the form
driver.find_element(By.CSS_SELECTOR, 'input[type="submit"]').click()

# Wait and check if login was successful
time.sleep(2)

# Grab page content after login
from bs4 import BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'lxml')
quotes = soup.find_all("span", class_="text")

for quote in quotes:
    print(quote.text)

driver.quit()


ElementClickInterceptedException: Message: element click intercepted: Element <input type="submit" value="Login" class="btn btn-primary"> is not clickable at point (56, 367). Other element would receive the click: <div class="container">...</div>
  (Session info: chrome=138.0.7204.49)
Stacktrace:
#0 0x58f42508c26a <unknown>
#1 0x58f424b36ab0 <unknown>
#2 0x58f424b8f55c <unknown>
#3 0x58f424b8d3f4 <unknown>
#4 0x58f424b8aa62 <unknown>
#5 0x58f424b8a180 <unknown>
#6 0x58f424b7c90a <unknown>
#7 0x58f424bae1a2 <unknown>
#8 0x58f424b7c28a <unknown>
#9 0x58f424bae36e <unknown>
#10 0x58f424bd3fee <unknown>
#11 0x58f424badf73 <unknown>
#12 0x58f424b7aaeb <unknown>
#13 0x58f424b7b751 <unknown>
#14 0x58f425050b7b <unknown>
#15 0x58f425054959 <unknown>
#16 0x58f425037959 <unknown>
#17 0x58f425055518 <unknown>
#18 0x58f42501c10f <unknown>
#19 0x58f425079918 <unknown>
#20 0x58f425079af6 <unknown>
#21 0x58f42508b586 <unknown>
#22 0x7e5884d5dac3 <unknown>


In [10]:
# ✅ Part 2: Handling Search Bars and Text Input
# Example: Automate Google Search

# ⚠️ Google aggressively blocks bots. For practice only — don’t scrape Google heavily.

driver.get("https://www.google.com/")

# Accept cookies if needed (depends on region)
try:
    accept_btn = driver.find_element(By.ID, "L2AGLb")
    accept_btn.click()
except:
    pass

search_input = driver.find_element(By.NAME, "q")
search_input.send_keys("Web scraping with Python")

search_input.submit()

time.sleep(2)
soup = BeautifulSoup(driver.page_source, 'lxml')
titles = soup.select("h3")

for title in titles[:5]:
    print(title.text)

driver.quit()




MaxRetryError: HTTPConnectionPool(host='localhost', port=34591): Max retries exceeded with url: /session/54e7e4fe9a1ec7aa0d753cf1433a1c88/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7999c0efd750>: Failed to establish a new connection: [Errno 111] Connection refused'))

In [11]:
# ✅ Part 3: Handling Dropdowns, Checkboxes, and Buttons

from selenium.webdriver.support.ui import Select

# For dropdowns like <select id="dropdown">...</select>
dropdown = Select(driver.find_element(By.ID, 'dropdown'))
dropdown.select_by_visible_text('Option 1')
# For checkboxes
checkbox = driver.find_element(By.ID, 'remember')
if not checkbox.is_selected():
    checkbox.click()




MaxRetryError: HTTPConnectionPool(host='localhost', port=34591): Max retries exceeded with url: /session/54e7e4fe9a1ec7aa0d753cf1433a1c88/element (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7999c0effd90>: Failed to establish a new connection: [Errno 111] Connection refused'))

In [13]:
# ✅ Part 4: Clicking “Load More” Buttons
# Many websites load more items via a button. You can click like this:

while True:
    try:
        load_btn = driver.find_element(By.CLASS_NAME, "load-more")
        load_btn.click()
        time.sleep(2)  # wait for items to load
    except:
        break  # no more button




In [14]:
# 🧪 Mini Project Idea
# You can now create a bot that:

# Logs into a site (e.g., job board)

# Fills a search form (e.g., “Python jobs”)

# Scrapes paginated results

# Saves them to CSV

# I'll help you build it step by step if you want.

In [15]:
# Great! Now you’re entering the real battlefield of web scraping: sites that try to block bots.
# Welcome to Stage 5: Anti-Bot Evasion & Scraping at Scale — a must-know for real-world freelance jobs and enterprise-level scraping.

# 🛡️ Stage 5: Anti-Bot Evasion & Scraping at Scale
# Websites don’t like bots for many reasons (security, load, profit). You need techniques to:

# Avoid detection

# Handle blocks, CAPTCHAs

# Rotate your identity

# ✅ Part 1: User-Agent Rotation
# Never scrape using a fixed User-Agent. Use a random one from a list.

import random

user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)...",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)...",
    "Mozilla/5.0 (Linux; Android 10)...",
    # Add 5–10 more realistic agents
]

headers = {
    "User-Agent": random.choice(user_agents)
}
response = requests.get(url, headers=headers)


NameError: name 'requests' is not defined

In [17]:
# ✅ Part 2: Proxy Rotation (Free & Paid)
# A proxy hides your real IP. Rotating proxies helps avoid IP bans.

proxies = {
    "http": "http://12.34.56.78:8080",
    "https": "http://12.34.56.78:8080"
}
res = requests.get(url, proxies=proxies, headers=headers)

# 🧠 You can use:

# Free proxies (less reliable): https://free-proxy-list.net/

# Paid services (more reliable): ScraperAPI, BrightData, Oxylabs

NameError: name 'requests' is not defined

In [18]:
# ✅ Part 3: CAPTCHA Handling (⚠️ Hard Level)
# If you hit a CAPTCHA:

# Options:
# ❌ Avoid — skip site or slow down scraping

# 🔁 Retry after delay

# 🧠 Manual Solve (for small jobs): open browser + human input

# 🤖 Use CAPTCHA-solving APIs (e.g., 2Captcha, Anti-Captcha) – paid

# You can detect a CAPTCHA page like this:
if "captcha" in response.text.lower():
    print("Blocked by CAPTCHA!")


NameError: name 'response' is not defined

In [19]:
# ✅ Part 4: Delays + Randomized Timing

import time

# Pause randomly between 1 to 5 seconds
time.sleep(random.uniform(1, 5))


In [21]:
# ✅ Part 5: Retry & Resilient Scraper with Backoff

import requests
import time

def fetch(url, retries=3):
    for i in range(retries):
        try:
            res = requests.get(url, headers=headers, timeout=10)
            res.raise_for_status()
            return res
        except Exception as e:
            print(f"Try {i+1}: Error - {e}")
            time.sleep(2 ** i)  # exponential backoff
    return None
# 📢 Always check for a public API first. Saves time, avoids legal issues.

In [22]:
# ✅ Part 7: Scaling Best Practices
# For large-scale scraping jobs:

# Break tasks into batches

# Save intermediate results to resume scraping

# Use logs to track failed URLs

# Use queues or async frameworks for parallel scraping

In [25]:
# 🎯 Stage 6: Real Freelance Projects + Portfolio
# We’ll cover:

# Job board scraping (RemoteOK, Upwork, LinkedIn)

# E-commerce scraping (product + price + reviews)

# News scraping with date + headlines

# Portfolio building & client-ready scripts

In [28]:
# ✅ Part 1: Freelance-Ready Project Structure
# folder structure

# web-scraper-project/
# ├── scraper.py
# ├── requirements.txt
# ├── README.md
# ├── output/
# │   └── data.csv
# └── utils/
#     └── helpers.py



In [29]:
# Project 1: Job Scraper – RemoteOK
# Target: https://remoteok.com/

# ✅ Goal: Extract job title, company, tags, date, and apply link

import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://remoteok.com/"
headers = {"User-Agent": "Mozilla/5.0"}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "lxml")

jobs = soup.find_all("tr", class_="job")

data = []
for job in jobs:
    title = job.find("h2").text.strip() if job.find("h2") else None
    company = job.find("h3").text.strip() if job.find("h3") else None
    tags = [tag.text.strip() for tag in job.find_all("span", class_="tag")]
    date = job.find("time")
    date_posted = date["datetime"] if date else None
    link = "https://remoteok.com" + job["data-href"] if job.has_attr("data-href") else None

    data.append({
        "Title": title,
        "Company": company,
        "Tags": ", ".join(tags),
        "Date Posted": date_posted,
        "Apply Link": link
    })

df = pd.DataFrame(data)
df.to_csv("remote_jobs.csv", index=False)
df.head()


Unnamed: 0,Title,Company,Tags,Date Posted,Apply Link
0,Senior Python Backend Engineer,Orga AI,,2025-06-26T11:26:17+00:00,https://remoteok.com/remote-jobs/remote-senior...


In [31]:
# 📌 Project 2: E-commerce Scraper (Books To Scrape)
# Target: http://books.toscrape.com
# 💻 Pagination + Data Extract
# ✅ Goal: Scrape book title, price, rating, availability, and category.

base_url = "http://books.toscrape.com/catalogue/page-{}.html"
all_books = []

for page in range(1, 4):  # scrape first 3 pages
    res = requests.get(base_url.format(page))
    soup = BeautifulSoup(res.text, "lxml")
    books = soup.find_all("article", class_="product_pod")

    for book in books:
        title = book.h3.a["title"]
        price = book.find("p", class_="price_color").text
        rating = book.p["class"][1]
        availability = book.find("p", class_="instock availability").text.strip()

        all_books.append({
            "Title": title,
            "Price": price,
            "Rating": rating,
            "Availability": availability
        })

pd.DataFrame(all_books).to_csv("books.csv", index=False)


In [32]:
# 📌 Project 3: News Headlines Scraper
# Target: https://www.reuters.com/

# ✅ Goal: Extract latest headlines, categories, timestamps.

url = "https://www.reuters.com/"
res = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(res.text, "lxml")

headlines = soup.find_all("h3")

for i, h in enumerate(headlines[:10], 1):
    print(f"{i}. {h.text.strip()}")


In [33]:
# 🎓 Portfolio Tips for Freelancers
# ✅ Upload to GitHub
# Create repos like:

# freelance-job-scraper

# ecommerce-product-crawler

# news-headlines-scraper

# Add:

# A clear README.md

# Sample output (CSV, JSON)

# Screenshots if needed

# ✅ Showcase on Your Freelancing Profile
# Include scraping in your skills + portfolio:

# “I built a Python scraper for e-commerce sites”

# “Extracted job listings from RemoteOK and formatted to Excel”

# “Built login bots and scraping pipelines using Selenium and BeautifulSoup”

# ✅ Bonus: Convert to Web App (Later Stage)
# When you’re ready:

# Use Flask or Streamlit to turn scrapers into web apps

# Deploy with PythonAnywhere, Render, or Vercel

# 🧩 What You Can Scrape as a Freelancer
# Niche	Sites
# Job listings	RemoteOK, Upwork, Indeed
# Product scraping	Flipkart, Amazon, Etsy (careful here!)
# Real estate	Zillow, MagicBricks
# Stock/crypto data	Yahoo Finance, CoinMarketCap
# News headlines	Reuters, BBC, CNN
# Academic papers	arXiv, Semantic Scholar

# ✅ Summary: You're Ready!
# Stage	Mastered Topics
# 1–2	Basics: requests, soup, pagination, CSV
# 3–4	JavaScript sites, Selenium, login, forms
# 5	Anti-bot, CAPTCHA, user-agent, proxies
# 6	Freelance projects, portfolio, job-ready scripts