In [4]:
pip install requests beautifulsoup4 pandas lxml selenium webdriver-manager


Collecting lxml
  Using cached lxml-6.0.2-cp313-cp313-win_amd64.whl.metadata (3.7 kB)
Collecting selenium
  Using cached selenium-4.38.0-py3-none-any.whl.metadata (7.5 kB)
Collecting webdriver-manager
  Using cached webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Using cached urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting trio<1.0,>=0.31.0 (from selenium)
  Using cached trio-0.31.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket<1.0,>=0.12.2 (from selenium)
  Using cached trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting certifi>=2017.4.17 (from requests)
  Using cached certifi-2025.10.5-py3-none-any.whl.metadata (2.5 kB)
Collecting typing-extensions>=4.0.0 (from beautifulsoup4)
  Using cached typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Collecting python-dotenv (from webdriver-manager)
  Using cached python_dotenv-1.2.1-py3-none-any.whl.metadata (25 kB)
Collecting s


[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from urllib.parse import urljoin


In [7]:


BASE_URL = "https://books.toscrape.com/"

def parse_book_card(article):
    title = article.h3.a['title'].strip()
    price_text = article.find("p", class_="price_color").get_text().strip()
    availability = article.find("p", class_="instock availability").get_text().strip()
    star_tag = article.find("p", class_="star-rating")
    star_classes = star_tag.get("class", [])
    rating_word = None
    for cls in star_classes:
        if cls != "star-rating":
            rating_word = cls
            break
    return {"Title": title, "Price": price_text, "Availability": availability, "Star Rating": rating_word}

def scrape_all_books():
    results = []
    next_url = BASE_URL
    session = requests.Session()
    headers = {"User-Agent": "Mozilla/5.0"}
    while next_url:
        resp = session.get(next_url, headers=headers, timeout=20)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        articles = soup.find_all("article", class_="product_pod")
        for art in articles:
            results.append(parse_book_card(art))
        next_li = soup.select_one("li.next > a")
        if next_li:
            href = next_li.get("href")
            next_url = urljoin(next_url, href)
            time.sleep(0.5)
        else:
            next_url = None
    return results

if __name__ == "__main__":
    books = scrape_all_books()
    df = pd.DataFrame(books, columns=["Title", "Price", "Availability", "Star Rating"])
    df.to_csv("books.csv", index=False)
    print(f"Saved {len(df)} books to books.csv")


Saved 1000 books to books.csv


In [9]:
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

IMDB_TOP250_URL = "https://www.imdb.com/chart/top/"

def get_driver(headless=True):
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless=new")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--log-level=3")
    chrome_options.add_argument("--remote-debugging-port=9222")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                                "AppleWebKit/537.36 (KHTML, like Gecko) "
                                "Chrome/120.0.0.0 Safari/537.36")
    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=chrome_options)

def scrape_imdb_top250(headless=True):
    driver = get_driver(headless)
    try:
        driver.get(IMDB_TOP250_URL)
        time.sleep(3)
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")

        # IMDb recently changed table structure — target the correct selector
        table = soup.select_one("ul.ipc-metadata-list")
        if not table:
            raise ValueError("IMDb table not found — structure may have changed.")
        
        movies = []
        items = table.find_all("li", class_="ipc-metadata-list-summary-item")
        for i, item in enumerate(items, start=1):
            title_tag = item.find("h3")
            title = title_tag.get_text(strip=True).split('. ', 1)[-1] if title_tag else None
            year_tag = item.find("span", class_="cli-title-metadata-item")
            year = year_tag.get_text(strip=True) if year_tag else None
            rating_tag = item.find("span", class_="ipc-rating-star")
            rating = rating_tag.get_text(strip=True).replace("⭐", "") if rating_tag else None
            movies.append({"Rank": i, "Title": title, "Year": year, "IMDB Rating": rating})
        
        return pd.DataFrame(movies)
    
    finally:
        driver.quit()

if __name__ == "__main__":
    df = scrape_imdb_top250(headless=True)
    df.to_csv("imdb_top250.csv", index=False)
    print(f"✅ Saved {len(df)} rows to imdb_top250.csv")


✅ Saved 250 rows to imdb_top250.csv


In [17]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time

BASE = "https://www.timeanddate.com"
URL = "https://www.timeanddate.com/weather/?sort=1&low=4"

options = Options()
options.add_argument("--headless")  
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--window-size=1920,1080")
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.get(URL)
time.sleep(3)

soup = BeautifulSoup(driver.page_source, "html.parser")
driver.quit()

rows = soup.select("table.zebra.tb-wt tbody tr")
data = []

for i, row in enumerate(rows[:50], start=1):
    cells = row.find_all("td")
    if len(cells) < 3:
        continue
    city_tag = cells[0].find("a")
    city = city_tag.get_text(strip=True) if city_tag else None
    href = BASE + city_tag["href"] if city_tag else None
    temp = cells[1].get_text(strip=True)
    condition = cells[2].get_text(strip=True)
    data.append({
        "City Name": city,
        "Temperature": temp,
        "Weather Condition": condition,
        "URL": href
    })
    print(f"{i}. {city} ✓")

df = pd.DataFrame(data)
df.to_csv("weather.csv", index=False)
print(f"\n✅ Saved {len(df)} rows to weather.csv")



✅ Saved 0 rows to weather.csv
