## Assignment 4

Q1 Write a Python program to scrape all available books from the website (https://books.toscrape.com/) Books to Scrape – a live site built for practicing scraping (safe, legal, no anti-bot). For each book, extract the following details:
1. Title
2. Price
3. Availability (In stock / Out of stock)
4. Star Rating (One, Two, Three, Four, Five)
Store the scraped results into a Pandas DataFrame and export them to a CSV file named books.csv.
(Note: Use the requests library to fetch the HTML page. Use BeautifulSoup to parse and extract book details and handle pagination so that books from all pages are scraped)

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

BASE_URL = "https://books.toscrape.com/catalogue/page-{}.html"

def get_star_rating(tag):
    # Extract star rating as text (One, Two, Three, Four, Five).
    classes = tag.get("class", [])
    ratings = ["One", "Two", "Three", "Four", "Five"]
    for r in ratings:
        if r in classes:
            return r
    return None

books = []
page = 1

while True:
    url = BASE_URL.format(page)
    response = requests.get(url)

    if response.status_code != 200:
        break  # No more pages
    
    soup = BeautifulSoup(response.text, "html.parser")
    articles = soup.find_all("article", class_="product_pod")
    
    if not articles:
        break  # No books found on this page
    
    for article in articles:
        title = article.h3.a["title"]
        price = article.find("p", class_="price_color").text.strip()
        availability = article.find("p", class_="instock availability").text.strip()
        star_rating = get_star_rating(article.find("p", class_="star-rating"))
        
        books.append({
            "Title": title,
            "Price": price,
            "Availability": availability,
            "Star Rating": star_rating
        })
    
    page += 1

# Create DataFrame
df = pd.DataFrame(books)

# Save to CSV
df.to_csv("books.csv", index=False)

print(f"Scraped {len(df)} books. Data saved to books.csv")

Scraped 1000 books. Data saved to books.csv


In [8]:
df.head()

Unnamed: 0,Title,Price,Availability,Star Rating
0,A Light in the Attic,Â£51.77,In stock,Three
1,Tipping the Velvet,Â£53.74,In stock,One
2,Soumission,Â£50.10,In stock,One
3,Sharp Objects,Â£47.82,In stock,Four
4,Sapiens: A Brief History of Humankind,Â£54.23,In stock,Five


Q2. Write a Python program to scrape the IMDB Top 250 Movies list (https://www.imdb.com/chart/top/) . For each movie, extract the following details:
1. Rank (1–250)
2. Movie Title
3. Year of Release
4. IMDB Rating
Store the results in a Pandas DataFrame and export it to a CSV file named imdb_top250.csv.
(Note: Use Selenium/Playwright to scrape the required details from this website)

In [9]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# Setup Chrome
options = webdriver.ChromeOptions()
# options.add_argument("--headless")  
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

url = "https://www.imdb.com/chart/top/"
driver.get(url)

# Wait until the list is visible
WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "li.ipc-metadata-list-summary-item"))
)

movies = driver.find_elements(By.CSS_SELECTOR, "li.ipc-metadata-list-summary-item")

all_movies = []

for rank, movie in enumerate(movies, start=1):
    try:
        # Title
        title = movie.find_element(By.CSS_SELECTOR, "h3.ipc-title__text").text.strip()

        # Year (first <span> inside metadata block)
        metadata_spans = movie.find_elements(By.CSS_SELECTOR, "div.cli-title-metadata span")
        year = metadata_spans[0].text if metadata_spans else "N/A"

        # Rating
        rating = movie.find_element(By.CSS_SELECTOR, "span.ipc-rating-star--rating").text.strip()

        all_movies.append({
            "Rank": rank,
            "Title": title,
            "Year": year,
            "IMDB Rating": rating
        })
    except Exception as e:
        print(f"⚠️ Skipping movie #{rank} due to error: {e}")
        continue

driver.quit()

# Save to CSV
df = pd.DataFrame(all_movies)
df.to_csv("imdb_top250.csv", index=False, encoding="utf-8")

print(f"Scraped {len(df)} movies and saved to imdb_top250.csv")

Scraped 250 movies and saved to imdb_top250.csv


Q3. Write a Python program to scrape the weather information for top world cities from the given website (https://www.timeanddate.com/weather/) . For each city, extract the following details:
1. City Name
2. Temperature
3. Weather Condition (e.g., Clear, Cloudy, Rainy, etc.)
Store the results in a Pandas DataFrame and export it to a CSV file named weather.csv.

In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

# Setup Chrome
options = Options()
options.add_argument("--headless")   # remove if you want to see browser
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

driver.get("https://www.timeanddate.com/weather/")
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "table.zebra.fw.tb-theme"))
)

# Get the table
table = driver.find_element(By.CSS_SELECTOR, "table.zebra.fw.tb-theme")
rows = table.find_elements(By.TAG_NAME, "tr")

cities, conditions, temps = [], [], []

for row in rows[1:]:   # skip header
    cols = row.find_elements(By.TAG_NAME, "td")
    if not cols:
        continue

    # each city is 4 cols: city, time, condition(icon), temperature
    for i in range(0, len(cols), 4):
        if i + 3 >= len(cols):
            continue

        city = cols[i].text.strip()

        cond_cell = cols[i+2]
        condition = "N/A"
        try:
            img = cond_cell.find_element(By.TAG_NAME, "img")
            condition = img.get_attribute("alt") or img.get_attribute("title")
        except:
            if cond_cell.text.strip():
                condition = cond_cell.text.strip()

        temp = cols[i+3].text.strip()

        if city:
            cities.append(city)
            conditions.append(condition)
            temps.append(temp)

driver.quit()

df = pd.DataFrame({
    "City": cities,
    "Condition": conditions,
    "Temperature": temps
})

print(f"Scraping of {len(df)} cities done. Data saved to weather.csv")
print(df.head(10))
df.to_csv("weather.csv", index=False)

Scraping of 140 cities done. Data saved to weather.csv
             City                Condition Temperature
0           Accra  Scattered clouds. Warm.       27 °C
1      Edmonton *             Sunny. Mild.       17 °C
2        Nassau *      Broken clouds. Hot.       32 °C
3     Addis Ababa    Passing clouds. Cool.       16 °C
4     Frankfurt *     Broken clouds. Mild.       17 °C
5       New Delhi               Fog. Warm.       31 °C
6        Adelaide              Quite cool.       12 °C
7  Guatemala City     Broken clouds. Mild.       23 °C
8   New Orleans *             Sunny. Warm.       29 °C
9         Algiers          Overcast. Warm.       26 °C
