# Notebook 1 
## COMP30760 — Assignment 1  
## Charlie Zhang - 23341901

## Task 1: Data Collection
- Web scraping and parsing
- Saving data in an appropriate format.

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
import re
from urllib.parse import urljoin
from io import StringIO

Download the HTML source code for the target web page at the link. This is where we will scrape all the data.

In [None]:
# Address of the index page to scrape. The original note here reads
# "remove the url", so the value is presumably blanked out on purpose —
# set it to the real index page address before running the cells below.
url = ""

In [3]:
import urllib.request
import urllib.error

# Download the index page. On success `html` holds the decoded page source;
# on any failure an explanatory message is printed and `html` is set to None.
try:
    # The context manager closes the HTTP response as soon as the body has
    # been read, instead of leaving the connection open.
    with urllib.request.urlopen(url) as response:
        # read the response data (bytes) and decode it into a string;
        # undecodable bytes are replaced rather than raising
        html = response.read().decode("utf-8", errors="replace")
    # print the html page
    print(html)
# the server answered, but with an error status
except urllib.error.HTTPError as e:
    print(f"HTTP Error {e.code}: {e.reason}")
    html = None
# the request failed with a URL-level error that has no HTTP status
except urllib.error.URLError as e:
    print(f"Network Error: {e.reason}")
    html = None
# anything else (for example a malformed or empty url)
except Exception as e:
    print(f"Unexpected error: {e}")
    html = None

<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <meta name="robots" content="noindex">  
  <meta name="description" content="Car sale records database for educational purposes. Browse second-hand car sales by manufacturer with detailed information including price, year, mileage, and specifications.">
  <title>Car Sale Records - Browse Used Car Sales Database</title>
  <link rel="stylesheet" href="http://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css">
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.3/jquery.min.js"></script>
  <script src="http://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/js/bootstrap.min.js"></script>
  <link rel="stylesheet" type="text/css" href="style.css">
</head>
<body>
    <div class="container">
      <main>
        <section class="instructions">
          <div class="row">
            <div

Find all the links on the index page in order to identify the makes of the cars.

In [4]:
import bs4

# create a BeautifulSoup object from the index page
# using lxml because it is faster and more reliable than the html.parser
# The download cell sets html to None when the request fails, so fall back to
# an empty document here; the "No links found" branch below then reports it
# instead of None being passed to BeautifulSoup.
soup = bs4.BeautifulSoup(html or "", "lxml")

# from checking all of the links, the make/brand links all contain page01.html
# (the first listing page of each make), so keep only the anchors whose href
# contains that text
find_makes = [link for link in soup.find_all("a", href=True) if "page01.html" in link["href"]]


# display how many make links were found and which ones they are
if find_makes:
    print(f"Found {len(find_makes)} make links:")

    for link in find_makes:
        print(f"- {link.get_text(strip=True)} -> {link['href']}")

else:
    # nothing matched (or the index page could not be downloaded)
    print("No links found")


Found 4 make links:
- Make: Audi -> Audi-page01.html
- Make: BMW -> BMW-page01.html
- Make: Mercedes-Benz -> Mercedes-Benz-page01.html
- Make: Volkswagen -> Volkswagen-page01.html


Import all necessary objects

In [5]:
import time
import urllib.request
import urllib.error
from urllib.parse import urljoin
from io import StringIO

import bs4
import pandas as pd

Build a (make, absolute URL) pair for each make link. Each link ends in `-page01.html`, so stripping that suffix from the link leaves the make name.

In [6]:
# Pair each make name with the absolute URL of its first listing page.
get_links = []
for anchor in find_makes:
    href = anchor["href"]
    # everything before the "-page01.html" suffix is the make name
    make_name = href.rsplit("-page01.html", 1)[0]
    # hrefs on the index page are relative, so resolve them against the index url
    first_page_url = urljoin(url, href)
    get_links.append((make_name, first_page_url))

Fetch every page for each make, parse the raw HTML tables into records, and clean the extracted data.

In [7]:

# One dictionary per scraped table, collected across every make and page.
all_cars = []

# Loop over every make. Listing pages are numbered page01.html, page02.html, ...
# so stripping "page01.html" from the first page's URL gives a prefix that any
# page number can be appended to.
for make, base_url in get_links:
    prefix = base_url.rsplit("page01.html", 1)[0]
    page_no = 1

    # keep requesting consecutive pages until one is missing or fails
    while True:
        page_url = f"{prefix}page{page_no:02d}.html"
        try:
            with urllib.request.urlopen(page_url) as resp:
                page_html = resp.read().decode("utf-8", errors="replace")  # decode the data into text

        except urllib.error.HTTPError as e:
            # a 404 is treated as the normal "no more pages" signal for this make;
            # any other HTTP status is reported before giving up on the make
            if e.code != 404:
                print("HTTP error", e.code, e.reason, "for", page_url)
            break
        except Exception as e:
            print("Fetch error:", e, "for", page_url)
            break

        # second soup, this time for the listing page
        page_soup = bs4.BeautifulSoup(page_html, "lxml")

        # parse each table on the page using pandas.read_html
        for table in page_soup.find_all("table"):

            try:
                # read_html returns a list of DataFrames; only one <table> is passed in
                t = pd.read_html(StringIO(str(table)))[0]
            except ValueError:
                # raised by read_html when it cannot find a parsable table
                continue

            # A record needs both a key column and a value column. Without this
            # check a one-column table would make the column rename below raise
            # ValueError and abort the whole scrape.
            if t.empty or t.shape[1] < 2:
                continue

            # assume first two columns are key and value
            t = t.iloc[:, :2].copy()
            t.columns = ["field", "value"]

            # cleaning the keys: text, trimmed, without the trailing ":"
            t["field"] = t["field"].astype(str).str.strip().str.rstrip(":")

            # cleaning the values: text, trimmed
            t["value"] = t["value"].astype(str).str.strip()

            t = t[t["field"].str.len() > 0]

            # build a record dictionary because it makes fields easily searchable;
            # missing cells become the string "nan" after astype(str), so skip those
            record = {}
            for f, v in zip(t["field"], t["value"]):
                if v and v.lower() != "nan":
                    record[f] = v

            # Standardise differently-cased "sale price" labels to "Sale Price".
            # The same empty/"nan" filter as above is applied so that a missing
            # price is left out of the record rather than stored as the text "nan".
            sales = t["field"].str.strip().str.lower() == "sale price"
            sale_values = [v for v in t.loc[sales, "value"] if v and v.lower() != "nan"]
            if sale_values:
                record["Sale Price"] = sale_values[0]

            # the nearest preceding heading is used as the record's title
            title = table.find_previous(["h1", "h2", "h3"])
            if title:
                record["Title"] = title.get_text(" ", strip=True)

            # tag the record with its make and keep it (empty records are dropped)
            if record:
                record["Make"] = make
                all_cars.append(record)

        page_no += 1  # move on to the next page
        time.sleep(0.2)  # small delay between page requests

Convert the scraped records into a pandas DataFrame and save it as a JSON file.

In [8]:
# Turn the list of scraped record dictionaries into a table.
df = pd.DataFrame.from_records(all_cars)

# Write the table as a JSON array of objects, indented by two spaces and with
# non-ASCII characters written as-is rather than escaped.
json_options = {"orient": "records", "indent": 2, "force_ascii": False}
df.to_json("cars_dataset.json", **json_options)

print("cars_dataset.json created")

cars_dataset.json created
