In [1]:
from datetime import datetime
import pandas as pd
from bs4 import BeautifulSoup
from helpers import normalize_filename, fetch

## Good techniques for organizing your scraped data

- Cache your content when fetching it
- Minimize the number of requests you make
- Use a deterministic file naming scheme or database 

In [2]:
# Wikipedia article used below to demonstrate the filename and fetch helpers
url = 'https://en.wikipedia.org/wiki/Estimation_of_covariance_matrices'

In [3]:
# Preview the file name that `normalize_filename` builds from this URL
# (presumably the name `fetch` caches the page under — confirm in helpers)
normalize_filename(url)

'enwikipediaorg_wiki_Estimation_of_covari_d276bab9-20231103.html'

In [5]:
# Retrieve the page through the `fetch` helper; it prints a notice when a cached copy is used
html = fetch(url)

Using cached version of https://en.wikipedia.org/wiki/Estimation_of_covariance_matrices


In [8]:
# Get the HTML content of the page
# NOTE: `url` is rebound here from the Wikipedia article to the Brickset set page
url = 'https://brickset.com/sets/30496-1/U-Wing-Fighter'
page = fetch(url)

Using cached version of https://brickset.com/sets/30496-1/U-Wing-Fighter


In [9]:
# Parse the fetched Brickset page using Python's built-in 'html.parser' backend
soup = BeautifulSoup(page, 'html.parser')

## Basic CSS selectors

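The basic CSS selectors below cover the most common ways to target elements. Each entry gives the selector syntax, what it matches, and an example.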

#### tag selector: `{tag_name}`: used to specify **all elements of the same type**

- Example: `p` will select all paragraphs in the document.

#### id selector: `#{id_name}`: used to specify a **single, unique element**.
- Example: `#username` will select the element with `id="username"`.

#### class selector: `.{class_name}`: used to specify **one or more elements of the same class**.
- Example: `.important` will select all elements whose `class` attribute includes `important`.

#### attribute selector: `[attribute_name]`: used to specify **one or more elements with the same attribute**.
- Example: `[href]` will select all elements with an `href` attribute.

#### attribute value selector: `[attribute_name="value"]`: used to specify **one or more elements with the same attribute value**.
- Example: `[href="https://www.google.com"]` will select all elements with `href="https://www.google.com"`.




In [13]:
# Tag selector in action: take the first <h1> on the page and show its trimmed text.
# select_one returns only the first match instead of building the full list of headings.
soup.select_one('h1').get_text().strip()

'30496: U-wing Fighter'

In [None]:
# Descendant selector: every <dt> inside a <dl> inside an element with class "featurebox"
'.featurebox dl dt'

In [10]:
# Every element carrying the "featurebox" class, matched via the class_ keyword
soup.find_all(class_='featurebox')

[<section class="featurebox"><h2>Details</h2><div class="text">
 <dl>
 <dt>Number</dt>
 <dd>30496-1</dd>
 <dt>Name</dt>
 <dd>U-wing Fighter</dd>
 <dt>Type</dt>
 <dd>Normal</dd>
 <dt>Theme group</dt>
 <dd>Licensed</dd>
 <dt>Theme</dt>
 <dd><a href="/sets/theme-Star-Wars">Star Wars</a></dd>
 <dt>Subtheme</dt>
 <dd><a href="/sets/subtheme-Rogue-One">Rogue One</a></dd>
 <dt>Year released</dt>
 <dd><a href="/sets/theme-Star-Wars/year-2017">2017</a></dd>
 <dt>Launch/exit</dt>
 <dd>01 Jan 17 - 31 Dec 18</dd>
 <dt>Tags</dt>
 <dd><div class="viewtags"><a class="plain" href="#">View tags »</a></div><div id="tags26907"><a href="/sets/tag-Disney">Disney</a><br/><a href="/sets/tag-Microscale">Microscale</a><br/><a href="/sets/tag-Polybag">Polybag</a><br/><a href="/sets/tag-Rebel-Alliance">Rebel Alliance</a><br/><a href="/sets/tag-Rogue-One">Rogue One</a><br/><a href="/sets/tag-Star-Wars-Anthology">Star Wars Anthology</a><br/><a href="/sets/tag-Starfighter">Starfighter</a><br/><a href="/sets/tag-Tra

In [14]:
# Locate the feature box and the description list within it:
# select_one returns the first <dl> nested under .text inside .featurebox
feature_box = soup.select_one(".featurebox .text dl")

In [15]:
# Inspect the selected <dl> element
feature_box

<dl>
<dt>Number</dt>
<dd>30496-1</dd>
<dt>Name</dt>
<dd>U-wing Fighter</dd>
<dt>Type</dt>
<dd>Normal</dd>
<dt>Theme group</dt>
<dd>Licensed</dd>
<dt>Theme</dt>
<dd><a href="/sets/theme-Star-Wars">Star Wars</a></dd>
<dt>Subtheme</dt>
<dd><a href="/sets/subtheme-Rogue-One">Rogue One</a></dd>
<dt>Year released</dt>
<dd><a href="/sets/theme-Star-Wars/year-2017">2017</a></dd>
<dt>Launch/exit</dt>
<dd>01 Jan 17 - 31 Dec 18</dd>
<dt>Tags</dt>
<dd><div class="viewtags"><a class="plain" href="#">View tags »</a></div><div id="tags26907"><a href="/sets/tag-Disney">Disney</a><br/><a href="/sets/tag-Microscale">Microscale</a><br/><a href="/sets/tag-Polybag">Polybag</a><br/><a href="/sets/tag-Rebel-Alliance">Rebel Alliance</a><br/><a href="/sets/tag-Rogue-One">Rogue One</a><br/><a href="/sets/tag-Star-Wars-Anthology">Star Wars Anthology</a><br/><a href="/sets/tag-Starfighter">Starfighter</a><br/><a href="/sets/tag-Transport-Vehicle">Transport Vehicle</a><br/><a href="/sets/tag-Ut-60D-U-Wing-Troop-Tr

In [16]:
# Extract the 'dt' (term) elements, then look up each term's own 'dd'.
# Pairing every <dt> with its next <dd> sibling keeps keys and values aligned
# even when a term has no description or a description contains nested lists;
# zipping two independent find_all() results would silently shift every later pair.
dt_elements = feature_box.find_all('dt')
dd_elements = [dt.find_next_sibling('dd') for dt in dt_elements]

# Use a dict comprehension to map each term's text to its description's text,
# skipping terms that have no following <dd> (find_next_sibling returns None)
data = {
    dt.text.strip(): dd.text.strip()
    for dt, dd in zip(dt_elements, dd_elements)
    if dd is not None
}

# Display the data
print(data)


{'Number': '30496-1', 'Name': 'U-wing Fighter', 'Type': 'Normal', 'Theme group': 'Licensed', 'Theme': 'Star Wars', 'Subtheme': 'Rogue One', 'Year released': '2017', 'Launch/exit': '01 Jan 17 - 31 Dec 18', 'Tags': 'View tags »DisneyMicroscalePolybagRebel AllianceRogue OneStar Wars AnthologyStarfighterTransport VehicleUt-60D U-Wing Troop Transport', 'Pieces': '55', 'RRP': '$3.99', 'Current value': 'New: ~$5\nUsed: ~$3', 'Price per piece': '7.3c', 'Age range': '6 - 12', 'Packaging': 'Polybag', 'Dimensions': '17 x 17 x 0.8 cm (6.7 x 6.7 x 0.3 in)', 'Barcodes': 'UPC: 673419268806EAN: 5702015877039', 'LEGO item numbers': 'NA: 6176989EU: 6176988', 'Notes': '[US] Available in Walmart, May 2017.[NO] Free with Donald Duck and Co. comic, May 2017.[BE] Free with qualifying spend in Maxi Toys, June 2016.[ES] Free with qualifying spend in Carrefour, September 2017.Might be in 5005704-1', 'Rating': '✭✭✭✭✩ 3.6 221 ratings, 2\xa0reviews Brickset review'}


In [17]:
# Show the scraped fields as a one-column table, with the field names as the index
pd.DataFrame(pd.Series(data))

Unnamed: 0,0
Number,30496-1
Name,U-wing Fighter
Type,Normal
Theme group,Licensed
Theme,Star Wars
Subtheme,Rogue One
Year released,2017
Launch/exit,01 Jan 17 - 31 Dec 18
Tags,View tags »DisneyMicroscalePolybagRebel Allian...
Pieces,55
