In [1]:
from bs4 import BeautifulSoup
import requests
import csv
import json

In [2]:
url = "https://baraasalout.github.io/test.html"
request = requests.get(url)
soup = BeautifulSoup(request.content, 'html.parser') # parsing all the html of the website

In [3]:
# Function to extract the wanted text and save it in a CSV file
def extractText():
    data = []
    fields = ['Type', 'Content']

    # extracting the content of every type of text as a list
    h1 = soup.find_all("h1")
    h2 = soup.find_all("h2")
    p = soup.find_all("p")
    li = soup.find_all("li")

    # looping through each type to print it and add it to the data list as a dictionary containing the type of the text and it's content
    print("All h1: ")
    for i in h1:
        print(i.text)
        data.append({"Type": "Heading 1", "Content": i.text})

    print("\n All h2: ")
    for i in h2:
        print(i.text)
        data.append({"Type": "Heading 2", "Content": i.text})

    print("\n All p: ")
    for i in p:
        print(i.text)
        data.append({"Type": "Paragragh", "Content": i.text})

    print("\n All li: ")
    for i in li:
        print(i.text)
        data.append({"Type": "List item", "Content": i.text})

    # Now we create the CSV file and fill it with the content in the data list
    with open('Extract_Text_data.csv', 'w', newline = '', encoding = 'utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fields)
        writer.writeheader()
        writer.writerows(data)

In [4]:
extractText()

All h1: 
Web Scraping Practice

 All h2: 
Available Products
Product Table
Watch This Video
Contact Us
Product Information
Featured Products

 All p: 
Welcome to the web scraping task! Use your skills to extract the required data from this page.
Sharp Objects
£47.82
✔ In stock
In a Dark, Dark Wood
£19.63
✔ In stock
The Past Never Ends
£56.50
✔ In stock
A Murder in Time
£16.64
 Out stock
Wireless Headphones
$49.99
Available colors: Black, White, Blue
Smart Speaker
$89.99
Available colors: Grey, Black
Smart Watch
$149.99
Available colors: Black, Silver, Gold
© 2024 Web Scraping Practice. All Rights Reserved.

 All li: 
Laptop
Smartphone
Tablet
Smartwatch


In [5]:
# function to extract the content out of the table and save it in CSV file
def extractTable():
    # defin the data list and the heading of the table 
    data = []
    fields = ["Product", 'Price', "In Stock"]

    #Extract the table and it's rows as a list
    table = soup.find('table')
    rows = table.find_all('tr')
    

    #now we loop through each row starting form the second one(Becoause the first one is the header), then the data is added as a dictionary to the data list the
    for i in range(1, len(rows)):
        cells = rows[i].find_all('td')
        # print(cells[0])
        data.append({"Product": cells[0].text, "Price": cells[1].text, "In Stock": cells[2].text})

    # then the CSV file is created and filled with the data list
    with open('Extract_Table_Data.csv', 'w', newline = '', encoding = 'utf-8') as file: 
        writer = csv.DictWriter(file, fieldnames = fields)
        writer.writeheader()
        writer.writerows(data)

    for i in data:
        print(data)

In [6]:
extractTable()

[{'Product': 'Laptop', 'Price': '$1000', 'In Stock': 'Yes'}, {'Product': 'Smartphone', 'Price': '$800', 'In Stock': 'No'}, {'Product': 'Tablet', 'Price': '$500', 'In Stock': 'Yes'}]
[{'Product': 'Laptop', 'Price': '$1000', 'In Stock': 'Yes'}, {'Product': 'Smartphone', 'Price': '$800', 'In Stock': 'No'}, {'Product': 'Tablet', 'Price': '$500', 'In Stock': 'Yes'}]
[{'Product': 'Laptop', 'Price': '$1000', 'In Stock': 'Yes'}, {'Product': 'Smartphone', 'Price': '$800', 'In Stock': 'No'}, {'Product': 'Tablet', 'Price': '$500', 'In Stock': 'Yes'}]


In [7]:
# a Function to extract the data from the book cards section in the website as save it in a JSON file
def extractBookCards():
    data = []
    #Extract the cards data from the website as a list using the class of it's container
    cardsContainer = soup.find("div", class_='book-products')
    cards = cardsContainer.find_all('div')

    # loop through the cards and get all the text from it and the text of it's button then add each one in a dictionary in the data list
    for card in cards:
        p = card.find_all('p')
        button = card.find('button')

        data.append({'Book title':p[0].text, 'Price': p[1].text, 'Stock availablity': p[2].text, "Button text": button.text })

    #Create the JSON file and add the data from the data list
    with open("Product_Information.json", 'w') as file:
        json.dump(data, file)

    for i in data:
        print(i)
    


In [8]:
extractBookCards()

{'Book title': 'Sharp Objects', 'Price': '£47.82', 'Stock availablity': '✔ In stock', 'Button text': 'Add to basket'}
{'Book title': 'In a Dark, Dark Wood', 'Price': '£19.63', 'Stock availablity': '✔ In stock', 'Button text': 'Add to basket'}
{'Book title': 'The Past Never Ends', 'Price': '£56.50', 'Stock availablity': '✔ In stock', 'Button text': 'Add to basket'}
{'Book title': 'A Murder in Time', 'Price': '£16.64', 'Stock availablity': ' Out stock', 'Button text': 'Add to basket'}


In [9]:
# function to extract the data from the form in the page and save it in a JSON 
def extractFormDetails():
    data = []   
    #extract the form and get all types of tags inside it even if it was label or input
    form = soup.find('form')
    inputs = form.find_all(True)
    
    # now we filter the data in the variable input by removing the labels from it
    cleanInputs = [i for i in inputs if i.name != 'label']
    # then we loop throu the rest of the clean input  and add it's data to the data list as a dictionary
    for i in cleanInputs:
        inputType = i.get('type') or i.name
        data.append({"Field name": i.get('name'), "Input type": inputType, "Default value": i.get("value"), "Place holder": i.get("placeholder")})
        
    # Create the JSON file and add the data from the data list 
    with open('Extract_Form_details.json', 'w') as file:
        json.dump(data, file)
    
    for i in data:
        print(i)

In [10]:
extractFormDetails()

{'Field name': 'username', 'Input type': 'text', 'Default value': None, 'Place holder': 'Enter your username'}
{'Field name': 'password', 'Input type': 'password', 'Default value': None, 'Place holder': 'Enter your password'}
{'Field name': 'options', 'Input type': 'select', 'Default value': None, 'Place holder': None}
{'Field name': None, 'Input type': 'option', 'Default value': 'option1', 'Place holder': None}
{'Field name': None, 'Input type': 'option', 'Default value': 'option2', 'Place holder': None}
{'Field name': None, 'Input type': 'option', 'Default value': 'option3', 'Place holder': None}
{'Field name': 'terms', 'Input type': 'checkbox', 'Default value': None, 'Place holder': None}
{'Field name': None, 'Input type': 'submit', 'Default value': 'Submit', 'Place holder': None}


In [11]:
# A function which extracts media from the web page either it was a hyperlink or an image or a video then add it's link in a JSON file
def extractMedia():
    data = []

    # extract all the links and videos (and images as bonus from me) from the page using it's tags then loop on it 
    # and add it to the data list in form of a dictionary which cantains the value of the type of the tage and the link from the src attribute
    links = soup.find_all("a")    #There is no <a> tags in the website But just in case
    for link in links:
        data.append({'Type': "Link", 'Link': link.get('href')})
        
    videos = soup.find_all('iframe')     #I meant to use find_all just in case more videos are added later although there is anly one video in the page 
    for video in videos:
        data.append({'Type': "Video", 'Link': video.get('src')})

    images = soup.find_all('img')
    for image in images:
        data.append({'Type': "Image", 'Link': image.get('src')})

    # save the data in the JSON file
    with open("Extract_Links.json", 'w') as file:
        json.dump(data, file)

    for i in data:
        print(i)

In [12]:
extractMedia()

{'Type': 'Video', 'Link': 'https://www.youtube.com/watch?v=ujf9RNuBdCU'}
{'Type': 'Image', 'Link': 'http://books.toscrape.com/media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg'}
{'Type': 'Image', 'Link': 'http://books.toscrape.com/media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg'}
{'Type': 'Image', 'Link': 'http://books.toscrape.com/media/cache/c0/59/c05972805aa7201171b8fc71a5b00292.jpg'}
{'Type': 'Image', 'Link': 'http://books.toscrape.com/media/cache/97/27/97275841c81e66d53bf9313cba06f23e.jpg'}
{'Type': 'Image', 'Link': 'https://via.placeholder.com/250x150?text=Product+1'}
{'Type': 'Image', 'Link': 'https://via.placeholder.com/250x150?text=Product+2'}
{'Type': 'Image', 'Link': 'https://via.placeholder.com/250x150?text=Product+3'}


In [13]:
# This is a scripting challenge function
# it extracts the data from the cards in featured products section in the webpage including the name and the id of the product and it's hidden price from the attributes 
def scriptingChallenge():
    data = []
    fields = ["Product Name", "Hidden Price", 'Available Colors', 'Product ID']

    # extract the cards from the featured products section as a list using the class names of the cards and it's container
    products = soup.find('div', class_='products').find_all('div', class_ = 'product-card')

    # loop through the products and get the wanted data either from it's child tags with class names or from it's attributes 
    # Add the data to the data list as dictionaries containing the wanted data
    for product in products:
        data.append({'Product Name': product.find('p', class_='name').text, 
                    "Hidden Price": product.find('p', class_='price').text,
                    'Available Colors': product.find('p', class_='colors').text,
                    'Product ID': product.get('data-id')})
    # create a CSV file to save the data from the data list
    with open('Featured_products_data.csv', 'w', newline = '') as file:
        writer = csv.DictWriter(file, fieldnames = fields)
        writer.writeheader()
        writer.writerows(data)
    
    for i in data:
        print(i)

In [14]:
scriptingChallenge()

{'Product Name': 'Wireless Headphones', 'Hidden Price': '$49.99', 'Available Colors': 'Available colors: Black, White, Blue', 'Product ID': '101'}
{'Product Name': 'Smart Speaker', 'Hidden Price': '$89.99', 'Available Colors': 'Available colors: Grey, Black', 'Product ID': '102'}
{'Product Name': 'Smart Watch', 'Hidden Price': '$149.99', 'Available Colors': 'Available colors: Black, Silver, Gold', 'Product ID': '103'}
