# Mediamarkt scraper

### Import packages

In [57]:
# Import packages
from bs4 import BeautifulSoup
import requests
from time import sleep
import pandas as pd
import json
from pandas.io.json import json_normalize

### Collect page URLs

In [58]:
# Define function to check whether there is a next page
def check_next_page(url):
    """Return the link to the next listing page, or None if there is none.

    Downloads ``url``, looks for the element with class "pagination-next"
    and returns the ``href`` of the first ``<a>`` inside it.

    Parameters
    ----------
    url : str
        Address of the listing page to inspect.

    Returns
    -------
    str or None
        The ``href`` value of the next-page link exactly as it appears in the
        page, or None when the page has no "pagination-next" element, that
        element contains no ``<a>``, or the ``<a>`` has no ``href``.
    """
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "html.parser")
    next_btn = soup.find(class_="pagination-next")
    if next_btn is None:
        return None

    # The pagination element may be present without a link inside it; report
    # that as "no next page" instead of failing with AttributeError/KeyError.
    next_link = next_btn.find("a")
    if next_link is None:
        return None
    return next_link.get("href")
print("Function created.")

In [59]:
# Define a function to collect all page urls
def generate_page_urls(page_url):
    """Collect the URLs of all listing pages of one category.

    Starting at ``page_url``, follows the next-page link reported by
    ``check_next_page`` until no further page is found.

    Each page is requested only once: the result of ``check_next_page`` is
    stored and reused instead of calling the function a second time to build
    the next URL. A one-second pause follows every request so consecutive
    pages are not fetched back to back.

    Parameters
    ----------
    page_url : str
        URL of the first listing page of the category. A falsy value
        (empty string or None) yields an empty result without any request.

    Returns
    -------
    list of str
        The visited page URLs in the order they were discovered, starting
        with ``page_url``.
    """
    page_urls = []
    while page_url:
        print("Saving: ", page_url)
        page_urls.append(page_url)

        next_href = check_next_page(page_url)
        sleep(1)  # pause after every request, not just once per category

        if next_href is None:
            break
        # The link found on the page is prefixed with the site root.
        page_url = "https://www.mediamarkt.nl" + next_href
    print("Done with this category!")

    return page_urls
print("Function created.")

In [60]:
# Collect the listing-page URLs of every category that should be scraped

# 1. First listing page of each category
smartphones_url = "https://www.mediamarkt.nl/nl/category/_smartphones-483222.html?page=1"
laptops_url = "https://www.mediamarkt.nl/nl/category/_laptops-482723.html?page=1"
tablets_url = "https://www.mediamarkt.nl/nl/category/_tablets-645048.html?page=1"
tvs_url = "https://www.mediamarkt.nl/nl/category/_televisies-450682.html?page=1"

# 2. Walk through the pagination of each category in turn and gather all page URLs in one flat list
page_urls = [
    listing_url
    for first_page in (smartphones_url, laptops_url, tablets_url, tvs_url)
    for listing_url in generate_page_urls(first_page)
]
print("All page URLS have been collected.")

Saving:  https://www.mediamarkt.nl/nl/category/_smartphones-483222.html?page=1
Saving:  https://www.mediamarkt.nl/nl/category/_smartphones-483222.html?page=2
Saving:  https://www.mediamarkt.nl/nl/category/_smartphones-483222.html?page=3
Saving:  https://www.mediamarkt.nl/nl/category/_smartphones-483222.html?page=4
Saving:  https://www.mediamarkt.nl/nl/category/_smartphones-483222.html?page=5
Saving:  https://www.mediamarkt.nl/nl/category/_smartphones-483222.html?page=6
Saving:  https://www.mediamarkt.nl/nl/category/_smartphones-483222.html?page=7
Saving:  https://www.mediamarkt.nl/nl/category/_smartphones-483222.html?page=8
Saving:  https://www.mediamarkt.nl/nl/category/_smartphones-483222.html?page=9
Saving:  https://www.mediamarkt.nl/nl/category/_smartphones-483222.html?page=10
Saving:  https://www.mediamarkt.nl/nl/category/_smartphones-483222.html?page=11
Saving:  https://www.mediamarkt.nl/nl/category/_smartphones-483222.html?page=12
Saving:  https://www.mediamarkt.nl/nl/category/_s

### Collect product page URLs

In [61]:
# Create a function to collect the product_urls
def create_product_urls(page_urls):
    """Collect product-page URLs from a list of listing pages.

    Every listing page is downloaded and each ``<h2>`` element on it is
    inspected. An ``<h2>`` that contains a link with an ``href`` is treated
    as a product; its ``href`` is prefixed with the site root and stored.
    An ``<h2>`` without such a link is reported as "this is no product" and
    skipped. A one-second pause follows each listing page.

    Parameters
    ----------
    page_urls : iterable of str
        URLs of the listing pages to scan.

    Returns
    -------
    list of str
        Product URLs in the order they were found.
    """
    product_urls = []
    for page_url in page_urls:
        res = requests.get(page_url)
        soup = BeautifulSoup(res.text, "html.parser")

        for heading in soup.find_all("h2"):
            # Explicit checks replace the former bare `except:`, which also
            # swallowed KeyboardInterrupt and unrelated errors.
            link = heading.find("a")
            href = link.get("href") if link is not None else None
            if href is None:
                print("this is no product")
                continue

            product_url = "https://www.mediamarkt.nl" + href
            product_urls.append(product_url)
            print("Saving " + product_url)

        sleep(1)

    return product_urls
print("Function created.")

Done!


In [62]:
# Visit every collected listing page with create_product_urls and keep the
# product-page links it returns in the list product_urls
product_urls = create_product_urls(page_urls)
print("All product URLS have been saved.")

dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product
dit is geen product


### Collect product specific data

In [63]:
# Search for the right elements, store them in variables an put them together in a dictionary
product_data = []

def scrape(product_urls):
    for url in product_urls:
        res = requests.get(url)
        soup = BeautifulSoup(res.text, "html.parser") 
        device_type = soup.find(class_ = "breadcrumbs").find_all("li")[2].text.replace("\n", "")
        names = soup.find(class_ = "stickable").img.attrs["alt"]
        try:
            prices = soup.find("div", class_ = "price big").text
        except:
            prices = "no price"
        try:
            instock = soup.find(class_ = "box infobox availability").meta.attrs["content"]
        except: 
            instock = "OutOfStock"
        try:
            ratings = soup.find(class_ = "bvseo-ratingValue").text
        except:
            ratings = "no rating"
        try: 
            reviews = soup.find(class_ = "bvseo-reviewCount").text
        except: 
            reviews = "no reviews"
    
        # Get product attributes and store them in attributes_json
        attributes = soup.find(class_ = "specification").find_all('dt')
        values = soup.find(class_ = "specification").find_all('dd')

        attributes_json = {}
        for x, y in zip(attributes, values):
            attributes_json[x.text]=y.text
    
        # Store all variables in products
        products = {"device_type": device_type, "name": names, "price": prices, "instock": instock, "rating": ratings, 
                    "nr_reviews": reviews, "attributes": attributes_json}
        product_data.append(products)
        print("Saving: ", products["name"])

    sleep(1)
    
    return(product_data)
print("Function created.")

In [64]:
# Use the scraping function to save the data in product_data.
# Note: scrape() appends to the module-level product_data list created in the
# cell that defines it and returns that same list, so running this cell again
# without re-running the definition cell adds every product a second time.
product_data = scrape(product_urls)
print("All product data have been collected.")

Saving:  MOTOROLA moto e7 power - 64 GB Dual-Sim Blauw
Saving:  SAMSUNG Galaxy A52 - 128 GB Zwart
Saving:  SAMSUNG Galaxy A32 5G - 128 GB Zwart
Saving:  SAMSUNG Galaxy A32 4G - 128 GB Zwart
Saving:  SAMSUNG Galaxy A22 5G - 64 GB Grijs
Saving:  SAMSUNG Galaxy S21 5G - 128 GB Grijs
Saving:  MOTOROLA moto g60s - 128GB Dual-Sim - Blauw
Saving:  SAMSUNG Galaxy A52s 5G - 128 GB Zwart
Saving:  XIAOMI Redmi 9A
Saving:  XIAOMI 11 Lite 5G New Edition 128GB Zwart
Saving:  SAMSUNG Galaxy A03s - 32 GB Zwart
Saving:  SAMSUNG Galaxy A12 - 32 GB Wit
Saving:  SAMSUNG Galaxy Xcover 5 EE - 64 GB Zwart
Saving:  APPLE iPhone 12 - 64 GB Zwart 5G
Saving:  MOTOROLA moto e20 - 32GB Dual-Sim - Grijs
Saving:  SAMSUNG Galaxy A13 - 128 GB Zwart
Saving:  SAMSUNG Galaxy A12 - 32 GB Zwart
Saving:  APPLE iPhone 11 - 64 GB Zwart
Saving:  APPLE iPhone 11 - 64 GB Wit
Saving:  SAMSUNG Galaxy S21 Plus 5G - 128 GB Zwart
Saving:  APPLE iPhone 13 - 128 GB Green 5G
Saving:  SAMSUNG Galaxy S20 FE 4G - 128 GB Donkerblauw
Saving:

### Store and export the product data

In [106]:
# Persist the raw scrape results (product_data) as JSON on disk
with open('raw_product_data.json', 'w') as json_file:
    json_file.write(json.dumps(product_data))
print("Data have been saved in a json file.")

Done!


In [111]:
# Load the raw JSON data and flatten it into one wide dataframe

# 1. Read the saved raw JSON file into a pandas dataframe
df = pd.read_json('raw_product_data.json')

# 2. Expand the nested dictionaries in 'attributes' into their own columns,
#    attach them to the dataframe and then remove the nested column
attribute_columns = pd.json_normalize(df.attributes)
df = df.join(attribute_columns)
df = df.drop(columns=['attributes'])
print("Dataframe has been created.")

              device_type                                           name  \
0       Mobiele telefoons  MOTOROLA moto e7 power - 64 GB Dual-Sim Blauw   
1       Mobiele telefoons              SAMSUNG Galaxy A52 - 128 GB Zwart   
2       Mobiele telefoons           SAMSUNG Galaxy A32 5G - 128 GB Zwart   
3       Mobiele telefoons           SAMSUNG Galaxy A32 4G - 128 GB Zwart   
4       Mobiele telefoons            SAMSUNG Galaxy A22 5G - 64 GB Grijs   
...                   ...                                            ...   
1576           Televisies                              OK. ODL32641F-DIB   
1577           Televisies                   PHILIPS The One 43PUS8545/12   
1578  Beamers & projectie              SAMSUNG THE PREMIERE LSP9T (2021)   
1579           Televisies             SAMSUNG Neo QLED 4K 85QN90A (2021)   
1580           Televisies                   PHILIPS The One 50PUS8545/12   

       price     instock rating nr_reviews Type apparaat: Besturingssysteem:  \
0      

In [113]:
# Export the dataframe as a comma-separated file (the default separator), leaving out the row index
df.to_csv("mediamarkt_scraper_output.csv", index=False)
print("Done, the csv is ready for data preparation.")

Done, the csv is ready for data preparation.
