In [314]:
import json
import time
from io import StringIO

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Dictionary reserved for scraped data (re-initialised and filled in a later cell)
all_data = {}

# Start a Chrome session; the chromedriver path comes from ChromeDriverManager().install()
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
# Initial search page URL
url = "https://samlingar.shm.se/sok?type=object&productionPeriod=Vikingatid&hasImage=1&category=Arkeologisk%20samling&category=Vapen%20och%20rustningar&listType=archaeological&rows=500&offset=0"
driver.get(url)

# Give the page time to render (adjust the sleep duration if needed)
time.sleep(2)

# Look up the cookie consent button by its aria-label. find_elements returns an
# empty list when the banner is absent, whereas find_element would raise
# NoSuchElementException and abort the whole cell.
cookie_buttons = driver.find_elements(By.CSS_SELECTOR, '[aria-label="Godkänn alla kakor"]')

# Click the first consent button that is actually displayed, if any
for cookie_disclaimer in cookie_buttons:
    if cookie_disclaimer.is_displayed():
        ActionChains(driver).move_to_element(cookie_disclaimer).click().perform()
        break


In [315]:
# Scrape data from the main table (the first <table> element on the page)
table = driver.find_element(By.TAG_NAME, "table")
# Passing a literal HTML string to read_html is deprecated since pandas 2.1,
# so the markup is wrapped in a file-like StringIO object.
df = pd.read_html(StringIO(table.get_attribute('outerHTML')))[0]

# Drop the 'Bild' column; reassigning instead of inplace=True keeps the cell
# safe to re-run
if 'Bild' in df.columns:
    df = df.drop(columns=['Bild'])

# Extract museum names from the 'title' attribute of each <i> element
museum_elements = driver.find_elements(By.CSS_SELECTOR, "td i.museum-icon")
museum_names = [elem.get_attribute('title') for elem in museum_elements]

# Replace the 'Museum' column with the extracted text, but only when there is
# exactly one icon per table row; otherwise leave the column untouched
if 'Museum' in df.columns and len(museum_names) == len(df):
    df['Museum'] = museum_names
else:
    print("Mismatch in number of rows while extracting museum names")


In [316]:
# Query the result links once, so hrefs and names are guaranteed to come from
# the same elements in the same order (the page was previously queried twice).
link_elements = driver.find_elements(By.CLASS_NAME, "archaeological-list__link")

# href of every item, and a display name made unique by its 1-based position
item_link = [elem.get_attribute('href') for elem in link_elements]
item_names = [f"{elem.text} - {position}" for position, elem in enumerate(link_elements, start=1)]

# Fail early with a readable message rather than pandas' generic length error
if len(link_elements) != len(df):
    raise ValueError(
        f"Found {len(link_elements)} item links but the table has {len(df)} rows"
    )

df['Unique Name'] = item_names
df['Catalog Link'] = item_link

# Initialize a column for extra details (filled by the per-item scrape)
df['Extra Details'] = None


In [317]:
# Navigate to each item's link and scrape additional details.
# Iterating with .items() yields the real index labels, so df.at writes to the
# correct row even if df does not have a default 0..n-1 index.
for index, link in df['Catalog Link'].items():
    driver.get(link)

    # Merge every two-column table on the item's page into one key -> value dict
    item_details = {}
    for item_table in driver.find_elements(By.TAG_NAME, "table"):
        # Literal HTML strings are deprecated in read_html since pandas 2.1
        item_df = pd.read_html(StringIO(item_table.get_attribute('outerHTML')))[0]
        if item_df.shape[1] < 2:
            # No value column to read; row[1] would raise IndexError
            continue
        for row in item_df.itertuples(index=False):
            item_details[row[0]] = row[1]

    # Store the scraped details as a JSON string in the DataFrame
    df.at[index, 'Extra Details'] = json.dumps(item_details)


In [None]:
import numpy as np

# Close the driver
driver.quit()

# Replace '-' placeholders with NaN in the entire DataFrame.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df = df.replace('-', np.nan)

df.head()  # Display the first few rows of the table


Unnamed: 0,Föremålsbenämning,Föremålsnr.,Förvärvsnr.,Andra nummer,Material,Plats,Fornlämning,Socken,Landskap,Land,Kontexttyp,Kontextnr.,Artbedömning,Benslagsbedömning,Museum,Unique Name,Catalog Link,Extra Details
0,Bipolär vikt,371770_HST,16835,Undernummer: 2,"Brons, Järn",Tystebols,,Stenkyrka socken,Gotland,Sverige,,,,,Historiska museet,Bipolär vikt - 1,https://samlingar.shm.se/object/B5B8200F-E7CB-...,"{""Museum"": ""Historiska museet"", ""F\u00f6rem\u0..."
1,Vikt,106523_HST,34000,Undernummer: Bj 977,Brons,"Björkö, Hemlanden",L2017:1904,Adelsö socken,Uppland,Sverige,Kammargrav,977,,,Historiska museet,Vikt - 2,https://samlingar.shm.se/object/156E4F60-151E-...,"{""Museum"": ""Historiska museet"", ""F\u00f6rem\u0..."
2,Balansvåg,107068_HST,12426,Undernummer: M:IV,Brons,Rösta,L1945:291,Ås socken,Jämtland,Sverige,Grav,M:IV,,,Historiska museet,Balansvåg - 3,https://samlingar.shm.se/object/C98C78B5-223A-...,"{""Museum"": ""Historiska museet"", ""F\u00f6rem\u0..."
3,Mynthänge,46_HST,33758,Undernummer: 50,Silver,Spillings,L1976:7626,Othem socken,Gotland,Sverige,Skattfynd,1,,,Historiska museet,Mynthänge - 4,https://samlingar.shm.se/object/32ECD6F0-CE46-...,"{""Museum"": ""Historiska museet"", ""F\u00f6rem\u0..."
4,Barr,50_HST,33758,Undernummer: 57,Silver,Spillings,L1976:7626,Othem socken,Gotland,Sverige,Skattfynd,1,,,Historiska museet,Barr - 5,https://samlingar.shm.se/object/7018CC3C-AE00-...,"{""Museum"": ""Historiska museet"", ""F\u00f6rem\u0..."


In [None]:
# Write the scraped table to a CSV file (relative path)
df.to_csv(path_or_buf='Viking_war_artifacts.csv')

In [None]:
# Display the DataFrame as this cell's output
df

Unnamed: 0,Föremålsbenämning,Föremålsnr.,Förvärvsnr.,Andra nummer,Material,Plats,Fornlämning,Socken,Landskap,Land,Kontexttyp,Kontextnr.,Artbedömning,Benslagsbedömning,Museum,Unique Name,Catalog Link,Extra Details
0,Bipolär vikt,371770_HST,16835,Undernummer: 2,"Brons, Järn",Tystebols,,Stenkyrka socken,Gotland,Sverige,,,,,Historiska museet,Bipolär vikt - 1,https://samlingar.shm.se/object/B5B8200F-E7CB-...,"{""Museum"": ""Historiska museet"", ""F\u00f6rem\u0..."
1,Vikt,106523_HST,34000,Undernummer: Bj 977,Brons,"Björkö, Hemlanden",L2017:1904,Adelsö socken,Uppland,Sverige,Kammargrav,977,,,Historiska museet,Vikt - 2,https://samlingar.shm.se/object/156E4F60-151E-...,"{""Museum"": ""Historiska museet"", ""F\u00f6rem\u0..."
2,Balansvåg,107068_HST,12426,Undernummer: M:IV,Brons,Rösta,L1945:291,Ås socken,Jämtland,Sverige,Grav,M:IV,,,Historiska museet,Balansvåg - 3,https://samlingar.shm.se/object/C98C78B5-223A-...,"{""Museum"": ""Historiska museet"", ""F\u00f6rem\u0..."
3,Mynthänge,46_HST,33758,Undernummer: 50,Silver,Spillings,L1976:7626,Othem socken,Gotland,Sverige,Skattfynd,1,,,Historiska museet,Mynthänge - 4,https://samlingar.shm.se/object/32ECD6F0-CE46-...,"{""Museum"": ""Historiska museet"", ""F\u00f6rem\u0..."
4,Barr,50_HST,33758,Undernummer: 57,Silver,Spillings,L1976:7626,Othem socken,Gotland,Sverige,Skattfynd,1,,,Historiska museet,Barr - 5,https://samlingar.shm.se/object/7018CC3C-AE00-...,"{""Museum"": ""Historiska museet"", ""F\u00f6rem\u0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250,Våg,974693_HST,35000,Fyndnummer: 29764,Cu-legering,"Björkö, Svarta jorden",L2017:1568,Adelsö socken,Uppland,Sverige,R-enhet,1175,,,Historiska museet,Våg - 251,https://samlingar.shm.se/object/D2E438E3-13CD-...,"{""Museum"": ""Historiska museet"", ""F\u00f6rem\u0..."
251,klippsilver,975158_HST,35000,Fyndnummer: 23348,Silver,"Björkö, Svarta jorden",L2017:1568,Adelsö socken,Uppland,Sverige,R-enhet,1014,,,Historiska museet,klippsilver - 252,https://samlingar.shm.se/object/F065A048-4F39-...,"{""Museum"": ""Historiska museet"", ""F\u00f6rem\u0..."
252,Barr,975626_HST,35000,Fyndnummer: 27127,Silver,"Björkö, Svarta jorden",L2017:1568,Adelsö socken,Uppland,Sverige,R-enhet,1165,,,Historiska museet,Barr - 253,https://samlingar.shm.se/object/58A3D31D-DBCD-...,"{""Museum"": ""Historiska museet"", ""F\u00f6rem\u0..."
253,Smycke,976212_HST,35000,Fyndnummer: 30854,Silver,"Björkö, Svarta jorden",L2017:1568,Adelsö socken,Uppland,Sverige,R-enhet,1232,,,Historiska museet,Smycke - 254,https://samlingar.shm.se/object/959D37A5-2B85-...,"{""Museum"": ""Historiska museet"", ""F\u00f6rem\u0..."


In [None]:
# Alternate search URLs:
# https://samlingar.shm.se/sok?type=object&productionPeriod=Vikingatid&hasImage=1&category=Arkeologisk%20samling&category=Vapen%20och%20rustningar&listType=archaeological&rows=500&offset=0
# https://samlingar.shm.se/sok?type=object&productionPeriod=Vikingatid&hasImage=1&category=Arkeologisk%20samling&category=Handel%20och%20v%C3%A4rdem%C3%A4tare&listType=archaeological&rows=300&offset=0
# https://samlingar.shm.se/sok?type=object&query=Vikingatid&listType=archaeological&rows=1000&offset=0

In [None]:

# Collect every search-result anchor: <a> elements whose class attribute
# contains both of the class tokens named in the XPath
results = driver.find_elements(
    by=By.XPATH,
    value="//a[contains(@class,'rKAserSVOTC2Sy4lh50nJg') and contains(@class,'miFVghdNq-ew7wuzMJWdDw')]",
)

In [None]:
# Maps each row's "Andra nummer" value to a dict of its remaining columns
all_data = {}

# NOTE(review): `tables` is not defined in any visible cell, so this loop raised
# NameError on a fresh kernel. Collecting every <table> on the current page is
# assumed to be the intent -- confirm.
tables = driver.find_elements(By.TAG_NAME, "table")

for table in tables:
    # Header texts, in column order
    table_headers = [
        header.text.strip() for header in table.find_elements(By.TAG_NAME, "th")
    ]

    # Skip tables without the key column; list.index would raise ValueError
    if "Andra nummer" not in table_headers:
        continue
    andra_nummer_index = table_headers.index("Andra nummer")

    for row in table.find_elements(By.TAG_NAME, "tr"):
        columns = row.find_elements(By.TAG_NAME, "td")
        if not columns:
            # Rows without <td> cells carry no data
            continue

        row_data = [column.text.strip() for column in columns]
        if len(row_data) <= andra_nummer_index:
            # Row too short to contain the key column
            continue

        # Use "Andra nummer" as the key and the rest of the data as value;
        # zip stops at the shorter sequence, so ragged rows cannot raise IndexError
        andra_nummer = row_data[andra_nummer_index]
        all_data[andra_nummer] = {
            header: value
            for position, (header, value) in enumerate(zip(table_headers, row_data))
            if position != andra_nummer_index
        }

In [None]:
from collections import defaultdict

# Number of extra occurrences seen so far for each base title
duplicate_counts = defaultdict(int)

# Dictionary mapping each (unique) title to its URL
titles_and_urls = {}

for result in results:
    # The title is the part of the aria-label before ' - Föremålsbenämning:'.
    # A missing aria-label is treated as an empty title instead of crashing.
    aria_label = result.get_attribute('aria-label') or ''
    base_title = aria_label.split(' - Föremålsbenämning:')[0].strip()

    # Extract the URL
    url = result.get_attribute('href')

    # Give every duplicate its own suffix: "(2)", "(3)", ...
    # Previously the third and later duplicates all became "(2)" and were
    # dropped when the dictionary was flattened.
    title = base_title
    while title in titles_and_urls:
        duplicate_counts[base_title] += 1
        title = f"{base_title} ({duplicate_counts[base_title] + 1})"

    titles_and_urls[title] = url

# Dictionary to hold the data for all items
all_items_dict = {}

# Iterate over titles and URLs to fetch table data
for title, url in titles_and_urls.items():
    # Open the link of the result
    driver.get(url)
    time.sleep(3)  # wait for the page to load

    # Grab the first table on the page as the data source
    try:
        # Literal HTML strings are deprecated in read_html since pandas 2.1
        table = pd.read_html(StringIO(driver.page_source))[0]
        row_dict = {row[0]: row[1] for row in table.itertuples(index=False)}
        all_items_dict[title] = row_dict
    except (IndexError, ValueError):
        # read_html raises ValueError when the page contains no tables
        print(f"No table found on page for {title}")
    except Exception as e:
        print(f"Error processing table for {title}: {e}")


In [None]:
# Shut down the browser session held by `driver`
driver.quit()

In [None]:
# all_items_dict maps item titles to per-item dicts of table data.
# Serialise it to a JSON string and parse it straight back into plain Python
# objects.
all_data_json = json.dumps(all_items_dict, indent=4)
data_dict = json.loads(all_data_json)

# One row per item; the titles start out as the index and are then moved into
# an ordinary column named 'unique_name'
df = (
    pd.DataFrame.from_dict(data_dict, orient='index')
    .reset_index()
    .rename(columns={'index': 'unique_name'})
)

In [None]:
# Write the per-item table to a CSV file (relative path)
df.to_csv(path_or_buf='Viking_artifacts.csv')

In [None]:
# all_items_dict maps item titles to per-item dicts of table data;
# dump it to a JSON string, then load that string back into a dictionary
all_data_json = json.dumps(all_items_dict, indent=4)
data_dict = json.loads(all_data_json)

# Build a DataFrame with one row per item title (titles form the index)
df = pd.DataFrame.from_dict(orient='index', data=data_dict)

# Preview the first rows of the resulting table
df.head(n=5)

In [None]:
# Dictionary mapping each (unique) title to its URL
titles_and_urls = {}

# Number of extra occurrences seen so far for each base title
duplicate_counts = {}

for result in results:
    # The title is the part of the aria-label before ' - Föremålsbenämning:'.
    # A missing aria-label is treated as an empty title instead of crashing.
    aria_label = result.get_attribute('aria-label') or ''
    base_title = aria_label.split(' - Föremålsbenämning:')[0].strip()

    # Extract the URL
    url = result.get_attribute('href')

    # Items sharing a title used to overwrite each other silently; suffix
    # duplicates with "(2)", "(3)", ... so every item is kept.
    title = base_title
    while title in titles_and_urls:
        duplicate_counts[base_title] = duplicate_counts.get(base_title, 0) + 1
        title = f"{base_title} ({duplicate_counts[base_title] + 1})"

    # Add the title and URL to the dictionary
    titles_and_urls[title] = url

# Dictionary to hold the data for all items
all_items_dict = {}

# Iterate over titles and URLs to fetch table data
for title, url in titles_and_urls.items():
    # Open the link of the result
    driver.get(url)
    time.sleep(3)  # wait for the page to load

    # Grab the first table on the page as the data source
    try:
        # Literal HTML strings are deprecated in read_html since pandas 2.1
        table = pd.read_html(StringIO(driver.page_source))[0]
        row_dict = {row[0]: row[1] for row in table.itertuples(index=False)}
        all_items_dict[title] = row_dict
    except (IndexError, ValueError):
        # read_html raises ValueError when the page contains no tables
        print(f"No table found on page for {title}")
    except Exception as e:
        print(f"Error processing table for {title}: {e}")