In [2]:
import json
import time
from collections import defaultdict
from io import StringIO
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from googletrans import Translator
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager


In [None]:

# Initialize a list to store the data for each individual page
all_data = {}

# Initialize the web driver
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
# Initial search page URL
url = "https://samlingar.shm.se/sok?type=object&productionPeriod=Vikingatid&hasImage=1&category=Arkeologisk%20samling&category=Vapen%20och%20rustningar&listType=archaeological&rows=500&offset=0"
driver.get(url)

# Wait for the page to load (you may need to adjust the sleep duration)
time.sleep(2)

# Find the cookie disclaimer button by its aria-label
cookie_disclaimer = driver.find_element(By.CSS_SELECTOR, '[aria-label="Godkänn alla kakor"]')
# Check if the cookie disclaimer button is displayed and then click it
if cookie_disclaimer.is_displayed():
    ActionChains(driver).move_to_element(cookie_disclaimer).click().perform()


In [315]:
# Scrape data from the main table
table = driver.find_element(By.TAG_NAME, "table")
df = pd.read_html(table.get_attribute('outerHTML'))[0]

# Drop the 'Bild' column
if 'Bild' in df.columns:
    df.drop(columns=['Bild'], inplace=True)

# Extract museum names from the 'title' attribute of each <i> element
museum_elements = driver.find_elements(By.CSS_SELECTOR, "td i.museum-icon")
museum_names = [elem.get_attribute('title') for elem in museum_elements]

# Replace the 'Museum' column with extracted text
if 'Museum' in df.columns and len(museum_names) == len(df):
    df['Museum'] = museum_names
else:
    print("Mismatch in number of rows while extracting museum names")


In [316]:
# Collect names and href links for each item
item_link = [(item.get_attribute('href')) for item in driver.find_elements(By.CLASS_NAME, "archaeological-list__link")]

item_names = [f"{item.text} - {index+1}" for index, item in enumerate(driver.find_elements(By.CLASS_NAME, "archaeological-list__link"))]
df['Unique Name'] = item_names
df['Catalog Link'] = item_link

# Initialize a column for extra details
df['Extra Details'] = None


In [317]:
# Navigate to each item's link and scrape additional details
for index, link in enumerate(df['Catalog Link']):
    driver.get(link)

    # Scrape the details from the item's page
    item_details = {}
    item_tables = driver.find_elements(By.TAG_NAME, "table")
    for item_table in item_tables:
        item_df = pd.read_html(item_table.get_attribute('outerHTML'))[0]
        for row in item_df.itertuples(index=False):
            item_details[row[0]] = row[1]

    # Store the scraped details as JSON in the DataFrame
    df.at[index, 'Extra Details'] = json.dumps(item_details)


In [318]:
# Close the driver
driver.quit()

# Replace '-' with NaNs in the entire DataFrame
df.replace('-', np.NaN, inplace=True)

df.head()  # Display the first few rows of the table


Unnamed: 0,Föremålsbenämning,Föremålsnr.,Förvärvsnr.,Andra nummer,Material,Plats,Fornlämning,Socken,Landskap,Land,Kontexttyp,Kontextnr.,Artbedömning,Benslagsbedömning,Museum,Unique Name,Catalog Link,Extra Details
0,Spjut,371667_HST,34000.0,Undernummer: Bj 581,Järn,"Björkö, Norr om Borg",L2017:1478,Adelsö socken,Uppland,Sverige,Kammargrav,581,,,Historiska museet,Spjut - 1,https://samlingar.shm.se/object/58928958-A16E-...,"{""Museum"": ""Historiska museet"", ""F\u00f6rem\u0..."
1,Spjut,371668_HST,34000.0,Undernummer: Bj 581,Järn,"Björkö, Norr om Borg",L2017:1478,Adelsö socken,Uppland,Sverige,Kammargrav,581,,,Historiska museet,Spjut - 2,https://samlingar.shm.se/object/EFCC758B-B6BB-...,"{""Museum"": ""Historiska museet"", ""F\u00f6rem\u0..."
2,Svärd Petersen Y,263086_HST,34000.0,Undernummer: Bj 752B,Järn,"Björkö, Hemlanden",L2017:1904,Adelsö socken,Uppland,Sverige,Kammargrav,752B,,,Historiska museet,Svärd Petersen Y - 3,https://samlingar.shm.se/object/FFCD05E4-1460-...,"{""Museum"": ""Historiska museet"", ""F\u00f6rem\u0..."
3,Tveeggat svärd,263468_HST,34000.0,Undernummer: Bj 542,Järn,"Björkö, Norr om Borg",L2017:1478,Adelsö socken,Uppland,Sverige,Kammargrav,542,,,Historiska museet,Tveeggat svärd - 4,https://samlingar.shm.se/object/7D599E53-A81D-...,"{""Museum"": ""Historiska museet"", ""F\u00f6rem\u0..."
4,Svärd Petersen Y,264449_HST,,FID: 264449,Järn,,,,,Sverige,,,,,Historiska museet,Svärd Petersen Y - 5,https://samlingar.shm.se/object/CD8214A7-2FFB-...,"{""Museum"": ""Historiska museet"", ""F\u00f6rem\u0..."


In [47]:
# These are my own functions that help clean the data

def search_columns_for_substring(columns, substring):
    return [col for col in columns if substring in col]

# this function counts how many empty values are in each column for a data set (use this after the drop col function to verify)
def count_nans_in_dataframe(df):
    nan_counts = df.isna().sum()
    return pd.DataFrame({'Column': nan_counts.index, 'NaN Count': nan_counts.values})

# this function drops columns that do not meet the minimum threshold
# if by="count" then drop columns that don't have at least that many populated fields
    # ie. by_val=100 drops columns that have less than 100 populated rows
# if by="prop" then drop columns that don't have at least by_val% populated rows
    # ie. by_val=0.05 drops columns that aren't at least 5% populated
# if by="field" then drop columns that have more missing values than the columns specified
    # ie. by_val="Last_Device_Array.anv" will keep Last_Device_Array.anv but drop any cols that have more missing values than Last_Device_Array.anv
def drop_columns_with_fewer_nans(df, by="prop", by_val="0.05"):
    if by == "count":
        threshold = float(by_val)
    elif by == "prop": threshold = round(df.shape[0]*float(by_val))
    elif by == "field": threshold = df.shape[0]-df[by_val].isna().sum()
    cols_to_drop = []
    for col in df.columns:
        if df[col].isna().sum() > df.shape[0]-threshold:
            cols_to_drop.append(col)
    df = df.drop(cols_to_drop, axis=1)
    return df

# this function to drops columns with duplicate names
def drop_duplicate_columns(df):
    duplicates = df.columns[df.columns.duplicated(keep='first')]
    df = df.loc[:, ~df.columns.duplicated(keep='first')]
    return df

In [48]:
###### UNPACK JSON
# Load the dataset
# dataset = pd.read_csv('/Users/aly.milne/Library/CloudStorage/OneDrive-BrighamYoungUniversity/Fall 2023/STAT 386/ST386_Final_Project/Scraped_Data/Viking_war_artifacts.csv')

# unpack json
df['Extra Details'] = df['Extra Details'].map(json.loads)
war_artifacts_exploded = pd.json_normalize(df.to_dict(orient='records'))

# drop rows that are not at least 25% populated
cleaned_war_artifacts = drop_columns_with_fewer_nans(war_artifacts_exploded, "prop", 0.25)

# Removing 'Extra Details.' prefix from column names
cleaned_war_artifacts.columns = cleaned_war_artifacts.columns.str.replace('Extra Details.', '', regex=False)

# Applying the function to your dataframe
cleaned_war_artifacts = drop_duplicate_columns(cleaned_war_artifacts)

KeyError: 'Extra Details'

In [45]:
# Initialize the translator and cache
# `translator` is the googletrans client that batch_translate sends requests through.
translator = Translator()
# Maps source text -> translated text so repeated values are only translated once.
# NOTE: being a defaultdict(str), indexing a missing key inserts and returns ''.
translations_cache = defaultdict(str)

# Function to batch translate a list of texts
def batch_translate(texts, src='sv', dest='en'):
    # Filter out None values and ensure text is string
    filtered_texts = [str(text) for text in texts if pd.notna(text)]
    
    # Batch processing and caching
    batch_size = 10  # Adjust batch size as needed
    for i in range(0, len(filtered_texts), batch_size):
        batch = filtered_texts[i:i+batch_size]
        untranslated_batch = [text for text in batch if text not in translations_cache]

        if untranslated_batch:
            try:
                translations = translator.translate(untranslated_batch, src=src, dest=dest)
                for text, translation in zip(untranslated_batch, translations):
                    translations_cache[text] = translation.text
            except Exception as e:  # General exception catch
                print(f"Error during translation: {e}")
                for text in untranslated_batch:
                    translations_cache[text] = text

        time.sleep(0.5)  # Respect rate limits

    return [translations_cache[text] for text in texts]


In [46]:

###### TRANSLATE
# Load your dataset
df_war_translate = cleaned_war_artifacts

# Identify text columns in the dataframes

text_columns_war = df_war_translate.select_dtypes(include=['object']).columns

# Translate text columns
for df, text_columns in [(df_war_translate, text_columns_war)]:
    for column in text_columns:
        print(f"Translating column: {column}")
        df[column + '_Translated'] = batch_translate(df[column].tolist())

df_war_translate.head(5)


Translating column: Föremålsbenämning
Error during translation: the JSON object must be str, bytes or bytearray, not NoneType
Error during translation: the JSON object must be str, bytes or bytearray, not NoneType
Error during translation: the JSON object must be str, bytes or bytearray, not NoneType
Error during translation: the JSON object must be str, bytes or bytearray, not NoneType
Error during translation: the JSON object must be str, bytes or bytearray, not NoneType
Error during translation: the JSON object must be str, bytes or bytearray, not NoneType
Error during translation: the JSON object must be str, bytes or bytearray, not NoneType
Error during translation: the JSON object must be str, bytes or bytearray, not NoneType
Error during translation: the JSON object must be str, bytes or bytearray, not NoneType
Error during translation: the JSON object must be str, bytes or bytearray, not NoneType
Error during translation: the JSON object must be str, bytes or bytearray, not Non

KeyboardInterrupt: 

In [None]:
def geocode(location):
    """ Geocode a location using Nominatim API. """
    url = 'https://nominatim.openstreetmap.org/search'
    headers = {
        'User-Agent': 'alyanngirl@gmail.com'
    }
    params = {
        'q': location,
        'format': 'json'
    }

    response = requests.get(url, headers=headers, params=params)
    if response.status_code == 200:
        results = response.json()
        if results:
            return results[0]['lat'], results[0]['lon']
    return None, None


In [None]:

###### GET LOCATIONS: WAR ARTIFACTS
# Add columns for latitude and longitude
cleaned_war_artifacts['latitude'] = None
cleaned_war_artifacts['longitude'] = None

# Iterate over the DataFrame and geocode each location
for index, row in cleaned_war_artifacts.iterrows():
    location = row['Plats']
    lat, lon = geocode(location)
    cleaned_war_artifacts.at[index, 'latitude'] = lat
    cleaned_war_artifacts.at[index, 'longitude'] = lon

    # Respect Nominatim's usage policy
    time.sleep(1)

# Save the updated DataFrame
cleaned_war_artifacts.to_csv('/Scraped_Data/war_w_locations.csv', index=False)

In [None]:
# Alternate search URLs (assign one to `url` in the driver setup cell to scrape a different selection):
# https://samlingar.shm.se/sok?type=object&productionPeriod=Vikingatid&hasImage=1&category=Arkeologisk%20samling&category=Vapen%20och%20rustningar&listType=archaeological&rows=500&offset=0
# https://samlingar.shm.se/sok?type=object&productionPeriod=Vikingatid&hasImage=1&category=Arkeologisk%20samling&category=Handel%20och%20v%C3%A4rdem%C3%A4tare&listType=archaeological&rows=300&offset=0
# https://samlingar.shm.se/sok?type=object&query=Vikingatid&listType=archaeological&rows=1000&offset=0

In [33]:
###### TRANSLATE: WAR ARTIFACTS
text_columns = cleaned_war_artifacts.select_dtypes(include=['object']).columns

# Translate each text column
for column in text_columns:
    # Process the DataFrame in chunks
    chunk_size = 100  # Adjust based on your data and rate limits
    for start in range(0, cleaned_war_artifacts.shape[0], chunk_size):
        end = start + chunk_size
        df_slice = cleaned_war_artifacts[start:end]
        translated_texts = batch_translate(df_slice[column].astype(str).tolist())
        
        # Use .loc to assign the values back to avoid SettingWithCopyWarning
        cleaned_war_artifacts.loc[start:end-1, column + '_Translated'] = translated_texts
        time.sleep(1)  # Respect rate limits



# Save the translated DataFrame to a new file
# translated_df.to_csv('/path/to/your/translated_dataset.csv', index=False)

Error during translation: the JSON object must be str, bytes or bytearray, not NoneType
Error during translation: the JSON object must be str, bytes or bytearray, not NoneType
Error during translation: the JSON object must be str, bytes or bytearray, not NoneType
Error during translation: the JSON object must be str, bytes or bytearray, not NoneType
Error during translation: the JSON object must be str, bytes or bytearray, not NoneType
Error during translation: the JSON object must be str, bytes or bytearray, not NoneType
Error during translation: the JSON object must be str, bytes or bytearray, not NoneType
Error during translation: the JSON object must be str, bytes or bytearray, not NoneType
Error during translation: the JSON object must be str, bytes or bytearray, not NoneType
Error during translation: the JSON object must be str, bytes or bytearray, not NoneType
Error during translation: the JSON object must be str, bytes or bytearray, not NoneType
Error during translation: the JS

KeyboardInterrupt: 