In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json

In [8]:
# These are my own functions that help clean the data

def search_columns_for_substring(columns, substring):
    """Return the entries of `columns` that contain `substring`, in their original order.

    Args:
        columns: iterable of column names (strings).
        substring: text to look for inside each column name.

    Returns:
        A new list holding every column name for which `substring in name` is true.
    """
    return list(filter(lambda name: substring in name, columns))

# this function counts how many empty values are in each column for a data set (use this after the drop col function to verify)
def count_nans_in_dataframe(df):
    """Summarise how many missing values each column of `df` contains.

    Args:
        df: the DataFrame to inspect.

    Returns:
        A two-column DataFrame: 'Column' holds each column label of `df`
        and 'NaN Count' holds the number of missing entries in that column.
    """
    missing_per_column = df.isna().sum()
    summary = {
        'Column': missing_per_column.index,
        'NaN Count': missing_per_column.values,
    }
    return pd.DataFrame(summary)

# this function drops columns that do not meet the minimum threshold
# if by="count" then drop columns that don't have at least that many populated fields
    # ie. by_val=100 drops columns that have less than 100 populated rows
# if by="prop" then drop columns that don't have at least by_val% populated rows
    # ie. by_val=0.05 drops columns that aren't at least 5% populated
# if by="field" then drop columns that have more missing values than the columns specified
    # ie. by_val="Last_Device_Array.anv" will keep Last_Device_Array.anv but drop any cols that have more missing values than Last_Device_Array.anv
def drop_columns_with_fewer_nans(df, by="prop", by_val="0.05"):
    """Return a copy of `df` without the columns that are too sparsely populated.

    Despite the name, the columns that get dropped are the ones with *too many*
    NaNs: a column is removed when its number of populated (non-NaN) rows is
    below a threshold derived from `by` / `by_val`.

    Args:
        df: the DataFrame to filter. It is not modified.
        by: how `by_val` is interpreted:
            - "count": `by_val` is the minimum number of populated rows.
            - "prop":  `by_val` is the minimum populated fraction of rows
              (e.g. 0.05 keeps columns that are at least 5% populated).
            - "field": `by_val` is a column name; columns with more missing
              values than that column are dropped (the column itself is kept).
        by_val: the threshold value, or a column name when by="field".
            Numeric values may be passed as numbers or numeric strings.

    Returns:
        A new DataFrame containing only the columns that meet the threshold.

    Raises:
        ValueError: if `by` is not one of "count", "prop" or "field".
        KeyError: if by="field" and `by_val` is not a column of `df`.
    """
    n_rows = df.shape[0]

    if by == "count":
        threshold = float(by_val)
    elif by == "prop":
        threshold = round(n_rows * float(by_val))
    elif by == "field":
        # The reference column's populated-row count becomes the minimum.
        threshold = n_rows - df[by_val].isna().sum()
    else:
        # Previously an unknown mode fell through and failed later with an
        # UnboundLocalError on `threshold`; fail early with a clear message.
        raise ValueError(
            f'by must be one of "count", "prop" or "field", got {by!r}'
        )

    # Count NaNs for every column in one pass instead of once per loop iteration.
    nan_counts = df.isna().sum()
    # A column is too sparse when its NaN count exceeds the allowed maximum.
    too_sparse = nan_counts > n_rows - threshold
    cols_to_drop = list(nan_counts.index[too_sparse])
    return df.drop(columns=cols_to_drop)

In [9]:
# Read the scraped war-artifact CSV from disk
file_path = '/Users/aly.milne/Library/CloudStorage/OneDrive-BrighamYoungUniversity/Fall 2023/STAT 386/ST386_Final_Project/Scraped_Data/Viking_war_artifacts.csv'
war_df = pd.read_csv(file_path)

# Each 'Extra Details' entry is a JSON string; decode it so it can be flattened below
war_df['Extra Details'] = war_df['Extra Details'].apply(json.loads)

# Flatten the decoded dicts into dotted 'Extra Details.<key>' columns
war_artifacts_exploded = pd.json_normalize(war_df.to_dict('records'))

# Drop COLUMNS (not rows) that are less than 25% populated
cleaned_war_artifacts = drop_columns_with_fewer_nans(
    war_artifacts_exploded, by="prop", by_val=0.25
)

cleaned_war_artifacts.head()

Unnamed: 0.1,Unnamed: 0,Föremålsbenämning,Föremålsnr.,Förvärvsnr.,Andra nummer,Material,Plats,Fornlämning,Socken,Landskap,...,Extra Details.Historisk plats,Extra Details.Förvärvsnummer,Extra Details.Omnämns i katalog,Extra Details.Förvärvsmetod,Extra Details.Fyndplats,Extra Details.Arkeologisk kontext,Extra Details.Kontextnamn,Extra Details.Undersökare,Extra Details.Undersökningsår,Extra Details.Antal fragment
0,0,Spjut,371667_HST,34000.0,Undernummer: Bj 581,Järn,"Björkö, Norr om Borg",L2017:1478,Adelsö socken,Uppland,...,"Birka, Adelsö socken",34000.0,http://catview.historiska.se/catview/CatViewSe...,KML,"Plats: Björkö, Norr om Borg, Fornlämning: L201...","Kammargrav, Grav, Flatmarksgrav: 581",Bj 581,"Stolpe, Hjalmar",1878.0,
1,1,Spjut,371668_HST,34000.0,Undernummer: Bj 581,Järn,"Björkö, Norr om Borg",L2017:1478,Adelsö socken,Uppland,...,"Birka, Adelsö socken",34000.0,http://catview.historiska.se/catview/CatViewSe...,KML,"Plats: Björkö, Norr om Borg, Fornlämning: L201...","Kammargrav, Grav, Flatmarksgrav: 581",Bj 581,"Stolpe, Hjalmar",1878.0,
2,2,Svärd Petersen Y,263086_HST,34000.0,Undernummer: Bj 752B,Järn,"Björkö, Hemlanden",L2017:1904,Adelsö socken,Uppland,...,"Birka, Adelsö socken",34000.0,http://catview.historiska.se/catview/CatViewSe...,KML,"Plats: Björkö, Hemlanden, Fornlämning: L2017:1...","Kammargrav, Grav, Hög: 752B",Bj 752B,"Stolpe, Hjalmar",1879.0,7.0
3,3,Tveeggat svärd,263468_HST,34000.0,Undernummer: Bj 542,Järn,"Björkö, Norr om Borg",L2017:1478,Adelsö socken,Uppland,...,"Birka, Adelsö socken",34000.0,http://catview.historiska.se/catview/CatViewSe...,KML,"Plats: Björkö, Norr om Borg, Fornlämning: L201...","Kammargrav, Grav, Obekant yttre gravskick: 542",Bj 542,"Stolpe, Hjalmar",1878.0,2.0
4,4,Svärd Petersen Y,264449_HST,,FID: 264449,Järn,,,,,...,,,,,Land: Sverige,,,,,1.0


In [10]:

# Read the scraped era-artifact CSV from disk
file_path = '/Users/aly.milne/Library/CloudStorage/OneDrive-BrighamYoungUniversity/Fall 2023/STAT 386/ST386_Final_Project/Scraped_Data/Viking_era_artifacts.csv'
era_df = pd.read_csv(file_path)

# Each 'Extra Details' entry is a JSON string; decode it so it can be flattened below
era_df['Extra Details'] = era_df['Extra Details'].apply(json.loads)

# Flatten the decoded dicts into dotted 'Extra Details.<key>' columns
era_artifacts_exploded = pd.json_normalize(era_df.to_dict('records'))

# Drop COLUMNS (not rows) that are less than 25% populated
cleaned_era_artifacts = drop_columns_with_fewer_nans(
    era_artifacts_exploded, by="prop", by_val=0.25
)

cleaned_era_artifacts.head()

Unnamed: 0.1,Unnamed: 0,Föremålsbenämning,Föremålsnr.,Förvärvsnr.,Andra nummer,Material,Plats,Fornlämning,Socken,Landskap,...,Extra Details.Tidsperiod,Extra Details.Tillverkningsplats,Extra Details.Tillverkare,Extra Details.Föremålsnummer,Extra Details.Andra nummer,Extra Details.Litteratur,Extra Details.Förvärvsnummer,Extra Details.Omnämns i katalog,Extra Details.Förvärvsmetod,Extra Details.Fyndplats
0,0,yxa med skafthål,TESTPOST_SIS_01,TESTPOST_SIS_ACC_01,Nr 1867: 17:17:17:896:783,Päronträ,"Ohio, San Juan County, Reapture Creek, Fagersk...",TESTPOST_SIS_GEO_FORNL_02,,,...,1700-tal,"Baoussé-Roussé, Menton (Tillverkningsplats not...","Ablancourt, Nicolas Perrot d' (Tillverkare)",TESTPOST_SIS_01,Nr 1867: 17:17:17:896:783,Naturligt åldrat papper. Rapport nr 4\t2. Sven...,TESTPOST_SIS_ACC_01,https://historiska.se/utstallningar/,Gåva,"Plats: Ohio, San Juan County, Reapture Creek, ..."
1,1,Mynt,2720_SKO,,,Silver,,,,,...,,,,2720_SKO,,,,,,
2,2,Svärd,13389_SKO,,,Järn,,,,,...,Vikingatid,,,13389_SKO,,,,,,
3,3,Skäggyxa,13391_SKO,,,Järn,,,,,...,,,,13391_SKO,,,,,,
4,4,Svärd Petersen H,13394_SKO,,,Järn,,,,,...,Vikingatid,,,13394_SKO,,,,,,


In [11]:

# Read the scraped trade-artifact CSV from disk
file_path = '/Users/aly.milne/Library/CloudStorage/OneDrive-BrighamYoungUniversity/Fall 2023/STAT 386/ST386_Final_Project/Scraped_Data/Viking_trade_artifacts.csv'
trade_df = pd.read_csv(file_path)

# Each 'Extra Details' entry is a JSON string; decode it so it can be flattened below
trade_df['Extra Details'] = trade_df['Extra Details'].apply(json.loads)

# Flatten the decoded dicts into dotted 'Extra Details.<key>' columns
trade_artifacts_exploded = pd.json_normalize(trade_df.to_dict('records'))

# Drop COLUMNS (not rows) that are less than 25% populated
cleaned_trade_artifacts = drop_columns_with_fewer_nans(
    trade_artifacts_exploded, by="prop", by_val=0.25
)

cleaned_trade_artifacts.head()

Unnamed: 0.1,Unnamed: 0,Föremålsbenämning,Föremålsnr.,Förvärvsnr.,Andra nummer,Material,Plats,Fornlämning,Socken,Landskap,...,Extra Details.Andra nummer,Extra Details.Förvärvsnummer,Extra Details.Omnämns i katalog,Extra Details.Fyndplats,Extra Details.Historisk plats,Extra Details.Förvärvsmetod,Extra Details.Arkeologisk kontext,Extra Details.Kontextnamn,Extra Details.Undersökare,Extra Details.Undersökningsår
0,0,Bipolär vikt,371770_HST,16835.0,Undernummer: 2,"Brons, Järn",Tystebols,,Stenkyrka socken,Gotland,...,Undernummer: 2 FID: 371770,16835,http://catview.historiska.se/catview/CatViewSe...,"Plats: Tystebols, Socken: Stenkyrka socken, Ko...",,,,,,
1,1,Vikt,106523_HST,34000.0,Undernummer: Bj 977,Brons,"Björkö, Hemlanden",L2017:1904,Adelsö socken,Uppland,...,Undernummer: Bj 977 Fyndnummer: 62 FID: 106523,34000,http://catview.historiska.se/catview/CatViewSe...,"Plats: Björkö, Hemlanden, Fornlämning: L2017:1...","Birka, Adelsö socken",KML,"Kammargrav, Grav, Hög: 977",Bj 977,"Stolpe, Hjalmar",1881.0
2,2,Balansvåg,107068_HST,12426.0,Undernummer: M:IV,Brons,Rösta,L1945:291,Ås socken,Jämtland,...,Undernummer: M:IV FID: 107068,12426,http://catview.historiska.se/catview/CatViewSe...,"Plats: Rösta, Fornlämning: L1945:291, Socken: ...",,,Grav: M:IV,,,
3,3,Mynthänge,46_HST,33758.0,Undernummer: 50,Silver,Spillings,L1976:7626,Othem socken,Gotland,...,Undernummer: 50 Fyndnummer: 43 FID: 46,33758,http://catview.historiska.se/catview/CatViewSe...,"Plats: Spillings, Fornlämning: L1976:7626, Fas...",,KML (inlösen),Skattfynd: 1,Spillingskatten,,
4,4,Barr,50_HST,33758.0,Undernummer: 57,Silver,Spillings,L1976:7626,Othem socken,Gotland,...,Undernummer: 57 Fyndnummer: 46 FID: 50,33758,http://catview.historiska.se/catview/CatViewSe...,"Plats: Spillings, Fornlämning: L1976:7626, Fas...",,KML (inlösen),Skattfynd: 1,Spillingskatten,,


In [18]:
from googletrans import Translator, LANGUAGES

# Create the single googletrans client that translate_text() below sends its requests through
translator = Translator()

# Function to translate text
# Successful translations keyed by (text, src, dest), so a value that repeats
# across rows or columns is only sent to the translation service once.
_translation_cache = {}

def translate_text(text, src='sv', dest='en'):
    """Translate `text` from `src` to `dest`, falling back to the original on failure.

    Args:
        text: the string to translate. Non-string or blank values are returned
            unchanged without contacting the service.
        src: source language code (default 'sv').
        dest: destination language code (default 'en').

    Returns:
        The translated string, or `text` itself when the input is not a
        translatable string or the translation call raised an exception.

    Notes:
        This is deliberately best-effort: an exception from the translator
        never propagates. A RuntimeWarning is emitted instead, so that a
        systematic failure (for example a broken client) is visible rather
        than silently producing an untranslated dataset.
    """
    import warnings

    # Nothing to translate; also keeps unhashable values out of the cache lookup.
    if not isinstance(text, str) or not text.strip():
        return text

    cache_key = (text, src, dest)
    if cache_key in _translation_cache:
        return _translation_cache[cache_key]

    try:
        translated = translator.translate(text, src=src, dest=dest).text
    except Exception as e:
        # The message deliberately omits `text`, so Python's default warning
        # filter shows it once per exception type instead of once per value.
        warnings.warn(
            f"translate_text: translation failed ({type(e).__name__}); "
            "returning the original text",
            RuntimeWarning,
            stacklevel=2,
        )
        # Failures are not cached, so a later call can retry.
        return text

    _translation_cache[cache_key] = translated
    return translated

# Columns to translate
# NOTE(review): this is every column, so identifiers and URLs are sent through the
# translator as well (the saved output shows IDs such as "371667_HST" coming back
# as "371667_hst") -- consider restricting this list to free-text columns.
columns_to_translate = cleaned_war_artifacts.columns.to_list()

# NOTE(review): never populated in this cell; appears to be unused.
translated_war = []

# Translate the columns.
# Values repeat heavily within a column, so translate each distinct string once
# and map the result back instead of calling the translator for every cell.
for column in columns_to_translate:
    column_values = cleaned_war_artifacts[column]
    # dict.fromkeys de-duplicates while keeping first-seen order; only strings are
    # hashed, so NaNs and any other non-string values are left untouched.
    distinct_strings = dict.fromkeys(v for v in column_values if isinstance(v, str))
    if not distinct_strings:
        continue  # nothing translatable in this column
    translations = {text: translate_text(text) for text in distinct_strings}
    cleaned_war_artifacts[column] = column_values.apply(
        lambda x: translations[x] if isinstance(x, str) else x
    )

# Save the translated dataframe to a new CSV file
cleaned_war_artifacts.to_csv('translated_viking_war_artifacts.csv', index=False)

# Keep the preview as the cell's last expression so the notebook actually displays it
cleaned_war_artifacts.head(5)

Unnamed: 0.1,Unnamed: 0,Föremålsbenämning,Föremålsnr.,Förvärvsnr.,Andra nummer,Material,Plats,Fornlämning,Socken,Landskap,...,Extra Details.Historisk plats,Extra Details.Förvärvsnummer,Extra Details.Omnämns i katalog,Extra Details.Förvärvsmetod,Extra Details.Fyndplats,Extra Details.Arkeologisk kontext,Extra Details.Kontextnamn,Extra Details.Undersökare,Extra Details.Undersökningsår,Extra Details.Antal fragment
0,0,Spear,371667_hst,34000.0,Sub -code: BJ 581,Iron,"Björkö, north of Borg",L2017:1478,Adelsö socken,Uppland,...,"Birka, Adelsö socken",34000.0,http://catview.historiska.se/catview/CatViewSe...,KML,"Plats: Björkö, Norr om Borg, Fornlämning: L201...","Kammargrav, Grav, Flatmarksgrav: 581",Bj 581,"Stolpe, Hjalmar",1878.0,
1,1,Spear,371668_hst,34000.0,Sub -code: BJ 581,Iron,"Björkö, north of Borg",L2017:1478,Adelsö socken,Uppland,...,"Birka, Adelsö socken",34000.0,http://catview.historiska.se/catview/CatViewSe...,KML,"Plats: Björkö, Norr om Borg, Fornlämning: L201...","Kammargrav, Grav, Flatmarksgrav: 581",Bj 581,"Stolpe, Hjalmar",1878.0,
2,2,Sword Petersen Y.,263086_HST,34000.0,Sub -code: BJ 752B,Iron,"Björkö, homeland",L2017:1904,Adelsö socken,Uppland,...,"Birka, Adelsö socken",34000.0,http://catview.historiska.se/catview/CatViewSe...,KML,"Plats: Björkö, Hemlanden, Fornlämning: L2017:1...","Kammargrav, Grav, Hög: 752B",Bj 752B,"Stolpe, Hjalmar",1879.0,7.0
3,3,Tveggat sword,263468_hst,34000.0,Sub -code: BJ 542,Iron,"Björkö, north of Borg",L2017:1478,Adelsö socken,Uppland,...,"Birka, Adelsö socken",34000.0,http://catview.historiska.se/catview/CatViewSe...,KML,"Plats: Björkö, Norr om Borg, Fornlämning: L201...","Kammargrav, Grav, Obekant yttre gravskick: 542",Bj 542,"Stolpe, Hjalmar",1878.0,2.0
4,4,Sword Petersen Y.,264449_HST,,FID: 264449,Iron,,,,,...,,,,,Land: Sverige,,,,,1.0


In [19]:
# Columns to translate
# NOTE(review): this is every column, so identifiers and URLs are sent through the
# translator as well -- consider restricting this list to free-text columns.
columns_to_translate = cleaned_era_artifacts.columns.to_list()

# Translate the columns.
# Values repeat heavily within a column, so translate each distinct string once
# and map the result back instead of calling the translator for every cell.
for column in columns_to_translate:
    column_values = cleaned_era_artifacts[column]
    # dict.fromkeys de-duplicates while keeping first-seen order; only strings are
    # hashed, so NaNs and any other non-string values are left untouched.
    distinct_strings = dict.fromkeys(v for v in column_values if isinstance(v, str))
    if not distinct_strings:
        continue  # nothing translatable in this column
    translations = {text: translate_text(text) for text in distinct_strings}
    cleaned_era_artifacts[column] = column_values.apply(
        lambda x: translations[x] if isinstance(x, str) else x
    )

# Save the translated dataframe to a new CSV file
cleaned_era_artifacts.to_csv('translated_viking_era_artifacts.csv', index=False)

# Keep the preview as the cell's last expression so the notebook actually displays it
cleaned_era_artifacts.head(5)

KeyboardInterrupt: 

In [None]:
# Columns to translate
# NOTE(review): this is every column, so identifiers and URLs are sent through the
# translator as well -- consider restricting this list to free-text columns.
columns_to_translate = cleaned_trade_artifacts.columns.to_list()

# Translate the columns.
# Values repeat heavily within a column, so translate each distinct string once
# and map the result back instead of calling the translator for every cell.
for column in columns_to_translate:
    column_values = cleaned_trade_artifacts[column]
    # dict.fromkeys de-duplicates while keeping first-seen order; only strings are
    # hashed, so NaNs and any other non-string values are left untouched.
    distinct_strings = dict.fromkeys(v for v in column_values if isinstance(v, str))
    if not distinct_strings:
        continue  # nothing translatable in this column
    translations = {text: translate_text(text) for text in distinct_strings}
    cleaned_trade_artifacts[column] = column_values.apply(
        lambda x: translations[x] if isinstance(x, str) else x
    )

# Save the translated dataframe to a new CSV file
cleaned_trade_artifacts.to_csv('translated_viking_trade_artifacts.csv', index=False)

# Keep the preview as the cell's last expression so the notebook actually displays it
cleaned_trade_artifacts.head(5)