In [12]:
import os
import pandas as pd
from google.cloud import translate_v2 as translate
from google.oauth2 import service_account
import json
from googleapiclient.discovery import build
import urllib.parse


In [13]:

# Define the path to the file containing the project ID and API key
config_file = '/Users/aly.milne/Library/CloudStorage/OneDrive-BrighamYoungUniversity/Fall 2023/STAT 386/ST386_Final_Project/api-key.txt'

# Initialize the names that are actually assigned below, so they are always
# defined (as None) even when the config file has fewer than two lines.
project_id = None
api_key = None

# Placeholder names kept from the original cell; nothing in this cell ever
# assigns them a real value.
project_id1 = None
api_key1 = None

# Read the project ID and API key from the file
with open(config_file, 'r') as file:
    lines = file.readlines()  # Read all lines into a list

# Extract project ID from the first line, if available
if len(lines) > 0:
    project_id = lines[0].strip()

# Extract API key from the second line, if available
if len(lines) > 1:
    api_key = lines[1].strip()
        


In [14]:
# Shared Cloud Translation (v2) client used by the translation helper below.
# NOTE(review): no credentials are passed here, so the client presumably relies
# on ambient credentials rather than the API key read earlier -- confirm.
translate_client = translate.Client()

# Memo of translations already fetched, so repeated values are not re-sent
# to the API.
translation_cache = dict()

def batch_translate_text_with_model(target: str, texts: list, model: str = "nmt") -> list:
    """Translate each entry of ``texts`` into ``target``, reusing cached results.

    Every value is sent to ``translate_client.translate`` individually (one call
    per uncached value); results are memoised in the module-level
    ``translation_cache`` so a repeated value costs no further API call.

    Args:
        target: Language code passed to the client as ``target_language``.
        texts: Values to translate. ``bytes`` entries are decoded as UTF-8.
        model: Translation model name passed through to the client.

    Returns:
        A list of translated strings, in the same order as ``texts``.

    NOTE(review): the v2 API is documented to default to HTML output, so
    ``translatedText`` may contain HTML entities (e.g. ``&#39;``) -- confirm
    whether ``format_="text"`` should be requested.
    """
    translated_texts = []
    for text in texts:
        # Decode before the cache lookup so bytes and str inputs share one
        # cache entry (previously bytes inputs could never hit the cache).
        if isinstance(text, bytes):
            text = text.decode("utf-8")

        # Key on target and model as well as the text: the same source text
        # translates differently for another language or model.
        cache_key = (target, model, text)
        if cache_key not in translation_cache:
            result = translate_client.translate(text, target_language=target, model=model)
            translation_cache[cache_key] = result["translatedText"]

        translated_texts.append(translation_cache[cache_key])
    return translated_texts


In [15]:

# Load the scraped data set
df = pd.read_csv('/Users/aly.milne/Library/CloudStorage/OneDrive-BrighamYoungUniversity/Fall 2023/STAT 386/ST386_Final_Project/Scraped_Data/trade_w_locations.csv', encoding='utf-8')

# Object-dtype columns are the translation candidates, except for these three,
# which are dropped from the list.
excluded_columns = ("Unique Name", "Museum", "Catalog Link")
text_columns = [
    name
    for name in df.select_dtypes(include=['object']).columns
    if name not in excluded_columns
]


In [16]:
# Display the list of columns selected for translation
text_columns

['Föremålsbenämning',
 'Föremålsnr.',
 'Andra nummer',
 'Material',
 'Plats',
 'Fornlämning',
 'Socken',
 'Landskap',
 'Land',
 'Kontexttyp',
 'Kontextnr.',
 'Kategori',
 'Storlek',
 'Datering',
 'Tidsperiod',
 'Föremålsnummer',
 'Omnämns i katalog',
 'Fyndplats',
 'Arkeologisk kontext',
 'Förvärvsmetod',
 'Kontextnamn',
 'Historisk plats',
 'Undersökare',
 'Undersökningsår']

In [17]:
# Number of rows handed to batch_translate_text_with_model per call
batch_size = 100

for column in text_columns:
    translated_column = column + '_translated'

    # Create the output column up front (object dtype, all missing) so it
    # exists even when every value in `column` is missing, and so its dtype
    # does not depend on the first assignment.
    if translated_column not in df.columns:
        df[translated_column] = pd.Series(index=df.index, dtype='object')

    # Process the column in batches
    for i in range(0, len(df), batch_size):
        batch_slice = slice(i, i + batch_size)
        # .iloc makes the row slicing explicitly positional, independent of
        # the frame's index labels.
        batch_texts = df[column].iloc[batch_slice].dropna()
        if batch_texts.empty:
            # Nothing to translate in this batch; skip the no-op assignment.
            continue

        translated_batch = batch_translate_text_with_model('en', batch_texts.tolist())

        # Update only the rows that were translated
        df.loc[batch_texts.index, translated_column] = translated_batch

In [18]:
# Splitting the 'Datering' column into 'Era Start Year' and 'Era End Year'.
# n=1 caps the split at two pieces and reindex() pins the result to exactly two
# columns, so the assignment cannot fail with a column-count mismatch when a
# value has no separator or more than one. The pieces remain strings.
era_parts = df['Datering'].str.split(' – ', n=1, expand=True).reindex(columns=[0, 1])
df[['Era Start Year', 'Era End Year']] = era_parts

# Display the first few rows of the dataframe to verify the new columns
df[['Datering', 'Era Start Year', 'Era End Year']].head()

Unnamed: 0,Datering,Era Start Year,Era End Year
0,800 – 1100,800,1100
1,800 – 1100,800,1100
2,800 – 1100,800,1100
3,800 – 1100,800,1100
4,800 – 1100,800,1100


In [19]:
import re

# Conversion factors to millimeters for lengths and diameters, and to grams for weight
length_conversion_factors = {'mm': 1, 'cm': 10, 'm': 1000}
weight_conversion_factors = {'g': 1, 'kg': 1000}

# Function to extract and keep the largest measurements in mm and g
def extract_max_measurements(row):
    """Parse a size description and return the largest value per measurement.

    Looks for fragments of the form ``"<Type> <number> <unit>"`` where the type
    is Width, Length, Thickness, Diameter or Weight. Lengths are converted to
    millimetres and weights to grams; when a type occurs several times the
    largest converted value is kept.

    Args:
        row: The text of one cell. Non-string values (e.g. NaN) are accepted
            and yield an all-missing result.

    Returns:
        A ``pd.Series`` indexed by Width, Length, Thickness, Diameter, Weight;
        entries with no usable match are left as ``None``.
    """
    measurements = {
        'Width': None,
        'Length': None,
        'Thickness': None,
        'Diameter': None,
        'Weight': None
    }

    # Check if row is a string
    if not isinstance(row, str):
        return pd.Series(measurements)

    # Find all matches of measurements; the third tuple element is the optional
    # fractional part captured by the nested group and is not needed.
    matches = re.findall(r'(Width|Length|Thickness|Diameter|Weight) (\d+(\.\d+)?) (mm|cm|m|g|kg)', row)
    for measure_type, raw_value, _, unit in matches:
        if measure_type == 'Weight':
            factors = weight_conversion_factors
        else:
            factors = length_conversion_factors

        # The pattern accepts any unit after any measurement type, so a
        # mismatched pair such as "Length 5 g" can reach this point. Skip it
        # instead of raising KeyError and aborting the whole .apply().
        factor = factors.get(unit)
        if factor is None:
            continue

        measure_value = float(raw_value) * factor
        if measurements[measure_type] is None or measure_value > measurements[measure_type]:
            measurements[measure_type] = measure_value

    return pd.Series(measurements)

# Parse every translated size description, then store the result as five new columns
measurement_columns = ['Width', 'Length', 'Thickness', 'Diameter', 'Weight']
parsed_measurements = df['Storlek_translated'].apply(extract_max_measurements)
df[measurement_columns] = parsed_measurements



In [20]:
# Write the translated and enriched frame to the working directory, without the index
output_path = 'trade_translated.csv'
df.to_csv(output_path, index=False)