In [4]:
import pandas as pd
import json

# Export one column of the source CSV to a standalone JSON array.
file_path = "Data4Good_Arolsen_Archives_50k.csv"  # Change to your actual file path
column_name = "Geo Location"    # Change to your actual column name

# Read the dataset
df = pd.read_csv(file_path)

# Extract the JSON column.
# Missing cells come back from pandas as float NaN, which json.dump would write
# as the bare token `NaN` -- not valid JSON for strict parsers. Map them to
# None so they are serialised as `null` instead.
geo_data = [None if pd.isna(value) else value for value in df[column_name].tolist()]

# Save to a JSON file (ensure_ascii=False keeps non-ASCII characters readable)
output_file = "geo_data.json"
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(geo_data, f, ensure_ascii=False, indent=4)

print(f"JSON data saved to {output_file}")


JSON data saved to geo_data.json


In [7]:
import json
import pandas as pd
import re
from datetime import datetime

# Source CSV: provides the 'Middle' and 'Birthdate (Geb)' columns read further down
general_data_file = "Data4Good_Arolsen_Archives_50k.csv"  # Path to the general dataset
data = pd.read_csv(general_data_file)

# Geo records stored as a JSON array on disk
input_file = "geo_data.json"
with open(input_file, mode='r', encoding='utf-8') as json_handle:
    geo_data = json.loads(json_handle.read())

# Define functions
def create_time_intervals(birthdate, dates):
    """Build the ordered list of known dates for one record.

    Despite the name, this returns the dates themselves, not intervals; the
    caller pairs neighbouring entries to form intervals.

    Args:
        birthdate: Birthdate of the person, or a null value (None/NaN/NaT).
        dates: Dates extracted for the record; may contain null entries.

    Returns:
        ``[]`` when ``dates`` is empty (even if a birthdate is given).
        Otherwise the non-null dates in ascending order, with ``birthdate``
        placed in front when it is not null (it is not sorted with the rest).
    """
    if not dates:
        return []

    # Keep only usable dates and order them chronologically
    timeline = [entry for entry in dates if pd.notnull(entry)]
    timeline.sort()

    if pd.isnull(birthdate):
        return timeline
    return [birthdate] + timeline

def extract_dates(text):
    """Find date-like tokens in free text and parse them to ``datetime.date``.

    Recognised token shapes, with ``.`` or ``/`` as separator and day first:
    ``D.M.YY`` / ``D.M.YYYY``, ``M.YY``, and a bare four-digit year. Tokens
    that match the pattern but cannot be parsed by any format are skipped.

    Args:
        text: Cell value to scan. Null values (None/NaN) yield ``[]``; other
            non-string values are converted with ``str()`` before scanning.

    Returns:
        list[datetime.date]: Parsed dates in order of appearance. Tokens
        without a day or month fall back to strptime's defaults (1st / January).
    """
    if pd.isnull(text):
        return []

    # Pattern to capture dates (not preceded or followed by another digit)
    date_pattern = r'(?<!\d)(\d{1,2}[./]\d{1,2}[./]\d{2,4}|\d{1,2}[./]\d{2}|\d{4})(?!\d)'

    # str() lets numeric cells (e.g. a bare year read as a number) be scanned too
    raw_dates = re.findall(date_pattern, str(text))

    # Separators are normalised to '.' below, so only dotted formats are needed.
    # Previously slash-separated three-part dates ("12/03/1945") were captured
    # by the pattern but matched none of the formats and were silently dropped.
    possible_formats = ('%d.%m.%Y', '%d.%m.%y', '%m.%Y', '%m.%y', '%Y')

    current_year = datetime.now().year  # loop-invariant, computed once

    parsed_dates = []
    for date_str in raw_dates:
        normalized = date_str.replace('/', '.')
        for fmt in possible_formats:
            try:
                parsed_date = datetime.strptime(normalized, fmt)

                # Two-digit years are expanded by strptime to 1969-2068; move
                # anything that lands in the future back one century.
                # NOTE(review): two-digit years that expand to a year <= the
                # current year (e.g. "20" -> 2020) are left as-is -- confirm
                # whether that is intended for this dataset.
                if parsed_date.year > current_year:
                    parsed_date = parsed_date.replace(year=parsed_date.year - 100)
            except ValueError:
                continue  # try the next format

            parsed_dates.append(parsed_date.date())
            break  # first matching format wins

    return parsed_dates

# Flatten every geo record into one row per path. The path at position i is
# given the span between the i-th and (i+1)-th known date of that record.
rows = []

for record_id, record_str in enumerate(geo_data, start=1):
    source_pos = record_id - 1                   # label used with data.loc below
    has_source_row = source_pos < len(data)

    try:
        # Non-string entries are treated as an empty JSON object
        if not isinstance(record_str, str):
            record_str = "{}"

        # Strip wrapping quotes and collapse doubled quotes before decoding
        record = json.loads(record_str.strip('"').replace('""', '"'))

        paths = record.get("paths", [])

        # Marker label -> marker type (a repeated label keeps the last type seen)
        markers = {}
        for marker in record.get("markers", []):
            label = marker.get("label", "Unknown")
            markers[label] = marker.get("type", "Unknown")

        # Dates found in the 'Middle' text of the matching CSV row
        middle_value = data.loc[source_pos, 'Middle'] if has_source_row else None
        extracted_dates = extract_dates(middle_value)

        # Birthdate of the matching CSV row, parsed day-first (coerced on failure)
        birthdate_str = data.loc[source_pos, 'Birthdate (Geb)'] if has_source_row else None
        if birthdate_str:
            birthdate = pd.to_datetime(birthdate_str, format='%d/%m/%Y', errors='coerce')
        else:
            birthdate = None

        # Birthdate (if any) followed by the extracted dates in ascending order
        all_dates = create_time_intervals(birthdate, extracted_dates)
        last_interval_start = len(all_dates) - 1

        for index, path in enumerate(paths):
            origin = path.get("fromLabel", "Unknown")
            dest = path.get("toLabel", "Unknown")
            type_value = markers.get(origin, "Unknown")

            # Fallback when there is no pair of dates left for this position
            interval = "Unknown - Unknown"
            if index < last_interval_start:
                start_date = all_dates[index].strftime('%d/%m/%Y')
                end_date = all_dates[index + 1].strftime('%d/%m/%Y')
                interval = f"{start_date} - {end_date}"

            rows.append({
                "ID": record_id, "Origin": origin, "Dest": dest,
                "Index": index, "Type": type_value, "Interval": interval,
            })

        # Records without any path still get one placeholder row
        if not paths:
            rows.append({
                "ID": record_id, "Origin": "Unknown", "Dest": "Unknown",
                "Index": -1, "Type": "Unknown", "Interval": "Unknown - Unknown",
            })

    except Exception as exc:
        # Any failure skips the rest of this record; rows already appended stay
        print(f"Error processing record {record_id}: {exc}")

# One DataFrame from the collected row dicts
df = pd.DataFrame(rows)

# Persist the result without the index column
output_file = "transformed_dataset_with_intervals.csv"
df.to_csv(output_file, index=False, encoding='utf-8')

print(f"Structured data with intervals saved to {output_file}")


Structured data with intervals saved to transformed_dataset_with_intervals.csv


In [3]:
import json
import pandas as pd

# Read the geo records (a JSON array) from disk
input_file = "geo_data.json"  # Adjust to your actual file
with open(input_file, mode='r', encoding='utf-8') as json_handle:
    geo_data = json.loads(json_handle.read())

# Collected output rows, one dict per path
rows = []

# Record ids are 1-based positions in the JSON array
for record_id, record_str in enumerate(geo_data, start=1):
    try:
        # Non-string entries are replaced by an empty JSON object
        if not isinstance(record_str, str):
            record_str = "{}"

        # Strip wrapping quotes and collapse doubled quotes before decoding
        cleaned_str = record_str.strip('"').replace('""', '"')

        # Undecodable text falls back to an empty record instead of failing
        try:
            record = json.loads(cleaned_str)
        except json.JSONDecodeError:
            record = {}

        paths = record.get("paths", [])

        # Marker label -> marker type (a repeated label keeps the last type seen)
        markers = {}
        for marker in record.get("markers", []):
            label = marker.get("label", "Unknown")
            markers[label] = marker.get("type", "Unknown")

        for path in paths:
            origin = path.get("fromLabel", "Unknown")
            dest = path.get("toLabel", "Unknown")
            index = path.get("index", -1)  # -1 marks a path without an index

            # The type is looked up through the origin label
            type_value = markers.get(origin, "Unknown")

            rows.append({
                "ID": record_id, "Origin": origin, "Dest": dest,
                "Index": index, "Type": type_value,
            })

        # Records without any path still get one placeholder row
        if not paths:
            rows.append({
                "ID": record_id, "Origin": "Unknown", "Dest": "Unknown",
                "Index": -1, "Type": "Unknown",
            })

    except Exception as exc:
        # Any other failure skips the rest of this record
        print(f"Unexpected error in record {record_id}: {exc}")

# One DataFrame from the collected row dicts
dataframe = pd.DataFrame(rows)




In [4]:
# Spelling variant -> standard camp name, built from (standard name, variants) groups.
camp_mapping = {
    variant: canonical
    for canonical, variants in (
        ('Auschwitz', ('Auschwitz', 'Auschw', 'Auschwits', 'Auschwitz - Birkenau', 'Birkenau')),
        ('Stutthof', ('Stutthof',)),
        ('Bergen-Belsen', ('Bergen Belsen', '- Belsen', 'Belsen', 'Bergen - Belsen', 'Berg . - Bels')),
        ('Gross-Rosen', ('Gross - Rosen', 'Rosen')),
        ('Buchenwald', ('Buchenwald',)),
        ('Dachau', ('Dachau',)),
        ('Theresienstadt', ('Theresienstadt',)),
        ('Mauthausen', ('Mauthausen',)),
        ('Sachsenhausen', ('Sachsenhausen',)),
        ('Ravensbrück', ('Ravensbrück',)),
        ('Westerbork', ('Westerbork',)),
        ('Feldafing', ('Feldafing',)),
        ('Zeilsheim', ('Zeilsheim',)),
        ('Föhrenwald', ('Föhrenwalds',)),
        ('Plaszow', ('Plaszow',)),
        ('Landsberg', ('Landsberg',)),
        ('Eschwege', ('Eschwege',)),
        ('Majdanek', ('Majdanek',)),
    )
    for variant in variants
}

# Overwrite 'Origin' with its standardized spelling wherever a value matches a mapping key.
# NOTE(review): the replacement runs on every row, not only rows typed as camps -- confirm
# that rewriting e.g. a non-camp 'Rosen' or 'Birkenau' origin is intended.
dataframe['Origin'] = dataframe['Origin'].replace(to_replace=camp_mapping)

In [5]:
# Write the transformed frame to disk as UTF-8 CSV, leaving out the index column
output_file = "transformed_dataset.csv"
dataframe.to_csv(
    output_file,
    encoding='utf-8',
    index=False,
)

print(f"Structured data saved to {output_file}")

Structured data saved to transformed_dataset.csv


In [10]:
# Number of rows per marker type.
# NOTE(review): `df` is not defined in this cell; presumably it is the frame built by the
# interval-transformation cell (which has a 'Type' column) -- confirm on a fresh kernel run.
df['Type'].value_counts()

Type
Location       103635
Birth Place     37056
Camp Name       18778
Unknown          5688
Camp              138
Name: count, dtype: int64

In [21]:
# Lift the row-display limit so the complete listing is printed (global pandas option)
pd.options.display.max_rows = None

# How often each origin occurs among rows whose type is 'Camp Name'
unique_camp_names = df.loc[df['Type'].eq('Camp Name'), 'Origin'].value_counts()

# Show every entry
print(unique_camp_names)

Origin
Auschwitz                          1842
Stutthof                            572
Bergen - Belsen                     473
Dachau                              458
Buchenwald                          419
Ravensbrück                         418
- Belsen                            322
Mauthausen                          321
Auschw                              276
Westerbork                          208
Theresienstadt                      167
Feldafing                           155
Sachsenhausen                       151
Zeilsheim                           129
Föhrenwald                          123
Auschwits                           119
Plaszow                             118
Belsen                              109
Auschwitz - Birkenau                108
Landsberg                           102
-                                   100
Gross - Rosen                        98
Eschwege                             92
Stalag                               88
Oranienburg                      

In [22]:
# Spelling variant -> standard camp name, built from (standard name, variants) groups.
camp_mapping = {
    variant: canonical
    for canonical, variants in (
        ('Auschwitz', ('Auschwitz', 'Auschw', 'Auschwits', 'Auschwitz - Birkenau', 'Birkenau')),
        ('Stutthof', ('Stutthof',)),
        ('Bergen-Belsen', ('Bergen Belsen', '- Belsen', 'Belsen', 'Bergen - Belsen', 'Berg . - Bels')),
        ('Gross-Rosen', ('Gross - Rosen', 'Rosen')),
        ('Buchenwald', ('Buchenwald',)),
        ('Dachau', ('Dachau',)),
        ('Theresienstadt', ('Theresienstadt',)),
        ('Mauthausen', ('Mauthausen',)),
        ('Sachsenhausen', ('Sachsenhausen',)),
        ('Ravensbrück', ('Ravensbrück',)),
        ('Westerbork', ('Westerbork',)),
        ('Feldafing', ('Feldafing',)),
        ('Zeilsheim', ('Zeilsheim',)),
        ('Föhrenwald', ('Föhrenwalds',)),
        ('Plaszow', ('Plaszow',)),
        ('Landsberg', ('Landsberg',)),
        ('Eschwege', ('Eschwege',)),
        ('Majdanek', ('Majdanek',)),
    )
    for variant in variants
}

# Add a column holding the standardized spelling of each origin (all rows, any type)
df['Standardized_Origin'] = df['Origin'].replace(to_replace=camp_mapping)

# Count the standardized names among rows whose type is 'Camp Name'
standardized_counts = df.loc[df['Type'].eq('Camp Name'), 'Standardized_Origin'].value_counts()

# Show the resulting counts
print(standardized_counts)

Standardized_Origin
Auschwitz                          2416
Bergen-Belsen                      1024
Stutthof                            572
Dachau                              458
Buchenwald                          419
Ravensbrück                         418
Mauthausen                          321
Westerbork                          208
Gross-Rosen                         174
Theresienstadt                      167
Feldafing                           155
Sachsenhausen                       151
Zeilsheim                           129
Föhrenwald                          123
Plaszow                             118
Landsberg                           102
-                                   100
Eschwege                             92
Stalag                               88
Oranienburg                          84
Flossenburg                          77
Radom                                73
Majdanek                             63
Kaiserwald                           63
Pocking             