In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Load the transformed dataset
df = pd.read_csv("transformed_dataset_with_intervals.csv")

# Canonical camp name -> every spelling that must collapse onto it.
# The key order also fixes the order of `distinct_camps` below.
_camp_spellings = {
    'Auschwitz': ['Auschwitz', 'Auschw', 'Auschwits', 'Auschwitz - Birkenau', 'Birkenau'],
    'Stutthof': ['Stutthof'],
    'Bergen-Belsen': ['Bergen Belsen', '- Belsen', 'Belsen', 'Bergen - Belsen', 'Berg . - Bels'],
    'Gross-Rosen': ['Gross - Rosen', 'Rosen'],
    'Buchenwald': ['Buchenwald'],
    'Dachau': ['Dachau'],
    'Theresienstadt': ['Theresienstadt'],
    'Mauthausen': ['Mauthausen'],
    'Sachsenhausen': ['Sachsenhausen'],
    'Ravensbrück': ['Ravensbrück'],
    'Westerbork': ['Westerbork'],
    'Feldafing': ['Feldafing'],
    'Zeilsheim': ['Zeilsheim'],
    'Föhrenwald': ['Föhrenwalds'],
    'Plaszow': ['Plaszow'],
    'Landsberg': ['Landsberg'],
    'Eschwege': ['Eschwege'],
    'Majdanek': ['Majdanek'],
}

# Flat lookup: observed spelling -> standardized camp name
camp_mapping = {
    spelling: canonical
    for canonical, spellings in _camp_spellings.items()
    for spelling in spellings
}

# Distinct camps the model is allowed to predict
distinct_camps = list(_camp_spellings)

# Standardize the camp names; values that are not keys of the mapping stay as they are
for column in ('Origin', 'Dest'):
    df[column] = df[column].replace(camp_mapping)

# One shared encoder for both columns so a location gets the same code
# whether it appears as an origin or as a destination
all_locations = pd.concat([df[column] for column in ('Origin', 'Dest')]).unique()
location_encoder = LabelEncoder().fit(all_locations)

for source, target in (('Origin', 'Origin_Encoded'), ('Dest', 'Dest_Encoded')):
    df[target] = location_encoder.transform(df[source])

# Split rows by whether the destination is one of the distinct camps
dest_is_known = df['Dest'].isin(distinct_camps)
known_camps = df[dest_is_known]
unknown_camps = df[~dest_is_known]  # Entries without known camps

# Grouping sequences by ID (combine all movements per person)
def get_grouped_sequences(data):
    """Collapse movement rows into one row per ID.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "ID", "Origin_Encoded", "Dest_Encoded"
        and "Dest".

    Returns
    -------
    pandas.DataFrame
        One row per distinct "ID" (as a regular column), where
        "Origin_Encoded" and "Dest_Encoded" are lists of that ID's values
        in input row order and "Dest" is the value from the ID's last row.
    """
    def _last_value(values):
        # "Final" means last in the current row order; rows are not re-sorted here.
        return list(values)[-1]

    aggregations = {
        "Origin_Encoded": list,
        "Dest_Encoded": list,
        "Dest": _last_value,
    }
    per_id = data.groupby("ID").agg(aggregations)
    return per_id.reset_index()

# Prepare the grouped data: one row per ID for each subset
train_data = get_grouped_sequences(known_camps)
test_data = get_grouped_sequences(unknown_camps)

# Every row of `known_camps` has a destination in `distinct_camps`, so each
# grouped final destination is a valid label. Checking this up front (instead of
# dropping rows from the label Series alone) guarantees that features and labels
# can never drift out of alignment.
assert train_data['Dest'].isin(distinct_camps).all(), "unexpected training label"

# LabelEncoder codes start at 0 and pad_sequences also pads with 0, so the raw
# codes would make the first location indistinguishable from padding. Shift the
# feature codes by one so that 0 is reserved for "no movement at this position".
# The stored Origin_Encoded columns and the label codes are left untouched.
train_sequences = [[code + 1 for code in seq] for seq in train_data['Origin_Encoded']]
test_sequences = [[code + 1 for code in seq] for seq in test_data['Origin_Encoded']]

# Prepare feature matrix (sequence of movements) and labels (final camp destination)
X_train = pad_sequences(train_sequences, padding='post')
y_train = location_encoder.transform(train_data['Dest'])
assert len(X_train) == len(y_train), "features and labels must have the same number of rows"

# Test rows are padded to the training width; longer sequences are cut from the
# front, which is the pad_sequences default (truncating='pre').
X_test = pad_sequences(test_sequences, maxlen=X_train.shape[1], padding='post')

# Train Random Forest model (fixed seed for reproducible predictions)
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

# Predict the camp names for unknown records
predicted_camps_encoded = rf_model.predict(X_test)

# Decode predictions back to camp names
predicted_camps = location_encoder.inverse_transform(predicted_camps_encoded)

# Assign predictions to the unknown samples
test_data['Predicted_Camp'] = predicted_camps

# Save results to a CSV file
test_data.to_csv("predicted_camp_locations.csv", index=False)

print("Predictions saved to predicted_camp_locations.csv")


Predictions saved to predicted_camp_locations.csv


In [8]:
# Number of rows in test_data, i.e. distinct IDs with at least one unknown-destination row
test_data.shape[0]

47736