In [32]:
from pathlib import Path
import subprocess
import pandas as pd
import numpy as np
import json

In [33]:
# Markov Wrapper functions
def train_markov_model(train_csv, save_dir, state_size=2):
    """Train the Markov model through ``markov/train.py`` unless one is already saved.

    Args:
        train_csv: Training CSV, forwarded to the script as ``--data_csv``.
        save_dir: Directory that holds the model; created when missing.
        state_size: Forwarded to the script as ``--state_size``.

    Returns:
        Path: ``save_dir / "model.json"`` (whether freshly trained or reused).

    Raises:
        subprocess.CalledProcessError: if the training script exits non-zero.
    """
    target_dir = Path(save_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    model_json = target_dir / "model.json"

    # An existing model.json is treated as "already trained".
    if not model_json.exists():
        command = ["python3", "markov/train.py"]
        command += ["--data_csv", str(train_csv)]
        command += ["--save_dir", str(target_dir)]
        command += ["--state_size", str(state_size)]
        subprocess.run(command, check=True)
        print(f"‚úÖ Model trained and saved to {model_json}")
    else:
        print(f"‚è≠Ô∏è Model already trained at {model_json}, skipping training.")

    return model_json


def evaluate_perplexity(model_path, data_dir, run_dir, output_dir):
    """Run ``markov/perplexity.py`` over ``data_dir`` unless results look complete.

    Results are considered complete when ``output_dir`` holds as many ``*.csv``
    files as ``data_dir`` does.

    Args:
        model_path: Trained model file, forwarded as ``--model``.
        data_dir: Directory of input CSVs, forwarded as ``--data_dir``.
        run_dir: Not used by this function.
        output_dir: Destination directory (created when missing).

    Raises:
        subprocess.CalledProcessError: if the script exits non-zero.
    """
    source_dir = Path(data_dir)
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Completion is judged purely by comparing CSV counts.
    n_inputs = len(list(source_dir.glob("*.csv")))
    n_outputs = len(list(target_dir.glob("*.csv")))

    if n_inputs != n_outputs:
        print(f"‚öôÔ∏è Running perplexity on {n_inputs} input files...")
        command = [
            "python3", "markov/perplexity.py",
            "--model", str(model_path),
            "--data_dir", str(source_dir),
            "--output_dir", str(target_dir),
        ]
        subprocess.run(command, check=True)
        print(f"‚úÖ Perplexity results saved to {target_dir}")
        return

    print(f"‚è≠Ô∏è Perplexity already computed for all files in {target_dir}, skipping.")



def test_markov_model(model_path, data_dir, run_dir, output_dir):
    data_dir = Path(data_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    input_files = list(data_dir.glob("*.csv"))
    output_files = list(output_dir.glob("*.csv"))

    if len(input_files) == len(output_files):
        print(f"‚è≠Ô∏è Test results already exist in {output_dir}, skipping.")
        return

    print(f"‚öôÔ∏è Running top-k test on {len(input_files)} input files...")
    subprocess.run([
        "python3", "markov/test.py",
        "--model", str(model_path),
        "--data_dir", str(data_dir),
        "--output_dir", str(output_dir),
        "--mode", "topk",
        "--k_values", "1", "5", "10"
    ], check=True)

    print(f"‚úÖ Test results saved to {output_dir}")

def train_lstpm_model(train_csv, save_dir, nepochs=25):
    """
    Preprocess + train LSTPM model, with skip checks.

    The run directory is ``save_dir.parent``; preprocessing output
    (``preprocessed/``, ``metadata.json``, ``distance.pkl``) is expected there.

    Args:
        train_csv (str or Path): path to training_set.csv. Its parent directory
            is handed to the preprocessing script as ``--in_dir``.
        save_dir (str or Path): directory to save model
        nepochs (int): forwarded to the training script as ``--epochs``.

    Returns:
        Path: path to the trained model file ``save_dir / "res.m"``.

    Raises:
        RuntimeError: if the preprocessing script exits non-zero.
        subprocess.CalledProcessError: if the training script exits non-zero.
    """
    # Accept plain strings as well as Path objects (``.parent`` / ``.stem`` are used below).
    train_csv = Path(train_csv)
    save_dir = Path(save_dir)
    run_dir = save_dir.parent
    preprocessed_dir = run_dir / "preprocessed"
    metadata_path = run_dir / "metadata.json"
    distance_path = run_dir / "distance.pkl"

    model_path = save_dir / "res.m"

    # ----- 1. Check if model already trained -----
    if model_path.exists():
        print(f"‚è≠Ô∏è LSTPM model already trained at {model_path}, skipping training.")
        return model_path

    # ----- 2. Check if preprocessed already done -----
    # Preprocessing counts as done when there is one .pk per input .csv (and at least one).
    input_csvs = list(train_csv.parent.glob("*.csv"))
    pk_files = list(preprocessed_dir.glob("*.pk"))

    if pk_files and len(input_csvs) == len(pk_files):
        print(f"‚è≠Ô∏è Preprocessed data already present in {preprocessed_dir}, skipping preprocessing.")
    else:
        print("‚öôÔ∏è Preprocessing LSTPM data...")
        # Output is captured so that it can be shown only when the script fails.
        result = subprocess.run([
            "python3", "LSTPM/train/preprocess.py",
            "--in_dir", str(train_csv.parent),
            "--training_set_name", train_csv.stem,
            "--out_dir", str(run_dir)
        ], capture_output=True, text=True)
        if result.returncode != 0:
            print("‚ùå Preprocessing failed!")
            print("STDOUT:\n", result.stdout)
            print("STDERR:\n", result.stderr)
            raise RuntimeError("Preprocessing failed.")
        print(f"‚úÖ Preprocessing completed, files saved to {preprocessed_dir}")

    # ----- 3. Train model -----
    print("üéØ Training LSTPM...")
    subprocess.run([
        "python3", "LSTPM/train/train.py",
        "--data_pk", str(preprocessed_dir / f"{train_csv.stem}.pk"),
        "--metadata_json", str(metadata_path),
        "--distance", str(distance_path),
        "--save_dir", str(save_dir),
        "--batch_size", "512",
        "--epochs", str(nepochs)
    ], check=True)
    print(f"‚úÖ LSTPM model saved at {model_path}")

    return model_path

def test_lstpm_model(model_path, data_dir, run_dir, output_dir):
    output_dir = Path(output_dir)
    if len(list(output_dir.glob("*.csv"))) == len(list(Path(data_dir).glob("*.csv"))):
        print(f"‚è≠Ô∏è LSTPM test results already exist in {output_dir}")
        return

    print(f"üìä Testing LSTPM...")
    subprocess.run([
        "python3", "LSTPM/train/test.py",
        "--data_dir", str(Path(run_dir) / "preprocessed"),
        "--model_m", str(model_path),
        "--distance", str(model_path.parent.parent / "distance.pkl"),
        "--mode", "topk",
        "--k_values", "1", "5", "10",
        "--output", str(output_dir)
    ], check=True)

def evaluate_lstpm_perplexity(model_path, data_dir, run_dir, output_dir):
    """Run the LSTPM perplexity script unless results look complete.

    Results are considered complete when ``output_dir`` holds as many ``*.csv``
    files as ``run_dir/preprocessed`` holds ``*.pk`` files.

    Args:
        model_path (Path): Trained model file; ``distance.pkl`` is looked up two
            directory levels above it.
        data_dir: Not used by this function.
        run_dir: Run directory containing the ``preprocessed`` sub-directory.
        output_dir: Destination passed to the script as ``--output``.

    Raises:
        subprocess.CalledProcessError: if the script exits non-zero.
    """
    output_dir = Path(output_dir)
    preprocessed_dir = Path(run_dir) / "preprocessed"

    n_results = sum(1 for _ in output_dir.glob("*.csv"))
    n_inputs = sum(1 for _ in Path(preprocessed_dir).glob("*.pk"))
    if n_results == n_inputs:
        print(f"‚è≠Ô∏è LSTPM perplexity already computed for {output_dir}")
        return

    print("üìà Evaluating LSTPM perplexity...")
    distance_file = model_path.parent.parent / "distance.pkl"
    command = [
        "python3", "LSTPM/train/perplexity.py",
        "--data_dir", str(preprocessed_dir),
        "--model_m", str(model_path),
        "--distance", str(distance_file),
        "--output", str(output_dir),
    ]
    subprocess.run(command, check=True)

# DeepMove Wrapper functions

def train_deepmove_model(train_csv, save_dir, model_type, epoch_max=40):
    """Preprocess (if needed) and train a DeepMove model, with skip checks.

    The run directory is ``save_dir.parent``; ``preprocessed/`` and
    ``metadata.json`` are expected there after preprocessing.

    Args:
        train_csv (str or Path): Training CSV. Its parent directory is handed to
            the preprocessing script as ``--in_dir``.
        save_dir (str or Path): Directory the model is saved to.
        model_type (str): Forwarded to ``DeepMove/codes/main.py`` as ``--model_mode``.
        epoch_max (int): Forwarded as ``--epoch_max``. Defaults to 40.

    Returns:
        Path: ``save_dir / "res.m"``.

    Raises:
        subprocess.CalledProcessError: if preprocessing or training exits non-zero.
    """
    # Accept plain strings as well as Path objects (``.stem`` / ``.parent`` are used below).
    train_csv = Path(train_csv)
    save_dir = Path(save_dir)
    run_dir = save_dir.parent
    preprocessed_dir = run_dir / "preprocessed"
    metadata_path = run_dir / "metadata.json"
    train_pk = preprocessed_dir / f"{train_csv.stem}.pk"
    model_path = save_dir / "res.m"

    # Skip if model already trained
    if model_path.exists():
        print(f"‚è≠Ô∏è DeepMove ({model_type}) model already trained at {save_dir}, skipping.")
        return model_path

    # Skip preprocessing if already done (judged by the training .pk only)
    if not train_pk.exists():
        print("‚öôÔ∏è Preprocessing DeepMove data...")
        subprocess.run([
            "python3", "DeepMove/codes/preprocess.py",
            "--in_dir", str(train_csv.parent),
            "--training_set_name", train_csv.stem,
            "--out_dir", str(run_dir)
        ], check=True)
    else:
        print(f"‚è≠Ô∏è Preprocessed file {train_pk} already exists, skipping preprocessing.")

    # Train
    print(f"üéØ Training DeepMove model ({model_type})...")
    subprocess.run([
        "python3", "DeepMove/codes/main.py",
        "--metadata_json", str(metadata_path),
        "--model_mode", model_type,
        "--data_path", str(train_pk),
        "--epoch_max", str(epoch_max),
        "--save_dir", str(save_dir),
        "--pretrain", "0"
    ], check=True)

    return model_path


def test_deepmove_model(model_path, data_dir, run_dir, output_dir, model_type):
    output_dir = Path(output_dir)
    preprocessed_dir = Path(run_dir) / "preprocessed"
    if len(list(output_dir.glob("*.csv"))) == len(list(Path(preprocessed_dir).glob("*.pk"))):
        print(f"‚è≠Ô∏è DeepMove test results already exist in {output_dir}")
        return

    print(f"üìä Testing DeepMove ({model_type})...")
    subprocess.run([
        "python3", "DeepMove/codes/test.py",
        "--metadata_json", str(model_path.parent.parent / "metadata.json"),
        "--model_mode", model_type,
        "--model_path", str(model_path),
        "--data_dir", str(preprocessed_dir),
        "--mode", "topk",
        "--k_values", "1", "5", "10", "20",
        "--output", str(output_dir)
    ], check=True)



def perplexity_deepmove(model_path, data_dir, run_dir, output_dir, model_type):
    """Compute DeepMove perplexities, one script call per preprocessed ``.pk`` file.

    The preprocessed directory and ``metadata.json`` are located two levels above
    ``model_path``; the ``run_dir`` and ``data_dir`` arguments are not read.
    Nothing is run when ``output_dir`` already holds as many ``*.csv`` files as
    there are ``*.pk`` inputs; otherwise only inputs without a matching
    ``<stem>.csv`` are processed.

    Args:
        model_path (Path): Trained model file.
        data_dir: Not used by this function.
        run_dir: Not used by this function.
        output_dir: Destination directory (created when missing).
        model_type (str): Forwarded as ``--model_mode``.

    Raises:
        subprocess.CalledProcessError: if the script exits non-zero.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    experiment_root = model_path.parent.parent
    preprocessed_dir = Path(experiment_root / "preprocessed")
    pk_inputs = list(preprocessed_dir.glob("*.pk"))
    n_results = len(list(output_dir.glob("*.csv")))

    if len(pk_inputs) == n_results:
        print(f"‚è≠Ô∏è DeepMove perplexity already computed for all files in {output_dir}")
        return

    print(f"üìà Evaluating DeepMove ({model_type}) perplexity on {len(pk_inputs)} files...")

    metadata_file = str(experiment_root / "metadata.json")
    for pk_file in pk_inputs:
        out_file = output_dir / f"{pk_file.stem}.csv"
        if not out_file.exists():
            print(f"Computing perplexity for {pk_file.name}...")
            command = [
                "python3", "DeepMove/codes/perplexity.py",
                "--metadata_json", metadata_file,
                "--model_mode", model_type,
                "--model_path", str(model_path),
                "--data_pk", str(pk_file),
                "--output", str(out_file),
            ]
            subprocess.run(command, check=True)
        else:
            print(f"‚è≠Ô∏è Skipping already computed file: {out_file.name}")

    print("‚úÖ DeepMove perplexity evaluation completed for all new files.")

In [38]:
# Dataset name -> {experiment type -> directory holding training_set.csv and the
# reference CSVs}. Commented entries are experiment types that are currently disabled.
# NOTE(review): these are absolute, machine-specific paths; adjust before running elsewhere.
DATASETS = {
    "ShenzhenUrban": {
        # "type1": "/home/akouamdj/mobleak-datasets/PreprocessedData/2SplittedData/ShenzhenUrban/NormalizationType1/Datasets",
        # "canaries": "/home/akouamdj/mobleak-datasets/PreprocessedData/2SplittedData/ShenzhenUrban/NormalizationType1/Canaries",
        # "type2_home": "/home/akouamdj/mobleak-datasets/PreprocessedData/2SplittedData/ShenzhenUrban/NormalizationType2/Home/Datasets",
        # "type2_work": "/home/akouamdj/mobleak-datasets/PreprocessedData/2SplittedData/ShenzhenUrban/NormalizationType2/Work/Datasets",
        "type3": "/home/akouamdj/mobleak-datasets/PreprocessedData/2SplittedData/ShenzhenUrban/NormalizationType3/Datasets",
    },
    "ShanghaiKaggle": {
        # "type1": "/home/akouamdj/mobleak-datasets/PreprocessedData/2SplittedData/ShanghaiKaggle/NormalizationType1/Datasets",
        # "canaries": "/home/akouamdj/mobleak-datasets/PreprocessedData/2SplittedData/ShanghaiKaggle/NormalizationType1/Canaries",
        # "type2_home": "/home/akouamdj/mobleak-datasets/PreprocessedData/2SplittedData/ShanghaiKaggle/NormalizationType2/Home/Datasets",
        # "type2_work": "/home/akouamdj/mobleak-datasets/PreprocessedData/2SplittedData/ShanghaiKaggle/NormalizationType2/Work/Datasets",
        "type3": "/home/akouamdj/mobleak-datasets/PreprocessedData/2SplittedData/ShanghaiKaggle/NormalizationType3/Datasets"
    },
    "YJMob100Kv3": {
        # "type1": "/home/akouamdj/mobleak-datasets/PreprocessedData/2SplittedData/YJMob100Kv3/NormalizationType1/Datasets",
        # "canaries": "/home/akouamdj/mobleak-datasets/PreprocessedData/2SplittedData/YJMob100Kv3/NormalizationType1/Canaries",
        # "type2_home": "/home/akouamdj/mobleak-datasets/PreprocessedData/2SplittedData/YJMob100Kv3/NormalizationType2/Home/Datasets",
        # "type2_work": "/home/akouamdj/mobleak-datasets/PreprocessedData/2SplittedData/YJMob100Kv3/NormalizationType2/Work/Datasets",
        "type3": "/home/akouamdj/mobleak-datasets/PreprocessedData/2SplittedData/YJMob100Kv3/NormalizationType3/Datasets"
    },
}


def _deepmove_handlers(model_type):
    """Build the train/test/perplexity hooks for one DeepMove ``model_type``.

    The returned callables have the same positional signatures as the Markov and
    LSTPM hooks, with ``model_type`` bound. Note that the second argument of the
    ``train`` hook is passed on as ``save_dir`` of ``train_deepmove_model``.
    """
    return {
        "train": lambda train_path, run_dir: train_deepmove_model(train_path, run_dir, model_type),
        "test": lambda model_path, data_dir, run_dir, output_dir: test_deepmove_model(model_path, data_dir, run_dir, output_dir, model_type),
        "perplexity": lambda model_path, data_dir, run_dir, output_dir: perplexity_deepmove(model_path, data_dir, run_dir, output_dir, model_type),
    }


# Model name -> hooks. Every entry exposes:
#   "train"(train_csv, model_dir) -> model_path
#   "test"(model_path, data_dir, run_dir, output_dir)
#   "perplexity"(model_path, data_dir, run_dir, output_dir)
MODELS = {
    "markov": {
        "train": train_markov_model,
        "test": test_markov_model,
        "perplexity": evaluate_perplexity,
    },
    "lstpm": {
        "train": train_lstpm_model,
        "test": test_lstpm_model,
        "perplexity": evaluate_lstpm_perplexity,
    },
    "deepmove_simple": _deepmove_handlers("simple"),
    "deepmove_simple_long": _deepmove_handlers("simple_long"),
    "deepmove_attn_avg_long_user": _deepmove_handlers("attn_avg_long_user"),
    "deepmove_attn_local_long": _deepmove_handlers("attn_local_long"),
}

# Unified results directory
OUTPUT_ROOT = Path("results/")



In [35]:
def compute_memorization_metrics(perplexity_dir, mapping_file):
    """
    Compute memorization metrics of each training trajectory against its reference file.

    ``perplexity_dir`` must contain the training perplexities
    (``training_set_perplexity.csv`` or ``training_set.csv``) plus one CSV per
    reference cluster/device. Every CSV needs a ``perplexity`` column and an id
    column named ``tid`` or ``user``.

    ``mapping_file`` is a CSV in one of two layouts:
      * columns ``cluster_file`` and ``representant_tid``; or
      * ("type 3") columns ``device_id``, ``training_tid``, ``reference_tid`` and
        ``perturbation``. In this layout the metrics are additionally computed
        per perturbation (``substitute``, ``stationary``, ``shuffle``) and stored
        in ``<metric>_<perturbation>`` columns.

    For a training perplexity ``p`` and reference perplexities ``r`` (``N = len(r)``):
      * ``rank = #(r <= p) + 1``
      * ``exposure = log2(N) - log2(rank)``
      * ``percentile = (rank - 1) / N``
      * ``gap = p - mean(r)``

    A perturbation with no matching reference rows yields NaN for its mean,
    exposure, percentile and gap (instead of ``-inf``/NaN plus numpy warnings).

    Args:
        perplexity_dir (str or Path): directory with the perplexity CSVs.
        mapping_file (str or Path): CSV mapping reference files to training ids.

    Returns:
        pandas.DataFrame: one row per reference file whose training id is found;
        files are visited in sorted name order.
    """
    perplexity_dir = Path(perplexity_dir)

    def _rank_metrics(ref_perps, train_perp):
        """Return (mean, exposure, percentile, gap) of ``train_perp`` against ``ref_perps``."""
        n_ref = len(ref_perps)
        if n_ref == 0:
            # Nothing to rank against: report "undefined" rather than -inf / 0-division.
            return np.nan, np.nan, np.nan, np.nan
        ref_mean = np.mean(ref_perps)
        rank = np.sum(ref_perps <= train_perp) + 1
        exposure = np.log2(n_ref) - np.log2(rank)
        percentile = (rank - 1) / n_ref
        gap = train_perp - ref_mean
        return ref_mean, exposure, percentile, gap

    # Support both filenames
    training_perp_path = perplexity_dir / "training_set_perplexity.csv"
    if not training_perp_path.exists():
        training_perp_path = perplexity_dir / "training_set.csv"

    training_df = pd.read_csv(training_perp_path)

    # Support both column names: 'tid' or 'user'
    id_col = "tid" if "tid" in training_df.columns else "user"

    training_dict = training_df.set_index(id_col)["perplexity"].to_dict()
    mapping_df = pd.read_csv(mapping_file)

    perturbations = ['substitute', 'stationary', 'shuffle']
    tids_by_perturbation = {}

    if "cluster_file" in mapping_df.columns:
        type3 = False
        mapping_dict = mapping_df.set_index("cluster_file")["representant_tid"].to_dict()
    else:
        # type 3
        type3 = True
        mapping_df['reference_file'] = mapping_df['device_id'].apply(lambda x: f"{x}.csv")
        mapping_dict = mapping_df.set_index("reference_file")["training_tid"].to_dict()
        # The per-perturbation reference ids do not depend on the file being
        # processed, so compute them once instead of once per reference file.
        tids_by_perturbation = {
            perturbation: mapping_df.loc[mapping_df['perturbation'] == perturbation, 'reference_tid']
            for perturbation in perturbations
        }

    rows = []

    # Sorted so that the row order of the result is reproducible across runs.
    for ref_file in sorted(perplexity_dir.glob("*.csv")):
        # "<name>_perplexity.csv" and "<name>.csv" both map to the key "<name>.csv".
        cluster_id = ref_file.stem.replace("_perplexity", "") + ".csv"
        if cluster_id == "training_set.csv":
            continue
        ref_df = pd.read_csv(ref_file)
        if ref_df.empty:
            continue

        # Adapt to tid/user here as well
        ref_id_col = "tid" if "tid" in ref_df.columns else "user"

        training_tid_val = mapping_dict.get(cluster_id)
        if training_tid_val not in training_dict:
            continue

        train_perp = training_dict[training_tid_val]

        result_row = {
            "tid": training_tid_val,
            "cluster_id": cluster_id,
        }

        if type3:
            for perturbation in perturbations:
                in_perturbation = ref_df[ref_id_col].isin(tids_by_perturbation[perturbation])
                ref_perps = ref_df.loc[in_perturbation, "perplexity"].values
                ref_mean, exposure, percentile, gap = _rank_metrics(ref_perps, train_perp)

                result_row.update({
                    f"train_perplexity_{perturbation}": train_perp,
                    f"mean_ref_perplexity_{perturbation}": ref_mean,
                    f"exposure_{perturbation}": exposure,
                    f"percentile_{perturbation}": percentile,
                    f"gap_{perturbation}": gap,
                })

        # General (non-perturbed) cluster: all rows of the reference file.
        ref_mean, exposure, percentile, gap = _rank_metrics(ref_df["perplexity"].values, train_perp)

        result_row.update({
            "train_perplexity": train_perp,
            "mean_ref_perplexity": ref_mean,
            "exposure": exposure,
            "percentile": percentile,
            "gap": gap,
        })

        rows.append(result_row)

    return pd.DataFrame(rows)


def compute_memorization_per_window(perplexity_dir, mapping_file):
    """
    Compute memorization metrics per hour of day for each training trajectory.

    ``perplexity_dir`` must contain the training perplexities
    (``training_set_perplexity.csv`` or ``training_set.csv``) plus one CSV per
    reference device. ``mapping_file`` is a CSV with (at least) the columns
    ``device_id``, ``training_tid``, ``reference_tid`` and ``window_start_hour``.

    Reference trajectories are grouped by ``window_start_hour % 24``. Within each
    group, for training perplexity ``p`` and reference perplexities ``r``:
    ``rank = #(r <= p) + 1``, ``exposure = log2(N) - log2(rank)``,
    ``percentile = (rank - 1) / N`` and ``gap = p - mean(r)``.

    Reference rows whose id does not appear in the mapping are ignored. (They
    used to receive the sentinel hour ``-1``, which ``% 24`` turned into 23, so
    they were silently counted in the 23h bucket.)

    Args:
        perplexity_dir (str or Path): directory with the perplexity CSVs.
        mapping_file (str or Path): CSV describing the reference trajectories.

    Returns:
        pandas.DataFrame: one row per (reference file, hour of day); files are
        visited in sorted name order.
    """
    perplexity_dir = Path(perplexity_dir)
    training_perp_path = perplexity_dir / "training_set_perplexity.csv"
    if not training_perp_path.exists():
        training_perp_path = perplexity_dir / "training_set.csv"

    training_df = pd.read_csv(training_perp_path)
    id_col = "tid" if "tid" in training_df.columns else "user"
    training_dict = training_df.set_index(id_col)["perplexity"].to_dict()

    mapping_df = pd.read_csv(mapping_file)
    reference_files = mapping_df['device_id'].apply(lambda x: f"{x}.csv")
    # On repeated keys the last row wins, as with ``Series.to_dict``.
    mapping_dict = dict(zip(reference_files, mapping_df["training_tid"]))

    # Start hour of the time window of each reference_tid
    start_hour_by_tid = dict(zip(mapping_df["reference_tid"], mapping_df["window_start_hour"]))
    known_tids = list(start_hour_by_tid)

    rows = []

    # Sorted so that the row order of the result is reproducible across runs.
    for ref_file in sorted(perplexity_dir.glob("*.csv")):
        # "<name>_perplexity.csv" and "<name>.csv" both map to the key "<name>.csv".
        cluster_id = ref_file.stem.replace("_perplexity", "") + ".csv"
        if cluster_id == "training_set.csv":
            continue

        ref_df = pd.read_csv(ref_file)
        if ref_df.empty:
            continue

        ref_id_col = "tid" if "tid" in ref_df.columns else "user"
        training_tid = mapping_dict.get(cluster_id)
        if training_tid not in training_dict:
            continue

        train_perp = training_dict[training_tid]

        # Keep only trajectories with known window info; filtering BEFORE the
        # modulo is what keeps unmapped rows out of the 23h bucket.
        windowed = ref_df.loc[ref_df[ref_id_col].isin(known_tids)].copy()
        if windowed.empty:
            continue
        windowed["window_start_hour"] = windowed[ref_id_col].map(start_hour_by_tid)
        windowed["hour_of_day"] = windowed["window_start_hour"] % 24  # hour of day (0-23)

        for hour, sub_df in windowed.groupby("hour_of_day"):
            ref_perps = sub_df["perplexity"].values
            if len(ref_perps) == 0:
                continue

            ref_mean = np.mean(ref_perps)
            rank = np.sum(ref_perps <= train_perp) + 1
            exposure = np.log2(len(ref_perps)) - np.log2(rank)
            percentile = (rank - 1) / len(ref_perps)
            gap = train_perp - ref_mean

            rows.append({
                "tid": training_tid,
                "cluster_id": cluster_id,
                "hour_of_day": hour,
                "train_perplexity": train_perp,
                "mean_ref_perplexity": ref_mean,
                "exposure": exposure,
                "percentile": percentile,
                "gap": gap
            })

    return pd.DataFrame(rows)



def compute_carlini_memorization_metric(perplexity_dir, canary_id="canary1"):
    """
    Compute the exposure of a single canary against a reference set.

    The canary perplexity ``p`` is read from the training file and ranked against
    the non-NaN reference perplexities ``r`` (``N = len(r)``):
    ``rank = #(r <= p) + 1``, ``exposure = log2(N) - log2(rank)``,
    ``percentile = (rank - 1) / N`` and ``gap = p - mean(r)``.

    Args:
        perplexity_dir (str or Path): Directory containing the training file
            (``training_set_perplexity.csv`` or ``training_set.csv``) and the
            reference file (``reference_set_perplexity.csv`` or ``reference_set.csv``).
        canary_id (str): The tid/user ID of the inserted canary.

    Returns:
        dict: ``canary_id``, ``canary_perplexity``, ``mean_ref_perplexity``,
        ``exposure``, ``percentile``, ``gap`` and ``num_reference``.

    Raises:
        FileNotFoundError: if neither name of the training or reference file exists.
        ValueError: if the canary is not in the training file, or no usable
            reference perplexity is left.
    """
    perplexity_dir = Path(perplexity_dir)

    def _locate(stem):
        """Return the first existing of ``<stem>_perplexity.csv`` / ``<stem>.csv``."""
        for filename in (f"{stem}_perplexity.csv", f"{stem}.csv"):
            candidate = perplexity_dir / filename
            if candidate.exists():
                return candidate
        raise FileNotFoundError(f"Missing both {stem}_perplexity.csv and {stem}.csv")

    # Locate both files before reading either of them.
    train_file = _locate("training_set")
    ref_file = _locate("reference_set")

    df_train = pd.read_csv(train_file)
    df_ref = pd.read_csv(ref_file)

    # The id column is called 'tid' or 'user' depending on the model.
    id_col = "user"
    if "tid" in df_train.columns:
        id_col = "tid"

    # Canary perplexity
    indexed_train = df_train.set_index(id_col)
    try:
        canary_perp = indexed_train.loc[canary_id]["perplexity"]
    except KeyError:
        raise ValueError(f"Canary ID '{canary_id}' not found in training set.")

    # Reference perplexities
    ref_perps = df_ref["perplexity"].dropna().values
    num_reference = len(ref_perps)
    if num_reference == 0:
        raise ValueError("Reference set is empty or invalid.")

    # === Compute metrics ===
    ref_mean = np.mean(ref_perps)
    rank = np.sum(ref_perps <= canary_perp) + 1
    exposure = np.log2(num_reference) - np.log2(rank)
    percentile = (rank - 1) / num_reference
    gap = canary_perp - ref_mean

    return {
        "canary_id": canary_id,
        "canary_perplexity": float(canary_perp),
        "mean_ref_perplexity": float(ref_mean),
        "exposure": float(exposure),
        "percentile": float(percentile),
        "gap": float(gap),
        "num_reference": int(num_reference),
    }



In [36]:
def run_memorization_test(dataset_name, type_name, dataset_path, model_name, test=False):
    """Run train -> perplexity -> metrics (-> optional test) for one model/dataset/type.

    All artefacts go below ``OUTPUT_ROOT / dataset_name / model_name / type_name``
    (``model/``, ``perplexity/``, ``test/`` and the metric files). Metric files
    that already exist are not recomputed.

    Args:
        dataset_name (str): Dataset label, used in the output path.
        type_name (str): Experiment type. ``"canaries"`` writes
            ``carlini_exposure.json``; every other type writes
            ``memorization_metrics.csv``, and ``"type3"`` additionally writes
            ``memorization_metrics_per_window.csv``.
        dataset_path (str or Path): Directory containing ``training_set.csv`` and
            ``representant_mapping.txt``.
        model_name (str): Key into ``MODELS``.
        test (bool): Also run the model's ``"test"`` hook when true.

    Returns:
        None. Returns early (after a warning) when ``training_set.csv`` is missing.
    """
    print(f"\nüöÄ Running: {model_name.upper()} | {dataset_name} | {type_name}")

    model = MODELS[model_name]
    dataset_path = Path(dataset_path)
    training_file = dataset_path / "training_set.csv"
    mapping_file = dataset_path / "representant_mapping.txt"

    if not training_file.exists():
        print(f"‚ö†Ô∏è No training set found in {dataset_path}")
        return

    run_dir = OUTPUT_ROOT / dataset_name / model_name / type_name
    model_dir = run_dir / "model"
    perplexity_dir = run_dir / "perplexity"
    test_dir = run_dir / "test"

    # Train
    model_path = model["train"](training_file, model_dir)

    # Perplexity
    model["perplexity"](model_path, dataset_path, run_dir, perplexity_dir)

    # Metrics
    # The metric files are written directly into run_dir, so make sure it exists
    # even if the hooks above did not create it.
    run_dir.mkdir(parents=True, exist_ok=True)

    if type_name == "canaries":
        metrics_path = run_dir / "carlini_exposure.json"
        if metrics_path.exists():
            print(f"‚è≠Ô∏è Metrics already exist at {metrics_path}")
        else:
            result = compute_carlini_memorization_metric(perplexity_dir, canary_id="canary1")
            with open(metrics_path, "w") as f:
                json.dump(result, f, indent=2)
            print(f"‚úÖ Carlini-style exposure saved to: {metrics_path}")
    else:
        metrics_path = run_dir / "memorization_metrics.csv"
        if metrics_path.exists():
            print(f"‚è≠Ô∏è Metrics already exist at {metrics_path}")
        else:
            metrics_df = compute_memorization_metrics(perplexity_dir, mapping_file)
            metrics_df.to_csv(metrics_path, index=False)
            # Only announce a save when something was actually written.
            print(f"‚úÖ Metrics saved to: {metrics_path}")

        if type_name == "type3":
            per_window_path = run_dir / "memorization_metrics_per_window.csv"
            if per_window_path.exists():
                print(f"‚è≠Ô∏è Per-window metrics already exist at {per_window_path}")
            else:
                per_window_df = compute_memorization_per_window(perplexity_dir, mapping_file)
                per_window_df.to_csv(per_window_path, index=False)
                print(f"‚úÖ Per-window metrics saved to: {per_window_path}")

    # Test
    if test:
        model["test"](model_path, dataset_path, run_dir, test_dir)

In [None]:
# Models to run; every model is run on every enabled dataset / experiment type.
# Shorten this list (e.g. to ["deepmove_attn_local_long"]) to run a single model.
ALL_MODELS = [
    "markov",
    "lstpm",
    "deepmove_simple",
    "deepmove_simple_long",
    "deepmove_attn_avg_long_user",
    "deepmove_attn_local_long",
]

# A failure in any run stops the whole sweep. (An earlier, disabled variant retried
# a failing run every 5 seconds; it would need `import time` to be re-enabled.)
for model_name in ALL_MODELS:
    for dataset_name in DATASETS:
        type_paths = DATASETS[dataset_name]
        for type_name in type_paths:
            path = type_paths[type_name]
            run_memorization_test(dataset_name, type_name, path, model_name, test=False)


üöÄ Running: MARKOV | ShenzhenUrban | type3
‚è≠Ô∏è Model already trained at results/ShenzhenUrban/markov/type3/model/model.json, skipping training.
‚öôÔ∏è Running perplexity on 2001 input files...
Loading Markov model from results/ShenzhenUrban/markov/type3/model/model.json...
Model loaded from results/ShenzhenUrban/markov/type3/model/model.json
Processing /home/akouamdj/mobleak-datasets/PreprocessedData/2SplittedData/ShenzhenUrban/NormalizationType3/Datasets/55567558.csv...
{'tid': '55567558_1386960_1387056_ref_win0_v0', 'perplexity': np.float64(2.0560791189841434)}
{'tid': '55567558_1386960_1387056_ref_win0_v1', 'perplexity': np.float64(2.0560791189841434)}
{'tid': '55567558_1386960_1387056_ref_win0_v10', 'perplexity': np.float64(2.0560791189841434)}
{'tid': '55567558_1386960_1387056_ref_win0_v11', 'perplexity': np.float64(2.0560791189841434)}
{'tid': '55567558_1386960_1387056_ref_win0_v12', 'perplexity': np.float64(2.0560791189841434)}
{'tid': '55567558_1386960_1387056_ref_win0_v1

In [None]:
# ALL_MODELS = ["markov", "lstpm", "deepmove_simple", "deepmove_simple_long", "deepmove_attn_avg_long_user", "deepmove_attn_local_long"]

# for model_name in ALL_MODELS:
#     for dataset_name, type_paths in DATASETS.items():
#         for type_name, path in type_paths.items():
#             run_memorization_test(dataset_name, type_name, path, model_name, test=True)


üöÄ Running: MARKOV | ShanghaiKaggle | type1
‚è≠Ô∏è Model already trained at results/ShanghaiKaggle/markov/type1/model/model.json, skipping training.
‚è≠Ô∏è Perplexity already computed for all files in results/ShanghaiKaggle/markov/type1/perplexity, skipping.
‚è≠Ô∏è Metrics already exist at results/ShanghaiKaggle/markov/type1/memorization_metrics.csv
‚úÖ Metrics saved to: results/ShanghaiKaggle/markov/type1/memorization_metrics.csv
‚è≠Ô∏è Test results already exist in results/ShanghaiKaggle/markov/type1/test, skipping.

üöÄ Running: LSTPM | ShanghaiKaggle | type1
‚è≠Ô∏è LSTPM model already trained at results/ShanghaiKaggle/lstpm/type1/model/res.m, skipping training.
‚è≠Ô∏è LSTPM perplexity already computed for results/ShanghaiKaggle/lstpm/type1/perplexity
‚è≠Ô∏è Metrics already exist at results/ShanghaiKaggle/lstpm/type1/memorization_metrics.csv
‚úÖ Metrics saved to: results/ShanghaiKaggle/lstpm/type1/memorization_metrics.csv
‚è≠Ô∏è LSTPM test results already exist in results/Shangh

  nn.init.xavier_uniform(t)
  nn.init.orthogonal(t)
  nn.init.constant(t, 0)


Output for directory processing will be in: results/ShanghaiKaggle/deepmove_attn_local_long/type1/test
Processing results/ShanghaiKaggle/deepmove_attn_local_long/type1/preprocessed/cluster_2051568_2051664.pk...
Processing results/ShanghaiKaggle/deepmove_attn_local_long/type1/preprocessed/cluster_91920_92016.pk...
Processing results/ShanghaiKaggle/deepmove_attn_local_long/type1/preprocessed/cluster_714816_714912.pk...
Processing results/ShanghaiKaggle/deepmove_attn_local_long/type1/preprocessed/cluster_981360_981456.pk...
Processing results/ShanghaiKaggle/deepmove_attn_local_long/type1/preprocessed/cluster_897648_897744.pk...
Processing results/ShanghaiKaggle/deepmove_attn_local_long/type1/preprocessed/cluster_2941392_2941488.pk...
Processing results/ShanghaiKaggle/deepmove_attn_local_long/type1/preprocessed/cluster_66720_66816.pk...
Processing results/ShanghaiKaggle/deepmove_attn_local_long/type1/preprocessed/cluster_1056816_1056912.pk...
Processing results/ShanghaiKaggle/deepmove_attn