In [None]:
from pathlib import Path
import subprocess
import pandas as pd
import numpy as np

In [None]:
# Markov Wrapper functions
def train_markov_model(train_csv, save_dir, state_size=2):
    """Train the Markov model through ``markov/train.py`` unless one exists.

    Args:
        train_csv: Training CSV path, forwarded as ``--data_csv``.
        save_dir: Directory handed to the script as ``--save_dir``; it is
            created (with parents) when missing.
        state_size: Value forwarded as ``--state_size``.

    Returns:
        Path: ``<save_dir>/model.json``, whether it was found on disk or the
        training script was just run.

    Raises:
        subprocess.CalledProcessError: If the training script exits non-zero.
    """
    target_dir = Path(save_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    model_json = target_dir / "model.json"

    if not model_json.exists():
        command = [
            "python3", "markov/train.py",
            "--data_csv", str(train_csv),
            "--save_dir", str(target_dir),
            "--state_size", str(state_size),
        ]
        subprocess.run(command, check=True)
        print(f"‚úÖ Model trained and saved to {model_json}")
    else:
        # A model file from an earlier run is reused as-is.
        print(f"‚è≠Ô∏è Model already trained at {model_json}, skipping training.")

    return model_json


def evaluate_perplexity(model_path, data_dir, output_dir):
    """Run ``markov/perplexity.py`` unless the outputs look complete.

    The output directory is created if needed. The script is skipped when
    ``output_dir`` already holds as many ``*.csv`` files as ``data_dir``.

    Args:
        model_path: Model file forwarded as ``--model``.
        data_dir: Directory of input CSVs, forwarded as ``--data_dir``.
        output_dir: Directory forwarded as ``--output_dir``.

    Raises:
        subprocess.CalledProcessError: If the script exits non-zero.
    """
    source_dir = Path(data_dir)
    results_dir = Path(output_dir)
    results_dir.mkdir(parents=True, exist_ok=True)

    # Completion is judged purely by comparing CSV counts on both sides.
    n_inputs = len(list(source_dir.glob("*.csv")))
    n_outputs = len(list(results_dir.glob("*.csv")))

    if n_inputs != n_outputs:
        print(f"‚öôÔ∏è Running perplexity on {n_inputs} input files...")
        command = [
            "python3", "markov/perplexity.py",
            "--model", str(model_path),
            "--data_dir", str(source_dir),
            "--output_dir", str(results_dir),
        ]
        subprocess.run(command, check=True)
        print(f"‚úÖ Perplexity results saved to {results_dir}")
    else:
        print(f"‚è≠Ô∏è Perplexity already computed for all files in {results_dir}, skipping.")



def test_markov_model(model_path, data_dir, output_dir):
    """Run the top-k evaluation script ``markov/test.py`` unless already done.

    The output directory is created if needed. The script is skipped when
    ``output_dir`` already holds as many ``*.csv`` files as ``data_dir``.

    Args:
        model_path: Model file forwarded as ``--model``.
        data_dir: Directory of input CSVs, forwarded as ``--data_dir``.
        output_dir: Directory forwarded as ``--output_dir``.

    Raises:
        subprocess.CalledProcessError: If the script exits non-zero.
    """
    source_dir = Path(data_dir)
    results_dir = Path(output_dir)
    results_dir.mkdir(parents=True, exist_ok=True)

    n_inputs = len(list(source_dir.glob("*.csv")))
    n_outputs = len(list(results_dir.glob("*.csv")))

    if n_inputs != n_outputs:
        print(f"‚öôÔ∏è Running top-k test on {n_inputs} input files...")
        command = [
            "python3", "markov/test.py",
            "--model", str(model_path),
            "--data_dir", str(source_dir),
            "--output_dir", str(results_dir),
            "--mode", "topk",
            "--k_values", "1", "5", "10",
        ]
        subprocess.run(command, check=True)
        print(f"‚úÖ Test results saved to {results_dir}")
    else:
        print(f"‚è≠Ô∏è Test results already exist in {results_dir}, skipping.")



def train_lstpm_model(train_csv, save_dir, nepochs=25):
    """
    Preprocess + train LSTPM model, with skip checks.

    The parent of ``save_dir`` is treated as the run directory: it holds
    ``preprocessed/``, ``metadata.json`` and ``distance.pkl``, which are
    passed to the training script.

    Args:
        train_csv (str | Path): path to the training CSV (e.g.
            training_set.csv). Every ``*.csv`` next to it counts as a
            preprocessing input.
        save_dir (str | Path): directory to save model
        nepochs (int): value forwarded to the training script as ``--epochs``

    Returns:
        Path: ``<save_dir>/res.m``

    Raises:
        RuntimeError: if the preprocessing script exits non-zero.
        subprocess.CalledProcessError: if the training script exits non-zero.
    """
    # Accept plain strings as well as Path objects, like save_dir.
    train_csv = Path(train_csv)
    save_dir = Path(save_dir)
    run_dir = save_dir.parent
    preprocessed_dir = run_dir / "preprocessed"
    metadata_path = run_dir / "metadata.json"
    distance_path = run_dir / "distance.pkl"
    train_pk = preprocessed_dir / f"{train_csv.stem}.pk"

    model_path = save_dir / "res.m"

    # ----- 1. Check if model already trained -----
    if model_path.exists():
        print(f"‚è≠Ô∏è LSTPM model already trained at {model_path}, skipping training.")
        return model_path

    # ----- 2. Check if preprocessed already done -----
    input_csvs = list(train_csv.parent.glob("*.csv"))
    pk_files = list(preprocessed_dir.glob("*.pk"))

    # Matching counts are not enough: training reads train_pk specifically,
    # so it must be present before preprocessing can be skipped.
    if train_pk.exists() and len(input_csvs) == len(pk_files):
        print(f"‚è≠Ô∏è Preprocessed data already present in {preprocessed_dir}, skipping preprocessing.")
    else:
        print("‚öôÔ∏è Preprocessing LSTPM data...")
        result = subprocess.run([
            "python3", "LSTPM/train/preprocess.py",
            "--in_dir", str(train_csv.parent),
            "--training_set_name", train_csv.stem,
            "--out_dir", str(run_dir)
        ], capture_output=True, text=True)
        if result.returncode != 0:
            # Output was captured, so replay it to make the failure visible.
            print("‚ùå Preprocessing failed!")
            print("STDOUT:\n", result.stdout)
            print("STDERR:\n", result.stderr)
            raise RuntimeError("Preprocessing failed.")
        print(f"‚úÖ Preprocessing completed, files saved to {preprocessed_dir}")

    # ----- 3. Train model -----
    print("üéØ Training LSTPM...")
    subprocess.run([
        "python3", "LSTPM/train/train.py",
        "--data_pk", str(train_pk),
        "--metadata_json", str(metadata_path),
        "--distance", str(distance_path),
        "--save_dir", str(save_dir),
        "--batch_size", "512",
        "--epochs", str(nepochs)
    ], check=True)
    print(f"‚úÖ LSTPM model saved at {model_path}")

    return model_path

def test_lstpm_model(model_path, data_dir, output_dir):
    """Run the LSTPM top-k evaluation script unless results look complete.

    The script is skipped when ``output_dir`` holds as many ``*.csv`` files
    as ``data_dir``. Otherwise ``LSTPM/train/test.py`` is run on
    ``<data_dir>/preprocessed`` with ``distance.pkl`` taken from two levels
    above ``model_path``.

    Args:
        model_path (Path): trained model file; ``.parent`` is used on it.
        data_dir: directory whose ``preprocessed`` sub-folder is evaluated.
        output_dir: location forwarded as ``--output``.

    Raises:
        subprocess.CalledProcessError: if the script exits non-zero.
    """
    results_dir = Path(output_dir)
    source_dir = Path(data_dir)

    n_results = len(list(results_dir.glob("*.csv")))
    n_sources = len(list(source_dir.glob("*.csv")))
    if n_results == n_sources:
        print(f"‚è≠Ô∏è LSTPM test results already exist in {results_dir}")
        return

    print("üìä Testing LSTPM...")
    distance_file = model_path.parent.parent / "distance.pkl"
    command = [
        "python3", "LSTPM/train/test.py",
        "--data_dir", str(source_dir / "preprocessed"),
        "--model_m", str(model_path),
        "--distance", str(distance_file),
        "--mode", "topk",
        "--k_values", "1", "5", "10",
        "--output", str(results_dir),
    ]
    subprocess.run(command, check=True)

def evaluate_lstpm_perplexity(model_path, data_dir, output_dir):
    """Run the LSTPM perplexity script unless results look complete.

    The script is skipped when ``output_dir`` holds as many ``*.csv`` files
    as ``<data_dir>/preprocessed`` holds ``*.pk`` files. Otherwise
    ``LSTPM/train/perplexity.py`` is run with ``distance.pkl`` taken from two
    levels above ``model_path``.

    Args:
        model_path (Path): trained model file; ``.parent`` is used on it.
        data_dir: directory containing the ``preprocessed`` sub-folder.
        output_dir: location forwarded as ``--output``.

    Raises:
        subprocess.CalledProcessError: if the script exits non-zero.
    """
    results_dir = Path(output_dir)
    pk_dir = Path(data_dir) / "preprocessed"

    n_results = len(list(results_dir.glob("*.csv")))
    n_inputs = len(list(pk_dir.glob("*.pk")))
    if n_results == n_inputs:
        print(f"‚è≠Ô∏è LSTPM perplexity already computed for {results_dir}")
        return

    print("üìà Evaluating LSTPM perplexity...")
    distance_file = model_path.parent.parent / "distance.pkl"
    command = [
        "python3", "LSTPM/train/perplexity.py",
        "--data_dir", str(pk_dir),
        "--model_m", str(model_path),
        "--distance", str(distance_file),
        "--output", str(results_dir),
    ]
    subprocess.run(command, check=True)

# DeepMove Wrapper functions

def train_deepmove_model(train_csv, save_dir, model_type, epoch_max=40):
    """Preprocess the data and train a DeepMove model, skipping finished steps.

    The parent of ``save_dir`` is treated as the run directory: preprocessing
    writes there, and ``metadata.json`` and ``preprocessed/<stem>.pk`` are
    read from it for training.

    Args:
        train_csv (str | Path): path to the training CSV; its parent
            directory is the preprocessing input and its stem names the
            training set.
        save_dir (str | Path): directory forwarded as ``--save_dir``.
        model_type (str): value forwarded as ``--model_mode``.
        epoch_max (int): value forwarded as ``--epoch_max`` (default 40).

    Returns:
        Path: ``<save_dir>/res.m``.

    Raises:
        subprocess.CalledProcessError: if preprocessing or training exits
            non-zero.
    """
    # Accept plain strings as well as Path objects, like save_dir.
    train_csv = Path(train_csv)
    save_dir = Path(save_dir)
    run_dir = save_dir.parent
    preprocessed_dir = run_dir / "preprocessed"
    metadata_path = run_dir / "metadata.json"
    train_pk = preprocessed_dir / f"{train_csv.stem}.pk"
    model_path = save_dir / "res.m"

    # Skip if model already trained
    if model_path.exists():
        print(f"‚è≠Ô∏è DeepMove ({model_type}) model already trained at {save_dir}, skipping.")
        return model_path

    # Skip preprocessing if already done
    if not train_pk.exists():
        print("‚öôÔ∏è Preprocessing DeepMove data...")
        subprocess.run([
            "python3", "DeepMove/codes/preprocess.py",
            "--in_dir", str(train_csv.parent),
            "--training_set_name", train_csv.stem,
            "--out_dir", str(run_dir)
        ], check=True)
    else:
        print(f"‚è≠Ô∏è Preprocessed file {train_pk} already exists, skipping preprocessing.")

    # Train
    print(f"üéØ Training DeepMove model ({model_type})...")
    subprocess.run([
        "python3", "DeepMove/codes/main.py",
        "--metadata_json", str(metadata_path),
        "--model_mode", model_type,
        "--data_path", str(train_pk),
        "--epoch_max", str(epoch_max),
        "--save_dir", str(save_dir),
        "--pretrain", "0"
    ], check=True)

    return model_path


def test_deepmove_model(model_path, data_dir, output_dir, model_type):
    """Run the DeepMove top-k evaluation script unless results look complete.

    The script is skipped when ``output_dir`` holds as many ``*.csv`` files
    as ``data_dir``. Otherwise ``DeepMove/codes/test.py`` is run; its
    ``metadata.json`` and ``preprocessed`` inputs are taken from two levels
    above ``model_path``.

    Args:
        model_path (Path): trained model file; ``.parent`` is used on it.
        data_dir: directory whose CSV count decides whether to skip.
        output_dir: location forwarded as ``--output``.
        model_type (str): value forwarded as ``--model_mode``.

    Raises:
        subprocess.CalledProcessError: if the script exits non-zero.
    """
    results_dir = Path(output_dir)

    n_results = len(list(results_dir.glob("*.csv")))
    n_sources = len(list(Path(data_dir).glob("*.csv")))
    if n_results == n_sources:
        print(f"‚è≠Ô∏è DeepMove test results already exist in {results_dir}")
        return

    print(f"üìä Testing DeepMove ({model_type})...")
    run_root = model_path.parent.parent
    command = [
        "python3", "DeepMove/codes/test.py",
        "--metadata_json", str(run_root / "metadata.json"),
        "--model_mode", model_type,
        "--model_path", str(model_path),
        "--data_dir", str(run_root / "preprocessed"),
        "--mode", "topk",
        "--k_values", "1", "5", "10", "20",
        "--output", str(results_dir),
    ]
    subprocess.run(command, check=True)



def perplexity_deepmove(model_path, data_dir, output_dir, model_type):
    """Compute DeepMove perplexity for every preprocessed ``.pk`` file.

    Inputs are the ``*.pk`` files in ``preprocessed/`` two levels above
    ``model_path``. Each input ``<stem>.pk`` produces ``<output_dir>/<stem>.csv``
    through ``DeepMove/codes/perplexity.py``. Inputs whose output file
    already exists are skipped.

    Args:
        model_path (str | Path): trained model file.
        data_dir: accepted for signature compatibility with the other
            perplexity wrappers but not used.
        output_dir (str | Path): directory for the per-file CSVs; created if
            missing.
        model_type (str): value forwarded as ``--model_mode``.

    Raises:
        subprocess.CalledProcessError: if the script exits non-zero.
    """
    model_path = Path(model_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    run_dir = model_path.parent.parent
    metadata_path = run_dir / "metadata.json"
    input_files = sorted((run_dir / "preprocessed").glob("*.pk"))

    # Decide completion per expected output name rather than by comparing
    # file counts, so an unrelated CSV in output_dir cannot hide missing work.
    pending = [
        pk_file for pk_file in input_files
        if not (output_dir / f"{pk_file.stem}.csv").exists()
    ]
    if not pending:
        print(f"‚è≠Ô∏è DeepMove perplexity already computed for all files in {output_dir}")
        return

    print(f"üìà Evaluating DeepMove ({model_type}) perplexity on {len(input_files)} files...")

    for pk_file in input_files:
        out_file = output_dir / f"{pk_file.stem}.csv"
        if out_file.exists():
            print(f"‚è≠Ô∏è Skipping already computed file: {out_file.name}")
            continue

        print(f"Computing perplexity for {pk_file.name}...")
        subprocess.run([
            "python3", "DeepMove/codes/perplexity.py",
            "--metadata_json", str(metadata_path),
            "--model_mode", model_type,
            "--model_path", str(model_path),
            "--data_pk", str(pk_file),
            "--output", str(out_file)
        ], check=True)

    print(f"‚úÖ DeepMove perplexity evaluation completed for all new files.")

In [55]:
# Common prefix of every ShenzhenUrban split; one sub-folder per normalization type.
_SHENZHEN_ROOT = "/home/akouamdj/mobleak-datasets/PreprocessedData/2SplittedData/ShenzhenUrban"

# dataset name -> {normalization type -> directory containing the split CSVs}
DATASETS = {
    "ShenzhenUrban": {
        "type1": f"{_SHENZHEN_ROOT}/NormalizationType1/Datasets",
        "type2_home": f"{_SHENZHEN_ROOT}/NormalizationType2/Home/Datasets",
        "type2_work": f"{_SHENZHEN_ROOT}/NormalizationType2/Work/Datasets",
        "type3": f"{_SHENZHEN_ROOT}/NormalizationType3/Datasets",
    },
    # Add more datasets later
}


def _deepmove_entry(model_type):
    """Return the train/test/perplexity callables for one DeepMove mode.

    Each callable has the same positional signature as the Markov and LSTPM
    wrappers, with ``model_type`` bound in.
    """
    return {
        "train": lambda train_path, run_dir: train_deepmove_model(train_path, run_dir, model_type),
        "test": lambda model_path, data_dir, output_dir: test_deepmove_model(model_path, data_dir, output_dir, model_type),
        "perplexity": lambda model_path, data_dir, output_dir: perplexity_deepmove(model_path, data_dir, output_dir, model_type),
    }


# model name -> {"train" | "test" | "perplexity" -> callable}
MODELS = {
    "markov": {
        "train": train_markov_model,
        "test": test_markov_model,
        "perplexity": evaluate_perplexity,
    },
    "lstpm": {
        "train": train_lstpm_model,
        "test": test_lstpm_model,
        "perplexity": evaluate_lstpm_perplexity,
    },
    **{
        f"deepmove_{mode}": _deepmove_entry(mode)
        for mode in ("simple", "simple_long", "attn_avg_long_user", "attn_local_long")
    },
}

# Unified results directory
OUTPUT_ROOT = Path("results/")



In [85]:
def _memorization_stats(train_perp, ref_perps):
    """Compare one training perplexity with an array of reference perplexities.

    Returns a dict with ``mean_ref_perplexity``, ``exposure``, ``percentile``
    and ``gap``. All four are NaN when ``ref_perps`` is empty.
    """
    n_refs = len(ref_perps)
    if n_refs == 0:
        # log2(0) and 0/0 would only produce -inf/NaN plus runtime warnings.
        return dict.fromkeys(
            ("mean_ref_perplexity", "exposure", "percentile", "gap"), np.nan
        )

    ref_mean = np.mean(ref_perps)
    # 1-based rank of the training value among the references (ties count
    # as references ranked ahead of it).
    rank = np.sum(ref_perps <= train_perp) + 1
    return {
        "mean_ref_perplexity": ref_mean,
        "exposure": np.log2(n_refs) - np.log2(rank),
        "percentile": (rank - 1) / n_refs,
        "gap": train_perp - ref_mean,
    }


def compute_memorization_metrics(perplexity_dir, mapping_file):
    """
    Compute memorization metrics for each reference perplexity file.

    ``perplexity_dir`` must contain the training perplexities
    (``training_set_perplexity.csv`` or ``training_set.csv``) plus one CSV per
    reference set. ``mapping_file`` links each reference file to a training id:

    * with a ``cluster_file`` column: ``cluster_file`` -> ``representant_tid``;
    * otherwise ("type 3"): ``<device_id>.csv`` -> ``training_tid``, and the
      metrics are also computed per perturbation (``substitute``,
      ``stationary``, ``shuffle``) from the rows whose id appears in the
      mapping's ``reference_tid`` for that perturbation.

    Args:
        perplexity_dir (str | Path): directory with the perplexity CSVs; each
            needs a ``perplexity`` column and a ``tid`` or ``user`` column.
        mapping_file (str | Path): CSV mapping file described above.

    Returns:
        pandas.DataFrame: one row per reference file that is non-empty and
        maps to a known training id, with ``tid``, ``cluster_id``,
        ``train_perplexity``, ``mean_ref_perplexity``, ``exposure``,
        ``percentile`` and ``gap`` (type 3 adds the same five metrics suffixed
        with ``_<perturbation>``). Metrics of an empty reference set are NaN.
        Rows follow the sorted order of the file names.
    """
    perplexity_dir = Path(perplexity_dir)

    # Support both filenames
    training_perp_path = perplexity_dir / "training_set_perplexity.csv"
    if not training_perp_path.exists():
        training_perp_path = perplexity_dir / "training_set.csv"

    training_df = pd.read_csv(training_perp_path)

    # Support both column names: 'tid' or 'user'
    id_col = "tid" if "tid" in training_df.columns else "user"

    training_dict = training_df.set_index(id_col)["perplexity"].to_dict()
    mapping_df = pd.read_csv(mapping_file)

    perturbations = ('substitute', 'stationary', 'shuffle')
    type3 = "cluster_file" not in mapping_df.columns
    if type3:
        mapping_df['reference_file'] = mapping_df['device_id'].apply(lambda x: f"{x}.csv")
        mapping_dict = mapping_df.set_index("reference_file")["training_tid"].to_dict()
        # The mapping does not change per file, so filter it once here
        # instead of once per reference file.
        perturbed_tids = {
            name: mapping_df.loc[mapping_df['perturbation'] == name, 'reference_tid']
            for name in perturbations
        }
    else:
        mapping_dict = mapping_df.set_index("cluster_file")["representant_tid"].to_dict()
        perturbed_tids = {}

    rows = []

    # Sorted so the order of the result rows does not depend on the filesystem.
    for ref_file in sorted(perplexity_dir.glob("*.csv")):
        # Stripping "_perplexity" maps both training file names to
        # "training_set.csv", so a single comparison covers them.
        cluster_id = ref_file.stem.replace("_perplexity", "") + ".csv"
        if cluster_id == "training_set.csv":
            continue
        print(ref_file)

        # Resolve the training id first: unmapped files need not be parsed.
        training_tid_val = mapping_dict.get(cluster_id)
        if training_tid_val not in training_dict:
            continue

        ref_df = pd.read_csv(ref_file)
        if ref_df.empty:
            continue

        train_perp = training_dict[training_tid_val]

        result_row = {
            "tid": training_tid_val,
            "cluster_id": cluster_id,
        }

        if type3:
            # Adapt to tid/user here as well
            ref_id_col = "tid" if "tid" in ref_df.columns else "user"
            for perturbation in perturbations:
                in_perturbation = ref_df[ref_id_col].isin(perturbed_tids[perturbation])
                ref_perps = ref_df.loc[in_perturbation, "perplexity"].values

                result_row[f"train_perplexity_{perturbation}"] = train_perp
                stats = _memorization_stats(train_perp, ref_perps)
                result_row.update(
                    {f"{key}_{perturbation}": value for key, value in stats.items()}
                )

        # General (non-perturbed) cluster
        result_row["train_perplexity"] = train_perp
        result_row.update(_memorization_stats(train_perp, ref_df["perplexity"].values))

        rows.append(result_row)

    return pd.DataFrame(rows)

In [88]:
def run_memorization_test(dataset_name, type_name, dataset_path, model_name):
    """Train one model on one dataset variant and write its memorization metrics.

    Results go under ``OUTPUT_ROOT/<dataset_name>/<model_name>/<type_name>``
    (``model/``, ``perplexity/`` and ``memorization_metrics.csv``). Nothing is
    run when ``training_set.csv`` is missing from ``dataset_path``.

    Args:
        dataset_name (str): dataset label, used in the output path.
        type_name (str): normalization-type label, used in the output path.
        dataset_path (str | Path): directory holding ``training_set.csv`` and
            ``representant_mapping.txt``.
        model_name (str): key into ``MODELS``.

    Raises:
        KeyError: if ``model_name`` is not a key of ``MODELS``.
    """
    print(f"\nüöÄ Running: {model_name.upper()} | {dataset_name} | {type_name}")

    steps = MODELS[model_name]
    dataset_root = Path(dataset_path)
    training_file = dataset_root / "training_set.csv"
    if not training_file.exists():
        print(f"‚ö†Ô∏è No training set found in {dataset_root}")
        return
    mapping_file = dataset_root / "representant_mapping.txt"

    run_dir = OUTPUT_ROOT / dataset_name / model_name / type_name
    model_dir = run_dir / "model"
    perplexity_dir = run_dir / "perplexity"
    test_dir = run_dir / "test"

    # Train
    model_path = steps["train"](training_file, model_dir)

    # Perplexity
    # NOTE(review): run_dir (not dataset_root) is passed as data_dir. The
    # LSTPM wrapper reads <data_dir>/preprocessed and the DeepMove wrapper
    # ignores data_dir, but the Markov wrapper globs *.csv in data_dir, so
    # confirm this is the intended input for "markov".
    steps["perplexity"](model_path, run_dir, perplexity_dir)

    # Metrics
    metrics_path = run_dir / "memorization_metrics.csv"
    if not metrics_path.exists():
        compute_memorization_metrics(perplexity_dir, mapping_file).to_csv(
            metrics_path, index=False
        )
    else:
        print(f"‚è≠Ô∏è Metrics already exist at {metrics_path}")
    print(f"‚úÖ Metrics saved to: {metrics_path}")

    # Test (currently disabled)
    #steps["test"](model_path, dataset_root, test_dir)

In [None]:
# Every model registered in MODELS, kept for reference:
#ALL_MODELS = ["markov", "lstpm", "deepmove_simple", "deepmove_simple_long", "deepmove_attn_avg_long_user", "deepmove_attn_local_long"]
# Models actually run by this cell.
ALL_MODELS = ["deepmove_attn_avg_long_user", "deepmove_attn_local_long"]

# Run every selected model on every normalization type of every dataset.
for model_name in ALL_MODELS:
    for dataset_name, type_paths in DATASETS.items():
        for type_name in type_paths:
            path = type_paths[type_name]
            run_memorization_test(
                dataset_name=dataset_name,
                type_name=type_name,
                dataset_path=path,
                model_name=model_name,
            )



üöÄ Running: DEEPMOVE_ATTN_AVG_LONG_USER | ShenzhenUrban | type1
‚è≠Ô∏è DeepMove (attn_avg_long_user) model already trained at results/ShenzhenUrban/deepmove_attn_avg_long_user/type1/model, skipping.
üìà Evaluating DeepMove (attn_avg_long_user) perplexity on 2001 files...
‚è≠Ô∏è Skipping already computed file: cluster_34789008_34789104.csv
‚è≠Ô∏è Skipping already computed file: cluster_2004480_2004576.csv
‚è≠Ô∏è Skipping already computed file: cluster_17717616_17717712.csv
‚è≠Ô∏è Skipping already computed file: cluster_49545120_49545216.csv
‚è≠Ô∏è Skipping already computed file: cluster_14733408_14733504.csv
‚è≠Ô∏è Skipping already computed file: cluster_47759952_47760048.csv
‚è≠Ô∏è Skipping already computed file: cluster_49642752_49642848.csv
‚è≠Ô∏è Skipping already computed file: cluster_17917152_17917248.csv
‚è≠Ô∏è Skipping already computed file: cluster_28251744_28251840.csv
‚è≠Ô∏è Skipping already computed file: cluster_13000416_13000512.csv
‚è≠Ô∏è Skipping already computed fil

  nn.init.xavier_uniform(t)
  nn.init.orthogonal(t)
  nn.init.constant(t, 0)


Processing results/ShenzhenUrban/deepmove_attn_avg_long_user/type1/preprocessed/cluster_20011440_20011536.pk...


  return F.softmax(attn_energies)
  score = F.log_softmax(y)


‚è≠Ô∏è Skipping already computed file: cluster_37308240_37308336.csv
‚è≠Ô∏è Skipping already computed file: cluster_45696144_45696240.csv
‚è≠Ô∏è Skipping already computed file: cluster_36955056_36955152.csv
‚è≠Ô∏è Skipping already computed file: cluster_3620736_3620832.csv
‚è≠Ô∏è Skipping already computed file: cluster_17341488_17341584.csv
‚è≠Ô∏è Skipping already computed file: cluster_13257792_13257888.csv
‚è≠Ô∏è Skipping already computed file: cluster_48299856_48299952.csv
‚è≠Ô∏è Skipping already computed file: cluster_16301904_16302000.csv
‚è≠Ô∏è Skipping already computed file: cluster_16829712_16829808.csv
‚è≠Ô∏è Skipping already computed file: cluster_5391840_5391936.csv
‚è≠Ô∏è Skipping already computed file: cluster_1955616_1955712.csv
‚è≠Ô∏è Skipping already computed file: cluster_37707792_37707888.csv
‚è≠Ô∏è Skipping already computed file: cluster_40794480_40794576.csv
‚è≠Ô∏è Skipping already computed file: cluster_37533888_37533984.csv
‚è≠Ô∏è Skipping already computed file: c

  nn.init.xavier_uniform(t)
  nn.init.orthogonal(t)
  nn.init.constant(t, 0)


users:2000 markov:None train:4000 test:2000


  return F.softmax(attn_energies)
  score = F.log_softmax(y)
  torch.nn.utils.clip_grad_norm(model.parameters(), clip)
	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha = 1) (Triggered internally at /pytorch/torch/csrc/utils/python_arg_parser.cpp:1661.)
  p.data.add_(-lr, p.grad.data)


==>Train Epoch:00 Loss:2.5383 lr:0.0003333333333333333
==>Test Acc:0.9122 Loss:0.7636
single epoch time cost:576.6859900951385
==>Train Epoch:01 Loss:0.6883 lr:0.0003333333333333333
==>Test Acc:0.9223 Loss:0.5690
==>Train Epoch:02 Loss:0.4995 lr:0.0003333333333333333
==>Test Acc:0.9251 Loss:0.4989
==>Train Epoch:03 Loss:0.4198 lr:0.0003333333333333333
==>Test Acc:0.9331 Loss:0.4440
==>Train Epoch:04 Loss:0.3592 lr:0.0003333333333333333
==>Test Acc:0.9393 Loss:0.4062
==>Train Epoch:05 Loss:0.3216 lr:0.0003333333333333333
==>Test Acc:0.9419 Loss:0.3829
==>Train Epoch:06 Loss:0.3015 lr:0.0003333333333333333
==>Test Acc:0.9425 Loss:0.3756
==>Train Epoch:07 Loss:0.2790 lr:0.0003333333333333333
==>Test Acc:0.9397 Loss:0.3798
==>Train Epoch:08 Loss:0.2702 lr:0.0003333333333333333
==>Test Acc:0.9438 Loss:0.3617
==>Train Epoch:09 Loss:0.2533 lr:0.0003333333333333333
==>Test Acc:0.9454 Loss:0.3531
==>Train Epoch:10 Loss:0.2402 lr:0.0003333333333333333
==>Test Acc:0.9425 Loss:0.3672
==>Train Epoc