### Retrained K-means Clustering with Changes in Sample Size

#### Experiment Design: 

Preprocessing Step:
1. Start with Dataset of size D with some distribution 
2. Have K-means/K-centers run on this dataset

Running Step 
Goal: By sampling from an additional dataset of size S drawn from the same underlying distribution, learn the boundaries of the K-means/K-center clusters (the user has no prior knowledge of them)

User/Background Step: 
1. Sample from S
2. Online: Have the algorithm run K-means/K-centers again
3. Run: Learn the boundaries of K-means
4. Go back to Step 1 

Datasets: Brightkite, Gowalla (SNAP) and RangeQueries (UCI)

Import Libraries

In [None]:
import copy
from importlib import reload
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.sparse.linalg import eigsh
import sklearn
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.exceptions import ConvergenceWarning
import warnings

import explore_heuristic
import utils

#### Datasets

In [None]:
# Both SNAP check-in dumps are tab-separated and share one column layout.
# Because explicit names are supplied, their first line is read as data.
checkin_columns = ['zero', 'time', 'latitude', 'longitude', 'location id']

data_brightkite = pd.read_csv(
    './Datasets/loc-brightkite_totalCheckins.txt',
    sep='\t',
    names=checkin_columns,
)
data_gowalla = pd.read_csv(
    './Datasets/loc-gowalla_totalCheckins.txt',
    sep='\t',
    names=checkin_columns,
)
# The Range-Queries CSV has its first line skipped and replaced by these names.
data_rangequeries = pd.read_csv(
    './Datasets/Range-Queries-Aggregates.csv',
    names=['zero', 'x', 'y', 'x_range', 'y_range', 'count', 'sum_', 'avg'],
    skiprows=1,
)

# Print the same three summaries for every dataset, in loading order.
for title, frame in (
    ("data_brightkite:", data_brightkite),
    ("data_gowalla:", data_gowalla),
    ("data_rangequeries:", data_rangequeries),
):
    print(title)
    print(frame.head())
    # DataFrame.info() writes its report itself and returns None, so this
    # line also prints "None" after the report.
    print(frame.info())
    print(frame.describe())

### Datasets and Parameter Initialization

In [None]:
"Number of Clusters + subsample size parameter initialization"
# k handed to utils.ground_truth_kmeans for every dataset; tune to the experiment's goal.
num_clusters: int = 10
# Number of rows requested from utils.random_subsample for every dataset.
subsample_size: int = 500

In [None]:
"""BrightKite"""
# Keep only the two coordinate columns and discard rows missing either one.
location_brightkite = data_brightkite.loc[:, ['latitude', 'longitude']].dropna()
# The frame already holds just (latitude, longitude), so convert it directly.
location_brightkite_array = location_brightkite.to_numpy()

# Cluster the complete coordinate array first ...
all_locations_k_means_brightkite = utils.ground_truth_kmeans(
    location_brightkite_array, num_clusters
)
# ... then draw the working subsample and cluster it on its own.
subsample_brightkite_array = utils.random_subsample(
    location_brightkite_array, subsample_size
)
subsample_k_means_brightkite = utils.ground_truth_kmeans(
    subsample_brightkite_array, num_clusters
)

In [None]:
"""Gowalla"""
# Keep only the two coordinate columns and discard rows missing either one.
location_gowalla = data_gowalla.loc[:, ['latitude', 'longitude']].dropna()
# The frame already holds just (latitude, longitude), so convert it directly.
location_gowalla_array = location_gowalla.to_numpy()

# Cluster the complete coordinate array first ...
all_locations_k_means_gowalla = utils.ground_truth_kmeans(
    location_gowalla_array, num_clusters
)
# ... then draw the working subsample and cluster it on its own.
subsample_gowalla_array = utils.random_subsample(
    location_gowalla_array, subsample_size
)
subsample_k_means_gowalla = utils.ground_truth_kmeans(
    subsample_gowalla_array, num_clusters
)

In [None]:
"""rangequeries""" 
# Keep only the x/y columns and discard rows missing either value.
xy_rangequeries = data_rangequeries.loc[:, ['x', 'y']].dropna()
# The frame already holds just (x, y), so convert it directly.
xy_rangequeries_array = xy_rangequeries.to_numpy()

# Cluster the complete x/y array first ...
all_locations_k_means_rangequeries = utils.ground_truth_kmeans(
    xy_rangequeries_array, num_clusters
)
# ... then draw the working subsample and cluster it on its own.
subsample_rangequeries_array = utils.random_subsample(
    xy_rangequeries_array, subsample_size
)
subsample_k_means_rangequeries = utils.ground_truth_kmeans(
    subsample_rangequeries_array, num_clusters
)

#### Sampling Size Change Experiments and Model Performance
Experiment 1. Keeping the number of sampling rounds fixed, but increasing the number of samples per round.

Experiment 2. Keeping the number of samples per round fixed, but increasing the number of sampling rounds.

Experiment 3. Keeping the total number of samples fixed, but varying how it is split between the number of rounds and the samples per round.

In [None]:
# Experiment 1: hold the number of sampling rounds constant and vary how many
# samples are drawn per round.
reload(explore_heuristic)
reload(utils)

fixed_number_of_rounds = 15

# One result list per (strategy, dataset) pair; the Gowalla and Rangequeries
# lists are filled by the cells that follow.
oracle_acc_lists_bk, random_acc_lists_bk, distance_acc_lists_bk = [], [], []
oracle_acc_lists_go, distance_acc_lists_go, random_acc_lists_go = [], [], []
oracle_acc_lists_rq, random_acc_lists_rq, distance_acc_lists_rq = [], [], []

samp_settings = [1, 2, 4, 5, 8, 10, 25, 50, 100, 250]
samp_xticks = [str(samps) for samps in samp_settings]

""" Experiment 1 for BrightKite """
# (destination list, sampling method, retraining mode), listed in the order
# the runs are issued for each setting.
bk_runs = (
    (oracle_acc_lists_bk, "random", "query"),
    (random_acc_lists_bk, "random", "online"),
    (distance_acc_lists_bk, "distance", "online"),
)
for samp_num in samp_settings:
    print("On samp num {}".format(samp_num))
    for acc_list, method, retraining in bk_runs:
        acc_list.append(
            utils.attack_experiment(
                subsample_brightkite_array,
                subsample_k_means_brightkite,
                fixed_number_of_rounds,
                samp_num,
                method=method,
                retraining=retraining,
            )
        )

utils.acc_plot(
    [oracle_acc_lists_bk, random_acc_lists_bk, distance_acc_lists_bk],
    ["oracle", "random", "distance"],
    xticks=samp_xticks,
)

In [None]:
# Pick up any edits made to the helper modules since the previous cell ran.
reload(explore_heuristic)
reload(utils)

""" Experiment 1 for Gowalla """
# (destination list, sampling method, retraining mode), listed in the order
# the runs are issued for each setting.
go_runs = (
    (oracle_acc_lists_go, "random", "query"),
    (random_acc_lists_go, "random", "online"),
    (distance_acc_lists_go, "distance", "online"),
)
for samp_num in samp_settings:
    print("On samp num {}".format(samp_num))
    for acc_list, method, retraining in go_runs:
        acc_list.append(
            utils.attack_experiment(
                subsample_gowalla_array,
                subsample_k_means_gowalla,
                fixed_number_of_rounds,
                samp_num,
                method=method,
                retraining=retraining,
            )
        )

utils.acc_plot(
    [oracle_acc_lists_go, random_acc_lists_go, distance_acc_lists_go],
    ["oracle", "random", "distance"],
    xticks=samp_xticks,
)

In [None]:
# Pick up any edits made to the helper modules since the previous cell ran.
reload(explore_heuristic)
reload(utils)

""" Experiment 1 for Rangequeries """
# (destination list, sampling method, retraining mode), listed in the order
# the runs are issued for each setting.
rq_runs = (
    (oracle_acc_lists_rq, "random", "query"),
    (random_acc_lists_rq, "random", "online"),
    (distance_acc_lists_rq, "distance", "online"),
)
for samp_num in samp_settings:
    print("On samp num {}".format(samp_num))
    for acc_list, method, retraining in rq_runs:
        acc_list.append(
            utils.attack_experiment(
                subsample_rangequeries_array,
                subsample_k_means_rangequeries,
                fixed_number_of_rounds,
                samp_num,
                method=method,
                retraining=retraining,
            )
        )

utils.acc_plot(
    [oracle_acc_lists_rq, random_acc_lists_rq, distance_acc_lists_rq],
    ["oracle", "random", "distance"],
    xticks=samp_xticks,
)

In [None]:
# Experiment 2: Experiment with model performance by keeping the number
# of samples per round fixed, but increasing the number of sampling rounds.
reload(explore_heuristic)
reload(utils)

# Samples drawn in every round; only the number of rounds varies here.
fixed_number_of_samps = 15
rounds_oracle_acc_lists_bk = []
rounds_random_acc_lists_bk = []
rounds_distance_acc_lists_bk = []

rounds_oracle_acc_lists_go = []
rounds_random_acc_lists_go = []
rounds_distance_acc_lists_go = []

rounds_oracle_acc_lists_rq = []
rounds_random_acc_lists_rq = []
rounds_distance_acc_lists_rq = []

round_settings = [1, 2, 4, 5, 7, 10, 25, 50, 100, 250]

round_xticks = ["{}".format(rounds) for rounds in round_settings]

""" Experiment 2 with Brightkite """
for round_num in round_settings:
    print("On round num {}".format(round_num))
    # attack_experiment takes the round count before the per-round sample
    # count, so the swept value goes third and the fixed sample count fourth.
    # (Previously every iteration reused fixed_number_of_rounds and the stale
    # samp_num left over from Experiment 1, so the sweep never varied.)
    rounds_oracle_acc_lists_bk.append(
        utils.attack_experiment(subsample_brightkite_array, subsample_k_means_brightkite,
        round_num, fixed_number_of_samps, method="random", retraining="query")
    )
    rounds_random_acc_lists_bk.append(
        utils.attack_experiment(subsample_brightkite_array, subsample_k_means_brightkite,
        round_num, fixed_number_of_samps, method="random", retraining="online")
    )
    # The "distance" curve must use the distance-based sampler, not "random".
    rounds_distance_acc_lists_bk.append(
        utils.attack_experiment(subsample_brightkite_array, subsample_k_means_brightkite,
        round_num, fixed_number_of_samps, method="distance", retraining="online")
    )
utils.acc_plot(
    [rounds_oracle_acc_lists_bk, rounds_random_acc_lists_bk, rounds_distance_acc_lists_bk],
    ["oracle", "random", "distance"], xticks=round_xticks
)


In [None]:
# Pick up any edits made to the helper modules since the previous cell ran.
reload(explore_heuristic)
reload(utils)

""" Experiment 2 with Gowalla """
for round_num in round_settings:
    print("On round num {}".format(round_num))
    # attack_experiment takes the round count before the per-round sample
    # count, so the swept value goes third and the fixed sample count fourth.
    # (Previously every iteration reused fixed_number_of_rounds and the stale
    # samp_num left over from Experiment 1, so the sweep never varied.)
    rounds_oracle_acc_lists_go.append(
        utils.attack_experiment(subsample_gowalla_array, subsample_k_means_gowalla,
        round_num, fixed_number_of_samps, method="random", retraining="query")
    )
    rounds_random_acc_lists_go.append(
        utils.attack_experiment(subsample_gowalla_array, subsample_k_means_gowalla,
        round_num, fixed_number_of_samps, method="random", retraining="online")
    )
    rounds_distance_acc_lists_go.append(
        utils.attack_experiment(subsample_gowalla_array, subsample_k_means_gowalla,
        round_num, fixed_number_of_samps, method="distance", retraining="online")
    )

utils.acc_plot(
    [rounds_oracle_acc_lists_go, rounds_random_acc_lists_go, rounds_distance_acc_lists_go],
    ["oracle", "random", "distance"], xticks=round_xticks
)

In [None]:
# Pick up any edits made to the helper modules since the previous cell ran.
reload(explore_heuristic)
reload(utils)

""" Experiment 2 with Rangequeries """
for round_num in round_settings:
    print("On round num {}".format(round_num))
    # attack_experiment takes the round count before the per-round sample
    # count, so the swept value goes third and the fixed sample count fourth.
    # (Previously every iteration reused fixed_number_of_rounds and the stale
    # samp_num left over from Experiment 1, so the sweep never varied.)
    rounds_oracle_acc_lists_rq.append(
        utils.attack_experiment(subsample_rangequeries_array, subsample_k_means_rangequeries,
        round_num, fixed_number_of_samps, method="random", retraining="query")
    )
    rounds_random_acc_lists_rq.append(
        utils.attack_experiment(subsample_rangequeries_array, subsample_k_means_rangequeries,
        round_num, fixed_number_of_samps, method="random", retraining="online")
    )
    # Store in the Rangequeries list (not the Gowalla one) so the plot below
    # receives the distance results for this dataset.
    rounds_distance_acc_lists_rq.append(
        utils.attack_experiment(subsample_rangequeries_array, subsample_k_means_rangequeries,
        round_num, fixed_number_of_samps, method="distance", retraining="online")
    )

utils.acc_plot(
    [rounds_oracle_acc_lists_rq, rounds_random_acc_lists_rq, rounds_distance_acc_lists_rq],
    ["oracle", "random", "distance"], xticks=round_xticks
)

In [None]:
# Experiment 3: keep the total number of samples constant while varying how
# that total is split into rounds and samples per round.
reload(explore_heuristic)
reload(utils)

# One result list per (strategy, dataset) pair; the Gowalla and Rangequeries
# lists are filled by the cells that follow.
perm_oracle_acc_lists_bk, perm_random_acc_lists_bk, perm_distance_acc_lists_bk = [], [], []
perm_oracle_acc_lists_go, perm_random_acc_lists_go, perm_distance_acc_lists_go = [], [], []
perm_oracle_acc_lists_rq, perm_random_acc_lists_rq, perm_distance_acc_lists_rq = [], [], []

# Every (rounds, samples-per-round) pair multiplies out to 200 samples.
sample_settings = [(1, 200), (2, 100), (4, 50), (5, 40), (10, 20), (50, 4), (100, 2), (200, 1)]
sample_xticks = ["{}_{}".format(rounds, samps) for (rounds, samps) in sample_settings]

""" Experiment 3 with Brightkite """
# (destination list, sampling method, retraining mode), listed in the order
# the runs are issued for each setting.
perm_bk_runs = (
    (perm_oracle_acc_lists_bk, "random", "query"),
    (perm_random_acc_lists_bk, "random", "online"),
    (perm_distance_acc_lists_bk, "distance", "online"),
)
for (rounds, samps) in sample_settings:
    print("Rounds {}, samps {}".format(rounds, samps))
    for acc_list, method, retraining in perm_bk_runs:
        acc_list.append(
            utils.attack_experiment(
                subsample_brightkite_array,
                subsample_k_means_brightkite,
                rounds,
                samps,
                method=method,
                retraining=retraining,
            )
        )

utils.acc_plot(
    [perm_oracle_acc_lists_bk, perm_random_acc_lists_bk, perm_distance_acc_lists_bk],
    ["oracle", "random", "distance"],
    xticks=sample_xticks,
)

In [None]:
# Pick up any edits made to the helper modules since the previous cell ran.
reload(explore_heuristic)
reload(utils)

""" Experiment 3 with Gowalla """
for (rounds, samps) in sample_settings:
    print("Rounds {}, samps {}".format(rounds, samps))
    # Run on the Gowalla subsample and its k-means model; this cell used to
    # pass the Rangequeries subsample and model by mistake.
    perm_oracle_acc_lists_go.append(
        utils.attack_experiment(subsample_gowalla_array, subsample_k_means_gowalla, rounds,
        samps, method="random", retraining="query")
    )
    perm_random_acc_lists_go.append(
        utils.attack_experiment(subsample_gowalla_array, subsample_k_means_gowalla, rounds,
        samps, method="random", retraining="online")
    )
    perm_distance_acc_lists_go.append(
        utils.attack_experiment(subsample_gowalla_array, subsample_k_means_gowalla, rounds,
        samps, method="distance", retraining="online")
    )
# Plot the Gowalla lists filled above rather than the Rangequeries ones.
utils.acc_plot(
    [perm_oracle_acc_lists_go, perm_random_acc_lists_go, perm_distance_acc_lists_go],
    ["oracle", "random", "distance"], xticks=sample_xticks
)

In [None]:
""" Experiment 3 with Rangequeries """
for (rounds, samps) in sample_settings:
    print("Rounds {}, samps {}".format(rounds, samps))
    # Collect into the Rangequeries lists: they are the ones plotted below.
    # Appending to the Gowalla lists (as before) left this plot empty and
    # mixed Rangequeries results into the Gowalla data.
    perm_oracle_acc_lists_rq.append(
        utils.attack_experiment(subsample_rangequeries_array, subsample_k_means_rangequeries, rounds,
        samps, method="random", retraining="query")
    )
    perm_random_acc_lists_rq.append(
        utils.attack_experiment(subsample_rangequeries_array, subsample_k_means_rangequeries, rounds,
        samps, method="random", retraining="online")
    )
    perm_distance_acc_lists_rq.append(
        utils.attack_experiment(subsample_rangequeries_array, subsample_k_means_rangequeries, rounds,
        samps, method="distance", retraining="online")
    )
utils.acc_plot(
    [perm_oracle_acc_lists_rq, perm_random_acc_lists_rq, perm_distance_acc_lists_rq],
    ["oracle", "random", "distance"], xticks=sample_xticks
)