### Retrained K-means Clutering with Changes in Sampled Size

#### Experiment Design: 

Preprocessing Step:
1. Start with Dataset of size D with some distribution 
2. Have K-means/K-centers run on this dataset

Running Step 
Goal: From sampling from an additional dataset of size S with the same underlying distribution, learn the boundaries of K-means/K-center clusters (the user has no knowledge of this)

User/Background Step: 
1. Sample from S
2. Online: Have the algorithm run K-means/K-clusters again
3. Run: Learn the boundaries of K-means
4. Go back to Step 1 

Datasets: Brightkite, Gowalla (SNAP) and RangeQueries (UCI)

Import Libraries

In [1]:
import copy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.sparse.linalg import eigsh
import sklearn
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.exceptions import ConvergenceWarning
import warnings

import explore_heuristic
import utils

#### Datasets

In [2]:
data_brightkite = pd.read_csv('./Datasets/loc-brightkite_totalCheckins.txt', sep='\t', names=['zero', 'time', 'latitude', 'longitude', 'location id'])
data_gowalla = pd.read_csv('./Datasets/loc-gowalla_totalCheckins.txt', sep='\t', names=['zero', 'time', 'latitude', 'longitude', 'location id'])
data_rangequeries = pd.read_csv('./Datasets/Range-Queries-Aggregates.csv', names=['zero', 'x', 'y', 'x_range', 'y_range', 'count', 'sum_', 'avg'] )

print("data_brightkite:")
print(data_brightkite.head())
print(data_brightkite.info())
print(data_brightkite.describe())

print("data_gowalla:")
print(data_gowalla.head())
print(data_gowalla.info())
print(data_gowalla.describe())

print("data_rangqueries:")
print(data_rangequeries.head())
print(data_rangequeries.info())
print(data_rangequeries.describe())

  data_rangequeries = pd.read_csv('./Datasets/Range-Queries-Aggregates.csv', names=['zero', 'x', 'y', 'x_range', 'y_range', 'count', 'sum_', 'avg'] )


data_brightkite:
   zero                  time   latitude   longitude  \
0     0  2010-10-17T01:48:53Z  39.747652 -104.992510   
1     0  2010-10-16T06:02:04Z  39.891383 -105.070814   
2     0  2010-10-16T03:48:54Z  39.891077 -105.068532   
3     0  2010-10-14T18:25:51Z  39.750469 -104.999073   
4     0  2010-10-14T00:21:47Z  39.752713 -104.996337   

                                location id  
0          88c46bf20db295831bd2d1718ad7e6f5  
1          7a0f88982aa015062b95e3b4843f9ca2  
2          dd7cd3d264c2d063832db506fba8bf79  
3  9848afcc62e500a01cf6fbf24b797732f8963683  
4          2ef143e12038c870038df53e0478cefc  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4747287 entries, 0 to 4747286
Data columns (total 5 columns):
 #   Column       Dtype  
---  ------       -----  
 0   zero         int64  
 1   time         object 
 2   latitude     float64
 3   longitude    float64
 4   location id  object 
dtypes: float64(2), int64(1), object(2)
memory usage: 181.1+ MB
None
       

### Datasets and Parameter Initialization

In [3]:
"Number of Clusters + subsample size parameter initialization"
num_clusters = 10 # Number of clusters - adjust based on your goal 
subsample_size = 800 # Constant for subsample size

In [4]:
"""BrightKite"""
# Filter for relevant columns and drop any rows with missing latitude or longitude
location_brightkite = data_brightkite[['latitude', 'longitude']].dropna()
location_brightkite_array = location_brightkite[['latitude', 'longitude']].to_numpy()

all_locations_k_means_brightkite = utils.ground_truth_kmeans(location_brightkite_array, 
                                                            num_clusters)
subsample_brightkite_array = utils.random_subsample(location_brightkite_array, subsample_size)
subsample_k_means_brightkite = utils.ground_truth_kmeans(subsample_brightkite_array, 
                                                            num_clusters)

In [5]:
"""Gowalla"""
# Filter for relevant columns and drop any rows with missing latitude or longitude
location_gowalla = data_gowalla[['latitude', 'longitude']].dropna()
location_gowalla_array = location_gowalla[['latitude', 'longitude']].to_numpy()

all_locations_k_means_gowalla = utils.ground_truth_kmeans(location_gowalla_array, 
                                                            num_clusters)
subsample_gowalla_array = utils.random_subsample(location_gowalla_array, subsample_size)
subsample_k_means_gowalla = utils.ground_truth_kmeans(subsample_gowalla_array, 
                                                            num_clusters)

In [6]:
"""rangequeries""" #Fix 
# Filter for relevant columns and drop any rows with missing latitude or longitude
location_rangequeries = data_rangequeries[['x', 'y']].dropna()
location_rangequeries_array = location_rangequeries[['x', 'y']].to_numpy()

all_locations_k_means_rangequeries = utils.ground_truth_kmeans(location_rangequeries_array, 
                                                            num_clusters)
subsample_rangequeries_array = utils.random_subsample(location_rangequeries_array, subsample_size)
subsample_k_means_rangequeries = utils.ground_truth_kmeans(subsample_rangequeries_array, 
                                                            num_clusters)

ValueError: could not convert string to float: 'x'

#### Sampling Size Experiments and Model Performance Comparison
1. Keeping Sampling Rounds fixed, Increasing Sampling Number 
2. Keeping the number of sampling rounds fixed, but increasing the sample number.
3. 

In [3]:
# First Experiment: Experiment with model performance by keeping the number
# of sampling rounds fixed, but increasing the sample number.
from importlib import reload
reload(explore_heuristic)
reload(utils)

fixed_number_of_rounds = 15
oracle_acc_lists_bk = []
oracle_acc_lists_go = []
oracle_acc_lists_rq = []
random_acc_lists_bk = []
random_acc_lists_go = []
random_acc_lists_rq = []
distance_acc_lists_bk = []
distance_acc_lists_go = []
distance_acc_lists_rq = []

samp_settings = [1, 2, 4, 5, 8, 10, 25, 50, 100, 250]

samp_xticks = ["{}".format(samps) for samps in samp_settings]

for samp_num in samp_settings:
    print("On samp num {}".format(samp_num))
    oracle_acc_lists_bk.append(
        utils.attack_experiment(subsample_brightkite_array, subsample_k_means_brightkite, 
        fixed_number_of_rounds, samp_num, method="random", retraining="query")
    )
    random_acc_lists_bk.append(
        utils.attack_experiment(subsample_brightkite_array, subsample_k_means_brightkite, 
        fixed_number_of_rounds, samp_num, method="random", retraining="online")
    )
    distance_acc_lists_bk.append(
        utils.attack_experiment(subsample_brightkite_array, subsample_k_means_brightkite, fixed_number_of_rounds,
        samp_num, method="distance", retraining="online")
    )
utils.acc_plot(
    [oracle_acc_lists_bk, random_acc_lists_bk, distance_acc_lists_bk],
    ["oracle", "random", "distance"], xticks=samp_xticks
)

reload(explore_heuristic)
reload(utils)

for samp_num in samp_settings:
    print("On samp num {}".format(samp_num))
    oracle_acc_lists_go.append(
       utils.attack_experiment(subsample_gowalla_array, subsample_k_means_gowalla, 
       fixed_number_of_rounds, samp_num, method="random", retraining="query")
    )
    random_acc_lists_go.append(
        utils.attack_experiment(subsample_gowalla_array, subsample_k_means_gowalla, 
        fixed_number_of_rounds, samp_num, method="random", retraining="online")
    )
    distance_acc_lists_go.append(
        utils.attack_experiment(subsample_gowalla_array, subsample_k_means_gowalla, fixed_number_of_rounds,
        samp_num, method="distance", retraining="online")
    )
utils.acc_plot(
    [oracle_acc_lists_go, random_acc_lists_go, distance_acc_lists_go],
   ["oracle", "random", "distance"], xticks=samp_xticks
)

# reload(explore_heuristic)
# reload(utils)
# for samp_num in samp_settings:
    # print("On samp num {}".format(samp_num))
    # oracle_acc_lists_rq.append(
    #   utils.attack_experiment(subsample_rangequeries_array, subsample_k_means_rangequeries, 
    #   fixed_number_of_rounds, samp_num, method="random", retraining="query")
    # )
    # random_acc_lists_rq.append(
    #    utils.attack_experiment(subsample_rangequeries_array, subsample_k_means_rangequeries, 
    #    fixed_number_of_rounds, samp_num, method="random", retraining="online")
    #)
    # distance_acc_lists_rq.append(
    #    utils.attack_experiment(subsample_rangequeries_array, subsample_k_means_rangequeries, fixed_number_of_rounds,
    #    samp_num, method="distance", retraining="online")
    #)

# utils.acc_plot(
#    [oracle_acc_lists_rq, random_acc_lists_rq, distance_acc_lists_rq],
#    ["oracle", "random", "distance"], xticks=samp_xticks
# ) 

On samp num 1


NameError: name 'subsample_brightkite_array' is not defined

In [None]:
# Experiment with model performance by keeping the number
# of sampling rounds fixed, but increasing the sample number.
from importlib import reload
reload(explore_heuristic)
reload(utils)

fixed_number_of_samps = 15
rounds_oracle_acc_lists_bk = []
rounds_oracle_acc_lists_go = []
rounds_oracle_acc_lists_rq = []
rounds_random_acc_lists_bk = []
rounds_random_acc_lists_go = []
rounds_random_acc_lists_rq = []
rounds_distance_acc_lists_bk = []
rounds_distance_acc_lists_go = []
rounds_distance_acc_lists_rq = []

round_settings = [1, 2, 4, 5, 7, 10, 25, 50, 100, 250]

round_xticks = ["{}".format(rounds) for rounds in round_settings]

for round_num in round_settings:
    print("On round num {}".format(round_num))
    rounds_oracle_acc_lists_bk.append(
        utils.attack_experiment(subsample_data_array, subsample_k_means, round_num,
        fixed_number_of_samps, method="random", retraining="query")
    )
    rounds_random_acc_lists.append(
        experiment_utils.attack_experiment(subsample_data_array, subsample_k_means, round_num,
        fixed_number_of_samps, method="random", retraining="online")
    )
    rounds_distance_acc_lists.append(
        experiment_utils.attack_experiment(subsample_data_array, subsample_k_means, round_num,
        fixed_number_of_samps, method="distance", retraining="online")
    )
    rounds_acc_lists_go.append(
        utils.attack_experiment(subsample_gowalla_array, subsample_k_means_gowalla, 
        fixed_number_of_rounds, samp_num, method="random", retraining="online")
    )
    rounds_oracle_lists_bk.append(
        utils.attack_experiment(subsample_brightkite_array, subsample_k_means_brightkite, 
        fixed_number_of_rounds, samp_num, method="random", retraining="query")
    )
    rounds_oracle_lists_go.append(
       utils.attack_experiment(subsample_gowalla_array, subsample_k_means_gowalla, 
       fixed_number_of_rounds, samp_num, method="random", retraining="query")
    )
    # rounds_oracle_lists_rq.append(
    #   utils.attack_experiment(subsample_rangequeries_array, subsample_k_means_rangequeries, 
    #   fixed_number_of_rounds, samp_num, method="random", retraining="query")
    # )
    rounds_random_lists_bk.append(
        utils.attack_experiment(subsample_brightkite_array, subsample_k_means_brightkite, 
        fixed_number_of_rounds, samp_num, method="random", retraining="online")
    )
    rounds_random_lists_go.append(
        utils.attack_experiment(subsample_gowalla_array, subsample_k_means_gowalla, 
        fixed_number_of_rounds, samp_num, method="random", retraining="online")
    )
    # rounds_random_lists_rq.append(
    #    utils.attack_experiment(subsample_rangequeries_array, subsample_k_means_rangequeries, 
    #    fixed_number_of_rounds, samp_num, method="random", retraining="online")
    #)
    random_acc_lists_bk.append(
        utils.attack_experiment(subsample_brightkite_array, subsample_k_means_brightkite, fixed_number_of_rounds,
        samp_num, method="distance", retraining="online")
    )
    distance_acc_lists_go.append(
        utils.attack_experiment(subsample_gowalla_array, subsample_k_means_gowalla, fixed_number_of_rounds,
        samp_num, method="distance", retraining="online")
    )
    #distance_acc_lists_rq.append(
    #    utils.attack_experiment(subsample_rangequeries_array, subsample_k_means_rangequeries, fixed_number_of_rounds,
    #    samp_num, method="distance", retraining="online")
    #)
utils.acc_plot(
    [rounds_oracle_acc_lists, rounds_random_acc_lists, rounds_distance_acc_lists],
    ["oracle", "random", "distance"], xticks=round_xticks
)

In [None]:
# Experiment with model performance except with fixed total number of samples,
# but testing out different variations on how to procure that amount.
from importlib import reload
reload(explore_heuristic)
reload(utils)

perm_oracle_acc_lists = []
perm_random_acc_lists = []
perm_distance_acc_lists = []

# We'll always sample 200 samples.

sample_settings = [(1, 200), (2, 100), (4, 50), (5, 40), (10, 20), (50, 4), (100, 2), (200, 1)]

sample_xticks = ["{}_{}".format(rounds, samps) for (rounds, samps) in sample_settings]

for (rounds, samps) in sample_settings:
    print("Rounds {}, samps {}".format(rounds, samps))
    perm_oracle_acc_lists.append(
        experiment_utils.attack_experiment(subsample_data_array, subsample_k_means, rounds,
        samps, method="random", retraining="query")
    )
    perm_random_acc_lists.append(
        experiment_utils.attack_experiment(subsample_data_array, subsample_k_means, rounds,
        samps, method="random", retraining="online")
    )
    perm_distance_acc_lists.append(
        experiment_utils.attack_experiment(subsample_data_array, subsample_k_means, rounds,
        samps, method="distance", retraining="online")
    )
experiment_utils.acc_plot(
    [perm_oracle_acc_lists, perm_random_acc_lists, perm_distance_acc_lists],
    ["oracle", "random", "distance"], xticks=sample_xticks
)