In [3]:
import numpy as np
import matplotlib.pyplot as plt
import random
from Minibatchalg import  cost, mini_kmeans, compute_distances
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.types import StructType, StructField, DoubleType,IntegerType
from pyspark.sql import Row
from pyspark.sql.functions import array, lit,udf
import time
from sklearn.metrics import silhouette_samples, silhouette_score
import pandas as pd


## MiniBatch K-means Algorithm

### Clustering Error and Compute distance

In [1]:
def clustering_error_inf(centroids, data_points):
    num_points = data_points.shape[0]
    
    num_clusters = centroids.shape[0]
    
   
    # Quadratic distance between given points and cluster centroids
    distances = np.zeros(num_points)
    for i in range(num_points):
        point = data_points[i]
        distances[i] = np.min(np.sum((centroids - point) ** 2, axis=1))

    # Mean Squared Error (MSE)
    mse = np.sum(distances) / num_points

    return mse

def compute_distances(X, centroids):
    """ 
    Return Euclidean distance between points and centroids
    assuming both to have the same number of dimensions
    """
    # Assuming X and centroids have the same number of dimensions
    s_sq_difference = np.sum((X - centroids)**2, axis=1)
    distances = np.sqrt(s_sq_difference)
    
    return distances

def compute_distance(point, centroids):
    return np.linalg.norm(point - centroids, axis=1) ** 2




### Mean Cost Function

In [2]:
def cost(X, centroids):
    """
    Estimation of algorithm mean cost function.
    
    Args:
        X (numpy array): (n_samples, n_features).
        centroids (numpy array): (n_clusters, n_features).
    
    Returns:
        float
    """
    n_samples = X.shape[0]
    n_clusters = centroids.shape[0]

    # Quadratic distance between points and centroids
    distances = np.zeros(n_samples)
    for i in range(n_samples):
        point = X[i]
        squared_distances = np.sum((centroids - point) ** 2, axis=1)
        min_distance = np.min(squared_distances)
        distances[i] = min_distance

    # Total cost with minimum distance
    total_cost = np.sum(distances)
    
    return total_cost/n_samples




### Mini Batch K-means

In [3]:
def mini_kmeans(X, sc, num_clusters=5, iteration=10, batch_size=20, n_partitions=20):
    k = num_clusters
    b = batch_size
    t = iteration
    N = X.shape[0]
    C = X[np.random.choice(N, k, replace=False)]
    v = np.zeros(len(C))
    Y = np.empty(N, dtype=np.int32)
    times = []
    mse = []

    for i in range(t):
        start = time.time()
        mini_batch_idx = np.random.choice(range(N), size=b, replace=False)
        mini_batch = X[mini_batch_idx]

        if n_partitions is not None:
            dist_mini_batch_idx = sc.parallelize(mini_batch_idx, n_partitions)
        else:
            dist_mini_batch_idx = sc.parallelize(mini_batch_idx)

        distances_rdd = dist_mini_batch_idx.map(lambda x: (x, compute_distance(X[x], C)))
        nearest_cluster_rdd = distances_rdd.map(lambda x: (x[0], np.argmin(x[1], axis=0)))
        Y[mini_batch_idx] = nearest_cluster_rdd.values().collect()

        for idx in mini_batch_idx:
            v[Y[idx]] += 1
            learning_rate = 1 / v[Y[idx]]
            C[Y[idx]] = (1 - learning_rate) * C[Y[idx]] + learning_rate * X[idx]

        end = time.time()
        times.append(end - start)

        # Mean Squared Error
        mse_value = np.mean([np.min(compute_distance(X[i], C)) for i in mini_batch_idx])
        mse.append(mse_value)

    return C, times, mse

# Data
* Data has been provided from scikit learn. The choosen subset is target. Each sample has a value of 1 in its categories, and 0 in others. The array has 3.15% of non zero values.

* In this specific case the data are not imported in Spark cluster but in local machine, which is the master.

* The dimension of dataset is 3000 rows for 103 columns. The amount of rows has been chosen not to overfill Spark cluster. It could be contradictory with the fact that data are effectively in the local machine, but this choice aims to show a consistent data analysis when the comparison between different methods is provided. 

In [4]:
from sklearn.datasets import fetch_rcv1
rcv1 = fetch_rcv1()

rcv1_re =  rcv1.target[0:3000].toarray() 


rcv1_re.shape

(3000, 103)

In [5]:
df_rcv1=pd.DataFrame(rcv1_re)
df_rcv1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,93,94,95,96,97,98,99,100,101,102
0,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2998,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,1


## Spark Configuration 
The cluster was composed by three virtual machines provided by CloudVeneto. The machines used for this project had the following charachteristics:

    | VM | IP Address | Memory  | RAM | Number of Cores |
    |----|------------|---------|-----|-----------------|-------|
    | VM2|10.67.22.233| 25 GB   | 8GB | 4               |Master |
    | VM1|10.67.22.219| 25 GB   | 8GB | 4               |Slave01|
    | VM3|10.67.22.157| 25 GB   | 8GB | 4               |Slave02|
    
* Spark configuration in order to select how many cores are effectively used to do the calculations is `.config("spark.cores.max","1")` because using `config("spark.executors.cores","1")` the application setted 1 core for the slave01 and 1 core for slave02, which prevents testing on the desired number of core.
* Memory is setted for 6GB because was the maximum memory that application allowed to run the cluster.



## Experiments
* The following cells are varying  the number of partition in order to test the time execution and mean cost function of the algorithm.
* For each number of cores a spark configuration has been initialized and the other parameter remained fixed.
* In the experiments each procedure is repeated 3 times in order to estimate the mean value and stadard deviation for each quantity to measure.
* A the end of the notebook it has been run a measure of mean squared error between clusterized points and centroids in order to compare them with other methods.


In [5]:
part=np.array([1,2,4,6,8,10,12,14,16,18,20,22,34,36,38,64])

### 1 core

In [6]:
sc.stop()

In [7]:
spark = SparkSession.builder \
    .appName("Mini Batch K-means")\
    .config("spark.cores.max","1")\
    .config("spark.executor.memory", "6g")\
    .getOrCreate()

# create a spark context
sc = spark.sparkContext
sc


In [11]:
ex_time_1 = []
err_1 = []
std_time_1 = []
std_cost_1 = []
num_repeats = 3 

for i in part:
    repeat_times = []
    repeat_cost = []
    
    for _ in range(num_repeats):
        start = time.time()
        C_k, timeS, mse = mini_kmeans(rcv1_re, sc, n_partitions=i, num_clusters=5)
        end = time.time()
        repeat_times.append(end - start)
        repeat_cost.append(cost(rcv1_re, C_k))
    
    ex_time_1.append(np.mean(repeat_times))
    err_1.append(np.mean(repeat_cost))
    std_time_1.append(np.std(repeat_times))
    std_cost_1.append(np.std(repeat_cost))



                                                                                

In [12]:
data_1 = {
    'Partition': part,
    'Execution time (s)': ex_time_1,
    'Time standard deviation (s)':std_time_1,
    'Mean Cost Function':err_1,
    'Mean Cost Function standard deviation':std_cost_1,
    'Number of cores': '1'
}

results_df_1=pd.DataFrame(data_1)

results_df_1.to_csv('results_df_1.csv', index=False)

results_df_1

Unnamed: 0,Partition,Execution time (s),Time standard deviation (s),Mean Cost Function,Mean Cost Function standard deviation,Number of cores
0,1,1.673424,0.050404,2.801444,0.215811,1
1,2,2.268008,0.05483,2.625222,0.123924,1
2,4,3.303739,0.014049,2.703667,0.181032,1
3,6,4.329588,0.014511,2.911333,0.142647,1
4,8,5.422429,0.059375,2.895,0.247082,1
5,10,6.352356,0.012889,2.866111,0.032656,1
6,12,7.363673,0.023223,2.980556,0.018859,1
7,14,8.550364,0.103808,2.709889,0.147607,1
8,16,9.622823,0.046965,2.844444,0.150524,1
9,18,10.564776,0.065713,2.910333,0.098096,1


### 2 cores

In [13]:
sc.stop()

In [14]:
spark = SparkSession.builder \
    .appName("Mini Batch K-means")\
    .config("spark.cores.max","2")\
    .config("spark.executor.memory", "6g")\
    .getOrCreate()

# create a spark context
sc = spark.sparkContext

sc

In [15]:
ex_time_2 = []
err_2 = []
std_time_2 = []
std_cost_2 = []
num_repeats = 3 

for i in part:
    repeat_times = []
    repeat_cost = []
    
    for _ in range(num_repeats):
        start = time.time()
        C_k, timeS, mse = mini_kmeans(rcv1_re, sc, n_partitions=i, num_clusters=5)
        end = time.time()
        repeat_times.append(end - start)
        repeat_cost.append(cost(rcv1_re, C_k))
    
    ex_time_2.append(np.mean(repeat_times))
    err_2.append(np.mean(repeat_cost))
    std_time_2.append(np.std(repeat_times))
    std_cost_2.append(np.std(repeat_cost))


                                                                                

In [16]:
data_2 = {
    'Partition': part,
    'Execution time (s)': ex_time_2,
    'Time standard deviation (s)':std_time_2,
    'Mean Cost Function':err_2,
    'Mean Cost Function standard deviation':std_cost_2,
    'Number of cores': '2'
}

results_df_2=pd.DataFrame(data_2)

results_df_2.to_csv('results_df_2.csv', index=False)

results_df_2

Unnamed: 0,Partition,Execution time (s),Time standard deviation (s),Mean Cost Function,Mean Cost Function standard deviation,Number of cores
0,1,2.23588,0.667082,2.517,0.014432,2
1,2,1.659717,0.026937,2.941556,0.124435,2
2,4,2.205217,0.012788,2.694333,0.113138,2
3,6,2.727543,0.021563,2.591889,0.031861,2
4,8,3.27202,0.03089,2.789,0.184572,2
5,10,3.841351,0.03919,2.852222,0.199427,2
6,12,4.542056,0.082512,2.662444,0.18919,2
7,14,4.970104,0.045233,2.672333,0.118431,2
8,16,5.480134,0.03577,2.841222,0.109335,2
9,18,5.952064,0.051607,2.648111,0.24751,2


### 3 cores

In [1]:
sc.stop()

In [18]:
spark = SparkSession.builder \
    .appName("Mini Batch K-means")\
    .config("spark.cores.max","3")\
    .config("spark.executor.memory", "6g")\
    .getOrCreate()

# create a spark context
sc = spark.sparkContext

sc

In [19]:
ex_time_3 = []
err_3 = []
std_time_3 = []
std_cost_3 = []
num_repeats = 3 

for i in part:
    repeat_times = []
    repeat_cost = []
    
    for _ in range(num_repeats):
        start = time.time()
        C_k, timeS, mse = mini_kmeans(rcv1_re, sc, n_partitions=i, num_clusters=5)
        end = time.time()
        repeat_times.append(end - start)
        repeat_cost.append(cost(rcv1_re, C_k))
    
    ex_time_3.append(np.mean(repeat_times))
    err_3.append(np.mean(repeat_cost))
    std_time_3.append(np.std(repeat_times))
    std_cost_3.append(np.std(repeat_cost))

                                                                                

In [42]:
data_3 = {
    'Partition': part,
    'Execution time (s)': ex_time_3,
    'Time standard deviation (s)':std_time_3,
    'Mean Cost Function':err_3,
    'Mean Cost Function standard deviation':std_cost_3,
    'Number of cores': '3'
}

results_df_3=pd.DataFrame(data_3)

results_df_3.to_csv('results_df_3.csv', index=False)

results_df_3

Unnamed: 0,Partition,Execution time (s),Time standard deviation (s),Mean Cost Function,Mean Cost Function standard deviation,Number of cores
0,1,2.766547,1.47652,2.646556,0.230124,3
1,2,1.598418,0.013339,2.729889,0.107833,3
2,4,2.128213,0.027513,2.823667,0.185529,3
3,6,2.152659,0.048061,2.816,0.232712,3
4,8,2.614647,0.010839,2.911222,0.121457,3
5,10,3.060284,0.032477,2.730889,0.158553,3
6,12,3.285773,0.042221,2.685222,0.218263,3
7,14,3.77515,0.0383,2.573667,0.039204,3
8,16,4.088126,0.01576,2.616,0.106217,3
9,18,4.287768,0.021477,2.751111,0.09819,3


### 4 cores

In [21]:
sc.stop()

In [22]:
spark = SparkSession.builder \
    .appName("Mini Batch K-means")\
    .config("spark.cores.max","4")\
    .config("spark.executor.memory", "6g")\
    .getOrCreate()

# create a spark context
sc = spark.sparkContext

sc

In [23]:
ex_time_4 = []
err_4 = []
std_time_4 = []
std_cost_4 = []
num_repeats = 3 

for i in part:
    repeat_times = []
    repeat_cost = []
    
    for _ in range(num_repeats):
        start = time.time()
        C_k, timeS, mse = mini_kmeans(rcv1_re, sc, n_partitions=i, num_clusters=5)
        end = time.time()
        repeat_times.append(end - start)
        repeat_cost.append(cost(rcv1_re, C_k))
    
    ex_time_4.append(np.mean(repeat_times))
    err_4.append(np.mean(repeat_cost))
    std_time_4.append(np.std(repeat_times))
    std_cost_4.append(np.std(repeat_cost))

                                                                                

In [24]:
data_4 = {
    'Partition': part,
    'Execution time (s)': ex_time_4,
    'Time standard deviation (s)':std_time_4,
    'Mean Cost Function':err_4,
    'Mean Cost Function standard deviation':std_cost_4,
    'Number of cores': '4'
}

results_df_4=pd.DataFrame(data_4)

results_df_4.to_csv('results_df_4.csv', index=False)

results_df_4

Unnamed: 0,Partition,Execution time (s),Time standard deviation (s),Mean Cost Function,Mean Cost Function standard deviation,Number of cores
0,1,2.764341,1.42311,2.716778,0.189448,4
1,2,1.697023,0.045487,2.750333,0.1592,4
2,4,1.967537,0.323611,2.801778,0.075043,4
3,6,2.696606,0.641543,2.720556,0.168405,4
4,8,2.306519,0.023563,2.780889,0.104294,4
5,10,2.66681,0.054812,2.859778,0.121231,4
6,12,2.910332,0.02889,2.759111,0.072533,4
7,14,3.138177,0.027747,2.832889,0.153313,4
8,16,3.380862,0.009696,2.954333,0.030769,4
9,18,3.649886,0.0187,2.685667,0.246528,4


### 5 cores

In [25]:
sc.stop()

In [26]:
spark = SparkSession.builder \
    .appName("Mini Batch K-means")\
    .config("spark.cores.max","5")\
    .config("spark.executor.memory", "6g")\
    .getOrCreate()

# create a spark context
sc = spark.sparkContext

sc

In [27]:
ex_time_5 = []
err_5 = []
std_time_5 = []
std_cost_5 = []
num_repeats = 3 

for i in part:
    repeat_times = []
    repeat_cost = []
    
    for _ in range(num_repeats):
        start = time.time()
        C_k, timeS, mse = mini_kmeans(rcv1_re, sc, n_partitions=i, num_clusters=5)
        end = time.time()
        repeat_times.append(end - start)
        repeat_cost.append(cost(rcv1_re, C_k))
    
    ex_time_5.append(np.mean(repeat_times))
    err_5.append(np.mean(repeat_cost))
    std_time_5.append(np.std(repeat_times))
    std_cost_5.append(np.std(repeat_cost))

                                                                                

In [28]:
data_5 = {
    'Partition': part,
    'Execution time (s)': ex_time_5,
    'Time standard deviation (s)':std_time_5,
    'Mean Cost Function':err_5,
    'Mean Cost Function standard deviation':std_cost_5,
    'Number of cores': '5'
}

results_df_5=pd.DataFrame(data_5)

results_df_5.to_csv('results_df_5.csv', index=False)

results_df_5

Unnamed: 0,Partition,Execution time (s),Time standard deviation (s),Mean Cost Function,Mean Cost Function standard deviation,Number of cores
0,1,2.784476,1.517641,2.716222,0.242738,5
1,2,1.609984,0.041134,2.914889,0.059023,5
2,4,1.717059,0.060404,2.794778,0.154828,5
3,6,2.088434,0.016745,2.695556,0.061962,5
4,8,2.12374,0.029016,2.812667,0.149149,5
5,10,2.298653,0.077113,2.683111,0.0404,5
6,12,2.61573,0.04418,2.607,0.080143,5
7,14,2.647908,0.029534,2.701889,0.120696,5
8,16,3.044333,0.006506,2.605556,0.102018,5
9,18,3.090394,0.010589,2.702,0.214221,5


### 6 cores

In [29]:
sc.stop()

In [30]:
spark = SparkSession.builder \
    .appName("Mini Batch K-means")\
    .config("spark.cores.max","6")\
    .config("spark.executor.memory", "6g")\
    .getOrCreate()

# create a spark context
sc = spark.sparkContext

sc

In [31]:
ex_time_6 = []
err_6 = []
std_time_6 = []
std_cost_6 = []
num_repeats = 3 

for i in part:
    repeat_times = []
    repeat_cost = []
    
    for _ in range(num_repeats):
        start = time.time()
        C_k, timeS, mse = mini_kmeans(rcv1_re, sc, n_partitions=i, num_clusters=5)
        end = time.time()
        repeat_times.append(end - start)
        repeat_cost.append(cost(rcv1_re, C_k))
    
    ex_time_6.append(np.mean(repeat_times))
    err_6.append(np.mean(repeat_cost))
    std_time_6.append(np.std(repeat_times))
    std_cost_6.append(np.std(repeat_cost))

                                                                                

In [32]:
data_6 = {
    'Partition': part,
    'Execution time (s)': ex_time_6,
    'Time standard deviation (s)':std_time_6,
    'Mean Cost Function':err_6,
    'Mean Cost Function standard deviation':std_cost_6,
    'Number of cores': '6'
}

results_df_6=pd.DataFrame(data_6)

results_df_6.to_csv('results_df_6.csv', index=False)

results_df_6

Unnamed: 0,Partition,Execution time (s),Time standard deviation (s),Mean Cost Function,Mean Cost Function standard deviation,Number of cores
0,1,2.746342,1.461348,2.903,0.227591,6
1,2,1.615311,0.017426,2.668111,0.035132,6
2,4,1.694841,0.027798,2.686444,0.098086,6
3,6,1.710577,0.047908,2.934,0.09474,6
4,8,2.437383,0.428932,2.707,0.094662,6
5,10,2.121672,0.02886,2.768556,0.177366,6
6,12,2.24603,0.030669,2.657778,0.078705,6
7,14,2.606678,0.006924,2.652667,0.045966,6
8,16,2.641174,0.022743,2.746444,0.199867,6
9,18,2.90348,0.066981,2.693889,0.196165,6


### 7 cores

In [33]:
sc.stop()

In [34]:
spark = SparkSession.builder \
    .appName("Mini Batch K-means")\
    .config("spark.cores.max","7")\
    .config("spark.executor.memory", "6g")\
    .getOrCreate()

# create a spark context
sc = spark.sparkContext

sc

In [35]:
ex_time_7 = []
err_7 = []
std_time_7 = []
std_cost_7 = []
num_repeats = 3 

for i in part:
    repeat_times = []
    repeat_cost = []
    
    for _ in range(num_repeats):
        start = time.time()
        C_k, timeS, mse = mini_kmeans(rcv1_re, sc, n_partitions=i, num_clusters=5)
        end = time.time()
        repeat_times.append(end - start)
        repeat_cost.append(cost(rcv1_re, C_k))
    
    ex_time_7.append(np.mean(repeat_times))
    err_7.append(np.mean(repeat_cost))
    std_time_7.append(np.std(repeat_times))
    std_cost_7.append(np.std(repeat_cost))

                                                                                

In [36]:
data_7 = {
    'Partition': part,
    'Execution time (s)': ex_time_7,
    'Time standard deviation (s)':std_time_7,
    'Mean Cost Function':err_7,
    'Mean Cost Function standard deviation':std_cost_7,
    'Number of cores': '7'
}

results_df_7=pd.DataFrame(data_7)

results_df_7.to_csv('results_df_7.csv', index=False)

results_df_7

Unnamed: 0,Partition,Execution time (s),Time standard deviation (s),Mean Cost Function,Mean Cost Function standard deviation,Number of cores
0,1,2.719261,1.478217,2.827222,0.05387,7
1,2,1.659982,0.044408,2.767444,0.077723,7
2,4,1.687787,0.013318,2.69,0.079909,7
3,6,1.752113,0.053332,2.900444,0.05274,7
4,8,2.149248,0.013224,2.624778,0.138042,7
5,10,2.18108,0.014249,2.715778,0.118103,7
6,12,2.257012,0.045033,2.85,0.182801,7
7,14,2.452055,0.092182,2.868556,0.161578,7
8,16,2.6904,0.045827,2.625778,0.147698,7
9,18,2.806958,0.094347,2.655222,0.090026,7


### 8 cores

In [37]:
sc.stop()

In [38]:
spark = SparkSession.builder \
    .appName("Mini Batch K-means")\
    .config("spark.cores.max","8")\
    .config("spark.executor.memory", "6g")\
    .getOrCreate()

# create a spark context
sc = spark.sparkContext

sc

In [39]:
ex_time_8 = []
err_8 = []
std_time_8 = []
std_cost_8 = []
num_repeats = 3 

for i in part:
    repeat_times = []
    repeat_cost = []
    
    for _ in range(num_repeats):
        start = time.time()
        C_k, timeS, mse = mini_kmeans(rcv1_re, sc, n_partitions=i, num_clusters=5)
        end = time.time()
        repeat_times.append(end - start)
        repeat_cost.append(cost(rcv1_re, C_k))
    
    ex_time_8.append(np.mean(repeat_times))
    err_8.append(np.mean(repeat_cost))
    std_time_8.append(np.std(repeat_times))
    std_cost_8.append(np.std(repeat_cost))

                                                                                

In [40]:
data_8 = {
    'Partition': part,
    'Execution time (s)': ex_time_8,
    'Time standard deviation (s)':std_time_8,
    'Mean Cost Function':err_8,
    'Mean Cost Function standard deviation':std_cost_8,
    'Number of cores': '8'
}

results_df_8=pd.DataFrame(data_8)

results_df_8.to_csv('results_df_8.csv', index=False)

results_df_8

Unnamed: 0,Partition,Execution time (s),Time standard deviation (s),Mean Cost Function,Mean Cost Function standard deviation,Number of cores
0,1,2.746094,1.479638,2.817111,0.132072,8
1,2,1.619236,0.016646,2.768111,0.073284,8
2,4,1.682505,0.011969,2.555333,0.12397,8
3,6,1.706858,0.027577,2.636889,0.052824,8
4,8,1.875876,0.044762,3.013222,0.028228,8
5,10,2.130014,0.022629,2.610333,0.194424,8
6,12,3.386189,1.584316,2.797444,0.183817,8
7,14,2.310322,0.033291,2.710444,0.219361,8
8,16,2.544,0.037696,2.723778,0.165149,8
9,18,2.758607,0.005588,2.744,0.222632,8


### Last Experiment
Choosing 8 cores and 4 partitions as the quantity that minimizes time execution, results od mean squared error has been registered in order to compare them with other mothods.

In [5]:
sc.stop()

In [32]:
spark = SparkSession.builder \
    .appName("Mini Batch K-means")\
    .config("spark.cores.max","8")\
    .config("spark.executor.memory", "6g")\
    .getOrCreate()

# create a spark context
sc = spark.sparkContext

sc

In [36]:
mse_sc=[]
num_repeats=3
for _ in range(num_repeats):
    C_k, timeS, mse = mini_kmeans(rcv1_re, sc, n_partitions=4, num_clusters=5)
    mse_sc.append(mse)




In [37]:
mse_sc_mean = [np.mean([mse_sc[i][j] for i in range(3)]) for j in range(10)]
mse_sc_std=[np.std([mse_sc[i][j] for i in range(3)]) for j in range(10)]
print(mse_sc_mean)
print(mse_sc_std)

[2.5000000000000004, 2.6166666666666667, 2.5000000000000004, 2.65, 2.8333333333333335, 2.6999999999999997, 2.733333333333333, 2.8166666666666664, 2.85, 2.8000000000000003]
[0.14719601443879748, 0.2392116682401222, 0.2677063067368168, 0.14719601443879732, 0.37932688922470137, 0.32403703492039304, 0.28674417556808746, 0.08498365855987966, 0.14719601443879735, 0.2677063067368168]


In [38]:
sc.stop()