In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.types import StructType, StructField, DoubleType,IntegerType
from pyspark.sql import Row
from pyspark.sql.functions import array, lit,udf
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col
import time
from sklearn.metrics import silhouette_samples, silhouette_score

import warnings
from Minibatchalg import cost




## MiniBatch K-means algorithm
### Mean Cost Function

In [8]:
import numpy as np
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col

def cost(data, centroids):
    """
    
    Args:
        data (DataFrame): Spark Dataframe with Features Column
        centroids (list): Centroids list as numpy vectors.

    Returns:
        float: mean cost of the algorithm.
    """
    # Data Extraction
    feature_array = data.collect()
    n_samples = len(feature_array)
    n_clusters = len(centroids)

    # Distance between points and centroids
    distances = np.zeros(n_samples)
    for i in range(n_samples):
        point = feature_array[i]
        squared_distances = np.array([Vectors.squared_distance(Vectors.dense(point), Vectors.dense(centroid)) for centroid in centroids])
        min_distance = np.min(squared_distances)
        distances[i] = min_distance

    # Cost as sum of minimal distances
    total_cost = np.sum(distances)

    return total_cost / n_samples



### Compute Distance, Find Nearest Cluster, Mini K-means Spark

In [9]:

def compute_distance(point, centroids):
    return np.array([np.linalg.norm(np.array(point) - np.array(c)) for c in centroids])


def find_nearest_cluster(x):
    idx, distances = x
    return idx, np.argmin(distances)

def mini_kmeans_spark(X, sc, num_clusters=5, iteration=10, batch_size=200, n_partitions=10):
    from pyspark.sql.functions import col
    from pyspark.sql import Row
    import numpy as np
    import time

    k = num_clusters
    b = batch_size
    t = iteration
    N = X.count() 
    C = X.sample(False, k/N).collect()  
    C = [list(row) for row in C]  
    v = np.zeros(len(C))
    times = []
    mse = []

    for i in range(t):
        start = time.time()
        
        mini_batch = X.sample(False, b/N).collect()
        mini_batch = [list(row) for row in mini_batch]

        if n_partitions is not None:
            dist_mini_batch = sc.parallelize(mini_batch, n_partitions)
        else:
            dist_mini_batch = sc.parallelize(mini_batch)

        distances_rdd = dist_mini_batch.map(lambda x: compute_distance(x, C))

        # Mean squared error
        mse_value = distances_rdd.map(lambda x: min(x)).reduce(lambda x, y: x + y) / (len(mini_batch) * len(C))

        mse.append(mse_value)

        nearest_cluster_indices = distances_rdd.map(lambda x: np.argmin(x)).collect()

        
        for idx, cluster_idx in enumerate(nearest_cluster_indices):
            v[cluster_idx] += 1
            learning_rate = 1 / v[cluster_idx]
            C[cluster_idx] = (1 - learning_rate) * np.array(C[cluster_idx]) + learning_rate * np.array(mini_batch[idx])

        end = time.time()
        times.append(end - start)

    return C, times, mse


# Data
* Data has been provided from scikit learn. The choosen subset is target. Each sample has a value of 1 in its categories, and 0 in others. The array has 3.15% of non zero values.
* In this specific case Pandas Data Frame has been converted in Spark Data Frame in order to distribute not only the processes but also the dataset.
* Dataset dimensions are 3000 rows and 103 columns, this was the maximum allowed size from spark in order to run the algorithm with only one core.

In [10]:
from sklearn.datasets import fetch_rcv1
from pyspark.sql import SparkSession
import pandas as pd


rcv1 = fetch_rcv1()


In [11]:

target_df = pd.DataFrame(rcv1.target[0:3000].toarray(), columns=rcv1.target_names)
target_df


Unnamed: 0,C11,C12,C13,C14,C15,C151,C1511,C152,C16,C17,...,M11,M12,M13,M131,M132,M14,M141,M142,M143,MCAT
0,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2998,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,1


## Spark Configuration
The cluster was composed by three virtual machines provided by CloudVeneto. The machines used for this project had the following charachteristics:

    | VM | IP Address | Memory  | RAM | Number of Cores |
    |----|------------|---------|-----|-----------------|-------|
    | VM2|10.67.22.233| 25 GB   | 8GB | 4               |Master |
    | VM1|10.67.22.219| 25 GB   | 8GB | 4               |Slave01|
    | VM3|10.67.22.157| 25 GB   | 8GB | 4               |Slave02|
    
* Spark configuration in order to select how many cores are effectively used to do the calculations is `.config("spark.cores.max","1")` because using `config("spark.executors.cores","1")` the application setted 1 core for the slave01 and 1 core for slave02, which prevents testing on the desired number of core.
* Memory is set for 6GB because was the maximum memory that application allowed to run the cluster.




## Experiments

* The following cells are varying  the number of partition in order to test the time execution and mean cost function of the algorithm.
* For each number of cores a spark configuration has been initialized and the other parameter remained fixed.
* In the experiments each procedure is repeated 3 times in order to estimate the mean value and stadard deviation for each quantity to measure.
* A the end of the notebook it has been run a measure of mean squared error between clusterized points and centroids in order to compare them with other methods.



In [25]:
part=np.array([1,2,4,6,8,10,12,14,16,18,20,22,34,36,38,64])

### 1 core

In [26]:
sc.stop()

In [27]:
spark = SparkSession.builder \
    .appName("Mini Batch K-means")\
    .config("spark.cores.max", "1")\
    .config("spark.executor.memory", "6g")\
    .config("spark.sql.debug.maxToStringFields", 100)\
    .getOrCreate()

# create a spark context
sc = spark.sparkContext
# print its status
sc



In [28]:
# Crea un DataFrame Spark dalla colonna "target"
rcv1_df = spark.createDataFrame(target_df)

In [29]:
ex_time_1 = []
err_1 = []
std_time_1 = []
std_cost_1 = []
num_repeats = 3 

for i in part:
    repeat_times = []
    repeat_cost = []
    
    for _ in range(num_repeats):
        start = time.time()
        C_k, timeS, mse = mini_kmeans_spark(rcv1_df, sc, n_partitions=i, num_clusters=5)
        end = time.time()
        repeat_times.append(end - start)
        repeat_cost.append(cost(rcv1_df, C_k))
    
    ex_time_1.append(np.mean(repeat_times))
    err_1.append(np.mean(repeat_cost))
    std_time_1.append(np.std(repeat_times))
    std_cost_1.append(np.std(repeat_cost))



                                                                                

In [30]:
data_1 = {
    'Partition': part,
    'Execution time (s)': ex_time_1,
    'Time standard deviation (s)':std_time_1,
    'Mean Cost Function':err_1,
    'Mean Cost Function standard deviation':std_cost_1,
    'Number of cores': '1'
}

results_df_1=pd.DataFrame(data_1)

results_df_1.to_csv('results_df_spark_1.csv', index=False)

results_df_1

Unnamed: 0,Partition,Execution time (s),Time standard deviation (s),Mean Cost Function,Mean Cost Function standard deviation,Number of cores
0,1,4.751723,0.846321,1.964422,0.165298,1
1,2,5.163402,0.038258,2.022995,0.046407,1
2,4,7.264055,0.028834,1.876834,0.100431,1
3,6,9.417521,0.102875,2.183222,0.406273,1
4,8,11.5054,0.118574,1.864446,0.103196,1
5,10,13.682367,0.111566,1.726431,0.07027,1
6,12,15.564023,0.042187,1.926731,0.141541,1
7,14,17.789635,0.095985,1.988435,0.346239,1
8,16,19.630832,0.241079,2.243891,0.286268,1
9,18,21.914969,0.14808,2.474279,0.182371,1


### 2 cores

In [31]:
sc.stop()

In [32]:
spark = SparkSession.builder \
    .appName("Mini Batch K-means")\
    .config("spark.cores.max","2")\
    .config("spark.executor.memory", "6g")\
    .config("spark.sql.debug.maxToStringFields", 100)\
    .getOrCreate()

# create a spark context
sc = spark.sparkContext

sc

In [33]:
rcv1_df = spark.createDataFrame(target_df)

In [35]:
ex_time_2 = []
err_2 = []
std_time_2 = []
std_cost_2 = []
num_repeats = 3 

for i in part:
    repeat_times = []
    repeat_cost = []
    
    for _ in range(num_repeats):
        start = time.time()
        C_k, timeS, mse = mini_kmeans_spark(rcv1_df, sc, n_partitions=i, num_clusters=5)
        end = time.time()
        repeat_times.append(end - start)
        repeat_cost.append(cost(rcv1_df, C_k))
    
    ex_time_2.append(np.mean(repeat_times))
    err_2.append(np.mean(repeat_cost))
    std_time_2.append(np.std(repeat_times))
    std_cost_2.append(np.std(repeat_cost))

                                                                                

In [36]:
data_2 = {
    'Partition': part,
    'Execution time (s)': ex_time_2,
    'Time standard deviation (s)':std_time_2,
    'Mean Cost Function':err_2,
    'Mean Cost Function standard deviation':std_cost_2,
    'Number of cores': '2'
}

results_df_2=pd.DataFrame(data_2)

results_df_2.to_csv('results_df_spark_2.csv', index=False)

results_df_2

Unnamed: 0,Partition,Execution time (s),Time standard deviation (s),Mean Cost Function,Mean Cost Function standard deviation,Number of cores
0,1,4.437636,0.84553,1.53979,0.099078,2
1,2,3.372294,0.045393,2.125064,0.114345,2
2,4,4.689989,0.212106,1.904199,0.137685,2
3,6,5.843053,0.114571,1.899728,0.16783,2
4,8,6.846858,0.120793,2.316854,0.37173,2
5,10,7.855664,0.099024,2.019886,0.45908,2
6,12,8.843838,0.035451,2.030228,0.533819,2
7,14,9.958914,0.097729,2.1669,0.1839,2
8,16,10.815359,0.138646,1.980229,0.29473,2
9,18,12.654579,0.920163,2.089864,0.265645,2


### 3 cores

In [37]:
sc.stop()

In [38]:
spark = SparkSession.builder \
    .appName("Mini Batch K-means")\
    .config("spark.cores.max","3")\
    .config("spark.executor.memory", "6g")\
    .config("spark.sql.debug.maxToStringFields", 100)\
    .getOrCreate()

# create a spark context
sc = spark.sparkContext

sc

In [39]:
rcv1_df = spark.createDataFrame(target_df)

In [40]:
ex_time_3 = []
err_3 = []
std_time_3 = []
std_cost_3 = []
num_repeats = 3 

for i in part:
    repeat_times = []
    repeat_cost = []
    
    for _ in range(num_repeats):
        start = time.time()
        C_k, timeS, mse = mini_kmeans_spark(rcv1_df, sc, n_partitions=i, num_clusters=5)
        end = time.time()
        repeat_times.append(end - start)
        repeat_cost.append(cost(rcv1_df, C_k))
    
    ex_time_3.append(np.mean(repeat_times))
    err_3.append(np.mean(repeat_cost))
    std_time_3.append(np.std(repeat_times))
    std_cost_3.append(np.std(repeat_cost))

                                                                                

In [41]:
data_3 = {
    'Partition': part,
    'Execution time (s)': ex_time_3,
    'Time standard deviation (s)':std_time_3,
    'Mean Cost Function':err_3,
    'Mean Cost Function standard deviation':std_cost_3,
    'Number of cores': '3'
}

results_df_3=pd.DataFrame(data_3)

results_df_3.to_csv('results_df_spark_3.csv', index=False)

results_df_3

Unnamed: 0,Partition,Execution time (s),Time standard deviation (s),Mean Cost Function,Mean Cost Function standard deviation,Number of cores
0,1,5.033745,1.469048,2.410356,0.275823,3
1,2,3.265156,0.073432,1.886841,0.252112,3
2,4,4.337987,0.116351,1.744492,0.291808,3
3,6,4.380465,0.0591,1.95118,0.282791,3
4,8,5.345005,0.041866,1.933987,0.13612,3
5,10,6.319669,0.051374,1.830738,0.105833,3
6,12,6.571327,0.060674,1.741281,0.156671,3
7,14,7.352148,0.036694,1.802111,0.277893,3
8,16,8.342983,0.021912,1.914767,0.042714,3
9,18,8.579041,0.05268,2.288168,0.290701,3


### 4 cores

In [42]:
sc.stop()

In [43]:
spark = SparkSession.builder \
    .appName("Mini Batch K-means")\
    .config("spark.cores.max","4")\
    .config("spark.executor.memory", "6g")\
    .config("spark.sql.debug.maxToStringFields", 100)\
    .getOrCreate()

# create a spark context
sc = spark.sparkContext

sc

In [44]:
rcv1_df = spark.createDataFrame(target_df)

In [45]:
ex_time_4 = []
err_4 = []
std_time_4 = []
std_cost_4 = []
num_repeats = 3 

for i in part:
    repeat_times = []
    repeat_cost = []
    
    for _ in range(num_repeats):
        start = time.time()
        C_k, timeS, mse = mini_kmeans_spark(rcv1_df, sc, n_partitions=i, num_clusters=5)
        end = time.time()
        repeat_times.append(end - start)
        repeat_cost.append(cost(rcv1_df, C_k))
    
    ex_time_4.append(np.mean(repeat_times))
    err_4.append(np.mean(repeat_cost))
    std_time_4.append(np.std(repeat_times))
    std_cost_4.append(np.std(repeat_cost))

                                                                                

In [46]:
data_4 = {
    'Partition': part,
    'Execution time (s)': ex_time_4,
    'Time standard deviation (s)':std_time_4,
    'Mean Cost Function':err_4,
    'Mean Cost Function standard deviation':std_cost_4,
    'Number of cores': '4'
}

results_df_4=pd.DataFrame(data_4)

results_df_4.to_csv('results_df_spark_4.csv', index=False)

results_df_4

Unnamed: 0,Partition,Execution time (s),Time standard deviation (s),Mean Cost Function,Mean Cost Function standard deviation,Number of cores
0,1,4.632213,1.626116,2.062096,0.171144,4
1,2,3.255252,0.046183,1.88051,0.325991,4
2,4,3.44008,0.065585,1.703185,0.159244,4
3,6,4.382428,0.023962,1.926074,0.358747,4
4,8,4.514703,0.038412,1.876598,0.175716,4
5,10,5.35601,0.050685,1.82044,0.554975,4
6,12,5.739607,0.1651,2.002627,0.403126,4
7,14,6.546725,0.127492,1.589052,0.041046,4
8,16,7.042795,0.093907,1.804007,0.40969,4
9,18,7.52621,0.028523,1.967083,0.539663,4


### 5 cores

In [47]:
sc.stop()

In [48]:
spark = SparkSession.builder \
    .appName("Mini Batch K-means")\
    .config("spark.cores.max","5")\
    .config("spark.executor.memory", "6g")\
    .config("spark.sql.debug.maxToStringFields", 100)\
    .getOrCreate()

# create a spark context
sc = spark.sparkContext

sc

In [49]:
rcv1_df = spark.createDataFrame(target_df)

In [51]:
ex_time_5 = []
err_5 = []
std_time_5 = []
std_cost_5 = []
num_repeats = 3 

for i in part:
    repeat_times = []
    repeat_cost = []
    
    for _ in range(num_repeats):
        start = time.time()
        C_k, timeS, mse = mini_kmeans_spark(rcv1_df, sc, n_partitions=i, num_clusters=5)
        end = time.time()
        repeat_times.append(end - start)
        repeat_cost.append(cost(rcv1_df, C_k))
    
    ex_time_5.append(np.mean(repeat_times))
    err_5.append(np.mean(repeat_cost))
    std_time_5.append(np.std(repeat_times))
    std_cost_5.append(np.std(repeat_cost))

                                                                                

In [52]:
data_5 = {
    'Partition': part,
    'Execution time (s)': ex_time_5,
    'Time standard deviation (s)':std_time_5,
    'Mean Cost Function':err_5,
    'Mean Cost Function standard deviation':std_cost_5,
    'Number of cores': '5'
}

results_df_5=pd.DataFrame(data_5)

results_df_5.to_csv('results_df_spark_5.csv', index=False)

results_df_5

Unnamed: 0,Partition,Execution time (s),Time standard deviation (s),Mean Cost Function,Mean Cost Function standard deviation,Number of cores
0,1,3.37506,0.107259,1.898807,0.218406,5
1,2,3.163337,0.019564,1.979332,0.170117,5
2,4,3.138177,0.10702,1.599478,0.167043,5
3,6,4.217835,0.025792,1.784534,0.197906,5
4,8,4.315783,0.071153,1.54008,0.139294,5
5,10,6.191673,2.654863,2.07806,0.234166,5
6,12,5.301752,0.015409,1.640205,0.057415,5
7,14,5.429315,0.050396,2.097277,0.381232,5
8,16,6.31419,0.05893,1.859044,0.234809,5
9,18,6.527562,0.036194,1.934008,0.158829,5


### 6 cores

In [53]:
sc.stop()

In [54]:
spark = SparkSession.builder \
    .appName("Mini Batch K-means")\
    .config("spark.cores.max","6")\
    .config("spark.executor.memory", "6g")\
    .config("spark.sql.debug.maxToStringFields", 100)\
    .getOrCreate()

# create a spark context
sc = spark.sparkContext

sc

In [55]:
rcv1_df = spark.createDataFrame(target_df)

In [56]:
ex_time_6 = []
err_6 = []
std_time_6 = []
std_cost_6 = []
num_repeats = 3 

for i in part:
    repeat_times = []
    repeat_cost = []
    
    for _ in range(num_repeats):
        start = time.time()
        C_k, timeS, mse = mini_kmeans_spark(rcv1_df, sc, n_partitions=i, num_clusters=5)
        end = time.time()
        repeat_times.append(end - start)
        repeat_cost.append(cost(rcv1_df, C_k))
    
    ex_time_6.append(np.mean(repeat_times))
    err_6.append(np.mean(repeat_cost))
    std_time_6.append(np.std(repeat_times))
    std_cost_6.append(np.std(repeat_cost))

                                                                                

In [57]:
data_6 = {
    'Partition': part,
    'Execution time (s)': ex_time_6,
    'Time standard deviation (s)':std_time_6,
    'Mean Cost Function':err_6,
    'Mean Cost Function standard deviation':std_cost_6,
    'Number of cores': '6'
}

results_df_6=pd.DataFrame(data_6)

results_df_6.to_csv('results_df_spark_6.csv', index=False)

results_df_6

Unnamed: 0,Partition,Execution time (s),Time standard deviation (s),Mean Cost Function,Mean Cost Function standard deviation,Number of cores
0,1,4.708957,1.598873,1.891372,0.060769,6
1,2,3.318835,0.091417,2.009998,0.279895,6
2,4,3.332201,0.090566,2.368872,0.31805,6
3,6,3.566916,0.170022,2.003748,0.347094,6
4,8,5.927299,2.152151,1.993317,0.134744,6
5,10,4.591384,0.031561,1.978175,0.313305,6
6,12,4.702471,0.14631,2.147827,0.495191,6
7,14,5.440367,0.038838,2.111177,0.289768,6
8,16,5.586365,0.03873,1.80323,0.160468,6
9,18,5.859098,0.008977,2.002408,0.529238,6


### 7 cores

In [58]:
sc.stop()

In [59]:
spark = SparkSession.builder \
    .appName("Mini Batch K-means")\
    .config("spark.cores.max","7")\
    .config("spark.executor.memory", "6g")\
    .config("spark.sql.debug.maxToStringFields", 100)\
    .getOrCreate()

# create a spark context
sc = spark.sparkContext

sc

In [60]:
rcv1_df = spark.createDataFrame(target_df)

In [61]:
ex_time_7 = []
err_7 = []
std_time_7 = []
std_cost_7 = []
num_repeats = 3 

for i in part:
    repeat_times = []
    repeat_cost = []
    
    for _ in range(num_repeats):
        start = time.time()
        C_k, timeS, mse = mini_kmeans_spark(rcv1_df, sc, n_partitions=i, num_clusters=5)
        end = time.time()
        repeat_times.append(end - start)
        repeat_cost.append(cost(rcv1_df, C_k))
    
    ex_time_7.append(np.mean(repeat_times))
    err_7.append(np.mean(repeat_cost))
    std_time_7.append(np.std(repeat_times))
    std_cost_7.append(np.std(repeat_cost))

                                                                                

In [62]:
data_7 = {
    'Partition': part,
    'Execution time (s)': ex_time_7,
    'Time standard deviation (s)':std_time_7,
    'Mean Cost Function':err_7,
    'Mean Cost Function standard deviation':std_cost_7,
    'Number of cores': '7'
}

results_df_7=pd.DataFrame(data_7)

results_df_7.to_csv('results_df_spark_7.csv', index=False)

results_df_7

Unnamed: 0,Partition,Execution time (s),Time standard deviation (s),Mean Cost Function,Mean Cost Function standard deviation,Number of cores
0,1,4.938357,1.61587,1.738313,0.277117,7
1,2,4.928564,2.075228,1.653179,0.037833,7
2,4,5.148966,2.247287,1.790121,0.304452,7
3,6,3.687986,0.075411,1.85913,0.262563,7
4,8,4.45255,0.128248,1.980469,0.304441,7
5,10,4.536411,0.059473,2.111089,0.372724,7
6,12,4.648523,0.077495,2.079652,0.171844,7
7,14,4.967949,0.038356,1.545574,0.040996,7
8,16,5.613493,0.067984,2.007017,0.310337,7
9,18,5.855473,0.065449,1.903833,0.179933,7


### 8 cores

In [63]:
sc.stop()

In [64]:
spark = SparkSession.builder \
    .appName("Mini Batch K-means")\
    .config("spark.cores.max","8")\
    .config("spark.executor.memory", "6g")\
    .config("spark.sql.debug.maxToStringFields", 100)\
    .getOrCreate()

# create a spark context
sc = spark.sparkContext

sc

In [65]:
rcv1_df = spark.createDataFrame(target_df)

In [66]:
ex_time_8 = []
err_8 = []
std_time_8 = []
std_cost_8 = []
num_repeats = 3 

for i in part:
    repeat_times = []
    repeat_cost = []
    
    for _ in range(num_repeats):
        start = time.time()
        C_k, timeS, mse = mini_kmeans_spark(rcv1_df, sc, n_partitions=i, num_clusters=5)
        end = time.time()
        repeat_times.append(end - start)
        repeat_cost.append(cost(rcv1_df, C_k))
    
    ex_time_8.append(np.mean(repeat_times))
    err_8.append(np.mean(repeat_cost))
    std_time_8.append(np.std(repeat_times))
    std_cost_8.append(np.std(repeat_cost))

                                                                                

In [67]:
data_8 = {
    'Partition': part,
    'Execution time (s)': ex_time_8,
    'Time standard deviation (s)':std_time_8,
    'Mean Cost Function':err_8,
    'Mean Cost Function standard deviation':std_cost_8,
    'Number of cores': '8'
}

results_df_8=pd.DataFrame(data_8)

results_df_8.to_csv('results_df_spark_8.csv', index=False)

results_df_8

Unnamed: 0,Partition,Execution time (s),Time standard deviation (s),Mean Cost Function,Mean Cost Function standard deviation,Number of cores
0,1,5.099987,1.856624,1.952128,0.144949,8
1,2,3.497821,0.103702,1.911607,0.093638,8
2,4,3.609479,0.024379,1.776902,0.457992,8
3,6,3.721736,0.025763,2.058322,0.282081,8
4,8,4.109144,0.081832,1.766689,0.0836,8
5,10,4.71333,0.010549,1.81922,0.08547,8
6,12,4.863396,0.03193,2.127473,0.284002,8
7,14,5.123907,0.015663,1.666156,0.100939,8
8,16,5.453733,0.14367,1.903144,0.132617,8
9,18,5.883843,0.120097,2.096692,0.420246,8


In [4]:
sc.stop()

### Last Experiment
Choosing 8 cores and 4 partitions as the quantity that minimizes time execution, results od mean squared error has been registered in order to compare them with other mothods.

In [5]:
spark = SparkSession.builder \
    .appName("Mini Batch K-means")\
    .config("spark.cores.max","8")\
    .config("spark.executor.memory", "6g")\
    .config("spark.sql.debug.maxToStringFields", 100)\
    .getOrCreate()

# create a spark context
sc = spark.sparkContext

sc

In [12]:
rcv1_df = spark.createDataFrame(target_df)

In [13]:
C_k, timeS, mse = mini_kmeans_spark(rcv1_df, sc, n_partitions=4, num_clusters=5)

23/10/16 21:09:31 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [14]:
print(mse)

[0.7466294110963937, 0.5112228907538667, 0.5152719509558034, 0.5167080007003777, 0.5011649491184559, 0.5130507382764737, 0.497737576744222, 0.5004927742598564, 0.5122515521530647, 0.5029721605028452]


In [15]:
itera=np.arange(1,11,1)

In [17]:
mse_df=[]
num_repeats=3
for _ in range(num_repeats):
    C_k, timeS, mse = mini_kmeans_spark(rcv1_df, sc, n_partitions=4, num_clusters=5)
    mse_df.append(mse)    

In [26]:
mse_df_mean = [np.mean([mse_df[i][j] for i in range(3)]) for j in range(10)]
mse_df_std=[np.std([mse_df[i][j] for i in range(3)]) for j in range(10)]
print(mse_df_mean)
print(mse_df_std)

[0.3257165010526702, 0.2434910215208379, 0.23902546599988336, 0.23570888085877842, 0.23580463967853996, 0.2323665199152538, 0.23259276058606235, 0.23209235853582358, 0.23267835281982327, 0.23441080468205877]
[0.07605478065827204, 0.044905086846708596, 0.05052359619134658, 0.043650346075360134, 0.042292964183035936, 0.0441313458665705, 0.04875552362737942, 0.04984271773485523, 0.04258637691940688, 0.047179569497458745]
