In [5]:
import sys
import random
import numpy as np

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.mllib.random import RandomRDDs


# Build (or reuse, if one already exists) a Spark session running in local
# mode with Hive support enabled. Parentheses replace backslash continuations
# so a missing trailing backslash cannot silently break the chain.
spark = (
    SparkSession.builder
    .master('local[*]')
    .enableHiveSupport()
    .getOrCreate()
)

# Handle to the underlying SparkContext, used by the RDD-based cells below.
sc = spark.sparkContext

In [3]:
# inputs -- every tunable setting of the generator lives in this cell

# name of the output to be generated (passed to saveAsTextFile)
file_name = 'output.csv'

# number of points to be generated
points = 1000

# number of clusters
count_cluster = 3

# dimension of the data (coordinates per point)
dimension = 4

# standard deviation applied to each generated coordinate
std = 1

# number of noise points to be generated: double the number of points
# NOTE(review): not referenced by any of the visible cells -- confirm it is used.
noise_points = 2 * points

 

In [144]:
# methods 
def calculate_rdn_with_std(centroids, points, std, cluster, dimension):
    """Place a sample around a centroid and tag it with its cluster.

    For each of the first ``dimension`` coordinates the result is
    ``centroids[d] + points[d] * std``, rounded to two decimal places.

    Args:
        centroids: indexable sequence of centroid coordinates.
        points: indexable sequence of per-coordinate offsets.
        std: factor each offset is multiplied by before being added.
        cluster: label returned unchanged alongside the coordinates.
        dimension: how many leading coordinates to compute.

    Returns:
        A ``(values, cluster)`` tuple where ``values`` is a list of
        ``dimension`` rounded coordinates.

    Raises:
        IndexError: if ``dimension`` exceeds the length of either sequence.
    """
    shifted = [
        round(centroids[axis] + points[axis] * std, 2)
        for axis in range(dimension)
    ]
    return (shifted, cluster)
 
    
def format_as_csv(x):
    """Render a ``(values, label)`` pair as one comma-separated line.

    Args:
        x: pair whose first item is an iterable of values and whose second
            item is the trailing label.

    Returns:
        A string with every value followed by a comma and the label last,
        e.g. ``([1.5, 2.25], 0)`` -> ``'1.5,2.25,0'``. An empty iterable
        yields just the label.
    """
    values, label = x[0], x[1]
    leading = ''.join('{},'.format(item) for item in values)
    return leading + '{}'.format(label)

In [145]:
 
# One RDD element per cluster id: 0 .. count_cluster - 1.
clusters_rdd = sc.parallelize(range(count_cluster))

# Pair each cluster id with a random centroid: `dimension` distinct coordinates
# drawn without replacement from the grid 0.0, 0.1, ..., 99.9.
# NOTE(review): the draw is unseeded and the RDD is not cached, so a
# recomputation of this RDD may yield different centroids -- confirm whether
# reproducible output is required.
centroids_rdd = clusters_rdd.map(
    lambda x: (x, random.sample(list(np.arange(0, 100, 0.1)), k=dimension))
)

# Build the labelled CSV lines. The chain is wrapped in parentheses instead of
# relying on backslash continuations: the original was missing one after the
# first .map(...), which made the following .join(...) line a SyntaxError.
result = (
    RandomRDDs
    # `points` random vectors with `dimension` normally distributed columns
    .normalVectorRDD(sc, numRows=points, numCols=dimension, numPartitions=count_cluster)
    # assign every vector to a uniformly chosen cluster id -> (cluster, offsets)
    .map(lambda x: (random.randint(0, count_cluster - 1), list(x)))
    # attach the centroid of that cluster -> (cluster, (offsets, centroid))
    .join(centroids_rdd)
    # shift/scale the offsets around the centroid -> (values, cluster)
    .map(lambda x: calculate_rdn_with_std(x[1][1], x[1][0], std, x[0], dimension))
    # one comma-separated text line per point, cluster label last
    .map(format_as_csv)
)
 


['14.56,63.35,56.96,56.48,0',
 '15.65,60.99,56.68,54.42,0',
 '15.48,60.79,57.85,54.04,0',
 '15.36,64.07,57.25,55.03,0',
 '15.91,63.57,57.44,55.85,0',
 '15.76,63.24,56.18,54.96,0',
 '14.06,62.46,56.02,55.22,0',
 '14.1,62.9,55.59,54.5,0',
 '17.2,60.4,56.87,55.45,0',
 '15.92,61.7,58.84,55.22,0',
 '13.54,62.08,56.0,55.56,0',
 '15.43,62.57,55.98,53.47,0',
 '15.77,62.04,57.07,54.64,0',
 '15.53,62.06,56.33,54.76,0',
 '16.14,62.71,55.13,54.94,0',
 '14.76,63.95,54.84,54.55,0',
 '15.31,62.33,56.96,54.97,0',
 '15.55,62.74,57.34,55.33,0',
 '16.03,61.8,56.22,56.67,0',
 '14.72,61.31,57.93,54.55,0',
 '15.64,62.32,55.85,57.07,0',
 '14.85,62.79,56.7,53.78,0',
 '14.73,61.59,56.02,54.03,0',
 '13.01,60.41,54.97,55.4,0',
 '15.64,62.72,55.51,55.83,0',
 '15.29,63.38,55.48,54.24,0',
 '15.02,62.08,56.52,56.66,0',
 '14.1,61.07,57.34,56.36,0',
 '16.67,61.95,55.72,54.96,0',
 '15.26,62.22,58.23,54.48,0',
 '13.76,62.75,56.06,56.71,0',
 '15.84,60.66,58.2,54.37,0',
 '15.53,60.98,57.55,52.64,0',
 '16.8,61.66,56.59,55.

In [140]:
# Collapse the RDD to a single partition before writing, then save it as text
# under `file_name`.
# NOTE(review): Spark's saveAsTextFile writes a directory at that path and
# refuses to overwrite an existing one -- confirm this is acceptable on re-runs.
(
    result
    .coalesce(1)
    .saveAsTextFile(file_name)
)