In [1]:
import numpy as np
from pyspark.mllib.clustering import KMeans

In [2]:
from pyspark import SparkConf, SparkContext

# Build the context through getOrCreate so that re-executing this cell reuses
# the running SparkContext instead of failing on an attempt to start a second
# one in the same process.
sparkConf = SparkConf().setMaster('local').setAppName('pyspark')
sc = SparkContext.getOrCreate(sparkConf)

In [3]:
# Read the CSV as an RDD of text lines (one string per line); the path is
# relative to the current working directory.
inputData = sc.textFile("./irisdata.csv")

In [4]:
# Sanity check: number of lines read from the file.
inputData.count()

150

In [5]:
# Peek at the first 10 raw lines to confirm the layout
# (four numeric fields followed by a text label).
inputData.take(10)

['5.1,3.5,1.4,0.2,Iris-setosa',
 '4.9,3,1.4,0.2,Iris-setosa',
 '4.7,3.2,1.3,0.2,Iris-setosa',
 '4.6,3.1,1.5,0.2,Iris-setosa',
 '5,3.6,1.4,0.2,Iris-setosa',
 '5.4,3.9,1.7,0.4,Iris-setosa',
 '4.6,3.4,1.4,0.3,Iris-setosa',
 '5,3.4,1.5,0.2,Iris-setosa',
 '4.4,2.9,1.4,0.2,Iris-setosa',
 '4.9,3.1,1.5,0.1,Iris-setosa']

In [2]:
def parseData(line):
    """Convert one comma-separated record into a numeric feature vector.

    Only the first four fields are used; anything after them (for example a
    trailing text label) is ignored.

    Args:
        line: A string of comma-separated fields whose first four entries
            parse as floats.

    Returns:
        A 1-D numpy array containing the first four fields as floats.

    Raises:
        IndexError: If the line has fewer than four fields.
        ValueError: If one of the first four fields is not a valid float.
    """
    fields = line.split(",")
    measurements = [float(fields[idx]) for idx in range(4)]
    return np.array(measurements)

In [7]:
# Turn every raw CSV line into a numeric feature array with parseData;
# the text label is dropped here.
modelInput = inputData.map(parseData)

In [8]:
# Record count after parsing. count() is an action, so this runs parseData
# over every line and would surface any line that fails to parse.
modelInput.count()

150

In [9]:
# Inspect the first 10 parsed feature arrays.
modelInput.take(10)

[array([ 5.1,  3.5,  1.4,  0.2]),
 array([ 4.9,  3. ,  1.4,  0.2]),
 array([ 4.7,  3.2,  1.3,  0.2]),
 array([ 4.6,  3.1,  1.5,  0.2]),
 array([ 5. ,  3.6,  1.4,  0.2]),
 array([ 5.4,  3.9,  1.7,  0.4]),
 array([ 4.6,  3.4,  1.4,  0.3]),
 array([ 5. ,  3.4,  1.5,  0.2]),
 array([ 4.4,  2.9,  1.4,  0.2]),
 array([ 4.9,  3.1,  1.5,  0.1])]

In [10]:
# Number of clusters (k) requested from KMeans.train.
numClusters = 3

In [11]:
# K-means starts from randomly chosen centres, so without a seed the final
# centres and total cost can differ from run to run. Pin the seed to make the
# notebook's results repeatable.
kmeansSeed = 42
KMmodel = KMeans.train(modelInput, numClusters, seed=kmeansSeed)

In [12]:
# Report the fitted cluster centres.
finalCenters = KMmodel.clusterCenters
print("Final centers: " + str(finalCenters))

# Report the model's cost evaluated on the same data it was trained on.
totalCost = KMmodel.computeCost(modelInput)
print("Total Cost: " + str(totalCost))

Final centers: [array([ 5.006,  3.418,  1.464,  0.244]), array([ 6.85384615,  3.07692308,  5.71538462,  2.05384615]), array([ 5.88360656,  2.74098361,  4.38852459,  1.43442623])]
Total Cost: 78.94506582597637


In [13]:
def parseData2(line):
    """Split one comma-separated record into its label and feature vector.

    Args:
        line: A string of comma-separated fields: four values that parse as
            floats followed by a label in the fifth position.

    Returns:
        A ``(label, features)`` tuple where ``label`` is the fifth field as a
        string and ``features`` is a 1-D numpy array of the first four fields
        as floats.

    Raises:
        IndexError: If the line has fewer than five fields.
        ValueError: If one of the first four fields is not a valid float.
    """
    fields = line.split(",")
    # Look the label up first so a short line fails on the missing label
    # before any numeric conversion is attempted.
    label = fields[4]
    features = np.array([float(fields[idx]) for idx in range(4)])
    return label, features

In [14]:
# Re-parse the raw lines, this time keeping the label:
# each element becomes a (label, feature array) pair.
checkData = inputData.map(parseData2)

In [15]:
# Sanity check: number of (label, features) pairs produced.
checkData.count()

150

In [16]:
# Inspect the first 10 (label, features) pairs.
checkData.take(10)

[('Iris-setosa', array([ 5.1,  3.5,  1.4,  0.2])),
 ('Iris-setosa', array([ 4.9,  3. ,  1.4,  0.2])),
 ('Iris-setosa', array([ 4.7,  3.2,  1.3,  0.2])),
 ('Iris-setosa', array([ 4.6,  3.1,  1.5,  0.2])),
 ('Iris-setosa', array([ 5. ,  3.6,  1.4,  0.2])),
 ('Iris-setosa', array([ 5.4,  3.9,  1.7,  0.4])),
 ('Iris-setosa', array([ 4.6,  3.4,  1.4,  0.3])),
 ('Iris-setosa', array([ 5. ,  3.4,  1.5,  0.2])),
 ('Iris-setosa', array([ 4.4,  2.9,  1.4,  0.2])),
 ('Iris-setosa', array([ 4.9,  3.1,  1.5,  0.1]))]

In [17]:
def labelWithCluster(labeledPoint):
    """Return (label, predicted cluster) for one (label, features) pair."""
    label = labeledPoint[0]
    features = labeledPoint[1]
    return (label, KMmodel.predict(features))


# Pair every original label with the cluster the fitted model assigns to it.
labelsAndPreds = checkData.map(labelWithCluster)

In [18]:
import os
import shutil

# saveAsTextFile refuses to write into an existing directory, so a second run
# of this cell would fail. Remove the output of a previous run first.
# NOTE(review): this assumes the output path is on the local filesystem
# (the context is created with master 'local') — confirm before pointing
# this at HDFS or another store.
outputDir = "KM-lap1"
if os.path.isdir(outputDir):
    shutil.rmtree(outputDir)
labelsAndPreds.saveAsTextFile(outputDir)