+ Импорт данных из HDFS
+ PCA
+ K-means
+ LDA
# Импорт данных из HDFS в DataFrame

In [48]:
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.Vector

// Read the CSV with sc.textFile, parse every remaining line into a LabeledPoint,
// and expose the result as a DataFrame with columns "label" and "features".
val data = sc.textFile("/user/supp.bda08/iris.csv")
    // Skip the line that is exactly "0,1,2,3,0" (presumably the header row; confirm against the file).
    .filter(line => line != "0,1,2,3,0")
    .map { line =>
        val fields = line.split(",")
        // The first four columns are the numeric features, the fifth is the integer class label.
        val measurements = fields.take(4).map(_.toDouble)
        val species = fields(4).toInt
        LabeledPoint(species, Vectors.dense(measurements))
    }
    .toDF("label","features")

data.show(5)

data: org.apache.spark.sql.DataFrame = [label: double, features: vector]


+-----+-----------------+
|label|         features|
+-----+-----------------+
|  0.0|[5.1,3.5,1.4,0.2]|
|  0.0|[4.9,3.0,1.4,0.2]|
|  0.0|[4.7,3.2,1.3,0.2]|
|  0.0|[4.6,3.1,1.5,0.2]|
|  0.0|[5.0,3.6,1.4,0.2]|
+-----+-----------------+
only showing top 5 rows



# PCA

In [3]:
import org.apache.spark.ml.feature.PCA

// Number of principal components to keep (passed to PCA.setK below).
val numFeatures = 3

// `pca` is the fitted model: the estimator is configured and fitted inside one block expression.
val pca = {
  val estimator = new PCA()
    .setK(numFeatures)
    .setInputCol("features")
    .setOutputCol("pca_features")
  estimator.fit(data)
}

// Adds the projected vectors as the "pca_features" column next to the original columns.
val pcaDF = pca.transform(data)

pcaDF.show(numRows = 5, truncate = false)

numFeatures: Int = 3
pca: org.apache.spark.ml.feature.PCAModel = pca_5c0c808881e9
pcaDF: org.apache.spark.sql.DataFrame = [label: double, features: vector, pca_features: vector]


+-----+-----------------+-----------------------------------------------------------+
|label|features         |pca_features                                               |
+-----+-----------------+-----------------------------------------------------------+
|0.0  |[5.1,3.5,1.4,0.2]|[-2.8271359726790233,-5.641331045573369,0.6642769315103052]|
|0.0  |[4.9,3.0,1.4,0.2]|[-2.7959524821488415,-5.145166883252961,0.8462865195138336]|
|0.0  |[4.7,3.2,1.3,0.2]|[-2.621523558165056,-5.177378121203952,0.6180558535093933] |
|0.0  |[4.6,3.1,1.5,0.2]|[-2.764905900474237,-5.003599415056986,0.6050931192230615] |
|0.0  |[5.0,3.6,1.4,0.2]|[-2.7827501159516568,-5.648648294377431,0.5465353947337392]|
+-----+-----------------+-----------------------------------------------------------+
only showing top 5 rows



# K-means

In [4]:
import org.apache.spark.ml.clustering.KMeans

// Estimator configuration: three clusters computed from the "features" column,
// with the assigned cluster index written to "prediction".
val kmeans = new KMeans()
  .setFeaturesCol("features")
  .setPredictionCol("prediction")
  .setK(3)

// Fit the clustering model on the full dataset.
val model = kmeans.fit(data)

// Same rows as `data`, plus the "prediction" column.
val results = model.transform(data)

// Print up to 150 rows.
results.show(numRows = 150)

kmeans: org.apache.spark.ml.clustering.KMeans = kmeans_d618455b4b83
model: org.apache.spark.ml.clustering.KMeansModel = kmeans_d618455b4b83
results: org.apache.spark.sql.DataFrame = [label: double, features: vector, prediction: int]


+-----+-----------------+----------+
|label|         features|prediction|
+-----+-----------------+----------+
|  0.0|[5.1,3.5,1.4,0.2]|         2|
|  0.0|[4.9,3.0,1.4,0.2]|         2|
|  0.0|[4.7,3.2,1.3,0.2]|         2|
|  0.0|[4.6,3.1,1.5,0.2]|         2|
|  0.0|[5.0,3.6,1.4,0.2]|         2|
|  0.0|[5.4,3.9,1.7,0.4]|         2|
|  0.0|[4.6,3.4,1.4,0.3]|         2|
|  0.0|[5.0,3.4,1.5,0.2]|         2|
|  0.0|[4.4,2.9,1.4,0.2]|         2|
|  0.0|[4.9,3.1,1.5,0.1]|         2|
|  0.0|[5.4,3.7,1.5,0.2]|         2|
|  0.0|[4.8,3.4,1.6,0.2]|         2|
|  0.0|[4.8,3.0,1.4,0.1]|         2|
|  0.0|[4.3,3.0,1.1,0.1]|         2|
|  0.0|[5.8,4.0,1.2,0.2]|         2|
|  0.0|[5.7,4.4,1.5,0.4]|         2|
|  0.0|[5.4,3.9,1.3,0.4]|         2|
|  0.0|[5.1,3.5,1.4,0.3]|         2|
|  0.0|[5.7,3.8,1.7,0.3]|         2|
|  0.0|[5.1,3.8,1.5,0.3]|         2|
|  0.0|[5.4,3.4,1.7,0.2]|         2|
|  0.0|[5.1,3.7,1.5,0.4]|         2|
|  0.0|[4.6,3.6,1.0,0.2]|         2|
|  0.0|[5.1,3.3,1.7,0.5]|         2|
|

# LDA

In [46]:
import org.apache.spark.ml.clustering.LDA

// NOTE(review): org.apache.spark.ml.clustering.LDA is a topic model (Latent Dirichlet Allocation),
// while `data` holds continuous measurements rather than term counts. Confirm this is intended
// and not a mix-up with Linear Discriminant Analysis.
val lda = new LDA()
  .setMaxIter(10)
  .setK(3)

// NOTE: this re-binds `model`, which the K-means cell also defines.
val model = lda.fit(data)

// Adds a "topicDistribution" vector column to every row.
val transformed = model.transform(data)

transformed.printSchema()

lastException: Throwable = null
lda: org.apache.spark.ml.clustering.LDA = lda_a8ba1bba1ae2
model: org.apache.spark.ml.clustering.LDAModel = lda_a8ba1bba1ae2
transformed: org.apache.spark.sql.DataFrame = [label: double, features: vector, topicDistribution: vector]


root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- topicDistribution: vector (nullable = true)



In [6]:
// Expects `model` to be the LDA model: the K-means cell also binds a `model`,
// so this cell must run after the LDA cell.
val ll = model.logLikelihood(data)
val lp = model.logPerplexity(data)

// Up to three highest-weighted term indices per topic, with their weights.
val topics = model.describeTopics(maxTermsPerTopic = 3)

topics.show(truncate = false)

ll: Double = -2936.162301925541
lp: Double = 1.4128391198190156
topics: org.apache.spark.sql.DataFrame = [topic: int, termIndices: array<int>, termWeights: array<double>]


+-----+-----------+--------------------------------------------------------------+
|topic|termIndices|termWeights                                                   |
+-----+-----------+--------------------------------------------------------------+
|0    |[0, 2, 1]  |[0.3981220591570822, 0.31100639718910134, 0.18658450895271134]|
|1    |[0, 1, 2]  |[0.4832126027590587, 0.3377224514368744, 0.14439829995250877] |
|2    |[1, 2, 3]  |[0.27068789773607044, 0.25584426390176196, 0.238085549087821] |
+-----+-----------+--------------------------------------------------------------+



In [61]:
import org.apache.spark.sql.Row
// Reuse the SQLContext already associated with this SparkContext instead of constructing
// a second one; the implicits below are what enable `.toDF()` on the mapped result.
val sqlContext = org.apache.spark.sql.SQLContext.getOrCreate(sc)
import sqlContext.implicits._

// One output row: the original label and features plus the 1-based number of the
// topic that has the largest weight in the row's topicDistribution.
case class myRow(label: Double, features: Vector, numDistribution: Int)

transformed.map {
    case Row(label: Double, features: Vector, topicDistribution: Vector) =>
        // zipWithIndex covers every topic regardless of the LDA `k`; a fixed `1 to 3` range
        // would silently drop topics beyond the third. The + 1 keeps the numbering 1-based.
        val num = topicDistribution.toArray.zipWithIndex.maxBy(_._1)._2 + 1
        myRow(label, features, num)
}.toDF().show(150)

defined class myRow


sqlContext: org.apache.spark.sql.SQLContext = org.apache.spark.sql.SQLContext@3403cce2


+-----+-----------------+---------------+
|label|         features|numDistribution|
+-----+-----------------+---------------+
|  0.0|[5.1,3.5,1.4,0.2]|              2|
|  0.0|[4.9,3.0,1.4,0.2]|              2|
|  0.0|[4.7,3.2,1.3,0.2]|              2|
|  0.0|[4.6,3.1,1.5,0.2]|              2|
|  0.0|[5.0,3.6,1.4,0.2]|              2|
|  0.0|[5.4,3.9,1.7,0.4]|              2|
|  0.0|[4.6,3.4,1.4,0.3]|              2|
|  0.0|[5.0,3.4,1.5,0.2]|              2|
|  0.0|[4.4,2.9,1.4,0.2]|              2|
|  0.0|[4.9,3.1,1.5,0.1]|              2|
|  0.0|[5.4,3.7,1.5,0.2]|              2|
|  0.0|[4.8,3.4,1.6,0.2]|              2|
|  0.0|[4.8,3.0,1.4,0.1]|              2|
|  0.0|[4.3,3.0,1.1,0.1]|              2|
|  0.0|[5.8,4.0,1.2,0.2]|              2|
|  0.0|[5.7,4.4,1.5,0.4]|              2|
|  0.0|[5.4,3.9,1.3,0.4]|              2|
|  0.0|[5.1,3.5,1.4,0.3]|              2|
|  0.0|[5.7,3.8,1.7,0.3]|              2|
|  0.0|[5.1,3.8,1.5,0.3]|              2|
|  0.0|[5.4,3.4,1.7,0.2]|         