In [1]:
import org.apache.spark.sql.functions._
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.feature.PCA
import org.apache.spark.ml.feature.StandardScaler
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.ml.classification.{RandomForestClassifier, OneVsRest}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import Image.Processing._

## Load images and transform them into feature vectors

In [2]:
// Build the labelled dataset: for every sample class j (1..10) and every image
// k (1..10) of that class, read the PNG, pass it through autoCropImage and
// resizeImage, flatten it and pair the resulting dense vector with the class
// index as a Double label. Rows are produced in (j, k) order.
val data: Vector[(Double, org.apache.spark.ml.linalg.Vector)] = (
  for {
    j <- 1 to 10
    k <- 1 to 10
  } yield {
    // Directory and file numbers are zero-padded to three digits,
    // e.g. Sample001/img001-010.png.
    val path = f"../Datasets/Hand_drawn_characters/Sample$j%03d/img$j%03d-$k%03d.png"

    val image = readImage(path)
    val acimage = autoCropImage(image)
    val rimage = resizeImage(acimage, 10, 15)

    // Flatten once and reuse the array for both the maximum and the shift.
    val pixels = covertToArray(rimage)
    val m = pixels.max

    // Subtract the per-image maximum so the largest entry becomes 0.0.
    // NOTE(review): the original comment called this a 5400-dimensional dense
    // vector; confirm the real size against resizeImage(_, 10, 15).
    val V = Vectors.dense(pixels.map(x => x - m))

    (j.toDouble, V)
  }
).toVector

data = Vector((1.0,[0.0,0.0,0.0,0.0,-1.6777215E7,-1.6777215E7,-1.6777215E7,-1.6777215E7,0.0,0.0,0.0,0.0,-1.6777215E7,-1.6777215E7,-1.6777215E7,-1.6777215E7,-1.6777215E7,-1.6777215E7,-1.6777215E7,0.0,0.0,0.0,-1.6777215E7,-1.6777215E7,-1.6777215E7,0.0,0.0,-1.6777215E7,-1.6777215E7,0.0,0.0,-1.6777215E7,-1.6777215E7,-1.6777215E7,0.0,0.0,0.0,0.0,-1.6777215E7,-1.6777215E7,-1.6777215E7,-1.6777215E7,-1.6777215E7,0.0,0.0,0.0,0.0,0.0,-1.6777215E7,-1.6777215E7,-1.6777215E7,-1.6777215E7,0.0,0.0,0.0,0.0,0.0,0.0,-1.6777215E7,-1.6777215E7,-1.6777215E7,-1.6777215E7,0.0,0.0,0.0,0.0,0.0,0.0,-1.6777215E7,-1.6777215E7,-1.6777215E7,-1.6777215E7,0.0,0.0,0.0,0.0,0.0,0.0,-1.6777215E7,-1.6777215E7,-1.6777215E7,-1.6777215E7,0.0,0.0,0...


Vector((1.0,[0.0,0.0,0.0,0.0,-1.6777215E7,-1.6777215E7,-1.6777215E7,-1.6777215E7,0.0,0.0,0.0,0.0,-1.6777215E7,-1.6777215E7,-1.6777215E7,-1.6777215E7,-1.6777215E7,-1.6777215E7,-1.6777215E7,0.0,0.0,0.0,-1.6777215E7,-1.6777215E7,-1.6777215E7,0.0,0.0,-1.6777215E7,-1.6777215E7,0.0,0.0,-1.6777215E7,-1.6777215E7,-1.6777215E7,0.0,0.0,0.0,0.0,-1.6777215E7,-1.6777215E7,-1.6777215E7,-1.6777215E7,-1.6777215E7,0.0,0.0,0.0,0.0,0.0,-1.6777215E7,-1.6777215E7,-1.6777215E7,-1.6777215E7,0.0,0.0,0.0,0.0,0.0,0.0,-1.6777215E7,-1.6777215E7,-1.6777215E7,-1.6777215E7,0.0,0.0,0.0,0.0,0.0,0.0,-1.6777215E7,-1.6777215E7,-1.6777215E7,-1.6777215E7,0.0,0.0,0.0,0.0,0.0,0.0,-1.6777215E7,-1.6777215E7,-1.6777215E7,-1.6777215E7,0.0,0.0,0.0,0.0,0.0,-1.6777215E7,-1.6777215E7,-1.6777215E7,-1.6777215E7,-1.6777215E7,0.0,0.0,0.0,0.0,0.0,-1.6777215E7,-1.6777215E7,0.0,-1.6777215E7,-1.6777215E7,-1.6777215E7,0.0,0.0,0.0,0.0,-1.6777215E7,-1.6777215E7,0.0,0.0,-1.6777215E7,-1.6777215E7,0.0,0.0,0.0,-1.6777215E7,-1.6777215E7,-1.6777215E

## Create Dataframe

In [3]:
// Turn the in-memory (label, features) pairs into a two-column DataFrame
// and preview the first ten rows.
val df: org.apache.spark.sql.DataFrame = data.toDF("label", "features")
df.show(10, truncate = true)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|[0.0,0.0,0.0,0.0,...|
|  1.0|[0.0,0.0,0.0,-1.6...|
|  1.0|[0.0,0.0,-1.67772...|
|  1.0|[0.0,0.0,0.0,-1.6...|
|  1.0|[0.0,0.0,-1.67772...|
|  1.0|[0.0,0.0,0.0,-1.6...|
|  1.0|[0.0,0.0,-1.67772...|
|  1.0|[0.0,0.0,0.0,-1.6...|
|  1.0|[0.0,0.0,0.0,-1.6...|
|  1.0|[0.0,0.0,0.0,0.0,...|
+-----+--------------------+
only showing top 10 rows



df = [label: double, features: vector]


[label: double, features: vector]

## Split into train and test sets

In [4]:
// Randomly split the rows with weights 0.7 (training) and 0.3 (test).
// A fixed seed is supplied so the split is repeatable between runs; without it
// every execution draws a different partition and the reported results drift.
val Array(trainingData, testData) = df.randomSplit(Array(0.7, 0.3), 42L)

trainingData = [label: double, features: vector]
testData = [label: double, features: vector]


[label: double, features: vector]

## Scale the features

In [5]:
// StandardScaler stage: reads the "features" column and writes "scaledFeatures".
// All other parameters are left at their library defaults.
val scaler: StandardScaler =
  new StandardScaler().setInputCol("features").setOutputCol("scaledFeatures")

scaler = stdScal_38f8347dbaa9


stdScal_38f8347dbaa9

In [6]:
// Fit the scaler on the training rows only, so no test-set statistics are used.
val s: org.apache.spark.ml.feature.StandardScalerModel = scaler.fit(trainingData)

s = stdScal_38f8347dbaa9


stdScal_38f8347dbaa9

In [7]:
// Apply the fitted scaler to both splits; each result gains a "scaledFeatures" column.
val trainingData_s: org.apache.spark.sql.DataFrame = s.transform(trainingData)
val testData_s: org.apache.spark.sql.DataFrame = s.transform(testData)

trainingData_s = [label: double, features: vector ... 1 more field]
testData_s = [label: double, features: vector ... 1 more field]


[label: double, features: vector ... 1 more field]

## Dimensionality Reduction with PCA

In [8]:
// PCA stage: projects "scaledFeatures" onto 30 principal components,
// written to the "pcaFeatures" column.
val pca: PCA =
  new PCA().setK(30).setInputCol("scaledFeatures").setOutputCol("pcaFeatures")

pca = pca_94bccc29bb2a


pca_94bccc29bb2a

In [9]:
// Fit the PCA projection on the scaled training rows only.
val p: org.apache.spark.ml.feature.PCAModel = pca.fit(trainingData_s)

p = pca_94bccc29bb2a


pca_94bccc29bb2a

In [10]:
// Project both scaled splits with the fitted PCA model; each result gains "pcaFeatures".
val trainingData_p: org.apache.spark.sql.DataFrame = p.transform(trainingData_s)
val testData_p: org.apache.spark.sql.DataFrame = p.transform(testData_s)

trainingData_p = [label: double, features: vector ... 2 more fields]
testData_p = [label: double, features: vector ... 2 more fields]


[label: double, features: vector ... 2 more fields]

## Train the model

In [11]:
// Base learner: a random forest of 100 trees with maximum depth 10,
// trained on the PCA-reduced "pcaFeatures" column.
val rf: RandomForestClassifier =
  new RandomForestClassifier().setNumTrees(100).setMaxDepth(10).setFeaturesCol("pcaFeatures")

rf = rfc_84d6f13a73ec


rfc_84d6f13a73ec

In [12]:
// Wrap the random forest in a one-vs-rest multiclass reduction.
val ovr: OneVsRest = new OneVsRest().setClassifier(rf)

ovr = oneVsRest_e360b3753ffc


oneVsRest_e360b3753ffc

In [13]:
// Train the one-vs-rest ensemble on the PCA-projected training rows.
val model: org.apache.spark.ml.classification.OneVsRestModel = ovr.fit(trainingData_p)

model = oneVsRest_e360b3753ffc


oneVsRest_e360b3753ffc

## Make predictions

In [14]:
// Score the held-out rows (adds a "prediction" column) and preview five of them.
val predictions: org.apache.spark.sql.DataFrame = model.transform(testData_p)
predictions.show(5, truncate = true)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|      scaledFeatures|         pcaFeatures|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  1.0|[0.0,0.0,-1.67772...|[0.0,0.0,-2.17849...|[-6.8419257978742...|       1.0|
|  1.0|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|[-8.6160989504394...|       1.0|
|  1.0|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|[-10.454917865731...|       1.0|
|  2.0|[0.0,-1.6777215E7...|[0.0,-3.266464331...|[-7.6063524797896...|       2.0|
|  2.0|[0.0,0.0,-1.67772...|[0.0,0.0,-2.17849...|[0.91455562552746...|       3.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



predictions = [label: double, features: vector ... 3 more fields]


[label: double, features: vector ... 3 more fields]

In [15]:
// Confusion matrix of true label (rows) against predicted label (columns).
// The "label_prediction" key is sorted as text, so 10.0 appears right after 1.0.
predictions.stat.
  crosstab("label", "prediction").
  orderBy("label_prediction").
  show()

|label_prediction|1.0|10.0|2.0|3.0|4.0|5.0|6.0|7.0|8.0|9.0|
+----------------+---+----+---+---+---+---+---+---+---+---+
|             1.0|  3|   0|  0|  0|  0|  0|  0|  0|  0|  0|
|            10.0|  0|   2|  0|  0|  0|  0|  0|  0|  0|  0|
|             2.0|  0|   0|  2|  1|  0|  0|  0|  0|  0|  0|
|             3.0|  0|   0|  0|  3|  0|  0|  0|  0|  0|  0|
|             4.0|  0|   0|  0|  0|  2|  0|  0|  0|  0|  0|
|             5.0|  0|   0|  0|  0|  0|  3|  0|  0|  0|  0|
|             6.0|  0|   0|  0|  0|  0|  0|  5|  0|  0|  0|
|             7.0|  0|   0|  0|  0|  0|  0|  0|  4|  0|  0|
|             8.0|  0|   0|  0|  0|  0|  0|  0|  0|  3|  0|
|             9.0|  0|   1|  0|  0|  0|  0|  0|  0|  0|  3|
+----------------+---+----+---+---+---+---+---+---+---+---+



## Evaluate the model

In [16]:
// Evaluator that compares the "prediction" column with "label" using the accuracy metric.
val evaluator: MulticlassClassificationEvaluator =
  new MulticlassClassificationEvaluator().
    setMetricName("accuracy").
    setPredictionCol("prediction").
    setLabelCol("label")

evaluator = mcEval_631887364bca


mcEval_631887364bca

In [17]:
// Accuracy of the model on the held-out test predictions.
val accuracy: Double = evaluator.evaluate(predictions)

accuracy = 0.9375


0.9375