In [3]:
%use krangl, lets-plot

In [4]:
/**
 * Classification tree that greedily chooses axis-aligned threshold splits maximising the
 * reduction of Gini impurity.
 *
 * @param maxDepth depth at which a node is forced to become a leaf. Node depth starts at 0 and
 *   only grows, so a negative value never matches and the tree grows until no split is possible.
 * @param featuresLimit when true, each node only scans a random subset of
 *   floor(sqrt(featureCount)) features instead of all of them.
 */
class DecisionTree(private val maxDepth: Int = 5, private val featuresLimit: Boolean = false) {

    /** Training feature vectors; assigned by [fit]. */
    lateinit var x: List<DoubleArray>

    /** Training labels, indexed in parallel with [x]; assigned by [fit]. */
    lateinit var y: List<Double>

    /** Root of the trained tree, or null until [fit] has been called. */
    var root: Node? = null

    /**
     * Trains the tree on the given objects and labels, replacing any previously trained tree.
     *
     * @throws IllegalArgumentException if [x] is empty.
     */
    fun fit(x: List<DoubleArray>, y: List<Double>) {
        require(x.isNotEmpty()) { "cannot fit a decision tree on an empty training set" }
        this.x = x
        this.y = y
        root = trainNode(0, x.indices.toList())
    }

    /**
     * Returns the label predicted for the feature vector [x].
     *
     * @throws RuntimeException if [fit] has not been called yet.
     */
    fun predict(x: DoubleArray): Double {
        val node = root ?: throw RuntimeException("call fit first")
        return node.predict(x)
    }

    /** Gini impurity of the labels of the given training objects: sum over classes of p * (1 - p). */
    private fun impurity(indices: List<Int>): Double {
        val curY = indices.map { y[it] }
        return curY.groupingBy { it }.eachCount().values
            .map { it.toDouble() / curY.size }
            .sumOf { it * (1 - it) }
    }

    /** Impurity of the union minus the size-weighted impurities of the two parts. */
    private fun qualityFunctional(left: List<Int>, right: List<Int>): Double {
        val fullSet = left + right
        return impurity(fullSet) - (left.size.toDouble() / fullSet.size) * impurity(left) - (right.size.toDouble() / fullSet.size) * impurity(right)
    }

    /** Most frequent label; on ties the label that occurs first in [labels] wins. */
    private fun majorityLabel(labels: List<Double>): Double =
        labels.groupingBy { it }.eachCount().maxByOrNull { it.value }?.key
            ?: error("cannot build a leaf from an empty set of objects")

    /**
     * Recursively builds the subtree for the training objects in [indices].
     *
     * A leaf is produced when the labels are pure, when [depth] equals [maxDepth], or when no
     * usable split exists for this node.
     */
    private fun trainNode(depth: Int, indices: List<Int>): Node {
        val curY = indices.map { y[it] }

        if (curY.distinct().size <= 1 || depth == maxDepth) {
            return LeafNode(majorityLabel(curY))
        }

        // -1 marks "no candidate split found yet".
        var splitFeature = -1
        var splitThreshold = 0.0
        var maxQuality = -Double.MAX_VALUE

        var features = x.first().indices.toList()
        if (featuresLimit) {
            features = features.shuffled().take(kotlin.math.sqrt(features.size.toDouble()).toInt())
        }
        for (featureInd in features) {
            val sortedIndices = indices.sortedBy { x[it][featureInd] }

            var prevThreshold = x[sortedIndices[0]][featureInd]
            for (ind in 1..sortedIndices.lastIndex) {
                val threshold = x[sortedIndices[ind]][featureInd]
                // Equal neighbouring values cannot be separated by a "<" test, so skip them.
                if (threshold == prevThreshold) {
                    continue
                }
                prevThreshold = threshold

                val left = sortedIndices.take(ind)
                val right = sortedIndices.drop(ind)
                val quality = qualityFunctional(left, right)
                if (quality > maxQuality) {
                    maxQuality = quality
                    splitFeature = featureInd
                    splitThreshold = threshold
                }
            }
        }

        // Every scanned feature is constant on this node: there is nothing to split on.
        if (splitFeature < 0) {
            return LeafNode(majorityLabel(curY))
        }

        val (left, right) = indices.partition { x[it][splitFeature] < splitThreshold }
        // A one-sided partition (e.g. a NaN threshold) would recurse on an empty set.
        if (left.isEmpty() || right.isEmpty()) {
            return LeafNode(majorityLabel(curY))
        }
        val leftNode = trainNode(depth + 1, left)
        val rightNode = trainNode(depth + 1, right)
        return SplitNode(splitFeature, splitThreshold, leftNode, rightNode)
    }

    /** A node of the trained tree. */
    interface Node {
        fun predict(x: DoubleArray): Double
    }

    /** Inner node: objects with `x[feature] < threshold` go to [left], all others to [right]. */
    class SplitNode(private val feature: Int, private val threshold: Double, private val left: Node, private val right: Node) : Node {
        override fun predict(x: DoubleArray): Double =
            if (x[feature] < threshold) left.predict(x) else right.predict(x)
    }

    /** Terminal node that always answers with the stored label. */
    class LeafNode(private val result: Double) : Node {
        override fun predict(x: DoubleArray): Double = result
    }
}

In [5]:
/**
 * Converts the frame into a (features, labels) pair.
 *
 * Every cell is parsed as a Double via its string form. The rows are shuffled and at most
 * 500 of them are kept. For each kept row the last value becomes the label and the
 * preceding values become the feature vector.
 */
fun DataFrame.toXY(): Pair<List<DoubleArray>, List<Double>> {
    val numericRows = this.rows.toList().map { row ->
        row.values.map { cell -> cell.toString().toDouble() }
    }
    val sample = numericRows.shuffled().take(500)
    val featureVectors = sample.map { values -> values.dropLast(1).toDoubleArray() }
    val labels = sample.map { values -> values.last() }
    return Pair(featureVectors, labels)
}

In [6]:
/**
 * Renders [x] as its quotient by ten followed by its remainder by ten,
 * so values in 0..9 come out zero-padded (e.g. 5 -> "05").
 */
fun getTwoDigitNumber(x: Int): String {
    val tens = x / 10
    val ones = x % 10
    return tens.toString() + ones.toString()
}

// Largest / smallest optimal depth seen so far, and the datasets where they occurred.
var maxDepth = 0
var minDepth = Int.MAX_VALUE

var maxDepthDataset = 0
var minDepthDataset = 0

val mDatasets = 21
for (datasetInd in 1..mDatasets) {
    val twoDigitNumber = getTwoDigitNumber(datasetInd)
    val trainFileName = "data/${twoDigitNumber}_train.csv"
    val testFileName = "data/${twoDigitNumber}_test.csv"

    val (trainX, trainY) = DataFrame.readCSV(trainFileName).toXY()
    val (testX, testY) = DataFrame.readCSV(testFileName).toXY()

    // Try depths 1..12 and keep the first depth that reaches the best test accuracy.
    var maxAccuracy = -Double.MAX_VALUE
    var optDepth = 0
    (1..12).forEach { depth ->
        val dt = DecisionTree(depth)
        dt.fit(trainX, trainY)
        val hits = testX.indices.count { i -> dt.predict(testX[i]) == testY[i] }
        val accuracy = hits.toDouble() / testX.size
        if (accuracy > maxAccuracy) {
            maxAccuracy = accuracy
            optDepth = depth
        }
    }

    if (optDepth > maxDepth) {
        maxDepth = optDepth
        maxDepthDataset = datasetInd
    }

    if (optDepth < minDepth) {
        minDepth = optDepth
        minDepthDataset = datasetInd
    }

    println("Dataset $datasetInd")
    println(" Accuracy: $maxAccuracy")
    println(" Depth: $optDepth")
    println()
}

Dataset 1
 Accuracy: 0.992
 Depth: 3

Dataset 2
 Accuracy: 0.44
 Depth: 7

Dataset 3
 Accuracy: 1.0
 Depth: 1

Dataset 4
 Accuracy: 0.936
 Depth: 4

Dataset 5
 Accuracy: 0.9848484848484849
 Depth: 1

Dataset 6
 Accuracy: 0.998
 Depth: 2

Dataset 7
 Accuracy: 0.952
 Depth: 4

Dataset 8
 Accuracy: 0.997920997920998
 Depth: 2

Dataset 9
 Accuracy: 0.812
 Depth: 4

Dataset 10
 Accuracy: 0.984
 Depth: 3

Dataset 11
 Accuracy: 0.996
 Depth: 1

Dataset 12
 Accuracy: 0.75
 Depth: 7

Dataset 13
 Accuracy: 0.546
 Depth: 5

Dataset 14
 Accuracy: 0.92
 Depth: 5

Dataset 15
 Accuracy: 1.0
 Depth: 1

Dataset 16
 Accuracy: 1.0
 Depth: 1

Dataset 17
 Accuracy: 0.654
 Depth: 5

Dataset 18
 Accuracy: 0.884
 Depth: 5

Dataset 19
 Accuracy: 0.706
 Depth: 6

Dataset 20
 Accuracy: 0.88
 Depth: 5

Dataset 21
 Accuracy: 0.64
 Depth: 7



In [7]:
// Depths to evaluate, plus the train / test accuracy measured at each of them.
val x = (1..10).toList()
val train = mutableListOf<Double>()
val test = mutableListOf<Double>()

run {
    // Load the dataset once. toXY() shuffles and truncates the rows, so loading inside the
    // depth loop would score every depth on a different random sample (and re-parse the
    // same CSV files on each iteration).
    val twoDigitNumber = getTwoDigitNumber(minDepthDataset)
    val trainFileName = "data/${twoDigitNumber}_train.csv"
    val testFileName = "data/${twoDigitNumber}_test.csv"

    val (trainX, trainY) = DataFrame.readCSV(trainFileName).toXY()
    val (testX, testY) = DataFrame.readCSV(testFileName).toXY()

    // Fraction of objects whose predicted label equals the expected one.
    fun accuracy(model: DecisionTree, objects: List<DoubleArray>, labels: List<Double>): Double =
        objects.indices.count { i -> model.predict(objects[i]) == labels[i] }.toDouble() / objects.size

    x.forEach { depth ->
        val dt = DecisionTree(depth)
        dt.fit(trainX, trainY)

        train.add(accuracy(dt, trainX, trainY))
        test.add(accuracy(dt, testX, testY))
    }
}

In [8]:
// Horizontal axis: the depths stored in `x`.
val depths = x
// `train` is drawn with the default line colour, `test` in red.
val p = run {
    val trainCurve = geom_line() { x = depths; y = train }
    val testCurve = geom_line(color = "red") { x = depths; y = test }
    lets_plot() + trainCurve + testCurve
}
p.show()

In [9]:
// Depths to evaluate, plus the train / test accuracy measured at each of them.
val x = (1..10).toList()
val train = mutableListOf<Double>()
val test = mutableListOf<Double>()

run {
    // Load the dataset once. toXY() shuffles and truncates the rows, so loading inside the
    // depth loop would score every depth on a different random sample (and re-parse the
    // same CSV files on each iteration).
    val twoDigitNumber = getTwoDigitNumber(maxDepthDataset)
    val trainFileName = "data/${twoDigitNumber}_train.csv"
    val testFileName = "data/${twoDigitNumber}_test.csv"

    val (trainX, trainY) = DataFrame.readCSV(trainFileName).toXY()
    val (testX, testY) = DataFrame.readCSV(testFileName).toXY()

    // Fraction of objects whose predicted label equals the expected one.
    fun accuracy(model: DecisionTree, objects: List<DoubleArray>, labels: List<Double>): Double =
        objects.indices.count { i -> model.predict(objects[i]) == labels[i] }.toDouble() / objects.size

    x.forEach { depth ->
        val dt = DecisionTree(depth)
        dt.fit(trainX, trainY)

        train.add(accuracy(dt, trainX, trainY))
        test.add(accuracy(dt, testX, testY))
    }
}

In [10]:
// Horizontal axis: the depths stored in `x`.
val depths = x
// `train` is drawn with the default line colour, `test` in red.
val p = run {
    val trainCurve = geom_line() { x = depths; y = train }
    val testCurve = geom_line(color = "red") { x = depths; y = test }
    lets_plot() + trainCurve + testCurve
}
p.show()

In [11]:
import kotlin.random.Random

/**
 * Majority-vote ensemble of [DecisionTree]s.
 *
 * @param nTrees number of trees trained by [fit].
 * @param nObjects number of training objects drawn (with replacement) for each tree.
 */
class RandomForest(val nTrees: Int = 1, val nObjects: Int = 100) {
    /** Trained trees; assigned by [fit]. */
    lateinit var trees: List<DecisionTree>

    /** Trains [nTrees] trees, each on its own random sample of the given data. */
    fun fit(x: List<DoubleArray>, y: List<Double>) {
        trees = List(nTrees) { _ -> trainTree(x, y) }
    }

    /** Returns the label predicted by the most trees; on ties the label voted first wins. */
    fun predict(x: DoubleArray): Double {
        val votes = LinkedHashMap<Double, Int>()
        for (tree in trees) {
            val label = tree.predict(x)
            votes[label] = (votes[label] ?: 0) + 1
        }
        return votes.entries.maxByOrNull { entry -> entry.value }!!.key
    }

    /**
     * Trains one tree on [nObjects] objects sampled uniformly with replacement.
     * The tree is created with max depth -1 and per-node feature sub-sampling enabled.
     */
    fun trainTree(x: List<DoubleArray>, y: List<Double>): DecisionTree {
        val picks = List(nObjects) { Random.nextInt(x.size) }
        val xSample = picks.map(x::get)
        val ySample = picks.map(y::get)
        return DecisionTree(-1, true).also { tree -> tree.fit(xSample, ySample) }
    }
}

In [14]:
// Train a 1000-tree forest per dataset and report its accuracy on the train and test samples.
val mDatasets = 21
for (datasetInd in 1..mDatasets) {
    val twoDigitNumber = getTwoDigitNumber(datasetInd)
    val trainFileName = "data/${twoDigitNumber}_train.csv"
    val testFileName = "data/${twoDigitNumber}_test.csv"

    val (trainX, trainY) = DataFrame.readCSV(trainFileName).toXY()
    val (testX, testY) = DataFrame.readCSV(testFileName).toXY()

    val forest = RandomForest(1000)
    forest.fit(trainX, trainY)

    val trainHits = trainX.indices.count { i -> forest.predict(trainX[i]) == trainY[i] }
    val trainAccuracy = trainHits.toDouble() / trainX.size

    val testHits = testX.indices.count { i -> forest.predict(testX[i]) == testY[i] }
    val testAccuracy = testHits.toDouble() / testX.size

    println("Dataset $datasetInd")
    println(" Train accuracy: $trainAccuracy")
    println(" Test accuracy: $testAccuracy")
    println()
}

Dataset 1
 Train accuracy: 0.936
 Test accuracy: 0.922

Dataset 2
 Train accuracy: 1.0
 Test accuracy: 0.288

Dataset 3
 Train accuracy: 0.964
 Test accuracy: 0.956

Dataset 4
 Train accuracy: 0.834
 Test accuracy: 0.756

Dataset 5
 Train accuracy: 0.9675324675324676
 Test accuracy: 0.9848484848484849

Dataset 6
 Train accuracy: 0.986
 Test accuracy: 0.818

Dataset 7
 Train accuracy: 0.868
 Test accuracy: 0.784

Dataset 8
 Train accuracy: 0.9438669438669439
 Test accuracy: 0.9708939708939709

Dataset 9
 Train accuracy: 0.918
 Test accuracy: 0.592

Dataset 10
 Train accuracy: 0.924
 Test accuracy: 0.932

Dataset 11
 Train accuracy: 0.946
 Test accuracy: 0.994

Dataset 12
 Train accuracy: 0.882
 Test accuracy: 0.662

Dataset 13
 Train accuracy: 0.88
 Test accuracy: 0.314

Dataset 14
 Train accuracy: 0.952
 Test accuracy: 0.726

Dataset 15
 Train accuracy: 0.946
 Test accuracy: 1.0

Dataset 16
 Train accuracy: 0.954
 Test accuracy: 0.982

Dataset 17
 Train accuracy: 1.0
 Test accuracy: 0.