In [1]:
%use lets-plot

import java.nio.file.Files
import java.nio.file.Paths
import java.io.File
import java.io.InputStream

In [2]:
class Letter(val subject: List<String>, val text: List<String>, val isLegit: Boolean) {
    fun getFullLetter(): List<String> {
        return subject + text
    }
    
    fun getFullLetterAsString(): String {
        return getFullLetter().joinToString(separator=" ")
    }
    
    fun getNGrams(n: Int): List<String> {
        val fullLetter = getFullLetter()
        val res = fullLetter.toMutableList()
        for (windowSize in 2..n) {
            for (wordInd in 0 until fullLetter.size - windowSize) {
                val str = fullLetter.subList(wordInd, wordInd + windowSize).joinToString(separator=" ")
                res.add(str)
            }
        }
        return res
    }
}

In [3]:
fun String.toStringList(): List<String> {
    return this.trim().split(" ").filter { !it.isEmpty() }
}

In [4]:
fun parseLetter(letterFile: File): Letter {
    val reader = letterFile.inputStream().bufferedReader()
    
    var str = reader.readLine()
    val subject = mutableListOf<String>()
    val text = mutableListOf<String>()
    
    while (!str.isEmpty()) {
        subject.addAll(str.removePrefix("Subject: ").toStringList())
        str = reader.readLine()
    }
    
    reader.readLines().forEach { text.addAll(it.toStringList()) }
    return Letter(subject, text, letterFile.nameWithoutExtension.contains("legit"))
}

In [5]:
val dataPath = Paths.get("./data")

val partsMap = mutableMapOf<String, MutableList<Letter>>()

Files.walk(dataPath).forEach { 
    val asFile = it.toFile()
    if (asFile.isFile) {
        val letter = parseLetter(asFile)
        val dir = asFile.getParent()
        if (partsMap[dir] != null) {
            partsMap[dir]!!.add(letter)
        } else {
            partsMap[dir] = mutableListOf(letter)
        }
    }
}

val parts = partsMap.values.toList().map { it.toList() }

In [6]:
fun getDictWithNGrams(parts: List<List<Letter>>, n: Int): Map<String, Int> {
    val dict = mutableMapOf<String, Int>()
    var freeInd = 0
    for (part in parts) {
        for (letter in part) {
            val nGrams = letter.getNGrams(n)
            for (key in nGrams) {
                if (!dict.contains(key)) {
                    dict[key] = freeInd++
                }
            }
        }
    }
    return dict
} 

In [7]:
fun getVectorWithNGram(letter: Letter, dict: Map<String, Int>, n: Int): List<Int> {
    val vector = MutableList(dict.size) { 0 }
    val nGrams = letter.getNGrams(n)
    for (key in nGrams) {
        vector[dict[key]!!]++
    }
    return vector
}

In [8]:
class NaiveBayes(val alpha: Double, val lambdas: List<Double> = listOf(1.0, 1.0)) {
    var prProb = listOf<MutableList<Double>>()
    var clProb = mutableListOf<Double>()
    var classes = listOf<Int>()
    
    fun fit(x: List<List<Int>>, y: List<Int>) {
        classes = y.distinct()
        val classesCount = classes.size
        val keyCount = x[0].size
        
        prProb = List(classesCount) { MutableList(keyCount) { 0.0 } }
        clProb = MutableList(classesCount) { 0.0 }
        
        for (cl in classes) {
            val classSize = y.count { it == cl }
            val xClass = x.withIndex().filter { y[it.index] == cl }.map { it.value }
            
            val wordStat = MutableList(keyCount) { 0 }
            for (letter in xClass) {
                for ((key, count) in letter.withIndex()) {
                    wordStat[key] += count
                }
            }
            val totalCount = wordStat.sum()
            
            for (letter in xClass) {
                for ((key, count) in letter.withIndex()) {
                    prProb[cl][key] = (wordStat[key] + alpha) / (totalCount + alpha * keyCount)
                }
            }
            clProb[cl] = classSize.toDouble() / y.size
        }
    }
    
    fun predict(x: List<List<Int>>): List<Int> {
        val lettersPredictions = MutableList<Int>(x.size) { -1 }
        for ((ind, letter) in x.withIndex()) {
            var maxPYX = -Double.MAX_VALUE
            for (cl in classes) {
                val pYX = lambdas[cl] + ln(clProb[cl]) + prProb[cl].withIndex().filter { letter[it.index] > 0 }.map { ln(it.value) }.sum()
                if (pYX > maxPYX) {
                    maxPYX = pYX
                    lettersPredictions[ind] = cl
                }
            }
        }
        return lettersPredictions
    }
    
    fun predictWithTrustLevel(x: List<List<Int>>, trustLevel: Double): List<Int> {
        val lettersPredictions = MutableList<Int>(x.size) { -1 }
        for ((ind, letter) in x.withIndex()) {
            val pYX = MutableList(classes.size) { 0.0 }
            for (cl in classes) {
                pYX[cl] = lambdas[cl] + ln(clProb[cl]) + prProb[cl].withIndex().filter { letter[it.index] > 0 }.map { ln(it.value) }.sum()
            }
            lettersPredictions[ind] = if (ln(pYX[0] / pYX[1]) > trustLevel) 1 else 0
        }
        return lettersPredictions
    }
}

In [9]:
val ALPHA = 0.001
val N_GRAM = 1

val nb = NaiveBayes(ALPHA)
val dict = getDictWithNGrams(parts, N_GRAM)
val vectorizeLetters = parts.map { it.map { letter -> Pair(getVectorWithNGram(letter, dict, N_GRAM), if (letter.isLegit) 1 else 0) } }

In [10]:
val trustLevels = List(10) { i -> -1.0 + (2.0 / 10) * i}
val x = mutableListOf<Double>()
val y = mutableListOf<Double>()

for (trustLevel in trustLevels) {
    val confusionMatrix = List(2) { MutableList(2) { 0 } }

    for (i in parts.indices) {
        val testX = vectorizeLetters[i].map { it.first }
        val testY = vectorizeLetters[i].map { it.second }

        val train = vectorizeLetters.withIndex().filter { it.index != i }.map { it.value }.flatten()
        val trainX = train.map { it.first }
        val trainY = train.map { it.second }

        nb.fit(trainX, trainY)
        val predict = nb.predictWithTrustLevel(testX, trustLevel)
        for (j in predict.indices) {
            confusionMatrix[predict[j]][testY[j]]++
        }
    }
    
    x.add(confusionMatrix[0][1].toDouble() / (confusionMatrix[0][1] + confusionMatrix[1][1]))
    y.add(confusionMatrix[0][0].toDouble() / (confusionMatrix[0][0] + confusionMatrix[1][0]))
    
    val accuracy = (confusionMatrix[0][0] + confusionMatrix[1][1]).toDouble() / confusionMatrix.sumBy { it.sum() }
    println("trust level: $trustLevel, accuracy: $accuracy")
}

trust level: -1.0, accuracy: 0.5596330275229358
trust level: -0.8, accuracy: 0.5596330275229358
trust level: -0.6, accuracy: 0.5596330275229358
trust level: -0.3999999999999999, accuracy: 0.5596330275229358
trust level: -0.19999999999999996, accuracy: 0.5889908256880734
trust level: 0.0, accuracy: 0.9798165137614679
trust level: 0.20000000000000018, accuracy: 0.6743119266055045
trust level: 0.40000000000000013, accuracy: 0.44036697247706424
trust level: 0.6000000000000001, accuracy: 0.44036697247706424
trust level: 0.8, accuracy: 0.44036697247706424


In [11]:
val plotInfo = mapOf(
    "x" to x,
    "y" to y
)

val p = lets_plot(plotInfo) { x = "x"; y = "y"} + geom_point() + geom_line()
p

In [12]:
val x = mutableListOf<Double>()
val y = mutableListOf<Double>()

for (lambdaLegit in 1..91 step 10) {
    val confusionMatrix = List(2) { MutableList(2) { 0 } }

    for (i in parts.indices) {
        val testX = vectorizeLetters[i].map { it.first }
        val testY = vectorizeLetters[i].map { it.second }

        val train = vectorizeLetters.withIndex().filter { it.index != i }.map { it.value }.flatten()
        val trainX = train.map { it.first }
        val trainY = train.map { it.second }

        val bayes = NaiveBayes(ALPHA, listOf(1.0, lambdaLegit.toDouble()))
        bayes.fit(trainX, trainY)
        val predict = bayes.predict(testX)
        for (j in predict.indices) {
            confusionMatrix[predict[j]][testY[j]]++
        }
    }
    val accuracy = (confusionMatrix[0][0] + confusionMatrix[1][1]).toDouble() / confusionMatrix.sumBy { it.sum() }
    println("lambdaLegit: $lambdaLegit, accuracy: $accuracy")
    x.add(lambdaLegit.toDouble())
    y.add(accuracy)
}

lambdaLegit: 1, accuracy: 0.9798165137614679
lambdaLegit: 11, accuracy: 0.9761467889908257
lambdaLegit: 21, accuracy: 0.9688073394495413
lambdaLegit: 31, accuracy: 0.955045871559633
lambdaLegit: 41, accuracy: 0.944954128440367
lambdaLegit: 51, accuracy: 0.9293577981651376
lambdaLegit: 61, accuracy: 0.908256880733945
lambdaLegit: 71, accuracy: 0.8926605504587156
lambdaLegit: 81, accuracy: 0.8733944954128441
lambdaLegit: 91, accuracy: 0.8568807339449541


In [13]:
val plotInfo2 = mapOf(
    "x" to x,
    "y" to y
)

val p2 = lets_plot(plotInfo2) { x = "x"; y = "y"} + geom_point() + geom_line()
p2