In [None]:
import io.github.cdimascio.dotenv.Dotenv
import java.nio.file.Paths

// Load the dotenv configuration and make sure the directory named by DATA_DIR exists.
val dotenv = Dotenv.load()
val dataDir = Paths.get(dotenv.get("DATA_DIR")).toFile().apply { mkdirs() }
dataDir

In [2]:
import com.google.gson.Gson
import com.google.gson.reflect.TypeToken

/**
 * Element type of the list that is deserialized with Gson from
 * `interim/changes_from_git.json`.
 *
 * Gson matches JSON keys against these property names, so renaming a property
 * requires the same change in the input file.
 *
 * NOTE(review): judging by the property names, one item describes a single file
 * modified by a vulnerability fix commit (path and source before/after, plus the
 * methods that were touched) — confirm against the code that produces the JSON.
 */
data class ChangesFromGitDataItem(
    val cveId: String,
    val ghsaId: String,
    // Used in this notebook as a `groupId:artifactId` coordinate (it is split on ':').
    val packageName: String,
    val vulnIntroduced: String,
    val latestVulnVersion: String,
    val vulnFixed: String,
    val fixCommitRepo: String,
    val fixCommitHash: String,
    val commitHashFromGit: String,
    val modifiedFilePathBefore: String,
    val modifiedFilePathAfter: String,
    val modifiedFileSrcBefore: String,
    val modifiedFileSrcAfter: String,
    val diffParsedJson: String,
    val nloc: Int,
    // Entries are split on "::" in this notebook; everything but the last segment is treated as the class.
    val changedMethods: List<String>,
)

In [3]:
import com.google.gson.Gson
import kotlinx.serialization.json.Json

// Parse interim/changes_from_git.json into a list of ChangesFromGitDataItem and show its size.
val changesFromGit = run {
    val jsonFile = dataDir.resolve("interim").resolve("changes_from_git.json")
    val listType = object : TypeToken<List<ChangesFromGitDataItem>>() {}.type
    Gson().fromJson<List<ChangesFromGitDataItem>>(jsonFile.readText(), listType)
}
changesFromGit.count()

639

In [4]:
import com.google.common.hash.Hashing
import io.ktor.client.*
import io.ktor.client.call.*
import io.ktor.client.engine.cio.*
import io.ktor.client.plugins.*
import io.ktor.client.request.*
import io.ktor.client.statement.*
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.async
import kotlinx.coroutines.awaitAll
import kotlinx.coroutines.runBlocking

// Local cache directory for downloaded jars (created if missing).
val jarsDir = dataDir.resolve("interim/jars").apply { mkdirs() }

/**
 * Turns a `groupId:artifactId` coordinate into the relative path
 * `group/id/as/directories/artifactId` (dots in the group id become slashes).
 *
 * @throws IndexOutOfBoundsException if [packageName] contains no ':' separator.
 */
fun packageNameToUrl(packageName: String): String {
    val (groupId, artifactId) = packageName.split(":")
    return "${groupId.replace('.', '/')}/${artifactId}"
}

/**
 * Builds the repository-relative path of a jar from a `groupId:artifactId`
 * coordinate and a version:
 * `group/path/artifactId/version/artifactId-version.jar`.
 *
 * @throws IndexOutOfBoundsException if [packageName] contains no ':' separator.
 */
fun gavToJarUrl(packageName: String, version: String): String {
    val (groupId, artifactId) = packageName.split(":")
    val groupPath = groupId.replace('.', '/')
    return "${groupPath}/${artifactId}/${version}/${artifactId}-${version}.jar"
}

// Shared HTTP client for all downloads; the request timeout is set to 120 000 ms.
val httpClient = HttpClient {
    install(HttpTimeout) { requestTimeoutMillis = 120_000 }
}

/**
 * Makes sure the jar for [packageName] (a `groupId:artifactId` coordinate) and
 * [version] is present below [jarsDir], downloading it from [repoUrl] if needed.
 *
 * If a cached copy already exists, its MD5 is compared with the `.md5` file
 * published next to the jar; a matching copy is reused without downloading.
 * When the checksum cannot be fetched or does not match, the jar is downloaded
 * again and overwrites the cached copy.
 *
 * @return `true` if a verified cached copy exists or the jar was downloaded and
 *   written; `false` if the jar request did not return HTTP 200.
 */
fun downloadJar(packageName: String, version: String, repoUrl: String): Boolean = runBlocking {
    val jarUrl = gavToJarUrl(packageName, version)
    val jarPath = jarsDir.resolve(jarUrl).also { it.parentFile.mkdirs() }

    // The checksum is only useful for validating an existing copy, so the extra
    // request is skipped entirely when nothing is cached yet.
    if (jarPath.isFile) {
        val md5Url = "$repoUrl/$jarUrl.md5"
        val md5Response = httpClient.get(md5Url)
        val expectedMd5 =
            if (md5Response.status.value == 200) {
                // Some checksum files contain "<hash>  <file name>"; keep only the hash.
                md5Response.bodyAsText().trim().split("\\s+".toRegex()).first()
            } else {
                println("http error ${md5Response.status.value} for md5 of $packageName ($md5Url)")
                null
            }

        if (expectedMd5 != null) {
            val actualMd5 = com.google.common.io.Files.asByteSource(jarPath).hash(Hashing.md5()).toString()
            // Hex digests may be published in either case.
            if (actualMd5.equals(expectedMd5, ignoreCase = true))
                return@runBlocking true
            println("corrupt file: ${jarUrl}")
        }
    }

    val jarResponse = httpClient.get("$repoUrl/$jarUrl")
    if (jarResponse.status.value != 200) {
        println("http error ${jarResponse.status.value} for $packageName ($repoUrl/$jarUrl)")
        return@runBlocking false
    }

    jarPath.writeBytes(jarResponse.readBytes())
    true
}

In [None]:
import java.util.*

// Dispatcher view that runs at most eight of the download coroutines in parallel.
val dispatcher = Dispatchers.IO.limitedParallelism(8)

// Try to download, from Maven Central, both versions referenced by every change
// (latestVulnVersion and vulnFixed). Only coordinates whose download succeeded are kept.
val jarsFromMaven =
    runBlocking {
        val wantedJars = changesFromGit
            .flatMap { change ->
                listOf(
                    Pair(change.packageName, change.latestVulnVersion),
                    Pair(change.packageName, change.vulnFixed),
                )
            }
            .toSet()

        val downloads = wantedJars.map { coordinate ->
            async(dispatcher) {
                try {
                    if (downloadJar(coordinate.first, coordinate.second, "https://repo1.maven.org/maven2"))
                        coordinate
                    else
                        null
                } catch (e: Exception) {
                    // Log which jar failed, then let the failure propagate.
                    println("error downloading jar ${gavToJarUrl(coordinate.first, coordinate.second)}")
                    println(e.message)
                    throw e
                }
            }
        }

        downloads.awaitAll().filterNotNull()
    }

DISPLAY(jarsFromMaven.count())

In [None]:
// For every jar that was not obtained from Maven Central, try the Jenkins
// releases repository. Only coordinates whose download succeeded are kept.
val jarsFromJenkins =
    runBlocking {
        // A set makes the "already downloaded" check a hash lookup instead of a list scan per element.
        val alreadyDownloaded = jarsFromMaven.toSet()

        changesFromGit
            .flatMap { change ->
                listOf(
                    Pair(change.packageName, change.latestVulnVersion),
                    Pair(change.packageName, change.vulnFixed),
                )
            }
            .filterNot { it in alreadyDownloaded }
            .toSet()
            .map { coordinate ->
                async(dispatcher) {
                    try {
                        if (downloadJar(coordinate.first, coordinate.second, "https://repo.jenkins-ci.org/releases"))
                            coordinate
                        else
                            null
                    } catch (e: Exception) {
                        // Log which jar failed and why, then let the failure propagate.
                        println("error downloading jar ${gavToJarUrl(coordinate.first, coordinate.second)}")
                        println(e.message)
                        throw e
                    }
                }
            }.awaitAll()
            .filterNotNull()
    }

DISPLAY(jarsFromJenkins.count())

264 jars from Maven Central and 17 jars from Jenkins

In [7]:
// Map every successfully downloaded (packageName, version) pair to its jar file in the local cache.
val packageToJar = (jarsFromMaven + jarsFromJenkins)
    .associateWith { (packageName, version) -> jarsDir.resolve(gavToJarUrl(packageName, version)) }
packageToJar.count()

281

In [8]:
import com.github.javaparser.JavaParser

val parser = JavaParser()

/**
 * Returns the package name declared in the Java source text [src].
 *
 * The source is parsed with JavaParser first. If that fails for any reason
 * (including a missing package declaration), the lines of [src] are scanned for
 * the first one that starts with a `package <name>;` statement.
 *
 * @throws NoSuchElementException if the fallback scan finds no package statement either.
 */
fun getPackageId(src: String): String {
    try {
        return parser.parse(src).result.get().packageDeclaration.get().nameAsString
    } catch (e: Exception) {
        // JavaParser has problems with malformed Java files, so fall back to a line-based scan.
        // The pattern is compiled once here rather than once per line.
        val packageStatement = "^package\\s+([a-zA-Z_][\\.\\w]*);".toRegex()
        return src.lines().firstNotNullOf { line ->
            packageStatement.find(line)?.groupValues?.get(1)
        }
    }
}

// Spot check: pick a random change, show what it touched and the package id derived from its source.
val randomItem = changesFromGit.random()
with(randomItem) {
    DISPLAY(cveId)
    DISPLAY(changedMethods)
    DISPLAY(modifiedFilePathBefore)
}

val randomSrc = randomItem.modifiedFileSrcBefore

DISPLAY(getPackageId(randomSrc))

CVE-2013-1965

[OgnlUtil::isEvalExpression, OgnlUtil::getValue, OgnlUtil::copy, OgnlUtil::setEnableEvalExpression, OgnlUtil::compile, OgnlUtil::getValue, OgnlUtil::setValue, OgnlUtil::compile]

xwork-core/src/main/java/com/opensymphony/xwork2/ognl/OgnlUtil.java

com.opensymphony.xwork2.ognl

In [9]:
// Total number of file changes, followed by the number that touch at least one method.
DISPLAY(changesFromGit.size)
changesFromGit.count { it.changedMethods.isNotEmpty() }

639

584

55 file changes will be discarded because they don't change any methods

In [10]:
// Number of (change, class) combinations: for every change that touches a method,
// count the distinct class keys obtained by dropping the method segment after the last "::".
changesFromGit
    .filter { it.changedMethods.isNotEmpty() }
    .sumOf { change ->
        change.changedMethods
            .groupBy { method -> method.split("::").dropLast(1).joinToString("$") }
            .size
    }

629

In [11]:
// Same count as before, restricted to changes for which both the vulnerable
// and the fixed jar were downloaded.
changesFromGit
    .filter { it.changedMethods.isNotEmpty() }
    .filter { change ->
        Pair(change.packageName, change.latestVulnVersion) in packageToJar &&
            Pair(change.packageName, change.vulnFixed) in packageToJar
    }
    .sumOf { change ->
        change.changedMethods
            .groupBy { method -> method.split("::").dropLast(1).joinToString("$") }
            .size
    }

569

In [12]:
import java.util.concurrent.atomic.AtomicInteger
import java.util.zip.ZipFile

// Counts how often loadClassFile was asked to read from a jar that does not exist on disk.
val noJar = AtomicInteger(0)

/**
 * Reads the entry [classFilePath] from the archive [jar].
 *
 * @param jar the jar file to open; if it is not a regular file, [noJar] is incremented.
 * @param classFilePath entry name inside the archive, e.g. `com/example/Foo.class`.
 * @return the bytes of the entry, or `null` if the jar is missing or has no such entry.
 */
fun loadClassFile(jar: File, classFilePath: String): ByteArray? {
    if (!jar.isFile) {
        noJar.incrementAndGet()
        return null
    }

    // `use` closes the archive (and the entry stream) even on early return or failure,
    // so no file handle is left open per call.
    ZipFile(jar).use { zip ->
        val entry = zip.getEntry(classFilePath) ?: return null
        return zip.getInputStream(entry).use { it.readAllBytes() }
    }
}

In [13]:
/**
 * One output row: a single changed class of a single file change, together with
 * the locations of the artefacts that `res()` writes for it.
 *
 * The list of these rows is converted with `toDataFrame()` and written to
 * `proc/changes.tsv` and `proc/changes.json`.
 */
data class InterimResult2(
    val cveId: String,
    val ghsaId: String,
    val packageName: String,
    val vulnIntroduced: String,
    val latestVulnVersion: String,
    val fixedVersion: String,
    val fixCommitRepo: String,
    // NOTE(review): res() fills this from commitHashFromGit, not from the input's fixCommitHash.
    val fixCommitHash: String,
    // The six path properties below are stored relative to the data directory.
    val jarBefore: String,
    val jarAfter: String,
    val srcBefore: String,
    val srcAfter: String,
    val classFileBefore: String,
    val classFileAfter: String,
    val changedClass: String,
    // Rendered by res() as "[m1, m2, ...]".
    val changedMethods: String,
    val packageIdBefore: String,
    val packageIdAfter: String,
    val diffParsedJson: String,
    val nloc: Int,
)

In [None]:
import com.github.javaparser.JavaParser
import org.apache.commons.csv.CSVFormat
import kotlin.io.path.Path
import kotlin.io.path.name
import kotlin.io.path.nameWithoutExtension

// Output directory for the per-change artefacts (created if missing).
val changesDir = dataDir.resolve("proc/changes").apply { mkdirs() }

/** Joins [compilationUnit] and [className] with a dot. */
fun getFullClassName(compilationUnit: String, className: String): String {
    return "${compilationUnit}.${className}"
}

/** Replaces every dot in [fullClassName] with a forward slash. */
fun fullClassNameToPath(fullClassName: String): String {
    return fullClassName.split('.').joinToString("/")
}

/**
 * Builds the final result rows and writes their artefacts below [changesDir].
 *
 * For every change that touches at least one method and for which both jars
 * (latestVulnVersion and vulnFixed) are available in [packageToJar], the changed
 * methods are grouped by class (everything before the last "::", joined with "$").
 * For each class, the `.class` entry is read from both jars; the class is skipped
 * when either entry is missing or both entries are byte-identical.
 *
 * For each remaining class the following is written to
 * `<changesDir>/<cveId>/<commitHashFromGit>/{before,after}/`:
 *  - `jar/<artifactId>-<version>.jar`: a copy of the jar
 *  - `java/<modified file path>`: the source text
 *  - `class/<directory of the modified file>/<class>.class`: the class bytes
 *
 * @return one [InterimResult2] per written class, with paths relative to [dataDir].
 * @throws NoSuchElementException if [getPackageId] finds no package for a source file.
 */
fun res(): List<InterimResult2> {
    // Writes the jar copy, source file and class file for one side (before or after)
    // and returns them in that order.
    fun materializeSide(
        sideDir: File,
        jar: File,
        jarName: String,
        sourcePath: String,
        sourceText: String,
        classFileName: String,
        classBytes: ByteArray,
    ): Triple<File, File, File> {
        val jarCopy = sideDir.resolve("jar").also { it.mkdirs() }.resolve(jarName)
        // Stream the jar instead of loading it into memory for every changed class.
        jar.copyTo(jarCopy, overwrite = true)

        val javaFile = sideDir.resolve("java").also { it.mkdirs() }.resolve(sourcePath)
        javaFile.parentFile.mkdirs()
        javaFile.writeText(sourceText)

        val classRoot = sideDir.resolve("class").also { it.mkdirs() }
        // A source file without a parent directory goes directly below `class/`
        // rather than into a directory literally named "null".
        val sourceDir = Path(sourcePath).parent
        val classDir = if (sourceDir == null) classRoot else classRoot.resolve(sourceDir.toString())
        val classFile = classDir.resolve(classFileName)
        classFile.parentFile.mkdirs()
        classFile.writeBytes(classBytes)

        return Triple(jarCopy, javaFile, classFile)
    }

    return changesFromGit
        .filter { it.changedMethods.isNotEmpty() }
        .mapNotNull { change ->
            // Keep only changes for which both jars were downloaded.
            val vulnJar = packageToJar[Pair(change.packageName, change.latestVulnVersion)]
            val fixedJar = packageToJar[Pair(change.packageName, change.vulnFixed)]
            if (vulnJar != null && fixedJar != null) Triple(change, vulnJar, fixedJar) else null
        }
        .flatMap { (change, vulnJar, fixedJar) ->
            val methodsByClass = change.changedMethods
                .groupBy { method -> method.split("::").dropLast(1).joinToString("$") }

            val packageIdBefore = getPackageId(change.modifiedFileSrcBefore)
            val packageIdAfter = getPackageId(change.modifiedFileSrcAfter)
            val artifactId = change.packageName.split(":")[1]

            methodsByClass.mapNotNull { (changedClass, changedMethods) ->
                val classFileName = "${changedClass}.class"
                val beforeEntry = packageIdBefore.replace('.', '/') + "/" + classFileName
                val afterEntry = packageIdAfter.replace('.', '/') + "/" + classFileName
                val beforeContent = loadClassFile(vulnJar, beforeEntry)
                val afterContent = loadClassFile(fixedJar, afterEntry)

                // Nothing to compare if a class is missing, nothing to report if it did not change.
                if (beforeContent == null || afterContent == null || beforeContent contentEquals afterContent)
                    return@mapNotNull null

                val commitDir = changesDir.resolve(change.cveId).resolve(change.commitHashFromGit)

                val (jarPathBefore, javaPathBefore, classPathBefore) = materializeSide(
                    sideDir = commitDir.resolve("before"),
                    jar = vulnJar,
                    jarName = "$artifactId-${change.latestVulnVersion}.jar",
                    sourcePath = change.modifiedFilePathBefore,
                    sourceText = change.modifiedFileSrcBefore,
                    classFileName = classFileName,
                    classBytes = beforeContent,
                )
                val (jarPathAfter, javaPathAfter, classPathAfter) = materializeSide(
                    sideDir = commitDir.resolve("after"),
                    jar = fixedJar,
                    jarName = "$artifactId-${change.vulnFixed}.jar",
                    sourcePath = change.modifiedFilePathAfter,
                    sourceText = change.modifiedFileSrcAfter,
                    classFileName = classFileName,
                    classBytes = afterContent,
                )

                InterimResult2(
                    cveId = change.cveId,
                    ghsaId = change.ghsaId,
                    packageName = change.packageName,
                    vulnIntroduced = change.vulnIntroduced,
                    latestVulnVersion = change.latestVulnVersion,
                    fixedVersion = change.vulnFixed,
                    fixCommitRepo = change.fixCommitRepo,
                    // NOTE(review): takes commitHashFromGit rather than fixCommitHash — confirm this is intended.
                    fixCommitHash = change.commitHashFromGit,
                    jarBefore = jarPathBefore.relativeTo(dataDir).toString(),
                    jarAfter = jarPathAfter.relativeTo(dataDir).toString(),
                    srcBefore = javaPathBefore.relativeTo(dataDir).toString(),
                    srcAfter = javaPathAfter.relativeTo(dataDir).toString(),
                    classFileBefore = classPathBefore.relativeTo(dataDir).toString(),
                    classFileAfter = classPathAfter.relativeTo(dataDir).toString(),
                    changedClass = changedClass,
                    changedMethods = changedMethods.joinToString(", ", prefix = "[", postfix = "]"),
                    packageIdBefore = packageIdBefore,
                    packageIdAfter = packageIdAfter,
                    diffParsedJson = change.diffParsedJson,
                    nloc = change.nloc,
                )
            }
        }
}

val res = res()

In [19]:
// Persist the results twice: as a tab-separated file and as pretty-printed JSON.
res.toDataFrame().let { frame ->
    frame.writeCSV(dataDir.resolve("proc/changes.tsv"), format = CSVFormat.DEFAULT.withDelimiter('\t'))
    frame.writeJson(dataDir.resolve("proc/changes.json"), prettyPrint = true)
}

There are 326 cases in which a .class file cannot be found in the jar.

manual investigation:
- many cases of a wrong artifactId in GHSA (e.g. CVE-2019-10093)
- 186 cases of test classes

There are 28 cases of identical .class files.

manual investigation:
- GHSA even has errors in version numbers: CVE-2013-7285

In [15]:
// Number of result rows that were produced.
res.size

209