# My Implementation of models in *Modeling & Ranking Flaky Tests at Apple (Memon et al., ICSE 2020)*

Written by Adam Peace, 2020.

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy import stats

Load the paper's example data, which assumes one version update per day.

In [2]:
# Load the example run history from CSV; the bare `R` on the last line displays the frame.
# NOTE(review): the displayed frame has an `Unnamed: 0` column -- presumably an index that was
# saved into the CSV; confirm whether `index_col=0` is wanted here.
R = pd.read_csv("flaky_tests_example.csv")
R

Unnamed: 0,day,v,tc_1,tc_2,tc_3,tc_4
0,1,1,1,0,1,0
1,1,1,1,0,1,1
2,1,1,1,1,1,0
3,1,1,1,1,1,1
4,2,2,0,0,0,0
5,2,2,0,0,0,0
6,2,2,0,0,0,0
7,2,2,0,0,0,0
8,3,3,1,1,1,0
9,3,3,1,0,1,0


In [3]:
# Names of the columns in R that hold the per-test-case results.
test_cases = ["tc_1", "tc_2", "tc_3", "tc_4"]

In [4]:
def get_last_n_days(R, n):
    """Return the rows of R whose `day` lies within the last n days recorded in R.

    A row is kept when its day is strictly greater than (latest day - n).
    """
    cutoff = R.day.max() - n
    is_recent = R.day > cutoff
    return R[is_recent]
def get_first_n_days(R, n):
    """Return the rows of R recorded on day n or earlier."""
    within_window = R.day <= n
    return R[within_window]

## 3.2 Quantifying Flakiness Inside a Version

In [5]:
def get_R_v_star(R, test_case, version=0):
    """Return the results of `test_case` as a plain list.

    With the default `version=0` every row of R is used; otherwise only the
    rows whose `v` column equals `version` are used.
    """
    rows = R if version == 0 else R[R["v"] == version]
    return list(rows[test_case])

In [6]:
# Computes entropy of the given R_v_star (All reruns of SUT at version v)
def entropy(R_v_star):
    """Return the base-2 entropy of a two-outcome distribution built from R_v_star.

    The mean of R_v_star is used as the failure probability p(F); the pass
    probability is taken as 1 - p(F).
    """
    def weighted_information(p):
        # A probability of exactly 0 contributes nothing (its log is taken as 0).
        return - p * (np.log2(p) if p != 0 else 0)

    p_fail = np.average(R_v_star)  # probability of failure

    # p(F) term first, then the p(P) term.
    return 0 + weighted_information(p_fail) + weighted_information(1 - p_fail)

### Entropy of each test at each version
Note: the paper suggests that the entropy of $tc_1$ and $tc_3$ is 1. However, since their results do not vary within any version, they are non-flaky, so their entropy is 0.

In [7]:
# versions = [1, 2, 3, 4] # Uncomment as needed
versions = [1]  # versions to report on
# Print the entropy of every test case at each selected version,
# followed by a blank line after each test case.
for test_case in test_cases:
    for version in versions:
        testSet = get_R_v_star(R, test_case, version)  # results of this test case at this version
        print("Test {} at version {} has entropy {:.3f}".format(test_case, version, entropy(testSet)))
    print()

Test tc_1 at version 1 has entropy 0.000

Test tc_2 at version 1 has entropy 1.000

Test tc_3 at version 1 has entropy 0.000

Test tc_4 at version 1 has entropy 1.000



In [8]:
def flipRate(R_v_star):
    """Return the fraction of consecutive result pairs in R_v_star that differ.

    Parameters
    ----------
    R_v_star : sequence
        Ordered results of one test case (e.g. 0/1 outcomes).

    Returns
    -------
    float
        Number of flips divided by the number of possible flips
        (len - 1). Sequences with fewer than two results cannot flip,
        so 0.0 is returned for them -- including the empty sequence,
        which previously raised IndexError.
    """
    runs = list(R_v_star)
    numPossibleFlips = len(runs) - 1
    if numPossibleFlips < 1:
        return 0.0

    # Compare every result with its immediate predecessor.
    numFlips = sum(1 for previous, current in zip(runs, runs[1:]) if current != previous)

    return numFlips / numPossibleFlips

### FlipRate of each test at each version

In [9]:
# versions = [1, 2, 3, 4] # Uncomment as needed
versions = [1]  # versions to report on
# Print the flipRate of every test case at each selected version,
# followed by a blank line after each test case.
for test_case in test_cases:
    for version in versions:
        testSet = get_R_v_star(R, test_case, version)  # results of this test case at this version
        print("Test {} at version {} has flipRate {:.3f}".format(test_case, version, flipRate(testSet)))
    print()

Test tc_1 at version 1 has flipRate 0.000

Test tc_2 at version 1 has flipRate 0.333

Test tc_3 at version 1 has flipRate 0.000

Test tc_4 at version 1 has flipRate 1.000



## 3.3 Aggregating Flakiness Across Versions

### Unweighted Flakiness

In [10]:
def flakiness_unweighted(R, H, test_case, f=flipRate):
    """Return the unweighted flakiness of `test_case` over the first H days of R.

    The per-version score `f` is evaluated for each version present in the
    first H days, the scores are added up, and the total is divided by H.
    """
    window = get_first_n_days(R, H)

    total = 0
    for v in window["v"].unique():
        runs = get_R_v_star(window, test_case, v)
        if len(runs) == 0:
            continue  # nothing recorded for this version
        total += f(runs)
    return total / H

# Report the unweighted flakiness of tc_2 and tc_4 over the first 3 days,
# scored with flipRate (U_FR, the default f) and with entropy (U_E).
print("tc_2 U_FR is {:.2f} and U_E is {:.2f}".format(
    flakiness_unweighted(R, 3, "tc_2"), 
    flakiness_unweighted(R, 3, "tc_2", f=entropy)
))
print("tc_4 U_FR is {:.2f} and U_E is {:.2f}".format(
    flakiness_unweighted(R, 3, "tc_4"), 
    flakiness_unweighted(R, 3, "tc_4", f=entropy)
))

tc_2 U_FR is 0.44 and U_E is 0.67
tc_4 U_FR is 0.33 and U_E is 0.33


### Non-recursive Solution for weighted flakiness. Unknown if accurate

In [11]:
def flakiness_weighted_naive(R, test_case, l=0.1, f=flipRate):
    """Return a decay-weighted average of the per-version score `f` for `test_case`.

    Versions are taken from `R.v.unique()` in reverse order of appearance, so
    the last version seen in R gets weight 1, the one before it (1 - l), then
    (1 - l)**2, and so on. The weighted sum is divided by the sum of weights.

    Parameters
    ----------
    R : DataFrame with a `v` column and a column named `test_case`.
    test_case : name of the result column to score.
    l : decay rate; each step back in the version order multiplies the weight
        by (1 - l). Previously this argument was ignored and 0.9 was
        hard-coded, which is what the default l=0.1 still yields.
    f : per-version scoring function applied to the list of results.
    """
    decay = 1 - l
    numerator = 0
    denominator = 0
    for index, version in enumerate(reversed(R.v.unique())):
        weight = np.power(decay, index)
        numerator += weight * f(get_R_v_star(R, test_case, version))
        denominator += weight
    return numerator / denominator

# print("tc_2 W_FR is {:.2f} and W_E is {:.2f}".format(
#     flakiness_weighted_naive(R, "tc_2"), 
#     flakiness_weighted_naive(R, "tc_2", f=entropy)
# ))
# print("tc_4 W_FR is {:.2f} and W_E is {:.2f}".format(
#     flakiness_weighted_naive(R, "tc_4"), 
#     flakiness_weighted_naive(R, "tc_4", f=entropy)
# ))

### Weighted Flakiness
Once again, my result differs from the paper, which suggests the $W_E$ for $tc_2$ is 0.76. My entropy for $R_{tc_2, v_4, *}$ is 0.918, which leads to a final score of 0.741.

In [12]:
def flakiness_weighted(R, n, test_case, f=flipRate, l=0.1, P=1):
    """Return the weighted flakiness of `test_case` over versions 1..n.

    The unnormalized score follows the recurrence
        Z(1) = f(results at version 1)
        Z(k) = f(results at version k) + (1 - l) * Z(k - 1)
    so version n has weight 1, version n-1 weight (1 - l), and so on. The
    result is Z(n) divided by the sum of those weights.

    Parameters
    ----------
    R : DataFrame with a `v` column and a column named `test_case`.
    n : number of the latest version to include; must be at least 1.
    test_case : name of the result column to score.
    f : per-version scoring function applied to the list of results.
    l : decay rate. It is now applied at every step of the recurrence;
        previously the recursive call dropped it, so a non-default value only
        affected the outermost step while the normalizer used it throughout.
    P : accepted for backward compatibility; not used by the computation.

    Raises
    ------
    ValueError
        If n is less than 1 (the recursion previously never terminated).
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    decay = 1 - l

    def Z(n):
        # Iterative form of the recurrence, built up from version 1 to n.
        score = f(get_R_v_star(R, test_case, 1))
        for version in range(2, n + 1):
            score = f(get_R_v_star(R, test_case, version)) + decay * score
        return score

    def normalizer(n):
        # Sum of the weights (1 - l)**i for i = 0 .. n-1.
        summation = 0
        for i in range(n):
            summation += np.power(decay, i)
        return summation

    return Z(n) / normalizer(n)

# Report the weighted flakiness of tc_2 and tc_4 up to version 4,
# scored with flipRate (W_FR, the default f) and with entropy (W_E).
print("tc_2 W_FR is {:.2f} and W_E is {:.3f}".format(
    flakiness_weighted(R, 4, "tc_2"), 
    flakiness_weighted(R, 4, "tc_2", f=entropy)
))
print("tc_4 W_FR is {:.2f} and W_E is {:.3f}".format(
    flakiness_weighted(R, 4, "tc_4"), 
    flakiness_weighted(R, 4, "tc_4", f=entropy)
))

tc_2 W_FR is 0.62 and W_E is 0.741
tc_4 W_FR is 0.21 and W_E is 0.212
