In [2]:
import gzip
import json
import os
import sys
import numpy as np
import pandas as pd
import datetime

# Extend the import search path so the notebook-as-module import below resolves.
sys.path.append("../../..")

# Absolute path of the TestGrid data-source folder, resolved against the
# current working directory (presumably where testgrid_EDA lives - confirm).
module_path_1 = os.path.abspath("../../../data-sources/TestGrid")

# Register the folder only if it is not already on the search path.
if module_path_1 not in sys.path:
    sys.path.append(module_path_1)

from ipynb.fs.defs.testgrid_EDA import decode_run_length  # noqa: E402

In [3]:
# Read the gzipped TestGrid snapshot and parse it into nested Python objects.
# The archive is opened in binary mode; json decodes the raw bytes itself.
raw_path = "../../../../data/raw/testgrid_810.json.gz"
with gzip.open(raw_path, "rb") as read_file:
    data = json.loads(read_file.read())

In [4]:
# Per-grid failure flags. The result is a triple-nested list:
#   percent_failure_by_grid_csv[grid][test][run]
#       -> (timestamp, tab, grid name, test name, failed?)
# where "failed?" is True when the decoded status code equals 12.
percent_failure_by_grid_csv = []

for tab in data.keys():
    print(tab)
    for grid, current_grid in data[tab].items():
        test_rows = current_grid["grid"]
        # grids without any test rows contribute nothing
        if len(test_rows) == 0:
            continue

        # get all timestamps for this grid (x-axis of grid); the // 1000
        # suggests the raw values are millisecond epochs.
        # NOTE(review): fromtimestamp() returns naive local time, so the values
        # depend on the timezone of the machine running the notebook - confirm
        # this is intended.
        timestamps = [
            datetime.datetime.fromtimestamp(x // 1000)
            for x in current_grid["timestamps"]
        ]

        # One inner list per test (y-axis of grid). The run-length encoded
        # statuses are unrolled, compared against the failure code (12) and
        # paired with the timestamps, emitting the final record directly
        # instead of rewriting every entry in a second pass.
        # zip() stops at the shorter input, so a length mismatch between
        # timestamps and statuses is truncated silently.
        decoded = [
            [
                (timestamp, tab, grid, row["name"], failed)
                for timestamp, failed in zip(
                    timestamps,
                    (np.array(decode_run_length(row["statuses"])) == 12).tolist(),
                )
            ]
            for row in test_rows
        ]
        # accumulate the results
        percent_failure_by_grid_csv.append(decoded)


len(percent_failure_by_grid_csv)

"redhat-openshift-informing"
"redhat-openshift-ocp-release-3.11-informing"
"redhat-openshift-ocp-release-4.1-blocking"
"redhat-openshift-ocp-release-4.1-informing"
"redhat-openshift-ocp-release-4.2-blocking"
"redhat-openshift-ocp-release-4.2-informing"
"redhat-openshift-ocp-release-4.3-blocking"
"redhat-openshift-ocp-release-4.3-broken"
"redhat-openshift-ocp-release-4.3-informing"
"redhat-openshift-ocp-release-4.4-blocking"
"redhat-openshift-ocp-release-4.4-broken"
"redhat-openshift-ocp-release-4.4-informing"
"redhat-openshift-ocp-release-4.5-blocking"
"redhat-openshift-ocp-release-4.5-broken"
"redhat-openshift-ocp-release-4.5-informing"
"redhat-openshift-ocp-release-4.6-blocking"
"redhat-openshift-ocp-release-4.6-broken"
"redhat-openshift-ocp-release-4.6-informing"
"redhat-openshift-ocp-release-4.7-blocking"
"redhat-openshift-ocp-release-4.7-broken"
"redhat-openshift-ocp-release-4.7-informing"
"redhat-openshift-okd-release-4.3-informing"
"redhat-openshift-okd-release-4.4-informing"
"r

369

In [5]:
# The accumulator is nested grid -> test -> record. Peel off one level at a
# time so we end up with a single flat list of record tuples.
flat_list = []
for per_grid in percent_failure_by_grid_csv:
    flat_list.extend(per_grid)

flatter_list = []
for per_test in flat_list:
    flatter_list.extend(per_test)

In [6]:
# Peek at the first flattened record: (timestamp, tab, grid, test, failure flag)
flatter_list[0]

(datetime.datetime(2020, 10, 8, 20, 48, 5),
 '"redhat-openshift-informing"',
 'release-openshift-okd-installer-e2e-aws-upgrade',
 'Application behind service load balancer with PDB is not disrupted',
 False)

In [7]:
# Total number of flattened records (one per test per timestamp, over all grids)
len(flatter_list)

19483548

In [8]:
# Convert to dataframe
df_csv = pd.DataFrame(
    flatter_list, columns=["timestamp", "tab", "job", "test", "failure"]
)
df_csv.head()

Unnamed: 0,timestamp,tab,job,test,failure
0,2020-10-08 20:48:05,"""redhat-openshift-informing""",release-openshift-okd-installer-e2e-aws-upgrade,Application behind service load balancer with ...,False
1,2020-10-08 19:12:01,"""redhat-openshift-informing""",release-openshift-okd-installer-e2e-aws-upgrade,Application behind service load balancer with ...,True
2,2020-10-08 14:18:13,"""redhat-openshift-informing""",release-openshift-okd-installer-e2e-aws-upgrade,Application behind service load balancer with ...,False
3,2020-10-08 11:15:28,"""redhat-openshift-informing""",release-openshift-okd-installer-e2e-aws-upgrade,Application behind service load balancer with ...,False
4,2020-10-08 08:27:53,"""redhat-openshift-informing""",release-openshift-okd-installer-e2e-aws-upgrade,Application behind service load balancer with ...,False


In [9]:
# Only the first 1,000,000 of the ~19 million rows are written, because of PVC
# storage limits: 1 million rows take about 250 MB, so all 19 million would
# need roughly 4750 MB.
# The file is written without a header row; the row index is still included.
failures_sample = df_csv.head(1000000)
failures_sample.to_csv(
    "../../../../data/processed/failures.csv",
    header=False,
)

In [10]:
# Per-grid pass flags. The result is a triple-nested list:
#   percent_pass_by_grid_csv[grid][test][run]
#       -> (timestamp, tab, grid name, test name, passed?)
# where "passed?" is True when the decoded status code equals 1.
percent_pass_by_grid_csv = []

for tab in data.keys():
    print(tab)
    for grid, current_grid in data[tab].items():
        test_rows = current_grid["grid"]
        # grids without any test rows contribute nothing
        if len(test_rows) == 0:
            continue

        # get all timestamps for this grid (x-axis of grid); the // 1000
        # suggests the raw values are millisecond epochs.
        # NOTE(review): fromtimestamp() returns naive local time, so the values
        # depend on the timezone of the machine running the notebook - confirm
        # this is intended.
        timestamps = [
            datetime.datetime.fromtimestamp(x // 1000)
            for x in current_grid["timestamps"]
        ]

        # One inner list per test (y-axis of grid). The run-length encoded
        # statuses are unrolled, compared against the passing code (1) and
        # paired with the timestamps, emitting the final record directly
        # instead of rewriting every entry in a second pass.
        # zip() stops at the shorter input, so a length mismatch between
        # timestamps and statuses is truncated silently.
        decoded = [
            [
                (timestamp, tab, grid, row["name"], passed)
                for timestamp, passed in zip(
                    timestamps,
                    (np.array(decode_run_length(row["statuses"])) == 1).tolist(),
                )
            ]
            for row in test_rows
        ]
        # accumulate the results
        percent_pass_by_grid_csv.append(decoded)


len(percent_pass_by_grid_csv)

"redhat-openshift-informing"
"redhat-openshift-ocp-release-3.11-informing"
"redhat-openshift-ocp-release-4.1-blocking"
"redhat-openshift-ocp-release-4.1-informing"
"redhat-openshift-ocp-release-4.2-blocking"
"redhat-openshift-ocp-release-4.2-informing"
"redhat-openshift-ocp-release-4.3-blocking"
"redhat-openshift-ocp-release-4.3-broken"
"redhat-openshift-ocp-release-4.3-informing"
"redhat-openshift-ocp-release-4.4-blocking"
"redhat-openshift-ocp-release-4.4-broken"
"redhat-openshift-ocp-release-4.4-informing"
"redhat-openshift-ocp-release-4.5-blocking"
"redhat-openshift-ocp-release-4.5-broken"
"redhat-openshift-ocp-release-4.5-informing"
"redhat-openshift-ocp-release-4.6-blocking"
"redhat-openshift-ocp-release-4.6-broken"
"redhat-openshift-ocp-release-4.6-informing"
"redhat-openshift-ocp-release-4.7-blocking"
"redhat-openshift-ocp-release-4.7-broken"
"redhat-openshift-ocp-release-4.7-informing"
"redhat-openshift-okd-release-4.3-informing"
"redhat-openshift-okd-release-4.4-informing"
"r

369

In [11]:
# The accumulator is nested grid -> test -> record. Peel off one level at a
# time so we end up with a single flat list of record tuples, then report
# how many records there are.
flat_list = []
for per_grid in percent_pass_by_grid_csv:
    flat_list.extend(per_grid)

flatter_list = []
for per_test in flat_list:
    flatter_list.extend(per_test)

len(flatter_list)

19483548

In [12]:
# Convert to dataframe
df1_csv = pd.DataFrame(
    flatter_list, columns=["timestamp", "tab", "job", "test", "passing"]
)
df1_csv.head()

Unnamed: 0,timestamp,tab,job,test,passing
0,2020-10-08 20:48:05,"""redhat-openshift-informing""",release-openshift-okd-installer-e2e-aws-upgrade,Application behind service load balancer with ...,False
1,2020-10-08 19:12:01,"""redhat-openshift-informing""",release-openshift-okd-installer-e2e-aws-upgrade,Application behind service load balancer with ...,False
2,2020-10-08 14:18:13,"""redhat-openshift-informing""",release-openshift-okd-installer-e2e-aws-upgrade,Application behind service load balancer with ...,False
3,2020-10-08 11:15:28,"""redhat-openshift-informing""",release-openshift-okd-installer-e2e-aws-upgrade,Application behind service load balancer with ...,False
4,2020-10-08 08:27:53,"""redhat-openshift-informing""",release-openshift-okd-installer-e2e-aws-upgrade,Application behind service load balancer with ...,False


In [13]:
# saving only the first 1000000 out of 19 million rows due to pvc limits.
# 250mb = 1 million --> 4750 mb = 19 million
df1_csv.head(1000000).to_csv(
    "../../../../data/processed/pass.csv",
    header=False,
)

In [14]:
# Metrics
# NOTE: every figure below is computed on the first 1,000,000 rows only (the
# same slice that is written to CSV), not on the full ~19 million row data set.
# The slices are taken once and the counts reused for the percentages.
failures_head = df_csv.head(1000000)
passes_head = df1_csv.head(1000000)

# count() tallies non-null entries; sum() over a boolean column counts Trues.
no_tests = passes_head["test"].count()
print("Total number of tests: %i" % no_tests)

no_failures = failures_head["failure"].sum()
print("Total number of failing tests: %i" % no_failures)

# denominator: non-null test names in the failure frame
test_failures_percentage = (no_failures / failures_head["test"].count()) * 100
print("Test failure percentage: %f" % test_failures_percentage)

no_pass = passes_head["passing"].sum()
print("Total number of passing tests: %i" % no_pass)

# denominator: non-null pass flags in the pass frame
test_pass_percentage = (no_pass / passes_head["passing"].count()) * 100
print("Test pass percentage: %f" % test_pass_percentage)

Total number of tests: 1000000
Total number of failing tests: 3989
Test failure percentage: 0.398900
Total number of passing tests: 704558
Test pass percentage: 70.455800
