In [1]:
import sys
import numpy as np
import pandas as pd

from src.preprocessing import log
from src.utils import constants

Checking the risk of train/test set contamination when splitting samples

In [2]:
# For every project, report two maxima over its labeled CSV:
#   * the largest number of rows that share one commit, and
#   * the largest number of non-null ids that share one (commit, name) pair.
# NOTE(review): presumably values above 1 mean that rows of the same commit
# could land on both sides of a train/test split -- confirm with the authors.
for project in constants.PROJECTS:
    df = pd.read_csv(f"../data/labeled/{project}.csv")

    # value_counts() sorts in descending order, so the first entry is the
    # count of the most frequent commit.
    rows_per_commit = df["commit"].value_counts()
    max_unique_commits = int(rows_per_commit.iloc[0])

    # Count the ids inside each (commit, name) group and keep the largest.
    ids_per_commit_and_name = df.groupby(["commit", "name"])["id"].count()
    max_reruns = int(ids_per_commit_and_name.max())

    print(project, " max unique commit:", max_unique_commits, " max reruns:", max_reruns)

A  max unique commit: 4  max reruns: 4
B  max unique commit: 11  max reruns: 11
C  max unique commit: 4  max reruns: 4
D  max unique commit: 6  max reruns: 2
E  max unique commit: 2  max reruns: 2
veloren  max unique commit: 4  max reruns: 2


Evaluating the size reduction achieved by log abstraction

In [3]:
# Collect one value per project: the mean, over all rows of the labeled CSV,
# of the percentage by which `log.clean` shortens the raw log text.
memory_reductions = []

for project in constants.PROJECTS:
    df = pd.read_csv(f"../data/labeled/{project}.csv")

    # Lengths (in characters) of each log before and after abstraction.
    raw_size = df["log"].str.len()
    abstract_size = df["log"].apply(log.clean).str.len()

    # Per-row reduction expressed as a percentage of the raw length.
    reduction = ((raw_size - abstract_size) * 100) / raw_size

    memory_reductions.append(reduction.mean())
    del df  # drop the reference to this project's frame before the next load

print(memory_reductions)

[np.float64(61.68925428643435), np.float64(64.20182838839996), np.float64(63.146457603666114), np.float64(70.27214008054487), np.float64(73.00070075751027), np.float64(71.56565150235059)]


In [4]:
# Unweighted average of the per-project mean reductions: every project
# contributes exactly one entry, regardless of how many rows it has.
np.asarray(memory_reductions).mean()

np.float64(67.31267210315103)

In [5]:
# Load the prepared data for project "A" and display it.
# The path is a plain string literal: it has no placeholders, so no f-prefix.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
df = pd.read_pickle("../data/prepared/A.pickle")
df

Unnamed: 0,id,name,commit,project,created_at,n_past_reruns,n_past_successes,n_past_fails,n_commit_since_brown,time_since_brown,recent_brownness_ratio,log,brown
114455,37224,unit test front-end,5d6dab0ef95c455161bace581ed70c1af19fcc0d,173,2019-08-29 13:05:05.108000+00:00,0,0,0,0,,,[0KRunning with gitlab-runner 11.8.0 (4745a6f...,0
114452,37227,unit test front-end,768108a7c1ec3e9abaffa0c790360eacb95db218,173,2019-08-29 13:52:00.389000+00:00,0,0,0,0,,0.0,[0KRunning with gitlab-runner 11.8.0 (4745a6f...,0
114449,37230,unit test front-end,8c3b858eff9829fc8390b2a31dbebe9770e21b15,173,2019-08-29 15:24:10.739000+00:00,0,0,0,0,,0.0,[0KRunning with gitlab-runner 11.8.0 (4745a6f...,0
114446,37274,unit test front-end,d49f2885b262a350deaeec591ee119bea9126de7,173,2019-08-30 08:43:23.971000+00:00,0,0,0,0,,0.0,[0KRunning with gitlab-runner 11.8.0 (4745a6f...,0
114443,37288,unit test front-end,adc1351dbf5ea4aa83fe03b2632fc69736cf8f1e,173,2019-08-30 10:25:19.898000+00:00,0,0,0,0,,0.0,[0KRunning with gitlab-runner 11.8.0 (4745a6f...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
107030,466934,scan_image,b4624ce599e31d4230a3be1d77a9c32595a955e6,173,2024-07-04 22:27:38.580000+00:00,0,0,0,4,610544.780,0.4,[0KRunning with gitlab-runner 15.8.3 (080abea...,0
107015,468339,scan_image,18b4aef6011fe1e42a65e122deee017f2e165f48,173,2024-07-08 18:42:13.184000+00:00,0,0,0,5,942619.384,0.2,[0KRunning with gitlab-runner 15.8.3 (080abea...,0
107012,469912,scan_image,6af9fc9dd26fce8f57c5ca592d375b9e94666ee7,173,2024-07-10 17:02:03.789000+00:00,0,0,0,6,1109409.989,0.0,[0KRunning with gitlab-runner 15.8.3 (080abea...,0
107009,469940,scan_image,649a68e6085bfa954adf5e2c0e6c0574380f95a8,173,2024-07-10 17:17:45.489000+00:00,0,0,0,7,1110351.689,0.0,[0KRunning with gitlab-runner 15.8.3 (080abea...,0


In [6]:
# Show the column labels of the frame loaded from the prepared pickle.
df.columns

Index(['id', 'name', 'commit', 'project', 'created_at', 'n_past_reruns',
       'n_past_successes', 'n_past_fails', 'n_commit_since_brown',
       'time_since_brown', 'recent_brownness_ratio', 'log', 'brown'],
      dtype='object')