/
testfailure.py
112 lines (93 loc) · 3.95 KB
/
testfailure.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import logging
import numpy as np
import xgboost
from imblearn.under_sampling import RandomUnderSampler
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from tqdm import tqdm
from dataset import commit_features, db
from .base import Model
from . import register
import utils
# Configure the root logger at import time with INFO as the minimum level.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module, used for the label statistics.
logger = logging.getLogger(__name__)
@register('testfailure')
class TestFailureModel(Model):
    """Binary commit classifier whose label is "did any commit have failures".

    The class is passed through ``register('testfailure')``. It wires together
    an under-sampler, a feature-extraction pipeline built from
    ``commit_features`` extractors, and an XGBoost classifier.
    """

    def __init__(self, lemmatization=False, path='data/commits.json'):
        """Build the sampler, the extraction pipeline and the classifier.

        Args:
            lemmatization: Forwarded unchanged to ``Model.__init__``.
            path: Forwarded unchanged to ``Model.__init__``. Presumably this
                is what the base class exposes as ``self.commits_path`` --
                TODO confirm against ``Model``.
        """
        super().__init__(lemmatization, path)

        # Fixed seed so the under-sampling is reproducible between runs.
        self.sampler = RandomUnderSampler(random_state=0)

        # Extractors that are commented out are currently disabled.
        feature_extractors = [
            commit_features.source_code_file_size(),
            commit_features.other_file_size(),
            commit_features.test_file_size(),
            commit_features.source_code_added(),
            commit_features.other_added(),
            commit_features.test_added(),
            commit_features.source_code_deleted(),
            commit_features.other_deleted(),
            commit_features.test_deleted(),
            # commit_features.author_experience(),
            # commit_features.reviewer_experience(),
            commit_features.reviewers_num(),
            # commit_features.component_touched_prev(),
            # commit_features.directory_touched_prev(),
            # commit_features.file_touched_prev(),
            commit_features.types(),
            commit_features.files(),
            commit_features.components(),
            commit_features.components_modified_num(),
            commit_features.directories(),
            commit_features.directories_modified_num(),
            commit_features.source_code_files_modified_num(),
            commit_features.other_files_modified_num(),
            commit_features.test_files_modified_num(),
        ]

        self.extraction_pipeline = Pipeline(
            [
                (
                    "commit_extractor",
                    # The second argument is passed as an empty list; its
                    # meaning is defined by CommitExtractor (not shown here).
                    commit_features.CommitExtractor(feature_extractors, []),
                ),
                (
                    "union",
                    # Vectorize the "data" column as float32 features.
                    ColumnTransformer(
                        [("data", DictVectorizer(dtype=np.float32), "data")]
                    ),
                ),
            ]
        )

        self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
        # NOTE(review): recent XGBoost releases deprecate ``predictor`` in
        # favour of ``device`` -- confirm against the pinned version.
        self.clf.set_params(predictor="cpu_predictor")

    def items_gen(self, limit=None):
        """Yield ``(commit_data, label)`` pairs from the test-scheduling data.

        Every record read from ``data/test_scheduling.pickle.zstd`` lists
        revisions under ``'revs'``. The revisions found in the commit map are
        merged with ``commit_features.merge_commits``; records with no known
        revision are skipped.

        Args:
            limit: Maximum number of records to *read* (skipped records count
                towards it). A falsy value (``None`` or ``0``) means no limit.

        Yields:
            tuple: ``(commit_data, label)`` where ``label`` is ``1`` when any
            matched commit has a truthy ``'failures'`` entry and ``0``
            otherwise.

        Raises:
            AssertionError: If the commit map is empty.
        """
        commit_map = utils.get_commit_map(path=self.commits_path)
        assert len(commit_map) > 0, "commit map is empty"

        # The progress-bar total is derived from the commit map size, capped
        # by ``limit`` when one is given.
        total = min(limit, len(commit_map)) if limit else len(commit_map)
        records = tqdm(
            db.read('data/test_scheduling.pickle.zstd'),
            total=total,
            desc='generating data',
        )

        for index, record in enumerate(records, start=1):
            if limit and index > limit:
                break

            commits = tuple(
                commit_map[revision]
                for revision in record['revs']
                if revision in commit_map
            )
            if not commits:
                continue

            # Merge before labelling, matching the original statement order.
            commit_data = commit_features.merge_commits(commits)
            label = 1 if any(commit['failures'] for commit in commits) else 0
            yield commit_data, label

    def get_labels(self):
        """Log how many commits failed / did not fail and return the classes.

        Reads commits from ``self.commits_path`` and labels each ``'node'``
        with ``1`` when its ``'failures'`` entry is truthy, else ``0``. The
        per-commit labels are used only for the two log messages.

        Returns:
            list: The class labels ``[0, 1]``.
        """
        # ``limit`` is not assigned anywhere in this class, so read it
        # defensively: a missing attribute is treated as "no limit".
        limit = getattr(self, "limit", None)

        classes = {}
        for commit in db.read(self.commits_path):
            if limit and len(classes) >= limit:
                break
            classes[commit['node']] = 1 if commit['failures'] else 0

        # Labels are 0/1, so their sum is the number of failing commits.
        failed = sum(classes.values())
        logger.info("%d commits failed", failed)
        logger.info("%d commits did not fail", len(classes) - failed)

        return [0, 1]

    def get_feature_names(self):
        """Return the output feature names of the pipeline's "union" step."""
        return self.extraction_pipeline.named_steps["union"].get_feature_names_out()