In [87]:
import pandas as pd
import numpy as np
import math
import pickle
import random

from scipy import stats
import scipy.io
from scipy.spatial.distance import pdist
from scipy.linalg import cholesky
from scipy.io import loadmat
from scipy.spatial import distance

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report,roc_auc_score,recall_score,precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering

from src import SMOTE
from src import CFS
from src import metrices_V2 as metrices

import platform
from os import listdir
from os.path import isfile, join
from glob import glob
from pathlib import Path
import sys
import os
import copy
import traceback
from pathlib import Path

import matplotlib.pyplot as plt

In [131]:
def load_data(project,commits):
    """Build the scaled feature matrix and bug labels for one project.

    Reads the project's Understand metrics CSV and its Commit Guru CSV, keeps
    only the Commit Guru rows whose ``commit_hash`` is in ``commits``, joins
    the two tables on ``commit_hash`` and scales every remaining feature
    column with ``MinMaxScaler``.

    Parameters
    ----------
    project : str
        Project name; used to build both CSV paths.
    commits : list-like
        Commit hashes to keep from the Commit Guru table.

    Returns
    -------
    X : pandas.DataFrame
        Scaled features, with the same column names as before scaling.
    y : pandas.Series
        The ``Bugs`` column of the joined table.
    """
    understand_path = 'data/understand_files_all/' + project + '_understand.csv'
    commit_guru_path = 'data/commit_guru/' + project + '.csv'

    understand_df = pd.read_csv(understand_path)
    understand_df = understand_df.dropna(axis = 1,how='all')

    # Move the identifier/label columns to the front. Because each one is
    # inserted at position 0, the final leading order is the reverse of this
    # list (Bugs first).
    cols_list = understand_df.columns.values.tolist()
    n_leading = 0
    for item in ['Kind', 'Name','commit_hash', 'Bugs']:
        if item in cols_list:
            cols_list.remove(item)
            cols_list.insert(0,item)
            n_leading += 1
    understand_df = understand_df[cols_list]

    # De-duplicate on the metric columns only. Counting how many identifier
    # columns were actually present (instead of assuming all four) keeps a
    # metric from being dropped out of the duplicate key when one is missing.
    metric_cols = cols_list[n_leading:]
    understand_df = understand_df.drop_duplicates(subset = metric_cols)

    commit_guru_df = pd.read_csv(commit_guru_path)
    commit_guru_df = commit_guru_df.drop(labels = ['parent_hashes','author_name',
                                                   'author_email','fileschanged','author_date',
                                                   'author_date_unix_timestamp', 'commit_message',
                                                   'classification', 'fix', 'contains_bug','fixes'],axis=1)
    commit_guru_df = commit_guru_df[commit_guru_df['commit_hash'].isin(commits)]

    df = understand_df.merge(commit_guru_df,on='commit_hash')

    # Rotate the first column (Bugs, after the reordering above) to the end.
    cols = df.columns.tolist()
    cols = cols[1:] + [cols[0]]
    df = df[cols]

    # Identifier columns are not features.
    df = df.drop(labels = [item for item in ['Kind', 'Name','commit_hash'] if item in cols],axis=1)
    df = df.dropna().reset_index(drop=True)

    y = df.Bugs
    X = df.drop('Bugs',axis = 1)
    feature_names = X.columns
    X = pd.DataFrame(MinMaxScaler().fit_transform(X),columns = feature_names)
    return X,y

def load_understand_data(project):
    """Load and tidy the Understand metrics table of one project.

    Reads ``data/understand_files_all/<project>_understand.csv``, drops
    columns that are entirely empty, removes rows whose metric values
    duplicate an earlier row, drops the ``Kind`` and ``Name`` columns when
    present, and removes rows containing any missing value.

    Parameters
    ----------
    project : str
        Project name; used to build the CSV path.

    Returns
    -------
    pandas.DataFrame
        Cleaned table with a fresh 0..n-1 index. The column that was first
        after the identifier reordering (``Bugs`` when present) is placed
        last.
    """
    understand_path = 'data/understand_files_all/' + project + '_understand.csv'
    understand_df = pd.read_csv(understand_path)
    understand_df = understand_df.dropna(axis = 1,how='all')

    # Move the identifier/label columns to the front; each is inserted at
    # position 0, so the final leading order is the reverse of this list.
    cols_list = understand_df.columns.values.tolist()
    n_leading = 0
    for item in ['Kind', 'Name','commit_hash', 'Bugs']:
        if item in cols_list:
            cols_list.remove(item)
            cols_list.insert(0,item)
            n_leading += 1
    understand_df = understand_df[cols_list]

    # De-duplicate on the metric columns only. The number of leading
    # identifier columns is counted rather than assumed to be four, so a file
    # lacking e.g. ``Kind`` still uses every metric in the duplicate key.
    metric_cols = cols_list[n_leading:]
    df = understand_df.drop_duplicates(subset = metric_cols)

    # Rotate the first column to the end.
    cols = df.columns.tolist()
    cols = cols[1:] + [cols[0]]
    df = df[cols]

    df = df.drop(labels = [item for item in ['Kind', 'Name'] if item in cols],axis=1)
    df = df.dropna().reset_index(drop=True)
    return df

def load_commit_data(project):
    """Load the Commit Guru table of one project with a ``Bugs`` label column.

    Reads ``data/commit_guru/<project>.csv``, drops author/message/bookkeeping
    columns, renames ``contains_bug`` to ``Bugs``, treats a missing ``Bugs``
    value as ``False`` and then removes every row that still contains a
    missing value.

    Parameters
    ----------
    project : str
        Project name; used to build the CSV path.

    Returns
    -------
    pandas.DataFrame
        Cleaned table with a fresh 0..n-1 index; ``fileschanged`` is kept.
    """
    commit_guru_path = 'data/commit_guru/' + project + '.csv'
    commit_guru_df = pd.read_csv(commit_guru_path)

    df = commit_guru_df.drop(labels = ['parent_hashes','author_name',
                                       'author_email','author_date',
                                       'author_date_unix_timestamp', 'commit_message',
                                       'classification', 'fix','fixes'],axis=1)
    df = df.rename(columns={"contains_bug": "Bugs"})

    # An unlabelled commit counts as not buggy; this must happen before the
    # dropna below, otherwise those rows would be discarded instead.
    df['Bugs'] = df['Bugs'].fillna(False)

    df = df.dropna().reset_index(drop=True)
    return df

def normalize(X):
    """Scale every column except ``commit_hash`` with ``MinMaxScaler``.

    Parameters
    ----------
    X : pandas.DataFrame
        Table with a ``commit_hash`` column; all other columns are scaled.

    Returns
    -------
    pandas.DataFrame
        Scaled columns (original names, fresh 0..n-1 index) followed by
        ``commit_hash`` in the original row order.
    """
    commit_hash = X.commit_hash
    features = X.drop('commit_hash',axis = 1)
    feature_names = features.columns
    scaled = pd.DataFrame(MinMaxScaler().fit_transform(features),columns = feature_names)
    # Assign by position: the scaled frame has a new RangeIndex, so assigning
    # the Series itself would align on labels and yield NaN / shifted hashes
    # whenever X does not already carry a 0..n-1 index.
    scaled['commit_hash'] = commit_hash.values
    return scaled

def apply_smote(df):
    """Run the project's SMOTE implementation on ``df``.

    The frame produced by ``SMOTE.smote(df).run()`` is relabelled with the
    column names of the input before it is returned.
    (Assumes the result has as many columns as ``df`` - TODO confirm against
    ``src/SMOTE``.)
    """
    original_names = df.columns
    resampled = SMOTE.smote(df).run()
    resampled.columns = original_names
    return resampled

def apply_cfs(df):
    """Keep only the features chosen by the project's CFS selector.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a ``Bugs`` label column plus feature columns.

    Returns
    -------
    (pandas.DataFrame, list)
        ``df`` restricted to the selected feature columns followed by
        ``Bugs``, and the list of those column names.
    """
    y = df.Bugs.values
    features = df.drop(labels = ['Bugs'],axis = 1)
    selected_cols = CFS.cfs(features.values,y)
    # Assumes CFS.cfs returns integer column positions into the matrix it was
    # given - TODO confirm against src/CFS.
    # The positions are flattened to a 1-D integer array (the previous nested
    # ``[[selected_cols]]`` indexer relied on a legacy NumPy interpretation)
    # and resolved against the feature columns, i.e. the same matrix CFS saw,
    # so the result stays correct even when ``Bugs`` is not the last column.
    positions = np.asarray(selected_cols).ravel().astype(int)
    cols = features.columns[positions].tolist()
    cols.append('Bugs')
    return df[cols],cols
                     
# Projects with at most this many buggy commits are not sub-sampled: every
# buggy commit is kept as-is.
_MIN_BUGGY_FOR_SAMPLING = 100


def _buggy_commit_hashes(commit_df):
    """Return the hashes of all rows flagged ``Bugs == True``, in row order."""
    return commit_df[commit_df['Bugs'] == True].commit_hash.values.tolist()


def _explode_java_changes(commit_df):
    """Expand the commit table to one row per (commit, changed ``.java`` file).

    ``fileschanged`` is split on commas; entries whose text after the last
    ``.`` is not ``java`` are ignored. The result has the columns of
    ``commit_df`` minus ``fileschanged``, plus a trailing ``file`` column, and
    keeps commits (and files within a commit) in their original order.
    """
    row_positions = []
    java_files = []
    for position, changed in enumerate(commit_df['fileschanged']):
        for path in changed.split(','):
            if path.split('.')[-1] == 'java':
                row_positions.append(position)
                java_files.append(path)
    changes = commit_df.iloc[row_positions].drop('fileschanged',axis = 1)
    changes = changes.reset_index(drop=True)
    return changes.assign(file = java_files)


def _buggy_rows_per_file(changes):
    """Yield, per file, the rows of ``changes`` flagged buggy.

    Files are visited in order of first appearance in ``changes``; files
    without any buggy row are skipped. Grouping once replaces a full-table
    scan per file.
    """
    buggy = changes[changes['Bugs'] == True]
    rows_by_file = {name: rows for name, rows in buggy.groupby('file',sort=False)}
    for u_file in changes['file'].unique():
        rows = rows_by_file.get(u_file)
        if rows is not None:
            yield rows


def _median_changes_per_file(changes):
    """Median of the per-file row counts of ``changes`` (buggy or not)."""
    return np.median(changes.groupby('file').count().commit_hash.values)


def _random_commit(buggy_rows):
    """Pick one commit hash uniformly at random from ``buggy_rows``."""
    hashes = buggy_rows.commit_hash.values.tolist()
    return hashes[random.randint(0,len(hashes)-1)]


def _most_distant_pair(buggy_rows):
    """Return the two commit hashes whose scaled feature rows are farthest apart.

    Features are every column except ``commit_hash``, ``Bugs`` and ``file``,
    min-max scaled within ``buggy_rows``; distance is ``cdist``'s default
    metric. The first commit is the row holding the overall largest distance,
    the second is the row farthest from it (first occurrence wins ties).
    Positions are used instead of hash labels so repeated hashes cannot make
    the lookup ambiguous.
    """
    hashes = buggy_rows.commit_hash.values.tolist()
    features = buggy_rows.drop(['commit_hash','Bugs','file'],axis = 1).values.tolist()
    scaled = MinMaxScaler().fit_transform(features)
    dist = distance.cdist(scaled,scaled)
    first = int(dist.max(axis=1).argmax())
    second = int(dist[first].argmax())
    return [hashes[first],hashes[second]]


# Assumption: for every defective file, randomly pick one of its buggy changes
def get_commit_subset_s1(project):
    """Strategy 1: one random buggy commit per changed ``.java`` file.

    Projects with at most 100 buggy commits return all of their buggy commit
    hashes instead.
    """
    commit_df = load_commit_data(project)
    buggy_hashes = _buggy_commit_hashes(commit_df)
    if len(buggy_hashes) <= _MIN_BUGGY_FOR_SAMPLING:
        return buggy_hashes
    changes = _explode_java_changes(commit_df)
    return [_random_commit(rows) for rows in _buggy_rows_per_file(changes)]

# Assumption: for every defective file, pick its two most distant buggy changes
def get_commit_subset_s2(project):
    """Strategy 2: the two most distant buggy commits per ``.java`` file.

    Files with at most two buggy changes contribute all of them. Projects
    with at most 100 buggy commits return all of their buggy commit hashes
    instead.
    """
    commit_df = load_commit_data(project)
    buggy_hashes = _buggy_commit_hashes(commit_df)
    if len(buggy_hashes) <= _MIN_BUGGY_FOR_SAMPLING:
        return buggy_hashes
    changes = _explode_java_changes(commit_df)
    commits = []
    for rows in _buggy_rows_per_file(changes):
        if rows.shape[0] > 2:
            commits.extend(_most_distant_pair(rows))
        else:
            commits.extend(rows.commit_hash.values.tolist())
    return commits

# Assumption: for every defective file whose number of buggy changes is more
# than the median number of changes per file, pick one buggy change at random
def get_commit_subset_s3(project):
    """Strategy 3: like strategy 1, restricted to frequently-buggy files.

    A file qualifies when its count of buggy changes exceeds the median
    number of changes per file. Projects with at most 100 buggy commits
    return all of their buggy commit hashes instead.
    """
    commit_df = load_commit_data(project)
    buggy_hashes = _buggy_commit_hashes(commit_df)
    if len(buggy_hashes) <= _MIN_BUGGY_FOR_SAMPLING:
        return buggy_hashes
    changes = _explode_java_changes(commit_df)
    median = _median_changes_per_file(changes)
    return [_random_commit(rows)
            for rows in _buggy_rows_per_file(changes)
            if rows.shape[0] > median]

# Assumption: for every defective file whose number of buggy changes is more
# than the median number of changes per file, pick the two most distant ones
def get_commit_subset_s4(project):
    """Strategy 4: like strategy 2, restricted to frequently-buggy files.

    A file qualifies when its count of buggy changes exceeds the median
    number of changes per file; other files contribute nothing. Projects
    with at most 100 buggy commits return all of their buggy commit hashes
    instead.
    """
    commit_df = load_commit_data(project)
    buggy_hashes = _buggy_commit_hashes(commit_df)
    if len(buggy_hashes) <= _MIN_BUGGY_FOR_SAMPLING:
        return buggy_hashes
    changes = _explode_java_changes(commit_df)
    median = _median_changes_per_file(changes)
    commits = []
    for rows in _buggy_rows_per_file(changes):
        if rows.shape[0] > median:
            commits.extend(_most_distant_pair(rows))
    return commits

In [132]:
# Project catalogue: the repo_name column lists the projects to evaluate.
proj_df = pd.read_csv('projects.csv')
projects = proj_df['repo_name'].tolist()

In [133]:
def run(project, commit_subset=None):
    """Train and evaluate a random forest defect predictor for one project.

    Selects a subset of commits, builds the feature matrix with
    ``load_data``, holds out 40% of the rows for testing (fixed split seed),
    oversamples the training part with ``apply_smote``, fits a
    ``RandomForestClassifier`` and scores its predictions on the test part.
    The scikit-learn classification report is printed as a side effect.

    Parameters
    ----------
    project : str
        Project name, passed to the commit-subset strategy and ``load_data``.
    commit_subset : callable, optional
        Function mapping a project name to a list of commit hashes. Defaults
        to ``get_commit_subset_s4``; pass e.g. ``get_commit_subset_s2`` to
        evaluate another strategy.

    Returns
    -------
    tuple
        ``(recall, precision, pf, f1, g_score, auc, pci_20, ifa)``.
    """
    if commit_subset is None:
        commit_subset = get_commit_subset_s4
    commits = commit_subset(project)
    X,y = load_data(project,commits)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=18)
    # Lines of code of the test rows (already scaled by load_data), handed to
    # the metrics helper together with the predictions.
    loc = X_test.CountLineCode

    # Oversample the training split only; the test split is left untouched.
    df_smote = apply_smote(pd.concat([X_train,y_train],axis = 1))
    y_train = df_smote.Bugs
    X_train = df_smote.drop('Bugs',axis = 1)

    clf = RandomForestClassifier()
    clf.fit(X_train,y_train)
    predicted = clf.predict(X_test)

    abcd = metrices.measures(y_test,predicted,loc)
    pf = abcd.get_pf()
    recall = abcd.calculate_recall()
    precision = abcd.calculate_precision()
    f1 = abcd.calculate_f1_score()
    g_score = abcd.get_g_score()
    pci_20 = abcd.get_pci_20()
    ifa = abcd.get_ifa()
    try:
        auc = roc_auc_score(y_test, predicted)
    except ValueError:
        # roc_auc_score rejects a test split that contains a single class;
        # report 0 in that case rather than hiding every possible error.
        auc = 0
    print(classification_report(y_test, predicted))
    return recall,precision,pf,f1,g_score,auc,pci_20,ifa

In [135]:
# Per-metric result stores, each mapping project name -> score.
precision_list = {}
recall_list = {}
pf_list = {}
f1_list = {}
g_list = {}
auc_list = {}
pci_20_list = {}
ifa_list = {}
count = 0  # not used in this cell; kept in case another cell reads it

# Same order as the tuple returned by run().
metric_stores = (recall_list, precision_list, pf_list, f1_list,
                 g_list, auc_list, pci_20_list, ifa_list)

for project in projects:
    try:
        scores = run(project)
    except Exception as e:
        # Best effort: a failing project is reported and skipped. The project
        # name is included so the failure can be traced afterwards.
        print('{}: {}'.format(project, e))
        continue
    for store, score in zip(metric_stores, scores):
        store[project] = score

final_result = {}
final_result['precision'] = precision_list
final_result['recall'] = recall_list
final_result['pf'] = pf_list
final_result['f1'] = f1_list
final_result['g'] = g_list
final_result['auc'] = auc_list
final_result['pci_20'] = pci_20_list
final_result['ifa'] = ifa_list
with open('data/strategy_4_1000.pkl', 'wb') as handle:
    pickle.dump(final_result, handle, protocol=pickle.HIGHEST_PROTOCOL)

              precision    recall  f1-score   support

           0       0.90      0.94      0.92       138
           1       0.95      0.91      0.93       168

    accuracy                           0.92       306
   macro avg       0.92      0.93      0.92       306
weighted avg       0.93      0.92      0.92       306

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       164

    accuracy                           1.00       164
   macro avg       1.00      1.00      1.00       164
weighted avg       1.00      1.00      1.00       164

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         7

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

              precision    recall  f1-score   support

           0      

              precision    recall  f1-score   support

           0       1.00      0.95      0.97        19
           1       0.50      1.00      0.67         1

    accuracy                           0.95        20
   macro avg       0.75      0.97      0.82        20
weighted avg       0.97      0.95      0.96        20

index 1 is out of bounds for axis 0 with size 1
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       511

    accuracy                           1.00       511
   macro avg       1.00      1.00      1.00       511
weighted avg       1.00      1.00      1.00       511

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       509

    accuracy                           1.00       509
   macro avg       1.00      1.00      1.00       509
weighted avg       1.00      1.00      1.00       509

              precision    recall  f1-score   support

           0       0.92 

              precision    recall  f1-score   support

           0       0.83      0.91      0.87        11
           1       0.67      0.50      0.57         4

    accuracy                           0.80        15
   macro avg       0.75      0.70      0.72        15
weighted avg       0.79      0.80      0.79        15

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         2

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3

              precision    recall  f1-score   support

           0       0.94      1.00      0.97        51
           1       0.00      0.00      0.00         3

    accuracy                           0.94        54
   macro avg       0.47      0.50      0.49        54
weighted avg       0.89      0.94      0.92        54

index 1 is out of b

              precision    recall  f1-score   support

           0       0.86      0.86      0.86         7
           1       0.86      0.86      0.86         7

    accuracy                           0.86        14
   macro avg       0.86      0.86      0.86        14
weighted avg       0.86      0.86      0.86        14

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         3

    accuracy                           1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00      1.00      1.00         5

              precision    recall  f1-score   support

           0       0.60      0.86      0.71         7
           1       0.86      0.60      0.71        10

    accuracy                           0.71        17
   macro avg       0.73      0.73      0.71        17
weighted avg       0.75      0.71      0.71        17

              preci

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1060

    accuracy                           1.00      1060
   macro avg       1.00      1.00      1.00      1060
weighted avg       1.00      1.00      1.00      1060

              precision    recall  f1-score   support

           0       0.70      0.88      0.78         8
           1       0.50      0.25      0.33         4

    accuracy                           0.67        12
   macro avg       0.60      0.56      0.56        12
weighted avg       0.63      0.67      0.63        12

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       228

    accuracy                           1.00       228
   macro avg       1.00      1.00      1.00       228
weighted avg       1.00      1.00      1.00       228

              precision    recall  f1-score   support

           0       0.80      0.40      0.53        20
           1      

              precision    recall  f1-score   support

           0       0.88      1.00      0.93         7
           1       1.00      0.67      0.80         3

    accuracy                           0.90        10
   macro avg       0.94      0.83      0.87        10
weighted avg       0.91      0.90      0.89        10

              precision    recall  f1-score   support

           0       0.74      0.70      0.72        44
           1       0.85      0.87      0.86        84

    accuracy                           0.81       128
   macro avg       0.79      0.79      0.79       128
weighted avg       0.81      0.81      0.81       128

              precision    recall  f1-score   support

           0       0.93      0.92      0.93        62
           1       0.72      0.76      0.74        17

    accuracy                           0.89        79
   macro avg       0.83      0.84      0.83        79
weighted avg       0.89      0.89      0.89        79

              preci

              precision    recall  f1-score   support

           0       0.71      0.73      0.72        33
           1       0.91      0.90      0.90        97

    accuracy                           0.85       130
   macro avg       0.81      0.81      0.81       130
weighted avg       0.86      0.85      0.85       130

              precision    recall  f1-score   support

           0       0.89      1.00      0.94         8
           1       1.00      0.50      0.67         2

    accuracy                           0.90        10
   macro avg       0.94      0.75      0.80        10
weighted avg       0.91      0.90      0.89        10

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       1.00      1.00      1.00         5

    accuracy                           1.00        16
   macro avg       1.00      1.00      1.00        16
weighted avg       1.00      1.00      1.00        16

              preci

              precision    recall  f1-score   support

           0       0.80      0.89      0.84        27
           1       0.88      0.79      0.84        29

    accuracy                           0.84        56
   macro avg       0.84      0.84      0.84        56
weighted avg       0.84      0.84      0.84        56

              precision    recall  f1-score   support

           0       0.67      0.75      0.71        16
           1       0.75      0.67      0.71        18

    accuracy                           0.71        34
   macro avg       0.71      0.71      0.71        34
weighted avg       0.71      0.71      0.71        34

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       339
           1       0.76      0.94      0.84        17

    accuracy                           0.98       356
   macro avg       0.88      0.96      0.92       356
weighted avg       0.99      0.98      0.98       356

              preci

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       417
           1       0.80      0.36      0.50        11

    accuracy                           0.98       428
   macro avg       0.89      0.68      0.75       428
weighted avg       0.98      0.98      0.98       428

              precision    recall  f1-score   support

           0       0.80      0.80      0.80         5
           1       0.93      0.93      0.93        14

    accuracy                           0.89        19
   macro avg       0.86      0.86      0.86        19
weighted avg       0.89      0.89      0.89        19

              precision    recall  f1-score   support

           0       0.95      0.95      0.95       304
           1       0.69      0.71      0.70        48

    accuracy                           0.92       352
   macro avg       0.82      0.83      0.83       352
weighted avg       0.92      0.92      0.92       352

              preci

              precision    recall  f1-score   support

           0       1.00      0.87      0.93        15
           1       0.89      1.00      0.94        17

    accuracy                           0.94        32
   macro avg       0.95      0.93      0.94        32
weighted avg       0.94      0.94      0.94        32

              precision    recall  f1-score   support

           0       0.83      0.83      0.83        23
           1       0.85      0.85      0.85        27

    accuracy                           0.84        50
   macro avg       0.84      0.84      0.84        50
weighted avg       0.84      0.84      0.84        50

              precision    recall  f1-score   support

           0       0.79      0.72      0.75        76
           1       0.81      0.86      0.83       104

    accuracy                           0.80       180
   macro avg       0.80      0.79      0.79       180
weighted avg       0.80      0.80      0.80       180

              preci

              precision    recall  f1-score   support

           0       0.80      0.53      0.64        15
           1       0.71      0.89      0.79        19

    accuracy                           0.74        34
   macro avg       0.75      0.71      0.72        34
weighted avg       0.75      0.74      0.72        34

              precision    recall  f1-score   support

           0       0.25      0.25      0.25         4
           1       0.83      0.83      0.83        18

    accuracy                           0.73        22
   macro avg       0.54      0.54      0.54        22
weighted avg       0.73      0.73      0.73        22

              precision    recall  f1-score   support

           0       0.82      0.78      0.80        63
           1       0.62      0.68      0.65        34

    accuracy                           0.74        97
   macro avg       0.72      0.73      0.72        97
weighted avg       0.75      0.74      0.74        97

              preci

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       411
           1       0.75      0.67      0.71         9

    accuracy                           0.99       420
   macro avg       0.87      0.83      0.85       420
weighted avg       0.99      0.99      0.99       420

              precision    recall  f1-score   support

           0       0.86      0.86      0.86         7
           1       0.88      0.88      0.88         8

    accuracy                           0.87        15
   macro avg       0.87      0.87      0.87        15
weighted avg       0.87      0.87      0.87        15

              precision    recall  f1-score   support

           0       0.84      0.84      0.84        57
           1       0.71      0.71      0.71        31

    accuracy                           0.80        88
   macro avg       0.78      0.78      0.78        88
weighted avg       0.80      0.80      0.80        88

              preci

              precision    recall  f1-score   support

           0       0.62      1.00      0.77         5
           1       1.00      0.88      0.94        25

    accuracy                           0.90        30
   macro avg       0.81      0.94      0.85        30
weighted avg       0.94      0.90      0.91        30

              precision    recall  f1-score   support

           0       0.94      0.80      0.86        20
           1       0.93      0.98      0.96        56

    accuracy                           0.93        76
   macro avg       0.94      0.89      0.91        76
weighted avg       0.93      0.93      0.93        76

              precision    recall  f1-score   support

           0       0.92      0.96      0.94       185
           1       0.73      0.54      0.62        35

    accuracy                           0.90       220
   macro avg       0.82      0.75      0.78       220
weighted avg       0.89      0.90      0.89       220

              preci

              precision    recall  f1-score   support

           0       0.88      0.93      0.90       114
           1       0.87      0.78      0.82        67

    accuracy                           0.87       181
   macro avg       0.87      0.85      0.86       181
weighted avg       0.87      0.87      0.87       181

              precision    recall  f1-score   support

           0       0.67      0.71      0.69        14
           1       0.80      0.76      0.78        21

    accuracy                           0.74        35
   macro avg       0.73      0.74      0.74        35
weighted avg       0.75      0.74      0.74        35

              precision    recall  f1-score   support

           0       0.75      0.78      0.77        79
           1       0.65      0.60      0.62        52

    accuracy                           0.71       131
   macro avg       0.70      0.69      0.69       131
weighted avg       0.71      0.71      0.71       131

              preci

              precision    recall  f1-score   support

           0       0.95      0.95      0.95       105
           1       0.84      0.84      0.84        31

    accuracy                           0.93       136
   macro avg       0.90      0.90      0.90       136
weighted avg       0.93      0.93      0.93       136

              precision    recall  f1-score   support

           0       0.90      0.95      0.92       150
           1       0.88      0.78      0.83        74

    accuracy                           0.89       224
   macro avg       0.89      0.87      0.88       224
weighted avg       0.89      0.89      0.89       224

              precision    recall  f1-score   support

           0       0.92      0.70      0.80        47
           1       0.63      0.89      0.74        27

    accuracy                           0.77        74
   macro avg       0.77      0.80      0.77        74
weighted avg       0.81      0.77      0.77        74

              preci

              precision    recall  f1-score   support

           0       0.80      0.92      0.86        71
           1       0.71      0.48      0.58        31

    accuracy                           0.78       102
   macro avg       0.76      0.70      0.72       102
weighted avg       0.78      0.78      0.77       102

              precision    recall  f1-score   support

           0       0.83      0.83      0.83       176
           1       0.71      0.72      0.71       103

    accuracy                           0.79       279
   macro avg       0.77      0.77      0.77       279
weighted avg       0.79      0.79      0.79       279

              precision    recall  f1-score   support

           0       0.67      0.88      0.76        16
           1       0.89      0.71      0.79        24

    accuracy                           0.78        40
   macro avg       0.78      0.79      0.77        40
weighted avg       0.80      0.78      0.78        40

              preci

              precision    recall  f1-score   support

           0       0.81      0.78      0.79        37
           1       0.83      0.84      0.84        45

    accuracy                           0.82        82
   macro avg       0.82      0.81      0.81        82
weighted avg       0.82      0.82      0.82        82

              precision    recall  f1-score   support

           0       0.86      0.85      0.86        73
           1       0.87      0.88      0.88        86

    accuracy                           0.87       159
   macro avg       0.87      0.87      0.87       159
weighted avg       0.87      0.87      0.87       159

              precision    recall  f1-score   support

           0       0.81      0.90      0.85        62
           1       0.90      0.81      0.85        68

    accuracy                           0.85       130
   macro avg       0.86      0.86      0.85       130
weighted avg       0.86      0.85      0.85       130

              preci

              precision    recall  f1-score   support

           0       0.85      0.92      0.88        74
           1       0.75      0.60      0.67        30

    accuracy                           0.83       104
   macro avg       0.80      0.76      0.77       104
weighted avg       0.82      0.83      0.82       104

              precision    recall  f1-score   support

           0       0.50      0.33      0.40         9
           1       0.82      0.90      0.86        31

    accuracy                           0.78        40
   macro avg       0.66      0.62      0.63        40
weighted avg       0.75      0.78      0.76        40

              precision    recall  f1-score   support

           0       0.92      0.88      0.90       115
           1       0.77      0.84      0.80        55

    accuracy                           0.86       170
   macro avg       0.84      0.86      0.85       170
weighted avg       0.87      0.86      0.87       170

              preci

              precision    recall  f1-score   support

           0       0.82      0.76      0.79        59
           1       0.75      0.80      0.77        51

    accuracy                           0.78       110
   macro avg       0.78      0.78      0.78       110
weighted avg       0.78      0.78      0.78       110

              precision    recall  f1-score   support

           0       0.63      0.63      0.63        51
           1       0.85      0.85      0.85       130

    accuracy                           0.79       181
   macro avg       0.74      0.74      0.74       181
weighted avg       0.79      0.79      0.79       181

              precision    recall  f1-score   support

           0       0.76      0.89      0.82        54
           1       0.71      0.50      0.59        30

    accuracy                           0.75        84
   macro avg       0.74      0.69      0.70        84
weighted avg       0.74      0.75      0.74        84

              preci

              precision    recall  f1-score   support

           0       0.86      0.87      0.86        68
           1       0.73      0.71      0.72        34

    accuracy                           0.81       102
   macro avg       0.79      0.79      0.79       102
weighted avg       0.81      0.81      0.81       102

              precision    recall  f1-score   support

           0       0.69      0.56      0.62        36
           1       0.87      0.92      0.90       117

    accuracy                           0.84       153
   macro avg       0.78      0.74      0.76       153
weighted avg       0.83      0.84      0.83       153

              precision    recall  f1-score   support

           0       0.84      0.83      0.83        81
           1       0.82      0.83      0.83        77

    accuracy                           0.83       158
   macro avg       0.83      0.83      0.83       158
weighted avg       0.83      0.83      0.83       158

              preci

              precision    recall  f1-score   support

           0       0.87      0.94      0.91       477
           1       0.73      0.52      0.61       140

    accuracy                           0.85       617
   macro avg       0.80      0.73      0.76       617
weighted avg       0.84      0.85      0.84       617

              precision    recall  f1-score   support

           0       0.93      0.96      0.95       677
           1       0.84      0.73      0.78       171

    accuracy                           0.92       848
   macro avg       0.89      0.84      0.86       848
weighted avg       0.91      0.92      0.91       848

              precision    recall  f1-score   support

           0       0.92      0.96      0.94        24
           1       0.96      0.93      0.94        27

    accuracy                           0.94        51
   macro avg       0.94      0.94      0.94        51
weighted avg       0.94      0.94      0.94        51

              preci

              precision    recall  f1-score   support

           0       0.71      0.77      0.74        94
           1       0.83      0.79      0.81       138

    accuracy                           0.78       232
   macro avg       0.77      0.78      0.77       232
weighted avg       0.78      0.78      0.78       232

              precision    recall  f1-score   support

           0       0.87      0.86      0.87       124
           1       0.84      0.85      0.84       104

    accuracy                           0.86       228
   macro avg       0.85      0.85      0.85       228
weighted avg       0.86      0.86      0.86       228

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      3382
           1       0.88      0.85      0.87       386

    accuracy                           0.97      3768
   macro avg       0.93      0.92      0.93      3768
weighted avg       0.97      0.97      0.97      3768

              preci

              precision    recall  f1-score   support

           0       0.82      0.78      0.80       187
           1       0.84      0.87      0.86       253

    accuracy                           0.83       440
   macro avg       0.83      0.82      0.83       440
weighted avg       0.83      0.83      0.83       440

              precision    recall  f1-score   support

           0       0.79      0.72      0.75       113
           1       0.76      0.82      0.79       125

    accuracy                           0.77       238
   macro avg       0.77      0.77      0.77       238
weighted avg       0.77      0.77      0.77       238

              precision    recall  f1-score   support

           0       0.87      0.83      0.85       112
           1       0.81      0.86      0.83        97

    accuracy                           0.84       209
   macro avg       0.84      0.84      0.84       209
weighted avg       0.84      0.84      0.84       209

              preci

              precision    recall  f1-score   support

           0       0.86      0.88      0.87       149
           1       0.76      0.72      0.74        79

    accuracy                           0.82       228
   macro avg       0.81      0.80      0.80       228
weighted avg       0.82      0.82      0.82       228

              precision    recall  f1-score   support

           0       0.97      0.93      0.95       301
           1       0.82      0.91      0.86       103

    accuracy                           0.93       404
   macro avg       0.89      0.92      0.91       404
weighted avg       0.93      0.93      0.93       404

              precision    recall  f1-score   support

           0       0.85      0.80      0.82        49
           1       0.84      0.88      0.86        60

    accuracy                           0.84       109
   macro avg       0.84      0.84      0.84       109
weighted avg       0.84      0.84      0.84       109

              preci

              precision    recall  f1-score   support

           0       0.89      0.93      0.91       126
           1       0.88      0.82      0.85        84

    accuracy                           0.89       210
   macro avg       0.89      0.88      0.88       210
weighted avg       0.89      0.89      0.88       210

              precision    recall  f1-score   support

           0       0.86      0.93      0.89        27
           1       0.33      0.20      0.25         5

    accuracy                           0.81        32
   macro avg       0.60      0.56      0.57        32
weighted avg       0.78      0.81      0.79        32

              precision    recall  f1-score   support

           0       0.88      0.92      0.90       247
           1       0.82      0.75      0.79       122

    accuracy                           0.86       369
   macro avg       0.85      0.84      0.84       369
weighted avg       0.86      0.86      0.86       369

              preci

              precision    recall  f1-score   support

           0       0.66      0.69      0.68        88
           1       0.81      0.79      0.80       146

    accuracy                           0.75       234
   macro avg       0.74      0.74      0.74       234
weighted avg       0.75      0.75      0.75       234

              precision    recall  f1-score   support

           0       0.88      0.88      0.88       362
           1       0.68      0.67      0.67       132

    accuracy                           0.83       494
   macro avg       0.78      0.78      0.78       494
weighted avg       0.83      0.83      0.83       494

              precision    recall  f1-score   support

           0       0.81      0.80      0.80        93
           1       0.87      0.88      0.87       143

    accuracy                           0.85       236
   macro avg       0.84      0.84      0.84       236
weighted avg       0.85      0.85      0.85       236

              preci

              precision    recall  f1-score   support

           0       0.92      0.85      0.88        13
           1       0.87      0.93      0.90        14

    accuracy                           0.89        27
   macro avg       0.89      0.89      0.89        27
weighted avg       0.89      0.89      0.89        27

index 1 is out of bounds for axis 0 with size 1
              precision    recall  f1-score   support

           0       0.87      0.79      0.83       150
           1       0.88      0.93      0.90       248

    accuracy                           0.88       398
   macro avg       0.87      0.86      0.87       398
weighted avg       0.88      0.88      0.88       398

              precision    recall  f1-score   support

           0       0.92      0.91      0.91        64
           1       0.83      0.86      0.85        35

    accuracy                           0.89        99
   macro avg       0.88      0.88      0.88        99
weighted avg       0.89   

              precision    recall  f1-score   support

           0       0.92      0.77      0.84        70
           1       0.73      0.90      0.80        48

    accuracy                           0.82       118
   macro avg       0.82      0.83      0.82       118
weighted avg       0.84      0.82      0.82       118

              precision    recall  f1-score   support

           0       0.84      0.74      0.79       181
           1       0.65      0.77      0.70       111

    accuracy                           0.75       292
   macro avg       0.74      0.76      0.75       292
weighted avg       0.77      0.75      0.76       292

              precision    recall  f1-score   support

           0       0.87      0.94      0.90       581
           1       0.56      0.35      0.43       127

    accuracy                           0.83       708
   macro avg       0.72      0.64      0.67       708
weighted avg       0.81      0.83      0.82       708

              preci

              precision    recall  f1-score   support

           0       0.87      0.92      0.89        49
           1       0.79      0.68      0.73        22

    accuracy                           0.85        71
   macro avg       0.83      0.80      0.81        71
weighted avg       0.84      0.85      0.84        71

              precision    recall  f1-score   support

           0       0.81      0.86      0.84       230
           1       0.88      0.84      0.86       283

    accuracy                           0.85       513
   macro avg       0.85      0.85      0.85       513
weighted avg       0.85      0.85      0.85       513

              precision    recall  f1-score   support

           0       0.92      0.89      0.91       114
           1       0.89      0.92      0.90       106

    accuracy                           0.90       220
   macro avg       0.90      0.90      0.90       220
weighted avg       0.90      0.90      0.90       220

              preci

              precision    recall  f1-score   support

           0       0.92      0.83      0.87       663
           1       0.78      0.89      0.83       450

    accuracy                           0.86      1113
   macro avg       0.85      0.86      0.85      1113
weighted avg       0.86      0.86      0.86      1113

              precision    recall  f1-score   support

           0       0.77      0.95      0.85        21
           1       0.97      0.83      0.89        35

    accuracy                           0.88        56
   macro avg       0.87      0.89      0.87        56
weighted avg       0.89      0.88      0.88        56

              precision    recall  f1-score   support

           0       0.85      0.89      0.87       140
           1       0.88      0.84      0.86       138

    accuracy                           0.86       278
   macro avg       0.86      0.86      0.86       278
weighted avg       0.86      0.86      0.86       278

              preci

              precision    recall  f1-score   support

           0       0.72      0.75      0.74       190
           1       0.80      0.78      0.79       250

    accuracy                           0.77       440
   macro avg       0.76      0.76      0.76       440
weighted avg       0.77      0.77      0.77       440

              precision    recall  f1-score   support

           0       0.87      0.94      0.91        71
           1       0.89      0.76      0.82        41

    accuracy                           0.88       112
   macro avg       0.88      0.85      0.86       112
weighted avg       0.88      0.88      0.87       112

              precision    recall  f1-score   support

           0       0.82      0.84      0.83       233
           1       0.85      0.83      0.84       253

    accuracy                           0.84       486
   macro avg       0.84      0.84      0.84       486
weighted avg       0.84      0.84      0.84       486

              preci

              precision    recall  f1-score   support

           0       0.78      0.76      0.77        55
           1       0.82      0.83      0.82        70

    accuracy                           0.80       125
   macro avg       0.80      0.80      0.80       125
weighted avg       0.80      0.80      0.80       125

Found array with 0 sample(s) (shape=(0, 58)) while a minimum of 1 is required by MinMaxScaler.
              precision    recall  f1-score   support

           0       0.90      0.86      0.88       505
           1       0.92      0.94      0.93       839

    accuracy                           0.91      1344
   macro avg       0.91      0.90      0.90      1344
weighted avg       0.91      0.91      0.91      1344

              precision    recall  f1-score   support

           0       0.79      0.88      0.83       123
           1       0.92      0.85      0.89       199

    accuracy                           0.86       322
   macro avg       0.85      0.87

              precision    recall  f1-score   support

           0       0.74      0.81      0.78        43
           1       0.91      0.88      0.89        97

    accuracy                           0.86       140
   macro avg       0.83      0.85      0.84       140
weighted avg       0.86      0.86      0.86       140

              precision    recall  f1-score   support

           0       0.86      0.90      0.88       136
           1       0.69      0.61      0.65        51

    accuracy                           0.82       187
   macro avg       0.77      0.75      0.76       187
weighted avg       0.81      0.82      0.81       187

              precision    recall  f1-score   support

           0       0.81      0.90      0.85        29
           1       0.97      0.95      0.96       110

    accuracy                           0.94       139
   macro avg       0.89      0.92      0.91       139
weighted avg       0.94      0.94      0.94       139

              preci

              precision    recall  f1-score   support

           0       0.64      0.76      0.70        72
           1       0.64      0.49      0.56        61

    accuracy                           0.64       133
   macro avg       0.64      0.63      0.63       133
weighted avg       0.64      0.64      0.63       133

              precision    recall  f1-score   support

           0       0.92      0.73      0.81        15
           1       0.75      0.92      0.83        13

    accuracy                           0.82        28
   macro avg       0.83      0.83      0.82        28
weighted avg       0.84      0.82      0.82        28

              precision    recall  f1-score   support

           0       0.94      0.94      0.94       823
           1       0.76      0.78      0.77       205

    accuracy                           0.91      1028
   macro avg       0.85      0.86      0.86      1028
weighted avg       0.91      0.91      0.91      1028

              preci

              precision    recall  f1-score   support

           0       0.78      0.75      0.76        56
           1       0.74      0.76      0.75        51

    accuracy                           0.76       107
   macro avg       0.76      0.76      0.76       107
weighted avg       0.76      0.76      0.76       107

              precision    recall  f1-score   support

           0       0.70      1.00      0.82         7
           1       1.00      0.62      0.77         8

    accuracy                           0.80        15
   macro avg       0.85      0.81      0.80        15
weighted avg       0.86      0.80      0.79        15

              precision    recall  f1-score   support

           0       0.67      0.40      0.50        15
           1       0.78      0.91      0.84        34

    accuracy                           0.76        49
   macro avg       0.72      0.66      0.67        49
weighted avg       0.74      0.76      0.73        49

              preci

              precision    recall  f1-score   support

           0       0.87      0.83      0.85       783
           1       0.79      0.84      0.82       609

    accuracy                           0.84      1392
   macro avg       0.83      0.84      0.83      1392
weighted avg       0.84      0.84      0.84      1392

              precision    recall  f1-score   support

           0       0.67      0.81      0.74        74
           1       0.64      0.46      0.54        54

    accuracy                           0.66       128
   macro avg       0.66      0.64      0.64       128
weighted avg       0.66      0.66      0.65       128

              precision    recall  f1-score   support

           0       0.75      0.80      0.77       171
           1       0.80      0.75      0.78       184

    accuracy                           0.77       355
   macro avg       0.78      0.78      0.77       355
weighted avg       0.78      0.77      0.77       355

              preci

              precision    recall  f1-score   support

           0       0.79      0.78      0.79       114
           1       0.76      0.78      0.77       104

    accuracy                           0.78       218
   macro avg       0.78      0.78      0.78       218
weighted avg       0.78      0.78      0.78       218

              precision    recall  f1-score   support

           0       0.89      0.85      0.87      1126
           1       0.84      0.88      0.86      1043

    accuracy                           0.87      2169
   macro avg       0.87      0.87      0.87      2169
weighted avg       0.87      0.87      0.87      2169

              precision    recall  f1-score   support

           0       0.85      0.89      0.87       246
           1       0.72      0.63      0.67       107

    accuracy                           0.81       353
   macro avg       0.78      0.76      0.77       353
weighted avg       0.81      0.81      0.81       353

              preci

              precision    recall  f1-score   support

           0       0.90      0.76      0.82       282
           1       0.75      0.90      0.82       230

    accuracy                           0.82       512
   macro avg       0.83      0.83      0.82       512
weighted avg       0.83      0.82      0.82       512

              precision    recall  f1-score   support

           0       0.75      0.88      0.81       144
           1       0.85      0.72      0.78       144

    accuracy                           0.80       288
   macro avg       0.80      0.80      0.79       288
weighted avg       0.80      0.80      0.79       288

              precision    recall  f1-score   support

           0       0.75      0.85      0.80       162
           1       0.79      0.67      0.72       137

    accuracy                           0.77       299
   macro avg       0.77      0.76      0.76       299
weighted avg       0.77      0.77      0.76       299

              preci

              precision    recall  f1-score   support

           0       0.78      0.77      0.77       368
           1       0.84      0.85      0.84       521

    accuracy                           0.82       889
   macro avg       0.81      0.81      0.81       889
weighted avg       0.82      0.82      0.82       889

              precision    recall  f1-score   support

           0       0.79      0.73      0.76       203
           1       0.75      0.81      0.78       209

    accuracy                           0.77       412
   macro avg       0.77      0.77      0.77       412
weighted avg       0.77      0.77      0.77       412

              precision    recall  f1-score   support

           0       0.82      0.82      0.82       585
           1       0.87      0.87      0.87       797

    accuracy                           0.85      1382
   macro avg       0.84      0.84      0.84      1382
weighted avg       0.85      0.85      0.85      1382

              preci

In [136]:
# Build one summary row per project: the number of rows whose 'Bugs' column is
# True, followed by the number of distinct entries returned by each of the four
# commit-subset strategies.
commits = []
for project in projects:
    commits_all = load_commit_data(project)
    is_buggy = commits_all['Bugs'] == True
    commits_all = commits_all[is_buggy]
    strategy_getters = (get_commit_subset_s1, get_commit_subset_s2,
                        get_commit_subset_s3, get_commit_subset_s4)
    # set() drops duplicates, so every distinct entry is counted once per strategy.
    distinct_counts = [len(set(getter(project))) for getter in strategy_getters]
    commits.append([project, len(commits_all)] + distinct_counts)

commits_df = pd.DataFrame(
    commits,
    columns=['project', 'all', 'Strategy_1', 'Strategy_2', 'Strategy_3', 'Strategy_4'],
)

# Each sN_prec column is that strategy's count divided by the project's 'all'
# count, rounded to two decimals.
for ratio_col, count_col in (('s1_prec', 'Strategy_1'),
                             ('s2_prec', 'Strategy_2'),
                             ('s3_prec', 'Strategy_3'),
                             ('s4_prec', 'Strategy_4')):
    commits_df[ratio_col] = round(commits_df[count_col] / commits_df['all'], 2)

# Persist the summary table (the DataFrame index is written as the first column).
commits_df.to_csv('results/commit_strategy.csv')
    

In [121]:
# Display the per-project strategy summary table held in `commits_df`.
# NOTE(review): the saved output below carries an older execution count than the
# cell that builds `commits_df` and has no Strategy_4 / s4_prec columns, so it
# looks stale — restart the kernel and re-run before relying on these numbers.
commits_df

Unnamed: 0,project,all,Strategy_1,Strategy_2,Strategy_3,s1_prec,s2_prec,s3_prec
0,org.alloytools.alloy,97,97,97,97,1.000000,1.000000,1.000000
1,qpython,1,119,119,119,119.000000,119.000000,119.000000
2,friendlychat-android,29,29,29,29,1.000000,1.000000,1.000000
3,paho.mqtt.android,1,167,167,167,167.000000,167.000000,167.000000
4,paho.mqtt.java,1,611,611,611,611.000000,611.000000,611.000000
5,android-docs-samples,81,81,81,81,1.000000,1.000000,1.000000
6,Toasty,1,126,126,126,126.000000,126.000000,126.000000
7,android-mvvm-architecture,92,92,92,92,1.000000,1.000000,1.000000
8,recipes-rss,70,70,70,70,1.000000,1.000000,1.000000
9,HoloGraphLibrary,49,49,49,49,1.000000,1.000000,1.000000
