In [4]:
import pandas as pd
import numpy as np
import math
import pickle
from datetime import datetime

from scipy import stats
import scipy.io
from scipy.spatial.distance import pdist
from scipy.linalg import cholesky
from scipy.io import loadmat

import matlab.engine as engi
import matlab as mat

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report,roc_auc_score,recall_score,precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from pyearth import Earth

from src import SMOTE
from src import CFS
from src import metrices_V2 as metrices

import platform
from os import listdir
from os.path import isfile, join
from glob import glob
from pathlib import Path
import sys
import os
import copy
import traceback
from pathlib import Path

import matplotlib.pyplot as plt

In [154]:
def load_data(project, n_test_releases=5):
    """Load, merge and scale one project's metrics, then split them by release date.

    Reads the project's Understand metrics CSV, its commit-guru CSV and its
    pickled release table, merges the two CSVs on ``commit_hash``, min-max
    scales every feature column and splits the rows at the date of the
    ``n_test_releases``-th most recent release that is not later than the
    ``created_at`` value of the last row of the merged frame.

    Side effect: the merged, unscaled frame is written to
    ``data/converted/<project>_understand.csv``.

    Parameters
    ----------
    project : str
        Project name used to build the input and output file paths.
    n_test_releases : int, default 5
        How many releases (counted from the most recent) fall on the test
        side of the split. Must be a positive integer.

    Returns
    -------
    (train_df, test_df, status)
        With ``status == 1`` the two frames hold the scaled features plus
        ``Bugs`` (``created_at`` removed). With ``status == 0`` no usable
        split exists (too few releases, or one side is empty) and the full
        scaled frame is returned in both positions.
    """
    understand_path = 'data/understand_files_all/' + project + '_understand.csv'
    commit_guru_path = 'data/commit_guru/' + project + '.csv'

    # NOTE(review): read_pickle can execute arbitrary code while unpickling;
    # only point this at release files you trust.
    release_df = pd.read_pickle('data/release/' + project + '_release.pkl')
    release_df = release_df.sort_values('created_at', ascending=False)
    release_df = release_df.reset_index(drop=True)

    understand_df = pd.read_csv(understand_path)
    understand_df = understand_df.dropna(axis=1, how='all')

    # Move the identifying columns to the front; when all four are present
    # the resulting order is Bugs, commit_hash, Name, Kind, <metrics...>.
    cols_list = understand_df.columns.values.tolist()
    for item in ['Kind', 'Name', 'commit_hash', 'Bugs']:
        if item in cols_list:
            cols_list.remove(item)
            cols_list.insert(0, item)
    understand_df = understand_df[cols_list]
    cols = understand_df.columns.tolist()

    commit_guru_df = pd.read_csv(commit_guru_path)
    commit_guru_df['created_at'] = pd.to_datetime(
        commit_guru_df.author_date_unix_timestamp, unit='s')
    commit_guru_df = commit_guru_df.drop(
        labels=['parent_hashes', 'author_name', 'author_email', 'fileschanged',
                'author_date', 'author_date_unix_timestamp', 'commit_message',
                'classification', 'fix', 'contains_bug', 'fixes'],
        axis=1)

    # De-duplicate on everything after the four leading identifying columns.
    understand_df = understand_df.drop_duplicates(cols[4:])
    df = understand_df.merge(commit_guru_df, on='commit_hash')

    # Rotate the first column (Bugs, given the ordering above) to the end.
    cols = df.columns.tolist()
    cols = cols[1:] + [cols[0]]
    df = df[cols]
    df = df.drop(columns=[c for c in ['Kind', 'Name', 'commit_hash'] if c in cols])
    df = df.dropna().reset_index(drop=True)
    df.to_csv('data/converted/' + project + '_understand.csv', index=False)

    # Scale only the feature columns. The original built the scaled frame
    # from an undefined name `X` and relied on `created_at` being the last
    # column; selecting the feature columns explicitly avoids both problems.
    labels = df['Bugs']
    created_at = df['created_at']
    features = df.drop(columns=['Bugs', 'created_at'])
    scaled = MinMaxScaler().fit_transform(features)
    df = pd.DataFrame(scaled, columns=features.columns)
    df['created_at'] = created_at
    df['Bugs'] = labels

    # Reference date: `created_at` of the last row of the merged frame.
    last_commit_date = df['created_at'].iloc[-1]

    # Releases are sorted newest first; walk them until `n_test_releases`
    # of them are not later than the reference date.
    last_train_date = None
    releases_seen = 0
    for raw_date in release_df['created_at']:
        release_date = datetime.strptime(raw_date, '%Y-%m-%d')
        if release_date <= last_commit_date:
            releases_seen += 1
            if releases_seen == n_test_releases:
                last_train_date = release_date
                break

    if last_train_date is None:
        return df, df, 0

    # `created_at` is already a datetime column here, so no re-parsing is needed.
    train_df = df[df.created_at < last_train_date].drop('created_at', axis=1)
    test_df = df[df.created_at >= last_train_date].drop('created_at', axis=1)

    if train_df.shape[0] == 0 or test_df.shape[0] == 0:
        return df, df, 0

    return train_df, test_df, 1


def apply_smote(df):
    """Run the project's ``SMOTE.smote`` helper on ``df`` and return its result.

    The returned frame is relabelled with the column labels of the input
    frame, since the helper's output columns are overwritten here.
    """
    original_columns = df.columns
    resampled = SMOTE.smote(df).run()
    resampled.columns = original_columns
    return resampled

def apply_cfs(df):
    """Keep only the feature columns chosen by ``CFS.cfs``, plus ``Bugs``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and a ``Bugs`` label column.

    Returns
    -------
    (reduced_df, cols)
        ``reduced_df`` is ``df`` restricted to the selected feature columns
        followed by ``Bugs``; ``cols`` is that list of column names.
    """
    y = df.Bugs.values
    features = df.drop(labels=['Bugs'], axis=1)
    selected_cols = CFS.cfs(features.values, y)

    # Assumes CFS.cfs returns integer column positions into the matrix it
    # was given — TODO confirm against src/CFS. The positions are therefore
    # resolved against `features` (not `df`, which also holds `Bugs`), and
    # flattened to 1-D: the original `df.columns[[selected_cols]]` wrapped
    # them in an extra list, which current NumPy/pandas treat as a 2-D index.
    positions = np.asarray(selected_cols, dtype=int).ravel()
    cols = features.columns[positions].tolist()
    cols.append('Bugs')
    return df[cols], cols

In [157]:
def run_self(project, random_state=None):
    """Train on a project's older data and evaluate on its newer data.

    Loads the train/test split with ``load_data``, passes the training
    frame through ``apply_smote``, fits a ``RandomForestClassifier`` and
    scores its predictions on the test frame. The classification report
    for the test set is printed.

    Parameters
    ----------
    project : str
        Project name forwarded to ``load_data``.
    random_state : int or None, default None
        Seed forwarded to ``RandomForestClassifier``. ``None`` keeps the
        previous unseeded behaviour. Only the classifier is seeded here.

    Returns
    -------
    tuple
        ``(recall, precision, pf, f1, g_score, auc, pci_20, ifa, importance)``.
        All nine entries are ``None`` when ``load_data`` reports that no
        usable split exists. ``importance`` is currently always ``0``.
    """
    train_df, test_df, complete = load_data(project)
    if complete == 0:
        return (None,) * 9

    # Lines-of-code column of the test rows, handed to the metrics helper.
    loc = test_df.CountLineCode

    df_smote = apply_smote(train_df)
    y_train = df_smote.Bugs
    X_train = df_smote.drop('Bugs', axis=1)
    clf = RandomForestClassifier(random_state=random_state)
    clf.fit(X_train, y_train)
    importance = 0  # placeholder: feature importances are not collected

    y_test = test_df.Bugs
    X_test = test_df.drop('Bugs', axis=1)
    predicted = clf.predict(X_test)

    abcd = metrices.measures(y_test, predicted, loc)
    pf = abcd.get_pf()
    recall = abcd.calculate_recall()
    precision = abcd.calculate_precision()
    f1 = abcd.calculate_f1_score()
    g_score = abcd.get_g_score()
    pci_20 = abcd.get_pci_20()
    ifa = abcd.get_ifa()

    try:
        auc = roc_auc_score(y_test, predicted)
    except ValueError:
        # roc_auc_score rejects inputs it cannot score (e.g. a test set
        # with a single class, depending on the scikit-learn version);
        # fall back to 0 as before. Other exceptions now propagate
        # instead of being silently swallowed by a bare `except`.
        auc = 0

    print(classification_report(y_test, predicted))
    return recall, precision, pf, f1, g_score, auc, pci_20, ifa, importance

In [158]:
# Read the project table and collect its `repo_name` column as a plain list.
proj_df = pd.read_csv('projects.csv')
projects = proj_df['repo_name'].tolist()

In [159]:
# Per-project results, keyed by project name. Projects for which no usable
# train/test split exists, or whose processing raises, are left out.
count = 0  # not used in the visible cells; kept in case other cells read it
precision_list = {}
recall_list = {}
pf_list = {}
f1_list = {}
g_list = {}
auc_list = {}
pci_20_list = {}
ifa_list = {}
featue_importance = {}
for project in projects:
    try:
        if project == '.DS_Store':
            continue
        # The former `if df.shape[0] > 10` guard read a `df` that is never
        # defined in this cell (leftover kernel state), so on a fresh kernel
        # every project failed with NameError. run_self already signals an
        # unusable project by returning None values.
        recall, precision, pf, f1, g_score, auc, pci_20, ifa, importance = run_self(project)
        if recall is None:
            continue
        recall_list[project] = recall
        precision_list[project] = precision
        pf_list[project] = pf
        f1_list[project] = f1
        g_list[project] = g_score
        auc_list[project] = auc
        pci_20_list[project] = pci_20
        ifa_list[project] = ifa
        featue_importance[project] = importance
    except Exception as e:
        # Best effort: report the failing project and move on to the next.
        print(project, e)
        continue

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       163

    accuracy                           1.00       163
   macro avg       1.00      1.00      1.00       163
weighted avg       1.00      1.00      1.00       163

Canvas2ImagePlugin 'commit_hash'
jgit No columns to parse from file
android-app No columns to parse from file
colorpicker Found array with 0 sample(s) (shape=(0, 58)) while a minimum of 1 is required by MinMaxScaler.
material-calendar-view Found array with 0 sample(s) (shape=(0, 58)) while a minimum of 1 is required by MinMaxScaler.
              precision    recall  f1-score   support

           0       0.70      0.98      0.82       413
           1       0.00      0.00      0.00       172

    accuracy                           0.69       585
   macro avg       0.35      0.49      0.41       585
weighted avg       0.50      0.69      0.58       585

              precision    recall  f1-score   support

          

              precision    recall  f1-score   support

           0       0.85      0.71      0.77        41
           1       0.25      0.44      0.32         9

    accuracy                           0.66        50
   macro avg       0.55      0.58      0.55        50
weighted avg       0.74      0.66      0.69        50

              precision    recall  f1-score   support

           0       0.47      0.84      0.61       378
           1       0.45      0.12      0.20       400

    accuracy                           0.47       778
   macro avg       0.46      0.48      0.40       778
weighted avg       0.46      0.47      0.39       778

              precision    recall  f1-score   support

           0       0.84      0.60      0.70       110
           1       0.17      0.41      0.24        22

    accuracy                           0.57       132
   macro avg       0.50      0.50      0.47       132
weighted avg       0.72      0.57      0.62       132

              preci

              precision    recall  f1-score   support

           0       0.92      0.87      0.89        63
           1       0.11      0.17      0.13         6

    accuracy                           0.81        69
   macro avg       0.51      0.52      0.51        69
weighted avg       0.85      0.81      0.83        69

              precision    recall  f1-score   support

           0       0.62      0.51      0.56       283
           1       0.46      0.57      0.51       208

    accuracy                           0.54       491
   macro avg       0.54      0.54      0.54       491
weighted avg       0.55      0.54      0.54       491

              precision    recall  f1-score   support

           0       0.43      0.59      0.50       414
           1       0.53      0.37      0.43       508

    accuracy                           0.47       922
   macro avg       0.48      0.48      0.47       922
weighted avg       0.48      0.47      0.46       922

              preci

              precision    recall  f1-score   support

           0       0.56      0.77      0.65       141
           1       0.46      0.25      0.32       113

    accuracy                           0.54       254
   macro avg       0.51      0.51      0.48       254
weighted avg       0.51      0.54      0.50       254

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6

              precision    recall  f1-score   support

           0       0.68      0.89      0.77       504
           1       0.29      0.10      0.15       236

    accuracy                           0.64       740
   macro avg       0.48      0.49      0.46       740
weighted avg       0.55      0.64      0.57       740

zotfile 'commit_hash'
              precision    recall  f1-score   suppo

              precision    recall  f1-score   support

           0       0.67      0.94      0.78       358
           1       0.08      0.01      0.02       168

    accuracy                           0.64       526
   macro avg       0.37      0.47      0.40       526
weighted avg       0.48      0.64      0.54       526

              precision    recall  f1-score   support

           0       0.82      0.80      0.81       316
           1       0.24      0.27      0.25        74

    accuracy                           0.70       390
   macro avg       0.53      0.54      0.53       390
weighted avg       0.71      0.70      0.71       390

              precision    recall  f1-score   support

           0       0.40      0.88      0.55       321
           1       0.62      0.13      0.22       483

    accuracy                           0.43       804
   macro avg       0.51      0.51      0.38       804
weighted avg       0.54      0.43      0.35       804

              preci

              precision    recall  f1-score   support

           0       0.52      0.49      0.50       123
           1       0.43      0.46      0.44       102

    accuracy                           0.48       225
   macro avg       0.47      0.47      0.47       225
weighted avg       0.48      0.48      0.48       225

              precision    recall  f1-score   support

           0       0.52      0.75      0.61       264
           1       0.50      0.27      0.35       247

    accuracy                           0.51       511
   macro avg       0.51      0.51      0.48       511
weighted avg       0.51      0.51      0.48       511

              precision    recall  f1-score   support

           0       0.32      0.73      0.44        56
           1       0.81      0.41      0.55       150

    accuracy                           0.50       206
   macro avg       0.56      0.57      0.49       206
weighted avg       0.67      0.50      0.52       206

              preci

In [155]:
# Debug run: evaluate only the 'maven-plugins' entry of the project list.
for project in projects:
    if project == 'maven-plugins':
        run_self(project)


In [164]:
# NaN-ignoring medians across projects, in the order: precision, recall, pf.
tuple(np.nanmedian(list(scores.values())) for scores in (precision_list, recall_list, pf_list))

(0.42, 0.36, 0.37)