In [1]:
# import pickle
from my_util import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, matthews_corrcoef, precision_recall_fscore_support, classification_report
from sklearn.feature_selection import SelectFromModel, mutual_info_classif, SelectKBest, f_classif, SelectPercentile, chi2

from imblearn.over_sampling import SMOTE

import numpy as np
from scipy.optimize import differential_evolution
import pandas as pd
import time, pickle, math, warnings, os
warnings.filterwarnings('ignore')

# Output locations: prediction CSVs are written under data_path and pickled
# classifiers under model_path (see run_experiment).
data_path = './text_metric_data/'
model_path = './text_metric_model/'

# Projects the experiment is run on, one call to run_experiment per entry.
projects = ['openstack','qt']

# Forwarded unchanged to prepare_data() when loading train/test code.
remove_python_common_tokens = True

# exist_ok=True creates the directory when missing and is a no-op when it is
# already there, so there is no check-then-create race and re-running the cell
# is safe.
os.makedirs(data_path, exist_ok=True)
os.makedirs(model_path, exist_ok=True)

In [2]:
def get_combined_df(code_commit, commit_id, label, metrics_df, count_vect):
    """Build one feature row per commit from code tokens plus change metrics.

    The code rows and the metric rows are each sorted by ``commit_id`` and
    then joined positionally, so row *i* of the token-count block and row *i*
    of the metrics block must describe the same commit.

    Args:
        code_commit: per-commit code text, parallel to ``commit_id``.
        commit_id: commit identifiers, parallel to ``code_commit`` and ``label``.
        label: per-commit labels, parallel to ``commit_id``.
        metrics_df: DataFrame with a ``commit_id`` column; every other column
            is cast to float32 and used as a feature.
        count_vect: fitted vectorizer whose ``transform`` returns a sparse
            matrix (``.toarray()`` is called on the result).

    Returns:
        Tuple ``(features, commit_ids, labels)``, all ordered by commit id.
        ``features`` is the column-wise concatenation of the token counts and
        the metric columns.

    Raises:
        ValueError: if the sorted commit ids of ``metrics_df`` do not match the
            sorted ``commit_id`` values one-to-one. Without this check a
            missing, extra or duplicated metrics row would silently pair code
            features with the metrics of a different commit.
    """
    code_df = pd.DataFrame()
    code_df['commit_id'] = commit_id
    code_df['code'] = code_commit
    code_df['label'] = label

    code_df = code_df.sort_values(by='commit_id')
    metrics_df = metrics_df.sort_values(by='commit_id')

    sorted_code_ids = list(code_df['commit_id'])
    sorted_metric_ids = list(metrics_df['commit_id'])
    if sorted_code_ids != sorted_metric_ids:
        raise ValueError(
            'commit ids of code data and metrics do not align: '
            '{} code rows vs {} metrics rows'.format(len(sorted_code_ids),
                                                     len(sorted_metric_ids)))

    # The id column is only needed for alignment; it is not a feature.
    metrics_df = metrics_df.drop('commit_id',axis=1)

    code_change_arr = count_vect.transform(code_df['code']).astype(np.int16).toarray()
    metrics_df_arr = metrics_df.to_numpy(dtype=np.float32)

    final_features = np.concatenate((code_change_arr,metrics_df_arr),axis=1)

    return final_features, sorted_code_ids, list(code_df['label'])


In [4]:
def objective_func(k, clf, train_feature, train_label, test_feature, test_label):
    """Score one SMOTE ``k_neighbors`` candidate for differential evolution.

    The training data is oversampled with SMOTE using ``k`` rounded to the
    nearest integer, ``clf`` is fitted on the resampled data, and the ROC AUC
    of its positive-class probabilities on the test data is computed.

    Args:
        k: candidate neighbour count. May be a plain number or a length-1
            array (the form in which ``differential_evolution`` passes the
            parameter vector).
        clf: classifier exposing ``fit`` and ``predict_proba``; ``fit`` is
            called on this object, so it is left fitted after the call.
        train_feature, train_label: data that is resampled and fitted on.
        test_feature, test_label: data the AUC is computed on.

    Returns:
        The negated AUC, so that a minimiser maximises the AUC.
    """
    # Pull out the scalar before int(): converting an ndim>0 array straight to
    # int is deprecated since NumPy 1.25.
    k_neighbors = int(np.round(np.ravel(k)[0]))

    # NOTE(review): newer imbalanced-learn releases deprecate SMOTE's n_jobs
    # argument -- confirm against the installed version.
    smote = SMOTE(random_state=42, k_neighbors=k_neighbors, n_jobs=32)
    train_feature_res, train_label_res = smote.fit_resample(train_feature, train_label)

    clf.fit(train_feature_res, train_label_res)

    # Column 1 of predict_proba is used as the positive-class score.
    prob = clf.predict_proba(test_feature)[:,1]
    auc = roc_auc_score(test_label, prob)

    return -auc

In [1]:
# Tag of the sampling strategy; it becomes part of every output file name
# written by run_experiment.
sampling_methods = 'DE_SMOTE_min_df_3'

# Module-level holders. No code visible in this notebook reads them
# (run_experiment binds its own local result_str) -- presumably kept for
# interactive inspection.
clf_result = dict()
result_str = str()


def run_experiment(cur_proj):
    """Run the DE-tuned SMOTE + random-forest experiment for one project.

    Steps:
      1. Load train/test code and labels with ``prepare_data`` and the change
         metrics with ``load_change_metrics_df``.
      2. Fit a unigram ``CountVectorizer`` (``min_df=3``) on the training code
         and build combined token-count + metric features for both splits.
      3. Search SMOTE's ``k_neighbors`` in [1, 20] with differential evolution,
         minimising ``objective_func`` (the negated AUC).
      4. Oversample the training set with the best ``k``, then train and
         evaluate via ``train_eval_model``.
      5. Write the predictions CSV under ``data_path`` and the pickled
         classifier under ``model_path``.

    NOTE(review): the search in step 3 scores candidates on the test split, so
    the reported test performance is tuned on the same data it is measured on.
    Confirm this is intended.

    Args:
        cur_proj: project name; used to load data and as the prefix of the
            output file names.

    Returns:
        None. Results are printed and written to disk.
    """
    clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    clf_name = 'RF'

    train_code, train_commit, train_label = prepare_data(cur_proj, mode='train',
                                                         remove_python_common_tokens=remove_python_common_tokens)
    test_code, test_commit, test_label = prepare_data(cur_proj, mode='test',
                                                      remove_python_common_tokens=remove_python_common_tokens)

    commit_metrics = load_change_metrics_df(cur_proj)
    train_commit_metrics = commit_metrics[commit_metrics['commit_id'].isin(train_commit)]
    test_commit_metrics = commit_metrics[commit_metrics['commit_id'].isin(test_commit)]

    # The vocabulary is learned from the training code only.
    count_vect = CountVectorizer(min_df=3, ngram_range=(1,1))
    count_vect.fit(train_code)

    train_feature, train_commit_id, new_train_label = get_combined_df(train_code, train_commit, train_label, train_commit_metrics,count_vect)
    test_feature, test_commit_id, new_test_label = get_combined_df(test_code, test_commit, test_label, test_commit_metrics,count_vect)

    print('load data of',cur_proj, 'finish')

    bounds = [(1,20)]
    result = differential_evolution(objective_func, bounds, args=(clf, train_feature, new_train_label, test_feature, new_test_label),
                                   popsize=10, mutation=0.7, recombination=0.3,seed=0)

    print(result.x, result.fun)

    # result.x is a length-1 array; take the scalar before int() because
    # converting an ndim>0 array straight to int is deprecated since NumPy 1.25.
    best_k = int(np.round(np.ravel(result.x)[0]))
    smote = SMOTE(random_state=42, n_jobs=32, k_neighbors=best_k)

    train_feature_res, train_label_res = smote.fit_resample(train_feature, new_train_label)

    # The middle return value is not used here.
    trained_clf, _, pred_df = train_eval_model(clf, train_feature_res, train_label_res,
                                               test_feature, new_test_label)
    pred_df['test_commit'] = test_commit_id

    output_stem = cur_proj+'_'+clf_name+'_'+sampling_methods
    pred_df.to_csv(data_path+output_stem+'_prediction_result.csv')

    # Use a distinct local name: assigning to `model_path` here would make it
    # a local variable and the read on the right-hand side would raise
    # UnboundLocalError.
    clf_model_path = model_path+output_stem+'.pkl'
    with open(clf_model_path, 'wb') as model_file:
        pickle.dump(trained_clf, model_file)

    print('finished',cur_proj)
    print('-'*100)

In [None]:
# Run the experiment for the first configured project ('openstack').
run_experiment(projects[0])

In [None]:
# Run the experiment for the second configured project ('qt').
run_experiment(projects[1])