In [214]:
from sklearn.datasets import load_wine
from datetime import timedelta
import pandas as pd
import numpy as np

In [215]:
read_data_file = pd.read_pickle('../flink.pkl')
df = pd.DataFrame(read_data_file)

In [216]:
df.columns

Index(['commit_befor ', 'commit_after', 'cal_smell', 'dispen_bf', 'object_bf',
       'bloater_bf', 'dispen_at', 'object_at', 'bloater_af', 'open_time',
       'closed_time', 'cal_time', 'changed_file', 'additions', 'deletions',
       'dev', 'cal_time_binaly'],
      dtype='object')

In [217]:
df['cal_time'][0:20].median()

Timedelta('7 days 05:57:00')

In [218]:
df['cal_time_binaly'] = df['cal_time'].apply(lambda x:  0 if x < timedelta (days = 7) else 1)

In [219]:
X = df[['commit_befor ', 'dispen_bf', 'object_bf', 'bloater_bf', 'open_time',]]

y = df['cal_time_binaly']

In [220]:
from sklearn.ensemble import GradientBoostingClassifier
X_train, X_test = X[:20], X[20:]
y_train, y_test = y[:20], y[20:]
clf = GradientBoostingClassifier(n_estimators=1, learning_rate=1.0 , max_depth=1 , random_state=2).fit(X_train , y_train)

In [221]:
clf.predict(X_test)

array([1, 0, 0, 1, 0, 0, 0, 0, 1, 1], dtype=int64)

In [222]:
from sklearn.model_selection import cross_val_score
cross_val_score(clf , X , y , cv=5 , scoring= "accuracy")

array([0.33333333, 0.66666667, 0.5       , 0.5       , 0.5       ])

In [223]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
y_pred = cross_val_predict(clf,X,y,cv=5)
y_pred

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 1], dtype=int64)

In [224]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y , y_pred)

array([[ 8,  4],
       [11,  7]], dtype=int64)

In [225]:
clf.predict(X)

array([1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 1], dtype=int64)

In [226]:
clf.predict(X_train)

array([1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1],
      dtype=int64)

In [227]:
from sklearn.metrics import precision_score, recall_score, f1_score
print('precision = ', precision_score(y,y_pred ,average='macro'))
print('recall_score = ', recall_score(y,y_pred ,average='macro'))
print('f1_score',f1_score(y,y_pred ,average='macro'))


precision =  0.5287081339712918
recall_score =  0.5277777777777778
f1_score 0.4994438264738599


In [228]:
frame={"y":y,"y_pred": y_pred}
pd.DataFrame(frame).sum()

y         18
y_pred    11
dtype: int64

In [229]:
from sklearn.model_selection import GridSearchCV

parameters = {
  
    'learning_rate': [0.1 , 0.2 , 1.0],
    'n_estimators': [1,2,4],
    'subsample' : [0.7],
    'max_depth' : [1,2] ,
    'random_state':[1,2]
  #(n_estimators=1, learning_rate=1.0 , max_depth=1 , random_state=2)
}
GradientBoosting = GradientBoostingClassifier()
clf = GridSearchCV(GradientBoosting, parameters , cv=3 , scoring='f1_macro')
clf.fit(X_train,y_train)

GridSearchCV(cv=3, estimator=GradientBoostingClassifier(),
             param_grid={'learning_rate': [0.1, 0.2, 1.0], 'max_depth': [1, 2],
                         'n_estimators': [1, 2, 4], 'random_state': [1, 2],
                         'subsample': [0.7]},
             scoring='f1_macro')

In [230]:
clf.best_estimator_

GradientBoostingClassifier(learning_rate=1.0, max_depth=2, n_estimators=2,
                           random_state=2, subsample=0.7)

In [231]:
y_pred = cross_val_predict(clf.best_estimator_,X_test,y_test)
y_pred



array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1], dtype=int64)

In [232]:
y_pred_clf = clf.best_estimator_.predict(X_test)

In [233]:
from sklearn.metrics import precision_score, recall_score, f1_score
print('precision = ', precision_score(y_test,y_pred_clf ,average='macro'))
print('recall_score = ', recall_score(y_test,y_pred_clf ,average='macro'))
print('f1_score',f1_score(y_test,y_pred_clf ,average='macro'))

precision =  0.6111111111111112
recall_score =  0.5625
f1_score 0.29292929292929293


In [234]:
from sklearn.model_selection import GridSearchCV

parameters = {
  
    'learning_rate': [0.01 , 0.1 ,1],
    'n_estimators': [12,14,16],
    'subsample' : [0.7,0.9,0.1],
    'max_depth' : [2] ,
    'random_state':[2]
  
}
GradientBoosting = GradientBoostingClassifier()
clf = GridSearchCV(GradientBoosting, parameters , cv=3 , scoring='f1_macro')
clf.fit(X_train,y_train)

GridSearchCV(cv=3, estimator=GradientBoostingClassifier(),
             param_grid={'learning_rate': [0.01, 0.1, 1], 'max_depth': [2],
                         'n_estimators': [12, 14, 16], 'random_state': [2],
                         'subsample': [0.7, 0.9, 0.1]},
             scoring='f1_macro')

In [235]:
clf.best_estimator_

GradientBoostingClassifier(learning_rate=0.01, max_depth=2, n_estimators=12,
                           random_state=2, subsample=0.7)

In [236]:
import itertools
from itertools import product 

In [237]:
result = itertools.combinations(X ,2)

for item in result:
    df  = pd.DataFrame(item)
    X[list (item)]
    parameters = {
    'learning_rate': [0.01 , 0.1 ,1],
    'n_estimators': [12,14,16],
    'subsample' : [0.7],
    'max_depth' : [2] ,
    'random_state':[2]
    }
    GradientBoosting = GradientBoostingClassifier()
    clf = GridSearchCV(GradientBoosting , parameters , cv=3 , scoring='f1_macro')
    clf.fit(X_train[list (item)],y_train)
    clf.predict(X_test[list (item)])
    #print('predict=',clf.predict(X_test[list (item)]))
    #print(X_test)
    #print("y_train = ",y_train)
    print(item)
    print("precision_score =",precision_score(clf.predict(X[list(item)]),y))
    #print("recall_score =", recall_score(clf.predict(X[list(item)]),y))
    #print("f_1=",f1_score(clf.predict(X[list(item)]),y,average='macro'))
    #print("f_1_X-test=",f1_score(clf.predict(X_test[list(item)]),y_test,average='macro'))

('commit_befor ', 'dispen_bf')
precision_score = 0.6111111111111112
('commit_befor ', 'object_bf')
precision_score = 0.6111111111111112
('commit_befor ', 'bloater_bf')
precision_score = 0.7777777777777778
('commit_befor ', 'open_time')
precision_score = 0.6111111111111112
('dispen_bf', 'object_bf')
precision_score = 0.4444444444444444
('dispen_bf', 'bloater_bf')
precision_score = 0.2222222222222222
('dispen_bf', 'open_time')
precision_score = 0.7222222222222222
('object_bf', 'bloater_bf')
precision_score = 0.4444444444444444
('object_bf', 'open_time')
precision_score = 0.3888888888888889
('bloater_bf', 'open_time')
precision_score = 0.3333333333333333
