In [44]:
from sklearn.datasets import load_wine
from datetime import timedelta
import pandas as pd
import numpy as np

In [45]:
read_data_file = pd.read_pickle('../flink.pkl')
df = pd.DataFrame(read_data_file)

In [46]:
df.columns

Index(['commit_befor ', 'commit_after', 'cal_smell', 'dispen_bf', 'object_bf',
       'bloater_bf', 'dispen_at', 'object_at', 'bloater_af', 'open_time',
       'closed_time', 'cal_time', 'changed_file', 'additions', 'deletions',
       'dev', 'cal_time_binaly'],
      dtype='object')

In [47]:
df['cal_time'][0:20].median()

Timedelta('7 days 05:57:00')

In [48]:
df['cal_time_binaly'] = df['cal_time'].apply(lambda x:  0 if x < timedelta (days = 7) else 1)

In [49]:
X = df[['commit_befor ', 'dispen_bf', 'object_bf', 'bloater_bf', 'open_time',]]
y = df['cal_time_binaly']

In [50]:
from sklearn.ensemble import GradientBoostingClassifier
X_train, X_test = X[:20], X[20:]
y_train, y_test = y[:20], y[20:]
clf = GradientBoostingClassifier(n_estimators=1, learning_rate=1.0 , max_depth=1 , random_state=2).fit(X_train , y_train)

In [51]:
clf.predict(X_test)

array([1, 0, 0, 1, 0, 0, 0, 0, 1, 1], dtype=int64)

In [52]:
from sklearn.model_selection import cross_val_score
cross_val_score(clf , X , y , cv=5 , scoring= "accuracy")

array([0.33333333, 0.66666667, 0.5       , 0.5       , 0.5       ])

In [53]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
y_pred = cross_val_predict(clf,X,y,cv=5)
y_pred

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 1], dtype=int64)

In [54]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y , y_pred)

array([[ 8,  4],
       [11,  7]], dtype=int64)

In [55]:
clf.predict(X)

array([1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 1], dtype=int64)

In [56]:
clf.predict(X_train)

array([1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1],
      dtype=int64)

In [57]:
from sklearn.metrics import precision_score, recall_score, f1_score
print('precision = ', precision_score(y,y_pred ,average='macro'))
print('recall_score = ', recall_score(y,y_pred ,average='macro'))
print('f1_score',f1_score(y,y_pred ,average='macro'))


precision =  0.5287081339712918
recall_score =  0.5277777777777778
f1_score 0.4994438264738599


In [58]:
frame={"y":y,"y_pred": y_pred}
pd.DataFrame(frame).sum()

y         18
y_pred    11
dtype: int64

In [59]:
from sklearn.model_selection import GridSearchCV

parameters = {
  
    'learning_rate': [0.01 , 0.1 ,1],
    'n_estimators': [12,14,16],
    'subsample' : [0.7],
    'max_depth' : [2] ,
    'random_state':[2]
  
}
GradientBoosting = GradientBoostingClassifier()
clf = GridSearchCV(GradientBoosting, parameters , cv=3 , scoring='f1_macro')
clf.fit(X_train,y_train)

GridSearchCV(cv=3, estimator=GradientBoostingClassifier(),
             param_grid={'learning_rate': [0.01, 0.1, 1], 'max_depth': [2],
                         'n_estimators': [12, 14, 16], 'random_state': [2],
                         'subsample': [0.7]},
             scoring='f1_macro')

In [60]:
y_pred = cross_val_predict(clf.best_estimator_,X_test,y_test)
y_pred



array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [61]:
y_pred_clf = clf.best_estimator_.predict(X_test)

In [62]:
from sklearn.metrics import precision_score, recall_score, f1_score
print('precision = ', precision_score(y_test,y_pred_clf ,average='macro'))
print('recall_score = ', recall_score(y_test,y_pred_clf ,average='macro'))
print('f1_score',f1_score(y_test,y_pred_clf ,average='macro'))

precision =  0.4583333333333333
recall_score =  0.4375
f1_score 0.375
