### Model Değerlendirmesi

In [26]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits

dataset = load_digits()
X, y = dataset.data, dataset.target

for class_name, class_count in zip(dataset.target_names, np.bincount(dataset.target)):
    print(class_name,class_count)

0 178
1 182
2 177
3 183
4 181
5 182
6 181
7 179
8 174
9 180


In [27]:
y_binary_imbalanced = y.copy()
y_binary_imbalanced[y_binary_imbalanced != 1] = 0

print('Original:\t', y[1:30])
print('New Label:\t', y_binary_imbalanced[1:30])

Original:	 [1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9]
New Label:	 [1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]


In [28]:
np.bincount(y_binary_imbalanced)

array([1615,  182])

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)

# Accuracy (Doğruluk) - - -> Support Vector Machine (SVM)
from sklearn.svm import SVC

svm = SVC(kernel='rbf', C=1).fit(X_train, y_train)
svm.score(X_test, y_test)

0.9955555555555555

### Dummy Sınıflandırıcı

In [30]:
from sklearn.dummy import DummyClassifier

# Negative class 0 is the majority class
dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train)
# dummy tends to predict only the majority class
y_dummy_predictions = dummy_majority.predict(X_test)

y_dummy_predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [31]:
dummy_majority.score(X_test, y_test)

0.9044444444444445

In [32]:
svm = SVC(kernel='linear', C=1).fit(X_train, y_train)
svm.score(X_test, y_test)

0.9777777777777777

### Confusion Matrisi

##### ikili sınıf için

In [33]:
from sklearn.metrics import confusion_matrix
# Majority class
dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train)
y_majority_predicted = dummy_majority.predict(X_test)
confusion = confusion_matrix(y_test, y_majority_predicted)

print('Majority\n', confusion)

Majority
 [[407   0]
 [ 43   0]]


In [34]:
# Stratified class-proportional
dummy_classprop = DummyClassifier(strategy='stratified').fit(X_train, y_train)
y_classprop_predicted = dummy_classprop.predict(X_test)
confusion = confusion_matrix(y_test, y_classprop_predicted)

print('Random class-proportional prediction \n', confusion)

Random class-proportional prediction 
 [[358  49]
 [ 40   3]]


In [35]:
svm = SVC(kernel='linear', C=1).fit(X_train, y_train)
svm_predicted = svm.predict(X_test)
confusion = confusion_matrix(y_test, svm_predicted)

print('Support Vector Machine\n', confusion)

Support Vector Machine
 [[402   5]
 [  5  38]]


In [36]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression().fit(X_train, y_train)
lr_predicted = lr.predict(X_test)
confusion = confusion_matrix(y_test, lr_predicted)

print('Logistic Regression\n', confusion)

Logistic Regression
 [[401   6]
 [  8  35]]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [37]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(max_depth=2).fit(X_train, y_train)
tree_predicted = dt.predict(X_test)
confusion = confusion_matrix(y_test, tree_predicted)

print('Decision Tree\n', confusion)

Decision Tree
 [[400   7]
 [ 17  26]]


### Değerlendirme Ölçekleri-Evaluation Metrics

In [38]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Accuracy = TP + TN / (TP + TN + FP + FN)
# Precision = TP / (TP + FP)
# Recall = TP / (TP + FN)
# F1 = 2 * Precision * Recall / (Precision + Recall)
print('Accuracy : {:.2f}'.format(accuracy_score(y_test, tree_predicted)))
print('Precision : {:.2f}'.format(precision_score(y_test, tree_predicted)))
print('Recall  : {:.2f}'.format(recall_score(y_test, tree_predicted)))
print('F1: {:.2f}'.format(f1_score(y_test, tree_predicted)))

Accuracy : 0.95
Precision : 0.79
Recall  : 0.60
F1: 0.68


In [39]:
# Full classification report
from sklearn.metrics import classification_report

print(classification_report(y_test, tree_predicted, target_names=['not 1', '1']))

              precision    recall  f1-score   support

       not 1       0.96      0.98      0.97       407
           1       0.79      0.60      0.68        43

    accuracy                           0.95       450
   macro avg       0.87      0.79      0.83       450
weighted avg       0.94      0.95      0.94       450



In [40]:
print('Random propotional Class (dummy)\n',
      classification_report(y_test, y_classprop_predicted, target_names=['1 değil', '1']))
print('SVM\n',
      classification_report(y_test, svm_predicted, target_names = ['1 değil', '1']))
print('Logistic Regression\n',
      classification_report(y_test, lr_predicted, target_names = ['1 değil', '1']))
print('Decision Tree\n',
      classification_report(y_test, tree_predicted, target_names = ['1 değil', '1']))

Random propotional Class (dummy)
               precision    recall  f1-score   support

     1 değil       0.90      0.88      0.89       407
           1       0.06      0.07      0.06        43

    accuracy                           0.80       450
   macro avg       0.48      0.47      0.48       450
weighted avg       0.82      0.80      0.81       450

SVM
               precision    recall  f1-score   support

     1 değil       0.99      0.99      0.99       407
           1       0.88      0.88      0.88        43

    accuracy                           0.98       450
   macro avg       0.94      0.94      0.94       450
weighted avg       0.98      0.98      0.98       450

Logistic Regression
               precision    recall  f1-score   support

     1 değil       0.98      0.99      0.98       407
           1       0.85      0.81      0.83        43

    accuracy                           0.97       450
   macro avg       0.92      0.90      0.91       450
weighted avg  

### Decision Functions

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)
y_scores_lr = lr.fit(X_train, y_train).decision_function(X_test)
y_score_list = list(zip(y_test[0:20], y_scores_lr[0:20]))
# for the first 20 items show the decision scores
y_score_list

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[(np.int64(0), np.float64(-29.82878099370642)),
 (np.int64(0), np.float64(-19.38286859856305)),
 (np.int64(0), np.float64(-29.198475980964915)),
 (np.int64(0), np.float64(-21.746274461985045)),
 (np.int64(0), np.float64(-22.642379044373467)),
 (np.int64(0), np.float64(-11.805939123799444)),
 (np.int64(1), np.float64(6.496008481092072)),
 (np.int64(0), np.float64(-23.354612279811512)),
 (np.int64(0), np.float64(-27.543779300797045)),
 (np.int64(0), np.float64(-26.88821278179377)),
 (np.int64(0), np.float64(-31.862940933111904)),
 (np.int64(0), np.float64(-22.486086239877622)),
 (np.int64(0), np.float64(-25.318025664525024)),
 (np.int64(0), np.float64(-13.384523140951877)),
 (np.int64(0), np.float64(-13.565639950360538)),
 (np.int64(0), np.float64(-13.308357307487423)),
 (np.int64(1), np.float64(12.180922641917292)),
 (np.int64(0), np.float64(-34.36243708268069)),
 (np.int64(0), np.float64(-13.231539904930415)),
 (np.int64(0), np.float64(-29.59397814680245))]

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)
y_proba_lr = lr.fit(X_train, y_train).predict_proba(X_test)
y_proba_list = list(zip(y_test[0:20], y_proba_lr[0:20,1]))

# for the first 20 items show the probabilities of positive class
y_proba_list

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[(np.int64(0), np.float64(1.110516492006772e-13)),
 (np.int64(0), np.float64(3.820560862797354e-09)),
 (np.int64(0), np.float64(2.0857535982501779e-13)),
 (np.int64(0), np.float64(3.59511675463642e-10)),
 (np.int64(0), np.float64(1.4673702973697477e-10)),
 (np.int64(0), np.float64(7.4600638568107995e-06)),
 (np.int64(1), np.float64(0.9984928228204379)),
 (np.int64(0), np.float64(7.198147800889322e-11)),
 (np.int64(0), np.float64(1.0911605069385053e-12)),
 (np.int64(0), np.float64(2.101829874532039e-12)),
 (np.int64(0), np.float64(1.452447941968591e-14)),
 (np.int64(0), np.float64(1.7156031159282192e-10)),
 (np.int64(0), np.float64(1.010464734708899e-11)),
 (np.int64(0), np.float64(1.5387738196047946e-06)),
 (np.int64(0), np.float64(1.2838576360179774e-06)),
 (np.int64(0), np.float64(1.6605545119320453e-06)),
 (np.int64(1), np.float64(0.9999948726816245)),
 (np.int64(0), np.float64(1.1928427361198052e-15)),
 (np.int64(0), np.float64(1.7931410523739048e-06)),
 (np.int64(0), np.float64(1.

### Precision - Recall Curves

In [43]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# make sure y_test is 0/1 and y_scores_lr continuous
precision, recall, thresholds = precision_recall_curve(y_test, y_scores_lr)

plt.figure(figsize=(6,6))
plt.plot(recall, precision, lw=2, label="Precision-Recall curve")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.legend(loc="lower left")
plt.grid(True)
plt.gca().set_aspect("equal", adjustable="box")
plt.show()

<IPython.core.display.Javascript object>