In [3]:
import sklearn
import numpy as np
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import multilabel_confusion_matrix

from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression

from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_almost_equal


In [19]:
# Ground-truth binary labels for the metric demos below (six 0s, four 1s).
y_true = np.array([0, 0, 1, 1, 0, 0, 1, 1, 0, 0])
y_true

array([0, 0, 1, 1, 0, 0, 1, 1, 0, 0])

In [20]:
# Predicted labels: the first five samples are predicted 0, the last five 1.
y_pred = np.array([0] * 5 + [1] * 5)
y_pred

array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])

In [21]:
# Per-class (precision, recall, F-score, support); one array entry per label.
precision_recall_fscore_support(y_true=y_true, y_pred=y_pred)

(array([0.6, 0.4]),
 array([0.5, 0.5]),
 array([0.54545455, 0.44444444]),
 array([6, 4]))

In [24]:
# One 2x2 confusion matrix per class, laid out as [[TN, FP], [FN, TP]].
MCM = multilabel_confusion_matrix(y_true=y_true, y_pred=y_pred)
MCM

array([[[2, 2],
        [3, 3]],

       [[3, 3],
        [2, 2]]])

# Class_Weight
## test_compute_class_weight()

In [14]:
def test_compute_class_weight():
    """Demonstrate ``compute_class_weight("balanced", ...)`` on imbalanced labels.

    Prints, in order: the label vector, the distinct classes, the balanced
    class weights, the per-class sample counts, the raw ``np.bincount``
    histogram, and the weighted total number of samples.

    Raises:
        AssertionError: if the weights do not preserve the total sample
            count, do not match ``n_samples / (n_classes * class_count)``,
            or are not strictly increasing as the classes get rarer.
    """
    # Classes 2, 3, 4 and 5 occur 4, 3, 2 and 1 times respectively.
    y = np.asarray([2, 2, 2, 2, 3, 3, 3, 4, 4, 5])

    # Take the counts from np.unique so they are aligned with `classes`
    # whatever the smallest label is (no hard-coded bincount offset).
    classes, class_counts = np.unique(y, return_counts=True)
    print(y)
    print(classes)

    cw = compute_class_weight("balanced", classes=classes, y=y)
    print(cw)
    print(class_counts)
    # Raw histogram, including the empty bins for the unused labels 0 and 1.
    print(np.bincount(y))

    # total effect of samples is preserved
    weighted_total = np.dot(cw, class_counts)
    assert_almost_equal(weighted_total, y.shape[0])
    print(weighted_total)

    # balanced weight of a class = n_samples / (n_classes * class_count)
    assert_array_almost_equal(cw, y.shape[0] / (len(classes) * class_counts))
    # Counts strictly decrease across `classes` here, so weights must increase.
    assert np.all(np.diff(cw) > 0)


test_compute_class_weight()

[2 2 2 2 3 3 3 4 4 5]
[2 3 4 5]
[0.625      0.83333333 1.25       2.5       ]
[4 3 2 1]
[0 0 4 3 2 1]
10.0


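With `class_weight="balanced"`, the weight of each class is `n_samples / (n_classes * n_samples_in_class)`: the total number of samples in `y`, divided by the number of classes, divided by the number of samples in that class. For example, class 2 above gets 10 / (4 × 4) = 0.625 and class 5 gets 10 / (4 × 1) = 2.5.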

## test_compute_class_weight_dict()

In [22]:
def test_compute_class_weight_dict():
    """Demonstrate ``compute_class_weight`` with a user-supplied weight dict.

    Prints the class labels, the weight mapping, the label vector, and the
    weight array returned by ``compute_class_weight`` for those classes.
    """
    labels = np.arange(3)
    weight_by_class = {0: 1.0, 1: 2.0, 2: 1.0}
    targets = np.asarray([0, 0, 1, 2, 2, 2])

    # Show the inputs in the same order they are passed around below.
    for shown in (labels, weight_by_class, targets):
        print(shown)

    print(compute_class_weight(weight_by_class, classes=labels, y=targets))


test_compute_class_weight_dict()

[0 1 2]
{0: 1.0, 1: 2.0, 2: 1.0}
[0 0 1 2 2 2]
[1. 2. 1.]


## test_compute_class_weight_invariance()

In [27]:
def test_compute_class_weight_invariance():
    """Check that ``class_weight="balanced"`` cancels out class imbalance.

    Starting from a balanced two-class blob dataset of 100 points, three
    200-point variants are built:

    * class 1 over-represented (150 points of class 1, 50 of class 0),
    * class 0 over-represented (50 points of class 1, 150 of class 0),
    * everything duplicated (100 points of each class, balanced again).

    A balanced-weight logistic regression is fitted on each variant; all
    three must yield the same coefficients. The base data and the final
    coefficients are printed.

    Raises:
        AssertionError: if the fitted coefficients differ between variants.
    """
    X, y = make_blobs(centers=2, random_state=0)
    print(X.shape, y.shape)
    print(X)
    print(y)

    def with_two_extra_copies(label):
        # Base data followed by two more copies of every sample of `label`.
        picked = y == label
        features = np.vstack([X, X[picked], X[picked]])
        targets = np.hstack([y, y[picked], y[picked]])
        return features, targets

    def balanced_coef(features, targets):
        # Coefficients of a logistic regression fitted with balanced weights.
        model = LogisticRegression(class_weight="balanced")
        return model.fit(features, targets).coef_

    coef_extra_1 = balanced_coef(*with_two_extra_copies(1))
    coef_extra_0 = balanced_coef(*with_two_extra_copies(0))
    # Duplicating the whole dataset keeps the classes balanced.
    coef_doubled = balanced_coef(np.vstack([X, X]), np.hstack([y, y]))

    # results should be identical
    assert_array_almost_equal(coef_extra_1, coef_extra_0)
    assert_array_almost_equal(coef_doubled, coef_extra_0)
    print(coef_doubled)


test_compute_class_weight_invariance()

(100, 2) (100,)
[[ 4.21850347  2.23419161]
 [ 0.90779887  0.45984362]
 [-0.27652528  5.08127768]
 [ 0.08848433  2.32299086]
 [ 3.24329731  1.21460627]
 [ 1.44193252  2.76754364]
 [ 1.0220286   4.11660348]
 [ 3.97820955  2.37817845]
 [ 0.58894326  4.00148458]
 [ 1.25185786  0.20811388]
 [ 0.62835793  4.4601363 ]
 [ 1.68608568  0.65828448]
 [ 1.18454506  5.28042636]
 [ 0.06897171  4.35573272]
 [ 1.78726415  1.70012006]
 [ 4.4384123   1.84214315]
 [ 3.18190344 -0.18226785]
 [ 0.30380963  3.94423417]
 [ 0.73936011  0.43607906]
 [ 1.28535145  1.43691285]
 [ 1.1312175   4.68194985]
 [ 0.66471755  4.35995267]
 [ 1.31570453  2.44067826]
 [-0.18887976  5.20461381]
 [ 2.57854418  0.72611733]
 [ 0.87305123  4.71438583]
 [ 1.3105127   0.07122512]
 [ 0.9867701   6.08965782]
 [ 1.42013331  4.63746165]
 [ 2.3535057   2.22404956]
 [ 2.43169305 -0.20173713]
 [ 1.0427873   4.60625923]
 [ 0.95088418  0.94982874]
 [ 2.45127423 -0.19539785]
 [ 1.62011397  2.74692739]
 [ 2.15504965  4.12386249]
 [ 1.3809348