<a href="https://colab.research.google.com/github/cagBRT/Data/blob/main/Imbalanced_Dataset_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# create an imbalanced dataset
from numpy import unique
from sklearn.datasets import make_classification
# Build a synthetic two-class dataset of 1000 rows. weights=[0.99] asks for
# roughly 99% of the rows in the first class, and flip_y=0 disables random
# label flipping; random_state pins the result.
X, y = make_classification(
  n_samples=1000,
  n_classes=2,
  weights=[0.99],
  flip_y=0,
  random_state=1,
)
# Report, for every distinct label, how many rows carry it and what share of
# the whole dataset that represents.
classes, counts = unique(y, return_counts=True)
total = len(y)
for c, count in zip(classes, counts):
  n_examples = int(count)
  percent = n_examples / total * 100
  print('> Class=%d : %d/%d (%.1f%%)' % (c, n_examples, total, percent))

In [None]:
# log loss for naive probability predictions.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
# Build the synthetic two-class dataset (about 99% of rows in the first class,
# no random label flipping, fixed seed).
X, y = make_classification(
  n_samples=1000, n_classes=2, weights=[0.99], flip_y=0, random_state=1
)
# Hold out half of the rows for scoring; stratify=y keeps the class ratio the
# same in both halves.
trainX, testX, trainy, testy = train_test_split(
  X, y, test_size=0.5, random_state=2, stratify=y
)
# Constant forecasts to score. Each row is [P(class 0), P(class 1)] and the
# same row is predicted for every test example:
#   - always certain of class 0 (no skill)
#   - always certain of class 1 (no skill)
#   - the class prior (baseline)
for fmt, row in (
  ('P(class0=1): Log Loss=%.3f', [1, 0]),
  ('P(class1=1): Log Loss=%.3f', [0, 1]),
  ('Baseline: Log Loss=%.3f', [0.99, 0.01]),
):
  probabilities = [list(row) for _ in range(len(testy))]
  avg_logloss = log_loss(testy, probabilities)
  print(fmt % avg_logloss)
# Perfect forecast: the true labels themselves are passed as the predictions.
avg_logloss = log_loss(testy, testy)
print('Perfect: Log Loss=%.3f' % avg_logloss)

In [None]:
# brier score for naive probability predictions.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import brier_score_loss
# Build the synthetic two-class dataset (about 99% of rows in the first class,
# no random label flipping, fixed seed).
X, y = make_classification(
  n_samples=1000, n_classes=2, weights=[0.99], flip_y=0, random_state=1
)
# Hold out half of the rows for scoring; stratify=y keeps the class ratio the
# same in both halves.
trainX, testX, trainy, testy = train_test_split(
  X, y, test_size=0.5, random_state=2, stratify=y
)
# Constant forecasts of P(class 1), applied to every test example:
#   - 0.0  : always certain of class 0 (no skill)
#   - 1.0  : always certain of class 1 (no skill)
#   - 0.01 : the minority-class prior (baseline)
for fmt, p_class1 in (
  ('P(class1=0): Brier Score=%.4f', 0.0),
  ('P(class1=1): Brier Score=%.4f', 1.0),
  ('Baseline: Brier Score=%.4f', 0.01),
):
  probabilities = [p_class1] * len(testy)
  avg_brier = brier_score_loss(testy, probabilities)
  print(fmt % avg_brier)
# Perfect forecast: the true labels themselves are passed as the predictions.
avg_brier = brier_score_loss(testy, testy)
print('Perfect: Brier Score=%.4f' % avg_brier)

In [None]:
# brier skill score for naive probability predictions.
from sklearn.datasets import make_classification 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import brier_score_loss
# calculate the brier skill score
def brier_skill_score(y, yhat, brier_ref):
  """Return the Brier skill score of `yhat` relative to a reference score.

  The Brier score of (`y`, `yhat`) is computed with `brier_score_loss` and
  rescaled as ``1.0 - score / brier_ref``. A result of 0.0 therefore means
  "same as the reference", 1.0 means a Brier score of zero, and negative
  values mean a larger (worse) Brier score than the reference.

  Args:
    y: true labels, forwarded unchanged to `brier_score_loss`.
    yhat: predicted probabilities, forwarded unchanged to `brier_score_loss`.
    brier_ref: Brier score of the reference forecast. It is used as a
      divisor and is not checked for zero here.
  """
  return 1.0 - (brier_score_loss(y, yhat) / brier_ref)
# Build the synthetic two-class dataset (about 99% of rows in the first class,
# no random label flipping, fixed seed).
X, y = make_classification(
  n_samples=1000, n_classes=2, weights=[0.99], flip_y=0, random_state=1
)
# Hold out half of the rows for scoring; stratify=y keeps the class ratio the
# same in both halves.
trainX, testX, trainy, testy = train_test_split(
  X, y, test_size=0.5, random_state=2, stratify=y
)
# Reference forecast: predict P(class 1) = 0.01 for every test example. Its
# Brier score is the yardstick all skill scores below are measured against.
probabilities = [0.01] * len(testy)
brier_ref = brier_score_loss(testy, probabilities)
print('Reference: Brier Score=%.4f' % brier_ref)
# Constant forecasts of P(class 1), applied to every test example:
#   - 0.0  : always certain of class 0 (no skill)
#   - 1.0  : always certain of class 1 (no skill)
#   - 0.01 : the reference itself (baseline)
for fmt, p_class1 in (
  ('P(class1=0): BSS=%.4f', 0.0),
  ('P(class1=1): BSS=%.4f', 1.0),
  ('Baseline: BSS=%.4f', 0.01),
):
  probabilities = [p_class1] * len(testy)
  bss = brier_skill_score(testy, probabilities, brier_ref)
  print(fmt % bss)
# Perfect forecast: the true labels themselves are passed as the predictions.
bss = brier_skill_score(testy, testy, brier_ref)
print('Perfect: BSS=%.4f' % bss)
