In [1]:
import copy
import random

import numpy as np
import tensorflow as tf
from imblearn.over_sampling import RandomOverSampler
from keras import Dense, Conv2D, MaxPooling2D, Flatten, Dropout, BatchNormalization, AveragePooling2D
from keras import Sequential
from keras import backend as K

2024-01-11 01:50:47.126661: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-11 01:50:47.308403: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-11 01:50:48.550875: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-11 01:50:48.550937: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-11 01:50:48.556020: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

In [2]:
# Open the archive once and pull out the image and label arrays for each split.
# `data` is kept around because later cells still index into it.
data = np.load("chestmnist.npz")
train_data, val_data, test_data = (
    data["train_images"],
    data["val_images"],
    data["test_images"],
)
train_labels, val_labels, test_labels = (
    data["train_labels"],
    data["val_labels"],
    data["test_labels"],
)

In [3]:
# Sanity check: shape reported for the archive object itself, then for the training images.
for inspected in (data, data["train_images"]):
    print(np.shape(inspected))

(6,)
(78468, 28, 28)


In [4]:
# Seed every random number generator this notebook uses (Python's `random`,
# NumPy's legacy global generator and TensorFlow) from a single constant so the
# value only has to be changed in one place.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [5]:
def get_counts(labels, class_num):
    """Count, per column index, how many label rows are non-zero there.

    Args:
        labels: iterable of label rows (each row an array of indicators).
        class_num: length of the returned count vector.

    Returns:
        Float array of length ``class_num`` holding raw (un-normalised) counts.
    """
    tally = np.zeros(class_num)
    for row in labels:
        # Positions are unique within a row, so a fancy-indexed increment
        # adds exactly one per active position.
        active = np.where(row != 0)[0]
        tally[active] += 1
    return tally

def get_equal_indicies(labels, class_equal):
    """Return the row positions whose entry in column ``class_equal`` equals 1.

    Args:
        labels: sequence of label rows.
        class_equal: column index to test in every row.

    Returns:
        List of integer row indices, in ascending order.
    """
    return [pos for pos, row in enumerate(labels) if row[class_equal] == 1]

def remove_class(data, labels, class_to_remove):
    """Drop every sample flagged with ``class_to_remove`` and delete that label column.

    Args:
        data: array of samples, indexed along axis 0.
        labels: 2-D label array aligned with ``data``.
        class_to_remove: column index of the class to eliminate.

    Returns:
        Tuple ``(data, labels)`` with the flagged rows removed and the label
        array one column narrower.
    """
    dropped = set(get_equal_indicies(labels, class_to_remove))
    kept = []
    for row in range(np.shape(data)[0]):
        if row not in dropped:
            kept.append(row)
    kept_data = data[kept]
    # Filter the rows first, then remove the now-unused column.
    kept_labels = np.delete(labels[kept], (class_to_remove), axis=1)
    return kept_data, kept_labels

def add_no_class(labels):
    """Append a trailing column that is 1 for rows with no active label, else 0.

    Args:
        labels: 2-D label array, one row per sample.

    Returns:
        New array equal to ``labels`` with one extra column on the right.
    """
    # A row is "all zeros" exactly when none of its entries differs from 0.
    empty_flags = np.array([[0] if np.any(row != 0) else [1] for row in labels])
    return np.concatenate((labels, empty_flags), axis=1)

# Drop the last class (and every sample flagged with it) from the training split.
# The inputs are read from the loaded archive rather than from `train_data` /
# `train_labels`, so re-running this cell does not strip one more class each time.
class_num = np.shape(data["train_labels"])[1]
train_data, train_labels = remove_class(
    data["train_images"], data["train_labels"], class_num - 1
)
# NOTE(review): the counts are still sized with the pre-removal class count (as
# before), so the final entry is always 0 — confirm whether that is intended.
train_counts = get_counts(train_labels, class_num)
class_num -= 1

In [6]:
def split_data(dataset, labels, class_num):
    """Build one binary problem per class index.

    Args:
        dataset: the samples; an independent deep copy is made for every class.
        labels: sequence of label rows aligned with ``dataset``.
        class_num: number of class columns to split on.

    Returns:
        Tuple ``(copies, binary_labels)``: ``copies[k]`` is a deep copy of
        ``dataset`` and ``binary_labels[k]`` is a list of 0/1 ints that is 1
        where column ``k`` of the corresponding row equals 1.
    """
    per_class_data = []
    per_class_labels = []
    for column in range(class_num):
        per_class_data.append(copy.deepcopy(dataset))
        per_class_labels.append([1 if row[column] == 1 else 0 for row in labels])
    return per_class_data, per_class_labels

# Build one binary problem per remaining class: every entry gets its own copy of
# the training images plus a 0/1 label list for that class column.
split_train_images, split_labels = split_data(train_data, train_labels, class_num)

In [7]:
def shuffle_together(data, labels):
    """Shuffle ``data`` and ``labels`` with one shared random permutation.

    The permutation comes from NumPy's global random state and has length
    ``len(data)``; both inputs are indexed with it, so pairs stay aligned.

    Returns:
        Tuple ``(shuffled_data, shuffled_labels)``.
    """
    order = np.random.permutation(len(data))
    shuffled_data = data[order]
    shuffled_labels = labels[order]
    return shuffled_data, shuffled_labels

# def subsample_class(data, labels, majority_class):
#     all_indices = np.array(get_equal_indicies(labels, majority_class))

#     excluded_indices = [all_indices[(random.randint(0,len(all_indices)-1))] for i in range(len(all_indices))]
#     excluded_indicies = set(excluded_indices)
#     chosen_indicies = np.array([i for i in range(np.shape(data)[0]) if i not in excluded_indicies])

#     return data[chosen_indicies], labels[chosen_indicies]

# def oversample_class(data, labels, minority_class, copy_num, class_num):
#     all_indices = [i for i in get_equal_indicies(labels, minority_class) if list(labels[i]).count(1) == 1]
#     data_to_add = np.zeros((copy_num, 28, 28))
#     labels_to_add = np.zeros((copy_num, class_num), dtype=int)

#     for i in range(copy_num):
#         rnd_index = random.randint(0,len(all_indices)-1)
#         data_to_add[i] = data[all_indices[rnd_index]].copy()
#         labels_to_add[i] = labels[all_indices[rnd_index]].copy()

#     # labels_to_add = np.reshape(labels_to_add, (np.shape(labels_to_add)[0]))
#     data = np.append(data, data_to_add, axis=0)
#     labels = np.append(labels, labels_to_add, axis=0)

#     return data, labels

# def oversample_binary(data, labels, copy_num):
#     chosen_indicies = [i for i in range(np.shape(data)[0]) if labels[i] == 1]
#     data_to_add = np.zeros((copy_num, 28, 28))

#     for i in range(copy_num):
#         rnd_index = random.randint(0,len(chosen_indicies)-1)
#         data_to_add[i] = data[chosen_indicies[rnd_index]].copy()
    
#     data = np.append(data, data_to_add, axis=0)
#     labels = np.append(labels, np.ones(copy_num))

#     data, labels = shuffle_together(data, labels)
#     return data, labels

def oversample_dataset(split_dataset, split_labels, sampler=None):
    """Resample every per-class dataset so that its binary labels are balanced.

    Each dataset is flattened to 2-D (one row per sample) because the sampler
    works on tabular input, then restored to its original per-sample shape.
    Shapes are taken from the data itself rather than hard-coded, so the
    function keeps working when the sample count or image size changes.

    Args:
        split_dataset: list of sample arrays, one per class; axis 0 indexes samples.
        split_labels: list of label sequences aligned with ``split_dataset``.
        sampler: object exposing ``fit_resample(X, y)``. Defaults to
            ``RandomOverSampler(random_state=0)``.

    Returns:
        The same two lists, ``(split_dataset, split_labels)``. Their entries are
        replaced in place with the resampled arrays.

    Side effects:
        Prints the flattened shape of every resampled dataset.
    """
    if sampler is None:
        sampler = RandomOverSampler(random_state=0)

    for class_index in range(len(split_dataset)):
        images = np.asarray(split_dataset[class_index])
        targets = np.asarray(split_labels[class_index])

        # Remember the per-sample shape so it can be restored after resampling.
        sample_shape = images.shape[1:]
        # An explicit feature count (instead of -1) keeps this valid for zero samples.
        feature_count = int(np.prod(sample_shape))
        flat = images.reshape((images.shape[0], feature_count))

        flat, targets = sampler.fit_resample(flat, targets)
        flat = np.asarray(flat)
        print(np.shape(flat))

        split_dataset[class_index] = flat.reshape((flat.shape[0],) + sample_shape)
        split_labels[class_index] = targets

    return split_dataset, split_labels


# Oversample each per-class problem with RandomOverSampler (see oversample_dataset).
# The call replaces the list entries in place and prints the flattened shape of
# every resampled image array.
split_train_images, split_labels = oversample_dataset(split_train_images, split_labels)

(140702, 784)
(152752, 784)
(138144, 784)
(128864, 784)
(148704, 784)
(147910, 784)
(154696, 784)
(149252, 784)
(150130, 784)
(153274, 784)
(153050, 784)
(154340, 784)
(152102, 784)


In [8]:

# def loss(target, output):
#     weights = np.array([0.4999014531185609, 1.023027280727486, 0.43079488604780436, 0.2869071613672823, 1.0034527406128615, 0.9122715233953503, 1.0209309133489461, 1.077802673259677, 1.222986893437952, 1.181302396477263, 1.1077582784086397, 0.8634563010646199, 0.8767519326252279, 0.6890930646117368])
#     target = tf.convert_to_tensor(target)
#     output = tf.convert_to_tensor(output)
#     target.shape.assert_is_compatible_with(output.shape)
#     weights = tf.reshape(tf.convert_to_tensor(weights, dtype=target.dtype), (1,-1))

#     # Adjust the predictions so that the probability of
#     # each class for every sample adds up to 1
#     # This is needed to ensure that the cross entropy is
#     # computed correctly.
#     output = output / tf.reduce_sum(output, -1, True)

#     # Compute cross entropy from probabilities.
#     epsilon_ = tf.constant(tf.keras.backend.epsilon(), output.dtype.base_dtype)
#     output = tf.clip_by_value(output, epsilon_, 1.0 - epsilon_)
#     return -tf.reduce_sum(weights * target * tf.math.log(output), axis=-1)


In [9]:
def build_binary_cnn(input_shape=(28, 28, 1)):
    """Create and compile one small CNN with a single sigmoid output.

    Architecture: two Conv2D(10 filters, 3x3, relu) + MaxPooling2D(2, 2) stages,
    then Flatten, Dense(10, relu) and Dense(1, sigmoid). Only the first layer
    declares ``input_shape``; later layers infer theirs from the previous layer.

    Args:
        input_shape: shape of one input sample, passed to the first Conv2D layer.

    Returns:
        A compiled ``Sequential`` model (binary cross-entropy loss, Adam
        optimizer, AUC and accuracy metrics).
    """
    model = Sequential()
    model.add(Conv2D(filters=10, kernel_size=(3, 3), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D(2, 2))
    model.add(Conv2D(filters=10, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(2, 2))
    model.add(Flatten())
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss="binary_crossentropy", optimizer='Adam', metrics=['AUC', 'accuracy'])
    return model


# One independent binary classifier per class.
cnn_models = [build_binary_cnn() for _ in range(class_num)]

In [10]:
# train_data = train_data.astype('float32')
# Train the per-class classifiers one after another, each on its own resampled data.
for i in range(class_num):
    print("Feature:", i)
    model = cnn_models[i]
    images, targets = split_train_images[i], split_labels[i]
    model.fit(images, targets, batch_size=64, epochs=5)

Feature:  0


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Feature:  1
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Feature:  2
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Feature:  3
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Feature:  4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Feature:  5


: 

[1.4162689e-12 1.5216710e-08 4.5782864e-10 1.4946076e-10 3.0666295e-05
 9.9996936e-01 7.4342017e-09]
[0.01505083 0.0504419  0.03308822 0.00276871 0.6952178  0.20125857
 0.00217384]
[4.7845774e-06 2.3109933e-04 2.5010526e-05 1.0316935e-06 3.2762840e-02
 9.6694505e-01 3.0211440e-05]
[0.09250011 0.22203088 0.07604286 0.05883425 0.23826015 0.23454644
 0.07778532]
[2.7494119e-03 1.6411096e-02 1.2630174e-01 5.5240397e-04 6.2206441e-01
 2.3104410e-01 8.7681267e-04]
