In [1]:
import tensorflow as tf
import tensorflow.keras as tfk
import numpy as np
import cv2 as cv
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
import cv2.xfeatures2d
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [2]:
def process_file(file):
    """Read a two-column, comma-separated text file, skipping its header row.

    Every non-blank data row must have the form ``<name>,<class>``.

    Parameters
    ----------
    file : str or path-like
        Path of the file to read.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        Two string arrays, in file order: the first column (names) and the
        second column (classes). Surrounding whitespace, including the
        line terminator, is stripped from both fields.

    Raises
    ------
    ValueError
        If a non-blank data row does not split into exactly two fields.
    """
    names = []
    classes = []
    with open(file, 'r') as f:
        # Drop the header row; the default keeps an empty file from raising.
        next(f, None)
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line) carry no record.
                continue
            n, c = line.split(',')
            names.append(n.strip())
            classes.append(c.strip())
    return np.array(names), np.array(classes)

In [3]:
# Read the training list, convert every class string to an int, and keep
# only the first 3000 rows of each column.
names, labels = process_file('train_labels.txt')
names = np.array(names)[:3000]
labels = np.array(list(map(int, labels)))[:3000]

# Only the name column of the submission template is used; its second
# column is discarded.
eval_names, _ = process_file('sample_submission.txt')

print(f"{len(names)} training examples.")
print(f"{len(eval_names)} testing examples.")

3000 training examples.
5149 testing examples.


In [4]:
def read_image(name):
    """Load ``data/<name>.png`` and return it as an RGB pixel array.

    Parameters
    ----------
    name : str
        File name without directory or extension.

    Returns
    -------
    np.ndarray
        The decoded image with its channels in RGB order.

    Raises
    ------
    FileNotFoundError
        If OpenCV cannot read the file (missing, unreadable or not an image).
    """
    path = f"data/{name}.png"
    img = cv.imread(path)
    if img is None:
        # cv.imread reports failure by returning None instead of raising;
        # fail here rather than letting None leak into the image array.
        raise FileNotFoundError(f"Could not read image: {path}")
    # OpenCV decodes to BGR channel order, whereas the Keras DenseNet
    # preprocessing used later in this notebook normalises per channel in
    # RGB order, so convert once at load time.
    return cv.cvtColor(img, cv.COLOR_BGR2RGB)

In [5]:
# Decode every training image and collect the results into one array.
loaded_images = list(map(read_image, names))
images = np.array(loaded_images)
print(images.shape)

(3000, 224, 224, 3)


In [34]:
# DenseNet121 without its classification head, used as a feature extractor
# for 224x224 three-channel inputs.
backbone = tfk.applications.densenet.DenseNet121(
    include_top=False,
    input_shape=(224, 224, 3))

# Use the output of the final backbone layer. To extract features from an
# earlier layer instead, select a different entry of `backbone.layers` here.
output = backbone.layers[-1].output
# Average over the spatial dimensions so each image yields one feature vector.
output = tfk.layers.GlobalAveragePooling2D()(output)
model = tfk.Model(inputs=backbone.inputs, outputs=output)

preprocess_input = tfk.applications.densenet.preprocess_input
# summary() prints the table itself and returns None; wrapping it in print()
# would add a stray "None" line to the cell output.
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
zero_padding2d_2 (ZeroPadding2D (None, 230, 230, 3)  0           input_5[0][0]                    
__________________________________________________________________________________________________
conv1/conv (Conv2D)             (None, 112, 112, 64) 9408        zero_padding2d_2[0][0]           
__________________________________________________________________________________________________
conv1/bn (BatchNormalization)   (None, 112, 112, 64) 256         conv1/conv[0][0]                 
____________________________________________________________________________________________

In [None]:
# Apply the DenseNet input normalisation to the images, then run them
# through the feature extractor to get one embedding row per image.
emb = model.predict(x=preprocess_input(images))
print(emb.shape)

In [None]:
# Hold out 10% of the examples for validation. Images, embeddings and labels
# go through the same call so the three splits stay row-aligned.
# A fixed random_state makes the split, and every score computed from it,
# repeatable across kernel restarts.
(images_train, images_val,
 emb_train, emb_val,
 labels_train, labels_val) = train_test_split(
    images, emb, labels, test_size=0.1, random_state=42)

In [None]:
#k_means_idx = labels_train == 0.
#k_means = KMeans(n_clusters=128)
#k_means.fit(emb_train[k_means_idx])

In [None]:
# dists = k_means.transform(emb_train[k_means_idx])
# center_idx = np.argmin(dists, axis=0)
# center_idx = np.where(k_means_idx)[0][center_idx]
# centers = images_train[center_idx]

# hh = (len(centers) + 3) // 4
# plt.figure(figsize=(16, hh * 4))
# for idx in range(len(centers)):
#     center = centers[idx]
#     plt.subplot(hh, 4, idx + 1)
#     plt.imshow(center)
# plt.show()

In [None]:
# q_idx = 66
# plt.subplot(1, 2, 1)
# plt.imshow(images_val[q_idx])
# [c] = k_means.predict([emb_val[q_idx]])
# plt.subplot(1, 2, 2)
# plt.imshow(centers[c])
# print(emb_val[q_idx][:10])
# print(emb_train[center_idx[c]][:10])
# print(k_means.cluster_centers_[c][:10])

In [None]:
# Fit a 16-component PCA on the training embeddings only, then project both
# splits with that same fitted transform.
# random_state pins the randomized SVD solver that scikit-learn may select
# automatically for an input of this size, so the projection is repeatable.
pca = PCA(n_components=16, random_state=42).fit(emb_train)
emb_train_2 = pca.transform(emb_train)
emb_val_2 = pca.transform(emb_val)

In [None]:
def preprocess(emb):
    """Return the embeddings exactly as given.

    Identity hook: the classifier cell routes its inputs through this
    function, so a feature transform can be added here in one place.
    """
    features = emb
    return features

def get_class_weight(p=0.23112):
    """Build a class-weight mapping for the two classes 0 and 1.

    Parameters
    ----------
    p : float, optional
        Weight given to class 0; class 1 receives ``1 - p``.

    Returns
    -------
    dict
        ``{0: p, 1: 1 - p}``.
    """
    weight_class_0 = p
    weight_class_1 = 1 - p
    return dict(zip((0, 1), (weight_class_0, weight_class_1)))

# for p in np.linspace(0.23112, 0.23113, 20):
#     clf = SVC(class_weight=get_class_weight(p))
#     clf.fit(preprocess(emb_train)[:1000], labels_train[:1000])
#     print(p, f1_score(labels_val, clf.predict(preprocess(emb_val))))
# Fit a class-weighted SVM on the training embeddings, then report the F1
# score on the validation split followed by the training feature shape.
features_train = preprocess(emb_train)
features_val = preprocess(emb_val)

clf = SVC(class_weight=get_class_weight())
clf.fit(features_train, labels_train)

predictions_val = clf.predict(features_val)
print(f1_score(labels_val, predictions_val))
print(features_train.shape)