# Supervised Learning

In [1]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from my_class import my_class as my
%matplotlib inline

In [15]:
def performance(pred, Y):
    """
    Print averaged classification metrics and LaTeX-ready confusion tables.

    Parameters
    ----------
    pred : array-like of int, shape (n_samples,)
        Predicted class indices.
    Y : array-like, shape (n_samples, n_classes)
        One-hot encoded true labels. The true class of each sample is the
        argmax along axis 1 and the number of classes is the column count.

    Returns
    -------
    None
        Everything is written to stdout, in this order:
        * accuracy, precision, recall and F1 averaged over the classes,
        * the confusion matrix as counts ("std."),
        * the confusion matrix as row percentages ("pct."),
        * one table row per class holding the row percentages followed by
          precision, recall, F1 and accuracy (all in percent).
        Table rows are formatted as LaTeX tabular rows.

    Notes
    -----
    A ratio whose denominator is zero (for example the precision of a class
    that is never predicted) is reported as 0 instead of NaN.
    """
    from sklearn.metrics import confusion_matrix
    import numpy as np

    def safe_divide(num, den):
        # Element-wise num / den that yields 0 wherever den == 0.
        num = np.asarray(num, dtype=float)
        den = np.asarray(den, dtype=float)
        out = np.zeros(np.broadcast(num, den).shape, dtype=float)
        np.divide(num, den, out=out, where=den != 0)
        return out

    def array_to_latex(tbl):
        # Print each row as "a & b & ... \\ " with values rounded to integers.
        for row in tbl:
            print(' & '.join("{:.0f}".format(value) for value in row) + ' \\\\ ')

    def performance_measure(pred_test, Y_test, n_classes):
        # Fixing the label list keeps the matrix n_classes x n_classes even
        # when some class does not occur in Y_test.
        cm = confusion_matrix(y_true=Y_test,
                              y_pred=pred_test,
                              labels=list(range(n_classes)))
        # One-vs-rest counts per class (rows = true class, columns = predicted).
        TP = np.diag(cm)
        FP = np.sum(cm, axis=0) - TP
        FN = np.sum(cm, axis=1) - TP
        TN = np.sum(cm) - (FP + FN + TP)
        #
        precision = safe_divide(TP, TP + FP)
        recall = safe_divide(TP, TP + FN)
        F1 = 2 * safe_divide(precision * recall, precision + recall)
        acc = safe_divide(TP + TN, TP + FP + FN + TN)
        #
        return TP, FP, precision, recall, F1, acc, cm

    Y = np.asarray(Y)
    n_classes = Y.shape[1]
    _, _, precision, recall, F1, Acc, cm = performance_measure(
        pred_test=pred, Y_test=np.argmax(Y, axis=1), n_classes=n_classes)

    print('--------------------------------------------')
    print('Average for all classes')
    print('Accurcy:   %f' %(np.mean(Acc)))
    print('Precision: %f' %(np.mean(precision)))
    print('Recall:    %f' %(np.mean(recall)))
    print('F1:        %f' %(np.mean(F1)))

    # Raw counts.
    print("std.\n")
    array_to_latex(cm)
    # Each row rescaled to percentages of the true-class total.
    print("\npct.\n")
    cm_norm = safe_divide(cm, cm.sum(axis=1, keepdims=True)) * 100
    array_to_latex(cm_norm)

    print("\n\nPaste into latex..\n\n")
    # Per class: n_classes percentage columns + precision, recall, F1, accuracy.
    summary = np.empty((n_classes, n_classes + 4))
    summary[:, :n_classes] = cm_norm
    summary[:, n_classes] = precision * 100
    summary[:, n_classes + 1] = recall * 100
    summary[:, n_classes + 2] = F1 * 100
    summary[:, n_classes + 3] = Acc * 100
    #
    array_to_latex(summary)

In [2]:
# Image directories, one per class, given relative to the working directory.
data_path_clear, data_path_foggy = (
    "./../data/data/clear/skive/2016/",
    "./../data/data/foggy/skive/2016/",
)

In [3]:
# List the files of each class and keep only entries whose path contains
# ".jpg". The filter is applied per class *before* counting, so that
# n_clear + n_foggy == n always holds and the labels built from these counts
# stay aligned with pic_path.
pic_path_clear = [path for path in my.list_pics(data_path_clear) if ".jpg" in path]
pic_path_foggy = [path for path in my.list_pics(data_path_foggy) if ".jpg" in path]

# Clear pictures first, foggy pictures second.
pic_path = pic_path_clear + pic_path_foggy

n_clear = len(pic_path_clear)
n_foggy = len(pic_path_foggy)
n = len(pic_path)

## Create target variable and feature matrix

In [4]:
# Class names; the position in this list is the integer class label.
classes = ["clear", "foggy"]
n_classes = len(classes)

# Integer labels in the same order as pic_path: clear (0) first, foggy (1) second.
Y_clear = np.zeros(n_clear, dtype=int)
Y_foggy = np.ones(n_foggy, dtype=int)
Y = np.concatenate((Y_clear, Y_foggy), axis=0)

# One-hot encode. The width comes from the class list rather than from the
# labels actually present, so the matrix keeps one column per class even if
# a class has no pictures.
b = np.zeros((len(Y), n_classes))
b[np.arange(len(Y)), Y] = 1
Y = b

In [5]:
# Load every picture into one array via the project helper.
ratio = 1
channels = 3
pics = my.img_to_nparr(pic_path=pic_path,
                       img_height=576,
                       img_width=704,
                       rat=ratio,
                       ch=channels,
                       verbose=False)
# Disabled experiment: keep only the top 4/5 of every picture.
# pics = pics[:, 0:int(pics.shape[1] / 5 * 4),:,:]

# Picture dimensions, read from the first picture so that a single-picture
# dataset works too. Slicing the shape avoids assigning to `_`, which IPython
# uses for the last cell output.
image_height, image_width = pics[0].shape[:2]

n_pixels = image_height * image_width

All images to array!


## Feature extraction

In [6]:
# Feature names, in the order in which the extraction loop fills the
# columns of the design matrix X.
features = [
    "Dark channel",
    "sobel_VARsob",
    "sobel_TEN",
    "laplace_sum",
    "laplace_var",
    "pct_overexposed",
]
n_features = len(features)

In [7]:
# Set to True to recompute the design matrix from the pictures and overwrite
# the cached copy; with False the cached copy is loaded from disk.
update = False
#
if update:
    X = np.zeros((n, n_features))
    for ii in range(n):
        print(str(ii + 1) + " of " + str(n), end="\r")
        feature_list = []
        # dark channel
        dc = my.get_dark_channel(pics[ii], win=20)
        # close to 1 -> presence of fog
        feature_list.append(np.mean(dc / 255.0))

        # sobel edge filtering
        # (a stray trailing comma used to wrap the filter result in a 1-tuple)
        S = my.sobel_filter(pics[ii])
        feature_list.append(my.VARsob(S))
        feature_list.append(my.TEN(S) / n_pixels)

        # laplace
        L = my.lapalce_filter(pics[ii])
        feature_list.append(np.sum(abs(L)) / n_pixels)
        feature_list.append(np.var(abs(L)) / n_pixels)

        # pct. overexposed pixels
        feature_list.append(my.overexposed_pixels(pics[ii]) / n_pixels)

        # add to design matrix
        X[ii,:] = feature_list
    # if updated save new...
    print("Updated...")
    np.save("./../data/tmp/X.npy", X)
else:
    X = np.load("./../data/tmp/X.npy")

# A cached matrix computed for a different set of pictures or features would
# silently misalign rows and labels further down, so fail early instead.
if X.shape != (n, n_features):
    raise ValueError(
        "Feature matrix has shape {0}, expected {1}; set update = True to "
        "recompute it.".format(X.shape, (n, n_features)))

## Modelling

In [8]:
# Randomize the order of the pictures.
# The permutation is seeded so that the train/test split, which is seeded
# itself, sees the same input on every run. The seed matches the value used
# for rand_state in the split cell.
# NOTE: this cell overwrites pic_path, Y and X; re-running it without
# re-running the cells above permutes the data a second time (rows stay
# aligned, but the order differs from a fresh run).
shuffle_seed = 22
idx = np.random.RandomState(shuffle_seed).permutation(n)
#
pic_path = np.array(pic_path)[idx]
Y = Y[idx]
X = X[idx]

In [9]:
# Hold-out split configuration.
test_size = 0.3     # fraction of the samples reserved for the test set
rand_state = 22     # seed for the split; also passed to the estimators below
K = 2               # NOTE(review): not referenced in the visible cells

# Split row indices rather than the arrays themselves, so that features,
# labels and picture paths can all be divided with the same indices.
sample_idx = np.arange(n)
(idx_X_model, idx_X_test,
 idx_Y_model, idx_Y_test) = train_test_split(sample_idx, sample_idx,
                                             test_size=test_size,
                                             random_state=rand_state)

# Divide the data.
X_model = X[idx_X_model]
X_test = X[idx_X_test]
Y_model = Y[idx_Y_model]
Y_test = Y[idx_Y_test]
pic_path_model = pic_path[idx_X_model]
pic_path_test = pic_path[idx_X_test]

# Persist the training features.
np.save("./../data/tmp/X_model.npy", X_model)

split_report = "Train and val. size:\t{0}\nTest set size:\t\t{1}"
print(split_report.format(len(X_model), len(X_test)))

Train and val. size:	190
Test set size:		82


In [10]:
# cross validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
# sklearn.externals.joblib was removed from scikit-learn; prefer the
# standalone package and fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

# "Balanced" class weights: n_samples / (n_classes * samples_in_class).
# The class of a one-hot row is its argmax; argmin would count the opposite
# class in the two-class case and thereby swap the two weights.
class_counts = np.bincount(np.argmax(Y_model, 1), minlength=n_classes)
class_weight = {label: Y_model.shape[0] / (n_classes * class_counts[label])
                for label in range(n_classes)}

clf = RandomForestClassifier(random_state = rand_state,
                             n_jobs = -1,
                             class_weight = class_weight)
# clf.get_params()
# RF search grid

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' was an alias of 'sqrt' for classifiers and is rejected by recent
# scikit-learn releases, so only 'sqrt' is listed.
max_features = ['sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [11]:
# Randomized hyper-parameter search over random_grid: 100 sampled
# combinations, 5-fold cross validation, all available cores.
rf_random = RandomizedSearchCV(
    estimator=clf,
    param_distributions=random_grid,
    n_iter=100,
    cv=5,
    verbose=1,
    random_state=rand_state,
    n_jobs=-1,
)

# Set to True to actually run the search.
update = False
if not update:
    # Hard-coded parameter set, presumably the outcome of an earlier run.
    print({
        'bootstrap': True,
        'max_depth': 20,
        'max_features': 'auto',
        'min_samples_leaf': 1,
        'min_samples_split': 5,
        'n_estimators': 400,
    })
else:
    # Fit the random search model on integer class labels.
    rf_random.fit(X=X_model, y=np.argmax(Y_model, 1))
    print(rf_random.best_params_)

{'bootstrap': True, 'max_depth': 20, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 400}


In [12]:
from sklearn.model_selection import GridSearchCV
# Exhaustive grid for the second, finer search stage.
param_grid = {
    'max_depth': [2,3,4],
    'min_samples_leaf': [1,2],
    'min_samples_split': [2,3],
    'n_estimators': [150,200,250]}

# "Balanced" class weights: n_samples / (n_classes * samples_in_class).
# The class of a one-hot row is its argmax; argmin would count the opposite
# class in the two-class case and thereby swap the two weights.
class_counts = np.bincount(np.argmax(Y_model, 1), minlength=n_classes)
class_weight = {label: Y_model.shape[0] / (n_classes * class_counts[label])
                for label in range(n_classes)}

# Instantiate the grid search model.
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself is
# rejected by recent scikit-learn releases.
clf = RandomForestClassifier(random_state = rand_state,
                             n_jobs = -1,
                             bootstrap = True,
                             max_features = 'sqrt',
                             class_weight = class_weight)

grid_search = GridSearchCV(estimator = clf,
                           param_grid = param_grid,
                           cv = 5, n_jobs = -1, verbose = 1)
#
if update:
    # Fit the grid search to the data
    grid_search.fit(X_model, np.argmax(Y_model, 1))
    print(grid_search.best_params_)
else:
    # Hard-coded parameter set, presumably the outcome of an earlier run.
    print({'max_depth': 3,
           'min_samples_leaf': 1,
           'min_samples_split': 2,
           'n_estimators': 200})

# Final random forest.
# NOTE(review): max_depth, min_samples_split and n_estimators below differ
# from the parameter set printed above - confirm this is intentional.
clf = RandomForestClassifier(max_depth = 5,
                             min_samples_leaf = 1,
                             min_samples_split = 3,
                             n_estimators = 300,
                             random_state = rand_state,
                             n_jobs = -1,
                             bootstrap = True,
                             max_features = 'sqrt',
                             class_weight = class_weight)

# fit the forest and persist it for the assessment cell
clf.fit(X_model, np.argmax(Y_model, 1))
joblib.dump(clf, "./../data/tmp/clf.pkl")

{'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


['./../data/tmp/clf.pkl']

In [13]:
# Parameter grid for the k-nearest-neighbours search.
# np.linspace(..., num=1) returns only the start value, which used to leave a
# single candidate (2) for leaf_size and n_neighbors; num now spans the
# whole start..stop range.
param_grid = {
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [int(x) for x in np.linspace(start = 2, stop = 40, num = 5)],
    'n_neighbors': [int(x) for x in np.linspace(start = 2, stop = 50, num = 7)]}
# Instantiate the grid search model

neigh = KNeighborsClassifier()

grid_search = GridSearchCV(estimator = neigh,
                           param_grid = param_grid,
                           cv = 5, n_jobs = -1, verbose = 1)
#
if update:
    # Fit the grid search and report ITS best parameters (the random-forest
    # search object rf_random was printed here before).
    grid_search.fit(X_model, np.argmax(Y_model, 1))
    print(grid_search.best_params_)
else:
    # Hard-coded parameter set, presumably the outcome of an earlier run.
    # NOTE(review): that run likely predates the widened grid above; re-run
    # with update = True before relying on these values.
    print({'algorithm': 'brute',
           'leaf_size': 2,
           'n_neighbors': 2})


# fit knn and persist it for the assessment cell
neigh = KNeighborsClassifier(algorithm = 'brute',
                             leaf_size = 2,
                             n_neighbors = 2,
                             n_jobs=-1)
neigh.fit(X_model, np.argmax(Y_model, 1))
joblib.dump(neigh, "./../data/tmp/neigh.pkl")

{'algorithm': 'brute', 'leaf_size': 2, 'n_neighbors': 2}


['./../data/tmp/neigh.pkl']

In [16]:
# Model assessment: reload both persisted classifiers and report their
# performance on the held-out test set, random forest first.
clf = joblib.load("./../data/tmp/clf.pkl")
neigh = joblib.load("./../data/tmp/neigh.pkl")

for model_name, model in (("RF", clf), ("KNN", neigh)):
    print(model_name)
    pred_test = model.predict(X_test)
    performance(pred_test, Y_test)

RF
--------------------------------------------
Average for all classes
Accurcy:   0.987805
Precision: 0.987805
Recall:    0.988095
F1:        0.987803
std.

41 & 1 \\ 
0 & 40 \\ 

pct.

98 & 2 \\ 
0 & 100 \\ 


Paste into latex..


98 & 2 & 100 & 98 & 99 & 99 \\ 
0 & 100 & 98 & 100 & 99 & 99 \\ 
KNN
--------------------------------------------
Average for all classes
Accurcy:   0.963415
Precision: 0.963415
Recall:    0.963690
F1:        0.963409
std.

40 & 2 \\ 
1 & 39 \\ 

pct.

95 & 5 \\ 
2 & 98 \\ 


Paste into latex..


95 & 5 & 98 & 95 & 96 & 96 \\ 
2 & 98 & 95 & 98 & 96 & 96 \\ 
