# How to use my code  
1. run the first block to define the functions.
2. Run the second block to check the cross-validated performance. Change the argument of "set_classifier" and the last argument of "cv_performance_assessment" to choose the classifier and the feature-extraction method you want to use.
3. Run the third block to generate the csv file to upload to Kaggle. Change the code in lines 15~21 of that block (the variables `c` and `feature`) to choose the classifier and the feature-extraction method you want to use.  
*Remember to change the directories if you put the data in a different place.

In [None]:
# -*- coding: utf-8 -*-
'''Sample script for solar array image classification

Author:       Kyle Bradbury, modified by Tzu-Chun
Date:         January 30, 2018, February 16, 2020
Organization: Duke University Energy Initiative, Duke MIDS student
'''

'''
Import the packages needed for classification
'''
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
import sklearn.metrics as metrics
plt.close()

#Install TensorFlow GPU: https://blog.quantinsti.com/install-tensorflow-gpu/

import warnings
warnings.filterwarnings("ignore")

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import svm

# keras (CNN)
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten

# HOG
from skimage.feature import hog
from skimage import data, exposure

'''
Set directory parameters
'''
# Set the directories for the data and the CSV files that contain ids/labels
# The two image directories are passed to load_data as `dir_data`; both are
# written with a trailing slash.
dir_train_images  = './training/'
dir_test_images   = './testing/'
# CSV read by load_data with training=True: its `id` and `label` columns are used.
dir_train_labels  = './labels_training.csv'
# CSV read by load_data with training=False: only its `id` column is used.
dir_test_ids      = './sample_submission.csv'

'''
Include the functions used for loading, preprocessing, features extraction, 
classification, and performance evaluation
'''

def load_data(dir_data, dir_labels, training=True):
    ''' Load each of the image files into memory

    While this is feasible with a smaller dataset, for larger datasets,
    not all the images would be able to be loaded into memory

    dir_data   = directory that holds one "<id>.tif" image per id
    dir_labels = CSV file with an "id" column (and a "label" column when
                 training=True)
    training   = when True the labels are returned, otherwise the ids

    Returns (data, labels) when training=True and (data, ids) otherwise,
    where data is a Numpy array built from the loaded images in CSV order.
    '''
    import os  # local import: only this function needs it

    labels_pd = pd.read_csv(dir_labels)
    ids       = labels_pd.id.values

    # os.path.join keeps the path valid whether or not dir_data ends with a
    # separator, and str() accepts both numeric and plain-string ids.
    data = [mpl.image.imread(os.path.join(dir_data, str(identifier) + '.tif'))
            for identifier in ids]
    data = np.array(data) # Convert to Numpy array

    if training:
        return data, labels_pd.label.values
    return data, ids

def preprocess_and_extract_features(data):
    '''Turn a stack of images into two summary features per image

    The images are averaged over axis 3 to get grayscale images, each
    grayscale image is flattened, and the mean and standard deviation of its
    pixels are returned as the two columns of the result (mean first).
    '''
    # Collapse axis 3 to obtain one grayscale image per sample
    gray = np.mean(data, axis=3)

    # One row of pixels per sample
    n_samples = gray.shape[0]
    flat = gray.reshape(n_samples, -1)

    # Per-sample statistics, stacked as columns: [mean, std]
    per_sample_stats = [np.mean(flat, axis=1), np.std(flat, axis=1)]
    return np.stack(per_sample_stats, axis=-1)

def preprocess_and_extract_features_HOG(data):
    '''Preprocess data and extract features

    1. (optional) global image normalisation
    2. computing the gradient image in x and y
    3. computing gradient histograms
    4. normalising across blocks
    5. flattening into a feature vector

    data = sequence of colour images, all of the same size

    Returns (features, hog_image):
    features  = one row per image holding its flattened HOG descriptor
    hog_image = the HOG visualisations with a trailing axis of length 1
                appended (n, rows, cols, 1)

    Raises ValueError when data contains no images.
    '''
    if len(data) == 0:
        raise ValueError('data must contain at least one image')

    # Collect the per-image results in lists and convert once at the end;
    # concatenating inside the loop copies the whole array on every image.
    descriptors = []
    visualisations = []
    for img in data:
        # Extract Histogram of Oriented Gradients (HOG) for a given image.
        # NOTE(review): newer scikit-image releases appear to replace
        # `multichannel` with `channel_axis` -- confirm against the
        # installed version.
        f, rendered = hog(img, orientations=8, pixels_per_cell=(16, 16),
                          cells_per_block=(1, 1), multichannel=True,
                          feature_vector=True, visualize=True)
        descriptors.append(f)
        visualisations.append(rendered)

    features = np.array(descriptors)
    # Add the trailing channel axis from the actual image size instead of a
    # hard-coded 101x101 reshape.
    hog_image = np.array(visualisations)[..., np.newaxis]

    return features, hog_image

def set_classifier(c = 0):
    '''Shared function to select the classifier for both performance evaluation
    and testing

    c = 0: k-nearest-neighbours classifier with 7 neighbours
    c = 1: small convolutional network (Keras Sequential) that takes
           101x101x1 inputs and ends in a 2-unit softmax layer
    c = 2: support vector classifier with probability estimates enabled

    Returns a new, unfitted classifier.
    Raises ValueError for any other value of c, instead of silently
    returning None.
    '''
    if c == 0:
        return KNeighborsClassifier(n_neighbors=7)
    if c == 1:
        model = Sequential()
        #add model layers
        model.add(Conv2D(64, kernel_size=3, activation='relu', input_shape=(101,101,1)))
        model.add(Conv2D(32, kernel_size=3, activation='relu'))
        model.add(Flatten())
        model.add(Dense(2, activation='softmax'))
        #compile model using accuracy to measure model performance
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        return model
    if c == 2:
        return svm.SVC(probability = True)
    raise ValueError('unknown classifier id {!r}; expected 0 (KNN), 1 (CNN) or 2 (SVC)'.format(c))

def _extract_fold_features(X, f, as_images):
    '''Return the representation of the images X for one fold.

    as_images=True yields image-shaped inputs (n, rows, cols, 1) for the CNN,
    as_images=False yields one feature vector per image.
    '''
    if f == 1:
        vectors, images = preprocess_and_extract_features_HOG(X)
        return images if as_images else vectors
    if as_images:
        # Grayscale images with a trailing channel axis, the same input the
        # submission script feeds the CNN when feature == 0.
        return np.mean(X, axis=3)[..., np.newaxis]
    return preprocess_and_extract_features(X)


def cv_performance_assessment(X,y,k,clf, f=0):
    '''Cross validated performance assessment

    X   = training data
    y   = training labels
    k   = number of folds for cross validation
    clf = classifier to use
    f = feature extract method to use (0: mean&std, 1: HOG)

    Divide the training data into k folds of training and validation data.
    For each fold the classifier will be trained on the training data and
    tested on the validation data. The classifier prediction scores are
    aggregated and output

    Returns a float array with one score per sample: column 1 of the
    predicted class probabilities.
    Raises ValueError when f is not 0 or 1.
    '''
    if f not in (0, 1):
        raise ValueError('unknown feature method {!r}; expected 0 (mean&std) or 1 (HOG)'.format(f))

    is_cnn = isinstance(clf, Sequential)
    if is_cnn:
        # Local import keeps this block self-contained.
        from keras.utils import to_categorical
        # Remember the untrained weights so every fold starts from the same
        # state instead of continuing from the previous fold's training.
        # NOTE(review): optimizer state is presumably not reset by
        # set_weights -- confirm if exact fold independence matters.
        initial_weights = clf.get_weights()

    # Establish the k folds
    prediction_scores = np.empty(y.shape[0], dtype=float)
    kf = StratifiedKFold(n_splits=k, shuffle=True)
    for train_index, val_index in kf.split(X, y):
        # Extract the training and validation data for this fold
        X_train, X_val   = X[train_index], X[val_index]
        y_train          = y[train_index]

        X_train_features = _extract_fold_features(X_train, f, is_cnn)
        X_val_features   = _extract_fold_features(X_val, f, is_cnn)

        # Train and test the classifier on the validation data for this fold
        if is_cnn:
            clf.set_weights(initial_weights)
            # Keras fit() returns a History object, so its result must not be
            # assigned back to clf. Labels are one-hot encoded as in the
            # submission script (assumes the model ends in a softmax layer
            # trained with categorical cross-entropy -- confirm for custom
            # models).
            clf.fit(X_train_features, to_categorical(y_train))
            cpred = clf.predict(X_val_features)
        else:
            clf.fit(X_train_features, y_train)
            cpred = clf.predict_proba(X_val_features)

        # Save the predictions for this fold
        prediction_scores[val_index] = cpred[:,1]
    return prediction_scores

def plot_roc(labels, prediction_scores):
    '''Plot the ROC curve for the given scores on the current figure

    labels            = true labels, with 1 as the positive class
    prediction_scores = one score per sample

    The curve is labelled with its AUC and drawn next to a dashed diagonal
    "Chance" line. Nothing is returned.
    '''
    # Scores may arrive as an object-dtype array; cast once so the metric
    # functions receive plain floats.
    scores = np.asarray(prediction_scores, dtype=float)

    fpr, tpr, _ = metrics.roc_curve(labels, scores, pos_label=1)
    auc = metrics.roc_auc_score(labels, scores)
    legend_string = 'AUC = {:0.3f}'.format(auc)

    plt.plot([0,1],[0,1],'--', color='gray', label='Chance')
    plt.plot(fpr, tpr, label=legend_string)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # Pass a real boolean; the string 'on' only worked because it is truthy.
    plt.grid(True)
    plt.axis('square')
    plt.legend()
    plt.tight_layout()

In [None]:
'''
Sample script for cross validated performance
'''
# Set parameters for the analysis
# Number of folds passed to cv_performance_assessment as `k`
num_training_folds = 3

# Load the data
# NOTE: `data` rebinds the module-level name created by
# `from skimage import data` in the first block.
data, labels = load_data(dir_train_images, dir_train_labels, training=True)

# Choose which classifier to use
# 0: KNN, 1: CNN, 2: SVC
clf = set_classifier(2)

# Perform cross validated performance assessment
# 0: mean&std, 1: HOG
# The last argument selects the feature extraction method (`f`).
prediction_scores = cv_performance_assessment(data,labels,num_training_folds,clf,1)

# Compute and plot the ROC curves
plot_roc(labels, prediction_scores)


In [None]:
'''
Sample script for producing a Kaggle submission
'''

produce_submission = True # Set this to False to skip creating a submission for Kaggle

if produce_submission:
    # Load data
    training_data, training_labels = load_data(dir_train_images, dir_train_labels, training=True)
    test_data, ids = load_data(dir_test_images, dir_test_ids, training=False)

    # The two settings below select the model and the feature extraction;
    # unsupported values raise a ValueError before any training starts.

    # choose model
    # 0: KNN, 1: CNN, 2: SVC
    c = 0

    # choose the feature extraction
    #0: mean&std, 1: HOG
    feature = 1

    if c not in (0, 1, 2):
        raise ValueError('unknown classifier id {!r}; expected 0, 1 or 2'.format(c))
    if feature not in (0, 1):
        raise ValueError('unknown feature method {!r}; expected 0 or 1'.format(feature))

    # The CNN consumes image-shaped inputs, the other models feature vectors
    uses_images = (c == 1)

    # base on the model type to decide the method to extract features
    if feature == 1:
        training_vectors, training_images = preprocess_and_extract_features_HOG(training_data)
        test_vectors, test_images = preprocess_and_extract_features_HOG(test_data)
        training_features = training_images if uses_images else training_vectors
        test_features = test_images if uses_images else test_vectors
    elif uses_images:
        # Grayscale images with a trailing channel axis; derived from the
        # data instead of a hard-coded 101x101 reshape.
        training_features = np.mean(training_data, axis=3)[..., np.newaxis]
        test_features = np.mean(test_data, axis=3)[..., np.newaxis]
    else:
        training_features = preprocess_and_extract_features(training_data)
        test_features = preprocess_and_extract_features(test_data)

    clf = set_classifier(c)
    if uses_images:
        # The CNN is trained on one-hot labels and predict() returns one
        # column per class; column 1 is kept as the score.
        training_labels = to_categorical(training_labels)
        clf.fit(training_features, training_labels, epochs=3)
        test_scores = clf.predict(test_features)[:,1]
    else:
        clf.fit(training_features, training_labels)
        test_scores = clf.predict_proba(test_features)[:,1]

    # Save the predictions to a CSV file for upload to Kaggle
    submission_file = pd.DataFrame({'id':    ids,
                                    'score': test_scores})
    submission_file.to_csv('submission.csv',
                           columns=['id','score'],
                           index=False)