In [1]:
import os

from functools import partial

import numpy as np
import pandas as pd

from joblib import dump
from graphviz import Source
from scipy.spatial.distance import pdist, cdist, squareform
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB, ComplementNB
from sklearn.metrics import classification_report, confusion_matrix, cohen_kappa_score, f1_score
from sklearn.model_selection import GridSearchCV

In [2]:
def PUK_kernel(X1,X2, sigma, omega):
    """Compute the kernel matrix between two arrays using the Pearson VII function-based universal kernel.
    From: @rlphilli - https://github.com/rlphilli/sklearn-PUK-kernel/blob/master/PUK_kernel.py
    """
    # Compute squared euclidean distance between each row element pair of the two matrices
    if X1 is X2 :
        kernel = squareform(pdist(X1, 'sqeuclidean'))
    else:
        kernel = cdist(X1, X2, 'sqeuclidean')

    kernel = (1 + (kernel * 4 * np.sqrt(2**(1.0/omega)-1)) / sigma**2) ** omega
    kernel = 1/kernel

    return kernel

def classify(x_train, y_train, x_test, y_test, classifier):
    if isinstance(classifier, tuple):
        clf = GridSearchCV(classifier[0], classifier[1], cv=10, n_jobs=-1)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
    else:
        clf = classifier.fit(x_train, y_train)
        y_pred = clf.predict(x_test)

    scores = {
        'accuracy_train': clf.score(x_train, y_train),
        'accuracy_test': clf.score(x_test, y_test),
        'f1_weighted': f1_score(y_test.values, y_pred, average='weighted'),
        'cohen_kappa': cohen_kappa_score(y_test, y_pred),
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        'report': classification_report(y_test, y_pred),
        'best_params': str(clf.best_params_) if isinstance(classifier, tuple) else None
    }

    return scores, clf

In [3]:
gt_path = '../data/ground_truth/outside_samples'
models_path = '../data/results/models'
scalers_path = '../data/results/scalers'
graphs_path = '../data/results/graphs'
reports_path = '../data/results/reports'
for path in [models_path, scalers_path, graphs_path, reports_path]:
    os.makedirs(path, exist_ok=True)

strata = ['residential', 'urbanreg', 'urbanirreg', 'rural', 'shanty']
cols_to_drop = ['type_full', 'type_short', 'quartier', 'pattern', 'latitude', 'longitude']
param_ranges = [10**x for x in range(-3, 3)]
puk_kernels = [partial(PUK_kernel, sigma=s, omega=o) for s in param_ranges for o in param_ranges]
classifiers = {
    'tree_2': DecisionTreeClassifier(max_depth=2),
    'tree_3': DecisionTreeClassifier(max_depth=3),
    'tree_5': DecisionTreeClassifier(max_depth=5),    
    'svm_linear': (LinearSVC(), {'C': param_ranges}),
    'svm_puk': (SVC(), {'C': param_ranges, 'kernel': puk_kernels}),
    'svm_rbf': (SVC(kernel='rbf'), {'C': param_ranges, 'gamma': param_ranges}),
    'logistic': LogisticRegressionCV(cv=10),
    'gaussian_naive_bayes': GaussianNB()
}

# Prepare results dictionary
general_fields = ['stratum'] + [f'{mode}_samples_total' for mode in ['train', 'test']]
scores_per_classifier = ['accuracy_train', 'accuracy_test', 'f1_weighted', 'cohen_kappa', 'best_params']
all_fields = general_fields + [f'{classifier}__{score}' for classifier in classifiers.keys()
                               for score in scores_per_classifier]
results = {field:[] for field in all_fields}

for stratum in strata:
    print(f'Working on {stratum}')
    df_train = {'all': pd.read_csv(f'{gt_path}/gt_{stratum}_train.csv')}
    df_test = {'all': pd.read_csv(f'{gt_path}/gt_{stratum}_test.csv')}
    
    # Drop columns and convert roof_type to binary
    for df in [df_train, df_test]:
        df['x'] = df['all'].drop(axis=1, columns=cols_to_drop)
        df['x'] = pd.get_dummies(df['x'])
        for col in df['x'].columns:
            df['x'][col] = df['x'][col].astype(np.float64)
        
        df['y'] = df['all']['type_short']
    
    # Fit scaler to **train** data, then fit both and save
    scaler = StandardScaler().fit(df_train['x'])
    for df in [df_train, df_test]:
        df['x_scaled'] = scaler.transform(df['x'])
    dump(scaler, os.path.join(scalers_path, f'{stratum}_scaler.joblib'))
    
    results['stratum'].append(stratum)
    results['train_samples_total'].append(len(df_train['all']))
    results['test_samples_total'].append(len(df_test['all']))
    
    for key, classifier in classifiers.items():
        scores, clf = classify(
            df_train['x_scaled'],
            df_train['y'],
            df_test['x_scaled'],
            df_test['y'],
            classifier
        )
        
        # Save results
        for score in scores_per_classifier:
            results[f'{key}__{score}'].append(scores[score])
            
        # Write graph for tree classifiers
        if key.startswith('tree'):
            dot_data = export_graphviz(
                clf,
                out_file=None,
                feature_names=list(df_train['x'].columns),
                class_names=list(df_train['y'].unique()),
                filled=True, rounded=True,
                special_characters=True
            )

            graph = Source(dot_data) 
            graph.render(os.path.join(graphs_path, f'{stratum}_{key}'))
        
        # Also save model and report
        dump(clf, os.path.join(models_path, f'{stratum}_{key}_model.joblib'))
        with open(os.path.join(reports_path, f'{stratum}_{key}.txt'), 'w') as report_cm:
            report_cm.write('=====================\nCLASSIFICATION REPORT\n=====================\n')
            report_cm.write(scores['report'])
            report_cm.write('\n\n================\nCONFUSION MATRIX\n================\n')
            report_cm.write(str(scores['confusion_matrix']))
            report_cm.write('\n\n===========\nCOHEN-KAPPA\n===========\n')
            report_cm.write(str(scores['cohen_kappa']))
            if scores['best_params']:
                report_cm.write('\n\n===========\nBEST PARAMS\n===========\n')
                report_cm.write(scores['best_params'])
            
                  
# Convert results dict to Dataframe and write out
results_df = pd.DataFrame.from_dict(results)
results_df.to_csv('../data/results/ground_truth_classification_summary.csv', index=False)

Working on residential
Working on urbanreg
Working on urbanirreg
Working on rural
Working on shanty


## Summaries

In [5]:
results_df.set_index('stratum', inplace=True)

In [6]:
cohen_kappa_cols = [c for c in results_df.columns if 'cohen' in c]
cohen_kappa_df = results_df[cohen_kappa_cols].copy()
cohen_kappa_df.rename(index=str, columns={c:c.replace('__cohen_kappa', '') for c in cohen_kappa_cols}, inplace=True)
cohen_kappa_df

Unnamed: 0_level_0,tree_2,tree_3,tree_5,svm_linear,svm_puk,svm_rbf,logistic,gaussian_naive_bayes
stratum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
residential,0.733333,0.733333,0.733333,0.733333,0.75,0.75,0.733333,0.733333
urbanreg,0.6225,0.635,0.63,0.6225,0.6375,0.6275,0.63,0.6225
urbanirreg,0.5975,0.5925,0.59625,0.6,0.61875,0.60375,0.6,0.59875
rural,0.433333,0.5,0.466667,0.366667,0.566667,0.466667,0.4,0.4
shanty,0.3975,0.4,0.4175,0.39125,0.485,0.46625,0.39,0.3925


In [7]:
f1_cols = [c for c in results_df.columns if 'f1' in c]
f1_df = results_df[f1_cols].copy()
f1_df.rename(index=str, columns={c:c.replace('__f1_weighted', '') for c in f1_cols}, inplace=True)
f1_df

Unnamed: 0_level_0,tree_2,tree_3,tree_5,svm_linear,svm_puk,svm_rbf,logistic,gaussian_naive_bayes
stratum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
residential,0.864253,0.864253,0.86532,0.864827,0.873941,0.873941,0.864827,0.864253
urbanreg,0.804277,0.813155,0.810294,0.804277,0.813274,0.807603,0.808444,0.804654
urbanirreg,0.790255,0.789427,0.792342,0.791775,0.805529,0.799249,0.792407,0.790961
rural,0.691936,0.733333,0.717979,0.662222,0.783273,0.732143,0.677033,0.677033
shanty,0.668682,0.680843,0.702384,0.669857,0.740217,0.729894,0.668802,0.669906


In [8]:
cohen_kappa_df.to_csv('../data/results/cohen_kappa_summary.csv')
f1_df.to_csv('../data/results/f1_summary.csv')