# Classify single image patches

The results here are not expected to be very precise; this notebook is more of a model structure search.

In [1]:
import os
import sys
import numpy as np
import pandas as pd

import skelm

from matplotlib import pyplot as plt
import seaborn as sn
%matplotlib inline

In [2]:
from multiprocessing import Pool

In [3]:
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [4]:
from sklearn.model_selection import GroupKFold, cross_val_score, GridSearchCV, GroupShuffleSplit, RandomizedSearchCV
import scipy
from sklearn.utils.fixes import loguniform
from time import time

In [5]:
# Dataset configuration shared by the loading code below.
MAX_SAMPLES_PER_FILE = 30  # rows sampled from each feature file in load_user()
overlap = '50p'            # suffix of the "overlap..." feature folders
data_path = "".join(("/scratch/project_2001220/leonardo/00_Features/GPDSS10000/folder2/004/overlap", overlap))

# 

## Load data

In [6]:
def load_user(fname):
    """Load a random sample of feature rows for the user that owns `fname`.

    Every ``*_features.parquet.gz`` file found under the ``overlap<overlap>``
    folder next to `fname` is read, reduced to at most
    ``MAX_SAMPLES_PER_FILE`` randomly chosen rows, and given a trailing
    ``cf`` column that is 1 when the file path contains ``"/cf-"`` and 0
    otherwise.

    Parameters
    ----------
    fname : str
        Path of a file inside the user's folder; only its directory is used.

    Returns
    -------
    numpy.ndarray
        float32 array holding the feature columns followed by the ``cf``
        column, one row per sampled patch.

    Raises
    ------
    FileNotFoundError
        If no feature file is found for this user.
    """
    feature_dir = os.path.dirname(fname) + "/overlap" + overlap

    # Collect matches from *every* directory visited by os.walk (the list used
    # to be overwritten on each iteration, keeping only the last directory),
    # and sort them so the file order does not depend on the file system.
    pq_files = sorted(
        os.path.join(root, f)
        for root, _, files in os.walk(feature_dir)
        for f in files
        if f.endswith("_features.parquet.gz")
    )
    if not pq_files:
        raise FileNotFoundError("no *_features.parquet.gz files under " + feature_dir)

    def _load(path):
        # Keep the columns from '0' onwards.
        df = pd.read_parquet(path).loc[:, '0':]
        # sample() raises when asked for more rows than the file holds.
        df = df.sample(min(MAX_SAMPLES_PER_FILE, len(df)))
        df["cf"] = int("/cf-" in path)
        return df

    data = pd.concat([_load(a) for a in pq_files])
    return np.array(data, dtype=np.float32)

In [7]:
# Table of data files; its `file` column supplies the paths passed to load_user().
df_files = pd.read_pickle("df_data.pkl.gz")

In [8]:
# Load the first 8 entries of df_files.file with 8 worker processes,
# then stack the per-user arrays row-wise.
with Pool(8) as workers:
    per_user = workers.map(load_user, df_files.file[:8])
data = np.vstack(per_user)

In [9]:
# (sampled rows, feature columns + 1 trailing label column)
data.shape

(12960, 1025)

In [10]:
# Size of a float32 copy of the data, in MiB (bytes / 2**20)
sys.getsizeof(np.array(data, dtype=np.float32)) / 2**20

50.67455291748047

In [14]:
# The last column is the label appended by load_user(); the rest are features.
X, y = data[:, :-1], data[:, -1]

In [15]:
# Centre every feature column on zero. On a NumPy array a bare X.mean() is a
# single scalar over all elements, unlike the per-column DataFrame.mean()
# used for the final experiments, so the axis is given explicitly.
X = X - X.mean(axis=0)

In [17]:
# Divide every feature column by its own standard deviation, floored at 0.5
# so near-constant features are not blown up. A bare X.std() on a NumPy array
# is one scalar over all elements, which makes the floor meaningless.
X = X / X.std(axis=0).clip(min=0.5)

In [19]:
# Limit the value range to [-5, 5]
X = np.clip(X, -5, 5)

In [22]:
# Inputs for the model search below; `y` is already bound, so only the
# features need an alias.
x = X
# NOTE(review): `groups` is not defined in this section, yet the
# GroupShuffleSplit cell further down uses it — confirm where `fid` should come from.
# groups = fid

#  

## Run ELM

In [23]:
# Distributions sampled by RandomizedSearchCV. The commented-out entries are
# further options that are currently excluded from the search.
params = dict(
    alpha=loguniform(1e-3, 1e+1),
    # 'ufunc': ['tanh', 'sigm', 'relu', 'lin'],
    # 'pairwise_metric': [None, 'euclidean', 'cityblock', 'cosine'],
    # 'density': loguniform(0.001, 1),
    n_neurons=loguniform(1000, 10000),
)

In [24]:
# Repeated randomized searches; each run's cv_results_ table is pickled to
# its own file. Create the output folder first so a long search cannot fail
# at the very end because the folder is missing.
os.makedirs("res_v2", exist_ok=True)

for i in range(2, 5):  #100000
    rcv = RandomizedSearchCV(skelm.ELMClassifier(), params, n_iter=100, scoring='accuracy', cv=3, refit=False)
    res = rcv.fit(x, y)
    pd.DataFrame(res.cv_results_).to_pickle("res_v2/res_v2_{}.pkl".format(i))

ValueError: The key pairwise is not defined in _get_tags() for the class ELMClassifier.

## 

## Check results

In [None]:
# Gather the per-run result tables saved by the search loop into one frame.
res_paths = [p for p in ("res_v2/res_v2_{}.pkl".format(i) for i in range(9999)) if os.path.isfile(p)]
if not res_paths:
    raise FileNotFoundError("no res_v2/res_v2_<i>.pkl result files found")
Z = pd.concat((pd.read_pickle(p) for p in res_paths), ignore_index=True)
# Runs whose search space did not include these parameters have no such
# columns, so missing ones are ignored instead of raising KeyError.
Z = Z.drop(columns=["params", "param_pairwise_metric", "param_ufunc"], errors="ignore")

# Bin the sampled hyper-parameters so scores can be averaged per grid cell:
# neurons in steps of 50, density in steps of 0.1, alpha in half-decades of log10.
Z['param_n_neurons'] = Z['param_n_neurons'].apply(lambda a : a // 50 * 50)
if 'param_density' in Z:
    Z['param_density'] = Z['param_density'].apply(lambda a : (a + 0.05)//0.1 * 0.1)
Z['param_alpha'] = Z['param_alpha'].apply(lambda a : np.log10(a)//0.5 * 0.5)

In [None]:
# Mean CV accuracy for every (n_neurons bin, alpha bin) cell.
plt.figure(figsize=(8, 10))
P_10k = Z.pivot_table(values="mean_test_score", index='param_n_neurons', columns='param_alpha')
# sn.heatmap(P, vmin=0.61)

In [None]:
# Gather the per-run result tables of the "n100000" searches into one frame.
fname0 = "res_v2/res_v2_{}_n100000.pkl"
res_paths = [p for p in (fname0.format(i) for i in range(9999)) if os.path.isfile(p)]
if not res_paths:
    raise FileNotFoundError("no result files matching " + fname0.format("<i>"))
Z = pd.concat((pd.read_pickle(p) for p in res_paths), ignore_index=True)
# Runs whose search space did not include these parameters have no such
# columns, so missing ones are ignored instead of raising KeyError.
Z = Z.drop(columns=["params", "param_pairwise_metric", "param_ufunc"], errors="ignore")

# Bin the sampled hyper-parameters so scores can be averaged per grid cell:
# neurons in steps of 50, density in steps of 0.1, alpha in half-decades of log10.
Z['param_n_neurons'] = Z['param_n_neurons'].apply(lambda a : a // 50 * 50)
if 'param_density' in Z:
    Z['param_density'] = Z['param_density'].apply(lambda a : (a + 0.05)//0.1 * 0.1)
Z['param_alpha'] = Z['param_alpha'].apply(lambda a : np.log10(a)//0.5 * 0.5)

In [None]:
# Heat map of mean CV accuracy over (n_neurons bin, alpha bin).
plt.figure(figsize=(8, 10))
P = Z.pivot_table(values="mean_test_score", index='param_n_neurons', columns='param_alpha')
sn.heatmap(P, vmin=0.64)

## Performance vs. number of training samples (best model)

In [None]:
# Single train/test split that keeps all rows of a group on the same side.
splitter = GroupShuffleSplit()
train_idx, test_idx = next(splitter.split(x, y, groups))
xt, yt = x[train_idx], y[train_idx]
xs, ys = x[test_idx], y[test_idx]

In [None]:
# Learning curve: refit the model on growing prefixes of the training split
# (30 log-spaced sizes from 1000 up to the full split) and score every fit
# on the held-out split.
data = []
model = skelm.ELMClassifier(n_neurons=2000, alpha=0.3, batch_size=5000)

for j in np.logspace(3, np.log10(xt.shape[0]), num=30):
    jj = int(j)
    print(jj, end=' ... ')
    t = time()
    # Train on the training split only: slicing the full `x`/`y` would feed
    # held-out rows (xs, ys) into the fit and inflate the score.
    score = model.fit(xt[:jj], yt[:jj]).score(xs, ys)
    t = time() - t
    # Record the number of samples actually used (the truncated integer).
    data.append({'N': jj, 'score': score})
    print("{:.3f} : {:.1f}s".format(score, t))

In [None]:
# Held-out score against the number of training samples (log-scaled x axis)
Z = pd.DataFrame(data)
Z.plot(x='N', y='score', logx=True, figsize=(15, 7), grid=True)

In [None]:
# Learning curve: refit the model on growing prefixes of the training split
# (30 log-spaced sizes from 1000 up to the full split) and score every fit
# on the held-out split.
data = []
model = skelm.ELMClassifier(n_neurons=2000, alpha=0.3, batch_size=5000)

for j in np.logspace(3, np.log10(xt.shape[0]), num=30):
    jj = int(j)
    print(jj, end=' ... ')
    t = time()
    # Train on the training split only: slicing the full `x`/`y` would feed
    # held-out rows (xs, ys) into the fit and inflate the score.
    score = model.fit(xt[:jj], yt[:jj]).score(xs, ys)
    t = time() - t
    # Record the number of samples actually used (the truncated integer).
    data.append({'N': jj, 'score': score})
    print("{:.3f} : {:.1f}s".format(score, t))

In [None]:
# Held-out score against the number of training samples (log-scaled x axis)
Z = pd.DataFrame(data)
Z.plot(x='N', y='score', logx=True, figsize=(15, 7), grid=True)

#  

## Tune best parameters for limited-sample models

In [None]:
# Target sample counts, each scaled up by the factors 30/27 and 3/2.
base_sizes = (4110, 5700, 10890)
Ns = list(map(lambda n: int(n * (30/27) * (3/2)), base_sizes))

In [None]:
# Search space for the limited-sample models.
params = dict(
    alpha=loguniform(1e+1, 1e+4),
    n_neurons=loguniform(2000, 10000),
)

In [None]:
# One list of cv_results_ tables per training-set size in Ns. No cell in the
# visible notebook creates these lists, so a fresh kernel failed with a
# NameError here. Drop this line when resuming from previously loaded results.
res1, res2, res3 = [], [], []

for i in range(1, 500):
    for n1, res in zip(Ns, (res1, res2, res3)):
        # 20 random configurations, 3-fold CV on the first n1 samples
        rcv = RandomizedSearchCV(skelm.ELMClassifier(batch_size=10000), params, n_iter=20, scoring='accuracy', cv=3, refit=False)
        res.append(pd.DataFrame(rcv.fit(x[:n1], y[:n1]).cv_results_))
        print("{}:{}".format(i, n1//1000), end=' ')

In [None]:
import pickle

In [None]:
# Persist the three result lists together in one pickle file.
results = {"res1": res1, "res2": res2, "res3": res3}
with open("res123_current.pkl", "wb") as out_file:
    pickle.dump(results, out_file)

In [None]:
# One heat map of mean CV score per result list, binned over
# (neurons on a log2 grid with step 0.15, log10(alpha) in steps of 0.2).
fig, axes = plt.subplots(1, 3, sharey=True, figsize=(18, 8))#, gridspec_kw={'hspace': 0, 'wspace': 0.3})
fig.suptitle('Horizontally stacked subplots')

panel_titles = [4110, 5700, 10890]
for title, res, ax in zip(panel_titles, [res1, res2, res3], axes):
    Z = pd.concat(res)
    neuron_bins = Z['param_n_neurons'].apply(lambda a : 2 ** (np.log2(a) // 0.15 * 0.15))
    Z['neurons'] = neuron_bins.astype(int)
    alpha_bins = Z['param_alpha'].apply(lambda a : np.log10(a)//0.2 * 0.2)
    Z['alpha'] = alpha_bins.apply(lambda a: np.round(a, 1))
    P = pd.pivot_table(Z, index='neurons', columns='alpha', values="mean_test_score")
    sn.heatmap(P, ax=ax, square=True, cbar=False, vmin=0.6)
    ax.set_title(title)

In [None]:
# Same heat maps as before, restricted to 0.5 < alpha bin < 4.1 and
# more than 500 neurons.
fig, axes = plt.subplots(1, 3, sharey=True, figsize=(18, 8))#, gridspec_kw={'hspace': 0, 'wspace': 0.3})
fig.suptitle('Horizontally stacked subplots')

for idx, (res, ax) in enumerate(zip([res1, res2, res3], axes)):
    Z = pd.concat(res)
    neuron_bins = Z['param_n_neurons'].apply(lambda a : 2 ** (np.log2(a) // 0.15 * 0.15))
    Z['neurons'] = neuron_bins.astype(int)
    alpha_bins = Z['param_alpha'].apply(lambda a : np.log10(a)//0.2 * 0.2)
    Z['alpha'] = alpha_bins.apply(lambda a: np.round(a, 1))
    keep = (Z.alpha > 0.5) & (Z.alpha < 4.1) & (Z.neurons > 500)
    P = pd.pivot_table(Z[keep], index='neurons', columns='alpha', values="mean_test_score")
    sn.heatmap(P, ax=ax, square=True, cbar=False, vmin=0.6)
    ax.set_title([4110, 5700, 10890][idx])

In [None]:
# Heat maps of mean CV score per result list, with neurons binned in steps
# of 100 and alpha in half-decades of log10.
fig, axes = plt.subplots(1, 3, sharey=True, figsize=(12, 8))#, gridspec_kw={'hspace': 0, 'wspace': 0.3})
fig.suptitle('Horizontally stacked subplots')

for k, (res, ax) in enumerate(zip([res1, res2, res3], axes), start=1):
    Z = pd.concat(res)
    # `np.int` was only an alias of the builtin `int` and was removed in
    # NumPy 1.24, so the builtin is used directly.
    Z['neurons'] = Z['param_n_neurons'].apply(lambda a : a // 100 * 100).astype(int)
    Z['alpha'] = Z['param_alpha'].apply(lambda a : np.log10(a)//0.5 * 0.5)
    P = pd.pivot_table(Z, index='neurons', columns='alpha', values="mean_test_score")
    # draw the colour bar only on the third (last) panel
    sn.heatmap(P, ax=ax, square=False, cbar=k==3)

#  

## Final experiments

In [None]:
# Table used for the final experiments (absolute, machine-specific path)
X = pd.read_pickle("/home/akusok/HDD2TB/MCYTD_10p_n100.pkl")

In [None]:
# One-hot encode the user id and put it in front of the remaining columns,
# leaving out the listed bookkeeping columns and the raw `uid`.
user_onehot = pd.get_dummies(X.uid, prefix='user')
remaining = X.drop(columns=['wsize', 'overlap', 'xmin', 'ymin', 'xmax', 'ymax', 'uid'])
Z = pd.concat((user_onehot, remaining), axis=1)

In [None]:
# Centre each feature column ('0'..'1023') on its own mean
feats = Z.loc[:, '0':'1023']
Z.loc[:, '0':'1023'] = feats - feats.mean()

In [None]:
# Divide each feature column by its standard deviation; the deviation is
# floored at 0.5 so low-variance features do not become excessively large.
feats = Z.loc[:, '0':'1023']
Z.loc[:, '0':'1023'] = feats / feats.std().clip(lower=0.5)

In [None]:
# Restrict every feature value to the range [-5, 5]
feats = Z.loc[:, '0':'1023']
Z.loc[:, '0':'1023'] = feats.clip(lower=-5.0, upper=5.0)

In [None]:
# Replace the index with a fresh 0..n-1 range, discarding the old one
Z = Z.reset_index(drop=True)

In [None]:
# Targets, CV groups, and the input matrix (everything except the target and
# group columns).
groups = Z.fid
y = np.array(Z.sig_true)
x = np.array(Z.drop(columns=['sig_true', 'fid']))

In [None]:
from sklearn.model_selection import cross_val_predict

In [None]:
# ELM classifier with fixed hyper-parameters for the final cross-validated run
model = skelm.ELMClassifier(alpha=10, n_neurons=1000, ufunc='sigm', density=0.02)

In [None]:
# Out-of-fold predictions from 10-fold CV that keeps each `fid` group in one fold
group_cv = GroupKFold(n_splits=10)
cvp = cross_val_predict(model, x, y, groups=groups, cv=group_cv, n_jobs=3, pre_dispatch=3)

In [None]:
# Store the cross-validated predictions as a .npy file
np.save("/home/akusok/HDD2TB/MCYTD_10p_n100-predict.npy", cvp)