# Classify single image patches

Not expected to be very precise - more of a model structure search

In [1]:
import os

# Cap the MKL / numexpr / OpenMP thread pools at 4 threads each.
# os.system("export ...") runs the export in a short-lived child shell, so the
# variable never reaches this Python process. Writing to os.environ changes the
# environment of the running process instead. Keep this cell ahead of the numpy
# import: these libraries typically read the variables when first loaded.
os.environ["MKL_NUM_THREADS"] = "4"
os.environ["NUMEXPR_NUM_THREADS"] = "4"
os.environ["OMP_NUM_THREADS"] = "4"

0

In [2]:
import os
import sys
import numpy as np
import pandas as pd
from pyarrow import parquet

import skelm

from matplotlib import pyplot as plt, rcParams
%matplotlib inline
rcParams['figure.figsize'] = (15, 8)

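# NOTE: the heatmap cells further down call `sn.heatmap`, so they need seaborn
# imported under the alias `sn` (not `sns`); with the import commented out
# those cells raise NameError.
# import seaborn as sns
# sns.set(rc={'figure.figsize':(15, 8)})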

In [3]:
from multiprocessing import Pool

In [4]:
# Silence sklearn's DataConversionWarning for the rest of the session.
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [5]:
from sklearn.model_selection import GroupKFold, cross_val_score, GridSearchCV, GroupShuffleSplit, RandomizedSearchCV
import scipy
from sklearn.utils.fixes import loguniform
from time import time

In [6]:
# Suffix of the feature directory name ("overlap" + overlap) read by load_user.
overlap = '50p'
# Number of columns sampled at random from each feature file in load_user.
MAX_SAMPLES_PER_FILE = 30
# data_path = "/scratch/project_2001220/leonardo/00_Features/GPDSS10000/folder2/004/overlap" + overlap

# 

## Load data

In [7]:
def load_user(fname):
    """Load a random sample of patch features from the feature files beside `fname`.

    Walks the ``overlap<overlap>`` directory located next to `fname`, reads every
    ``*_features.parquet.gz`` file found in it (including sub-directories) and
    draws ``MAX_SAMPLES_PER_FILE`` random columns, without replacement, from
    each file.

    Parameters
    ----------
    fname : str
        Path to a file; only its directory part is used.

    Returns
    -------
    numpy.ndarray
        float32 array with ``MAX_SAMPLES_PER_FILE`` rows per feature file. The
        last column is the label: 1.0 when the feature file's path contains
        "/cf-", 0.0 otherwise.

    Raises
    ------
    FileNotFoundError
        If no feature files are found under the overlap directory.
    """
    # os.path.join keeps the lookup relative when `fname` has no directory part;
    # plain string concatenation would produce the absolute path "/overlap...".
    feature_dir = os.path.join(os.path.dirname(fname), "overlap" + overlap)

    # Accumulate matches across every directory the walk visits. Rebinding the
    # list on each step would keep only the last directory's files and leave
    # the name unbound when the walk yields nothing.
    pq_files = []
    for root, _, files in os.walk(feature_dir):
        pq_files.extend(os.path.join(root, f) for f in files if f.endswith("_features.parquet.gz"))
    if not pq_files:
        raise FileNotFoundError(
            "no *_features.parquet.gz files found under {!r}".format(feature_dir)
        )

    def _load(path):
        # Skip the first six entries along axis 0 -- presumably non-feature
        # metadata; TODO confirm against the parquet schema.
        X = np.array(parquet.read_table(path, use_threads=False))[6:]
        # Random subset of columns, transposed so each sampled column becomes a row.
        X = X[:, np.random.choice(X.shape[1], MAX_SAMPLES_PER_FILE, replace=False)].T
        # One label per sampled row, derived from the file path.
        y = np.ones((MAX_SAMPLES_PER_FILE, 1), np.float32) * int("/cf-" in path)
        return X, y

    print(".", end="")  # one progress dot per call
    data = [_load(a) for a in pq_files]
    X = np.vstack([d[0] for d in data]).astype(np.float32)
    y = np.vstack([d[1] for d in data])
    return np.hstack((X, y))

In [8]:
# Load the pickled file table. Unpickling runs arbitrary code, so only load trusted files.
df_files = pd.read_pickle("df_data.pkl.gz")

In [9]:
# Load the pre-built sample matrix; the last column is split off as the target below.
data = np.load("data_200k.npy")

In [10]:
# Show the dimensions of the loaded array.
data.shape

(207360, 1025)

In [11]:
# Inspect the memory-layout flags (contiguity, ownership, writeability).
data.flags

  C_CONTIGUOUS : False
  F_CONTIGUOUS : True
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [12]:
# Size in MiB of the data held as float32. np.array(...) builds a fresh copy,
# so this briefly allocates a second full-size array just to measure it.
sys.getsizeof(np.array(data, dtype=np.float32)) / 2**20

810.791130065918

In [13]:
%%time
# Features are all columns but the last; the last column is the target.
# Both are basic slices of `data`, so in-place edits to `x` also modify `data`.
x = data[:, :-1]
y = data[:, -1]

CPU times: user 10 µs, sys: 1 µs, total: 11 µs
Wall time: 12.9 µs


In [14]:
%%time
# Normalize data in a low-memory way: every feature column is centered and
# scaled in place, so no second full-size array is allocated.

m = x.mean(axis=0)
x -= m
l = x.shape[0]
# Population standard deviation of the centered data, computed one column at a
# time so the temporary `col**2` stays small.
s = np.array([(np.sum(col**2)/l)**0.5 for col in x.T])
# A constant column has zero spread; dividing by it would fill the column with
# NaN. Leave such columns at zero instead.
s[s == 0] = 1
x /= s

CPU times: user 718 ms, sys: 19.7 ms, total: 737 ms
Wall time: 745 ms


In [15]:
%%time
# Sanity check: per-column mean and standard deviation after normalization.
x.mean(0), x.std(0)

CPU times: user 602 ms, sys: 262 ms, total: 864 ms
Wall time: 869 ms


(array([-3.06963926e-07, -1.04492095e-08,  4.19440083e-09, ...,
        -1.83964954e-09, -3.94052933e-08,  7.86634118e-08], dtype=float32),
 array([1.        , 1.        , 1.        , ..., 0.9999999 , 0.99999994,
        1.        ], dtype=float32))

In [16]:
%%time
# Rebind x and y to C-contiguous arrays (a copy is made when the input is not
# already C-contiguous).
x = np.ascontiguousarray(x)
y = np.ascontiguousarray(y)

CPU times: user 701 ms, sys: 235 ms, total: 936 ms
Wall time: 945 ms


In [18]:
# Print the BLAS/LAPACK libraries this numpy build is linked against.
np.show_config()

blas_mkl_info:
    libraries = ['mkl_rt', 'pthread']
    library_dirs = ['/Users/akusok/miniconda3/envs/signatures/lib']
    define_macros = [('SCIPY_MKL_H', None), ('HAVE_CBLAS', None)]
    include_dirs = ['/Users/akusok/miniconda3/envs/signatures/include']
blas_opt_info:
    libraries = ['mkl_rt', 'pthread']
    library_dirs = ['/Users/akusok/miniconda3/envs/signatures/lib']
    define_macros = [('SCIPY_MKL_H', None), ('HAVE_CBLAS', None)]
    include_dirs = ['/Users/akusok/miniconda3/envs/signatures/include']
lapack_mkl_info:
    libraries = ['mkl_rt', 'pthread']
    library_dirs = ['/Users/akusok/miniconda3/envs/signatures/lib']
    define_macros = [('SCIPY_MKL_H', None), ('HAVE_CBLAS', None)]
    include_dirs = ['/Users/akusok/miniconda3/envs/signatures/include']
lapack_opt_info:
    libraries = ['mkl_rt', 'pthread']
    library_dirs = ['/Users/akusok/miniconda3/envs/signatures/lib']
    define_macros = [('SCIPY_MKL_H', None), ('HAVE_CBLAS', None)]
    include_dirs = ['/Users/akus

#  

## Run ELM

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Hold out 10% of the rows for validation. No random_state is passed, so the
# split differs from run to run.
xt, xv, yt, yv = train_test_split(x, y, test_size=0.1)

In [52]:
%%time
# Repeats the 90/10 split from the previous cell (unseeded, so a different
# shuffle each run); the recorded run of this cell was interrupted.
xt, xv, yt, yv = train_test_split(x, y, test_size=0.1)

KeyboardInterrupt: 

In [24]:
# Score on the validation split (presumably mean accuracy, per the sklearn
# classifier convention).
# NOTE(review): `model` is not assigned in any earlier cell shown here -- the
# cell that built and fitted it appears to be missing, so this fails on a
# fresh kernel.
model.score(xv, yv)

0.5670331790123457

## Run ELM Optimization

ELM itself can run on my laptop, and evaluate all results on the laptop.

ELM *Optimization* is what runs in the cloud: it gathers statistics from a large number of ELM trials with different random combinations of parameters. Tighter parameter ranges and good combinations are discovered through this random search process, iteration by iteration.

In [None]:
# Search space for the randomized ELM hyperparameter search: numeric ranges are
# log-uniform distributions, categorical options are given as plain lists.
params = dict(
    alpha=loguniform(1e-2, 1e+2),
    ufunc=['tanh', 'sigm', 'relu', 'lin'],
    pairwise_metric=[None, 'euclidean', 'cityblock', 'cosine'],
    density=loguniform(0.001, 1),
    n_neurons=loguniform(10, 10000),
)

In [None]:
# Small trial run of the random search: 4 sampled parameter sets, 3-fold CV,
# verbose progress, train scores recorded, no final refit.
rcv = RandomizedSearchCV(
    skelm.ELMClassifier(), params, n_iter=4, scoring='accuracy', cv=3, refit=False,
    verbose=3, n_jobs=None, return_train_score=True
)
rcv.fit(x, y)

In [None]:
# Tabulate the per-candidate cross-validation results of the search above.
pd.DataFrame(rcv.cv_results_)

## actually run the ELM...

In [None]:
# Full-size random search: 100 sampled parameter sets, 3-fold CV, no final refit.
rcv = RandomizedSearchCV(skelm.ELMClassifier(), params, n_iter=100, scoring='accuracy', cv=3, refit=False)
rcv.fit(x, y)

In [None]:
# Repeat the 100-candidate random search and store each batch of CV results in
# its own numbered pickle, so batches can be pooled later.
# to_pickle does not create missing directories, so make sure the target exists
# before spending time on a search whose results could not be saved.
os.makedirs("res_v2", exist_ok=True)
for i in range(2, 5):  #100000
    rcv = RandomizedSearchCV(skelm.ELMClassifier(), params, n_iter=100, scoring='accuracy', cv=3, refit=False)
    res = rcv.fit(x, y)
    pd.DataFrame(res.cv_results_).to_pickle("res_v2/res_v2_{}.pkl".format(i))

## 

## Check results

In [None]:
# Pool every saved batch of search results; batch numbers without a file are skipped.
Z = pd.concat(
    pd.read_pickle(path)
    for path in map("res_v2/res_v2_{}.pkl".format, range(9999))
    if os.path.isfile(path)
)
Z = Z.reset_index(drop=True).drop(columns=["params", "param_pairwise_metric", "param_ufunc"])

# Snap the sampled values onto a coarse grid so they can serve as pivot keys:
# n_neurons down to a multiple of 50, density to the nearest 0.1, and alpha to
# half-steps of log10(alpha).
Z = Z.assign(
    param_n_neurons=Z['param_n_neurons'].apply(lambda a: (a // 50) * 50),
    param_density=Z['param_density'].apply(lambda a: ((a + 0.05) // 0.1) * 0.1),
    param_alpha=Z['param_alpha'].apply(lambda a: (np.log10(a) // 0.5) * 0.5),
)

In [None]:
# Mean CV test score for each (n_neurons bin, alpha bin) pair.
# With the heatmap call commented out, the figure opened here stays empty.
plt.figure(figsize=(8, 10))
P_10k = pd.pivot_table(Z, index='param_n_neurons', columns='param_alpha', values="mean_test_score")
# sn.heatmap(P, vmin=0.61)

In [None]:
# Same pooling as for the plain res_v2 batches, but for the "_n100000" result files.
fname0 = "res_v2/res_v2_{}_n100000.pkl"
Z = pd.concat(
    pd.read_pickle(path)
    for path in map(fname0.format, range(9999))
    if os.path.isfile(path)
)
Z = Z.reset_index(drop=True).drop(columns=["params", "param_pairwise_metric", "param_ufunc"])

# Snap the sampled values onto a coarse grid so they can serve as pivot keys:
# n_neurons down to a multiple of 50, density to the nearest 0.1, and alpha to
# half-steps of log10(alpha).
Z = Z.assign(
    param_n_neurons=Z['param_n_neurons'].apply(lambda a: (a // 50) * 50),
    param_density=Z['param_density'].apply(lambda a: ((a + 0.05) // 0.1) * 0.1),
    param_alpha=Z['param_alpha'].apply(lambda a: (np.log10(a) // 0.5) * 0.5),
)

In [None]:
# Heatmap of mean CV test score over (n_neurons bin, alpha bin); the colour
# scale is floored at 0.64.
# NOTE(review): `sn` is not defined in any cell shown here (the only seaborn
# import is commented out and uses the alias `sns`), so this raises NameError
# on a fresh kernel.
plt.figure(figsize=(8, 10))
P = pd.pivot_table(Z, index='param_n_neurons', columns='param_alpha', values="mean_test_score")
sn.heatmap(P, vmin=0.64)

## performance vs number of training samples, best model

In [None]:
# Take the first group-aware shuffle split, so every group lands entirely in
# either the train or the test part.
# NOTE(review): `groups` is only assigned much further down, in the
# "Final experiments" section, so this cell depends on out-of-order execution.
train_idx, test_idx = next(GroupShuffleSplit().split(x, y, groups))
xt, xs, yt, ys = x[train_idx], x[test_idx], y[train_idx], y[test_idx]

In [None]:
# Learning curve: fit the chosen ELM on growing prefixes of the training split
# (30 log-spaced sizes from 1000 up to the full training split) and score each
# fit on the held-out split.
data = []  # note: rebinds `data`, which earlier held the loaded sample matrix
model = skelm.ELMClassifier(n_neurons=2000, alpha=0.3, batch_size=5000)

for j in np.logspace(3, np.log10(xt.shape[0]), num=30):
    jj = int(j)
    print(jj, end=' ... ')
    t = time()
    # Train on the training split only. Slicing the full x/y here would feed
    # rows from the held-out split (xs, ys) into training and inflate the score.
    score = model.fit(xt[:jj], yt[:jj]).score(xs, ys)
    data.append({'N': j, 'score': score})
    t = time() - t
    print("{:.3f} : {:.1f}s".format(score, t))

In [None]:
# Plot score against training-set size on a logarithmic x axis.
Z = pd.DataFrame(data)
Z.plot(x='N', y='score', logx=True, figsize=(15, 7), grid=True)

In [None]:
# Learning curve: fit the chosen ELM on growing prefixes of the training split
# (30 log-spaced sizes from 1000 up to the full training split) and score each
# fit on the held-out split.
data = []  # note: rebinds `data`, which earlier held the loaded sample matrix
model = skelm.ELMClassifier(n_neurons=2000, alpha=0.3, batch_size=5000)

for j in np.logspace(3, np.log10(xt.shape[0]), num=30):
    jj = int(j)
    print(jj, end=' ... ')
    t = time()
    # Train on the training split only. Slicing the full x/y here would feed
    # rows from the held-out split (xs, ys) into training and inflate the score.
    score = model.fit(xt[:jj], yt[:jj]).score(xs, ys)
    data.append({'N': j, 'score': score})
    t = time() - t
    print("{:.3f} : {:.1f}s".format(score, t))

In [None]:
# Plot score against training-set size on a logarithmic x axis.
Z = pd.DataFrame(data)
Z.plot(x='N', y='score', logx=True, figsize=(15, 7), grid=True)

#  

## Tune best parameters for limited-sample models

In [None]:
# Target sample budgets, then scaled up by (30/27) * (3/2).
# NOTE(review): presumably the factors compensate for a 27-of-30 train share
# and for 3-fold CV training on 2/3 of the rows, so that each fold trains on
# roughly the original budget -- confirm.
Ns = [4110, 5700, 10890]
Ns = [int(n * (30/27) * (3/2)) for n in Ns]

In [None]:
# Narrowed search space for the limited-sample models: only alpha and the
# neuron count are varied, both sampled log-uniformly.
params = dict(
    alpha=loguniform(1e+1, 1e+4),
    n_neurons=loguniform(2000, 10000),
)

In [None]:
# For every sample budget in Ns, run a 20-candidate random search on the first
# n1 rows and append the CV results table to the matching result list.
# Progress is printed as "<iteration>:<budget in thousands>".
# NOTE(review): res1, res2 and res3 are not created in any cell shown here;
# they must already exist as lists before this cell runs.
for i in range(1, 500):
    for n1, res in zip(Ns, (res1, res2, res3)):
        rcv = RandomizedSearchCV(skelm.ELMClassifier(batch_size=10000), params, n_iter=20, scoring='accuracy', cv=3, refit=False)
        res.append(pd.DataFrame(rcv.fit(x[:n1], y[:n1]).cv_results_))
        print("{}:{}".format(i, n1//1000), end=' ')

In [None]:
import pickle

In [None]:
# Snapshot the three accumulated result lists into one pickle, keyed by
# variable name.
with open("res123_current.pkl", "wb") as fr:
    pickle.dump(dict(res1=res1, res2=res2, res3=res3), fr)

In [None]:
# One heatmap per sample budget: mean CV test score over (neurons bin, alpha bin).
fig, axes = plt.subplots(1, 3, sharey=True, figsize=(18, 8))
fig.suptitle('Horizontally stacked subplots')

for k, (res, ax) in enumerate(zip([res1, res2, res3], axes), start=1):
    Z = pd.concat(res)
    # Neurons are binned on a log2 grid with step 0.15, alpha on a log10 grid
    # with step 0.2 (rounded to one decimal for clean tick labels).
    Z['neurons'] = Z['param_n_neurons'].apply(lambda a: 2 ** ((np.log2(a) // 0.15) * 0.15)).astype(int)
    Z['alpha'] = Z['param_alpha'].apply(lambda a: np.round((np.log10(a) // 0.2) * 0.2, 1))
    P = pd.pivot_table(Z, index='neurons', columns='alpha', values="mean_test_score")
    sn.heatmap(P, ax=ax, square=True, cbar=False, vmin=0.6)
    ax.set_title([4110, 5700, 10890][k-1])

In [None]:
# Same per-budget heatmaps, restricted to 0.5 < alpha bin < 4.1 and more than
# 500 neurons.
fig, axes = plt.subplots(1, 3, sharey=True, figsize=(18, 8))
fig.suptitle('Horizontally stacked subplots')

for k, (res, ax) in enumerate(zip([res1, res2, res3], axes), start=1):
    Z = pd.concat(res)
    # Neurons are binned on a log2 grid with step 0.15, alpha on a log10 grid
    # with step 0.2 (rounded to one decimal for clean tick labels).
    Z['neurons'] = Z['param_n_neurons'].apply(lambda a: 2 ** ((np.log2(a) // 0.15) * 0.15)).astype(int)
    Z['alpha'] = Z['param_alpha'].apply(lambda a: np.round((np.log10(a) // 0.2) * 0.2, 1))
    in_window = (Z.alpha > 0.5) & (Z.alpha < 4.1) & (Z.neurons > 500)
    P = pd.pivot_table(Z[in_window], index='neurons', columns='alpha', values="mean_test_score")
    sn.heatmap(P, ax=ax, square=True, cbar=False, vmin=0.6)
    ax.set_title([4110, 5700, 10890][k-1])

In [None]:
# Per-budget heatmaps with linear neuron bins (multiples of 100) and
# half-step log10(alpha) bins; only the last panel shows a colour bar.
fig, axes = plt.subplots(1, 3, sharey=True, figsize=(12, 8))#, gridspec_kw={'hspace': 0, 'wspace': 0.3})
fig.suptitle('Horizontally stacked subplots')

k = 0
for res, ax in zip([res1, res2, res3], axes):
    k += 1
    Z = pd.concat(res)
    # The `np.int` alias was removed from NumPy; the builtin `int` is the
    # equivalent dtype.
    Z['neurons'] = Z['param_n_neurons'].apply(lambda a : a // 100 * 100).astype(int)
    Z['alpha'] = Z['param_alpha'].apply(lambda a : np.log10(a)//0.5 * 0.5)
    P = pd.pivot_table(Z, index='neurons', columns='alpha', values="mean_test_score")
    sn.heatmap(P, ax=ax, square=False, cbar=k==3)

#  

## Final experiments

In [None]:
# Load the final-experiment table. The absolute path is machine-specific, and
# unpickling runs arbitrary code, so only load trusted files.
X = pd.read_pickle("/home/akusok/HDD2TB/MCYTD_10p_n100.pkl")

In [None]:
# One-hot encode the user id (columns prefixed "user") and put those columns in
# front of the remaining data, minus the patch-geometry columns and the raw uid.
Z = pd.concat(
    [
        pd.get_dummies(X.uid, prefix='user'),
        X.drop(columns=['wsize', 'overlap', 'xmin', 'ymin', 'xmax', 'ymax', 'uid']),
    ],
    axis=1,
)

In [None]:
# Center the feature columns (labels '0' through '1023') on their column means.
Z.loc[:, '0':'1023'] = Z.loc[:, '0':'1023'] - Z.loc[:, '0':'1023'].mean()

In [None]:
# Scale each feature column by its standard deviation. The divisor is floored
# at 0.5 so that near-constant columns do not blow up into excessively large
# numbers.
Z.loc[:, '0':'1023'] = Z.loc[:, '0':'1023'] / Z.loc[:, '0':'1023'].std().clip(lower=0.5)

In [None]:
# Limit the value range: clamp every scaled feature to [-5, 5].
Z.loc[:, '0':'1023'] = Z.loc[:, '0':'1023'].clip(lower=-5.0, upper=5.0)

In [None]:
# Replace the index with a fresh 0..n-1 range, discarding the old one.
Z = Z.reset_index(drop=True)

In [None]:
# Target is `sig_true`; inputs are every other column except `fid`, so the
# one-hot user columns are included. `fid` is kept as the grouping key for
# group-aware cross-validation.
# Note: this rebinds `x` and `y`, replacing the arrays used in the sections above.
y = np.array(Z.sig_true)
x = np.array(Z.drop(['sig_true', 'fid'], axis=1))
groups = Z.fid

In [None]:
from sklearn.model_selection import cross_val_predict

In [None]:
# ELM configuration used for the final cross-validated predictions.
model = skelm.ELMClassifier(alpha=10, n_neurons=1000, ufunc='sigm', density=0.02)

In [None]:
# Out-of-fold predictions from 10-fold GroupKFold keyed on `fid`, so rows that
# share a fid are never split between the training and predicted folds.
# Runs with 3 parallel jobs.
cvp = cross_val_predict(model, x, y, groups=groups, cv=GroupKFold(n_splits=10), n_jobs=3, pre_dispatch=3)

In [None]:
# Store the out-of-fold predictions next to the input table (machine-specific absolute path).
np.save("/home/akusok/HDD2TB/MCYTD_10p_n100-predict.npy", cvp)