# Find mapping for reservoir using binarizer

This notebook takes an input dataset for a regression problem and binarizes the input so that it can be fed to an AMN reservoir. The code uses a RandomForestRegressor to rank the features (by permutation importance) of both the problem and the reservoir, and uses the two rankings to find the mapping to the reservoir input.

## Utilities (run this cell first)

In [1]:
# Run this cell first 

def input_binarize(parameter, thr, verbose=False):
    """Binarize ``parameter.X`` and map its columns onto the reservoir inputs.

    Steps:
      1. Copy ``parameter.X``. While the copy has fewer columns than the
         reservoir input ``parameter.res.X``, append duplicates of randomly
         chosen columns (drawn with ``np.random.randint``, so the result
         depends on the global NumPy random state).
      2. Scale the matrix with ``MinMaxScaler`` and binarize it with
         ``Binarizer(threshold=thr)``.
      3. Rank the features of the scaled problem matrix and of the reservoir
         by permutation importance of a random forest.
      4. Copy the i-th ranked binarized problem column into the i-th ranked
         reservoir column.

    ``np`` and ``pd`` are expected to be available in the notebook namespace.

    Parameters:
        parameter: object exposing the arrays ``X``, ``Y``, ``res.X``, ``res.Y``
        thr: threshold for binarization (applied after min-max scaling)
        verbose: when True, print both feature rankings
    Returns:
        Binary array of shape (parameter.X.shape[0], parameter.res.X.shape[1]).
    """
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
    from sklearn.inspection import permutation_importance
    from sklearn.preprocessing import Binarizer, MinMaxScaler

    def rank_feature(X, y, regression=True):
        # Return the column indices of X sorted by decreasing permutation
        # importance of a random forest fitted on (X, y).
        # `regression` defaults to True: both call sites below pass only
        # (X, y), which used to raise a TypeError.
        forest_cls = RandomForestRegressor if regression else RandomForestClassifier
        model = forest_cls(n_estimators=100, random_state=42)
        model.fit(X, y)

        # Compute permutation importance
        perm_importance = permutation_importance(model, X, y, n_repeats=10, random_state=42)

        # Hold feature indices together with their mean importance score
        perm_importance_df = pd.DataFrame({
            'Feature': list(range(X.shape[1])),
            'Importance': perm_importance.importances_mean
            })

        # Most important feature first
        perm_importance_df = perm_importance_df.sort_values(by='Importance', ascending=False)
        return perm_importance_df['Feature'].tolist()

    Xt, k = np.copy(parameter.X), parameter.res.X.shape[1]

    # Add duplicated columns to increase the number of features to k
    while Xt.shape[1] < k:
        i = np.random.randint(0, Xt.shape[1])
        Xt = np.concatenate((Xt, Xt[:, i].reshape(-1, 1)), axis=1)

    # Scale, then binarize
    scaler = MinMaxScaler()
    Xt = scaler.fit_transform(Xt)
    binarizer = Binarizer(threshold=thr)
    Xb = binarizer.fit_transform(Xt)

    # Rank the features of the scaled (not yet binarized) problem matrix
    # and of the reservoir
    lb = rank_feature(Xt, parameter.Y.ravel())
    lr = rank_feature(parameter.res.X, parameter.res.Y.ravel())
    if verbose:
        print('reservoir:', lr)
        print('problem  :', lb)

    # Ranked features one-to-one mapping: the i-th ranked problem column
    # feeds the i-th ranked reservoir column. lb has at least len(lr)
    # entries because Xt was padded to k columns above; surplus problem
    # features (the least important ones) are dropped.
    Xbr = np.zeros((Xb.shape[0], parameter.res.X.shape[1]))
    Xbr[:, lr] = Xb[:, lb[:len(lr)]]

    return Xbr
        
def reservoir_binarize(reservoirfile, X, Y, mode='AMN-phenotype',
                       n_hidden_post=0,
                       train_rate=1.0e-3, batch_size=10, xfold=5, epochs=100, 
                       niter=1, t0=0.1, t1=1, dt=0.1, verbose=False):
    """Search the binarization threshold that gives the best score.

    For every threshold t in ``np.arange(t0, t1, dt)`` an ``RC_Model`` is
    built, its input is replaced by ``input_binarize(model, t)``, the model
    is trained with ``train_evaluate_model`` and scored with ``r2_score``
    between ``Y`` and the first column of the prediction.

    ``np`` is expected to be available in the notebook namespace.

    Parameters:
        reservoirfile, mode, n_hidden_post, train_rate, xfold, epochs,
        niter: forwarded unchanged to ``RC_Model``
        X, Y: problem features and targets
        batch_size: currently ignored (see NOTE below)
        t0, t1, dt: start, stop (exclusive) and step of the threshold grid
        verbose: print per-threshold and best scores
    Returns:
        (best_model, best_t, best_r2). ``best_model`` is None if no
        threshold scored above the initial sentinel of -1000.
    Raises:
        ValueError: if the threshold grid is empty.
    """
    from Library.Build_Reservoir import RC_Model
    from Library.Build_Model import train_evaluate_model
    from sklearn.metrics import r2_score

    # NOTE(review): the batch_size argument is overridden by this heuristic,
    # so a caller-supplied value has no effect. Kept as-is to preserve
    # current results - confirm whether the argument should be honored.
    batch_size = 100 if X.shape[0] > 1000 else 10
    # best_model starts as None so the return below can never hit an
    # unbound local when no threshold improves on the sentinel score.
    best_model, best_t, best_r2, failure = None, -1, -1000, 10

    thresholds = np.arange(t0, t1, dt)
    if thresholds.size == 0:
        raise ValueError(
            f'empty threshold range: t0={t0}, t1={t1}, dt={dt}')

    for t in thresholds:
        # Create model
        model = RC_Model(mode=mode,
                reservoirfile=reservoirfile, X=X, Y=Y,
                n_hidden_prior=-1, n_hidden_post=n_hidden_post,
                activation_post='relu',
                batch_size=batch_size,
                epochs=epochs, train_rate=train_rate, 
                xfold=xfold, niter=niter, verbose=verbose)
        # binarize the input
        model.X = input_binarize(model, t, verbose=verbose)
        # Train and evaluate
        _, pred, _, _ = train_evaluate_model(model, failure=failure, verbose=verbose)
        r2 = r2_score(Y, pred[:, 0], multioutput='variance_weighted')
        if verbose:
            print(f'threshold: {t:.2f} q2: {r2:.3f}')
        # Keep the model trained with the best-scoring threshold so far
        if r2 > best_r2:
            best_r2 = r2
            best_t = t
            best_model = model
    if verbose:
        print(f'Best threshold: {best_t:.2f} q2: {best_r2:.3f}')
    
    return best_model, best_t, best_r2

def write_model_binarize(trainingfile, model, verbose=True):
    """Write the model's (binarized) X together with Y to `trainingfile`.

    The header is made of the keys of the `varmed` mapping returned by
    get_minmed_varmed_ko(model.res.medium), followed by 'Y_true'.
    The data block is model.X and model.Y joined column-wise.
    `verbose` is accepted but not used.
    """
    from Library.Build_Dataset import get_minmed_varmed_ko

    # Only the variable-medium mapping is needed for the column names
    _, varmed, _ = get_minmed_varmed_ko(model.res.medium)
    header = np.asarray([*varmed.keys(), 'Y_true'])
    data = np.concatenate((model.X, model.Y), axis=1)
    write_csv(trainingfile, header, data)
    

## Main code

In [3]:
# Search the best mapping between an input file and reservoir input

from Library.Import import *

DIRECTORY = './'

# Regression datasets, addressed by position in the loop below
problems = [
    'raw_geographical_origin_of_music',   # 0
    'raw_pumadyn32nh',                    # 1
    'raw_white_wine',                     # 2
    'raw_kin8nm',                         # 3
    'raw_cars',                           # 4
    'raw_airfoil_self_noise',             # 5
    'raw_QSAR_fish_toxicity',             # 6
    'raw_space_ga',                       # 7
    'raw_concrete_compressive_strength',  # 8
    'raw_grid_stability',                 # 9
    'raw_miami_housing',                  # 10
    'raw_cpu_activity',                   # 11
    'raw_naval_propulsion_plant',         # 12
    'raw_energy_efficiency',              # 13
]

# Reservoir names, addressed by position in the loop below
species = [
    'e_coli_core',  # 0
    'iEK1008',      # 1
    'iIT341',       # 2
    'iJN1463',      # 3
    'iML1515',      # 4
    'iMM904',       # 5
    'iPC815',       # 6
    'iYO844',       # 7
    'iYS1720',      # 8
    'iYS854',       # 9
    'iML1515EXP',   # 10
]

seed = 1
np.random.seed(seed=seed)
mode = 'AMN_phenotype'
precision = 0
n_hidden_post = 0
xfold = 5
epochs = 100
niter = 1  # 0 for full xfold cv
train_rate = 1.0e-4  # 1.0e-4 for white wine may be 1.0e-3 otherwise

# Keyword arguments shared by the coarse and the fine threshold search
search_options = dict(
    mode=mode,
    n_hidden_post=n_hidden_post,
    train_rate=train_rate,
    xfold=xfold,
    epochs=epochs,
    niter=niter,
    verbose=True,
)

for i in [10]:  # list(range(11))
    s = species[i]
    reservoirname = f'{s}_train_AMN_QP'
    reservoirfile = f'{DIRECTORY}Reservoir/{reservoirname}'
    for j in [2, 3, 4, 5, 6, 7, 8, 9, 11, 13]:  # list(range(14)):
        problem = problems[j].split('.')[0]
        trainingfile = f'{DIRECTORY}Dataset_input/Dataset_open_ML/Regression/{problem}'
        resultfile = f'{DIRECTORY}Result/{problem}_{s}_{mode}_{str(precision)}'
        H, X, Y = read_XY(trainingfile, nY=1, scaling='XY')
        start_time = time.time()

        # Coarse pass: thresholds 0.1, 0.2, ..., 0.9
        t0, t1, dt = 0.1, 1.0, 0.1
        model, threshold, q2 = reservoir_binarize(
            reservoirfile, X, Y, t0=t0, t1=t1, dt=dt, **search_options)

        # Fine pass: step 0.01 around the best coarse threshold
        t0, t1, dt = threshold-0.01, threshold+0.02, 0.01
        model, threshold, q2 = reservoir_binarize(
            reservoirfile, X, Y, t0=t0, t1=t1, dt=dt, **search_options)

        delta_time = time.time() - start_time
        print(f'{problem} {s} Threshold {threshold:.2f} Q2: {q2:.3f} cpu time {delta_time:.2f}')

        # Save the binarized dataset of the best model next to the input file
        trainingfile = f'{trainingfile}_{s}_binary'
        write_model_binarize(trainingfile, model, verbose=True)
        

number of reactions: 2086=2086
number of metabolites: 1335
filtered measurements size: 1
Reservoir matrices shapes for AMN-phenotype W (2,) bias (2,)
reservoir: [0, 27, 1, 19, 24, 18, 10, 14, 8, 5, 4, 6, 16, 2, 9, 11, 21, 13, 12, 3, 17, 25, 22, 20, 15, 26, 7, 23]
problem  : [10, 21, 1, 3, 4, 2, 14, 8, 7, 23, 0, 19, 26, 25, 6, 15, 27, 24, 9, 11, 12, 22, 18, 16, 17, 5, 13, 20]
RC scaler: MinMaxScaler()
RC input shape:((4898, 28), (4898, 1))
-------train (3918, 28) (3918, 2)
-------test  (980, 28) (980, 2)
----------------------------------- RC
Res IO (None, 28) (None, 2086)
Dense layer n_hidden: 0 hidden_dim: -1 input_dim: 2086 output_dim: 1 activation: relu trainable: True
Post IO (None, 2086) (None, 1)
train = 0.06 test = 0.06 loss-train = -1.000000 loss-test = -1.000000
threshold: 0.10 q2: 0.059
number of reactions: 2086=2086
number of metabolites: 1335
filtered measurements size: 1
Reservoir matrices shapes for AMN-phenotype W (2,) bias (2,)
reservoir: [0, 27, 1, 19, 24, 18, 10, 14, 