In [None]:
import pandas as pd
import numpy as np
from glob import glob
from benatools.tools import BaseOptimizeBlend
from sklearn.metrics import mean_squared_error

# Functions and Optimizer
We need to define an `Optimizer` class that implements the `metric` method. It extends `BaseOptimizeBlend`, which already implements the `fit` and `predict` methods.

In [None]:
class Optimizer(BaseOptimizeBlend):
    """Blend optimizer whose metric is the mean squared error.

    Only ``metric`` is defined here; everything else comes from
    ``BaseOptimizeBlend``.
    """

    def metric(self, coef, X, y):
        """Return the MSE between ``y`` and the ``coef``-weighted sum of ``X`` along axis 1.

        ``X`` is multiplied element-wise by ``coef`` (with broadcasting) and the
        products are summed over axis 1 to form the blended prediction.
        Presumably ``X`` is (n_samples, n_estimators) with one weight per
        column - verify against how ``BaseOptimizeBlend`` calls this method.
        """
        blended = np.sum(X * coef, axis=1)
        return mean_squared_error(y, blended)

Sometimes it is useful to define some other helper functions when dealing with complex scoring functions.  
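Another good practice is to give the OOF and submission files of the same model the same numeric suffix (for example `oof_1.csv` and `sub_1.csv`), so that the two sorted file lists line up model by model.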

In [None]:
def read_folder(folder):
    """Collect the OOF and submission CSV paths found directly inside ``folder``.

    Files matching ``oof*.csv`` and ``sub*.csv`` are returned as two lists,
    each ordered by the integer that follows the last underscore of the path
    (with the 4-character extension removed). A path whose trailing token is
    not an integer makes ``int`` raise ``ValueError``.

    Returns:
        tuple: ``(oof_paths, submission_paths)``.
    """
    def numeric_suffix(path):
        # Strip '.csv', then parse whatever follows the last '_'
        return int(path[:-4].rsplit('_', 1)[-1])

    oof = glob(folder+'/oof*.csv')
    subs = glob(folder+'/sub*.csv')
    oof.sort(key=numeric_suffix)
    subs.sort(key=numeric_suffix)
    return oof, subs

def score(y_true, y_pred):
    """Return the mean squared error of ``y_pred`` measured against ``y_true``."""
    mse = mean_squared_error(y_true, y_pred)
    return mse

# Read Actual Training Labels
The values the optimizer will work with are based on OOF calculations. Thus, we need to read the actual training labels in order to calculate metrics on them.

In [None]:
# Ground-truth training labels, used to score the OOF predictions
LABELS = ['label_1', 'label_2']
# 'train.csv' is a CSV file, so it has to be parsed with read_csv (the JSON reader cannot parse it)
train = pd.read_csv('train.csv')
# NOTE(review): the OOF files are sorted by 'id' further down in this notebook;
# confirm that the rows of train.csv follow that same order before comparing.
y_true = train[LABELS].values
y_true.shape  # Shape should be (n_train_samples, n_labels)

# Read OOF and Submission Files
It is important to get the list of files to evaluate (both OOF and Submission files for each model), and calculate the metric on them

In [None]:
oof = []
subs = []

# List all the folders to read files from
folders = ['ensemble/rnn', 'ensemble/gnnnew']

# Read all the folders, accumulating the OOF and submission paths of every model
for folder in folders:
    folder_oof, folder_subs = read_folder(folder)
    oof += folder_oof
    subs += folder_subs

# The two lists are paired positionally below, so they must have the same length
if len(oof) != len(subs):
    raise ValueError(f'Found {len(oof)} OOF files but {len(subs)} submission files')

# Create a Dataframe with both OOF and Submission paths for the same model
df_data = pd.DataFrame({'oof':oof, 'subs':subs})

# Calculate OOF score for each model. Important to sort each file by its row id, to make sure we compare apples with apples.
# The reference values are the training labels in `y_true`, passed first to match score(y_true, y_pred).
df_data['oof_score'] = [
    score(y_true, pd.read_csv(path, encoding='utf-8', engine='c').sort_values('id')[LABELS].values)
    for path in df_data['oof']
]
df_data.head()

# Fit the Optimizer

In [None]:
# Load every model's OOF predictions (rows ordered by id, label columns only)
# and stack them along a new leading axis
oof_arr = np.stack([
    pd.read_csv(oof_path, encoding='utf-8', engine='c').sort_values('id')[LABELS]
    for oof_path in df_data['oof'].values
])
# Expected shape: (n_estimators, n_samples, n_labels)
oof_arr.shape

In [None]:
# Generate a list of Optimizers, one for each label to optimize
opts = [Optimizer() for _ in range(y_true.shape[1])]

# Run optimization process for each label
for i, o in enumerate(opts):
    x = oof_arr[:,:,i].T  # OOF calculated from all estimators for label i
    o.fit(x, y_true[:,i])
    # Baseline: plain average of all estimators, scored with the same MSE helper that
    # Optimizer.metric is built on, so the numbers are comparable. The `squared=False`
    # keyword was dropped: it printed RMSE instead, and scikit-learn has deprecated it.
    print('Original Metric', score(y_true[:,i], np.mean(x, axis=1)))
    print('Coefficients', o.get_coef())
    print('Coefficients shape', o.get_coef().shape)
    print('')


# Blend
Once the coefficients have been calculated, it is time to apply them to the submission files.

In [None]:
# Load every model's submission predictions (rows ordered by id, label columns only)
# and stack them along a new leading axis
subs_arr = np.stack([
    pd.read_csv(sub_path, encoding='utf-8', engine='c').sort_values('id')[LABELS]
    for sub_path in df_data['subs'].values
])
# Expected shape: (n_estimators, n_samples, n_labels)
subs_arr.shape

In [None]:
# Use the first submission file as the template for the blended output.
# 'id' is kept next to the labels: rows are re-ordered by id here, so without that
# column the saved predictions could not be matched back to their rows.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the frame that was read from disk.
df = (
    pd.read_csv(df_data['subs'].iloc[0], encoding='utf-8', engine='c')
    .sort_values('id')[['id'] + LABELS]
    .copy()
)

# Apply coefficients to every submission file
for i,c in enumerate(LABELS):
    # subs_arr[:,:,i].T holds the predictions of all estimators for label i, one column per estimator
    df[c] = opts[i].predict(subs_arr[:,:,i].T)
df

In [None]:
# Write the blended predictions to disk (the DataFrame index is not written)
df.to_csv(path_or_buf='submission.csv', index=False)
print('Submission saved')