In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import gc
from fastai.core import parallel

In [None]:
# Basenames of the base-model prediction files that get blended.
# Each name is read twice further down: as '<name>_val' for the stacking
# training features and as '<name>' for the test features.
# 'loss-4.9044val-2.5880' was listed in an earlier revision and is disabled.
subs = [
    'loss-5.7552val-2.9950',
    'loss-4.9516val-3.0042',
    'loss-5.9943val-2.7766',
    'loss-4.0414val-2.7287',
    'loss-4.9516val-2.8229',
    'loss-5.6540val-3.0131',
    'loss-5.1336val-3.0759',
    'loss-5.5435val-3.0776',
    'loss-3.5933val-3.0637',
    'loss-5.6477val-3.0793',
]

In [None]:
# Build the stacking training matrix: one column of validation predictions
# per base model, named 'scalar_coupling_constant_<sub>'.
train_x = None
for sub in subs:
    df = pd.read_csv('%s_val' % sub).drop(['id'], axis=1)
    if train_x is None:
        # The first file's frame is reused as the container for every column.
        train_x = df
    train_x['scalar_coupling_constant_%s' % sub] = df['scalar_coupling_constant']
# The un-suffixed column came from the first file and is now a duplicate.
train_x = train_x.drop(['scalar_coupling_constant'], axis=1).astype('float32')
train_x.head()

In [None]:
# Build the stacking test matrix: one column of test predictions per base
# model, named 'scalar_coupling_constant_<sub>'.
test_x = None
for sub in subs:
    df = pd.read_csv('%s' % sub).drop(['id'], axis=1)
    if test_x is None:
        # The first file's frame is reused as the container for every column.
        test_x = df
    test_x['scalar_coupling_constant_%s' % sub] = df['scalar_coupling_constant']
# The un-suffixed column came from the first file and is now a duplicate.
test_x = test_x.drop(['scalar_coupling_constant'], axis=1).astype('float32')
test_x.head()

In [None]:
# Target values for the test rows, taken from the file 'temp_-3.225.csv'.
# calc_type concatenates these onto the training targets when fitting.
test_y = (
    pd.read_csv('temp_-3.225.csv')
    .drop(['id'], axis=1)['scalar_coupling_constant']
    .astype('float32')
)
test_y.head()


In [None]:
# Map each coupling-type label to its integer index (position in this list).
type_index = {
    name: idx
    for idx, name in enumerate(
        ['1JHC', '2JHH', '1JHN', '2JHN', '2JHC', '3JHH', '3JHC', '3JHN'])
}
# Integer type index of every test row; an unknown label raises KeyError.
test_types = pd.read_csv('test.csv')['type']
test_types_idx = np.array([type_index[value] for value in test_types.values])
test_types_idx

In [None]:
# Ground-truth targets for the rows the '<sub>_val' prediction files cover.
train_y = (
    pd.read_csv('validation_targets')
    .drop(['id'], axis=1)['scalar_coupling_constant']
    .astype('float32')
)


In [None]:
# Coupling type of every validation row (compared against integer type
# indices inside calc_type).
types = pd.read_csv('validation_types').drop(['id'], axis=1)['type']

In [None]:
from sklearn.linear_model import *

In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa

# Fixed seed for the estimators that use random sampling (RANSAC, Theil-Sen),
# so that re-running the notebook reproduces the same blend.
SEED = 42

# Candidate blenders. The trailing numbers are scores noted by the original
# author (presumably LMAE, lower is better — TODO confirm); they were recorded
# before the estimators were seeded.
reg1 = KNeighborsRegressor(n_neighbors=20)  # -3.19
reg2 = LinearRegression()  # -3.16
reg3 = LinearRegression(fit_intercept=False)  # defined but not in the ensemble below
reg4 = RANSACRegressor(random_state=SEED)  # -3.177
reg5 = TheilSenRegressor(random_state=SEED)  # -3.177
# Alternatives tried as a sixth estimator:
#reg6 = RandomForestRegressor(n_estimators=10) #-4.04
#reg6 = BayesianRidge() # -3.16867
#reg6 = HuberRegressor() # -3.1839

# Earlier ensemble that also included the no-intercept linear model:
#model = VotingRegressor(estimators=[('knr', reg1), ('lr', reg2), ('lr2', reg3), ('ransac', reg4), ('th', reg5)])
# NOTE(review): reg2 (the intercept model) is registered under the name 'lr2'
# here; the name ends up in the submission message, so it is left unchanged.
model = VotingRegressor(
    estimators=[('knr', reg1), ('lr2', reg2), ('ransac', reg4), ('th', reg5)],
    n_jobs=-1,
)
#model = VotingRegressor(estimators=[('hb', reg6)])

In [None]:
# Short aliases used by calc_type for the stacking features and targets.
X, y = train_x, train_y

In [None]:
def eval_lmae(gt, pred, types):
    """Return the mean, over the distinct types, of log(mean absolute error).

    Parameters
    ----------
    gt : array-like
        Ground-truth values.
    pred : array-like
        Predicted values, aligned position-by-position with `gt`.
    types : array-like
        Type identifier of each row (a pandas Series or any array-like);
        values are cast to int before grouping.

    Returns
    -------
    float
        Average of ``log(mean(|gt - pred|))`` computed separately per type.

    Raises
    ------
    ZeroDivisionError
        If `types` is empty, because there is no group to average over.
    """
    # `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.
    type_ids = np.asarray(types).astype(int)
    gt = np.asarray(gt)
    pred = np.asarray(pred)

    unique_types = np.unique(type_ids)
    loss = 0.
    for type_id in unique_types:
        mask = type_ids == type_id
        loss += np.log(np.mean(np.abs(gt[mask] - pred[mask])))
    return loss / len(unique_types)

In [None]:
# Run CV
# https://www.kaggle.com/aharless/simple-catboost-cv-lb-281
# OPTIMIZE_ROUNDS and K are not read by any other cell visible in this notebook.
OPTIMIZE_ROUNDS = False
K = 1

# One slot per coupling type; rebuilt again once the per-type results are in.
y_test_pred = [None] * 8

def calc_type(type, iii):
    """Fit the blending model for one coupling type and predict the test set.

    Reads the notebook globals `X`, `y`, `types`, `test_x`, `test_y`,
    `test_types_idx` and `model`.

    Parameters
    ----------
    type : int
        Coupling-type index; only rows whose type equals this value are used
        for fitting and scoring. (The name shadows the builtin inside this
        function; it is kept so existing callers are unaffected.)
    iii : int
        Unused. Kept because the function is invoked through `parallel`,
        which presumably passes (item, index) — TODO confirm.

    Returns
    -------
    tuple
        ``(type, y_test_pred, loss)`` where `y_test_pred` holds predictions
        for every row of `test_x` (not only rows of this type) and `loss` is
        the `eval_lmae` score on the rows of this type, both averaged over
        the folds.
    """
    # A single "fold" in which every row is used for both fitting and scoring,
    # so the reported loss is an in-sample score.
    folds = [(range(len(X)), range(len(y)))]

    loss = 0
    y_test_pred = None
    test_mask = test_types_idx == type

    for train_index, test_index in folds:
        gc.collect()

        # Create data for this fold
        y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
        X_train, X_valid = X.iloc[train_index, :], X.iloc[test_index, :]
        type_train, type_valid = types.iloc[train_index], types.iloc[test_index]

        # Keep only the rows of the requested coupling type.
        train_sel = type_train == type
        valid_sel = type_valid == type
        y_train, X_train = y_train[train_sel], X_train[train_sel]
        y_valid, X_valid = y_valid[valid_sel], X_valid[valid_sel]
        type_valid = type_valid[valid_sel]

        # Augment the training rows with test rows of the same type, using
        # `test_y` as their targets; at most as many test rows as train rows.
        n_train = X_train.shape[0]
        fit_model = model.fit(
            np.concatenate([X_train, test_x[test_mask][:n_train]], axis=0),
            np.concatenate([y_train, test_y[test_mask][:n_train]], axis=0))

        # Generate validation predictions for this fold
        pred = fit_model.predict(X_valid)
        loss += eval_lmae(y_valid, pred, type_valid)

        # Accumulate test set predictions
        fold_pred = fit_model.predict(test_x)
        if y_test_pred is None:
            y_test_pred = fold_pred
        else:
            y_test_pred += fold_pred

    # Return only after every fold has run (the original returned from inside
    # the loop, which would silently drop any fold after the first), and
    # average the accumulated values over the folds.
    n_folds = len(folds)
    return (type, y_test_pred / n_folds, loss / n_folds)

In [None]:
# Run calc_type once for each of the 8 coupling-type indices (0..7).
# NOTE(review): fastai's `parallel` presumably calls calc_type(item, index) and
# returns a list of the results, not necessarily in input order — confirm
# against the installed fastai version. The next cell indexes by returned type.
res = parallel(calc_type, range(8))

In [None]:
# Slot each worker's result by the type index it returned, so the order in
# which `res` was filled does not matter.
y_test_pred, loss = [None] * 8, [None] * 8
# The loop variable is deliberately not called `type`: at notebook top level
# that would overwrite the builtin `type` for the rest of the kernel session.
for type_idx, type_preds, type_loss in res:
    y_test_pred[type_idx] = type_preds
    loss[type_idx] = type_loss
for type_idx, type_loss in enumerate(loss):
    print(f"Type {type_idx} LMAE: {type_loss}")
print(f"\nFinal  LMAE: {np.mean(loss)}")

In [None]:
# For every test row, take the prediction produced by the model that was
# fitted for that row's coupling type.
y_test_pred_list = y_test_pred
y_test_pred = np.array([
    y_test_pred_list[type_idx][idx]
    for idx, type_idx in enumerate(test_types_idx)
])

In [None]:
# Display the assembled per-row test predictions.
y_test_pred

In [None]:
# Use the first base-model prediction file as the submission template.
df = pd.read_csv(subs[0])
df.head()

In [None]:
# Replace the template's prediction column with the blended predictions.
df['scalar_coupling_constant'] = y_test_pred

In [None]:
# Preview the submission frame after the prediction column was replaced.
df.head()

In [None]:
# `loss` is the list of per-type scores; str(loss) would put brackets, commas
# and spaces into the filename, which the shell-interpolated submit command
# then splits into several arguments. Use the mean score instead.
sub_fname = f'LMAE_{np.mean(loss):.4f}-LinearRegressionByType-1fold'
sub_fname

In [None]:
# Write the submission file, leaving out the DataFrame index column.
df.to_csv(sub_fname, index=False)

In [None]:
# Competition slug and the submission message: the blended base-model names
# followed by the names of the ensemble's estimators.
comp = 'champs-scalar-coupling'
estimator_names = ' '.join(name for name, _ in model.estimators)
m = f"{' '.join(subs)} estimators: {estimator_names}"
m

In [None]:
!kaggle competitions submit -c {comp} -f {sub_fname} -m '{m}'

In [None]:
import time
# Wait one minute before listing submissions — presumably to give the
# submission sent in the previous cell time to be scored (TODO confirm).
time.sleep(60)
# Save the submissions listing to a local file; the next cell reads it as CSV.
!kaggle competitions submissions -c {comp} -v > submissions-{comp}.csv

In [None]:
# Load the saved submissions listing and show the public score in its first
# row.
submissions = pd.read_csv('submissions-%s.csv' % comp)
submissions.iloc[0]['publicScore']