In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [2]:
# Cross-validation setup: K shuffled folds with a fixed seed so the split is
# the same on every run.
K = 5
kf = KFold(n_splits=K, shuffle=True, random_state=1)

In [3]:
# Peek at the first lines of one validation-prediction file to check its layout.
!head loss-5.7552val-2.9950_val

id,scalar_coupling_constant
0,83.50312805175781
1,-2.245182991027832
2,-11.567841529846191
3,-11.566396713256836
4,3.245485305786133
5,12.714962005615234
6,3.298550844192505
7,83.466064453125
8,-2.2379026412963867


In [4]:
# Base-model prediction files to stack. For each name, '<name>_val' is read to
# build the training features and '<name>' itself to build the test features.
# NOTE(review): the numbers in the names look like training loss / validation
# score of each base model — confirm with whoever produced the files.
subs = ['loss-5.7552val-2.9950', 'loss-4.9516val-3.0042', 'loss-5.9943val-2.7766']

In [5]:
# Build the stacking feature matrix from the base models' validation-set
# predictions: one column per entry of `subs`, named
# 'scalar_coupling_constant_<sub>', read from the file '<sub>_val'.
val_ids = None
val_preds = {}
for sub in subs:
    df = pd.read_csv('%s_val' % sub)
    # Columns are combined row-by-row, so every file has to list the same ids
    # in the same order; fail loudly instead of silently misaligning rows.
    if val_ids is None:
        val_ids = df['id'].values
    elif not np.array_equal(df['id'].values, val_ids):
        raise ValueError('%s_val does not list the same ids as %s_val' % (sub, subs[0]))
    val_preds['scalar_coupling_constant_%s' % sub] = df['scalar_coupling_constant']
train_x = pd.DataFrame(val_preds)
train_x.head()

Unnamed: 0,scalar_coupling_constant_loss-5.7552val-2.9950,scalar_coupling_constant_loss-4.9516val-3.0042,scalar_coupling_constant_loss-5.9943val-2.7766
0,83.503128,83.583549,83.352898
1,-2.245183,-2.236212,-2.195633
2,-11.567842,-11.562995,-11.576006
3,-11.566397,-11.564421,-11.565042
4,3.245485,3.295776,3.181636


In [16]:
# Build the test feature matrix from the base models' submission files: one
# column per entry of `subs`, named 'scalar_coupling_constant_<sub>', read from
# the file '<sub>'. Column names and order match the training features.
test_ids = None
test_preds = {}
for sub in subs:
    df = pd.read_csv('%s' % sub)
    # Columns are combined row-by-row, so every file has to list the same ids
    # in the same order; fail loudly instead of silently misaligning rows.
    if test_ids is None:
        test_ids = df['id'].values
    elif not np.array_equal(df['id'].values, test_ids):
        raise ValueError('%s does not list the same ids as %s' % (sub, subs[0]))
    test_preds['scalar_coupling_constant_%s' % sub] = df['scalar_coupling_constant']
test_x = pd.DataFrame(test_preds)
test_x.head()

Unnamed: 0,scalar_coupling_constant_loss-5.7552val-2.9950,scalar_coupling_constant_loss-4.9516val-3.0042,scalar_coupling_constant_loss-5.9943val-2.7766
0,15.68477,15.999567,8.700197
1,188.352219,193.554047,187.950882
2,6.846854,7.092568,5.50555
3,188.5215,193.675003,187.018097
4,15.910284,16.058594,8.810644


In [6]:
# Targets for the stacker: the 'scalar_coupling_constant' column of the
# 'validation_targets' file, with the 'id' column discarded.
train_y = (
    pd.read_csv('validation_targets')
    .drop(columns=['id'])
    ['scalar_coupling_constant']
)
train_y

0         83.542984
1         -2.378307
2        -11.700430
3        -11.697890
4          3.252806
5         13.691265
6          3.252054
7         83.541679
8         -2.378622
9        -11.699591
10        13.692376
11         3.252531
12         3.252724
13        83.548409
14        -2.377157
15         3.252426
16         3.252421
17        13.692103
18        -2.378762
19        83.541763
20       -11.700430
21       -11.699314
22        -2.378518
23        83.542969
24       -11.697617
25        -2.377235
26        83.548599
27        55.525219
28        -4.061727
29         5.328185
            ...    
463694     4.651087
463695     2.200768
463696    89.291336
463697     2.200768
463698     4.651087
463699    10.657756
463700     0.081280
463701     3.564758
463702     4.914779
463703     4.021182
463704     5.211150
463705    83.776955
463706    -1.201683
463707     6.123803
463708   -10.538275
463709     0.778395
463710     5.399363
463711     5.566216
463712    -1.475149


In [7]:
# Peek at the first lines of the coupling-type file to check its layout.
!head validation_types

id,type
0,0.0
1,4.0
2,1.0
3,1.0
4,5.0
5,5.0
6,5.0
7,0.0
8,4.0


In [8]:
# Coupling-type code of every validation row: the 'type' column of the
# 'validation_types' file, with the 'id' column discarded.
types = (
    pd.read_csv('validation_types')
    .drop(columns=['id'])
    ['type']
)
types

0         0.0
1         4.0
2         1.0
3         1.0
4         5.0
5         5.0
6         5.0
7         0.0
8         4.0
9         1.0
10        5.0
11        5.0
12        5.0
13        0.0
14        4.0
15        5.0
16        5.0
17        5.0
18        4.0
19        0.0
20        1.0
21        1.0
22        4.0
23        0.0
24        1.0
25        4.0
26        0.0
27        2.0
28        4.0
29        1.0
         ... 
463694    6.0
463695    4.0
463696    0.0
463697    4.0
463698    6.0
463699    6.0
463700    5.0
463701    5.0
463702    6.0
463703    6.0
463704    4.0
463705    0.0
463706    4.0
463707    6.0
463708    1.0
463709    5.0
463710    6.0
463711    6.0
463712    4.0
463713    0.0
463714    4.0
463715    6.0
463716    5.0
463717    6.0
463718    4.0
463719    6.0
463720    6.0
463721    4.0
463722    0.0
463723    4.0
Name: type, Length: 463724, dtype: float64

In [9]:
# Gradient-boosting candidate for the stacker.
# MAX_ROUNDS is only referenced by the disabled `iterations = MAX_ROUNDS`
# option; the number of trees is currently set through `n_estimators`.
MAX_ROUNDS = 650
model = CatBoostRegressor(
    # objective optimised during training vs. metric reported on evaluation
    loss_function='MAE',
    eval_metric='RMSE',
    n_estimators=20000,
    # all stacking inputs are numeric predictions, so no categorical columns
    cat_features=None,
    # train on GPU devices 0 through 3
    task_type='GPU',
    devices='0-3',
)

In [10]:
# Rebinds `model`: from this cell on the stacker is a plain linear regression,
# replacing the CatBoostRegressor assigned to `model` in the previous cell.
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [11]:
# Short aliases used by the cross-validation loop (same objects, no copy).
y = train_y
X = train_x

In [12]:
# List the distinct coupling-type codes present in the validation data.
np.unique(types.values)

array([0., 1., 2., 3., 4., 5., 6., 7.])

In [13]:
def eval_lmae(gt, pred, types, floor=1e-9):
    """Mean over coupling types of log(mean absolute error).

    Parameters
    ----------
    gt : array-like
        Ground-truth values.
    pred : array-like
        Predicted values, same length and order as ``gt``.
    types : array-like
        Group label of every row (pandas Series, numpy array or list).
    floor : float, optional
        Lower bound applied to each group's MAE before taking the log, so a
        perfectly predicted group contributes ``log(floor)`` instead of
        ``-inf``.

    Returns
    -------
    float
        Average of the per-group log-MAE, taken over the groups that actually
        occur in ``types``; ``nan`` when the input is empty.
    """
    # np.asarray strips any pandas index so the boolean masks below are purely
    # positional (fold slices keep their original, non-contiguous index).
    types = np.asarray(types)
    abs_err = np.abs(np.asarray(gt) - np.asarray(pred))

    # Only score groups that are present: a label missing from a fold would
    # otherwise give the mean of an empty slice and turn the score into NaN.
    present = np.unique(types)
    if present.size == 0:
        return float('nan')

    loss = 0.
    for label in present:
        group_mae = abs_err[types == label].mean()
        loss += np.log(max(group_mae, floor))
    return loss / len(present)


In [21]:
# K-fold cross-validation of the stacker, averaging its test-set predictions
# over the folds.
# Loop structure adapted from
# https://www.kaggle.com/aharless/simple-catboost-cv-lb-281
OPTIMIZE_ROUNDS = False  # True: fit with eval_set / use_best_model and report the tree count
y_test_pred = None

for i, (train_index, test_index) in enumerate(kf.split(train_x)):
    print("\nFold ", i)

    # Slice features, targets and coupling types for this fold
    X_train, y_train, type_train = (
        X.iloc[train_index, :], y.iloc[train_index], types.iloc[train_index])
    X_valid, y_valid, type_valid = (
        X.iloc[test_index, :], y.iloc[test_index], types.iloc[test_index])

    # Fit on the training part of the fold
    if not OPTIMIZE_ROUNDS:
        fit_model = model.fit(X_train, y_train)
    else:
        fit_model = model.fit(X_train,
                              y_train,
                              eval_set=(X_valid, y_valid),
                              use_best_model=True,
                              plot=True,
                              silent=True)
        print("  N trees = ", model.tree_count_)

    # Score the held-out part of the fold
    pred = fit_model.predict(X_valid)
    print( "  LMAE = ", eval_lmae(y_valid, pred, type_valid))

    # Add this fold's test-set predictions to the running sum
    fold_test_pred = fit_model.predict(test_x)
    y_test_pred = fold_test_pred if y_test_pred is None else y_test_pred + fold_test_pred

# Turn the running sum into the per-row average over the K folds
y_test_pred /= K


Fold  0
  LMAE =  -3.051789948581929

Fold  1
  LMAE =  -3.0124033537600874

Fold  2
  LMAE =  -3.0607194521648577

Fold  3
  LMAE =  -3.0408775962963777

Fold  4
  LMAE =  -3.0292550052711658


In [22]:
# Inspect the fold-averaged test-set predictions.
y_test_pred

array([ 13.467886  , 190.58247305,   6.50547081, ...,   1.98569069,
         4.18130641, 119.7659851 ])

In [23]:
# Load the first base submission to reuse its id column and layout as the
# template for the stacked submission.
df = pd.read_csv(subs[0])
df.head()

Unnamed: 0,id,scalar_coupling_constant
0,4658147,15.68477
1,4658148,188.352219
2,4658149,6.846854
3,4658150,188.5215
4,4658151,15.910284


In [24]:
# Overwrite the template's predictions with the stacked ones (assigned by
# position; assumes y_test_pred has one value per row of df, in the same order).
df['scalar_coupling_constant'] = y_test_pred

In [25]:
# Check that the prediction column now holds the stacked values.
df.head()

Unnamed: 0,id,scalar_coupling_constant
0,4658147,13.467886
1,4658148,190.582473
2,4658149,6.505471
3,4658150,190.357381
4,4658151,13.57884


In [27]:
# Output file name: the stacker's name followed by every base submission name,
# all separated by '%'.
sub_fname = f"LinearRegression%{'%'.join(subs)}"
sub_fname

'LinearRegression%loss-5.7552val-2.9950%loss-4.9516val-3.0042%loss-5.9943val-2.7766'

In [28]:
# Write the submission without the DataFrame index, leaving only the data columns.
df.to_csv(sub_fname, index=False)

In [29]:
# Competition identifier passed to the kaggle command-line calls below.
comp = 'champs-scalar-coupling'

In [30]:
# Upload the stacked submission with an empty message.
# NOTE(review): {sub_fname} is interpolated into the shell command unquoted —
# fine for the current name, but quote it if names may contain spaces.
!kaggle competitions submit -c {comp} -f {sub_fname} -m ''

100%|██████████████████████████████████████| 64.1M/64.1M [00:02<00:00, 22.9MB/s]
Successfully submitted to Predicting Molecular Properties

In [31]:
import time
# Pause for a minute before querying — presumably to give Kaggle time to score
# the upload (TODO confirm 60 s is always enough).
time.sleep(60)
# Save the competition's submission listing to submissions-{comp}.csv, which the
# next cell reads back with pandas.
!kaggle competitions submissions -c {comp} -v > submissions-{comp}.csv

In [32]:
# Read the saved listing and show the public score of its first row.
# NOTE(review): assumes the first row is the submission just made — verify the
# listing is ordered newest-first.
submissions = pd.read_csv(f'submissions-{comp}.csv')
submissions.iloc[0].publicScore

-3.048