In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

from kuma_utils.training import Trainer
from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor

In [7]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [2]:
# exp_paths = [
#     Path('results/exp_1'),
#     Path('results/exp_7')
# ]


In [4]:
# for i, p in enumerate(exp_paths):
#     if i == 0:
#         stack_submission = pd.read_csv(p/'submission.csv')
#         stack_submission.columns = ['id', p.stem]
#     else:
#         submission = pd.read_csv(p/'submission.csv')
#         submission.columns = ['id', p.stem]
#         stack_submission = stack_submission.merge(submission, on='id', how='left')

In [5]:
# stack_submission['predictions'] = stack_submission.drop('id', axis=1).mean(1)

In [6]:
# stack_submission[['id', 'predictions']].to_csv(f'results/ens_{"_".join(stack_submission.columns[1:])}.csv', index=False)

In [9]:
# submission = pd.read_csv('results/exp_16_a/submission_raw.csv')[['id', 'fold_4']]
# submission2 =  pd.read_csv('results/exp_12/submission_raw.csv')[['id', 'fold_4']]
# submission['predictions'] = (submission['fold_4'] + submission2['fold_4']) / 2
# submission[['id', 'predictions']].to_csv('results/ens_exp_12_exp_16_a_fold4.csv', index=False)

# Generate meta features

In [None]:
def make_meta_feature(graph):
    record = {
        'ratio16': 0, 
        'ratio42': 0, 
        'ratio34': 0, 
        'ratio74': 0,
    }
    atoms = pd.Series(graph['atom'])
    atom_count = len(atoms)
    atom_dict = (atoms.value_counts() / atom_count).to_dict()
    for k, v in atom_dict.items():
        record[f'ratio{k}'] = v
    record['atom_count'] = atom_count
    record['link_count'] = len(graph['bond'])
    return pd.Series(record)

In [37]:
train = pd.read_csv('data/train_meta.csv')
test = pd.read_csv('data/test_meta.csv')
train.head()

Unnamed: 0,id,band_gap,ratio16,ratio42,ratio34,ratio74,atom_count,link_count
0,6141cf0f51c1cbd9654b8870,1.0843,0.663158,0.336842,0.0,0.0,190.0,2008.0
1,6141cf1051c1cbd9654b8872,1.1102,0.663158,0.336842,0.0,0.0,190.0,2008.0
2,6141cf11cc0e69a0cf28ab35,1.1484,0.659686,0.335079,0.005236,0.0,191.0,2028.0
3,6141cf11b842c2e72e2f2d48,1.8068,0.65625,0.333333,0.010417,0.0,192.0,2048.0
4,6141cf11ae4fb853db2e3f14,0.36,0.670157,0.329843,0.0,0.0,191.0,2024.0


In [38]:
oof = pd.read_csv('results/exp_12/outoffolds.csv')
preds = pd.read_csv('results/exp_12/submission_raw.csv')
oof.head()

Unnamed: 0,_id,outoffolds
0,6141cf0f51c1cbd9654b8870,1.083449
1,6141cf1051c1cbd9654b8872,1.107333
2,6141cf11cc0e69a0cf28ab35,1.14562
3,6141cf11b842c2e72e2f2d48,1.810341
4,6141cf11ae4fb853db2e3f14,0.359406


In [39]:
def mae(approx, target):
    return np.mean(np.abs(target - approx))

In [41]:
valid_fold

Unnamed: 0,id,band_gap,ratio16,ratio42,ratio34,ratio74,atom_count,link_count
2,6141cf11cc0e69a0cf28ab35,1.1484,0.659686,0.335079,0.005236,0.000000,191.0,2028.0
7,6141cf12cc0e69a0cf28ab39,1.1477,0.659686,0.335079,0.005236,0.000000,191.0,2028.0
10,6141cf1302d926221cabc547,0.3532,0.668421,0.331579,0.000000,0.000000,190.0,2004.0
20,6141cf16a05be5973bd77a67,1.1384,0.664921,0.329843,0.000000,0.005236,191.0,2028.0
25,6141cf173ac25c70a5c6c83d,0.2694,0.666667,0.333333,0.000000,0.000000,189.0,1984.0
...,...,...,...,...,...,...,...,...
2950,6145edc43ac25c70a5c6cdcf,0.3474,0.666667,0.333333,0.000000,0.000000,189.0,1984.0
2953,61463ed331cf3ef3d4a9f836,0.3591,0.663158,0.331579,0.005263,0.000000,190.0,2004.0
2957,614688ce31cf3ef3d4a9f83a,0.4016,0.663158,0.331579,0.005263,0.000000,190.0,2004.0
2964,6146ecdb3ac25c70a5c6cdef,0.3506,0.666667,0.333333,0.000000,0.000000,189.0,1984.0


In [42]:
splitter = KFold(n_splits=5, shuffle=True, random_state=2022)
fold_iter = list(splitter.split(X=train, y=train['band_gap']))

stack_predictions = preds.copy()
for fold, (train_idx, valid_idx) in enumerate(fold_iter):
    train_fold, valid_fold = train.iloc[train_idx], train.iloc[valid_idx]
    oof_fold = oof.iloc[valid_idx]
    valid_fold['oof'] = oof_fold['outoffolds']
    test_fold = test.merge(preds[['id', f'fold_{fold}']], on='id', how='left').drop(['id', 'band_gap'], axis=1)
    print(f'===== FOLD {fold} =====')
    print(f'Baseline MAE: {mae(valid_fold["oof"], valid_fold["band_gap"])}')

    params = {
        'objective': 'regression_l1',
        'metric': 'l1',
        'verbose': -1
    }
    fit_params = {
        'num_boost_round': 100,
        'early_stopping_rounds': 10,
        'verbose_eval': 100,
    }
    stack_model = Trainer(LGBMRegressor)
    stack_model.train(
        train_data=[valid_fold.drop(['band_gap', 'id'], axis=1), valid_fold['band_gap']],
        valid_data=[valid_fold.drop(['band_gap', 'id'], axis=1), valid_fold['band_gap']],
        params=params, fit_params=fit_params
    )
    stack_predictions[f'fold_{fold}'] = stack_model.predict(test_fold)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


===== FOLD 0 =====
Baseline MAE: 0.015191011949902031
Logger created at 22/02/09:14:32:31
Training until validation scores don't improve for 10 rounds
[100]	training's l1: 0.0120277	valid_1's l1: 0.0120277
Did not meet early stopping. Best iteration is:
[100]	training's l1: 0.0120277	valid_1's l1: 0.0120277
===== FOLD 1 =====
Baseline MAE: 0.0127920620991448
Logger created at 22/02/09:14:32:31
Training until validation scores don't improve for 10 rounds
[100]	training's l1: 0.00988667	valid_1's l1: 0.00988667
Did not meet early stopping. Best iteration is:
[100]	training's l1: 0.00988667	valid_1's l1: 0.00988667
===== FOLD 2 =====
Baseline MAE: 0.011641855395624253
Logger created at 22/02/09:14:32:31
Training until validation scores don't improve for 10 rounds
[100]	training's l1: 0.00928264	valid_1's l1: 0.00928264
Did not meet early stopping. Best iteration is:
[100]	training's l1: 0.00928264	valid_1's l1: 0.00928264
===== FOLD 3 =====
Baseline MAE: 0.013226091423726204
Logger create

In [45]:
stack_predictions['predictions'] = stack_predictions.drop(['id', 'predictions'], axis=1).mean(1)
stack_predictions

Unnamed: 0,id,fold_0,fold_1,fold_2,fold_3,fold_4,predictions
0,6141cf9631cf3ef3d4a9edb4,0.284799,0.277344,0.278645,0.276255,0.284876,0.280384
1,6141d2fd9cbada84a8676921,1.807479,1.808629,1.807318,1.809040,1.808409,1.808175
2,6142341931cf3ef3d4a9f3c0,0.384958,0.361651,0.398185,0.372458,0.401697,0.383790
3,6142199f4e27a1844a5f05fa,1.138767,1.146873,1.148112,1.140968,1.150239,1.144992
4,6141d441ee0a3fd43fb47b65,0.357428,0.409063,0.366523,0.361035,0.355343,0.369878
...,...,...,...,...,...,...,...
2962,6142615131cf3ef3d4a9f4b8,0.380492,0.413478,0.369151,0.384473,0.400098,0.389538
2963,6141f4354e27a1844a5f046c,1.146824,1.146489,1.141585,1.140132,1.142058,1.143418
2964,6141e31a4e27a1844a5f0346,1.145217,1.143735,1.146044,1.143696,1.142101,1.144158
2965,61429f0531cf3ef3d4a9f5b4,0.361714,0.382903,0.414262,0.382455,0.400801,0.388427


In [46]:
stack_predictions[['id', 'predictions']].to_csv('results/exp_12/stack_submission.csv', index=False)