In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import os

In [3]:
# Start from the current working directory. If it ends with 'final-nbs', switch
# to its parent (presumably the project root) and make that the working directory.
PROJECT_DIR = os.path.abspath(os.curdir)
if PROJECT_DIR.endswith('final-nbs'):
    PROJECT_DIR = os.path.abspath(os.pardir)
    os.chdir(PROJECT_DIR)

In [4]:
import cfg
from src.data import (
    get_features_path_from_metadata,
    join_dataframe_columns,
    setup_directories,
)
from src import util

util.setup_logging()

# Directory layout rooted at cfg.DATA_DIR; later cells index it by name
# (e.g. 'raw', 'train', 'cv', 'submission').
dirs = setup_directories(cfg.DATA_DIR, create_dirs=True)

In [5]:
# Path objects for the directories looked up from `dirs`.
raw_dir, train_dir = (Path(dirs[name]) for name in ('raw', 'train'))
cv_dir = Path(dirs['cv']['final-validation'])

In [6]:
# Load the sample metadata table (indexed by sample_id) and preview the first rows.
pd_metadata = pd.read_csv(raw_dir.joinpath("metadata.csv"), index_col="sample_id")
pd_metadata.head(5)

Unnamed: 0_level_0,split,instrument_type,features_path,features_md5_hash
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
S0000,train,commercial,train_features/S0000.csv,017b9a71a702e81a828e6242aa15f049
S0001,train,commercial,train_features/S0001.csv,0d09840214054d254bd49436c6a6f315
S0002,train,commercial,train_features/S0002.csv,3f58b3c9b001bfed6ed4e4f757083e09
S0003,train,commercial,train_features/S0003.csv,e9a12f96114a2fda60b36f4c0f513fb1
S0004,train,commercial,train_features/S0004.csv,b67603d3931897bfa796ac42cc16de78


In [7]:
# Root directory under which each model's stored prediction CSVs are read below.
sub_dir = Path(dirs['submission'])

In [8]:
# Labels for the training samples, indexed by sample_id.
train_target = pd.read_csv(raw_dir.joinpath('train_labels.csv'), index_col='sample_id')

In [9]:
# Training-sample predictions stored by the 'lgbm' run.
train_lgbm = pd.read_csv(sub_dir / 'lgbm' / 'validation' / 'train.csv', index_col='sample_id')


In [10]:
# Training-sample predictions stored by the 'optimized-iteration-lgbm' run.
train_opt_lgbm = pd.read_csv(
    sub_dir / 'optimized-iteration-lgbm' / 'validation' / 'train.csv',
    index_col='sample_id',
)

In [11]:
# Training-sample predictions stored by the 'svm' run.
train_svm = pd.read_csv(sub_dir / 'svm' / 'validation' / 'train.csv', index_col='sample_id')

In [12]:
# Training-sample predictions keyed by model name.
train_preds = {
    'lgbm': train_lgbm,
    'svm': train_svm,
    'lgbm_opt': train_opt_lgbm,
}

In [13]:
from src.metrics import compute_scores

In [14]:
# Labels for the validation samples, indexed by sample_id.
valid_target = pd.read_csv(raw_dir.joinpath('val_labels.csv'), index_col='sample_id')

In [15]:
# 'lgbm' predictions, restricted and ordered to the labelled validation samples.
valid_lgbm = (
    pd.read_csv(sub_dir / 'lgbm' / 'validation' / 'submission.csv', index_col='sample_id')
    .loc[valid_target.index]
)

In [16]:
# 'svm' predictions, restricted and ordered to the labelled validation samples.
valid_svm = (
    pd.read_csv(sub_dir / 'svm' / 'validation' / 'submission.csv', index_col='sample_id')
    .loc[valid_target.index]
)

In [17]:
# 'optimized-iteration-lgbm' predictions, restricted and ordered to the labelled
# validation samples.
valid_opt_lgbm = (
    pd.read_csv(
        sub_dir / 'optimized-iteration-lgbm' / 'validation' / 'submission.csv',
        index_col='sample_id',
    )
    .loc[valid_target.index]
)

In [18]:
# 'lgbm-raw-target' predictions, restricted and ordered to the labelled validation
# samples.
# NOTE(review): this frame is not referenced by any other cell visible in this
# notebook — confirm whether it should be scored/ensembled or removed.
valid_lgbm_raw_target = (
    pd.read_csv(
        sub_dir / 'lgbm-raw-target' / 'validation' / 'submission.csv',
        index_col='sample_id',
    )
    .loc[valid_target.index]
)

In [19]:
# Validation-sample predictions keyed by model name.
valid_preds = {
    'lgbm': valid_lgbm,
    'svm': valid_svm,
    'lgbm_opt': valid_opt_lgbm,
}

In [20]:
# Score every model on the training samples (labels are aligned to each
# prediction's index), one row per model, best avg_loss first.
train_scores = (
    pd.DataFrame({
        model: compute_scores(train_target.loc[pred.index], pred)
        for model, pred in train_preds.items()
    })
    .T
    .sort_values(by='avg_loss')
)

In [21]:
# Score every model on the validation samples, one row per model, best avg_loss first.
valid_scores = (
    pd.DataFrame({
        model: compute_scores(valid_target, pred)
        for model, pred in valid_preds.items()
    })
    .T
    .sort_values(by='avg_loss')
)

In [22]:
train_scores

Unnamed: 0,avg_loss,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
lgbm,0.148647,0.14816,0.10547,0.165768,0.194717,0.028971,0.155453,0.253595,0.18205,0.189172,0.063119
lgbm_opt,0.149679,0.152719,0.107892,0.168116,0.201856,0.027791,0.157189,0.256627,0.179267,0.186433,0.058901
svm,0.154184,0.147894,0.101944,0.170994,0.202152,0.028129,0.169677,0.273264,0.191609,0.190642,0.065532


In [23]:
valid_scores

Unnamed: 0,avg_loss,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
lgbm_opt,0.139441,0.12208,0.114622,0.160589,0.220167,0.018175,0.155199,0.212064,0.177936,0.169067,0.044511
lgbm,0.139668,0.119528,0.119755,0.159714,0.223741,0.021868,0.154865,0.208771,0.17892,0.165185,0.044332
svm,0.158643,0.130487,0.123351,0.170053,0.244879,0.021972,0.198306,0.258428,0.1833,0.207186,0.048464


In [24]:
# Stack the train and validation score tables, tagging each row with its split.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
joined_scores = pd.concat([
    train_scores.assign(set_type='train'),
    valid_scores.assign(set_type='valid'),
])

# Reshape to one row per model with (metric, set_type) columns so train and
# validation values sit side by side. `pivot` takes index/columns as keywords;
# positional use is rejected by pandas >= 2.0.
joined_scores = (
    joined_scores
    .rename_axis('model')
    .reset_index()
    .pivot(index='model', columns='set_type')
)

In [25]:
joined_scores.T

Unnamed: 0_level_0,model,lgbm,lgbm_opt,svm
Unnamed: 0_level_1,set_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
avg_loss,train,0.148647,0.149679,0.154184
avg_loss,valid,0.139668,0.139441,0.158643
basalt,train,0.14816,0.152719,0.147894
basalt,valid,0.119528,0.12208,0.130487
carbonate,train,0.10547,0.107892,0.101944
carbonate,valid,0.119755,0.114622,0.123351
chloride,train,0.165768,0.168116,0.170994
chloride,valid,0.159714,0.160589,0.170053
iron_oxide,train,0.194717,0.201856,0.202152
iron_oxide,valid,0.223741,0.220167,0.244879


The optimized model overfits too much, so let's drop it.

In [26]:
train_scores

Unnamed: 0,avg_loss,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
lgbm,0.148647,0.14816,0.10547,0.165768,0.194717,0.028971,0.155453,0.253595,0.18205,0.189172,0.063119
lgbm_opt,0.149679,0.152719,0.107892,0.168116,0.201856,0.027791,0.157189,0.256627,0.179267,0.186433,0.058901
svm,0.154184,0.147894,0.101944,0.170994,0.202152,0.028129,0.169677,0.273264,0.191609,0.190642,0.065532


In [27]:
def ensemble_prediction(preds, weights):
    """Blend several prediction sets into a single weighted sum.

    Args:
        preds: Sequence of equally shaped prediction containers that support
            ``*`` and ``+`` (e.g. pandas DataFrames or NumPy arrays).
        weights: Sequence with one multiplier per entry of ``preds``, in the
            same order. Weights are used as given; they are not normalised.

    Returns:
        ``preds[0] * weights[0] + preds[1] * weights[1] + ...`` computed with
        the inputs' own arithmetic, so DataFrame inputs yield a DataFrame that
        keeps its index and column labels. The inputs are not modified.

    Raises:
        ValueError: If ``preds`` is empty or ``preds`` and ``weights`` differ
            in length.
    """
    if len(preds) == 0:
        raise ValueError('ensemble_prediction needs at least one prediction set')
    if len(preds) != len(weights):
        raise ValueError(
            f'got {len(preds)} prediction sets but {len(weights)} weights'
        )

    # Accumulate from the first weighted prediction instead of an ndarray of
    # zeros: an in-place `ndarray += DataFrame` hands back the bare ndarray on
    # current pandas, which would drop the sample index and column names.
    yhat = preds[0] * weights[0]
    for pred, weight in zip(preds[1:], weights[1:]):
        yhat = yhat + pred * weight

    return yhat

In [28]:
# Candidate blends: name -> weights, listed in the order [lgbm, lgbm_opt, svm]
# used when the member predictions are passed to ensemble_prediction.
ensembles = dict(
    lgbm_45_lgbm_opt_45_svm_10=[0.45, 0.45, 0.1],
    lgbm_40_lgbm_opt_40_svm_20=[0.4, 0.4, 0.2],
    lgbm_70_svm_30=[0.7, 0., 0.3],
)

In [29]:
# Add every blend to the train and validation prediction dicts. Member order
# (lgbm, lgbm_opt, svm) has to match the weight order in `ensembles`.
for ensemble_name, weights in ensembles.items():
    train_preds[ensemble_name] = ensemble_prediction(
        [train_lgbm, train_opt_lgbm, train_svm],
        weights,
    )
    valid_preds[ensemble_name] = ensemble_prediction(
        [valid_lgbm, valid_opt_lgbm, valid_svm],
        weights,
    )

In [30]:
# Re-score on the training samples now that train_preds also holds the blends;
# one row per model/ensemble, best avg_loss first.
train_scores = (
    pd.DataFrame({
        model: compute_scores(train_target.loc[pred.index], pred)
        for model, pred in train_preds.items()
    })
    .T
    .sort_values(by='avg_loss')
)

In [31]:
# Re-score on the validation samples now that valid_preds also holds the blends;
# one row per model/ensemble, best avg_loss first.
valid_scores = (
    pd.DataFrame({
        model: compute_scores(valid_target, pred)
        for model, pred in valid_preds.items()
    })
    .T
    .sort_values(by='avg_loss')
)

In [32]:
train_scores

Unnamed: 0,avg_loss,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
lgbm_70_svm_30,0.141015,0.140186,0.09732,0.156298,0.182813,0.027061,0.150257,0.246832,0.169664,0.180741,0.058979
lgbm_40_lgbm_opt_40_svm_20,0.141288,0.142722,0.098602,0.157287,0.185224,0.026788,0.150388,0.245847,0.168312,0.179997,0.057717
lgbm_45_lgbm_opt_45_svm_10,0.142797,0.144455,0.100743,0.159298,0.187978,0.027151,0.151137,0.247089,0.170506,0.181651,0.057966
lgbm,0.148647,0.14816,0.10547,0.165768,0.194717,0.028971,0.155453,0.253595,0.18205,0.189172,0.063119
lgbm_opt,0.149679,0.152719,0.107892,0.168116,0.201856,0.027791,0.157189,0.256627,0.179267,0.186433,0.058901
svm,0.154184,0.147894,0.101944,0.170994,0.202152,0.028129,0.169677,0.273264,0.191609,0.190642,0.065532


In [33]:
valid_scores

Unnamed: 0,avg_loss,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
lgbm_45_lgbm_opt_45_svm_10,0.138635,0.119516,0.116523,0.15785,0.217738,0.019959,0.157223,0.21285,0.171576,0.169129,0.043986
lgbm_40_lgbm_opt_40_svm_20,0.139018,0.119362,0.116325,0.157078,0.216469,0.020025,0.160121,0.216147,0.168748,0.17184,0.044066
lgbm_opt,0.139441,0.12208,0.114622,0.160589,0.220167,0.018175,0.155199,0.212064,0.177936,0.169067,0.044511
lgbm,0.139668,0.119528,0.119755,0.159714,0.223741,0.021868,0.154865,0.208771,0.17892,0.165185,0.044332
lgbm_70_svm_30,0.140098,0.118466,0.118278,0.156976,0.217388,0.021443,0.163817,0.219003,0.167865,0.173325,0.04442
svm,0.158643,0.130487,0.123351,0.170053,0.244879,0.021972,0.198306,0.258428,0.1833,0.207186,0.048464


In [34]:
# Full prediction files of the three base models, used as the ensemble inputs
# for the exported CSVs.
# NOTE(review): these are the same 'validation/submission.csv' files read for
# the valid_* frames, here without the `.loc[valid_target.index]` restriction —
# confirm this is the intended source for the final predictions.
test_lgbm = pd.read_csv(
    sub_dir / 'lgbm' / 'validation' / 'submission.csv',
    index_col='sample_id',
)

test_svm = pd.read_csv(
    sub_dir / 'svm' / 'validation' / 'submission.csv',
    index_col='sample_id',
)

test_opt_lgbm = pd.read_csv(
    sub_dir / 'optimized-iteration-lgbm' / 'validation' / 'submission.csv',
    index_col='sample_id',
)

In [35]:
# All three prediction files must contain the same number of rows.
# A set of lengths is used because np.unique(map(...)) wraps the lazy map
# object itself and therefore always reports exactly one "unique" value.
assert len({len(frame) for frame in (test_lgbm, test_svm, test_opt_lgbm)}) == 1, \
    'test prediction files differ in length'

# The sample ids must match element-wise and in order, otherwise the weighted
# sum would mix rows. Index.equals returns False on any mismatch (including a
# length mismatch) instead of raising.
assert test_lgbm.index.equals(test_svm.index), \
    'lgbm and svm predictions have different sample ids'
assert test_lgbm.index.equals(test_opt_lgbm.index), \
    'lgbm and optimized lgbm predictions have different sample ids'

In [36]:
# Full-file predictions keyed by model name; the blends are added to this dict next.
test_preds = {
    'lgbm': test_lgbm,
    'svm': test_svm,
    'lgbm_opt': test_opt_lgbm,
}

In [37]:
# Add every blend to test_preds. Member order (lgbm, lgbm_opt, svm) has to
# match the weight order in `ensembles`.
for ensemble_name, weights in ensembles.items():
    test_preds[ensemble_name] = ensemble_prediction(
        [test_lgbm, test_opt_lgbm, test_svm],
        weights,
    )

In [38]:
ensembles

{'lgbm_45_lgbm_opt_45_svm_10': [0.45, 0.45, 0.1],
 'lgbm_40_lgbm_opt_40_svm_20': [0.4, 0.4, 0.2],
 'lgbm_70_svm_30': [0.7, 0.0, 0.3]}

In [39]:
# Output folder for the blended predictions; created (with parents) if absent.
ensemble_dir = sub_dir / 'ensembles-no-validation'
ensemble_dir.mkdir(parents=True, exist_ok=True)

In [40]:
# Write one CSV per blend, named after the blend, keeping the index as a column.
for ensemble_name in ensembles:
    ensemble_pred = test_preds[ensemble_name]
    ensemble_pred.to_csv(ensemble_dir.joinpath(f'{ensemble_name}.csv'), index=True)

In [41]:
!ls {ensemble_dir}

lgbm_40_lgbm_opt_40_svm_20.csv	lgbm_70_svm_30.csv
lgbm_45_lgbm_opt_45_svm_10.csv
