In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import os

In [3]:
# Resolve the project root. When the kernel starts inside the `final-nbs`
# notebook folder, step up one level and make the parent the working directory
# (presumably so the project-local imports in the next cell resolve).
PROJECT_DIR = os.path.abspath('.')
# Compare the last path component exactly: a plain suffix test would also
# match unrelated folders such as `old-final-nbs`.
if os.path.basename(PROJECT_DIR) == 'final-nbs':
    PROJECT_DIR = os.path.abspath('../')
    os.chdir(PROJECT_DIR)

In [4]:
import cfg
from src.data import get_features_path_from_metadata, join_dataframe_columns
from src import util
from src.data import setup_directories
# Configure logging through the project's own helper.
util.setup_logging()

# Resolve the project's data directories rooted at cfg.DATA_DIR.
# NOTE(review): create_dirs=True presumably creates missing folders — confirm
# in src.data.setup_directories. The result is indexed by name in the
# following cells ('raw', 'train', 'submission').
dirs = setup_directories(cfg.DATA_DIR, create_dirs=True)

In [5]:
# Raw-data and training directories as Path objects.
raw_dir, train_dir = (Path(dirs[key]) for key in ('raw', 'train'))
# cv_dir = Path(dirs['cv']['test'])

In [6]:
# Directory holding each model's saved predictions / submissions.
sub_dir = Path(dirs['submission'])

In [7]:
# Labels for the training samples, indexed by sample_id.
train_target = pd.read_csv(raw_dir.joinpath('train_labels.csv'), index_col='sample_id')

In [8]:
# Predictions stored in the `lgbm` run's train.csv, indexed by sample_id.
train_lgbm = pd.read_csv(sub_dir / 'lgbm' / 'test' / 'train.csv', index_col='sample_id')


In [9]:
# Predictions stored in the `optimized-iteration-lgbm` run's train.csv, indexed by sample_id.
train_opt_lgbm = pd.read_csv(
    sub_dir / 'optimized-iteration-lgbm' / 'test' / 'train.csv',
    index_col='sample_id',
)

In [10]:
# Predictions stored in the `svm` run's train.csv, indexed by sample_id.
train_svm = pd.read_csv(sub_dir / 'svm' / 'test' / 'train.csv', index_col='sample_id')

In [11]:
# Model name -> its training predictions; ensemble entries are added to this
# mapping further down.
train_preds = dict(lgbm=train_lgbm, svm=train_svm, lgbm_opt=train_opt_lgbm)

In [12]:
from src.metrics import compute_scores

In [13]:
# Labels for the validation samples, indexed by sample_id.
valid_target = pd.read_csv(raw_dir.joinpath('val_labels.csv'), index_col='sample_id')

In [14]:
# Extend the training labels with the validation labels.
# NOTE(review): presumably needed because the train.csv predictions also cover
# validation sample_ids — confirm against the model runs.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is its replacement.
train_target = pd.concat([train_target, valid_target])
# Keep a single row per sample_id so re-running this cell does not stack
# another copy of the validation labels.
train_target = train_target[~train_target.index.duplicated(keep='first')]

In [15]:
# Rows of the `lgbm` submission that have validation labels, in label order.
valid_lgbm = (
    pd.read_csv(sub_dir / 'lgbm' / 'test' / 'submission.csv', index_col='sample_id')
    .loc[valid_target.index]
)

In [16]:
# Rows of the `svm` submission that have validation labels, in label order.
valid_svm = (
    pd.read_csv(sub_dir / 'svm' / 'test' / 'submission.csv', index_col='sample_id')
    .loc[valid_target.index]
)

In [17]:
# Rows of the `optimized-iteration-lgbm` submission that have validation
# labels, in label order.
valid_opt_lgbm = (
    pd.read_csv(
        sub_dir / 'optimized-iteration-lgbm' / 'test' / 'submission.csv',
        index_col='sample_id',
    )
    .loc[valid_target.index]
)

In [18]:
# Model name -> its validation predictions; ensemble entries are added to this
# mapping further down.
valid_preds = dict(lgbm=valid_lgbm, svm=valid_svm, lgbm_opt=valid_opt_lgbm)

In [19]:
# Score each model's training predictions against the labels of the same
# sample_ids, one row per model, lowest avg_loss first.
train_scores = pd.DataFrame({
    name: compute_scores(train_target.loc[frame.index], frame)
    for name, frame in train_preds.items()
}).T.sort_values(by=['avg_loss'])

In [20]:
# Score each model's validation predictions against the validation labels,
# one row per model, lowest avg_loss first.
valid_scores = pd.DataFrame({
    name: compute_scores(valid_target, frame)
    for name, frame in valid_preds.items()
}).T.sort_values(by=['avg_loss'])

In [21]:
# Training-set scores per model, sorted by ascending avg_loss.
train_scores

Unnamed: 0,avg_loss,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
lgbm,0.123033,0.112934,0.075054,0.142676,0.171334,0.019852,0.122836,0.203351,0.165224,0.164438,0.052632
lgbm_opt,0.124146,0.116908,0.073997,0.140968,0.175924,0.017923,0.129259,0.208687,0.169079,0.161119,0.047599
svm,0.129573,0.112703,0.070896,0.142835,0.184871,0.020418,0.149817,0.220952,0.168192,0.169584,0.055462


In [22]:
# Validation-set scores per model, sorted by ascending avg_loss.
valid_scores

Unnamed: 0,avg_loss,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
lgbm,0.011356,0.009279,0.008047,0.012732,0.018075,0.003405,0.012662,0.019244,0.013331,0.012231,0.004557
lgbm_opt,0.012527,0.010552,0.008443,0.013987,0.018131,0.002112,0.014116,0.020631,0.017538,0.014867,0.004896
svm,0.020796,0.016009,0.011276,0.019781,0.029791,0.004125,0.021299,0.033165,0.037267,0.026093,0.009156


In [23]:
# Stack the train and validation scores, tagging each row with its split, then
# pivot so every metric gets one 'train' and one 'valid' column per model.
# pd.concat replaces DataFrame.append (removed in pandas 2.0), and pivot is
# called with keywords because its arguments are keyword-only since pandas 2.0.
joined_scores = pd.concat([
    train_scores.assign(set_type='train'),
    valid_scores.assign(set_type='valid'),
])

joined_scores = (
    joined_scores
    .reset_index()
    .rename(columns={'index': 'model'})
    .pivot(index='model', columns='set_type')
)

In [24]:
# Transposed view: one row per (metric, set_type) pair, one column per model.
joined_scores.T

Unnamed: 0_level_0,model,lgbm,lgbm_opt,svm
Unnamed: 0_level_1,set_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
avg_loss,train,0.123033,0.124146,0.129573
avg_loss,valid,0.011356,0.012527,0.020796
basalt,train,0.112934,0.116908,0.112703
basalt,valid,0.009279,0.010552,0.016009
carbonate,train,0.075054,0.073997,0.070896
carbonate,valid,0.008047,0.008443,0.011276
chloride,train,0.142676,0.140968,0.142835
chloride,valid,0.012732,0.013987,0.019781
iron_oxide,train,0.171334,0.175924,0.184871
iron_oxide,valid,0.018075,0.018131,0.029791


The optimized model overfits too much, so let's drop it.

In [25]:
# Training-set scores per model, sorted by ascending avg_loss.
# NOTE(review): repeats an earlier display of the same table.
train_scores

Unnamed: 0,avg_loss,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
lgbm,0.123033,0.112934,0.075054,0.142676,0.171334,0.019852,0.122836,0.203351,0.165224,0.164438,0.052632
lgbm_opt,0.124146,0.116908,0.073997,0.140968,0.175924,0.017923,0.129259,0.208687,0.169079,0.161119,0.047599
svm,0.129573,0.112703,0.070896,0.142835,0.184871,0.020418,0.149817,0.220952,0.168192,0.169584,0.055462


In [26]:
def ensemble_prediction(preds, weights):
    """Blend several prediction sets into a single weighted sum.

    Args:
        preds: Non-empty sequence of equally shaped predictions (for example
            pandas DataFrames or NumPy arrays), one entry per model.
        weights: Sequence of scalar weights, one per entry of ``preds`` and
            in the same order.

    Returns:
        ``preds[0] * weights[0] + preds[1] * weights[1] + ...``. The result
        type comes from the inputs' own arithmetic, so DataFrame inputs give
        a DataFrame that keeps its index and columns. Inputs are not modified.

    Raises:
        ValueError: If ``preds`` is empty or its length differs from the
            length of ``weights``.
    """
    if len(preds) == 0:
        raise ValueError('preds must contain at least one prediction')
    if len(preds) != len(weights):
        raise ValueError(
            f'got {len(preds)} predictions but {len(weights)} weights'
        )

    # Start from the first weighted term rather than an ndarray of zeros:
    # an in-place `ndarray += DataFrame` returns either a DataFrame or a bare
    # ndarray depending on the pandas version, which would drop the labels.
    yhat = preds[0] * weights[0]
    for i in range(1, len(preds)):
        yhat = yhat + preds[i] * weights[i]

    return yhat

In [27]:
# Candidate blends: ensemble name -> weights, listed in the order
# (lgbm, lgbm_opt, svm) in which the predictions are passed when blending.
ensembles = dict(
    lgbm_45_lgbm_opt_45_svm_10=[0.45, 0.45, 0.1],
    lgbm_40_lgbm_opt_40_svm_20=[0.4, 0.4, 0.2],
    lgbm_70_svm_30=[0.7, 0.0, 0.3],
)

In [28]:
# Add every configured blend to both prediction mappings; the model order
# (lgbm, lgbm_opt, svm) must match the order of the weights.
for ensemble_name, weights in ensembles.items():
    train_preds[ensemble_name] = ensemble_prediction(
        preds=[train_lgbm, train_opt_lgbm, train_svm],
        weights=weights,
    )
    valid_preds[ensemble_name] = ensemble_prediction(
        preds=[valid_lgbm, valid_opt_lgbm, valid_svm],
        weights=weights,
    )

In [29]:
# Re-score the training predictions now that the mapping also holds the
# ensembles; one row per model, lowest avg_loss first.
train_scores = pd.DataFrame({
    name: compute_scores(train_target.loc[frame.index], frame)
    for name, frame in train_preds.items()
}).T.sort_values(by=['avg_loss'])

In [30]:
# Re-score the validation predictions now that the mapping also holds the
# ensembles; one row per model, lowest avg_loss first.
valid_scores = pd.DataFrame({
    name: compute_scores(valid_target, frame)
    for name, frame in valid_preds.items()
}).T.sort_values(by=['avg_loss'])

In [31]:
# Training-set scores for base models and ensembles, ascending avg_loss.
train_scores

Unnamed: 0,avg_loss,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
lgbm_40_lgbm_opt_40_svm_20,0.116749,0.107533,0.068912,0.132408,0.162114,0.018329,0.122803,0.196917,0.15467,0.155729,0.048076
lgbm_70_svm_30,0.116884,0.105761,0.068958,0.132906,0.162199,0.01911,0.122942,0.1976,0.152223,0.156898,0.050247
lgbm_45_lgbm_opt_45_svm_10,0.11796,0.109354,0.070381,0.134362,0.164071,0.018414,0.122395,0.198214,0.157495,0.156963,0.047951
lgbm,0.123033,0.112934,0.075054,0.142676,0.171334,0.019852,0.122836,0.203351,0.165224,0.164438,0.052632
lgbm_opt,0.124146,0.116908,0.073997,0.140968,0.175924,0.017923,0.129259,0.208687,0.169079,0.161119,0.047599
svm,0.129573,0.112703,0.070896,0.142835,0.184871,0.020418,0.149817,0.220952,0.168192,0.169584,0.055462


In [32]:
# Validation-set scores for base models and ensembles, ascending avg_loss.
valid_scores

Unnamed: 0,avg_loss,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
lgbm,0.011356,0.009279,0.008047,0.012732,0.018075,0.003405,0.012662,0.019244,0.013331,0.012231,0.004557
lgbm_opt,0.012527,0.010552,0.008443,0.013987,0.018131,0.002112,0.014116,0.020631,0.017538,0.014867,0.004896
lgbm_45_lgbm_opt_45_svm_10,0.012769,0.010516,0.00854,0.013983,0.019171,0.002893,0.014164,0.021234,0.017241,0.014786,0.005163
lgbm_40_lgbm_opt_40_svm_20,0.013609,0.01112,0.008838,0.014615,0.020261,0.003028,0.014947,0.02254,0.019107,0.016032,0.005603
lgbm_70_svm_30,0.014046,0.01128,0.009003,0.014812,0.02135,0.003617,0.015228,0.023366,0.019521,0.016359,0.005928
svm,0.020796,0.016009,0.011276,0.019781,0.029791,0.004125,0.021299,0.033165,0.037267,0.026093,0.009156


In [33]:
# Read the test-set predictions.

In [34]:
# Load each model's full submission file, indexed by sample_id.
test_lgbm, test_svm, test_opt_lgbm = (
    pd.read_csv(sub_dir / model_dir / 'test' / 'submission.csv', index_col='sample_id')
    for model_dir in ('lgbm', 'svm', 'optimized-iteration-lgbm')
)

In [35]:
# All three submissions must have the same number of rows.
# A set of the lengths is used because np.unique() over a bare `map` object
# wraps the iterator itself into a one-element array, so that form of the
# check could never fail.
assert len({len(frame) for frame in (test_lgbm, test_svm, test_opt_lgbm)}) == 1, \
    'test submissions differ in length'

In [36]:
# The submissions must list the same sample_ids in the same order, since the
# blend combines them row by row.
assert (test_lgbm.index == test_svm.index).all()
assert (test_lgbm.index == test_opt_lgbm.index).all()

In [37]:
# Model name -> its test predictions; ensemble entries are added below.
test_preds = dict(lgbm=test_lgbm, svm=test_svm, lgbm_opt=test_opt_lgbm)

In [38]:
# Blend the test predictions with each weight set; the model order
# (lgbm, lgbm_opt, svm) must match the order of the weights.
for ensemble_name, weights in ensembles.items():
    test_preds[ensemble_name] = ensemble_prediction(
        preds=[test_lgbm, test_opt_lgbm, test_svm],
        weights=weights,
    )

In [39]:
# Weights per ensemble, in the order (lgbm, lgbm_opt, svm) used when blending.
ensembles

{'lgbm_45_lgbm_opt_45_svm_10': [0.45, 0.45, 0.1],
 'lgbm_40_lgbm_opt_40_svm_20': [0.4, 0.4, 0.2],
 'lgbm_70_svm_30': [0.7, 0.0, 0.3]}

In [40]:
# Output folder for the blended submissions; created if it does not exist.
ensemble_dir = sub_dir / 'ensembles'
ensemble_dir.mkdir(parents=True, exist_ok=True)

In [41]:
# Write one CSV per ensemble, named after the ensemble, keeping the index
# as the first column.
for ensemble_name in ensembles:
    ensemble_pred = test_preds[ensemble_name]
    ensemble_pred.to_csv(ensemble_dir.joinpath(f'{ensemble_name}.csv'), index=True)

In [42]:
# List the files now present in the ensemble output folder.
!ls {ensemble_dir}

lgbm_40_lgbm_opt_40_svm_20.csv	lgbm_70_svm_30.csv
lgbm_45_lgbm_opt_45_svm_10.csv
