In [1]:
import re
import numpy as np
import pandas as pd
import feather
import xgboost as xgb
import feather

from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, confusion_matrix, f1_score, accuracy_score

from utils import print_confusion_matrix

In [2]:
def get_wt(cls, df, counts=None):
    """Return the negative-to-positive instance ratio for class ``cls``.

    Parameters
    ----------
    cls : hashable
        Key of the class to look up in the per-class count table.
    df : object exposing ``.shape``
        Only ``df.shape[0]`` (the total number of instances) is read.
    counts : mapping, optional
        Per-class positive-instance counts, indexed by ``cls``. When omitted,
        the module-level name ``p`` is used, exactly as before.
        NOTE(review): ``p`` is not defined in any visible cell, so the legacy
        path only works if it was created elsewhere in the kernel; prefer
        passing ``counts`` explicitly.

    Returns
    -------
    float
        ``(total - positives) / positives``.
    """
    if counts is None:
        # Legacy fallback: relies on a global that must already exist.
        counts = p
    tot_pos_instances = counts[cls]
    tot_neg_instances = df.shape[0] - tot_pos_instances
    return float(tot_neg_instances / tot_pos_instances)

In [3]:
def get_wt2(cls, tot, counts=None):
    """Return the negative-to-positive instance ratio for class ``cls``.

    Parameters
    ----------
    cls : hashable
        Key of the class to look up in the per-class count table.
    tot : number
        Total number of instances (positives plus negatives).
    counts : mapping, optional
        Per-class positive-instance counts, indexed by ``cls``. When omitted,
        the module-level name ``p`` is used, exactly as before.
        NOTE(review): ``p`` is not defined in any visible cell, so the legacy
        path only works if it was created elsewhere in the kernel; prefer
        passing ``counts`` explicitly.

    Returns
    -------
    float
        ``(tot - positives) / positives``.
    """
    if counts is None:
        # Legacy fallback: relies on a global that must already exist.
        counts = p
    tot_pos_instances = counts[cls]
    tot_neg_instances = tot - tot_pos_instances
    return float(tot_neg_instances / tot_pos_instances)

In [4]:
# Stage-2 training feature matrix. np.load returns a NumPy array, so despite
# the `df_` prefix this is not a pandas DataFrame.
df_train = np.load('../cache/train_stage2_fe2.npy')

In [5]:
# Sanity check on the training matrix dimensions (rows are later split together with `y`).
df_train.shape

(3689, 4689)

In [6]:
# Stage-2 test feature matrix (NumPy array); scored by every trained model below.
df_test = np.load('../cache/test_stage2_fe2.npy')

In [7]:
# Sanity check: the column count should match the training matrix.
df_test.shape

(986, 4689)

In [8]:
# Test-set identifier table; only its ID column is used (attached to submission files).
df = pd.read_csv('../cache/stage2_test_id.csv')

In [9]:
# IDs of the test rows, assigned as the 'ID' column of each submission DataFrame.
pid = df.ID

In [10]:
# Training labels table; the label itself is read from its 'y' column.
df1 = pd.read_csv('../cache/stage2_labels.csv')

In [11]:
# Peek at the first rows to confirm the layout of the labels file.
df1.head()

Unnamed: 0,y
0,1
1,2
2,2
3,3
4,4


In [12]:
# Raw class labels as a NumPy array (the file stores them starting at 1; they are shifted later).
y = df1['y'].values

In [13]:
# Shift labels to zero-based class ids (fix for zero-bound array indexing).
# Computed straight from df1 rather than from `y` itself, so re-running this
# cell can never shift the labels a second time.
y = df1['y'].values - 1

In [14]:
# Inspect the shifted labels (should now start at 0).
y

array([0, 1, 1, ..., 5, 3, 0])

In [15]:
# Training weights array, presumably one weight per training row -- TODO confirm.
# NOTE(review): `wts` is not referenced by any later cell visible in this notebook.
wts = np.load('../cache/stage2_train_weights.npy')

In [16]:
# Check the length of the weights array against the number of training rows.
wts.shape

(3689,)

In [17]:
# Per-class weights, stored on disk as a pickled dict wrapped in a 0-d object
# array. NumPy >= 1.16.3 refuses to unpickle object arrays unless
# allow_pickle=True is passed, so request it explicitly.
# SECURITY: unpickling executes arbitrary code -- only load cache files that
# this project produced itself.
wts_per_class = np.load('../cache/stage2_train_weights_per_class.npy', allow_pickle=True)
# .item() unwraps the 0-d object array into the underlying dict
# (keys are 1-based class ids, values are the class weights).
wts_per_class = wts_per_class.item()

In [18]:
# Show the class-id -> weight mapping that drives the per-sample weights in training.
print(wts_per_class)

{1: 4.572507552870091, 2: 6.407630522088353, 3: 37.427083333333336, 4: 3.912117177097204, 5: 12.816479400749063, 6: 11.42087542087542, 7: 2.5, 8: 174.66666666666666, 9: 84.79069767441861}


In [19]:
# Spot-check a single lookup; the mapping is keyed by 1-based class id.
wts_per_class[2]

6.407630522088353

In [20]:
 

# Bagged XGBoost over repeated random 80/20 splits. The "folds" are independent
# re-splits seeded by the round index, not disjoint k-fold partitions.
# Every round trains a class-weighted model, prints hold-out metrics, writes a
# per-round submission file and adds its test-set probabilities to `preds`.
# `preds` (running sum) and `denom` (round count) stay in scope so the
# predictions can be averaged afterwards.
N_CLASSES = 9
class_ids = list(range(N_CLASSES))
class_columns = ['class' + str(c + 1) for c in class_ids]

# Hyper-parameters shared by every round; only the seed changes per round.
base_params = {
    'eta': 0.03333,
    'max_depth': 6,
    'subsample' : 0.8,
    'colsample_bytree':0.8,
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'num_class': N_CLASSES,
    'tree_method': 'gpu_hist',
    'silent': True
}

# The test matrix is identical in every round, so build it once.
dtest = xgb.DMatrix(df_test)

preds = None  # running sum of test-set class probabilities
denom = 0     # number of rounds accumulated into `preds`
fold = 10
for i in range(fold):
    params = dict(base_params, seed=i)
    x1, x2, y1, y2 = train_test_split(df_train, y, test_size=0.2, random_state=i)

    # Labels in `y` are zero-based while the weight table is keyed from 1.
    w1 = [wts_per_class[j+1] for j in y1]
    w2 = [wts_per_class[j+1] for j in y2]

    # Build each DMatrix once and reuse it for both training and evaluation.
    dtrain = xgb.DMatrix(x1, y1, weight=w1)
    dvalid = xgb.DMatrix(x2, y2, weight=w2)
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
    model = xgb.train(params, dtrain, 1000, watchlist,
                      verbose_eval=50, early_stopping_rounds=100)

    # Prediction ignores labels and weights, so the validation DMatrix is reused.
    pred_val = model.predict(dvalid, ntree_limit=model.best_ntree_limit)
    pred_val_labels = pred_val.argmax(axis=1)

    # This log-loss is computed without sample weights, so it is not directly
    # comparable to the weighted valid-mlogloss in the training log.
    score1 = log_loss(y2, pred_val, labels=class_ids)

    print('fold = {:d}'.format(i))
    print('val multi_log_loss: {}'.format(score1))

    fscore = f1_score(y2, pred_val_labels, labels=class_ids, average='macro')
    print('val f1_score: {}'.format(fscore))

    acc = accuracy_score(y2, pred_val_labels)
    print('val accuracy: {}'.format(acc))

    print(confusion_matrix(y2, pred_val_labels, labels=class_ids))

    print('-------------------')
    print('\n\n')

    # NOTE(review): test predictions use 80 more trees than the early-stopping
    # optimum; presumably a deliberate choice -- confirm before changing.
    pred = model.predict(dtest, ntree_limit=model.best_ntree_limit+80)
    if preds is None:
        preds = pred.copy()
    else:
        preds += pred
    denom += 1

    # Per-round submission containing this round's predictions only.
    submission = pd.DataFrame(pred, columns=class_columns)
    submission['ID'] = pid
    submission.to_csv('../submissions/sub5_0_stage2_xgb_fold_'  + str(i) + '.csv', index=False)


[0]	train-mlogloss:2.11353	valid-mlogloss:2.14249
Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.

Will train until valid-mlogloss hasn't improved in 100 rounds.
[50]	train-mlogloss:0.64771	valid-mlogloss:1.2431
[100]	train-mlogloss:0.304906	valid-mlogloss:1.13823
[150]	train-mlogloss:0.170686	valid-mlogloss:1.17548
Stopping. Best iteration:
[96]	train-mlogloss:0.32087	valid-mlogloss:1.13618

fold = 0
val multi_log_loss: 0.9857176823950395
val f1_score: 0.6555427505688386
val accuracy: 0.6842818428184282
[[ 88   3   0  15  10   9   5   1   0]
 [  0  62   2   1   2   0  28   0   0]
 [  0   0  15   3   2   0   5   0   0]
 [ 30   2   7 118   9   2   4   0   0]
 [  7   3   1   4  35   3   2   0   0]
 [  5   3   1   1   4  33   5   0   0]
 [  2  24  12   4   5   2 144   0   0]
 [  0   0   0   0   0   0   0   3   2]
 [  0   1   0   1   1   0   0   0   7]]
-------------------



[0]	train-mlogloss:2.11211	valid-mlogloss:2.14689
Multiple eval metrics h

  'precision', 'predicted', average, warn_for)


[0]	train-mlogloss:2.11262	valid-mlogloss:2.14416
Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.

Will train until valid-mlogloss hasn't improved in 100 rounds.
[50]	train-mlogloss:0.639682	valid-mlogloss:1.34389
[100]	train-mlogloss:0.303972	valid-mlogloss:1.26214
[150]	train-mlogloss:0.171396	valid-mlogloss:1.30762
Stopping. Best iteration:
[91]	train-mlogloss:0.341781	valid-mlogloss:1.25775

fold = 8
val multi_log_loss: 0.9932491372372239
val f1_score: 0.5926026132921262
val accuracy: 0.6693766937669376
[[ 89   2   0  19  19   5   4   0   0]
 [  3  60   1   3   4   1  32   0   0]
 [  0   0  16   3   2   0   2   0   0]
 [ 25   3   4  92   4   0   6   0   1]
 [  7   2   3   4  31   0   4   0   0]
 [  4   3   0   1   7  47   5   0   0]
 [  1  31  11   4   8   0 152   0   0]
 [  2   0   0   1   0   0   1   0   1]
 [  0   0   0   1   0   0   0   0   7]]
-------------------



[0]	train-mlogloss:2.11252	valid-mlogloss:2.16167
Multiple eval metric

In [21]:
# Final blend: average the accumulated test-set probabilities over all rounds
# and write them out as a single submission file.
# Recorded leaderboard results:
#   scored 2.82570 on stage2 private LB, 1.70018 on stage2 public LB
#   scored 2.81200 on stage2 private LB, 1.71324 on stage2 public LB (with new feature engg)
mean_probs = preds / denom
blend_columns = ['class{}'.format(k) for k in range(1, 10)]
submission = pd.DataFrame(mean_probs, columns=blend_columns)
submission['ID'] = pid
submission.to_csv('../submissions/sub5_0_stage2_all_2_2_xgb.csv', index=False)

