In [1]:
import re
import numpy as np
import pandas as pd
import feather
import xgboost as xgb
import feather

from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, confusion_matrix, f1_score, accuracy_score

from utils import print_confusion_matrix

In [2]:
def get_wt(cls, df, class_counts=None):
    """Return the negative-to-positive instance ratio for one class.

    Parameters
    ----------
    cls : hashable
        Key of the class whose weight is wanted.
    df : object exposing ``.shape``
        Data set whose first dimension (``df.shape[0]``) is the total
        number of instances.
    class_counts : mapping, optional
        Maps a class key to its number of positive instances. When omitted,
        the notebook-level global ``p`` is used, as in the original code.
        NOTE(review): ``p`` is not defined in any visible cell, so passing
        ``class_counts`` explicitly is the only way to call this on a fresh
        kernel.

    Returns
    -------
    float
        ``(total - positives) / positives`` for the requested class.
    """
    # Fall back to the legacy global only when no mapping is supplied, so
    # existing two-argument calls keep working unchanged.
    counts = p if class_counts is None else class_counts
    tot_pos_instances = counts[cls]
    tot_neg_instances = df.shape[0] - tot_pos_instances
    return float(tot_neg_instances / tot_pos_instances)

In [3]:
def get_wt2(cls, tot, class_counts=None):
    """Return the negative-to-positive instance ratio for one class.

    Parameters
    ----------
    cls : hashable
        Key of the class whose weight is wanted.
    tot : number
        Total number of instances in the data set.
    class_counts : mapping, optional
        Maps a class key to its number of positive instances. When omitted,
        the notebook-level global ``p`` is used, as in the original code.
        NOTE(review): ``p`` is not defined in any visible cell, so passing
        ``class_counts`` explicitly is the only way to call this on a fresh
        kernel.

    Returns
    -------
    float
        ``(tot - positives) / positives`` for the requested class.
    """
    # Fall back to the legacy global only when no mapping is supplied, so
    # existing two-argument calls keep working unchanged.
    counts = p if class_counts is None else class_counts
    tot_pos_instances = counts[cls]
    tot_neg_instances = tot - tot_pos_instances
    return float(tot_neg_instances / tot_pos_instances)

In [4]:
# Load the cached training feature matrix. Despite the df_ prefix this is a
# NumPy array (np.load), not a DataFrame.
df_train = np.load('../cache/train_stage2_fe2.npy')

In [5]:
# Show the dimensions of the training matrix.
df_train.shape

(22044, 3318)

In [6]:
# Load the cached test feature matrix (also a NumPy array, not a DataFrame).
df_test = np.load('../cache/test_stage2_fe2.npy')

In [7]:
# Show the dimensions of the test matrix.
df_test.shape

(986, 3318)

In [8]:
# Read the CSV holding the test-set IDs; only its ID column is used later.
df = pd.read_csv('../cache/stage2_test_id.csv')

In [9]:
# Keep the test IDs so they can be attached to every submission frame.
pid = df.ID

In [10]:
# Read the training labels; the target is taken from the 'y' column.
# NOTE(review): the file also carries an 'Unnamed: 0' column, presumably a
# saved index — confirm whether index_col=0 is wanted here.
df1 = pd.read_csv('../cache/stage2_labels.csv')

In [11]:
# Preview the first rows of the label table.
df1.head()

Unnamed: 0,y
0,1
1,2
2,2
3,2
4,2


In [12]:
# Pull the label column out as a NumPy array.
y = df1['y'].values

In [13]:
# Shift the labels down by one so they are zero-based class indices.
# Derived from df1 rather than from `y` itself so that re-running this cell
# cannot shift the labels a second time.
y = df1['y'].values - 1

In [14]:
# Inspect the shifted label array.
y

array([0, 1, 1, ..., 5, 3, 0])

In [15]:
# Load the cached training weights array.
# NOTE(review): `wts` is not used by any later visible cell; the training
# loop derives its weights from wts_per_class instead — confirm it is needed.
wts = np.load('../cache/stage2_train_weights.npy')

In [16]:
# Show the dimensions of the weights array.
wts.shape

(22044,)

In [17]:
# The .npy file wraps a pickled Python dict (class id -> weight), so it is an
# object array: np.load refuses to read those unless allow_pickle=True on
# NumPy >= 1.16.3. .tolist() then unwraps the array into the plain dict.
# NOTE: unpickling can execute arbitrary code — only load trusted cache files.
wts_per_class = np.load('../cache/stage2_train_weights_per_class.npy', allow_pickle=True).tolist()

In [18]:
# Print the per-class weight mapping.
print(wts_per_class)

{1: 5.1438127090301, 2: 7.957334416903698, 3: 25.948655256723715, 4: 2.7054967221381743, 5: 9.276923076923078, 6: 10.144590495449949, 7: 3.5839051777916406, 8: 206.96226415094338, 9: 115.02105263157895}


In [19]:
# Spot-check a single entry of the per-class weight mapping.
wts_per_class[2]

7.957334416903698

In [22]:
 

# Train one XGBoost model per random stratified 80/20 split, report
# validation metrics, accumulate the test-set probabilities in `preds`, and
# write one submission file per model.
denom = 0   # number of models whose test predictions were added to `preds`
fold = 10   # number of repeated random splits (not disjoint k-fold partitions)

class_labels = list(range(9))
class_columns = ['class'+str(c+1) for c in class_labels]

# NOTE(review): test predictions use 80 trees beyond the early-stopping
# optimum — presumably intentional; confirm.
extra_test_trees = 80

# Settings shared by every model; only the seed changes per iteration.
base_params = {
    'eta': 0.03333,
    'max_depth': 6,
    'subsample' : 0.8,
    'colsample_bytree':0.8,
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'num_class': 9,
    'tree_method': 'gpu_hist',
    'silent': True
}

# The test matrix never changes, so build its DMatrix once instead of once
# per iteration.
dtest = xgb.DMatrix(df_test)

for i in range(fold):
    params = dict(base_params, seed=i)
    x1, x2, y1, y2 = train_test_split(df_train, y, test_size=0.2, random_state=i, stratify=y, shuffle=True)

    # Labels are zero-based while wts_per_class is keyed from 1, hence j+1.
    w1 = [wts_per_class[j+1] for j in y1]
    w2 = [wts_per_class[j+1] for j in y2]

    # Build each DMatrix once and reuse it for training, evaluation and
    # prediction (labels and weights do not influence predict()).
    dtrain = xgb.DMatrix(x1, y1, weight=w1)
    dvalid = xgb.DMatrix(x2, y2, weight=w2)
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
    model = xgb.train(params, dtrain, 1000,  watchlist,
                      verbose_eval=50, early_stopping_rounds=100)
    pred_val = model.predict(dvalid, ntree_limit=model.best_ntree_limit)
    pred_val_labels = pred_val.argmax(axis=1)

    # This log loss is unweighted, whereas the watchlist's valid-mlogloss
    # uses the w2 weights, so the two figures differ.
    score1 = log_loss(y2, pred_val, labels = class_labels)

    print('fold = {:d}'.format(i))
    print('val multi_log_loss: {}'.format(score1))

    fscore = f1_score(y2, pred_val_labels, labels = class_labels, average='macro')
    print('val f1_score: {}'.format(fscore))

    acc = accuracy_score(y2, pred_val_labels)
    print('val accuracy: {}'.format(acc))

    print(confusion_matrix(y2, pred_val_labels, labels = class_labels))

    print('-------------------')
    print('\n\n')

    pred = model.predict(dtest, ntree_limit=model.best_ntree_limit+extra_test_trees)
    if denom == 0:
        # First model: start the running sum from a copy so `pred` stays intact.
        preds = pred.copy()
    else:
        preds += pred
    denom += 1

    # Per-model submission uses this model's own predictions, not the running sum.
    submission = pd.DataFrame(pred, columns=class_columns)
    submission['ID'] = pid
    submission.to_csv('../submissions/sub5_0_stage2_xgb_fold_'  + str(i) + '.csv', index=False)


[0]	train-mlogloss:2.0982	valid-mlogloss:2.10236
Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.

Will train until valid-mlogloss hasn't improved in 100 rounds.
[50]	train-mlogloss:0.5892	valid-mlogloss:0.666544
[100]	train-mlogloss:0.274805	valid-mlogloss:0.370445
[150]	train-mlogloss:0.16157	valid-mlogloss:0.27276
[200]	train-mlogloss:0.106156	valid-mlogloss:0.230782
[250]	train-mlogloss:0.069974	valid-mlogloss:0.208361
[300]	train-mlogloss:0.046394	valid-mlogloss:0.197071
[350]	train-mlogloss:0.031589	valid-mlogloss:0.193724
[400]	train-mlogloss:0.021918	valid-mlogloss:0.193179
[450]	train-mlogloss:0.01518	valid-mlogloss:0.193155
[500]	train-mlogloss:0.010525	valid-mlogloss:0.193671
Stopping. Best iteration:
[408]	train-mlogloss:0.020486	valid-mlogloss:0.192414

fold = 0
val multi_log_loss: 0.1085512381422941
val f1_score: 0.9709853189781563
val accuracy: 0.972556135178045
[[ 698    0    1    8    4    1    6    0    0]
 [   0  458    0    0

[0]	train-mlogloss:2.10013	valid-mlogloss:2.10145
Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.

Will train until valid-mlogloss hasn't improved in 100 rounds.
[50]	train-mlogloss:0.589546	valid-mlogloss:0.638028
[100]	train-mlogloss:0.274019	valid-mlogloss:0.334943
[150]	train-mlogloss:0.162013	valid-mlogloss:0.230471
[200]	train-mlogloss:0.104649	valid-mlogloss:0.180342
[250]	train-mlogloss:0.06786	valid-mlogloss:0.150139
[300]	train-mlogloss:0.045822	valid-mlogloss:0.133799
[350]	train-mlogloss:0.030743	valid-mlogloss:0.124317
[400]	train-mlogloss:0.021116	valid-mlogloss:0.120772
[450]	train-mlogloss:0.014673	valid-mlogloss:0.118902
[500]	train-mlogloss:0.010062	valid-mlogloss:0.117629
[550]	train-mlogloss:0.007056	valid-mlogloss:0.117941
[600]	train-mlogloss:0.00508	valid-mlogloss:0.11858
Stopping. Best iteration:
[518]	train-mlogloss:0.008841	valid-mlogloss:0.117071

fold = 6
val multi_log_loss: 0.11051360416093563
val f1_score: 0.968196

In [24]:
# Average the summed test probabilities over the number of accumulated models.
preds1 = preds/denom

In [25]:
# Bound every averaged probability to [0.05, 0.95]. Rows are not
# re-normalised afterwards.
# NOTE(review): presumably done to cap the log-loss penalty of confident
# mistakes — confirm the thresholds are intended.
preds1 = np.clip(preds1, a_min=0.05, a_max=0.95)

In [26]:
# Assemble the averaged submission: one probability column per class
# (class1..class9) followed by the test IDs, then write it to disk.
submission = pd.DataFrame(
    preds1,
    columns=['class%d' % k for k in range(1, 10)],
).assign(ID=pid)
submission.to_csv('../submissions/sub5_0_stage2_all_2_2_xgb.csv', index=False)
# Leaderboard history for this submission file:
# scored 2.82570 on stage2 private LB, 1.70018 on stage2 public LB
# scored 2.81200 on stage2 private LB, 1.71324 on stage2 public LB (with new feature engg)
# scored 2.46932 on stage2 private LB, 1.33261 on stage2 public LB (without stratify and shuffle in train_test_split)

