#### Import packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import cv2

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import xgboost as xgb

import operator
import pickle
np.random.seed(42)

from utils import *



#### Load and Prepare Data

In [27]:
data = pd.read_json('./Data/train.json')
test = pd.read_json('./Data/test.json')

In [28]:
data['band_1'] = data['band_1'].apply(lambda x : np.array(x).reshape(75, 75))
data['band_2'] = data['band_2'].apply(lambda x : np.array(x).reshape(75, 75))

test['band_1'] = test['band_1'].apply(lambda x : np.array(x).reshape(75, 75))
test['band_2'] = test['band_2'].apply(lambda x : np.array(x).reshape(75, 75))

data['inc_angle'] = pd.to_numeric(data['inc_angle'], errors='coerce').fillna(-1)
test['inc_angle'] = pd.to_numeric(test['inc_angle'], errors='coerce')

In [51]:
band_1_tr = np.concatenate([im for im in data['band_1']]).reshape(-1, 75*75)
band_2_tr = np.concatenate([im for im in data['band_2']]).reshape(-1, 75*75)
# b_av = np.divide(band_1_tr, band_2_tr, out=np.zeros_like(band_1_tr), where=band_2_tr!=0)
# b_av = band_1_tr*np.cos(train['inc_angle'])[:, None, None]
# b_av = np.ones((75,75))*(np.cos(train['inc_angle'])[:, None, None]**2)
# b_av2 = band_2_tr*np.cos(train['inc_angle'])[:, None, None]
# b_av = (band_1_tr+band_2_tr)/2
rgb = np.concatenate((band_1_tr, band_2_tr, np.array(data['inc_angle']).reshape((len(data), 1))), axis=1)
X_train = rgb

band_1_test = np.concatenate([im for im in test['band_1']]).reshape(-1, 75*75)
band_2_test = np.concatenate([im for im in test['band_2']]).reshape(-1, 75*75)
# b = np.divide(band_1_test, band_2_test, out=np.zeros_like(band_1_test), where=band_2_test!=0)
# b_av = (band_1_test+band_2_test)/2
rgb = np.concatenate((band_1_test, band_2_test, np.array(test['inc_angle']).reshape((len(test), 1))), axis=1)
X_test = rgb

y_train = data['is_iceberg'].values.astype(np.float32)

In [None]:
# idx = 2
# f, (ax1, ax2, ax3) = plt.subplots(1,3,figsize=(10,5))
# plt.suptitle('Is Iceberg : '+str(y_train[idx])+'\nIncidence Angle : '+str(train['inc_angle'][idx]),
#              fontweight="bold")
# ax1.imshow(X_train[0][idx][0,:,:], cmap='gray')
# ax1.set_title('Band 1')
# ax2.imshow(band_2_tr[idx], cmap='gray')
# ax2.set_title('Band 2')
# ax3.imshow(np.moveaxis(X_train[0][idx], 0, 2))
# ax3.set_title('Color Composite')
# plt.show()

#### XGBoost 

In [52]:
X = X_train
y = y_train
x_test = X_test

In [85]:
xgbscores = []

# Set xgb parameters
params = {}
params['objective'] = 'binary:logistic'
# params['booster'] = 'dart'
params['eta'] = 0.02
params['silent'] = True
# params['eval_metric'] = 'ndcg@20000'
params['max_depth'] = 7
params['subsample'] = 0.9
params['colsample_bytree'] = 0.5
params['colsample_bylevel'] = 0.5
params['seed'] = 99
params['gamma'] = 14
params['eval_metric'] = 'logloss'
# params['num_class'] = 2
# params['lambda'] = 10
# params['alpha'] = 1
# params['scale_pos_weight'] = 26.43671061122891

In [86]:
# Create a submission file
sub = pd.DataFrame()
sub['id'] = test['id']
sub['target'] = np.zeros_like(test['id'])

kfold = 5
weights = []
feature_imps = []
sss = KFold(n_splits=kfold, random_state=0)
for i, (train_index, test_index) in enumerate(sss.split(X, y)):
    
    print('[Fold %d/%d]' % (i + 1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    # Convert our data into LGBoost format
    d_train = xgb.DMatrix(X_train, y_train, missing=-1)
    d_valid = xgb.DMatrix(X_valid, y_valid, missing=-1)
#     d_valid_gini = xgb.DMatrix(X_valid)
#     d_valid_2 = xgb.DMatrix(X_val, y_val)
    d_test = xgb.DMatrix(x_test, missing=-1)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    # Train the model! We pass in a max of 2,000 rounds (with early stopping after 100)
    # and the custom metric (maximize=True tells xgb that higher metric is better)
    mdl = xgb.train(params, d_train, 5000, watchlist, early_stopping_rounds=170, verbose_eval=100)

    print('[Fold %d/%d Prediction:]' % (i + 1, kfold))
    feature_imps.append(mdl.get_fscore())
    # Predict on our test data
    p_test = mdl.predict(d_test)
    print(mdl.best_score)
#     val_gini = mdl.predict(d_valid_gini)
#     print(gini_normalized(y_valid, val_gini))
#     print(gini_normalized(y_val, mdl.predict(d_valid_2)))
    weight = mdl.best_score
    weights.append(weight)
    sub['target'] += p_test*weight

sub['target'] = sub['target']/np.sum(weights)

[Fold 1/5]
[0]	train-mlogloss:0.687135	valid-mlogloss:0.68847
Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.

Will train until valid-mlogloss hasn't improved in 170 rounds.
[100]	train-mlogloss:0.44421	valid-mlogloss:0.507937
[200]	train-mlogloss:0.422703	valid-mlogloss:0.493123
[300]	train-mlogloss:0.416039	valid-mlogloss:0.489236
[400]	train-mlogloss:0.414716	valid-mlogloss:0.488257


KeyboardInterrupt: 

In [64]:
sub.columns = ['id', 'is_iceberg']

In [82]:
np.mean(weights)

0.17767320000000003

In [65]:
sub.to_csv('./Submissions/v30Oct_XGBoost_1608.csv', index=False)

In [66]:
sum(np.array(sub['is_iceberg'])>0.5)/len(sub)

0.27516619183285851

In [67]:
sub

Unnamed: 0,id,is_iceberg
0,5941774d,0.460345
1,4023181e,0.361958
2,b20200e4,0.266124
3,e7f018bb,0.665612
4,4371c8c3,0.538084
5,a8d9b1fd,0.0911877
6,29e7727e,0.3887
7,92a51ffb,0.733291
8,c769ac97,0.324631
9,aee0547d,0.0607445


In [247]:
# Predict
iceNet.eval()
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)
results = []
for features, features_angle in test_loader:
    iceNet.eval()
    features = Variable(features, volatile=True).cuda()
    features_angle = Variable(features_angle, volatile=True).cuda()
    outputs = F.softmax(iceNet(features, features_angle))
    results.append(outputs.data[0][1])

In [248]:
results[:50]

[4.974951934855198e-07,
 0.029968729242682457,
 3.8373966049221053e-07,
 0.007663973607122898,
 0.8822445869445801,
 0.0792531967163086,
 0.104781374335289,
 0.000896515091881156,
 1.3228373063611798e-06,
 4.719344815384829e-06,
 3.1641402831183996e-09,
 0.1379968374967575,
 1.0460860721650533e-05,
 0.3653469979763031,
 0.15629898011684418,
 0.02273099310696125,
 0.11287692934274673,
 5.7135479437420145e-05,
 0.09061847627162933,
 0.9990752935409546,
 0.017497895285487175,
 0.0008137716795317829,
 0.6351602077484131,
 0.010825518518686295,
 5.440843331427914e-10,
 1.2547709502541693e-06,
 0.008549781516194344,
 0.5478723049163818,
 0.6568987369537354,
 0.18993176519870758,
 2.5820247273600216e-08,
 0.9969714879989624,
 0.6639015078544617,
 0.15789631009101868,
 0.07133708894252777,
 2.446020153001882e-05,
 2.094129376928322e-05,
 0.9889333248138428,
 0.0010424683568999171,
 0.998264491558075,
 5.059077352598251e-07,
 0.9476674199104309,
 0.9849733114242554,
 0.0020589660853147507,
 0.9

In [170]:
for idx, val in X_test):
    print(idx, val.shape)

0 (8424, 2, 75, 75)
1 (8424, 1)


In [155]:
sub = pd.read_csv('./Data/sample_submission.csv')

In [156]:
sub['is_iceberg'] = results

In [161]:
sub.iloc[3287,:]

id               378606db
is_iceberg    1.12818e-06
Name: 3287, dtype: object

In [162]:
test.iloc[3287,:]

band_1       [[-24.0210189819, -24.0210189819, -22.32080268...
band_2       [[-25.8426780701, -25.8426780701, -21.23370170...
id                                                    378606db
inc_angle                                              44.0226
Name: 3287, dtype: object

In [157]:
sub.to_csv('./Submissions/sub_30Oct_val_1729.csv', index=False)