#### Import packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb

from rankboost import BipartiteRankBoost



#### Read Data 

In [2]:
# Read in our input data
train = pd.read_csv('../Dataset/train/train.csv')
test = pd.read_csv('../Dataset/test/test.csv')


# This prints out (rows, columns) in each dataframe
print('Train shape:', train.shape)
print('Test shape:', test.shape)


id_train = train['id'].values
y = train.target.values
id_test = test['id'].values

Train shape: (595212, 59)
Test shape: (892816, 58)


In [3]:
# Define the gini metric - from https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703#5897
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
    
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

# Create an XGBoost-compatible metric from Gini

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return 'gini', gini_score

def gini_lgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return 'gini', 1-gini_score, False

In [4]:
# We drop these variables as we don't want to train on them
# The other 57 columns are all numerical and can be trained on without preprocessing

train['countNAs'] = train.isin(['-1']).sum(axis=1)
test['countNAs'] = test.isin(['-1']).sum(axis=1)

train_cont = train.drop(['id','ps_ind_02_cat','ps_ind_04_cat','ps_ind_05_cat','ps_car_01_cat',
                    'ps_car_02_cat','ps_car_03_cat','ps_car_04_cat','ps_car_05_cat',
                    'ps_car_06_cat','ps_car_07_cat','ps_car_08_cat','ps_car_09_cat',
                    'ps_car_10_cat','ps_car_11_cat','target'], axis=1)
test_cont = test.drop(['id','ps_ind_02_cat','ps_ind_04_cat','ps_ind_05_cat','ps_car_01_cat',
                    'ps_car_02_cat','ps_car_03_cat','ps_car_04_cat','ps_car_05_cat',
                    'ps_car_06_cat','ps_car_07_cat','ps_car_08_cat','ps_car_09_cat',
                    'ps_car_10_cat','ps_car_11_cat'], axis=1)

# One-hot encoding
one_hot = OneHotEncoder()
train_cat = train[['ps_ind_02_cat','ps_ind_04_cat','ps_ind_05_cat','ps_car_01_cat',
                    'ps_car_02_cat','ps_car_03_cat','ps_car_04_cat','ps_car_05_cat',
                    'ps_car_06_cat','ps_car_07_cat','ps_car_08_cat','ps_car_09_cat',
                    'ps_car_10_cat','ps_car_11_cat']]
test_cat = test[['ps_ind_02_cat','ps_ind_04_cat','ps_ind_05_cat','ps_car_01_cat',
                    'ps_car_02_cat','ps_car_03_cat','ps_car_04_cat','ps_car_05_cat',
                    'ps_car_06_cat','ps_car_07_cat','ps_car_08_cat','ps_car_09_cat',
                    'ps_car_10_cat','ps_car_11_cat']]

train_cat = train_cat.replace(-1, 999)
test_cat = test_cat.replace(-1, 999)

train_cat = one_hot.fit_transform(train_cat).toarray()
test_cat = one_hot.fit_transform(test_cat).toarray()

X = np.concatenate((train_cat, train_cont.values), axis=1)
test = np.concatenate((test_cat, test_cont.values), axis=1)
standardise = StandardScaler()
standardise = standardise.fit(X)
X = standardise.transform(X)
test = standardise.transform(test)
# print('Train shape:', train.shape)
# print('Test shape:', test.shape)

In [5]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

#### RankBoost 

In [6]:
classifier = BipartiteRankBoost(n_estimators=100, verbose=2)

In [None]:
fitted = classifier.fit(X_train, y_train)

building stump 1 out of 100
building stump 2 out of 100
building stump 3 out of 100
building stump 4 out of 100
building stump 5 out of 100
building stump 6 out of 100
building stump 7 out of 100
building stump 8 out of 100
building stump 9 out of 100
building stump 10 out of 100
building stump 11 out of 100
building stump 12 out of 100
building stump 13 out of 100
building stump 14 out of 100
building stump 15 out of 100
building stump 16 out of 100
building stump 17 out of 100
building stump 18 out of 100
building stump 19 out of 100
building stump 20 out of 100
building stump 21 out of 100
building stump 22 out of 100
building stump 23 out of 100
building stump 24 out of 100
building stump 25 out of 100


In [22]:
pred = classifier.predict_proba(X_val)

In [24]:
gini_normalized(y_val, pred)

0.26904366210649211

In [None]:
classifier

In [110]:
# Create a submission file
sub.to_csv('./submit/xgb_v7_missing_vals_specified.csv.gz', 
           index=False, compression='gzip')

In [109]:
val_scores = [0.279973, 0.283588, 0.282138, 0.291781, 0.277837]
np.mean(val_scores)

0.28306339999999997