## Machine Learning

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
import gc




In [12]:
print('loading files...')
train = pd.read_csv('../../data/train_prepared.csv') #train_prepared  na_values=-1
test = pd.read_csv('../../data/test_prepared.csv')

print('files loaded...')

loading files...
files loaded...


In [3]:
#--- memory consumed by train dataframe ---
mem = train.memory_usage(index=True).sum()
print("Memory consumed by training set  :   {} MB" .format(mem/ 1024**2))
 
#--- memory consumed by test dataframe ---
mem = test.memory_usage(index=True).sum()
print("Memory consumed by test set      :   {} MB" .format(mem/ 1024**2))

Memory consumed by training set  :   217.9732208251953 MB
Memory consumed by test set      :   320.1474151611328 MB


In [4]:
def change_datatype(df):
    float_cols = list(df.select_dtypes(include=['int']).columns)
    for col in float_cols:
        if ((np.max(df[col]) <= 127) and(np.min(df[col] >= -128))):
            df[col] = df[col].astype(np.int8)
        elif ((np.max(df[col]) <= 32767) and(np.min(df[col] >= -32768))):
            df[col] = df[col].astype(np.int16)
        elif ((np.max(df[col]) <= 2147483647) and(np.min(df[col] >= -2147483648))):
            df[col] = df[col].astype(np.int32)
        else:
            df[col] = df[col].astype(np.int64)

change_datatype(train)
change_datatype(test) 

In [5]:
#--- Converting columns from 'float64' to 'float32' ---
def change_datatype_float(df):
    float_cols = list(df.select_dtypes(include=['float']).columns)
    for col in float_cols:
        df[col] = df[col].astype(np.float32)
        
change_datatype_float(train)
change_datatype_float(test)

In [6]:
#--- memory consumed by train dataframe ---
mem = train.memory_usage(index=True).sum()
print("Memory consumed by training set  :   {:.3f} MB" .format(mem/ 1024**2))
 
#--- memory consumed by test dataframe ---
mem = test.memory_usage(index=Ttrainrue).sum()
print("Memory consumed by test set      :   {:.3f} MB" .format(mem/ 1024**2))

Memory consumed by training set  :   54.493 MB
Memory consumed by test set      :   80.888 MB


In [None]:
#test.drop(['ps_car_03_cat', 'ps_car_05_cat'], inplace=True, axis=1)

col_to_drop = train.columns[train.columns.str.startswith('ps_calc_')]
train = train.drop(col_to_drop, axis=1)  
test = test.drop(col_to_drop, axis=1)  

for c in train.select_dtypes(include=['float64']).columns:
    train[c]=train[c].astype(np.float32)
    test[c]=test[c].astype(np.float32)
for c in train.select_dtypes(include=['int64']).columns[2:]:
    train[c]=train[c].astype(np.int8)
    test[c]=test[c].astype(np.int8)    

print(train.shape, test.shape)

In [18]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses

    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return [('gini', gini_score)]

def gini_lgb(preds, dtrain):
    y = list(dtrain.get_label())
    score = gini(y, preds) / gini(y, y)
    return 'gini', score, True

In [14]:
# Set xgboost parameters
params = {}
params['eta'] = 0.02 #0.02
params['max_depth'] = 4 # 4
params['subsample'] = 0.9
params['colsample_bytree'] = 0.9
params['silent'] = True
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'auc'
params['silent'] = True
params['seed']: 6
#'maximize':True

#'eta': 0.09, 'max_depth': 5

In [15]:
X = train.drop(['id', 'target'], axis=1)
features = X.columns
X = X.values
y = train['target'].values
sub=test['id'].to_frame()
sub['target']=0

nrounds=2000  # need to change to 2000
kfold = 5  # need to change to 5

skf = StratifiedKFold(n_splits=kfold, random_state=0)

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    
    print(' xgb kfold: {}  of  {} : '.format(i+1, kfold))
    
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    
    d_train = xgb.DMatrix(X_train, y_train) 
    d_valid = xgb.DMatrix(X_valid, y_valid) 
    
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    
    xgb_model = xgb.train(params, d_train, nrounds, watchlist, early_stopping_rounds=100, 
                          feval=gini_xgb, maximize=True, verbose_eval=100)
    
    sub['target'] += xgb_model.predict(xgb.DMatrix(test[features].values), 
                        ntree_limit=xgb_model.best_ntree_limit+50) / (2*kfold)
    
sub.to_csv('submission_6.csv', index=False, float_format='%.5f')
gc.collect()
sub.head(2)


 xgb kfold: 1  of  5 : 
[0]	train-gini:0.170386	valid-gini:0.168183
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-gini:0.243891	valid-gini:0.23553
[200]	train-gini:0.272816	valid-gini:0.253321
[300]	train-gini:0.293187	valid-gini:0.265738
[400]	train-gini:0.306891	valid-gini:0.271411
[500]	train-gini:0.317439	valid-gini:0.274709
[600]	train-gini:0.327346	valid-gini:0.276579
[700]	train-gini:0.335869	valid-gini:0.276887
[800]	train-gini:0.344163	valid-gini:0.276829
Stopping. Best iteration:
[738]	train-gini:0.339176	valid-gini:0.277165

 xgb kfold: 2  of  5 : 
[0]	train-gini:0.164558	valid-gini:0.165263
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-gini:0.242458	valid-gini:0.224634
[200]	train-gini:0.269847	valid-gini:0.250713
[300]	train-gini:0.291972	valid-gini:0.2

Unnamed: 0,id,target
0,0,0.014187
1,1,0.014751


In [16]:
xgb_model.best_score #before 0.276476

0.270166

In [19]:
import lightgbm as lgb

params = {'metric': 'auc', 
          'learning_rate' : 0.01, 
          'max_depth':10, 
          'max_bin':10,  
          'objective': 'binary', 
          'feature_fraction': 0.8,
          'bagging_fraction':0.9,
          'bagging_freq':10, 
          'min_data': 500}


skf = StratifiedKFold(n_splits=kfold, random_state=1)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' lgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_eval = X[train_index], X[test_index]
    y_train, y_eval = y[train_index], y[test_index]
    lgb_model = lgb.train(params, lgb.Dataset(X_train, label=y_train), nrounds, 
                  lgb.Dataset(X_eval, label=y_eval), verbose_eval=100, 
                  feval=gini_lgb, early_stopping_rounds=100)
    sub['target'] += lgb_model.predict(test[features].values, 
                        num_iteration=lgb_model.best_iteration) / (2*kfold)
    
sub.to_csv('sub10.csv', index=False, float_format='%.5f') 
gc.collect()
sub.head(2)

 lgb kfold: 1  of  5 : 
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.623683	valid_0's gini: 0.247365
[200]	valid_0's auc: 0.624702	valid_0's gini: 0.249404
[300]	valid_0's auc: 0.627926	valid_0's gini: 0.255851
[400]	valid_0's auc: 0.631663	valid_0's gini: 0.263327
[500]	valid_0's auc: 0.634675	valid_0's gini: 0.269349
[600]	valid_0's auc: 0.636344	valid_0's gini: 0.272689
[700]	valid_0's auc: 0.637699	valid_0's gini: 0.275397
[800]	valid_0's auc: 0.638099	valid_0's gini: 0.276199
[900]	valid_0's auc: 0.638584	valid_0's gini: 0.277169
[1000]	valid_0's auc: 0.638592	valid_0's gini: 0.277185
[1100]	valid_0's auc: 0.638722	valid_0's gini: 0.277444
[1200]	valid_0's auc: 0.638742	valid_0's gini: 0.277484
Early stopping, best iteration is:
[1136]	valid_0's auc: 0.638879	valid_0's gini: 0.277758
 lgb kfold: 2  of  5 : 
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.621713	valid_0's gini: 0.243425
[200]	valid_0's a

Unnamed: 0,id,target
0,0,0.028089
1,1,0.029611


In [23]:
lgb_model.best_score

defaultdict(dict,
            {'valid_0': {'auc': 0.63569906343042648,
              'gini': 0.27139812686085296}})

https://www.kaggle.com/rshally/porto-xgb-lgb-kfold-lb-0-282