In [17]:
model_name = "submission"

In [18]:
import numpy as np
import pandas as pd
import gc
from time import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

In [19]:
timesheet = [time()]
def timer(statement):
    global timesheet
    timesheet.append(time())
    print statement+" :", (timesheet[-1]-timesheet[-2]),"seconds"

In [20]:
def load_meta(directory, prefix, modeltype):
    m_tr = pd.read_csv(directory+prefix+"_train.csv")
    m_te = pd.read_csv(directory+prefix+"_test.csv")
    data = pd.concat([m_tr, m_te], axis=0).reset_index(drop=True)
    data.columns = ["{}_{}_{}".format(c, prefix, modeltype) if c!="SK_ID_CURR" else c for c in data.columns]
    return data

def join_features(data, features):
    for item in features:
        data = data.merge(item, how = "left", on = "SK_ID_CURR")
    return data

In [21]:
meta_features = [
    load_meta("final_stacker/csv/", "with_top0_cb", "tree_stacks"),
    load_meta("final_stacker/csv/", "with_top3_cb", "tree_stacks"),
    load_meta("final_stacker/csv/", "with_top20", "tree_stacks"),
    load_meta("final_stacker/csv/", "with_top20_cb", "tree_stacks"),
    load_meta("final_stacker/csv/", "with_top20_xgb", "tree_stacks"),
    load_meta("final_stacker/csv/", "with_top20_goss", "tree_stacks"),
    load_meta("final_stacker/csv/", "with_top50", "tree_stacks"),
    load_meta("final_stacker/csv/", "with_top50_cb", "tree_stacks"),
    load_meta("final_stacker/csv/", "with_top100", "tree_stacks"),
    load_meta("final_stacker/csv/", "with_top50_cb", "tree_stacks"),
    load_meta("final_stacker/csv/", "with_top500", "tree_stacks"),
    load_meta("final_stacker/csv/", "with_top500_cb", "tree_stacks"),
]

# Loading Data

In [22]:
train = pd.read_csv("data/application_train.csv", usecols = ["SK_ID_CURR","TARGET"])
test = pd.read_csv("data/application_test.csv", usecols = ["SK_ID_CURR"])

data = pd.concat([train, test], axis=0).reset_index(drop=True)
data = join_features(data, meta_features)

train = data.loc[data.TARGET.notnull()].reset_index(drop=True)
test = data.loc[data.TARGET.isnull()].reset_index(drop=True)

train_id = train[["SK_ID_CURR"]]
test_id = test[["SK_ID_CURR"]]
test_id_rank = test[["SK_ID_CURR"]]
target =train.TARGET

train.drop(["SK_ID_CURR", "TARGET"], axis=1, inplace=True)
test.drop(["SK_ID_CURR","TARGET"], axis=1, inplace = True)

# Defining Model

In [23]:
from sklearn.linear_model import LogisticRegression
def model_lr(x_train, x_test, y_train, y_test, test, meta_train, meta_test,train_index, test_index,fold_id):
    clf = LogisticRegression(class_weight = {1: np.mean(target), 0: (1-np.mean(target))}, random_state = 123, n_jobs = 16)
    clf.fit(x_train, y_train)
    
    meta_train[test_index] = clf.predict_proba(x_test)[:,1]
    meta_test.append(clf.predict_proba(test)[:,1])
    
    global fold_roc
    auc_score = roc_auc_score(y_test, meta_train[test_index])
    fold_roc.append(auc_score)
 
    print auc_score

# Training Model

In [24]:
meta_train = np.zeros(train.shape[0])
meta_test = []
fold_roc = []
overall_roc = []

kf = StratifiedKFold(n_splits= 1000, shuffle=True, random_state=0)
for fold_id, (train_index, test_index) in enumerate(kf.split(train, target)):
    x_train, x_test = train.iloc[train_index], train.iloc[test_index]
    y_train, y_test = target[train_index], target[test_index]

    model_lr(x_train, x_test, y_train, y_test, test, meta_train, meta_test,train_index, test_index,fold_id)

test_id["TARGET"] = np.array(meta_test).T.mean(axis=1)
test_id_rank["TARGET"] = pd.DataFrame(np.array(meta_test).T).rank(pct = True).mean(axis=1)
train_id["TARGET"] = meta_train
overall_roc+=fold_roc

print "Overall ROC: {},  Mean ROC: {}, STD AUC: {}".format(roc_auc_score(target, meta_train), np.mean(fold_roc), np.std(fold_roc))

  " = {}.".format(self.n_jobs))


0.875053003533569
0.7567491166077739
0.7577385159010601
0.8084805653710248
0.7120848056537104
0.8689752650176679
0.8267137809187279
0.8204946996466431
0.6996466431095407
0.8630388692579506
0.7489752650176679
0.791095406360424
0.8224734982332156
0.688339222614841
0.8031095406360425
0.7546289752650176
0.8128621908127208
0.8096113074204947
0.8093286219081273
0.8301060070671378
0.8014134275618374
0.7878445229681978
0.6828268551236749
0.8070671378091874
0.7687632508833923
0.8056537102473499
0.8289752650176678
0.8097526501766784
0.7809187279151943
0.7400706713780919
0.8768904593639576
0.774416961130742
0.8134275618374558
0.7700353356890459
0.8349116607773851
0.8569611307420495
0.8582332155477033
0.7937809187279152
0.7892579505300354
0.7857243816254417
0.8435335689045936
0.7713074204946997
0.8579505300353357
0.7990106007067138
0.7560424028268551
0.8470671378091874
0.7591519434628975
0.7772438162544169
0.8414134275618375
0.8565371024734982
0.8005653710247349
0.8409893992932863
0.86120141342756

0.8171024734982332
0.8234628975265018
0.7906713780918728
0.8231802120141343
0.7713074204946997
0.7621201413427563
0.7749823321554771
0.8713780918727916
0.7953356890459363
0.7949116607773852
0.8104593639575971
0.7721554770318021
0.8856537102473498
0.8288339222614841
0.8122968197879858
0.8344876325088338
0.7534982332155478
0.8884805653710246
0.8602120141342756
0.8612014134275618
0.8301060070671379
0.7578798586572438
0.7469964664310954
0.8610600706713781
0.7893992932862192
0.8487632508833922
0.7006360424028268
0.8798586572438163
0.8185159010600707
0.7826148409893994
0.7850176678445229
0.7959010600706713
0.8481978798586571
0.8214840989399294
0.766643109540636
0.8370318021201413
0.8251590106007067
0.7867137809187279
0.7934982332155477
0.7821908127208481
0.8190812720848056
0.7502473498233215
0.7363957597173145
0.8732155477031802
0.8025441696113074
0.8025441696113074
0.818374558303887
0.7351236749116608
0.7874204946996466
0.8142756183745583
0.817809187279152
0.8429681978798587
0.8380212014134

0.7544326241134751
0.8257978723404255
0.836436170212766
0.7833924349881797
0.8460401891252955
0.8482565011820331
0.7696513002364066
0.7675827423167849
0.8865248226950354
0.8104314420803782
0.799645390070922
0.8203309692671396
0.8274231678486997
0.8339243498817966
0.7485224586288417
0.7591607565011821
0.765661938534279
0.7934397163120568
0.7940307328605202
0.8450059101654845
0.75177304964539
0.821660756501182
0.7801418439716312
0.8484042553191489
0.8036347517730497
0.7749704491725768
0.8158983451536643
0.8645094562647754
0.833628841607565
0.8172281323877069
0.8371749408983451
0.7966903073286054
0.7551713947990544
0.8294917257683215
0.8269799054373522
0.8602245862884161
0.732565011820331
0.8718971631205674
0.821660756501182
0.8603723404255319
0.7455673758865248
0.8192966903073285
0.8287529550827424
0.8093971631205673
0.7770390070921985
0.8763297872340425
0.8617021276595744
0.8240248226950355
0.8485520094562649
0.7805851063829787
0.7780732860520094
0.8000886524822695
0.8453014184397163
0.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Overall ROC: 0.806041099808,  Mean ROC: 0.806072557452, STD AUC: 0.0450201346126


In [25]:
train_id.to_csv("submission/{}_train1.csv".format(model_name), index=False)
test_id.to_csv("submission/{}_test1.csv".format(model_name), index=False)
test_id_rank.to_csv("submission/{}_rank_test1.csv".format(model_name), index=False)

<matplotlib.axes._subplots.AxesSubplot at 0x7f4ee42854d0>

In [26]:
meta_train = np.zeros(train.shape[0])
meta_test = []
fold_roc = []

kf = StratifiedKFold(n_splits= 1000, shuffle=True, random_state=564556)
for fold_id, (train_index, test_index) in enumerate(kf.split(train, target)):
    x_train, x_test = train.iloc[train_index], train.iloc[test_index]
    y_train, y_test = target[train_index], target[test_index]

    model_lr(x_train, x_test, y_train, y_test, test, meta_train, meta_test,train_index, test_index,fold_id)

test_id["TARGET"] += np.array(meta_test).T.mean(axis=1)
test_id_rank["TARGET"] += pd.DataFrame(np.array(meta_test).T).rank(pct = True).mean(axis=1)
train_id["TARGET"] += meta_train
overall_roc+=fold_roc

print "Overall ROC: {},  Mean ROC: {}, STD AUC: {}".format(roc_auc_score(target, meta_train), np.mean(fold_roc), np.std(fold_roc))

0.7513780918727916
0.754487632508834
0.7451590106007068
0.7877031802120141
0.8303886925795053
0.8040989399293287
0.8175265017667844
0.8363250883392226
0.7696113074204947
0.8315194346289753
0.8163957597173145
0.7840282685512368
0.7708833922261483
0.8313780918727915
0.8440989399293286
0.823886925795053
0.8385865724381625
0.6520141342756184
0.8309540636042403
0.7910954063604241
0.8688339222614841
0.8631802120141343
0.8537102473498233
0.7949116607773852
0.7903886925795053
0.8173851590106007
0.8448056537102474
0.7629681978798586
0.7540636042402827
0.7640989399293285
0.8407067137809188
0.7813427561837456
0.8511660777385158
0.7728621908127209
0.8572438162544169
0.8689752650176678
0.8029681978798586
0.7622614840989399
0.7827561837455831
0.9014840989399293
0.7608480565371025
0.7841696113074205
0.8653003533568905
0.8265724381625442
0.7892579505300352
0.7549116607773853
0.7649469964664312
0.8042402826855124
0.8219081272084805
0.7684805653710247
0.7650883392226149
0.7409187279151943
0.800282685512

0.8805653710247351
0.7673498233215548
0.8500353356890459
0.8090459363957598
0.7807773851590106
0.7725795053003535
0.7693286219081272
0.7216961130742049
0.8206360424028267
0.8289752650176678
0.8217667844522969
0.8096113074204947
0.7387985865724382
0.849469964664311
0.8031095406360425
0.8067844522968197
0.9051590106007067
0.7901060070671377
0.8056537102473498
0.7830388692579505
0.8958303886925795
0.8090459363957598
0.7662190812720847
0.8055123674911661
0.7519434628975266
0.8739222614840989
0.8241696113074204
0.7769611307420494
0.8959717314487633
0.8797173144876325
0.7797879858657244
0.8158303886925795
0.8387279151943463
0.831095406360424
0.7414840989399294
0.7953356890459364
0.7946289752650177
0.7269257950530036
0.8742049469964664
0.8096113074204947
0.8007067137809187
0.7666431095406361
0.815547703180212
0.7457243816254416
0.828409893992933
0.7445936395759717
0.8740636042402827
0.9178798586572439
0.869964664310954
0.8418374558303887
0.8378798586572438
0.8349116607773852
0.776254416961130

0.7284278959810875
0.8092494089834515
0.8374704491725768
0.7758569739952719
0.8566784869976359
0.7801418439716311
0.7842789598108747
0.7992021276595744
0.8992316784869976
0.8169326241134753
0.8305260047281323
0.8250591016548463
0.7879728132387708
0.8018617021276596
0.7927009456264775
0.8055555555555556
0.7836879432624113
0.8283096926713948
0.8051122931442081
0.7832446808510638
0.8535756501182034
0.8516548463356974
0.7823581560283688
0.7796985815602837
0.7944739952718676
0.7664007092198581
0.8256501182033097
0.789598108747045
0.8368794326241135
0.7222222222222222
0.8549054373522459
0.8383569739952719
0.8291962174940898
0.7890070921985816
0.846483451536643
0.8083628841607565
0.8698286052009456
0.8071808510638299
0.8108747044917257
0.811613475177305
0.7997931442080378
0.8119089834515367
0.8615543735224587
0.7770390070921986
0.8325945626477542
0.8265366430260047
0.8825354609929078
0.788563829787234
0.8491430260047281
0.8871158392434988
0.8726359338061466
0.6895685579196218
0.80644208037825

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Overall ROC: 0.806041710124,  Mean ROC: 0.805952109386, STD AUC: 0.0447570296096


In [None]:
train_id.to_csv("submission/{}_train2.csv".format(model_name), index=False)
test_id.to_csv("submission/{}_test2.csv".format(model_name), index=False)
test_id_rank.to_csv("submission/{}_rank_test2.csv".format(model_name), index=False)

<matplotlib.axes._subplots.AxesSubplot at 0x7f4ee42854d0>

In [None]:
meta_train = np.zeros(train.shape[0])
meta_test = []
fold_roc = []
kf = StratifiedKFold(n_splits= 1000, shuffle=True, random_state=34543523)
for fold_id, (train_index, test_index) in enumerate(kf.split(train, target)):
    x_train, x_test = train.iloc[train_index], train.iloc[test_index]
    y_train, y_test = target[train_index], target[test_index]

    model_lr(x_train, x_test, y_train, y_test, test, meta_train, meta_test,train_index, test_index,fold_id)

test_id["TARGET"] += np.array(meta_test).T.mean(axis=1)
test_id_rank["TARGET"] += pd.DataFrame(np.array(meta_test).T).rank(pct = True).mean(axis=1)
train_id["TARGET"] += meta_train
overall_roc+=fold_roc

print "Overall ROC: {},  Mean ROC: {}, STD AUC: {}".format(roc_auc_score(target, meta_train), np.mean(fold_roc), np.std(fold_roc))

0.8292579505300353
0.8128621908127208
0.8524381625441696
0.8511660777385159
0.7973144876325089
0.8
0.8018374558303887
0.8236042402826855
0.7496819787985867
0.8069257950530035
0.815547703180212
0.8438162544169612
0.7872791519434629
0.7486925795053003
0.7780918727915195
0.7573144876325089
0.8130035335689045
0.8021201413427561
0.8272791519434629
0.856678445229682
0.8747703180212014
0.7821908127208481
0.8587985865724381
0.8497526501766784
0.7485512367491166
0.7478445229681978
0.783886925795053
0.8260070671378092
0.8282685512367493
0.8069257950530035
0.8367491166077738
0.8070671378091874
0.7787985865724382
0.7720141342756184
0.8913074204946996
0.730600706713781
0.7570318021201413
0.8033922261484099
0.8703886925795052
0.8322261484098941
0.753922261484099
0.7775265017667845
0.7460070671378092
0.7710247349823321
0.8546996466431096
0.7754063604240283
0.8155477031802121
0.8111660777385159
0.80113074204947
0.7604240282685513
0.8141342756183746
0.8086219081272085
0.7529328621908128
0.7689045936395

0.8100353356890461
0.7992932862190812
0.7908127208480565
0.8455123674911661
0.8407067137809188
0.8192226148409895
0.8422614840989399
0.8046643109540635
0.7966077738515902
0.7143462897526502
0.7259363957597174
0.8402826855123675
0.850035335689046
0.88339222614841
0.8042402826855124
0.828409893992933
0.8137102473498233
0.8510247349823322
0.8541342756183746
0.8028268551236749
0.8196466431095406
0.8062190812720849
0.7918021201413428
0.8466431095406359
0.8384452296819787
0.8640282685512367
0.775547703180212
0.7913780918727915
0.7847349823321554
0.8026855123674912
0.8070671378091874
0.8350530035335689
0.8668551236749117
0.8022614840989399
0.8076325088339222
0.8043816254416961
0.7771024734982332
0.7949116607773852
0.7884098939929328
0.826148409893993
0.783886925795053
0.8159717314487632
0.8534275618374558
0.7635335689045937
0.7926501766784452
0.8778798586572438
0.7940636042402827
0.8018374558303887
0.8101766784452296
0.8240282685512368
0.867279151943463
0.7891166077738516
0.6282685512367492
0

In [None]:
train_id.to_csv("submission/{}_train3.csv".format(model_name), index=False)
test_id.to_csv("submission/{}_test3.csv".format(model_name), index=False)
test_id_rank.to_csv("submission/{}_rank_test3.csv".format(model_name), index=False)

In [None]:
meta_train = np.zeros(train.shape[0])
meta_test = []
fold_roc = []
kf = StratifiedKFold(n_splits= 1000, shuffle=True, random_state=56563)
for fold_id, (train_index, test_index) in enumerate(kf.split(train, target)):
    x_train, x_test = train.iloc[train_index], train.iloc[test_index]
    y_train, y_test = target[train_index], target[test_index]

    model_lr(x_train, x_test, y_train, y_test, test, meta_train, meta_test,train_index, test_index,fold_id)

test_id["TARGET"] += np.array(meta_test).T.mean(axis=1)
test_id_rank["TARGET"] += pd.DataFrame(np.array(meta_test).T).rank(pct = True).mean(axis=1)
train_id["TARGET"] += meta_train
overall_roc+=fold_roc

print "Overall ROC: {},  Mean ROC: {}, STD AUC: {}".format(roc_auc_score(target, meta_train), np.mean(fold_roc), np.std(fold_roc))

In [None]:
train_id.to_csv("submission/{}_train4.csv".format(model_name), index=False)
test_id.to_csv("submission/{}_test4.csv".format(model_name), index=False)
test_id_rank.to_csv("submission/{}_rank_test4.csv".format(model_name), index=False)

In [None]:
meta_train = np.zeros(train.shape[0])
meta_test = []
fold_roc = []
kf = StratifiedKFold(n_splits= 1000, shuffle=True, random_state=245435)
for fold_id, (train_index, test_index) in enumerate(kf.split(train, target)):
    x_train, x_test = train.iloc[train_index], train.iloc[test_index]
    y_train, y_test = target[train_index], target[test_index]

    model_lr(x_train, x_test, y_train, y_test, test, meta_train, meta_test,train_index, test_index,fold_id)

test_id["TARGET"] += np.array(meta_test).T.mean(axis=1)
test_id_rank["TARGET"] += pd.DataFrame(np.array(meta_test).T).rank(pct = True).mean(axis=1)
train_id["TARGET"] += meta_train
overall_roc+=fold_roc

print "Overall ROC: {},  Mean ROC: {}, STD AUC: {}".format(roc_auc_score(target, meta_train), np.mean(fold_roc), np.std(fold_roc))

In [None]:
test_id["TARGET"] /=5.0
test_id_rank["TARGET"] /=5.0
train_id["TARGET"] /=5.0

print "Overall ROC: {},  Mean ROC: {}, STD AUC: {}".format(roc_auc_score(target, train_id["TARGET"]), np.mean(overall_roc), np.std(overall_roc))


In [None]:
train_id.to_csv("submission/{}_train.csv".format(model_name), index=False)
test_id.to_csv("submission/{}_test.csv".format(model_name), index=False)
test_id_rank.to_csv("submission/{}_rank_test.csv".format(model_name), index=False)