In [41]:
model_name = "l1_lr"

In [42]:
import numpy as np
import pandas as pd
import gc
from time import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

In [43]:
timesheet = [time()]
def timer(statement):
    global timesheet
    timesheet.append(time())
    print statement+" :", (timesheet[-1]-timesheet[-2]),"seconds"

In [44]:
def load_meta(directory, prefix, modeltype):
    m_tr = pd.read_csv(directory+prefix+"_train.csv")
    m_te = pd.read_csv(directory+prefix+"_test.csv")
    data = pd.concat([m_tr, m_te], axis=0).reset_index(drop=True)
    data.columns = ["{}_{}_{}".format(c, prefix, modeltype) if c!="SK_ID_CURR" else c for c in data.columns]
    return data

def join_features(data, features):
    for item in features:
        data = data.merge(item, how = "left", on = "SK_ID_CURR")
    return data

def postprocess(data):
    columns = [c for c in data.columns if c not in ["TARGET","SK_ID_CURR"]]
    for col in columns:
        data[col] = data[col].fillna(data[col].mean())
    return data

def processColNames(df):
    df.columns = [c.replace(" ","_") for c in df.columns]
    return df


## Loading Features

Loading All Features File 

In [45]:
applications = processColNames(pd.read_csv("../extractor/csv/application_features_normalized_V1.csv").dropna(axis=1, how="all")).drop(["TARGET"], axis=1)
bureau_balance_bb = processColNames(pd.read_csv("../extractor/csv/bureau_features_normalized_V1.csv").dropna(axis=1, how="all"))
credit_card_balance = processColNames(pd.read_csv("../extractor/csv/credit_card_features_normalized_V1.csv").dropna(axis=1, how="all"))
installment_features = processColNames(pd.read_csv("../extractor/csv/installment_features_normalized_V1.csv").dropna(axis=1, how="all"))
pos_cash_balance = processColNames(pd.read_csv("../extractor/csv/pos_cash_features_normalized_V1.csv").dropna(axis=1, how="all"))
previous_apps = processColNames(pd.read_csv("../extractor/csv/previous_application_features_normalized_V1.csv").dropna(axis=1, how="all"))

base_features = [applications, bureau_balance_bb, credit_card_balance, installment_features, pos_cash_balance,previous_apps ]

## Loading Meta Features

In [46]:
tree_meta_features = [
    load_meta("../base trees/csv/", "application", "trees"),
    load_meta("../base trees/csv/", "bureau_balance_bb", "trees"),
    load_meta("../base trees/csv/", "credit_card_balance", "trees"),
    load_meta("../base trees/csv/", "installment", "trees"),
    load_meta("../base trees/csv/", "pos_cash_balance", "trees"),
    load_meta("../base trees/csv/", "previous_apps", "trees"),
]

lr_meta_features = [
    load_meta("../base lr/csv/", "application", "lr"),
    load_meta("../base lr/csv/", "bureau_balance_bb", "lr"),
    load_meta("../base lr/csv/", "credit_card_balance", "lr"),
    load_meta("../base lr/csv/", "installment", "lr"),
    load_meta("../base lr/csv/", "pos_cash_balance", "lr"),
    load_meta("../base lr/csv/", "previous_apps", "lr"),
]

nb_meta_features = [
    load_meta("../base nb/csv/", "application", "nb"),
    load_meta("../base nb/csv/", "bureau_balance_bb", "nb"),
    load_meta("../base nb/csv/", "credit_card_balance", "nb"),
    load_meta("../base nb/csv/", "installment", "nb"),
    load_meta("../base nb/csv/", "pos_cash_balance", "nb"),
    load_meta("../base nb/csv/", "previous_apps", "nb"),
]

nn_meta_features = [
    load_meta("../base nn/csv/", "application", "nb"),
    load_meta("../base nn/csv/", "bureau_balance_bb", "nb"),
    load_meta("../base nn/csv/", "credit_card_balance", "nb"),
    load_meta("../base nn/csv/", "installment", "nb"),
    load_meta("../base nn/csv/", "pos_cash_balance", "nb"),
    load_meta("../base nn/csv/", "previous_apps", "nb"),
]

In [47]:
train = pd.read_csv("../data/application_train.csv", usecols = ["SK_ID_CURR","TARGET"])
test = pd.read_csv("../data/application_test.csv", usecols = ["SK_ID_CURR"])

data = pd.concat([train, test], axis=0).reset_index(drop=True)
data = join_features(data, base_features)
data = join_features(data, tree_meta_features)
data = join_features(data, lr_meta_features)
data = join_features(data, nb_meta_features)
data = join_features(data, nn_meta_features)

data = postprocess(data)
train = data.loc[data.TARGET.notnull()].reset_index(drop=True)
test = data.loc[data.TARGET.isnull()].reset_index(drop=True)

train_id = train[["SK_ID_CURR"]]
test_id = test[["SK_ID_CURR"]]
target =train.TARGET

train.drop(["SK_ID_CURR", "TARGET"], axis=1, inplace=True)
test.drop(["SK_ID_CURR","TARGET"], axis=1, inplace = True)

# Defining Model

In [48]:
from sklearn.linear_model import LogisticRegression
def model_lr(x_train, x_test, y_train, y_test, test, meta_train, meta_test,train_index, test_index,fold_id):
    clf = LogisticRegression(class_weight = {1: np.mean(target), 0: (1-np.mean(target))}, random_state = 123)
    clf.fit(x_train, y_train)
    
    meta_train[test_index] = clf.predict_proba(x_test)[:,1]
    meta_test.append(clf.predict_proba(test)[:,1])

    print roc_auc_score(y_test, meta_train[test_index])

# Training Model

In [49]:
meta_train = np.zeros(train.shape[0])
meta_test = []

kf = StratifiedKFold(n_splits= 10, shuffle=True, random_state=47)
for fold_id, (train_index, test_index) in enumerate(kf.split(train, target)):
    x_train, x_test = train.iloc[train_index].as_matrix(), train.iloc[test_index].as_matrix()
    y_train, y_test = target[train_index], target[test_index]

    model_lr(x_train, x_test, y_train, y_test, test.as_matrix(), meta_train, meta_test,train_index, test_index,fold_id)

test_id["TARGET"] = np.array(meta_test).T.mean(axis=1)
train_id["TARGET"] = meta_train
print "Overall ROC AUC SCORE: ",roc_auc_score(target,meta_train)

0.7631080850651101
0.7757690567463693
0.7679636149610197
0.7713405275224885
0.7699119016920565
0.7596135024773081
0.7791074653594747
0.7618684441663293
0.7670872848539483
0.7718775409351747


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Overall ROC AUC SCORE:  0.7687605652349535


In [50]:
train_id.to_csv("csv/{}_train.csv".format(model_name), index=False)
test_id.to_csv("csv/{}_test.csv".format(model_name), index=False)