In [10]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.preprocessing import LabelEncoder, StandardScaler 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from lightgbm import LGBMClassifier
from category_encoders import OneHotEncoder
from sklearn.model_selection import cross_val_predict
from warnings import filterwarnings
from sklearn.impute import SimpleImputer
filterwarnings('ignore')
import os

In [3]:
import sys

# Put the project's source directory at the front of the module search path
# so the `import config` below can pick up a module stored there.
SRC_DIR = '../src/'
sys.path.insert(0, SRC_DIR)

import config

In [4]:
# Switch between the locally configured raw-data directory and the relative
# "../input/" directory used when the notebook is not run locally.
IS_LOCAL = True
PATH = config.RAW_DATA if IS_LOCAL else "../input/"

# Show which files are available in the selected data directory.
os.listdir(PATH)

['.DS_Store',
 'test.csv',
 '~$Data_Dictionary.xlsx',
 'train.csv',
 'sample_submission.csv']

In [5]:
# Load the training and test tables from the data directory.
# os.path.join is used instead of string concatenation so the file names are
# resolved correctly whether or not PATH ends with a path separator.
train = pd.read_csv(os.path.join(PATH, "train.csv"))
print("train shape", train.shape)
test = pd.read_csv(os.path.join(PATH, "test.csv"))
print("test shape", test.shape)

train shape (400665, 36)
test shape (399335, 35)


In [6]:
target_column = "target"
id_column = "id"

# Feature lists are derived from the test frame. The identifier (and the
# target, should it ever be present) is dropped up front so it can never end
# up in either feature list; errors="ignore" tolerates a missing column.
feature_frame = test.drop(columns=[target_column, id_column], errors="ignore")

# select_dtypes replaces the former comparisons against np.object / np.float /
# np.int: those aliases were removed from NumPy (1.24+), and "number" also
# matches every integer/float width rather than only the platform defaults.
# Column order of the original frame is preserved.
categorical_cols = feature_frame.select_dtypes(include="object").columns.tolist()
numerical_cols = feature_frame.select_dtypes(include="number").columns.tolist()
print("Number of features", len(categorical_cols)+len(numerical_cols))


Number of features 34


In [7]:
# Preprocessing: standardise the numerical features and one-hot encode the
# categorical ones. Only the listed columns are passed on; no `remainder` is
# configured, so this relies on ColumnTransformer's default handling of the
# remaining columns (e.g. id / target in the training frame).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols),
    ],
)

# Gradient-boosted model with library defaults; n_jobs=-1 as in LightGBM's
# scikit-learn API.
gbm_model = LGBMClassifier(n_jobs=-1)

# Full estimator: preprocessing followed by the model.
classifier = make_pipeline(preprocessor, gbm_model)

In [8]:
%%time
oof_pred = cross_val_predict(classifier, 
                             train, 
                             train[target_column], 
                             cv=5,
                             method="predict_proba")


CPU times: user 1min 54s, sys: 13.9 s, total: 2min 8s
Wall time: 44.2 s


In [11]:
# Score the out-of-fold predictions; column 1 is taken as the probability of
# the positive class (assumes a binary 0/1 target -- TODO confirm).
cv_logloss = log_loss(train[target_column], oof_pred[:, 1])
print("Cross validation logloss {:.4f}".format(cv_logloss))

Cross validation logloss 0.0326


In [12]:
# Load the submission template; its target column is overwritten with model
# predictions later. os.path.join keeps the path valid whether or not PATH
# ends with a path separator.
sub = pd.read_csv(os.path.join(PATH, "sample_submission.csv"))
sub.head()

Unnamed: 0,id,target
0,2,0
1,4,0
2,6,0
3,7,0
4,9,0


In [13]:
%%time
classifier.fit(train, train[target_column])
test_preds = classifier.predict_proba(test)[:,1]
sub[target_column] = test_preds
sub.to_csv(config.RESULTS+"public_submission.csv", index=False)

CPU times: user 31.7 s, sys: 3.83 s, total: 35.5 s
Wall time: 14.6 s
