In [1]:
# coding: utf-8
import warnings
import gc
import pickle
import time
import kaggle
import os


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb

from datetime import datetime as dt

from sklearn.model_selection import KFold


from sklearn.metrics import mean_squared_error
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn import metrics

In [2]:
print("script started: ", time.strftime("%b %d %Y %H:%M:%S"))


train_fname = '../input/light-gbm-data-ms/train_cat.pkl'
test_fname = '../input/light-gbm-data-ms/test_cat.pkl'

train = pd.read_pickle(train_fname)
print("TRAIN LOADED")

script started:  Mar 12 2019 20:04:54
TRAIN LOADED


In [3]:
target = pd.read_pickle('../input/light-gbm-data-ms/target.pkl')


true_numerical_columns = [
    'Census_ProcessorCoreCount',
    'Census_PrimaryDiskTotalCapacity',
    'Census_SystemVolumeTotalCapacity',
    'Census_TotalPhysicalRAM',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches',
    'Census_InternalPrimaryDisplayResolutionHorizontal',
    'Census_InternalPrimaryDisplayResolutionVertical',
    'Census_InternalBatteryNumberOfCharges'
]

binary_variables = [c for c in train.columns if train[c].nunique() == 2]

categorical_columns = [c for c in train.columns if c not in true_numerical_columns]


#max_iter = 3

gc.collect()

print("TRAIN PREPARED")


test = pd.read_pickle(test_fname)
print("TEST LOADED")

TRAIN PREPARED
TEST LOADED


In [4]:
experinment_nr = 5

In [5]:
gc.collect()

0

In [6]:
param = {
        'num_threads': 27,
        'num_leaves': 60,
        'min_data_in_leaf': 60,
        "boosting": "gbdt",
        'objective': 'binary',
        "metric": 'auc',
        'max_depth': -1,
        'learning_rate': 0.2,
        "feature_fraction": 0.8,
        "bagging_freq": 1,
        "bagging_fraction": 0.8,
        "bagging_seed": 11,
        "lambda_l1": 0.1,
        "random_state": 133,
        "verbosity": -1
}

max_iter = 3
folds_nr = 3

task_name = '27 cores benchmark'

folds = KFold(n_splits=folds_nr, shuffle=True, random_state=15)
oof = np.zeros(len(train))
categorical_columns = [
    c for c in categorical_columns if c not in ['MachineIdentifier']]

features = [c for c in train.columns if c not in ['MachineIdentifier']]

print("task {} started: {}".format(task_name, time.strftime("%b %d %Y %H:%M:%S")))
predictions = np.zeros(len(test))
feature_importance_df = pd.DataFrame()
score = [0 for _ in range(folds.n_splits)]

print("STARTING K-FOLD CV")

task 27 cores benchmark started: Mar 12 2019 20:05:08
STARTING K-FOLD CV


In [7]:
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print("task {} starting fold nr {} at: {}".format(task_name, fold_, time.strftime("%b %d %Y %H:%M:%S")))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features],
                           label=target.iloc[trn_idx],
                           categorical_feature=categorical_columns
                           )
    val_data = lgb.Dataset(train.iloc[val_idx][features],
                           label=target.iloc[val_idx],
                           categorical_feature=categorical_columns
                           )

    num_round = 5200
    clf = lgb.train(param,
                    trn_data,
                    num_round,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=100,
                    early_stopping_rounds=200)

    oof[val_idx] = clf.predict(
        train.iloc[val_idx][features], num_iteration=clf.best_iteration)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance(
        importance_type='gain')
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat(
        [feature_importance_df, fold_importance_df], axis=0)

    # we perform predictions by chunks
    initial_idx = 0
    chunk_size = 1000000
    current_pred = np.zeros(len(test))
    while initial_idx < test.shape[0]:
        final_idx = min(initial_idx + chunk_size, test.shape[0])
        idx = range(initial_idx, final_idx)
        current_pred[idx] = clf.predict(
            test.iloc[idx][features], num_iteration=clf.best_iteration)
        initial_idx = final_idx
    predictions += current_pred / min(folds.n_splits, max_iter)

    score[fold_] = metrics.roc_auc_score(target.iloc[val_idx], oof[val_idx])
    print("task {} finished fold nr {} at: {}".format(task_name, fold_, time.strftime("%b %d %Y %H:%M:%S")))

    if fold_ == max_iter - 1:
        break

print("task {} finished 3 FOLDS: {}".format(task_name, time.strftime("%b %d %Y %H:%M:%S")))        
if (folds.n_splits == max_iter):
    cv_score = metrics.roc_auc_score(target, oof)
else:
cv_score = sum(score) / max_iter

IndentationError: expected an indented block (<ipython-input-7-441a91acc536>, line 53)

In [8]:
cv_score_printable = "{:<8.5f}".format(cv_score)
print("CV score: {}".format(cv_score_printable))

cv_score_printable = cv_score_printable.replace(".", "")
cv_score_printable = cv_score_printable.strip()


# Feature importance
cols = (feature_importance_df[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:1000].index)

best_features = feature_importance_df.loc[
    feature_importance_df.feature.isin(cols)]

plt.figure(figsize=(14, 25))
sns.barplot(x="importance",
            y="feature",
            data=best_features.sort_values(by="importance",
                                           ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.savefig(
    'e{}_lgbm_importances_{}.png'.format(experinment_nr, cv_score_printable))
feature_importance_df.to_csv(
    'e{}_lgbm_importances_{}.csv'.format(experinment_nr, cv_score_printable))


# submit predictions

sub_df = pd.read_csv('../input/microsoft-malware-prediction/sample_submission.csv')
sub_df["HasDetections"] = predictions

model_dir = '../output'


model_name = 'submit_e{}_cv{}_{}.csv.gz'.format(
    experinment_nr, cv_score_printable, dt.now().strftime('%Y-%m-%d-%H-%M'))

fname = os.path.join(model_dir, model_name)
param_string = ', '.join(('{}: {}'.format(k, v) for k, v in param.items()))
message = 'CV: {} DATA: {} LGBM params: {}'.format(
    cv_score_printable, train_fname, param_string)
competition = 'microsoft-malware-prediction'

sub_df.to_csv(fname, compression='gzip', index=False)
#kaggle.api.competition_submit(os.path.abspath(fname), message, competition)
print("task {} finished: {}".format(task_name, time.strftime("%b %d %Y %H:%M:%S")))


print("script finished: ", time.strftime("%b %d %Y %H:%M:%S"))

NameError: name 'cv_score' is not defined