In [1]:
import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import lightgbm as lgb
import xgboost as xgb
import gc
import warnings
warnings.filterwarnings("ignore")

In [2]:
schema_train_4 = {
    'IsBeta': np.int8,
    'RtpStateBitfield': np.int8,
    'IsSxsPassiveMode': np.int8,
    'DefaultBrowsersIdentifier': np.int16,
    'AVProductStatesIdentifier': np.int32,
    'AVProductsInstalled': np.int8,
    'AVProductsEnabled': np.int8,
    'HasTpm': np.int8,
    'CountryIdentifier': np.int16,
    'CityIdentifier': np.int32,
    'OrganizationIdentifier': np.int8,
    'GeoNameIdentifier': np.int16,
    'LocaleEnglishNameIdentifier': np.int16,
    'OsBuild': np.int16,
    'OsSuite': np.int16,
    'IsProtected': np.int8,
    'AutoSampleOptIn': np.int8,
    'SMode': np.int8,
    'IeVerIdentifier': np.int16,
    'Firewall': np.int8,
    'UacLuaenable': np.int32,
    'Census_OEMNameIdentifier': np.int16,
    'Census_OEMModelIdentifier': np.int32,
    'Census_ProcessorCoreCount': np.int16,
    'Census_ProcessorManufacturerIdentifier': np.int8,
    'Census_ProcessorModelIdentifier': np.int16,
    'Census_PrimaryDiskTotalCapacity': np.int32,
    'Census_SystemVolumeTotalCapacity': np.int32,
    'Census_HasOpticalDiskDrive': np.int8,
    'Census_TotalPhysicalRAM': np.int32,
    'Census_InternalPrimaryDiagonalDisplaySizeInInches': np.float16,
    'Census_InternalPrimaryDisplayResolutionHorizontal': np.int16,
    'Census_InternalPrimaryDisplayResolutionVertical': np.int16,
    'Census_InternalBatteryNumberOfCharges': np.int32,
    'Census_OSBuildNumber': np.int16,
    'Census_OSBuildRevision': np.int32,
    'Census_OSInstallLanguageIdentifier': np.int8,
    'Census_OSUILocaleIdentifier': np.int16,
    'Census_IsPortableOperatingSystem': np.int8,
    'Census_IsFlightingInternal': np.int8,
    'Census_IsFlightsDisabled': np.int8,
    'Census_ThresholdOptIn': np.int8,
    'Census_FirmwareManufacturerIdentifier': np.int16,
    'Census_FirmwareVersionIdentifier': np.int32,
    'Census_IsSecureBootEnabled': np.int8,
    'Census_IsWIMBootEnabled': np.int8,
    'Census_IsVirtualDevice': np.int8,
    'Census_IsTouchEnabled': np.int8,
    'Census_IsPenCapable': np.int8,
    'Census_IsAlwaysOnAlwaysConnectedCapable': np.int8,
    'Wdft_IsGamer': np.int8,
    'Wdft_RegionIdentifier': np.int8,
    'HasDetections': np.int8,
    'Census_InternalBatteryType_informed': np.int8,
    'ProductName_index': np.int8,
    'Platform_index': np.int8,
    'Processor_index': np.int8,
    'OsPlatformSubRelease_index': np.int8,
    'OsBuildLab_index': np.int16,
    'SkuEdition_index': np.int8,
    'PuaMode_index': np.int8,
    'SmartScreen_index': np.int8,
    'Census_MDC2FormFactor_index': np.int8,
    'Census_DeviceFamily_index': np.int8,
    'Census_ProcessorClass_index': np.int8,
    'Census_PrimaryDiskTypeName_index': np.int8,
    'Census_ChassisTypeName_index': np.int8,
    'Census_PowerPlatformRoleName_index': np.int8,
    'Census_InternalBatteryType_index': np.int8,
    'Census_OSArchitecture_index': np.int8,
    'Census_OSBranch_index': np.int8,
    'Census_OSEdition_index': np.int8,
    'Census_OSSkuName_index': np.int8,
    'Census_OSInstallTypeName_index': np.int8,
    'Census_OSWUAutoUpdateOptionsName_index': np.int8,
    'Census_GenuineStateName_index': np.int8,
    'Census_ActivationChannel_index': np.int8,
    'Census_FlightRing_index': np.int8,
    'Census_OSVersion_index': np.int16,
    'EngineVersion_index': np.int8,
    'AppVersion_index': np.int8,
    'AvSigVersion_index': np.int16,
    'OsVer_index': np.int8,
    'Census_OSVersion_0_index': np.int8,
    'Census_OSVersion_1_index': np.int16,
    'EngineVersion_0_index': np.int8,
    'EngineVersion_1_index': np.int8,
    'AppVersion_0_index': np.int8,
    'AppVersion_1_index': np.int8,
    'AvSigVersion_0_index': np.int8,
    'AvSigVersion_1_index': np.int16,
    'OsVer_0_index': np.int8,
    'OsVer_1_index': np.int8,
    'OsBuildLab_diff': np.int16,
    'std_diff_DateOsBuildLab': np.float32,
    'AvSigVersion_diff': np.int16,
    'std_diff_AvSigVersion': np.float32,
    'OSVersion_diff': np.int16,
    'std_diff_OSVersion': np.float32,
    'max_OsBuildLab_diff': np.int16,
    'max_AvSigVersion_diff': np.int16,
    'max_OSVersion_diff': np.int16,
    'ratio_OsBuildLab_diff': np.float32,
    'ratio_AvSigVersion_diff': np.float32,
    'ratio_OSVersion_diff': np.float32,
    'prediction_2': np.int8,
    'prediction_4': np.int8,
    'prediction_8': np.int8,
    'prediction_16': np.int8,
    'prediction_32': np.int8,
    'prediction_64': np.int8,
    'count(DISTINCT AvSigVersion_Name)': np.int16,
    'count(DISTINCT AvSigVersion_Type)': np.int8,
    'count(DISTINCT AvSigVersion_AlertLevel)': np.int8,
    'count1': np.int32,
    'count2': np.int32,
    'count3': np.int32,
    'count4': np.int32,
    'count5': np.int16,
    'count6': np.int32,
    'count7': np.int32,
    'count8': np.int16
}

In [3]:
path = '../../data/train_final_4'
allFiles = glob.glob(path + "/*.csv")
list_ = []
for file_ in allFiles:
    df = pd.read_csv(file_)
    df = (df.fillna(-1)).astype(schema_train_4)
    list_.append(df)

train = pd.concat(list_, axis = 0, ignore_index = True)


In [4]:
drop_version = ['AvSigVersion_index', 'EngineVersion_index', 'Census_OSVersion_index', 'AppVersion_index']

sel_cols = [c for c in train.columns if c not in ['MachineIdentifier',
                                                      'HasDetections',
                                                      'Census_DeviceFamily_Windows.Server',
                                                      'Census_DeviceFamily_Windows.Desktop'
                                                     ]+drop_version]

train = train.sample(frac=0.1)
X_train = train.loc[:, sel_cols]
y_train = train.loc[:,'HasDetections']
del train
del list_
gc.collect()

563

In [5]:
X_train.shape

(892148, 117)

In [38]:
params = {
    'objective' :['binary'],
    'learning_rate' : [0.1,0.05,0.01,0.2,0.5],
    'num_leaves' : [50,75,100,125],
    'feature_fraction': [0.2, 0.3], 
    'bagging_fraction': [0.8,1,0.5,0.1,0.7], 
    'bagging_freq':[1,5,10,15,20,30],
    'boosting_type' : ['gbdt'],
    'metric': ['auc'],
    'max_depth': [20,30,50,19,5,9],
    'min_data_in_leaf': [30,10,20],
    'max_delta_step': [0,2],
    'lambda_l1': [0.2, 0,0.5],
    'lambda_l2': [0, 0.2,0.5]
}


In [7]:
kFolds = 5

In [39]:
lgb_model = lgb.LGBMClassifier(n_jobs=-1, seed=42)

In [55]:
random_search = RandomizedSearchCV(lgb_model, param_distributions=params, cv=kFolds,
                                  verbose=2, scoring='roc_auc', n_jobs=1, random_state=42,
                                  n_iter=25)

In [56]:
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] objective=binary, num_leaves=100, min_data_in_leaf=30, metric=auc, max_depth=9, max_delta_step=2, learning_rate=0.05, lambda_l2=0.2, lambda_l1=0.5, feature_fraction=0.2, boosting_type=gbdt, bagging_freq=15, bagging_fraction=1 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  objective=binary, num_leaves=100, min_data_in_leaf=30, metric=auc, max_depth=9, max_delta_step=2, learning_rate=0.05, lambda_l2=0.2, lambda_l1=0.5, feature_fraction=0.2, boosting_type=gbdt, bagging_freq=15, bagging_fraction=1, total=   8.5s
[CV] objective=binary, num_leaves=100, min_data_in_leaf=30, metric=auc, max_depth=9, max_delta_step=2, learning_rate=0.05, lambda_l2=0.2, lambda_l1=0.5, feature_fraction=0.2, boosting_type=gbdt, bagging_freq=15, bagging_fraction=1 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.8s remaining:    0.0s


[CV]  objective=binary, num_leaves=100, min_data_in_leaf=30, metric=auc, max_depth=9, max_delta_step=2, learning_rate=0.05, lambda_l2=0.2, lambda_l1=0.5, feature_fraction=0.2, boosting_type=gbdt, bagging_freq=15, bagging_fraction=1, total=   7.6s
[CV] objective=binary, num_leaves=100, min_data_in_leaf=30, metric=auc, max_depth=9, max_delta_step=2, learning_rate=0.05, lambda_l2=0.2, lambda_l1=0.5, feature_fraction=0.2, boosting_type=gbdt, bagging_freq=15, bagging_fraction=1 
[CV]  objective=binary, num_leaves=100, min_data_in_leaf=30, metric=auc, max_depth=9, max_delta_step=2, learning_rate=0.05, lambda_l2=0.2, lambda_l1=0.5, feature_fraction=0.2, boosting_type=gbdt, bagging_freq=15, bagging_fraction=1, total=   7.6s
[CV] objective=binary, num_leaves=100, min_data_in_leaf=30, metric=auc, max_depth=9, max_delta_step=2, learning_rate=0.05, lambda_l2=0.2, lambda_l1=0.5, feature_fraction=0.2, boosting_type=gbdt, bagging_freq=15, bagging_fraction=1 
[CV]  objective=binary, num_leaves=100, mi

[CV]  objective=binary, num_leaves=100, min_data_in_leaf=10, metric=auc, max_depth=20, max_delta_step=2, learning_rate=0.1, lambda_l2=0.2, lambda_l1=0, feature_fraction=0.2, boosting_type=gbdt, bagging_freq=20, bagging_fraction=0.7, total=   7.4s
[CV] objective=binary, num_leaves=100, min_data_in_leaf=30, metric=auc, max_depth=5, max_delta_step=2, learning_rate=0.5, lambda_l2=0.5, lambda_l1=0.5, feature_fraction=0.3, boosting_type=gbdt, bagging_freq=5, bagging_fraction=0.1 
[CV]  objective=binary, num_leaves=100, min_data_in_leaf=30, metric=auc, max_depth=5, max_delta_step=2, learning_rate=0.5, lambda_l2=0.5, lambda_l1=0.5, feature_fraction=0.3, boosting_type=gbdt, bagging_freq=5, bagging_fraction=0.1, total=   3.8s
[CV] objective=binary, num_leaves=100, min_data_in_leaf=30, metric=auc, max_depth=5, max_delta_step=2, learning_rate=0.5, lambda_l2=0.5, lambda_l1=0.5, feature_fraction=0.3, boosting_type=gbdt, bagging_freq=5, bagging_fraction=0.1 
[CV]  objective=binary, num_leaves=100, mi

[CV]  objective=binary, num_leaves=50, min_data_in_leaf=10, metric=auc, max_depth=50, max_delta_step=2, learning_rate=0.2, lambda_l2=0, lambda_l1=0.2, feature_fraction=0.2, boosting_type=gbdt, bagging_freq=20, bagging_fraction=0.5, total=   5.6s
[CV] objective=binary, num_leaves=50, min_data_in_leaf=10, metric=auc, max_depth=50, max_delta_step=2, learning_rate=0.2, lambda_l2=0, lambda_l1=0.2, feature_fraction=0.2, boosting_type=gbdt, bagging_freq=20, bagging_fraction=0.5 
[CV]  objective=binary, num_leaves=50, min_data_in_leaf=10, metric=auc, max_depth=50, max_delta_step=2, learning_rate=0.2, lambda_l2=0, lambda_l1=0.2, feature_fraction=0.2, boosting_type=gbdt, bagging_freq=20, bagging_fraction=0.5, total=   5.5s
[CV] objective=binary, num_leaves=50, min_data_in_leaf=10, metric=auc, max_depth=50, max_delta_step=2, learning_rate=0.2, lambda_l2=0, lambda_l1=0.2, feature_fraction=0.2, boosting_type=gbdt, bagging_freq=20, bagging_fraction=0.5 
[CV]  objective=binary, num_leaves=50, min_dat

[CV]  objective=binary, num_leaves=125, min_data_in_leaf=30, metric=auc, max_depth=19, max_delta_step=0, learning_rate=0.2, lambda_l2=0.2, lambda_l1=0.5, feature_fraction=0.3, boosting_type=gbdt, bagging_freq=20, bagging_fraction=0.7, total=   9.9s
[CV] objective=binary, num_leaves=125, min_data_in_leaf=30, metric=auc, max_depth=19, max_delta_step=0, learning_rate=0.2, lambda_l2=0.2, lambda_l1=0.5, feature_fraction=0.3, boosting_type=gbdt, bagging_freq=20, bagging_fraction=0.7 
[CV]  objective=binary, num_leaves=125, min_data_in_leaf=30, metric=auc, max_depth=19, max_delta_step=0, learning_rate=0.2, lambda_l2=0.2, lambda_l1=0.5, feature_fraction=0.3, boosting_type=gbdt, bagging_freq=20, bagging_fraction=0.7, total=   9.0s
[CV] objective=binary, num_leaves=125, min_data_in_leaf=30, metric=auc, max_depth=19, max_delta_step=0, learning_rate=0.2, lambda_l2=0.2, lambda_l1=0.5, feature_fraction=0.3, boosting_type=gbdt, bagging_freq=20, bagging_fraction=0.7 
[CV]  objective=binary, num_leaves

[CV]  objective=binary, num_leaves=125, min_data_in_leaf=10, metric=auc, max_depth=50, max_delta_step=2, learning_rate=0.2, lambda_l2=0.2, lambda_l1=0, feature_fraction=0.3, boosting_type=gbdt, bagging_freq=10, bagging_fraction=0.5, total=   7.4s
[CV] objective=binary, num_leaves=125, min_data_in_leaf=10, metric=auc, max_depth=50, max_delta_step=2, learning_rate=0.2, lambda_l2=0.2, lambda_l1=0, feature_fraction=0.3, boosting_type=gbdt, bagging_freq=10, bagging_fraction=0.5 
[CV]  objective=binary, num_leaves=125, min_data_in_leaf=10, metric=auc, max_depth=50, max_delta_step=2, learning_rate=0.2, lambda_l2=0.2, lambda_l1=0, feature_fraction=0.3, boosting_type=gbdt, bagging_freq=10, bagging_fraction=0.5, total=   8.4s
[CV] objective=binary, num_leaves=125, min_data_in_leaf=10, metric=auc, max_depth=50, max_delta_step=2, learning_rate=0.05, lambda_l2=0.5, lambda_l1=0.5, feature_fraction=0.2, boosting_type=gbdt, bagging_freq=15, bagging_fraction=0.1 
[CV]  objective=binary, num_leaves=125,

[CV]  objective=binary, num_leaves=50, min_data_in_leaf=20, metric=auc, max_depth=30, max_delta_step=0, learning_rate=0.1, lambda_l2=0, lambda_l1=0.2, feature_fraction=0.2, boosting_type=gbdt, bagging_freq=30, bagging_fraction=0.8, total=   6.6s
[CV] objective=binary, num_leaves=50, min_data_in_leaf=20, metric=auc, max_depth=30, max_delta_step=0, learning_rate=0.1, lambda_l2=0, lambda_l1=0.2, feature_fraction=0.2, boosting_type=gbdt, bagging_freq=30, bagging_fraction=0.8 
[CV]  objective=binary, num_leaves=50, min_data_in_leaf=20, metric=auc, max_depth=30, max_delta_step=0, learning_rate=0.1, lambda_l2=0, lambda_l1=0.2, feature_fraction=0.2, boosting_type=gbdt, bagging_freq=30, bagging_fraction=0.8, total=   6.7s
[CV] objective=binary, num_leaves=50, min_data_in_leaf=20, metric=auc, max_depth=30, max_delta_step=0, learning_rate=0.1, lambda_l2=0, lambda_l1=0.2, feature_fraction=0.2, boosting_type=gbdt, bagging_freq=30, bagging_fraction=0.8 
[CV]  objective=binary, num_leaves=50, min_dat

[CV]  objective=binary, num_leaves=125, min_data_in_leaf=20, metric=auc, max_depth=5, max_delta_step=0, learning_rate=0.01, lambda_l2=0.5, lambda_l1=0.5, feature_fraction=0.3, boosting_type=gbdt, bagging_freq=10, bagging_fraction=0.5, total=   5.5s
[CV] objective=binary, num_leaves=75, min_data_in_leaf=20, metric=auc, max_depth=19, max_delta_step=0, learning_rate=0.05, lambda_l2=0.2, lambda_l1=0, feature_fraction=0.2, boosting_type=gbdt, bagging_freq=5, bagging_fraction=0.7 
[CV]  objective=binary, num_leaves=75, min_data_in_leaf=20, metric=auc, max_depth=19, max_delta_step=0, learning_rate=0.05, lambda_l2=0.2, lambda_l1=0, feature_fraction=0.2, boosting_type=gbdt, bagging_freq=5, bagging_fraction=0.7, total=   7.3s
[CV] objective=binary, num_leaves=75, min_data_in_leaf=20, metric=auc, max_depth=19, max_delta_step=0, learning_rate=0.05, lambda_l2=0.2, lambda_l1=0, feature_fraction=0.2, boosting_type=gbdt, bagging_freq=5, bagging_fraction=0.7 
[CV]  objective=binary, num_leaves=75, min_

[Parallel(n_jobs=1)]: Done 125 out of 125 | elapsed: 19.5min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, seed=42,
        silent=True, subsample=1.0, subsample_for_bin=200000,
        subsample_freq=0),
          fit_params=None, iid='warn', n_iter=25, n_jobs=1,
          param_distributions={'objective': ['binary'], 'learning_rate': [0.1, 0.05, 0.01, 0.2, 0.5], 'num_leaves': [50, 75, 100, 125], 'feature_fraction': [0.2, 0.3], 'bagging_fraction': [0.8, 1, 0.5, 0.1, 0.7], 'bagging_freq': [1, 5, 10, 15, 20, 30], 'boosting_type': ['gbdt'], 'metric': ['auc'], 'max_depth': [20, 30, 50, 19, 5, 9], 'min_data_in_leaf': [30, 10, 20], 'max_delta_step': [0, 2], 'lambda_l1': [0.2, 0, 0.

In [57]:
random_search.best_estimator_

LGBMClassifier(bagging_fraction=1, bagging_freq=20, boosting_type='gbdt',
        class_weight=None, colsample_bytree=1.0, feature_fraction=0.3,
        importance_type='split', lambda_l1=0.2, lambda_l2=0.2,
        learning_rate=0.2, max_delta_step=2, max_depth=50, metric='auc',
        min_child_samples=20, min_child_weight=0.001, min_data_in_leaf=20,
        min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=75,
        objective='binary', random_state=None, reg_alpha=0.0,
        reg_lambda=0.0, seed=42, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [58]:
random_search.best_score_

0.7210005208708987

In [59]:
lgb_2 = random_search.best_estimator_

In [48]:
random_search.best_estimator_.n_estimators = 500

In [60]:
lgb_2.fit(X_train, y_train)

LGBMClassifier(bagging_fraction=1, bagging_freq=20, boosting_type='gbdt',
        class_weight=None, colsample_bytree=1.0, feature_fraction=0.3,
        importance_type='split', lambda_l1=0.2, lambda_l2=0.2,
        learning_rate=0.2, max_delta_step=2, max_depth=50, metric='auc',
        min_child_samples=20, min_child_weight=0.001, min_data_in_leaf=20,
        min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=75,
        objective='binary', random_state=None, reg_alpha=0.0,
        reg_lambda=0.0, seed=42, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [61]:
lgb_2.feature_importances_

array([  0,  71,  27,  29, 346,  36,  34,   8, 271,  96,  44, 159, 189,
        60,  32,  17,   0,  39, 125,  27,  14,  81, 136,  61,   6, 131,
        97, 173,  29,  76, 130,  70,  73,  86,  48, 183, 104, 166,   0,
        12,   6,   0,  73,  68,  32,  28,  22,  12,   8,  12,  83, 158,
         0,   1,   7,  24,  33,  98,  21,   7, 110,  38,   2,   3,  22,
        41,  30,  24,  13,  57,  56,  52, 190,  54,  41, 111,  26,   3,
         0,  31,  95, 103, 161, 161,  79, 169,   1,   2,   0,   4,   4,
         8,   0,   0,  53, 184, 181,   2,   9,   3,   0,   2,  15,  36,
        73,  66,  75,  12,  13, 144,  63,  82, 148, 140, 168, 126, 135])

In [62]:
imp = pd.DataFrame({'feature': X_train.columns, 'importance': lgb_2.feature_importances_})

In [63]:
a = imp.sort_values('importance', ascending=False)

In [64]:
a

Unnamed: 0,feature,importance
4,AVProductStatesIdentifier,346
8,CountryIdentifier,271
72,Census_OSInstallTypeName_index,190
12,LocaleEnglishNameIdentifier,189
95,max_AvSigVersion_diff,184
35,Census_OSBuildRevision,183
96,max_OSVersion_diff,181
27,Census_SystemVolumeTotalCapacity,173
85,AvSigVersion_1_index,169
114,count6,168
