In [None]:
#!pip install --upgrade lightgbm

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import zipfile
import category_encoders as ce
import gc
import lightgbm as lgb
import module.train_log as log
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV
from sklearn.ensemble import VotingClassifier
%matplotlib inline

In [2]:
# 불러올 데이터의 타입과 컬럼을 정의한다.
def get_column_dtypes():
    return {
        'MachineIdentifier':                                    'object',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        #'IsBeta':                                               'int8',
        #'RtpStateBitfield':                                     'float16',
        #'IsSxsPassiveMode':                                     'int8',
        #'DefaultBrowsersIdentifier':                            'float16',
        #'AVProductStatesIdentifier':                            'float32',
        #'AVProductsInstalled':                                  'float16',
        #'AVProductsEnabled':                                    'float16',
        #'HasTpm':                                               'int8',
        #'CountryIdentifier':                                    'int16',
        #'CityIdentifier':                                       'float32',
        #'OrganizationIdentifier':                               'float16',
        #'GeoNameIdentifier':                                    'float16',
        #'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        #'OsBuild':                                              'int16',
        #'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        #'IsProtected':                                          'float16',
        #'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        #'SMode':                                                'float16',
        #'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        #'Firewall':                                             'float16',
        #'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        #'Census_OEMNameIdentifier':                             'float16',
        #'Census_OEMModelIdentifier':                            'float32',
        #'Census_ProcessorCoreCount':                            'float16',
        #'Census_ProcessorManufacturerIdentifier':               'float16',
        #'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        #'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        #'Census_SystemVolumeTotalCapacity':                     'float32',
        #'Census_HasOpticalDiskDrive':                           'int8',
        #'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        #'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        #'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        #'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        #'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        #'Census_OSBuildNumber':                                 'int16',
        #'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        #'Census_OSInstallLanguageIdentifier':                   'float16',
        #'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        #'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        #'Census_IsFlightingInternal':                           'float16',
        #'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        #'Census_ThresholdOptIn':                                'float16',
        #'Census_FirmwareManufacturerIdentifier':                'float16',
        #'Census_FirmwareVersionIdentifier':                     'float32',
        #'Census_IsSecureBootEnabled':                           'int8',
        #'Census_IsWIMBootEnabled':                              'float16',
        #'Census_IsVirtualDevice':                               'float16',
        #'Census_IsTouchEnabled':                                'int8',
        #'Census_IsPenCapable':                                  'int8',
        #'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        #'Wdft_IsGamer':                                         'float16',
        #'Wdft_RegionIdentifier':                                'float16',
        #'HasDetections':                                        'int8'
        }

def get_use_columns():
     return ['MachineIdentifier'
        ,'ProductName'
        ,'EngineVersion'
        ,'AppVersion'
        ,'AvSigVersion'
        ,'IsBeta'
        ,'RtpStateBitfield'
        ,'IsSxsPassiveMode'
        ,'DefaultBrowsersIdentifier'
        ,'AVProductStatesIdentifier'
        ,'AVProductsInstalled'
        ,'AVProductsEnabled'
        ,'HasTpm'
        ,'CountryIdentifier'
        ,'CityIdentifier'
        ,'OrganizationIdentifier'
        ,'GeoNameIdentifier'
        ,'LocaleEnglishNameIdentifier'
        ,'Platform'
        ,'Processor'
        ,'OsVer'
        ,'OsBuild'
        ,'OsSuite'
        ,'OsPlatformSubRelease'
        ,'OsBuildLab'
        ,'SkuEdition'
        ,'IsProtected'
        ,'AutoSampleOptIn'
        ,'PuaMode'
        ,'SMode'
        ,'IeVerIdentifier'
        ,'SmartScreen'
        ,'Firewall'
        ,'UacLuaenable'
        ,'Census_MDC2FormFactor'
        ,'Census_DeviceFamily'
        ,'Census_OEMNameIdentifier'
        ,'Census_OEMModelIdentifier'
        ,'Census_ProcessorCoreCount'
        ,'Census_ProcessorManufacturerIdentifier'
        ,'Census_ProcessorModelIdentifier'
        ,'Census_ProcessorClass'
        ,'Census_PrimaryDiskTotalCapacity'
        ,'Census_PrimaryDiskTypeName'
        ,'Census_SystemVolumeTotalCapacity'
        ,'Census_HasOpticalDiskDrive'
        ,'Census_TotalPhysicalRAM'
        ,'Census_ChassisTypeName'
        ,'Census_InternalPrimaryDiagonalDisplaySizeInInches'
        ,'Census_InternalPrimaryDisplayResolutionHorizontal'
        ,'Census_PowerPlatformRoleName'
        ,'Census_InternalBatteryType'
        ,'Census_InternalBatteryNumberOfCharges'
        ,'Census_OSVersion'
        ,'Census_OSArchitecture'
        ,'Census_OSBranch'
        ,'Census_OSBuildRevision'
        ,'Census_OSEdition'
        ,'Census_OSSkuName'
        ,'Census_OSInstallTypeName'
        ,'Census_OSInstallLanguageIdentifier'
        ,'Census_OSWUAutoUpdateOptionsName'
        ,'Census_IsPortableOperatingSystem'
        ,'Census_GenuineStateName'
        ,'Census_ActivationChannel'
        ,'Census_IsFlightingInternal'
        ,'Census_IsFlightsDisabled'
        ,'Census_FlightRing'
        ,'Census_ThresholdOptIn'
        ,'Census_FirmwareManufacturerIdentifier'
        ,'Census_FirmwareVersionIdentifier'
        ,'Census_IsSecureBootEnabled'
        ,'Census_IsWIMBootEnabled'
        ,'Census_IsVirtualDevice'
        ,'Census_IsTouchEnabled'
        ,'Census_IsPenCapable'
        ,'Census_IsAlwaysOnAlwaysConnectedCapable'
        ,'Wdft_IsGamer'
        ,'Wdft_RegionIdentifier'
        ,'HasDetections']

In [3]:
# 데이터를 불러온다.
def load_data():
    column_dtypes = get_column_dtypes()
    use_columns = get_use_columns()
    zf = zipfile.ZipFile('../input/all.zip', 'r')
    tr_train = pd.read_csv(zf.open('train.csv'), dtype=column_dtypes, usecols= use_columns)
    X_test = pd.read_csv(zf.open('test.csv'), dtype=column_dtypes, usecols= use_columns[:-1])
    
    # 학습 데이터의 xy를 분리한다.
    y_train = tr_train.HasDetections
    X_train = tr_train.drop(['HasDetections'], axis=1)

    X_train.to_pickle("../result/X_train.pkl")
    y_train.to_pickle("../result/y_train.pkl")
    X_test.to_pickle("../result/X_test.pkl")
    return X_train, y_train, X_test


In [4]:
# 데이터를 불러온다.
def load_pickle_data(data_type:str=None):
    if data_type is not None:
        X_train_name = 'X_train_{}'.format(data_type)
        X_test_name = 'X_test_{}'.format(data_type)
    else:
        X_train_name = 'X_train'
        X_test_name = 'X_test'
    
    X_train = pd.read_pickle("../result/{}.pkl".format(X_train_name))
    y_train = pd.read_pickle("../result/y_train.pkl")
    X_test = pd.read_pickle("../result/{}.pkl".format(X_test_name))
    return X_train, y_train, X_test


def get_sample(n:int=1000):
    global X_train, y_train
    X_train = X_train.sample(n=n) 
    y_train = y_train[X_train.index] 
    return X_train, y_train

In [5]:
# Target Encoder를 실행한다.
def to_target_encoder():
    global X_train, X_test
    category_columns = list(X_train.select_dtypes(include=['category']).columns)
    enc = ce.TargetEncoder(cols=category_columns).fit(X_train, y_train)
    X_train = enc.transform(X_train)
    X_test = enc.transform(X_test)

    X_train.to_pickle("../result/X_train_target_encoded.pkl")
    X_test.to_pickle("../result/X_test_target_encoded.pkl")
    return X_train, X_test

In [6]:
# 결측치를 평균값으로 채운다.
def fill_na():
    for column in list(X_train.columns[X_train.isna().any()]):
        if  pd.api.types.is_categorical_dtype(X_train[column]):
            X_train[column] = X_train[column].cat.add_categories(['NA'])
            X_train[column].fillna('NA', inplace=True)
            X_test[column] = X_test[column].cat.add_categories(['NA'])
            X_test[column].fillna('NA', inplace=True)
        else:
            mean = X_train[column].mean()
            X_train[column].fillna(mean, inplace=True)
            X_test[column].fillna(mean, inplace=True)

    X_train.to_pickle("../result/X_train_filled_na.pkl")
    X_test.to_pickle("../result/X_test_filled_na.pkl")
    return X_train, X_test

In [7]:
# ID 저장 및 제거 
def drop_id_n_get_test_id():
    ID_test = X_test.MachineIdentifier
    X_train.drop(['MachineIdentifier'], axis=1, inplace=True)
    X_test.drop(['MachineIdentifier'], axis=1, inplace=True)
    return ID_test

In [8]:
def get_lgb_params():
    return {'num_leaves': 60,
         'min_data_in_leaf': 60, 
         'objective':'binary',
         'max_depth': -1,
         'learning_rate': 0.05,
         "boosting": "gbdt",
         "feature_fraction": 0.8,
         "bagging_freq": 1,
         "bagging_fraction": 0.8 ,
         "bagging_seed": 11,
         "metric": 'auc',
         "lambda_l1": 0.1,
         "random_state": 133,
         "verbosity": -1}

def get_grid_params():
    return {'lgb__num_leaves': [60],
         'lgb__min_data_in_leaf': [60], 
         'lgb__objective':['binary'],
         'lgb__max_depth': [-1],
         'lgb__learning_rate': [0.05],
         "lgb__boosting": ["gbdt"],
         "lgb__feature_fraction": [0.8],
         "lgb__bagging_freq": [1],
         "lgb__bagging_fraction": [0.8] ,
         "lgb__bagging_seed": [11],
         "lgb__metric": ['auc'],
         "lgb__lambda_l1": [0.1],
         "lgb__random_state": [133],
         'lgb__n_estimators': [40],
         'lgb__random_state' : [501], # Updated from 'seed'
    }

In [9]:
def train_lgb():
    N_FOLD = 5
    SEED = 42
    
    folds = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
    kf = folds.split(X=X_train, y=y_train)
    grid_params = get_grid_params()
    clf_lgb = LGBMClassifier()
    
    clfs = [
        ('lgb', clf_lgb)
    ]
    clf_eb = VotingClassifier(estimators=clfs, voting='soft')
    clf_cv = GridSearchCV(estimator=clf_eb, param_grid=grid_params, cv=kf)
    clf_cv.fit(X_train, y_train)
    
    log.save("start lgb", X_train, clfs, clf_cv)
    print(clf_cv.best_params_, clf_cv.best_score_)
    #clf_cv.save_model('../result/lgb_model.txt')    
    
    return clf_cv.predict_proba(X_test)[:, 1]

In [10]:
def train_lgb_kfold():
    MAX_BOOST_ROUNDS = 700
    EARLY_STOPPING = 40
    N_FOLD = 5
    
    lgb_params = get_lgb_params()
    folds = StratifiedKFold(n_splits=N_FOLD, shuffle=True)
    oof_preds = np.zeros(X_train.index.size)
    sub_preds = np.zeros(X_test.index.size)
    clf = None
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train, y_train)):
        train_x, train_y = X_train.iloc[train_idx], y_train.iloc[train_idx]
        valid_x, valid_y = X_train.iloc[valid_idx], y_train.iloc[valid_idx]
        train = lgb.Dataset(train_x, train_y)
        valid = lgb.Dataset(valid_x, valid_y)
        
        clf = lgb.train(lgb_params, train, 
                        num_boost_round=MAX_BOOST_ROUNDS,
                        valid_sets=[lgtrain, lgvalid],
                        valid_names=['train','valid'],
                        early_stopping_rounds=EARLY_STOPPING,
                        verbose_eval=50)
                
        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(X_test, num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[val_idx])))
    
    if cls is not None:
        clf.save_model('../result/lgb_model.txt')    
    return sub_preds

In [11]:
#X_train, y_train, X_test = load_data()
#fill_na()
X_train, y_train, X_test = load_pickle_data('filled_na')
X_train, y_train = get_sample(1000)
#to_target_encoder()
ID_test = drop_id_n_get_test_id()
pred_test = train_lgb()

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


{'lgb__bagging_fraction': 0.8, 'lgb__bagging_freq': 1, 'lgb__bagging_seed': 11, 'lgb__boosting': 'gbdt', 'lgb__feature_fraction': 0.8, 'lgb__lambda_l1': 0.1, 'lgb__learning_rate': 0.05, 'lgb__max_depth': -1, 'lgb__metric': 'auc', 'lgb__min_data_in_leaf': 60, 'lgb__n_estimators': 40, 'lgb__num_leaves': 60, 'lgb__objective': 'binary', 'lgb__random_state': 501} 0.569


In [12]:
log.get(10)

Unnamed: 0,train_name,train_models,params,score,features,features_cnt,train_sample,insert_datetime,source
0,start lgb,"[""lgb""]","{""lgb__bagging_fraction"": 0.8, ""lgb__bagging_f...",0.569,"[""ProductName"", ""EngineVersion"", ""AppVersion"",...",78.0,"{""schema"": {""fields"":[{""name"":""index"",""type"":""...",2019-01-18 09:50:38,\n# coding: utf-8\n\n# In[ ]:\n\n\n#!pip insta...
1,start lgb,"[""lgb""]","{""lgb__bagging_fraction"": 0.8, ""lgb__bagging_f...",0.573,"[""ProductName"", ""EngineVersion"", ""AppVersion"",...",,"{""schema"": {""fields"":[{""name"":""index"",""type"":""...",2019-01-14 20:56:29,\n# coding: utf-8\n\n# In[ ]:\n\n\n#!pip insta...
2,train_test,train_models,params,,,,,2019-01-09 20:41:59,source
3,train_test,train_models,params,,,,,2019-01-09 20:37:59,source
4,train_test,train_models,params,,,,,2019-01-09 20:30:35,source


In [None]:
submission = pd.DataFrame({'MachineIdentifier':ID_test, 'HasDetections':pred_test})
submission.to_csv('../result/submission.csv', index=False)

In [None]:
submission.head()