In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import zipfile
import category_encoders as ce
%matplotlib inline

In [2]:
# Column name -> dtype mapping handed to pd.read_csv for both the train and test files.
# The narrow dtypes (int8/int16/float16/category) are presumably chosen to limit memory use.
# NOTE(review): float16 represents integers exactly only up to 2048 and int8 only covers
# -128..127 — confirm that the identifier / resolution columns typed that way stay in range.
column_dtypes = {
        'MachineIdentifier':                                    'object',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }

# Columns actually read from the CSV files, in file order. The target
# ('HasDetections') is kept as the final entry; the test file is read without it.
use_columns = [
    'MachineIdentifier',
    'ProductName',
    'EngineVersion',
    'AppVersion',
    'AvSigVersion',
    'IsBeta',
    'RtpStateBitfield',
    'IsSxsPassiveMode',
    'DefaultBrowsersIdentifier',
    'AVProductStatesIdentifier',
    'AVProductsInstalled',
    'AVProductsEnabled',
    'HasTpm',
    'CountryIdentifier',
    'CityIdentifier',
    'OrganizationIdentifier',
    'GeoNameIdentifier',
    'LocaleEnglishNameIdentifier',
    'Platform',
    'Processor',
    'OsVer',
    'OsBuild',
    'OsSuite',
    'OsPlatformSubRelease',
    'OsBuildLab',
    'SkuEdition',
    'IsProtected',
    'AutoSampleOptIn',
    'PuaMode',
    'SMode',
    'IeVerIdentifier',
    'SmartScreen',
    'Firewall',
    'UacLuaenable',
    'Census_MDC2FormFactor',
    'Census_DeviceFamily',
    'Census_OEMNameIdentifier',
    'Census_OEMModelIdentifier',
    'Census_ProcessorCoreCount',
    'Census_ProcessorManufacturerIdentifier',
    'Census_ProcessorModelIdentifier',
    'Census_ProcessorClass',
    'Census_PrimaryDiskTotalCapacity',
    'Census_PrimaryDiskTypeName',
    'Census_SystemVolumeTotalCapacity',
    'Census_HasOpticalDiskDrive',
    'Census_TotalPhysicalRAM',
    'Census_ChassisTypeName',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches',
    'Census_InternalPrimaryDisplayResolutionHorizontal',
    'Census_PowerPlatformRoleName',
    'Census_InternalBatteryType',
    'Census_InternalBatteryNumberOfCharges',
    'Census_OSVersion',
    'Census_OSArchitecture',
    'Census_OSBranch',
    'Census_OSBuildRevision',
    'Census_OSEdition',
    'Census_OSSkuName',
    'Census_OSInstallTypeName',
    'Census_OSInstallLanguageIdentifier',
    'Census_OSWUAutoUpdateOptionsName',
    'Census_IsPortableOperatingSystem',
    'Census_GenuineStateName',
    'Census_ActivationChannel',
    'Census_IsFlightingInternal',
    'Census_IsFlightsDisabled',
    'Census_FlightRing',
    'Census_ThresholdOptIn',
    'Census_FirmwareManufacturerIdentifier',
    'Census_FirmwareVersionIdentifier',
    'Census_IsSecureBootEnabled',
    'Census_IsWIMBootEnabled',
    'Census_IsVirtualDevice',
    'Census_IsTouchEnabled',
    'Census_IsPenCapable',
    'Census_IsAlwaysOnAlwaysConnectedCapable',
    'Wdft_IsGamer',
    'Wdft_RegionIdentifier',
    'HasDetections',
]

In [4]:
# Load the train and test sets straight out of the zip archive.
# Context managers make sure the archive and each member stream are closed
# even if parsing fails part-way through.
with zipfile.ZipFile('../input/all.zip', 'r') as zf:
    with zf.open('train.csv') as train_file:
        tr_train = pd.read_csv(train_file, dtype=column_dtypes, usecols=use_columns)
    with zf.open('test.csv') as test_file:
        # The test file is read without the target column; drop it by name rather
        # than by position so the read does not depend on the order of use_columns.
        X_test = pd.read_csv(
            test_file,
            dtype=column_dtypes,
            usecols=[name for name in use_columns if name != 'HasDetections'],
        )

In [5]:
# Separate the training frame into the target (y) and the feature matrix (X),
# then release the combined frame.
y_train = tr_train['HasDetections']
X_train = tr_train.drop(columns=['HasDetections'])
del tr_train

In [6]:
# Target-encode every column that was loaded with the 'category' dtype.
# The encoder is fitted on the training data only and then applied to both sets.
category_columns = X_train.select_dtypes(include=['category']).columns.tolist()
enc = ce.TargetEncoder(cols=category_columns)
enc.fit(X_train, y_train)
X_train = enc.transform(X_train)
X_test = enc.transform(X_test)

# Store the encoded columns as float16 in both frames.
for frame in (X_train, X_test):
    for encoded in category_columns:
        frame[encoded] = frame[encoded].astype('float16')

# Persist the encoded frames so later runs can skip the steps above.
X_train.to_pickle("../result/X_train_target_encoded.pkl")
X_test.to_pickle("../result/X_test_target_encoded.pkl")

In [None]:
# X_train = pd.read_pickle("../result/X_train_target_encoded.pkl")
# X_test = pd.read_pickle("../result/X_test_target_encoded.pkl")

In [None]:
# Fill missing values with a precomputed mean.
def fill_nan(df, column, mean):
    """Replace the NaN entries of ``df[column]`` with ``mean``, updating ``df`` in place.

    A float16 column is widened to float32 for the fill and narrowed back to
    float16 afterwards; any other dtype is filled directly.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    column : str
        Name of the column to impute.
    mean : scalar
        Value substituted for every missing entry.

    Returns
    -------
    None
    """
    was_float16 = df[column].dtype == 'float16'
    values = df[column].astype('float32') if was_float16 else df[column]
    # Assign the filled result back explicitly. Calling fillna(inplace=True) on the
    # df[column] selection is chained assignment: it acts on an intermediate object
    # and is not guaranteed to reach the frame (it never does under Copy-on-Write).
    values = values.fillna(mean)
    if was_float16:
        values = values.astype('float16')
    df[column] = values

# Impute every training column that contains NaN with its training mean, and
# apply that same value to the test set so both frames are filled consistently.
for column in list(X_train.columns[X_train.isna().any()]):
    # Compute the mean in float64: many columns are stored as float16, and a
    # reduction carried out at that width can overflow (max 65504) or lose precision.
    mean = X_train[column].astype('float64').mean()
    fill_nan(X_train, column, mean)
    fill_nan(X_test, column, mean)

In [46]:
# NOTE: X_train is deliberately not reloaded from
# "../result/X_train_target_encoded.pkl" in this cell. That pickle is written
# before the mean imputation runs, so reading it back here would put the NaNs
# back into X_train while X_test stayed imputed.

In [None]:
# Keep the test-set machine IDs for the submission file, then remove the ID
# column from both feature frames.
ID_test = X_test['MachineIdentifier']
for features in (X_train, X_test):
    features.drop(columns=['MachineIdentifier'], inplace=True)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Soft-voting ensemble tuned through a grid search. Only the XGBoost member is
# enabled; the random-forest and MLP members are kept commented out together with
# the scores noted for them.
#clf_mlp = MLPClassifier()
clf_xb = XGBClassifier()
#clf_rf = RandomForestClassifier()
clfs = [
    ('xgb', clf_xb),#0.8272
#   ('rf', clf_rf), #0.8284
#   ('mlp', clf_mlp),
]
clf_eb = VotingClassifier(estimators=clfs, voting='soft')

# Parameter grid, keyed as '<estimator name>__<parameter>'. Every entry holds a
# single value, so the search evaluates exactly one candidate.
parameters = {
    'xgb__max_depth':[4], 'xgb__min_child_weight':[4], 'xgb__gamma':[0.2],
    'xgb__subsample':[0.9], 'xgb__colsample_bytree':[0.84],
    'xgb__reg_alpha':[0.01], 'xgb__learning_rate':[0.2], 
#    "rf__n_estimators":[45], "rf__max_depth":[20], "rf__min_samples_leaf":[3],
#    'mlp__solver':['adam'], 'mlp__max_iter':[1000], 'mlp__early_stopping':[True], 
#    'mlp__hidden_layer_sizes':[(128,64,32)],'mlp__activation':['logistic'],
}
clf = GridSearchCV(clf_eb, parameters, n_jobs=-1, cv=5)
clf.fit(X_train, y_train)
#print(clf.best_params_)

# Report the 5-fold accuracy from the scores the search already computed, instead
# of passing the whole GridSearchCV to cross_val_score (which would refit the
# search once per outer fold).
# NOTE(review): with a single-candidate grid these are the same folds and metric;
# if the grid grows, this is the best candidate's CV score, not a nested-CV estimate.
best = clf.best_index_
score = np.array([
    clf.cv_results_['split%d_test_score' % fold][best]
    for fold in range(clf.n_splits_)
])
print("Accuracy: %0.4f (+/- %0.4f) [%s]" % (score.mean(), score.std(), "eb"))

In [None]:
# Predict the test set and write the submission file.
# NOTE(review): this writes hard class labels from predict(); if the scoring
# metric expects probabilities, switch to clf.predict_proba(X_test)[:, 1] — confirm.
pred = clf.predict(X_test)

# predict() returns a NumPy array, which pd.concat cannot combine with a Series,
# so build the frame explicitly. Positional pairing keeps each ID with its row.
submission = pd.DataFrame({
    'MachineIdentifier': ID_test.to_numpy(),
    'HasDetections': pred,
})
submission.to_csv('../result/submission.csv', index=False)