In [1]:
# coding: utf-8
import gc
import pickle
import numpy as np
import pandas as pd
from itertools import combinations
from tqdm import tqdm

In [2]:
train = pd.read_pickle('../input/msmalware/train.pkl')
print("TRAIN LOADED")
test = pd.read_pickle('../input/msmalware/test.pkl')
print("TEST LOADED")

# merge both datasets

target = train['HasDetections']
del train['HasDetections']

target.to_pickle('target.pkl')
print("Target saved")
del(target)

train_rows = train.shape[0]

df_full = pd.concat([train, test])

print("MERGED") 
del(train)
del(test)
gc.collect()

TRAIN LOADED
TEST LOADED
Target saved
MERGED


18

In [3]:
# prepare the features
true_numerical_columns = [
    'Census_ProcessorCoreCount',
    'Census_PrimaryDiskTotalCapacity',
    'Census_SystemVolumeTotalCapacity',
    'Census_TotalPhysicalRAM',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches',
    'fe_avsig_gamer_freq',
    'fe_cpucores_region_freq', 
    'fe_cpucores_oemname_freq',
    'fe_geoname_oemname_freq', 
    'fe_non_primary_drive_MB',
    'fe_screen_area'

    #'Census_InternalPrimaryDisplayResolutionHorizontal',
    #'Census_InternalPrimaryDisplayResolutionVertical',
#    'Census_InternalBatteryNumberOfCharges'
]

new_numerical_columns = [
    'non_primary_drive_MB',
    'aspect_ratio',
    'Screen_Area'
]

In [4]:
DTYPES = {
    'MachineIdentifier':                                    'object',
    'ProductName':                                          'object',
    'EngineVersion':                                        'object',
    'AppVersion':                                           'object',
    'AvSigVersion':                                         'object',
    'IsBeta':                                               'int8',
    'RtpStateBitfield':                                     'float16',
    'IsSxsPassiveMode':                                     'int8',
    'DefaultBrowsersIdentifier':                            'float32',  # was 'float16'
    'AVProductStatesIdentifier':                            'float32',
    'AVProductsInstalled':                                  'float16',
    'AVProductsEnabled':                                    'float16',
    'HasTpm':                                               'int8',
    'CountryIdentifier':                                    'int16',
    'CityIdentifier':                                       'float32',
    'OrganizationIdentifier':                               'float16',
    'GeoNameIdentifier':                                    'float16',
    'LocaleEnglishNameIdentifier':                          'int16',  # was 'int8'
    'Platform':                                             'object',
    'Processor':                                            'object',
    'OsVer':                                                'object',
    'OsBuild':                                              'int16',
    'OsSuite':                                              'int16',
    'OsPlatformSubRelease':                                 'object',
    'OsBuildLab':                                           'object',
    'SkuEdition':                                           'object',
    'IsProtected':                                          'float16',
    'AutoSampleOptIn':                                      'int8',
    'PuaMode':                                              'object',
    'SMode':                                                'float16',
    'IeVerIdentifier':                                      'float16',
    'SmartScreen':                                          'object',
    'Firewall':                                             'float16',
    'UacLuaenable':                                         'float64', # was 'float32'
    'Census_MDC2FormFactor':                                'object',
    'Census_DeviceFamily':                                  'object',
    'Census_OEMNameIdentifier':                             'float32', # was 'float16'
    'Census_OEMModelIdentifier':                            'float32',
    'Census_ProcessorCoreCount':                            'float16',
    'Census_ProcessorManufacturerIdentifier':               'float16',
    'Census_ProcessorModelIdentifier':                      'float32', # was 'float16'
    'Census_ProcessorClass':                                'object',
    'Census_PrimaryDiskTotalCapacity':                      'float64', # was 'float32'
    'Census_PrimaryDiskTypeName':                           'object',
    'Census_SystemVolumeTotalCapacity':                     'float64', # was 'float32'
    'Census_HasOpticalDiskDrive':                           'int8',
    'Census_TotalPhysicalRAM':                              'float32',
    'Census_ChassisTypeName':                               'object',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float32', # was 'float16'
    'Census_InternalPrimaryDisplayResolutionHorizontal':    'float32', # was 'float16'
    'Census_InternalPrimaryDisplayResolutionVertical':      'float32', # was 'float16'
    'Census_PowerPlatformRoleName':                         'object',
    'Census_InternalBatteryType':                           'object',
    'Census_InternalBatteryNumberOfCharges':                'float64', # was 'float32'
    'Census_OSVersion':                                     'object',
    'Census_OSArchitecture':                                'object',
    'Census_OSBranch':                                      'object',
    'Census_OSBuildNumber':                                 'int16',
    'Census_OSBuildRevision':                               'int32',
    'Census_OSEdition':                                     'object',
    'Census_OSSkuName':                                     'object',
    'Census_OSInstallTypeName':                             'object',
    'Census_OSInstallLanguageIdentifier':                   'float16',
    'Census_OSUILocaleIdentifier':                          'int16',
    'Census_OSWUAutoUpdateOptionsName':                     'object',
    'Census_IsPortableOperatingSystem':                     'int8',
    'Census_GenuineStateName':                              'object',
    'Census_ActivationChannel':                             'object',
    'Census_IsFlightingInternal':                           'float16',
    'Census_IsFlightsDisabled':                             'float16',
    'Census_FlightRing':                                    'object',
    'Census_ThresholdOptIn':                                'float16',
    'Census_FirmwareManufacturerIdentifier':                'float16',
    'Census_FirmwareVersionIdentifier':                     'float32',
    'Census_IsSecureBootEnabled':                           'int8',
    'Census_IsWIMBootEnabled':                              'float16',
    'Census_IsVirtualDevice':                               'float16',
    'Census_IsTouchEnabled':                                'int8',
    'Census_IsPenCapable':                                  'int8',
    'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
    'Wdft_IsGamer':                                         'float16',
    'Wdft_RegionIdentifier':                                'float16',
    'HasDetections':                                        'float32',
}

In [5]:
def display_features(df):
    # https://www.kaggle.com/adityaecdrid/simple-feature-engineering-xd
    df['fe_non_primary_drive_MB'] = df['Census_PrimaryDiskTotalCapacity'] - df['Census_SystemVolumeTotalCapacity']

    df['aspect_ratio'] = df['Census_InternalPrimaryDisplayResolutionHorizontal']/ df['Census_InternalPrimaryDisplayResolutionVertical']

    df['fe_monitor_dims'] = df['Census_InternalPrimaryDisplayResolutionHorizontal'].astype(str) + '*' + df['Census_InternalPrimaryDisplayResolutionVertical'].astype('str')
    
  
    df['fe_screen_area'] = (df['aspect_ratio'] * (df['Census_InternalPrimaryDiagonalDisplaySizeInInches']**2))/(df['aspect_ratio']**2 + 1)
    
    df.drop('aspect_ratio', axis=1, inplace=True)

    return df
    
def fill_nan(df):

    df['Census_ThresholdOptIn'] = df['Census_ThresholdOptIn'].fillna(1)
    df['Census_IsWIMBootEnabled'] = df['Census_IsWIMBootEnabled'].fillna(1)
    df['Wdft_IsGamer'] = df['Wdft_IsGamer'].fillna(0)
    
    return df


true_numerical_columns = true_numerical_columns + new_numerical_columns
num_datatypes = ['int8', 'int16', 'int32', 'float16', 'float32']
true_numerical_columns = [c for c, v in DTYPES.items() if v in num_datatypes]

In [6]:
df_full = display_features(df_full)

print("categorical features prepared")

binary_variables = [c for c in df_full.columns if df_full[c].nunique() == 2]

categorical_columns = [c for c in df_full.columns
                       if (c not in true_numerical_columns) & (c not in binary_variables)]

#one more test with SmartScreen 

df_full['SmartScreen']=df_full['SmartScreen'].astype(str)
df_full['SmartScreen']=df_full['SmartScreen'].str.lower()
df_full['SmartScreen'].replace({"promt":"prompt",
                        "promprt":"prompt",
                        "00000000":"0",
                        "enabled":"on",
                        "of":"off" ,
                        "deny":"0" , # just one
                        "requiredadmin":"requireadmin"
                       },inplace=True)
df_full['SmartScreen'] = df_full['SmartScreen'].astype("category")
print("SmartScreen ready")

categorical features prepared
SmartScreen ready


In [7]:
def add_factor_sort(df, col):
    val = [tuple([float(v) for v in (s.split('.'))]) for s in df[col]]
    val = pd.factorize(val, sort=True)[0]
    return val

In [8]:
# compute some ratios
print("Preparing ratios")
nrows = df_full.shape[0]
df_full['fe_avsig_gamer_freq'] = df_full.groupby(['AvSigVersion','Wdft_IsGamer'])['OsBuild'].transform('count') / nrows
df_full['fe_cpucores_region_freq'] = df_full.groupby(['Census_ProcessorCoreCount','Wdft_RegionIdentifier'])['OsBuild'].transform('count') / nrows
df_full['fe_cpucores_oemname_freq'] = df_full.groupby(['Census_ProcessorCoreCount','Census_OEMNameIdentifier'])['OsBuild'].transform('count') / nrows
df_full['fe_geoname_oemname_freq'] = df_full.groupby(['GeoNameIdentifier','Census_OEMNameIdentifier'])['OsBuild'].transform('count') / nrows


#santiize bad value of AvSigVersion
df_full['AvSigVersion'] = df_full['AvSigVersion'].astype(str)
df_full.at[5244810, 'AvSigVersion'] = '1.273.1444.0'
df_full['AvSigVersion'] = df_full['AvSigVersion'].astype('category')

#encode the versions
print("Version factors")
versions = ['EngineVersion', 'AppVersion', 'AvSigVersion']
for version in tqdm(versions):
    colname = version + '_factor'
    df_full[colname] = add_factor_sort(df_full, version)
    

print("Target mean encoding")

Preparing ratios
Version factors


100%|██████████| 3/3 [01:54<00:00, 37.81s/it]

Target mean encoding





In [9]:
def frequency_encoding_single(df, col):
    t = df[col].value_counts().reset_index()
    t = t.reset_index()
    t.loc[t[col] == 1, 'level_0'] = np.nan
    t.set_index('index', inplace=True)
    max_label = t['level_0'].max() + 1
    t.fillna(max_label, inplace=True)
    return t.to_dict()['level_0']

In [10]:
print("Frequency variables")
to_freq_encode = []
for col in categorical_columns:
    if df_full[col].nunique() > 1000:
        print(col, df_full[col].nunique())
        to_freq_encode.append(col)

for variable in tqdm(to_freq_encode):
    freq_enc_dict = frequency_encoding_single(df_full, variable)
    df_full[variable] = df_full[variable].map(lambda x: freq_enc_dict.get(x, np.nan))
    categorical_columns.remove(variable)


print("Categorical encoding")
for col in tqdm(categorical_columns):
    if col == 'MachineIdentifier':
        continue
    #senti = df_full[col].nunique() + 1    
    df_full[col] = pd.factorize(df_full[col])[0]
    df_full[col] = df_full[col].astype('int32')
    df_full[col] = df_full[col] + abs(df_full[col].min()) + 1

Frequency variables
MachineIdentifier 8921483
AvSigVersion 9623
Census_PrimaryDiskTotalCapacity 8796
Census_SystemVolumeTotalCapacity 636117
Census_InternalBatteryNumberOfCharges 52836
fe_non_primary_drive_MB 705572
fe_monitor_dims 18336


  0%|          | 0/8 [00:00<?, ?it/s]

fe_screen_area 5926


100%|██████████| 8/8 [01:42<00:00, 12.64s/it]
  0%|          | 0/28 [00:00<?, ?it/s]

Categorical encoding


100%|██████████| 28/28 [00:41<00:00,  1.07s/it]


In [11]:
gc.collect()

250

In [12]:
# split back and save
train = df_full[:train_rows]
test = df_full[train_rows:]

del df_full
gc.collect()

7

In [13]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [14]:
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Mem. usage decreased to 1454.90 Mb (62.1% reduction)
Mem. usage decreased to 1280.70 Mb (62.1% reduction)


In [15]:
train.to_pickle('train_encoded_full.pkl')
test.to_pickle('test_encoded_full.pkl')