In [1]:
import warnings
import gc
import time
import sys
import datetime

import utils


import numpy as np 
import pandas as pd 
import catboost as cb


from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score


# Notebook-wide pandas settings: turn off the chained-assignment check and
# let wide frames display up to 999 columns.
pd.set_option("mode.chained_assignment", None)
pd.set_option("display.max_columns", 999)

from tqdm import tqdm

# Log the wall-clock start time of the run.
print("script started: ", time.strftime("%b %d %Y %H:%M:%S"))

script started:  Mar 12 2019 17:17:40


In [2]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are touched; every other column is left as it is. Integer columns are tried
    against int8, int16, int32 and int64 in that order; float columns against
    float16 and float32, falling back to float64.

    Range checks are inclusive, so a column spanning exactly the limits of a
    dtype (for example -128..127) is stored in that dtype rather than the next
    wider one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # If min/max are NaN (e.g. an empty column) no candidate matches
            # and the column keeps its current dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE(review): only the value range is checked here, not precision;
            # float16/float32 may round values that float64 stored exactly.
            chosen = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    chosen = candidate
                    break
            df[col] = df[col].astype(chosen)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero starting size cannot produce nan/inf output.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [3]:
# Read the pickled train and test frames.
train = pd.read_pickle('../input/msmalware/train.pkl')
print("TRAIN LOADED")
test = pd.read_pickle('../input/msmalware/test.pkl')
print("TEST LOADED")

# Detach the label column from the training features and persist it.
target = train.pop('HasDetections')
target.to_pickle('target.pkl')
print("Target saved")

# Number of rows in the training frame.
train_rows = len(train)

# Stack train on top of test.
# NOTE(review): df_full and train_rows are not referenced by any cell visible
# here -- confirm they are still needed; the concat holds a second copy of the data.
df_full = pd.concat([train, test])
print("MERGED")

gc.collect()

TRAIN LOADED
TEST LOADED
Target saved
MERGED


18

In [4]:
# Downcast the numeric columns of both frames.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

# Columns kept as numeric measurements; every other column is handled as categorical.
true_numerical_columns = [
    'Census_ProcessorCoreCount',
    'Census_PrimaryDiskTotalCapacity',
    'Census_SystemVolumeTotalCapacity',
    'Census_TotalPhysicalRAM',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches',
    'Census_InternalPrimaryDisplayResolutionHorizontal',
    'Census_InternalPrimaryDisplayResolutionVertical',
    'Census_InternalBatteryNumberOfCharges'
]

# Train columns holding exactly two distinct non-null values.
binary_variables = list(
    filter(lambda name: train[name].nunique() == 2, train.columns)
)

# Every train column that is not listed as truly numerical.
categorical_columns = list(
    filter(lambda name: name not in true_numerical_columns, train.columns)
)

Mem. usage decreased to 1310.34 Mb (0.0% reduction)
Mem. usage decreased to 1153.46 Mb (0.0% reduction)


In [5]:
def encode_categorical_columns(x_train, x_test, columns, sort=True):
    """Replace the given columns of both frames with shared integer codes.

    For each column the train and test values are stacked and factorized
    together, so a value gets the same code in both frames. The codes are then
    shifted so that the smallest code in the column becomes 1.

    The columns 'MachineIdentifier' and 'HasDetections' are skipped.

    Parameters
    ----------
    x_train, x_test : pandas.DataFrame
        Frames to encode. Both are modified in place.
    columns : iterable of str
        Names of the columns to encode.
    sort : bool, default True
        Forwarded to ``pd.factorize``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The same ``x_train`` and ``x_test`` objects.
    """
    skipped = ('MachineIdentifier', 'HasDetections')
    n_train = len(x_train)

    for name in tqdm(columns):
        if name in skipped:
            continue

        stacked = pd.concat([x_train[name], x_test[name]])
        codes = pd.Series(pd.factorize(stacked, sort=sort)[0]).astype('int32')
        # factorize marks missing values with -1; shifting by |min| + 1 makes
        # the lowest code 1.
        # NOTE(review): missing values therefore get code 1 (not 0) in columns
        # that contain them -- confirm this is the intended encoding.
        codes = codes + abs(codes.min()) + 1

        code_values = codes.values
        x_train[name] = code_values[:n_train]
        x_test[name] = code_values[n_train:]
        # The integer codes carry no NaN at this point; the fill is kept as a safeguard.
        x_train[name] = x_train[name].fillna(0)
        x_test[name] = x_test[name].fillna(0)

    return x_train, x_test

In [6]:
# Encode every column in categorical_columns with integer codes shared by train and test.
train, test = encode_categorical_columns(train, test, categorical_columns)

100%|██████████| 74/74 [02:16<00:00,  1.67s/it]


In [7]:
# Show the column dtypes before the second downcast.
print(train.dtypes)

# Downcast again now that the encoded columns hold integer codes.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

MachineIdentifier                           uint64
ProductName                                  int64
EngineVersion                                int64
AppVersion                                   int64
AvSigVersion                                 int64
IsBeta                                       int64
RtpStateBitfield                             int64
IsSxsPassiveMode                             int64
DefaultBrowsersIdentifier                    int64
AVProductStatesIdentifier                    int64
AVProductsInstalled                          int64
AVProductsEnabled                            int64
HasTpm                                       int64
CountryIdentifier                            int64
CityIdentifier                               int64
OrganizationIdentifier                       int64
GeoNameIdentifier                            int64
LocaleEnglishNameIdentifier                  int64
Platform                                     int64
Processor                      

In [8]:
# Show the column dtypes after the downcast.
print(train.dtypes)


# Checkpoint the encoded train frame to disk in case a later step crashes.
train.to_pickle('train_cat.pkl.gz', compression='gzip')

MachineIdentifier                           uint64
ProductName                                   int8
EngineVersion                                 int8
AppVersion                                    int8
AvSigVersion                                 int16
IsBeta                                        int8
RtpStateBitfield                              int8
IsSxsPassiveMode                              int8
DefaultBrowsersIdentifier                    int16
AVProductStatesIdentifier                    int32
AVProductsInstalled                           int8
AVProductsEnabled                             int8
HasTpm                                        int8
CountryIdentifier                            int16
CityIdentifier                               int32
OrganizationIdentifier                        int8
GeoNameIdentifier                            int16
LocaleEnglishNameIdentifier                  int16
Platform                                      int8
Processor                      

In [9]:
# Checkpoint the encoded test frame as a gzip-compressed pickle.
test.to_pickle('test_cat.pkl.gz', compression='gzip')

In [10]:
# Write the target as a gzip-compressed pickle (an uncompressed 'target.pkl' was written earlier).
target.to_pickle('target.pkl.gz', compression='gzip')

In [11]:
# Log the wall-clock time at which data preparation completed.
print("data preparation finished: ", time.strftime("%b %d %Y %H:%M:%S"))

data preparation finished:  Mar 12 2019 18:10:37
