In [34]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2
from ipyexperiments import *
from lib.fastai.imports import * 
from lib.fastai.structured import *
import pandas as pd
import numpy as np
import lightgbm as lgb
from scipy.sparse import vstack, csr_matrix, save_npz, load_npz
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from datetime import datetime
from path import Path
import re2 as re
import joblib

In [35]:
##  Dainis's work

def display_n(df, n=250):
    """Render `df` with pandas' row and column display limits both set to `n`.

    The limits only apply for the duration of the call; pandas restores the
    previous option values when the context manager exits.
    """
    limits = ("display.max_rows", n, "display.max_columns", n)
    with pd.option_context(*limits):
        display(df)
            
def add_datepart(df, fldname, drop=False, time=False):
    """Expand the date column `fldname` of `df` into calendar feature columns, in place.

    New columns are named `<prefix><Attr>`, where `<prefix>` is `fldname` with a
    trailing 'date'/'Date' removed and `<Attr>` is one of Year, Month, Week, Day,
    Dayofweek, Dayofyear, Is_month_end, Is_month_start, Is_quarter_end,
    Is_quarter_start, Is_year_end, Is_year_start (plus Hour, Minute, Second when
    `time` is true) and Elapsed (whole seconds since the Unix epoch).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    fldname : str
        Name of the date column. If it is not already a datetime column it is
        converted with `pd.to_datetime` and written back to `df`.
    drop : bool
        When true, remove `fldname` from `df` after the features are added.
    time : bool
        When true, also add the Hour, Minute and Second columns.

    Returns
    -------
    None
    """
    fld = df[fldname]
    fld_dtype = fld.dtype
    # A tz-aware dtype is a pandas extension dtype, not a numpy one; substitute
    # datetime64 so the issubdtype check below recognises it as a date.
    if isinstance(fld_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
        fld_dtype = np.datetime64

    if not np.issubdtype(fld_dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time: attr = attr + ['Hour', 'Minute', 'Second']

    # `Series.dt.week` was removed in pandas 2.0. Prefer isocalendar() when it
    # exists and cast it to the dtype of the other date parts (float when the
    # column contains NaT); fall back to `.dt.week` on older pandas.
    if hasattr(fld.dt, 'isocalendar'):
        week = fld.dt.isocalendar().week.astype(fld.dt.day.dtype)
    else:
        week = fld.dt.week

    for n in attr:
        df[targ_pre + n] = week if n == 'Week' else getattr(fld.dt, n.lower())

    # Normalise to nanoseconds before the integer cast so the division below is
    # correct whatever resolution the column is stored in.
    # NOTE(review): missing dates (NaT) appear to come out as a large negative
    # sentinel rather than NaN here -- confirm downstream handling.
    epoch_ns = fld.values.astype('datetime64[ns]').astype(np.int64)
    df[targ_pre + 'Elapsed'] = epoch_ns // 10 ** 9
    if drop: df.drop(fldname, axis=1, inplace=True)

##  Pietro and Wojtek's work
def add_timestamps(df, dates_dir='dates'):
    """Add the DateAS, DateOS and DateBL timestamp columns to `df`, in place.

    * DateAS: `AvSigVersion` mapped through the lookup stored in
      `<dates_dir>/AvSigVersionTimestamps.npy`.
    * DateOS: `Census_OSVersion` mapped through the lookup stored in
      `<dates_dir>/OSVersionTimestamps.npy`.
    * DateBL: the fifth dot-separated field of `OsBuildLab`, parsed with the
      format '%y%m%d-%H%M'; values that cannot be parsed become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with `AvSigVersion`, `Census_OSVersion` and `OsBuildLab` columns.
    dates_dir : str
        Directory holding the two .npy lookup files. Defaults to 'dates', the
        location that was previously hard-coded.

    Returns
    -------
    None
    """
    def load_lookup(filename):
        # The lookup is stored as a pickled Python object inside a 0-d array.
        # NumPy >= 1.16.3 refuses to unpickle unless allow_pickle=True is given.
        # Unpickling executes arbitrary code: only load files you trust.
        stored = np.load('{}/{}'.format(dates_dir, filename), allow_pickle=True)
        return stored[()]

    def convert(x):
        # Best-effort parse: a missing value (no .split), too few fields, or a
        # malformed stamp all yield NaN instead of aborting the whole column.
        try:
            return datetime.strptime(x.split('.')[4], '%y%m%d-%H%M')
        except (AttributeError, IndexError, TypeError, ValueError):
            return np.nan

    df['DateAS'] = df['AvSigVersion'].map(load_lookup('AvSigVersionTimestamps.npy'))
    df['DateOS'] = df['Census_OSVersion'].map(load_lookup('OSVersionTimestamps.npy'))
    # BL timestamp
    df['DateBL'] = df['OsBuildLab'].map(convert)

In [36]:
# Column name -> dtype string, passed as `dtype=` to pd.read_csv when the raw
# train/test CSVs are loaded.
# NOTE(review): float16 represents consecutive integers exactly only up to 2048
# and int8 tops out at 127 -- confirm that no identifier/size column declared
# with those dtypes holds larger values.
dtypes = {
    'MachineIdentifier': 'category',
    'ProductName': 'category',
    'EngineVersion': 'category',
    'AppVersion': 'category',
    'AvSigVersion': 'category',
    'IsBeta': 'int8',
    'RtpStateBitfield': 'float16',
    'IsSxsPassiveMode': 'int8',
    'DefaultBrowsersIdentifier': 'float16',
    'AVProductStatesIdentifier': 'float32',
    'AVProductsInstalled': 'float16',
    'AVProductsEnabled': 'float16',
    'HasTpm': 'int8',
    'CountryIdentifier': 'int16',
    'CityIdentifier': 'float32',
    'OrganizationIdentifier': 'float16',
    'GeoNameIdentifier': 'float16',
    'LocaleEnglishNameIdentifier': 'int8',
    'Platform': 'category',
    'Processor': 'category',
    'OsVer': 'category',
    'OsBuild': 'int16',
    'OsSuite': 'int16',
    'OsPlatformSubRelease': 'category',
    'OsBuildLab': 'category',
    'SkuEdition': 'category',
    'IsProtected': 'float16',
    'AutoSampleOptIn': 'int8',
    'PuaMode': 'category',
    'SMode': 'float16',
    'IeVerIdentifier': 'float16',
    'SmartScreen': 'category',
    'Firewall': 'float16',
    'UacLuaenable': 'float32',
    'Census_MDC2FormFactor': 'category',
    'Census_DeviceFamily': 'category',
    'Census_OEMNameIdentifier': 'float16',
    'Census_OEMModelIdentifier': 'float32',
    'Census_ProcessorCoreCount': 'float16',
    'Census_ProcessorManufacturerIdentifier': 'float16',
    'Census_ProcessorModelIdentifier': 'float16',
    'Census_ProcessorClass': 'category',
    'Census_PrimaryDiskTotalCapacity': 'float32',
    'Census_PrimaryDiskTypeName': 'category',
    'Census_SystemVolumeTotalCapacity': 'float32',
    'Census_HasOpticalDiskDrive': 'int8',
    'Census_TotalPhysicalRAM': 'float32',
    'Census_ChassisTypeName': 'category',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches': 'float16',
    'Census_InternalPrimaryDisplayResolutionHorizontal': 'float16',
    'Census_InternalPrimaryDisplayResolutionVertical': 'float16',
    'Census_PowerPlatformRoleName': 'category',
    'Census_InternalBatteryType': 'category',
    'Census_InternalBatteryNumberOfCharges': 'float32',
    'Census_OSVersion': 'category',
    'Census_OSArchitecture': 'category',
    'Census_OSBranch': 'category',
    'Census_OSBuildNumber': 'int16',
    'Census_OSBuildRevision': 'int32',
    'Census_OSEdition': 'category',
    'Census_OSSkuName': 'category',
    'Census_OSInstallTypeName': 'category',
    'Census_OSInstallLanguageIdentifier': 'float16',
    'Census_OSUILocaleIdentifier': 'int16',
    'Census_OSWUAutoUpdateOptionsName': 'category',
    'Census_IsPortableOperatingSystem': 'int8',
    'Census_GenuineStateName': 'category',
    'Census_ActivationChannel': 'category',
    'Census_IsFlightingInternal': 'float16',
    'Census_IsFlightsDisabled': 'float16',
    'Census_FlightRing': 'category',
    'Census_ThresholdOptIn': 'float16',
    'Census_FirmwareManufacturerIdentifier': 'float16',
    'Census_FirmwareVersionIdentifier': 'float32',
    'Census_IsSecureBootEnabled': 'int8',
    'Census_IsWIMBootEnabled': 'float16',
    'Census_IsVirtualDevice': 'float16',
    'Census_IsTouchEnabled': 'int8',
    'Census_IsPenCapable': 'int8',
    'Census_IsAlwaysOnAlwaysConnectedCapable': 'float16',
    'Wdft_IsGamer': 'float16',
    'Wdft_RegionIdentifier': 'float16',
    'HasDetections': 'int8',
}

# Uncomment the following block on the first run.
# It reads train.csv and test.csv from INPUT_DIR using the `dtypes` mapping,
# overwrites MachineIdentifier with the row index, adds the timestamp columns via
# add_timestamps, and dumps both frames under data/ with joblib.
# While it stays wrapped in triple quotes it is only a string expression, so the
# cell does nothing except echo that string as its output.
'''

with IPyExperimentsCPU():    
    print('Download Train and Test Data.\n')

    # Pietro, uncomment the following line and comment out the next one
    # INPUT_DIR = Path('E:/malware_microsoft' )
    INPUT_DIR = Path('./input' )

    train = pd.read_csv(Path(INPUT_DIR / 'train.csv'), dtype=dtypes, low_memory=True)
    train['MachineIdentifier'] = train.index.astype('uint32')

    test  = pd.read_csv(Path(INPUT_DIR /'test.csv'),  dtype=dtypes, low_memory=True)
    test['MachineIdentifier']  = test.index.astype('uint32')

    add_timestamps(train)
    add_timestamps(test)

    joblib.dump(train, 'data/train_w_time_origin.pkl')
    joblib.dump(test, 'data/test_w_time_origin.pkl')
'''

"\n\nwith IPyExperimentsCPU():    \n    print('Download Train and Test Data.\n')\n\n    # Pietro, uncomment the following line and comment out the next one\n    # INPUT_DIR = Path('E:/malware_microsoft' )\n    INPUT_DIR = Path('./input' )\n\n    train = pd.read_csv(Path(INPUT_DIR / 'train.csv'), dtype=dtypes, low_memory=True)\n    train['MachineIdentifier'] = train.index.astype('uint32')\n\n    test  = pd.read_csv(Path(INPUT_DIR /'test.csv'),  dtype=dtypes, low_memory=True)\n    test['MachineIdentifier']  = test.index.astype('uint32')\n\n    add_timestamps(train)\n    add_timestamps(test)\n\n    joblib.dump(train, 'data/train_w_time_origin.pkl')\n    joblib.dump(test, 'data/test_w_time_origin.pkl')\n"

In [6]:
def versioning(df, fldname, categorical_vars=None, drop=False):
    """Split a dotted version-string column into one column per component, in place.

    Each value of `df[fldname]` is split on '.', and the i-th component is
    stored in a new column named `<fldname>V<i>` (e.g. 'OsVerV0', 'OsVerV1').
    Rows with fewer components than the longest value get missing values in
    the trailing columns.

    This single definition replaces two same-named ones, of which the second
    silently shadowed the first, split on ',' instead of '.', and raised
    TypeError when building the column name from a str and an int.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    fldname : str
        Name of the column holding the version strings.
    categorical_vars : list of str, optional
        When given, the names of the new columns are appended to this list
        (each name at most once, so calling the function for both the train
        and the test frame does not create duplicates).
    drop : bool
        When true, remove `fldname` from `df` after the split.

    Returns
    -------
    None
    """
    versions = df[fldname].str.split('.', expand=True)
    for i, col in enumerate(versions.columns):
        newfld = fldname + 'V' + str(i)
        df[newfld] = versions[col]
        if categorical_vars is not None and newfld not in categorical_vars:
            categorical_vars.append(newfld)
    if drop: df.drop(fldname, axis=1, inplace=True)

with IPyExperimentsCPU() as preprocess:
    # Names of the columns handled as categorical features.
    categorical_vars = [
        'MachineIdentifier', 'ProductName', 'EngineVersion', 'AppVersion',
        'AvSigVersion', 'Platform', 'Processor', 'OsVer',
        'OsPlatformSubRelease', 'OsBuildLab', 'SkuEdition', 'PuaMode',
        'SmartScreen', 'Census_MDC2FormFactor', 'Census_DeviceFamily',
        'Census_ProcessorClass', 'Census_PrimaryDiskTypeName',
        'Census_ChassisTypeName', 'Census_PowerPlatformRoleName',
        'Census_InternalBatteryType', 'Census_OSVersion',
        'Census_OSArchitecture', 'Census_OSBranch', 'Census_OSEdition',
        'Census_OSSkuName', 'Census_OSInstallTypeName',
        'Census_OSWUAutoUpdateOptionsName', 'Census_GenuineStateName',
        'Census_ActivationChannel', 'Census_FlightRing',
    ]

    # Reload the frames cached by the first-run cell.
    train = joblib.load('data/train_w_time_origin.pkl')
    test = joblib.load('data/test_w_time_origin.pkl')
    # presumably a placeholder so test carries the same columns as train -- verify
    test['HasDetections'] = -1

    # Expand every timestamp column (including the time-of-day parts), keeping
    # the source column; train is processed first, then test.
    for frame in (train, test):
        for date_col in ('DateAS', 'DateOS', 'DateBL'):
            add_datepart(frame, date_col, drop=False, time=True)

    preprocess.keep_var_names('train', 'test', 'categorical_vars')
    


*** Experiment started with the CPU-only backend


*** Current state:
RAM:    Used    Free   Total       Util
CPU:   2,099  57,811  64,352 MB   3.26% 


･ RAM:  △Consumed    △Peaked    Used Total | Exec time 0:00:39.245
･ CPU:      7,045        127      9,195 MB |

IPyExperimentsCPU: Finishing

*** Experiment finished in 00:00:39 (elapsed wallclock time)

*** Newly defined local variables:
Kept:    test, train

*** Experiment memory:
RAM: Consumed       Reclaimed
CPU:    7,096        0 MB (  0.00%)

*** Current state:
RAM:    Used    Free   Total       Util
CPU:   9,195  50,712  64,352 MB  14.29% 




In [8]:


# Save the list of categorical column names to val/categorical.pkl.
joblib.dump(categorical_vars, 'val/categorical.pkl')

['val/categorical.pkl']

In [9]:
# Preview the first rows of the categorical columns with the row and column
# display limits both raised to 100 for this cell only.
with pd.option_context("display.max_rows", 100, "display.max_columns", 100):
    display(train[categorical_vars].head())
        


Unnamed: 0,ProductName,EngineVersion,AppVersion,AvSigVersion,Platform,Processor,OsVer,OsPlatformSubRelease,OsBuildLab,SkuEdition,PuaMode,SmartScreen,Census_MDC2FormFactor,Census_DeviceFamily,Census_ProcessorClass,Census_PrimaryDiskTypeName,Census_ChassisTypeName,Census_PowerPlatformRoleName,Census_InternalBatteryType,Census_OSVersion,Census_OSArchitecture,Census_OSBranch,Census_OSEdition,Census_OSSkuName,Census_OSInstallTypeName,Census_OSWUAutoUpdateOptionsName,Census_GenuineStateName,Census_ActivationChannel,Census_FlightRing
0,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1735.0,windows10,x64,10.0.0.0,rs4,17134.1.amd64fre.rs4_release.180410-1804,Pro,,,Desktop,Windows.Desktop,,HDD,Desktop,Desktop,,10.0.17134.165,amd64,rs4_release,Professional,PROFESSIONAL,UUPUpgrade,UNKNOWN,IS_GENUINE,Retail,Retail
1,win8defender,1.1.14600.4,4.13.17134.1,1.263.48.0,windows10,x64,10.0.0.0,rs4,17134.1.amd64fre.rs4_release.180410-1804,Pro,,,Notebook,Windows.Desktop,,HDD,Notebook,Mobile,,10.0.17134.1,amd64,rs4_release,Professional,PROFESSIONAL,IBSClean,UNKNOWN,OFFLINE,Retail,NOT_SET
2,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1341.0,windows10,x64,10.0.0.0,rs4,17134.1.amd64fre.rs4_release.180410-1804,Home,,RequireAdmin,Desktop,Windows.Desktop,,SSD,Desktop,Desktop,,10.0.17134.165,amd64,rs4_release,Core,CORE,UUPUpgrade,FullAuto,IS_GENUINE,OEM:NONSLP,Retail
3,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1527.0,windows10,x64,10.0.0.0,rs4,17134.1.amd64fre.rs4_release.180410-1804,Pro,,ExistsNotSet,Desktop,Windows.Desktop,,UNKNOWN,MiniTower,Desktop,,10.0.17134.228,amd64,rs4_release,Professional,PROFESSIONAL,UUPUpgrade,FullAuto,IS_GENUINE,OEM:NONSLP,Retail
4,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1379.0,windows10,x64,10.0.0.0,rs4,17134.1.amd64fre.rs4_release.180410-1804,Home,,RequireAdmin,Notebook,Windows.Desktop,,HDD,Portable,Mobile,lion,10.0.17134.191,amd64,rs4_release,Core,CORE,Update,FullAuto,IS_GENUINE,Retail,Retail


In [21]:


# Columns whose values are split into per-component columns by versioning().
versioned = [
    'EngineVersion',
    'AppVersion',
    'AvSigVersion',
    'OsVer',
    'Census_OSVersion',
    'OsBuildLab',
]

with IPyExperimentsCPU() as vsplits:
    for ver in versioned:
        # Same column on train first, then on test.
        for frame in (train, test):
            versioning(frame, ver)


*** Experiment started with the CPU-only backend


*** Current state:
RAM:    Used    Free   Total       Util
CPU:  11,004  47,670  64,352 MB  17.10% 


･ RAM:  △Consumed    △Peaked    Used Total | Exec time 0:02:56.148
･ CPU:      3,645        428     11,009 MB |

IPyExperimentsCPU: Finishing

*** Experiment finished in 00:02:56 (elapsed wallclock time)

*** Newly defined local variables:
Deleted: ver

*** Experiment memory:
RAM: Consumed       Reclaimed
CPU:        4        0 MB (  0.00%)

*** Current state:
RAM:    Used    Free   Total       Util
CPU:  11,009  47,630  64,352 MB  17.11% 




In [29]:

# Remember the sizes first: train and test are rebound below.
n_train, n_test = len(train), len(test)

# Process train and test as one frame, then cut it back into the two parts.
# NOTE(review): train_cats / proc_df come from the fastai `structured` star
# import; presumably they encode categoricals and fill missing values -- confirm.
df_raw = pd.concat([train, test], sort=False)
train_cats(df_raw)
df, y, nas = proc_df(df_raw)

train = df.head(n_train).reset_index(drop=True)
test = df.tail(n_test).reset_index(drop=True)

joblib.dump(train, 'data/train_dainis.pkl')
joblib.dump(test, 'data/test_dainis.pkl')

['data/test_dainis.pkl']

In [33]:
with IPyExperimentsCPU() as transform:
    # The triple-quoted block below is a disabled experiment (label-encode each
    # categorical column and add a '<col>bis' copy that zeroes out rare or
    # train/test-unbalanced values). As a bare string it is never executed.
    '''
    print('Transform all features to category.\n')
    
    for i, usecol in enumerate(categorical_vars):
        print(str(i) + " / " + str(len(categorical_vars)))
        train[usecol] = train[usecol].astype('str')
        test[usecol] = test[usecol].astype('str')

        train[usecol] = train[usecol].astype('str')
        test[usecol] = test[usecol].astype('str')

        #Fit LabelEncoder
        le = LabelEncoder().fit(
                np.unique(train[usecol].unique().tolist()+
                          test[usecol].unique().tolist()))

        #At the end 0 will be used for dropped values
        train[usecol] = le.transform(train[usecol])+1
        test[usecol]  = le.transform(test[usecol])+1

        agg_tr = (train
                  .groupby([usecol])
                  .aggregate({'MachineIdentifier':'count'})
                  .reset_index()
                  .rename({'MachineIdentifier':'Train'}, axis=1))
        agg_te = (test
                  .groupby([usecol])
                  .aggregate({'MachineIdentifier':'count'})
                  .reset_index()
                  .rename({'MachineIdentifier':'Test'}, axis=1))

        agg = pd.merge(agg_tr, agg_te, on=usecol, how='outer').replace(np.nan, 0)
        #Select values with more than 1000 observations
        agg = agg[(agg['Train'] > 1000)].reset_index(drop=True)
        agg['Total'] = agg['Train'] + agg['Test']
        #Drop unbalanced values
        agg = agg[(agg['Train'] / agg['Total'] > 0.2) & (agg['Train'] / agg['Total'] < 0.8)]
        agg[usecol+'Copy'] = agg[usecol]

        train[usecol+'bis'] = (pd.merge(train[[usecol]], 
                                  agg[[usecol, usecol+'Copy']], 
                                  on=usecol, how='left')[usecol+'Copy']
                         .replace(np.nan, 0).astype('int').astype('category'))

        test[usecol+'bis']  = (pd.merge(test[[usecol]], 
                                  agg[[usecol, usecol+'Copy']], 
                                  on=usecol, how='left')[usecol+'Copy']
                         .replace(np.nan, 0).astype('int').astype('category'))

        del le, agg_tr, agg_te, agg, usecol
        '''

    EXP_TAG = Path('dainis0')
    # All artefacts of this experiment go to val/<EXP_TAG>/.
    out_dir = Path('val') / EXP_TAG

    train_ids = train.index
    test_ids = test.index
    y_train = np.array(train['HasDetections'])

    # Fulfill contract with evaluator notebook
    joblib.dump(categorical_vars, out_dir / 'categorical.pkl')
    joblib.dump(train, out_dir / 'train-original.pkl')
    # The name used to start with a stray space (' test-original.pkl'), which
    # wrote a file that does not match the naming of the other artefacts.
    joblib.dump(test, out_dir / 'test-original.pkl')
    joblib.dump(y_train, out_dir / 'y_train-original.pkl')
    joblib.dump(train_ids, out_dir / 'train_ids-original.pkl')
    joblib.dump(test_ids, out_dir / 'test_ids-original.pkl')
    
 


*** Experiment started with the CPU-only backend


*** Current state:
RAM:    Used    Free   Total       Util
CPU:  32,254  27,282  64,352 MB  50.12% 


･ RAM:  △Consumed    △Peaked    Used Total | Exec time 0:02:09.890
･ CPU:         68         16     32,254 MB |

IPyExperimentsCPU: Finishing

*** Experiment finished in 00:02:10 (elapsed wallclock time)

*** Newly defined local variables:
Deleted: EXP_TAG, test_ids, train_ids, y_train

*** Experiment memory:
RAM: Consumed       Reclaimed
CPU:        0        0 MB (100.00%)

*** Current state:
RAM:    Used    Free   Total       Util
CPU:  32,254  27,292  64,352 MB  50.12% 


