In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import string
from numpy.random import normal,random,seed
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score, n
from sklearn.metrics import log_loss
from sklearn import metrics
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import scipy.stats as st
import pickle

import lightgbm as lgb
import xlearn as xl

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }

In [3]:
Data = pd.read_csv("data/train.csv", dtype=dtypes)

In [6]:
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8921483 entries, 0 to 8921482
Data columns (total 83 columns):
MachineIdentifier                                    category
ProductName                                          category
EngineVersion                                        category
AppVersion                                           category
AvSigVersion                                         category
IsBeta                                               int8
RtpStateBitfield                                     float16
IsSxsPassiveMode                                     int8
DefaultBrowsersIdentifier                            float16
AVProductStatesIdentifier                            float32
AVProductsInstalled                                  float16
AVProductsEnabled                                    float16
HasTpm                                               int8
CountryIdentifier                                    int16
CityIdentifier                         

In [70]:
Data.Wdft_IsGamer.value_counts()

0.0    6174143
1.0    2443889
Name: Wdft_IsGamer, dtype: int64

In [69]:
Data.loc[Data['HasDetections'] == 1, 'Wdft_IsGamer'].value_counts()

0.0    2976639
1.0    1324330
Name: Wdft_IsGamer, dtype: int64

In [4]:
Data = Data.loc[:, ["ProductName", "EngineVersion", "AppVersion", "AvSigVersion", "AVProductStatesIdentifier", "HasTpm", "CountryIdentifier", "CityIdentifier", "OrganizationIdentifier", "GeoNameIdentifier", "LocaleEnglishNameIdentifier", "Platform", "Processor", "OsVer", "OsBuild", "OsSuite", "OsPlatformSubRelease", "OsBuildLab", "SkuEdition", "IsProtected", "PuaMode", "SMode", "SmartScreen", "Firewall", "UacLuaenable", "Census_MDC2FormFactor", "Census_DeviceFamily", "Census_ProcessorCoreCount", "Census_ProcessorClass", "Census_PrimaryDiskTotalCapacity", "Census_PrimaryDiskTypeName", "Census_SystemVolumeTotalCapacity", "Census_HasOpticalDiskDrive", "Census_TotalPhysicalRAM", "Census_ChassisTypeName", "Census_InternalPrimaryDiagonalDisplaySizeInInches", "Census_InternalPrimaryDisplayResolutionHorizontal", "Census_InternalPrimaryDisplayResolutionVertical", "Census_PowerPlatformRoleName", "Census_OSVersion", "Census_OSArchitecture", "Census_OSBranch", "Census_OSBuildNumber", "Census_OSBuildRevision", "Census_OSEdition", "Census_OSSkuName", "Census_OSInstallTypeName", "Census_OSWUAutoUpdateOptionsName", "Census_IsPortableOperatingSystem", "Census_GenuineStateName", "Census_ActivationChannel", "Census_IsFlightsDisabled", "Census_FlightRing", "Census_IsSecureBootEnabled", "Census_IsVirtualDevice", "Census_IsTouchEnabled", "Census_IsPenCapable", "Census_IsAlwaysOnAlwaysConnectedCapable", "Wdft_IsGamer", "HasDetections"]]

In [None]:
DataTest = pd.read_csv("data/test.csv")

In [4]:
DataTest = DataTest.loc[:, ["ProductName", "EngineVersion", "AppVersion", "AvSigVersion", "AVProductStatesIdentifier", "HasTpm", "CountryIdentifier", "CityIdentifier", "OrganizationIdentifier", "GeoNameIdentifier", "LocaleEnglishNameIdentifier", "Platform", "Processor", "OsVer", "OsBuild", "OsSuite", "OsPlatformSubRelease", "OsBuildLab", "SkuEdition", "IsProtected", "PuaMode", "SMode", "SmartScreen", "Firewall", "UacLuaenable", "Census_MDC2FormFactor", "Census_DeviceFamily", "Census_ProcessorCoreCount", "Census_ProcessorClass", "Census_PrimaryDiskTotalCapacity", "Census_PrimaryDiskTypeName", "Census_SystemVolumeTotalCapacity", "Census_HasOpticalDiskDrive", "Census_TotalPhysicalRAM", "Census_ChassisTypeName", "Census_InternalPrimaryDiagonalDisplaySizeInInches", "Census_InternalPrimaryDisplayResolutionHorizontal", "Census_InternalPrimaryDisplayResolutionVertical", "Census_PowerPlatformRoleName", "Census_OSVersion", "Census_OSArchitecture", "Census_OSBranch", "Census_OSBuildNumber", "Census_OSBuildRevision", "Census_OSEdition", "Census_OSSkuName", "Census_OSInstallTypeName", "Census_OSWUAutoUpdateOptionsName", "Census_IsPortableOperatingSystem", "Census_GenuineStateName", "Census_ActivationChannel", "Census_IsFlightsDisabled", "Census_FlightRing", "Census_IsSecureBootEnabled", "Census_IsVirtualDevice", "Census_IsTouchEnabled", "Census_IsPenCapable", "Census_IsAlwaysOnAlwaysConnectedCapable", "Wdft_IsGamer"]]

In [5]:
X = Data.drop(['HasDetections'],axis=1)
y = LabelEncoder().fit_transform(Data.HasDetections)
xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.3, random_state=0)

In [9]:
del Data
del X
del y

In [21]:
print(type(xTrain))
print(type(pd.DataFrame(yTrain)))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [32]:
xTrainFFM = xTrain.join(pd.DataFrame(yTrain))
xTrainFFM = xTrainFFM.rename(columns = {0: 'HasDetections'})

In [33]:
xTrainFFM.shape

(6245038, 60)

In [6]:
le = LabelEncoder()
for column_name in xTrain.columns:
        if xTrain[column_name].dtype.name == 'category':
            xTrain.loc[:, column_name] = le.fit_transform(xTrain[column_name].astype(str))
        else:
            pass

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [7]:
xTrain = xTrain.fillna(-999)

In [12]:
xTrain.head()

Unnamed: 0,MachineIdentifier,ProductName,EngineVersion,AppVersion,AvSigVersion,IsBeta,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,...,Census_FirmwareManufacturerIdentifier,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier
7587898,5311600,4,65,57,8135,0,7.0,0,-999.0,53447.0,...,142.0,2865.0,0,0.0,0.0,0,0,0.0,1.0,11.0
3904626,2733105,1,65,106,8328,0,7.0,0,-999.0,27750.0,...,556.0,13908.0,0,-999.0,0.0,0,0,0.0,0.0,11.0
8705769,6094273,4,65,57,8088,0,7.0,0,-999.0,53447.0,...,556.0,63540.0,1,-999.0,0.0,0,0,0.0,1.0,1.0
1343039,940140,4,64,57,7680,0,7.0,0,-999.0,7945.0,...,628.0,8791.0,0,-999.0,0.0,0,0,0.0,1.0,10.0
7183243,5028447,4,65,57,8038,0,7.0,0,-999.0,53447.0,...,444.0,67773.0,0,0.0,0.0,0,0,0.0,0.0,15.0


In [8]:
for column_name in xTest.columns:
        if xTest[column_name].dtype.name == 'category':
            xTest.loc[:, column_name] = le.fit_transform(xTest[column_name].astype(str))
        else:
            pass

In [9]:
xTest = xTest.fillna(-999)

In [96]:
xTest.head()

Unnamed: 0,ProductName,EngineVersion,AppVersion,AvSigVersion,AVProductStatesIdentifier,HasTpm,CountryIdentifier,CityIdentifier,OrganizationIdentifier,GeoNameIdentifier,...,Census_GenuineStateName,Census_ActivationChannel,Census_IsFlightsDisabled,Census_FlightRing,Census_IsSecureBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer
4291725,0.989258,0.431152,0.576172,0.002523,0.652832,0.987793,0.009079,0.000683,0.470459,0.171753,...,0.089905,0.382324,0.981934,0.032166,0.51416,0.991211,0.874512,0.961914,0.935059,0.034058
5081676,0.989258,0.431152,0.576172,0.000284,0.652832,0.987793,0.003342,0.001477,0.197632,0.002808,...,0.882812,0.382324,0.981934,0.936523,0.48584,0.991211,0.125488,0.038086,0.935059,0.691895
8395646,0.989258,0.431152,0.022964,0.000281,0.652832,0.987793,0.003914,0.036591,0.308594,0.004864,...,0.882812,0.530273,0.981934,0.936523,0.51416,0.991211,0.874512,0.961914,0.935059,0.691895
5206665,0.989258,0.431152,0.095398,0.001853,0.652832,0.987793,0.005527,0.005505,0.308594,0.005219,...,0.882812,0.382324,0.981934,0.936523,0.48584,0.991211,0.874512,0.961914,0.935059,0.691895
7481451,0.989258,0.431152,0.576172,0.000377,0.652832,0.987793,0.020432,9.5e-05,0.470459,0.01741,...,0.882812,0.382324,0.981934,0.936523,0.51416,0.991211,0.874512,0.961914,0.935059,0.691895


In [30]:
pd.Series({c: xTrain[c].unique() for c in xTrain})

ProductName                                                                         [4, 1, 2, 0, 5, 3]
EngineVersion                                        [65, 64, 61, 59, 63, 54, 45, 27, 50, 57, 42, 3...
AppVersion                                           [57, 106, 26, 51, 37, 44, 25, 17, 78, 94, 95, ...
AvSigVersion                                         [8135, 8328, 8088, 7680, 8038, 7886, 8184, 804...
AVProductStatesIdentifier                            [53447.0, 27750.0, 7945.0, 47238.0, 6630.0, 43...
HasTpm                                                                                          [1, 0]
CountryIdentifier                                    [168, 199, 158, 9, 60, 66, 57, 51, 141, 80, 20...
CityIdentifier                                       [nan, 150207.0, 20805.0, 10653.0, 29085.0, 117...
OrganizationIdentifier                               [nan, 27.0, 18.0, 48.0, 50.0, 33.0, 14.0, 28.0...
GeoNameIdentifier                                    [89.0, 202.0, 240.0,

In [68]:
xTrain.columns

['Census_ProcessorCoreCount','Census_PrimaryDiskTotalCapacity','Census_SystemVolumeTotalCapacity','Census_TotalPhysicalRAM',
 'Census_InternalPrimaryDiagonalDisplaySizeInInches',
       'Census_InternalPrimaryDisplayResolutionHorizontal',
       'Census_InternalPrimaryDisplayResolutionVertical']

Index(['ProductName', 'EngineVersion', 'AppVersion', 'AvSigVersion',
       'AVProductStatesIdentifier', 'HasTpm', 'CountryIdentifier',
       'CityIdentifier', 'OrganizationIdentifier', 'GeoNameIdentifier',
       'LocaleEnglishNameIdentifier', 'Platform', 'Processor', 'OsVer',
       'OsBuild', 'OsSuite', 'OsPlatformSubRelease', 'OsBuildLab',
       'SkuEdition', 'IsProtected', 'PuaMode', 'SMode', 'SmartScreen',
       'Firewall', 'UacLuaenable', 'Census_MDC2FormFactor',
       'Census_DeviceFamily', 'Census_ProcessorCoreCount',
       'Census_ProcessorClass', 'Census_PrimaryDiskTotalCapacity',
       'Census_PrimaryDiskTypeName', 'Census_SystemVolumeTotalCapacity',
       'Census_HasOpticalDiskDrive', 'Census_TotalPhysicalRAM',
       'Census_ChassisTypeName',
       'Census_InternalPrimaryDiagonalDisplaySizeInInches',
       'Census_InternalPrimaryDisplayResolutionHorizontal',
       'Census_InternalPrimaryDisplayResolutionVertical',
       'Census_PowerPlatformRoleName', 'Census_O

In [90]:
for column_name in xTest.columns:
    if column_name not in ['Census_ProcessorCoreCount','Census_PrimaryDiskTotalCapacity',\
                                              'Census_SystemVolumeTotalCapacity','Census_TotalPhysicalRAM','Census_InternalPrimaryDiagonalDisplaySizeInInches',\
                                              'Census_InternalPrimaryDisplayResolutionHorizontal','Census_InternalPrimaryDisplayResolutionVertical']:
        probs = (xTest[column_name].value_counts()/xTest[column_name].count()).astype(np.float16)
        for ind in probs.index:
            xTest.loc[xTest[column_name] == ind, column_name] = probs[ind]
    else:
        pass

In [95]:
pd.DataFrame.to_csv(xTest, "xTest.csv")

In [89]:
print(probs)

for ind in probs.index:
    print(probs[ind])
        

4    9.892578e-01
1    1.069641e-02
2    5.960464e-06
3    2.980232e-06
5    1.132488e-06
0    7.748604e-07
Name: ProductName, dtype: float16
0.9893
0.0107
5.96e-06
3e-06
1.13e-06
8e-07


In [69]:
for column_name in xTrain.columns:
    if column_name not in ['Census_ProcessorCoreCount','Census_PrimaryDiskTotalCapacity',\
                                              'Census_SystemVolumeTotalCapacity','Census_TotalPhysicalRAM','Census_InternalPrimaryDiagonalDisplaySizeInInches',\
                                              'Census_InternalPrimaryDisplayResolutionHorizontal','Census_InternalPrimaryDisplayResolutionVertical']:
        xTrain.loc[:, column_name] = pd.to_numeric(xTrain[column_name].value_counts()/xTrain[column_name].count())
        print(column_name)
    else:
        pass

ProductName
EngineVersion
AppVersion
AvSigVersion
AVProductStatesIdentifier
HasTpm
CountryIdentifier
CityIdentifier
OrganizationIdentifier
GeoNameIdentifier
LocaleEnglishNameIdentifier
Platform
Processor
OsVer
OsBuild
OsSuite
OsPlatformSubRelease
OsBuildLab
SkuEdition
IsProtected
PuaMode
SMode
SmartScreen
Firewall
UacLuaenable
Census_MDC2FormFactor
Census_DeviceFamily
Census_ProcessorClass
Census_PrimaryDiskTypeName
Census_HasOpticalDiskDrive
Census_ChassisTypeName
Census_PowerPlatformRoleName
Census_OSVersion
Census_OSArchitecture
Census_OSBranch
Census_OSBuildNumber
Census_OSBuildRevision
Census_OSEdition
Census_OSSkuName
Census_OSInstallTypeName
Census_OSWUAutoUpdateOptionsName
Census_IsPortableOperatingSystem
Census_GenuineStateName
Census_ActivationChannel
Census_IsFlightsDisabled
Census_FlightRing
Census_IsSecureBootEnabled
Census_IsVirtualDevice
Census_IsTouchEnabled
Census_IsPenCapable
Census_IsAlwaysOnAlwaysConnectedCapable
Wdft_IsGamer


In [103]:
type(yTest)

numpy.ndarray

## FFM

In [15]:
def convert_to_ffm(df,datasetType,numerics,categories):
    currentcode = len(numerics)
    catdict = {}
    catcodes = {}
    
    # Flagging categorical and numerical fields
    for x in numerics:
         catdict[x] = 0
    for x in categories:
         catdict[x] = 1
    
    nrows = df.shape[0]
    ncolumns = len(numerics)+len(categories)
    with open(str(datasetType) + "_ffm.txt", "w") as text_file:
    
    # Looping over rows to convert each row to libffm format
        for n, r in enumerate(range(nrows)):
             datastring = ""
             datarow = df.iloc[r].to_dict()
             datastring += str((datarow['HasDetections'])) # Set Target Variable here
             
            # For numerical fields, we are creating a dummy field here
             for i, x in enumerate(catdict.keys()):
                 if(catdict[x]==0):
                     datastring = datastring + " "+str(i)+":"+ str(i)+":"+ str(datarow[x])
                 else:
            
            # For a new field appearing in a training example
                     if(x not in catcodes):
                         catcodes[x] = {}
                         currentcode +=1
                         catcodes[x][datarow[x]] = currentcode #encoding the feature
             
            # For already encoded fields
                     elif(datarow[x] not in catcodes[x]):
                         currentcode +=1
                         catcodes[x][datarow[x]] = currentcode #encoding the feature
                     
                     code = catcodes[x][datarow[x]]
                     datastring = datastring + " "+str(i)+":"+ str(int(code))+":1"

             datastring += '\n'
             text_file.write(datastring)

In [34]:
convert_to_ffm(xTrainFFM, 'train', \
              ["Census_ProcessorCoreCount", "Census_PrimaryDiskTotalCapacity", "Census_SystemVolumeTotalCapacity", "Census_TotalPhysicalRAM", "Census_InternalPrimaryDiagonalDisplaySizeInInches",\
               "Census_InternalPrimaryDisplayResolutionHorizontal", "Census_InternalPrimaryDisplayResolutionVertical"], \
              ["ProductName", "EngineVersion", "AppVersion", "AvSigVersion", "AVProductStatesIdentifier", "HasTpm", "CountryIdentifier", "CityIdentifier", "OrganizationIdentifier", \
               "GeoNameIdentifier", "LocaleEnglishNameIdentifier", "Platform", "Processor", "OsVer", "OsBuild", "OsSuite", "OsPlatformSubRelease", "OsBuildLab", "SkuEdition", \
               "IsProtected", "PuaMode", "SMode", "SmartScreen", "Firewall", "UacLuaenable", "Census_MDC2FormFactor", "Census_DeviceFamily", "Census_ProcessorClass", \
               "Census_PrimaryDiskTypeName", "Census_HasOpticalDiskDrive", "Census_ChassisTypeName", "Census_PowerPlatformRoleName", "Census_OSVersion", "Census_OSArchitecture",\
               "Census_OSBranch", "Census_OSBuildNumber", "Census_OSBuildRevision", "Census_OSEdition", "Census_OSSkuName", "Census_OSInstallTypeName", "Census_OSWUAutoUpdateOptionsName",\
               "Census_IsPortableOperatingSystem", "Census_GenuineStateName", "Census_ActivationChannel", "Census_IsFlightsDisabled", "Census_FlightRing", "Census_IsSecureBootEnabled", \
               "Census_IsVirtualDevice", "Census_IsTouchEnabled", "Census_IsPenCapable", "Census_IsAlwaysOnAlwaysConnectedCapable", "Wdft_IsGamer", "HasDetections"]
              )

KeyboardInterrupt: 

In [None]:
ffm_model = xl.create_ffm()

In [None]:
ffm_model.setTrain("train_ffm.txt")

In [None]:
param = {'task':'binary', 
         'lr':0.2,
         'lambda':0.002, 
         'metric':'acc'}

In [None]:
ffm_model.fit(param, './model.out')

In [None]:
# Prediction task
ffm_model.setTest("test_ffm.txt") # Test data
ffm_model.setSigmoid() # Convert output to 0-1

# Start to predict
# The output result will be stored in output.txt
ffm_model.predict("./model.out", "./output.txt")

## Random Forest

In [45]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 120, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 25, 50]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 8, 10]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [120, 328, 537, 746, 955, 1164, 1373, 1582, 1791, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10, 25, 50], 'min_samples_leaf': [1, 2, 4, 8, 10], 'bootstrap': [True, False]}


In [None]:
rfMalware = RandomForestClassifier()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rfMalware, param_grid = random_grid, 
                          cv = 5, n_jobs = -1, verbose = 0)

In [None]:
# Fit the grid search to the data
rfGridSearch = grid_search.fit(xTrain, yTrain)

In [None]:
rfPred = rfMalware.predict_proba(xTest,)

In [None]:
grid_search.best_params_

In [None]:
best_grid = grid_search.best_estimator_

In [None]:
rfTunedPred = best_grid.predict_proba(xTest,)

In [None]:
log_loss(yTest,rfTunedPred)

In [None]:
# save the model to disk
filename = 'RF_model.sav'
pickle.dump(grid_search, open(filename, 'wb'))

## Ensemble RF Models

In [None]:
estimators = []

rfclassifer1 = RandomForestClassifier(n_estimators = 100,
               max_features = ,
               max_depth = ,
               min_samples_split = ,
               min_samples_leaf = ,
               bootstrap = ,random_state=6,n_jobs=-1)

estimators.append(('rf1', rfclassifer1))

rfclassifer2 = RandomForestClassifier(n_estimators = 120,
               max_features = ,
               max_depth = ,
               min_samples_split = ,
               min_samples_leaf = ,
               bootstrap = ,random_state=100,n_jobs=-1)

estimators.append(('rf2', rfclassifer2))

rfclassifer3 = RandomForestClassifier(n_estimators = 150,
               max_features = ,
               max_depth = ,
               min_samples_split = ,
               min_samples_leaf = ,
               bootstrap = ,random_state=4833,n_jobs=-1)

estimators.append(('rf3', rfclassifer3))

ensemblerfVoting = VotingClassifier(estimators, voting='soft')

In [None]:
ensemblerfVoting.fit(xTrain, yTrain)

In [None]:
ensemblerfPred = ensemblerfVoting.predict_proba(xTest,)

In [None]:
log_loss(yTest, ensemblerfPred, eps=1e-15, normalize=True)

In [None]:
from sklearn.calibration import CalibratedClassifierCV

calibrated_rf_clf = CalibratedClassifierCV(ensemblerfVoting, method='isotonic', cv=5)
calibrated_rf_clf.fit(xTrain, yTrain)
rfCalibEnsemblePred = calibrated_rf_clf.predict_proba(xTest)

print("loss WITH calibration : ", log_loss(yTest, rfCalibEnsemblePred, eps=1e-15, normalize=True))

In [None]:
# save the model to disk
filename = 'Ensemble_RF_model.sav'
pickle.dump(ensemblerfVoting, open(filename, 'wb'))

## XG Boost

In [None]:
dTrain = xgb.DMatrix(xTrain, yTrain)

In [None]:
one_to_left = st.beta(10, 1)  
from_zero_positive = st.expon(0, 50)

alg = XGBClassifier(
 learning_rate = st.uniform(0.05, 0.4),
 n_estimators=st.randint(120, 1200),
 max_depth=st.randint(3, 40),
 min_child_weight=from_zero_positive,
 reg_alpha = from_zero_positive,
 gamma=st.uniform(0, 10),
 subsample = one_to_left,
 colsample_bytree = one_to_left,
 nthreads=-1,
 objective='multi:softprob',
 stratified=True,
 scale_pos_weight=1, 
 num_class = 2,   
 seed=5381)

# xgb1_model = modelfit(xgb1, xTrain, predictors)
xgb_param = alg.get_xgb_params()
cvresult = xgb.cv(xgb_param, dTrain, num_boost_round=alg.get_params()['n_estimators'], \
                  nfold=20, metrics='mlogloss', early_stopping_rounds=50, verbose_eval=False)
alg.set_params(n_estimators=cvresult.shape[0])
    
#Fit the algorithm on the data
mdl = alg.fit(xTrain, label_encoded_y, eval_metric='logloss')

In [None]:
mdl.get_params

In [None]:
xgb_pred = mdl.predict_proba(xTest,)

In [None]:
log_loss(yTest, xgb1_pred)

In [None]:
# save the model to disk
filename = 'XGB_model.sav'
pickle.dump(mdl, open(filename, 'wb'))

In [16]:
# load the model from disk
loaded_model = pickle.load(open('XGB_model.sav', 'rb'))

In [77]:
loaded_model.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 7,
 'min_child_weight': 1,
 'missing': nan,
 'n_estimators': 500,
 'n_jobs': -1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 0,
 'scale_pos_weight': 1,
 'seed': 1,
 'silent': True,
 'subsample': 1,
 'nthreads': -1}

In [30]:
result = loaded_model.score(xTest, yTest)
print(result)

0.6101578773335525


In [29]:
y_pre = loaded_model.predict(xTest)
y_pro = loaded_model.predict_proba(xTest)[:, 1]

In [33]:
print("AUC Score : %f" % metrics.roc_auc_score(yTest, y_pro))
print("Accuracy : %.4g" % metrics.accuracy_score(yTest, y_pre))

AUC Score : 0.674795
Accuracy : 0.6102


## Analyze Only Numeric Columns

In [16]:
xTrainNum = xTrain.loc[:, ["Census_ProcessorCoreCount", "Census_PrimaryDiskTotalCapacity", "Census_SystemVolumeTotalCapacity", "Census_TotalPhysicalRAM", "Census_InternalPrimaryDiagonalDisplaySizeInInches", "Census_InternalPrimaryDisplayResolutionHorizontal", "Census_InternalPrimaryDisplayResolutionVertical"]]

In [17]:
xTestNum = xTest.loc[:, ["Census_ProcessorCoreCount", "Census_PrimaryDiskTotalCapacity", "Census_SystemVolumeTotalCapacity", "Census_TotalPhysicalRAM", "Census_InternalPrimaryDiagonalDisplaySizeInInches", "Census_InternalPrimaryDisplayResolutionHorizontal", "Census_InternalPrimaryDisplayResolutionVertical"]]

In [36]:
logClf = LogisticRegressionCV(random_state=9457)

In [37]:
logClf.fit(xTrainNum, yTrain)



LogisticRegressionCV(Cs=10, class_weight=None, cv='warn', dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='warn', n_jobs=None, penalty='l2',
           random_state=9457, refit=True, scoring=None, solver='lbfgs',
           tol=0.0001, verbose=0)

In [40]:
y_lrpre = logClf.predict(xTestNum)
y_lrpro = logClf.predict_proba(xTestNum)[:, 1]

In [42]:
print("AUC Score : %f" % metrics.roc_auc_score(yTest, y_lrpro))
print("Accuracy : %.4g" % metrics.accuracy_score(yTest, y_lrpre))

AUC Score : 0.557218
Accuracy : 0.5379


In [43]:
print("AUC Score : %f" % metrics.roc_auc_score(yTest, (y_pro + y_lrpro)/2))

AUC Score : 0.670988


In [46]:
rfClf = RandomForestClassifier(verbose=2)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rfClf, param_grid = random_grid, 
                          cv = 5, n_jobs = -1, verbose = 2)

In [47]:
# Fit the grid search to the data
rfGridSearch = grid_search.fit(xTrainNum, yTrain)

Fitting 5 folds for each of 12000 candidates, totalling 60000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


KeyboardInterrupt: 

In [48]:
rfClf = RandomForestClassifier(n_estimators=170, random_state=9053207, verbose=2)

In [49]:
rfClf.fit(xTrainNum, yTrain)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 170


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   35.6s remaining:    0.0s


building tree 2 of 170
building tree 3 of 170
building tree 4 of 170
building tree 5 of 170
building tree 6 of 170
building tree 7 of 170
building tree 8 of 170
building tree 9 of 170
building tree 10 of 170
building tree 11 of 170
building tree 12 of 170
building tree 13 of 170
building tree 14 of 170
building tree 15 of 170
building tree 16 of 170
building tree 17 of 170
building tree 18 of 170
building tree 19 of 170
building tree 20 of 170
building tree 21 of 170
building tree 22 of 170
building tree 23 of 170
building tree 24 of 170
building tree 25 of 170
building tree 26 of 170
building tree 27 of 170
building tree 28 of 170
building tree 29 of 170
building tree 30 of 170
building tree 31 of 170
building tree 32 of 170
building tree 33 of 170
building tree 34 of 170
building tree 35 of 170
building tree 36 of 170
building tree 37 of 170
building tree 38 of 170
building tree 39 of 170
building tree 40 of 170
building tree 41 of 170
building tree 42 of 170
building tree 43 of 170


[Parallel(n_jobs=1)]: Done 170 out of 170 | elapsed: 100.3min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=170, n_jobs=None,
            oob_score=False, random_state=9053207, verbose=2,
            warm_start=False)

In [50]:
y_rfnpre = rfClf.predict(xTestNum)
y_rfnpro = rfClf.predict_proba(xTestNum)[:, 1]

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 170 out of 170 | elapsed:  9.7min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 170 out of 170 | elapsed:  9.7min finished


In [51]:
print("AUC Score : %f" % metrics.roc_auc_score(yTest, y_rfnpro))
print("Accuracy : %.4g" % metrics.accuracy_score(yTest, y_rfnpre))

AUC Score : 0.542977
Accuracy : 0.5354


In [56]:
print("AUC Score : %f" % metrics.roc_auc_score(yTest, \
                                               (0.8*y_pro + 0.1*y_lrpro + 0.1*y_rfnpro)))

AUC Score : 0.674005


In [75]:
chi2, p, dof, expected = st.chi2_contingency(xTrain.iloc[:,["OsVer", "OsBuild"]])

TypeError: cannot perform reduce with flexible type

In [71]:
print(chi2)

25725881.82255344


## Light GBM

In [18]:
train_data = lgb.Dataset(xTrainNum, label=yTrain)
test_data = lgb.Dataset(xTestNum, label=yTest)

In [19]:
param = {'num_leaves':43, 
         'num_round': 500,
         'num_trees':500, 
         'objective':'binary',
         'learning_rate': 0.1,
         'max_depth': 31,
         'seed': 1324,
         'num_threads': 8, 
         'verbose': 2}
param['metric'] = 'auc'

In [None]:
#lgb.cv(param, train_data, num_round, nfold=5)
bst = lgb.train(param, train_data, valid_sets=[test_data])

