## Microsoft-Malware-Prediction
References 

https://www.kaggle.com/c/microsoft-malware-prediction/data


https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.dropna.html

https://towardsdatascience.com/3-steps-to-a-clean-dataset-with-pandas-2b80ef0c81ae



https://machinelearningmastery.com/handle-missing-data-python/




Dask for Handling Huge datasets: 

https://www.kaggle.com/szelee/how-to-import-a-csv-file-of-55-million-rows



### Download unzip data

In [1]:
# used to clean console
from IPython.display import clear_output

# get my token
!wget https://www.dropbox.com/s/xqslv5bc95p45co/kaggle.json
!wget https://effectiveml.com/files/paramsearch.py
clear_output()
  
# Next, install the Kaggle API client.
!pip install -q kaggle

!pip install dask_ml==0.11.0
# !pip install pandas==0.23.4


!pip install catboost

# The Kaggle API client expects this file to be in ~/.kaggle,
# so move it there.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

# This permissions change avoids a warning on Kaggle tool startup.
!chmod 600 ~/.kaggle/kaggle.json

clear_output()


!kaggle competitions download microsoft-malware-prediction

!unzip train.csv.zip
!unzip test.csv.zip
!unzip sample_submission.csv.zip

clear_output()

!ls

kaggle.json	sample_submission.csv	   test.csv.zip
paramsearch.py	sample_submission.csv.zip  train.csv
sample_data	test.csv		   train.csv.zip


### Imports

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from dask_ml import preprocessing
import dask.dataframe as dd   # dask always needs to call .compute() 
import gc
from copy import copy
import numpy as np
from sklearn.cluster import KMeans

 
import pandas as pd
import numpy as np
import catboost as cb
from sklearn.model_selection import KFold
from paramsearch import paramsearch
from itertools import product,chain
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, roc_auc_score



print('Dependencies Loaded!')

Dependencies Loaded!


### Prep Data

In [0]:
#defined appropriate datatypes to cut down extra space usage  
data_types = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }

columns = list(data_types.keys())

ddf_train = dd.read_csv('train.csv', usecols=columns, dtype=data_types).compute()

# ddf_test = dd.read_csv('test.csv', usecols=columns[:-1], dtype=data_types).compute()

In [4]:
ddf_train.head()

Unnamed: 0,MachineIdentifier,ProductName,EngineVersion,AppVersion,AvSigVersion,IsBeta,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,...,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections
0,0000028988387b115f69f31a3bf04f09,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1735.0,0,7.0,0,,53447.0,...,36144.0,0,,0.0,0,0,0.0,0.0,10.0,0
1,000007535c3f730efa9ea0b7ef1bd645,win8defender,1.1.14600.4,4.13.17134.1,1.263.48.0,0,7.0,0,,53447.0,...,57858.0,0,,0.0,0,0,0.0,0.0,8.0,0
2,000007905a28d863f6d0d597892cd692,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1341.0,0,7.0,0,,53447.0,...,52682.0,0,,0.0,0,0,0.0,0.0,3.0,0
3,00000b11598a75ea8ba1beea8459149f,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1527.0,0,7.0,0,,53447.0,...,20050.0,0,,0.0,0,0,0.0,0.0,3.0,1
4,000014a5f00daa18e76b81417eeb99fc,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1379.0,0,7.0,0,,53447.0,...,19844.0,0,0.0,0.0,0,0,0.0,0.0,1.0,1


In [0]:
ddf_train.describe()

Unnamed: 0,IsBeta,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,CityIdentifier,...,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections
count,8921483.0,8889165.0,8921483.0,433438.0,8885262.0,8885262.0,8885262.0,8921483.0,8921483.0,8596074.0,...,8761350.0,8921483.0,3261780.0,8905530.0,8921483.0,8921483.0,8850140.0,8618032.0,8618032.0,8921483.0
mean,7.509962e-06,,0.01733378,,49483.2,,,0.9879711,108.049,80491.52,...,32680.55,0.4860229,2.980232e-07,0.007202148,0.1255431,0.03807091,,,,0.4997927
std,0.002740421,0.0,0.1305118,,13799.94,0.0,0.0,0.1090149,63.04706,48734.61,...,21126.12,0.4998046,0.0005459785,0.08453369,0.3313338,0.1913675,0.0,0.0,0.0,0.5
min,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,1.0,5.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,0.0,7.0,0.0,788.0,49480.0,1.0,1.0,1.0,51.0,36825.0,...,13156.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
50%,0.0,7.0,0.0,1632.0,53447.0,1.0,1.0,1.0,97.0,82373.0,...,33070.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0
75%,0.0,7.0,0.0,2372.0,53447.0,2.0,1.0,1.0,162.0,123700.0,...,52436.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,11.0,1.0
max,1.0,35.0,1.0,3212.0,70507.0,7.0,5.0,1.0,222.0,167962.0,...,72105.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,15.0,1.0


In [0]:
stats = []
for col in ddf_train.columns:
    stats.append((col, ddf_train[col].unique()))
# stats
stats = []
for col in ddf_train.columns:
  stats.append((col, ddf_train[col].nunique(), ddf_train[col].isnull().sum() * 100 / ddf_train.shape[0], ddf_train[col].value_counts(normalize=True, dropna=False).values[0] * 100, ddf_train[col].dtype))
    
stats_df = pd.DataFrame(stats, columns=['Feature', 'Unique_values', 'Percentage of missing values', 'Percentage of values in the biggest category', 'type'])
stats_df.sort_values('Percentage of missing values', ascending=False)

Unnamed: 0,Feature,Unique_values,Percentage of missing values,Percentage of values in the biggest category,type
28,PuaMode,2,99.974119,99.974119,category
41,Census_ProcessorClass,3,99.589407,99.589407,category
8,DefaultBrowsersIdentifier,1730,95.141637,95.141637,float16
68,Census_IsFlightingInternal,2,83.044030,83.044030,float16
52,Census_InternalBatteryType,78,71.046809,71.046809,category
71,Census_ThresholdOptIn,2,63.524472,63.524472,float16
75,Census_IsWIMBootEnabled,2,63.439038,63.439038,float16
31,SmartScreen,21,35.610795,48.379658,category
15,OrganizationIdentifier,49,30.841487,47.037662,float16
29,SMode,2,6.027686,93.928812,float16


### Functions

In [0]:
def smoothLaplace(df):
  cat_feat = []
  cont_feat = []
  df_tmp = copy(df)
  # go through each column
  for index, (column, dtype) in enumerate(df_tmp.dtypes.items()):

    # target data
    if column in ["HasDetections", "MachineIdentifier"] :
      continue

    # check variable type
    if dtype.name == "category":
      cat_feat.append(index+1)
      # label encoding
      le = preprocessing.LabelEncoder()
      df_tmp[column] = le.fit_transform(df_tmp[column])
      del le
      gc.collect()
      df_tmp[column] = df_tmp[column] + 1     # since we do +1 to all values

    else:
      cont_feat.append(index+1)
      df_tmp[column].fillna(value=0, inplace=True)   #replace value in column where we have nan

      # soothing
      df_tmp[column] = df_tmp[column]+1
      
  return df_tmp, np.array(cat_feat), np.array(cont_feat)

def noSmooth(df):
  cat_feat = []
  cont_feat = []
  df_tmp = copy(df)
  # go through each column
  for index, (column, dtype) in enumerate(df_tmp.dtypes.items()):

    # target data
    if column in ["HasDetections", "MachineIdentifier"] :
      continue

    # check variable type
    if dtype.name == "category":
      cat_feat.append(index)
      # label encoding
      le = preprocessing.LabelEncoder()
      df_tmp[column] = le.fit_transform(df_tmp[column])
      del le
      gc.collect()
      

    else:
      cont_feat.append(index)
#       df_tmp[column].fillna(value=str(np.nan), inplace=True)   #replace value in column where we have nan

      
  return df_tmp, np.array(cat_feat), np.array(cont_feat)

In [6]:
df_train, cat_dims, con_feat = smoothLaplace(ddf_train)
del ddf_train, con_feat
gc.collect()
df_train.head()

Unnamed: 0,MachineIdentifier,ProductName,EngineVersion,AppVersion,AvSigVersion,IsBeta,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,...,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections
0,0000028988387b115f69f31a3bf04f09,2,33,28,837,1,8.0,1,1.0,53448.0,...,36145.0,1,1.0,1.0,1,1,1.0,1.0,11.0,0
1,000007535c3f730efa9ea0b7ef1bd645,2,23,18,260,1,8.0,1,1.0,53448.0,...,57859.0,1,1.0,1.0,1,1,1.0,1.0,9.0,0
2,000007905a28d863f6d0d597892cd692,2,33,28,715,1,8.0,1,1.0,53448.0,...,52683.0,1,1.0,1.0,1,1,1.0,1.0,4.0,0
3,00000b11598a75ea8ba1beea8459149f,2,33,28,761,1,8.0,1,1.0,53448.0,...,20051.0,1,1.0,1.0,1,1,1.0,1.0,4.0,1
4,000014a5f00daa18e76b81417eeb99fc,2,33,28,724,1,8.0,1,1.0,53448.0,...,19845.0,1,1.0,1.0,1,1,1.0,1.0,2.0,1


In [0]:
params = {'depth':[3,1,2,6,4,5,7,8,9,10,12,13,14],
          'iterations':[250,100,500,1000],
          'learning_rate':[0.03,0.001,0.01,0.1,0.2,0.3], 
          'l2_leaf_reg':[3,1,5,10,100],
          'border_count':[32,5,10,20,50,100,200],
          'thread_count':4,
          'task_type':"GPU"}

In [7]:
''' MOVE DATA FOR MODEL TRAINING '''

X = df_train[columns[:-1]]

y = df_train[columns[-1]]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.12, random_state=42, shuffle=True)
del X,y
gc.collect()

machine_ID = X_train[columns[0]]

del X_train[columns[0]]


machine_ID_Test = X_valid[columns[0]]

del X_valid[columns[0]]


10

## Modeling

### Cat Boost - GPU

In [0]:
import numpy as np
import gc
import time
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split #TAKES NUMPY OR DATA FRAME!!
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, roc_auc_score

In [10]:
start = time.time()
model = CatBoostClassifier(task_type = "GPU")
# model.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose = False)
model.fit(X_train, y_train, cat_features=None, verbose = False)
end = time.time()

print('GPU: %s MIN'%((end - start)/60))

GPU: 5.32486481666565 MIN


In [0]:
# Get predictions
y_pred_valid = model.predict(X_valid)

In [0]:
#Accuracy on test set
print("Test accuracy: %s"%(accuracy_score(y_valid, y_pred_valid).round(2)))

Test accuracy: 0.67


In [0]:
submission = dd.read_csv('sample_submission.csv').compute()
submission['HasDetections'] = y_pred_valid
submission['MachineIdentifier'] = machine_ID_Test

In [0]:
submission.head()

In [0]:
submission.to_csv('catboost_submission.csv', index=False)


In [0]:
!zip -r catboost_submission.zip catboost_submission.csv

  adding: catboost_submission.csv (deflated 51%)


In [0]:
# Download Data

from google.colab import files
files.download('catboost_submission.zip')