In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw train and test tables from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/malware-prediction-am/train.csv')
test = pd.read_csv('/kaggle/input/malware-prediction-am/test.csv')

In [None]:
# Keep the test identifiers aside: the column is dropped from `test` before
# prediction and is needed again when the submission file is built.
machine_id = test['MachineIdentifier']

In [None]:
# Columns whose missing values are replaced by the most frequent value.
_MODE_IMPUTED_FEATURES = (
    "DefaultBrowsersIdentifier", "Census_IsFlightingInternal", "Census_ThresholdOptIn",
    "Census_IsWIMBootEnabled", "OrganizationIdentifier", "SMode", "Wdft_IsGamer",
    "Wdft_RegionIdentifier", "Census_FirmwareManufacturerIdentifier",
    "Census_FirmwareVersionIdentifier", "Census_OEMModelIdentifier",
    "Census_OEMNameIdentifier", "Firewall", "Census_TotalPhysicalRAM",
    "Census_IsAlwaysOnAlwaysConnectedCapable", "Census_OSInstallLanguageIdentifier",
    "IeVerIdentifier", "Census_SystemVolumeTotalCapacity", "Census_PrimaryDiskTotalCapacity",
    "Census_InternalPrimaryDiagonalDisplaySizeInInches",
    "Census_InternalPrimaryDisplayResolutionHorizontal",
    "Census_InternalPrimaryDisplayResolutionVertical", "AVProductsEnabled",
    "AVProductsInstalled", "AVProductStatesIdentifier", "IsProtected",
    "Census_ProcessorModelIdentifier", "Census_ProcessorCoreCount",
    "Census_ProcessorManufacturerIdentifier", "RtpStateBitfield",
    "Census_IsVirtualDevice", "UacLuaenable", "GeoNameIdentifier", "SmartScreen",
    "Census_PrimaryDiskTypeName", "Census_ChassisTypeName",
    "Census_PowerPlatformRoleName", "OsBuildLab",
)


def handle_missing_values(train, test, features=None):
    """Mode-impute missing values in the selected columns of both frames.

    The fill value of every column is learned from ``train`` and applied to
    both ``train`` and ``test`` so the two frames are imputed consistently.
    If a column has no observed value in ``train`` the mode of ``test`` is
    used instead; if neither frame has an observed value the column is left
    untouched.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames to impute. Both are modified in place (columns are reassigned).
    features : iterable of str, optional
        Columns to impute. Defaults to ``_MODE_IMPUTED_FEATURES``.

    Returns
    -------
    tuple of pandas.DataFrame
        The same ``train`` and ``test`` objects that were passed in.

    Raises
    ------
    KeyError
        If a requested column is missing from either frame.
    """
    if features is None:
        features = _MODE_IMPUTED_FEATURES

    for feature in features:
        modes = train[feature].mode()
        if modes.empty:
            # Nothing observed in train: fall back to the test distribution.
            modes = test[feature].mode()
        if modes.empty:
            # Column is entirely missing in both frames; nothing to fill with.
            continue
        fill_value = modes.iloc[0]

        # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
        # the chained in-place form acts on a temporary under pandas
        # Copy-on-Write and would leave the frame unchanged.
        train[feature] = train[feature].fillna(fill_value)
        test[feature] = test[feature].fillna(fill_value)

    return train, test

def additional_features(train, test):
    """Append six engineered columns to both frames and return them.

    Added columns, in order: ``disk_remain`` (primary disk capacity minus
    system volume capacity, stored as float32), ``ram_cpu_ratio`` (RAM divided
    by core count), ``ppi`` (diagonal resolution in pixels divided by the
    diagonal size in inches), ``ppi2`` (``ppi`` squared), ``aspect_ratio``
    (horizontal / vertical resolution) and ``pixel_count`` (horizontal *
    vertical resolution).

    Both frames are modified in place; the same objects are returned.
    """
    disk_total = 'Census_PrimaryDiskTotalCapacity'
    volume_total = 'Census_SystemVolumeTotalCapacity'
    ram = 'Census_TotalPhysicalRAM'
    cores = 'Census_ProcessorCoreCount'
    res_h = 'Census_InternalPrimaryDisplayResolutionHorizontal'
    res_v = 'Census_InternalPrimaryDisplayResolutionVertical'
    diagonal = 'Census_InternalPrimaryDiagonalDisplaySizeInInches'

    for frame in (train, test):
        # Disk space outside the system volume.
        frame['disk_remain'] = (frame[disk_total] - frame[volume_total]).astype('float32')

        # Memory available per processor core.
        frame['ram_cpu_ratio'] = frame[ram] / frame[cores]

        # Pixels per inch: sqrt(horizontal**2 + vertical**2) / diagonal, and its square.
        diagonal_pixels = np.sqrt(frame[res_h] ** 2 + frame[res_v] ** 2)
        frame['ppi'] = diagonal_pixels / frame[diagonal]
        frame['ppi2'] = frame['ppi'] ** 2

        # Screen shape and total number of pixels.
        frame['aspect_ratio'] = frame[res_h] / frame[res_v]
        frame['pixel_count'] = frame[res_h] * frame[res_v]

    return train, test

def frequency_encoding(variable, train_df=None, test_df=None):
    """Build a frequency-rank encoding for one column over train and test.

    Values are ranked by their count in the concatenation of both frames
    (rank 0 is the most frequent value). Every value that occurs exactly once
    shares a single label, one past the last rank given to a repeated value.
    Missing values are not counted and therefore get no entry.

    Parameters
    ----------
    variable : str
        Name of the column to encode.
    train_df, test_df : pandas.DataFrame, optional
        Frames to count over. When omitted, the notebook-level ``train`` and
        ``test`` frames are used.

    Returns
    -------
    dict
        Mapping from each observed value to its integer label.
    """
    if train_df is None:
        train_df = train
    if test_df is None:
        test_df = test

    # value_counts() sorts by descending count, so position == frequency rank.
    # Working on the Series directly avoids relying on the column names that
    # reset_index() produces, which differ between pandas 1.x and 2.x.
    counts = pd.concat([train_df[variable], test_df[variable]]).value_counts()

    # Singletons sit at the tail of the sorted counts, so the repeated values
    # occupy ranks 0..k-1 and the shared label for singletons is k.
    singleton_label = int((counts > 1).sum())

    encoding = {}
    for rank, (value, count) in enumerate(counts.items()):
        encoding[value] = rank if count > 1 else singleton_label
    return encoding

In [None]:
# Mode-impute the missing values of the selected columns in both frames.
train,test = handle_missing_values(train,test)

In [None]:
# Append the engineered columns (disk_remain, ram_cpu_ratio, ppi, ppi2,
# aspect_ratio, pixel_count) to both frames.
train,test = additional_features(train,test)

In [None]:
# Columns removed from both frames.
dropped_columns = [
    'IsBeta',
    'CityIdentifier',
    'AutoSampleOptIn',
    'PuaMode',
    'Census_InternalBatteryType',
    'Census_InternalBatteryNumberOfCharges',
    'Census_OSArchitecture',
    'Census_OSSkuName',
    'Census_IsFlightsDisabled',
]

# MachineIdentifier is removed from train only at this point; test keeps it
# until it is dropped in a later cell.
train = train.drop(columns=['MachineIdentifier'] + dropped_columns)
test = test.drop(columns=dropped_columns)

In [None]:
# Collapse spelling/case variants and escaped control characters of
# SmartScreen into canonical labels.
# NOTE: handle_missing_values already mode-fills SmartScreen, so the np.nan
# entry only takes effect if that imputation step is skipped.
conversion_dict = {
    'off': 'Off',
    '&#x02;': '2',
    '&#x01;': '1',
    'on': 'On',
    'requireadmin': 'RequireAdmin',
    'OFF': 'Off',
    'Promt': 'Prompt',
    'requireAdmin': 'RequireAdmin',
    'prompt': 'Prompt',
    'warn': 'Warn',
    '00000000': '0',
    '&#x03;': '3',
    np.nan: 'NoExist',
}
for frame in (train, test):
    frame.replace({'SmartScreen': conversion_dict}, inplace=True)

In [None]:
# Merge 'Unspecified' into 'UNKNOWN' for the primary disk type; the other
# labels map to themselves.
conversion_dict = {
    'HDD': 'HDD',
    'SSD': 'SSD',
    'UNKNOWN': 'UNKNOWN',
    'Unspecified': 'UNKNOWN',
}
for frame in (train, test):
    frame.replace({'Census_PrimaryDiskTypeName': conversion_dict}, inplace=True)

In [None]:
# Merge 'Unspecified' into 'UNKNOWN' for the power platform role; the other
# labels map to themselves.
conversion_dict = {
    'AppliancePC': 'AppliancePC',
    'Desktop': 'Desktop',
    'EnterpriseServer': 'EnterpriseServer',
    'Mobile': 'Mobile',
    'SOHOServer': 'SOHOServer',
    'Slate': 'Slate',
    'Workstation': 'Workstation',
    'PerformanceServer': 'PerformanceServer',
    'UNKNOWN': 'UNKNOWN',
    'Unspecified': 'UNKNOWN',
}
for frame in (train, test):
    frame.replace({'Census_PowerPlatformRoleName': conversion_dict}, inplace=True)

In [None]:
# Group the size-specific server and tablet form factors into a single
# 'Server' / 'Tablet' label; the other labels map to themselves.
conversion_dict = {
    'AllInOne': 'AllInOne',
    'Convertible': 'Convertible',
    'Desktop': 'Desktop',
    'Detachable': 'Detachable',
    'LargeServer': 'Server',
    'LargeTablet': 'Tablet',
    'MediumServer': 'Server',
    'Notebook': 'Notebook',
    'PCOther': 'PCOther',
    'SmallServer': 'Server',
    'SmallTablet': 'Tablet',
}
for frame in (train, test):
    frame.replace({'Census_MDC2FormFactor': conversion_dict}, inplace=True)

In [None]:
# Columns listed as skewed are removed from both frames.
skewed_features = [
    "Census_IsWIMBootEnabled",
    "Census_IsFlightingInternal",
    "Census_ThresholdOptIn",
    "Census_IsPortableOperatingSystem",
    "SMode",
    "Census_DeviceFamily",
    "UacLuaenable",
    "Census_IsVirtualDevice",
    "ProductName",
]
for frame in (train, test):
    frame.drop(columns=skewed_features, inplace=True)

In [None]:
# Census_PrimaryDiskTotalCapacity has already been used to derive disk_remain,
# so removing it here does not affect that feature.
for frame in (train, test):
    frame.drop(
        columns=['Census_ProcessorManufacturerIdentifier', 'Census_PrimaryDiskTotalCapacity'],
        inplace=True,
    )

In [None]:
# The vertical resolution has already been folded into ppi, aspect_ratio and
# pixel_count by additional_features.
for frame in (train, test):
    frame.drop(
        columns=[
            'Census_InternalPrimaryDisplayResolutionVertical',
            'Census_OSInstallLanguageIdentifier',
        ],
        inplace=True,
    )

In [None]:
# Census_OSVersion is dropped because, per the original note, the OS version
# information is repeated elsewhere in the data.
for frame in (train, test):
    frame.drop(columns=['Census_ChassisTypeName', 'Census_OSVersion'], inplace=True)

In [None]:
# IsBeta and PuaMode are not listed here: they were already removed together
# with the first batch of dropped columns.
for frame in (train, test):
    frame.drop(columns=['IsSxsPassiveMode', 'Census_ProcessorClass'], inplace=True)

In [None]:
# The identifier is not a model feature; it was saved in `machine_id` earlier.
test.drop(columns=['MachineIdentifier'], inplace=True)

In [None]:
# Columns replaced by their frequency rank computed over train and test.
frequency_encoded = [
    'OsPlatformSubRelease',
    'OsBuildLab',
    'Processor',
    'SkuEdition',
    'SmartScreen',
    'Platform',
]

for variable in frequency_encoded:
    freq_enc_dict = frequency_encoding(variable)

    def _encode(value, _table=freq_enc_dict):
        # Values without an entry in the table become NaN.
        return _table.get(value, np.nan)

    train[variable] = train[variable].map(_encode)
    test[variable] = test[variable].map(_encode)

In [None]:
# Categorical columns converted to integer codes.
feature_encoded = ['Census_MDC2FormFactor', 'Census_PrimaryDiskTypeName', 'Census_PowerPlatformRoleName',
                   'Census_OSBranch', 'Census_OSEdition', 'Census_OSInstallTypeName', 'Census_OSWUAutoUpdateOptionsName',
                   'Census_GenuineStateName', 'Census_ActivationChannel', 'Census_FlightRing', 'EngineVersion',
                   'AppVersion', 'AvSigVersion', 'OsVer']

for feature in feature_encoded:
    train_values = train[feature].astype(str)
    test_values = test[feature].astype(str)

    # Fit ONE encoder on the values of both frames. Fitting a separate encoder
    # per frame assigns codes from each frame's own sorted label set, so the
    # same category could receive different integers in train and test and the
    # model would be scored on mismatched features.
    encoder = LabelEncoder().fit(pd.concat([train_values, test_values], ignore_index=True))

    train[feature] = encoder.transform(train_values)
    test[feature] = encoder.transform(test_values)

In [None]:
# Separate the label from the features: pop() returns the column and removes
# it from `train` in one step.
target = train.pop('HasDetections')

In [None]:
# Hold out 15% of the training rows for validation, with a fixed seed.
X_train, X_val, Y_train, Y_val = train_test_split(
    train,
    target,
    test_size=0.15,
    random_state=1,
)

In [None]:
# LightGBM training parameters: binary objective, evaluated with AUC.
param = dict(
    num_leaves=60,
    min_data_in_leaf=100,
    objective='binary',
    max_depth=-1,
    learning_rate=0.1,
    boosting='gbdt',
    feature_fraction=0.8,
    bagging_freq=1,
    bagging_fraction=0.8,
    bagging_seed=11,
    metric='auc',
    lambda_l1=0.1,
    random_state=133,
    verbosity=-1,
)

In [None]:
# Wrap the training and validation splits in LightGBM Dataset objects.
lgb_train = lgb.Dataset(data=X_train, label=Y_train)
lgb_val = lgb.Dataset(data=X_val, label=Y_val)

In [None]:
# Train for up to 10000 rounds, stopping once the validation metric has not
# improved for 200 rounds and logging the metrics every 100 rounds.
# Early stopping and logging are passed as callbacks: the
# `early_stopping_rounds` / `verbose_eval` keyword arguments of lgb.train
# were removed in LightGBM 4.0 and raise a TypeError there.
model = lgb.train(
    param,
    lgb_train,
    num_boost_round=10000,
    valid_sets=[lgb_train, lgb_val],
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=100),
    ],
)

In [None]:
# Score the test frame using the best iteration recorded during training.
result = model.predict(
    test,
    num_iteration=model.best_iteration,
)

In [None]:
# Build the submission table with explicit column names. Pairing the ids and
# predictions by position (to_numpy) avoids relying on index alignment, and
# the column name is no longer assigned from an unordered set literal.
res = pd.DataFrame({
    'MachineIdentifier': machine_id.to_numpy(),
    'HasDetections': result,
})
res.set_index('MachineIdentifier', inplace=True)

# The index is written as the first CSV column, giving the header
# "MachineIdentifier,HasDetections".
res.to_csv('submission_kaggle.csv')