In [1]:
import numpy as np 
import pandas as pd 
from datetime import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
import lightgbm as lgb
import time
import datetime
from tqdm import tqdm_notebook as tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn import metrics
import gc
import dask.dataframe as dd
import dask
import timeit
from sklearn.model_selection import train_test_split

In [2]:
dtypes = {
    'MachineIdentifier':                                    'object',
    'ProductName':                                          'object',
    'EngineVersion':                                        'object',
    'AppVersion':                                           'object',
    'AvSigVersion':                                         'object',
    'IsBeta':                                               'int8',
    'RtpStateBitfield':                                     'float16',
    'IsSxsPassiveMode':                                     'int8',
    'DefaultBrowsersIdentifier':                            'float32',  # was 'float16'
    'AVProductStatesIdentifier':                            'float32',
    'AVProductsInstalled':                                  'float16',
    'AVProductsEnabled':                                    'float16',
    'HasTpm':                                               'int8',
    'CountryIdentifier':                                    'int16',
    'CityIdentifier':                                       'float32',
    'OrganizationIdentifier':                               'float16',
    'GeoNameIdentifier':                                    'float16',
    'LocaleEnglishNameIdentifier':                          'int16',  # was 'int8'
    'Platform':                                             'object',
    'Processor':                                            'object',
    'OsVer':                                                'object',
    'OsBuild':                                              'int16',
    'OsSuite':                                              'int16',
    'OsPlatformSubRelease':                                 'object',
    'OsBuildLab':                                           'object',
    'SkuEdition':                                           'object',
    'IsProtected':                                          'float16',
    'AutoSampleOptIn':                                      'int8',
    'PuaMode':                                              'object',
    'SMode':                                                'float16',
    'IeVerIdentifier':                                      'float16',
    'SmartScreen':                                          'object',
    'Firewall':                                             'float16',
    'UacLuaenable':                                         'float64', # was 'float32'
    'Census_MDC2FormFactor':                                'object',
    'Census_DeviceFamily':                                  'object',
    'Census_OEMNameIdentifier':                             'float32', # was 'float16'
    'Census_OEMModelIdentifier':                            'float32',
    'Census_ProcessorCoreCount':                            'float16',
    'Census_ProcessorManufacturerIdentifier':               'float16',
    'Census_ProcessorModelIdentifier':                      'float32', # was 'float16'
    'Census_ProcessorClass':                                'object',
    'Census_PrimaryDiskTotalCapacity':                      'float64', # was 'float32'
    'Census_PrimaryDiskTypeName':                           'object',
    'Census_SystemVolumeTotalCapacity':                     'float64', # was 'float32'
    'Census_HasOpticalDiskDrive':                           'int8',
    'Census_TotalPhysicalRAM':                              'float32',
    'Census_ChassisTypeName':                               'object',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float32', # was 'float16'
    'Census_InternalPrimaryDisplayResolutionHorizontal':    'float32', # was 'float16'
    'Census_InternalPrimaryDisplayResolutionVertical':      'float32', # was 'float16'
    'Census_PowerPlatformRoleName':                         'object',
    'Census_InternalBatteryType':                           'object',
    'Census_InternalBatteryNumberOfCharges':                'float64', # was 'float32'
    'Census_OSVersion':                                     'object',
    'Census_OSArchitecture':                                'object',
    'Census_OSBranch':                                      'object',
    'Census_OSBuildNumber':                                 'int16',
    'Census_OSBuildRevision':                               'int32',
    'Census_OSEdition':                                     'object',
    'Census_OSSkuName':                                     'object',
    'Census_OSInstallTypeName':                             'object',
    'Census_OSInstallLanguageIdentifier':                   'float16',
    'Census_OSUILocaleIdentifier':                          'int16',
    'Census_OSWUAutoUpdateOptionsName':                     'object',
    'Census_IsPortableOperatingSystem':                     'int8',
    'Census_GenuineStateName':                              'object',
    'Census_ActivationChannel':                             'object',
    'Census_IsFlightingInternal':                           'float16',
    'Census_IsFlightsDisabled':                             'float16',
    'Census_FlightRing':                                    'object',
    'Census_ThresholdOptIn':                                'float16',
    'Census_FirmwareManufacturerIdentifier':                'float16',
    'Census_FirmwareVersionIdentifier':                     'float32',
    'Census_IsSecureBootEnabled':                           'int8',
    'Census_IsWIMBootEnabled':                              'float16',
    'Census_IsVirtualDevice':                               'float16',
    'Census_IsTouchEnabled':                                'int8',
    'Census_IsPenCapable':                                  'int8',
    'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
    'Wdft_IsGamer':                                         'float16',
    'Wdft_RegionIdentifier':                                'float16',
    'HasDetections':                                        'float32',
}



## Load Data

In [3]:
%%time
df_train = dd.read_csv("train.csv", blocksize= 256e6, dtype = dtypes)
gc.collect()
df_train = df_train.compute()
df_train.drop(['index','Date','YearMonth'],axis=1,inplace=True) 

Wall time: 53.6 s


In [4]:
%%time
df_test = dd.read_csv("test_sampled.csv", blocksize= 256e6, dtype = dtypes)
gc.collect()
df_test = df_test.compute()
df_test.drop(['index','Date','YearMonth'],axis=1,inplace=True) 
y_test = df_test['HasDetections']

Wall time: 8.04 s


In [5]:
y_tran = df_train['HasDetections']


In [6]:
def encode_categorical(x_train, x_test, columns, sort=True):
    """
    Function addapted from kaggle to encode categorical features for 
    lightgbm
    """
    train_length = x_train.shape[0]
    for col in tqdm(columns):
        if col != 'MachineIdentifier' and col != 'HasDetections':
            combined_data = pd.concat([x_train[col], x_test[col]])
            combined_data, _ = pd.factorize(combined_data, sort=sort)
            combined_data = pd.Series(combined_data).astype('int32')
            x_train[col] = combined_data.iloc[:train_length].values
            x_test[col] = combined_data.iloc[train_length:].values
            x_train[col] = x_train[col].fillna(0)
            x_test[col] = x_test[col].fillna(0)
            del combined_data

    return x_train, x_test

In [7]:
g = df_train.columns.to_series().groupby(df_train.dtypes).groups
types_ = {k.name: v for k, v in g.items()}
object_columns = types_['object']

df_train, df_test = encode_categorical(df_train, df_test, object_columns)

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))




In [8]:
def train_model(x, y, light_gbm_params,   
                evaluation_metric='auc', 
                save_feature_importances=False,
                identifier_columns=['MachineIdentifier'], exp_name = "lightgbm"):

    
    scores = []
    classifier_models = []
    feature_importance_df = pd.DataFrame()
        
    for i in range(3):
        
        x_train, x_validation, y_train, y_validation = train_test_split(x, y,
                                                    stratify=y, 
                                                    test_size=0.15,
                                                           random_state=2019*i)
        
        
    
        x_train.drop(identifier_columns, axis=1, inplace=True)
        validation_identifier_data = x_validation[identifier_columns]
        x_validation.drop(identifier_columns, axis=1, inplace=True)
        x_train_columns = x_train.columns
        
        
        trn_data = lgb.Dataset(x_train,label=y_train)
        
        del x_train
        del y_train
        
        val_data = lgb.Dataset(x_validation,label=y_validation)
        
        print("Training on round {}".format(i))
        start = time.time()
        classifier_model = lgb.train(light_gbm_params,
                                     trn_data,
                                      1000,
                                     valid_sets=[trn_data, val_data],
                                     verbose_eval=100,
                                     early_stopping_rounds=100)
        end = time.time()
        print("Tme elapsed: {}".format(end-start))

        classifier_models.append(classifier_model)
        print("Predicting on round {}".format(i))
        predictions = classifier_model.predict(x_validation, num_iteration=classifier_model.best_iteration)
        
        # Round to 0 o 1
        predictions[predictions > 0.5] = 1
        predictions[predictions <= 0.5] = 0
        # Calculate accuracy
        score = accuracy_score(y_validation, predictions)        
        scores.append(score)
        
        best_model_index = 1
        if score > max(scores) or i == 1:        
            df_impt = pd.DataFrame()
            df_impt["feature"] = x_train_columns
            df_impt["importance"] = classifier_model.feature_importance(importance_type='gain')
            df_impt["fold"] = i + 1
            feature_importance_df = pd.concat([feature_importance_df, df_impt], axis=0)
            best_model_index = i
        

    cols = (feature_importance_df[["feature", "importance"]]
            .groupby("feature")
            .mean()
            .sort_values(by="importance", ascending=False)[:1000].index)

    best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]


    file_name = exp_name + '.csv'
    best_features.sort_values(by="importance", ascending=False) \
        .groupby("feature") \
        .mean() \
        .sort_values(by="importance", ascending=False) \
        .to_csv(file_name, index=True)
    
    best_features

    return classifier_models, best_model_index, scores , best_features

In [9]:
random_sample_percent = 5/8
light_gbm_params = {   
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'binary_logloss',
        'nthread': 4,
        'learning_rate': 0.05,
        'max_depth': 5,
        'num_leaves': 30,
        'sub_feature': 0.9,
        'sub_row':0.9,
        'bagging_freq': 1,
        'lambda_l1': 0.1,
        'lambda_l2': 0.1,
        'random_state': 2019
        }

In [10]:
df_test.drop(['MachineIdentifier','HasDetections'],axis=1,inplace=True) 

In [11]:
exp_name = "lightgbm_01"

models,best_model_index, validation_score, feature_importance = train_model(df_train.drop('HasDetections', axis=1),
                                      y_tran, light_gbm_params,                             
                                      save_feature_importances=True, exp_name=exp_name)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Training on round 0
Training until validation scores don't improve for 100 rounds.
[100]	training's binary_logloss: 0.617149	valid_1's binary_logloss: 0.61752
[200]	training's binary_logloss: 0.610959	valid_1's binary_logloss: 0.61177
[300]	training's binary_logloss: 0.607447	valid_1's binary_logloss: 0.608698
[400]	training's binary_logloss: 0.604995	valid_1's binary_logloss: 0.606637
[500]	training's binary_logloss: 0.603023	valid_1's binary_logloss: 0.605048
[600]	training's binary_logloss: 0.601029	valid_1's binary_logloss: 0.603416
[700]	training's binary_logloss: 0.599438	valid_1's binary_logloss: 0.602204
[800]	training's binary_logloss: 0.598034	valid_1's binary_logloss: 0.601155
[900]	training's binary_logloss: 0.596758	valid_1's binary_logloss: 0.600247
[1000]	training's binary_logloss: 0.595683	valid_1's binary_logloss: 0.599497
Did not meet early stopping. Best iteration is:
[1000]	training's binary_logloss: 0.595683	valid_1's binary_logloss: 0.599497
Tme elapsed: 371.67037

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Training on round 1
Training until validation scores don't improve for 100 rounds.
[100]	training's binary_logloss: 0.617095	valid_1's binary_logloss: 0.617811
[200]	training's binary_logloss: 0.610552	valid_1's binary_logloss: 0.611763
[300]	training's binary_logloss: 0.607106	valid_1's binary_logloss: 0.608773
[400]	training's binary_logloss: 0.604603	valid_1's binary_logloss: 0.606617
[500]	training's binary_logloss: 0.602466	valid_1's binary_logloss: 0.604823
[600]	training's binary_logloss: 0.600791	valid_1's binary_logloss: 0.603504
[700]	training's binary_logloss: 0.599137	valid_1's binary_logloss: 0.602194
[800]	training's binary_logloss: 0.597781	valid_1's binary_logloss: 0.601196
[900]	training's binary_logloss: 0.596541	valid_1's binary_logloss: 0.600325
[1000]	training's binary_logloss: 0.595457	valid_1's binary_logloss: 0.599575
Did not meet early stopping. Best iteration is:
[1000]	training's binary_logloss: 0.595457	valid_1's binary_logloss: 0.599575
Tme elapsed: 339.453

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Training on round 2
Training until validation scores don't improve for 100 rounds.
[100]	training's binary_logloss: 0.617063	valid_1's binary_logloss: 0.617856
[200]	training's binary_logloss: 0.610433	valid_1's binary_logloss: 0.611597
[300]	training's binary_logloss: 0.60712	valid_1's binary_logloss: 0.608692
[400]	training's binary_logloss: 0.604479	valid_1's binary_logloss: 0.606428
[500]	training's binary_logloss: 0.602502	valid_1's binary_logloss: 0.604849
[600]	training's binary_logloss: 0.600672	valid_1's binary_logloss: 0.60342
[700]	training's binary_logloss: 0.599048	valid_1's binary_logloss: 0.602166
[800]	training's binary_logloss: 0.597698	valid_1's binary_logloss: 0.601182
[900]	training's binary_logloss: 0.596492	valid_1's binary_logloss: 0.600365
[1000]	training's binary_logloss: 0.59536	valid_1's binary_logloss: 0.599609
Did not meet early stopping. Best iteration is:
[1000]	training's binary_logloss: 0.59536	valid_1's binary_logloss: 0.599609
Tme elapsed: 345.9221320

In [12]:
## predict

y_pred=models[best_model_index].predict(df_test, axis=1)
y_pred[y_pred > 0.5] = 1
y_pred[y_pred <= 0.5] = 0
accuracy_score(y_test, y_pred)

0.6309396063427538

In [15]:
# To plot feature importance
def plot_save_importance(best_features,plot_name):
    """
    Functio to plot and save importance of features
    """
    plt.figure(figsize=(14, 25))
    sns.barplot(x="importance",
                y="feature",
                data=best_features.sort_values(by="importance",
                                               ascending=False))
    plt.title('Importance of features')
    plt.tight_layout()
    file_name = plot_name + -".pdf"
    plt.savefig(file_name)