In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import vstack, csr_matrix, save_npz, load_npz
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.linear_model import LogisticRegression
import time
import gc
import pickle
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.io as pio
from scipy.sparse import hstack
init_notebook_mode(connected=True)
gc.enable()

# Numeric columns. transform_to_category() fills their missing values with the
# mean of the training column and does not label-encode them.
continuous_columns = [  # All the columns that hold real continuous data
    'Census_ProcessorCoreCount',
    'Census_PrimaryDiskTotalCapacity',
    'Census_SystemVolumeTotalCapacity',
    'Census_TotalPhysicalRAM',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches',
    'Census_InternalPrimaryDisplayResolutionHorizontal',
    'Census_InternalPrimaryDisplayResolutionVertical',
    'Census_InternalBatteryNumberOfCharges',
    'Census_OSBuildNumber',
    'Census_OSBuildRevision',
    'Census_ThresholdOptIn',
    'OsBuild'
]

# Flag columns. transform_to_category() skips these completely: no imputation
# and no label encoding is applied to them.
bool_columns = [
    'IsBeta',
    'IsSxsPassiveMode',
    'Census_IsPortableOperatingSystem',
    'Census_IsSecureBootEnabled',
    'Census_IsTouchEnabled',
    'Census_IsPenCapable',
    'HasTpm',
    'Census_HasOpticalDiskDrive'
]

def save_obj(obj, name ):
    """Pickle ``obj`` to ``obj/<name>.pkl`` (relative to the working directory).

    Args:
        obj: Any picklable Python object.
        name: File stem, without the ``.pkl`` extension.
    """
    import os  # local import so the helper does not depend on another cell

    # Without this the open() below fails with FileNotFoundError on a fresh
    # checkout where the ``obj`` directory has not been created yet.
    os.makedirs('obj', exist_ok=True)
    with open('obj/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    """Return the object stored in ``obj/<name>.pkl``.

    Args:
        name: File stem, without the ``.pkl`` extension.
    """
    pkl_path = 'obj/' + name + '.pkl'
    # NOTE: unpickling can run arbitrary code; only load files you produced.
    handle = open(pkl_path, 'rb')
    try:
        return pickle.load(handle)
    finally:
        handle.close()

In [2]:
def OneHotEncoding(train,test,m = 10000):
    """One-hot encode ``train`` and ``test`` and save them as sparse ``.npz`` files.

    The encoder is fitted on the concatenation of both frames, then each frame
    is transformed ``m`` rows at a time and the pieces are stacked.

    Args:
        train: DataFrame of training features.
        test: DataFrame of test features with the same columns as ``train``.
        m: Number of rows transformed per chunk.

    Side effects:
        Writes ``train_log.npz`` and ``test_log.npz`` (compressed) to the
        working directory. Nothing is returned.
    """
    # ``sparse=True`` is intentionally not passed: sparse output is the
    # default, and the keyword was renamed in newer scikit-learn releases.
    ohe = OneHotEncoder(categories='auto', dtype='uint8').fit(pd.concat([train,test]))

    def _encode_in_chunks(frame):
        # Stepping by ``m`` never yields an empty slice. The previous
        # ``shape[0] // m + 1`` form produced an empty trailing chunk whenever
        # the row count was an exact multiple of ``m``, which transform() rejects.
        return vstack([ohe.transform(frame[start:start + m])
                       for start in range(0, frame.shape[0], m)])

    train = _encode_in_chunks(train)
    test = _encode_in_chunks(test)

    print('Saving files...')
    save_npz('train_log.npz', train, compressed=True)
    save_npz('test_log.npz',  test,  compressed=True)

In [7]:
def transform_to_category(train,test):
    """Impute, label-encode and downcast the feature columns of both frames.

    Columns ``train.columns[1:-1]`` are visited in order:

    * columns listed in ``bool_columns`` are left untouched;
    * columns listed in ``continuous_columns`` get their missing values filled
      with the float32 mean of the *train* column (also used for ``test``);
    * every other column is cast to ``str``, label-encoded with a
      ``LabelEncoder`` fitted on the union of train and test values, and stored
      as the smallest of int8 / int16 / int32 that holds the largest code.

    Both frames are modified in place and their ``HasDetections`` column is
    removed, so the function cannot be applied twice to the same frames.

    Args:
        train: Training DataFrame containing a ``HasDetections`` column.
        test: Test DataFrame with the same columns.

    Returns:
        ``(train, test, train_ids, test_ids, y_train, y_test)`` where the ids
        are the frames' indexes and the ``y_*`` values are NumPy arrays of the
        ``HasDetections`` column.
    """
    # The slice skips the first and the last column.
    # NOTE(review): the last column is presumably the label; confirm the first
    # column is not a real feature that should be encoded as well.
    for col_name in train.columns.tolist()[1:-1]:
        if col_name in bool_columns:
            continue
        if col_name in continuous_columns:
            nan_val = train[col_name].astype('float32').mean()
            # Assign the result back: ``df[col].fillna(..., inplace=True)`` acts
            # on a temporary object and does not update the frame when pandas
            # Copy-on-Write is active.
            train[col_name] = train[col_name].fillna(nan_val)
            test[col_name] = test[col_name].fillna(nan_val)
            continue

        train[col_name] = train[col_name].astype('str')
        test[col_name] = test[col_name].astype('str')

        # Fit on the union so every value of either frame has a code. The codes
        # are already contiguous (0..K-1), so a single pass is sufficient; the
        # former "+1 then re-encode" step mapped them straight back to 0..K-1.
        le = LabelEncoder().fit(
            np.unique(train[col_name].unique().tolist() +
                      test[col_name].unique().tolist()))
        train[col_name] = le.transform(train[col_name])
        test[col_name] = le.transform(test[col_name])
        del le

        # Reduce memory consumption to accelerate training
        mx = max(train[col_name].max(), test[col_name].max())
        if mx < 2**7:
            code_dtype = 'int8'
        elif mx < 2**15:
            code_dtype = 'int16'
        else:
            code_dtype = 'int32'
        train[col_name] = train[col_name].astype(code_dtype)
        test[col_name] = test[col_name].astype(code_dtype)

    gc.collect()

    train_ids = train.index
    test_ids = test.index
    y_train = np.array(train['HasDetections'])
    y_test = np.array(test['HasDetections'])

    del test['HasDetections'], train['HasDetections']
    gc.collect()

    return train,test,train_ids,test_ids,y_train,y_test

In [4]:
def split_train_val(train_ids,y_train,n_splits=3,test_size=0.1,seed=2019):
    """Build several random train/validation splits of ``train_ids``.

    Split ``i`` uses ``random_state = seed + i``, so the splits differ from one
    another but are reproducible.

    Args:
        train_ids: Row identifiers to split.
        y_train: Labels aligned with ``train_ids`` (passed to
            ``train_test_split`` only; its split is discarded).
        n_splits: Number of independent splits to generate.
        test_size: Fraction (or count) of ids placed in the validation part.
        seed: Base random seed.

    Returns:
        List of ``(train_index, val_index)`` tuples of sorted NumPy arrays.
    """
    splits = []
    for i in range(n_splits):
        fit_ids, val_ids, _, _ = train_test_split(
            train_ids, y_train, test_size=test_size, random_state=seed + i)
        splits.append((np.array(sorted(fit_ids)), np.array(sorted(val_ids))))
        del fit_ids, val_ids
        gc.collect()
    return splits

In [27]:
def train_classifier(train_val_set,sol='saga',reg=0.01,DK=False,FE=False,m = 10000):
    """Fit and score one ``LogisticRegression`` per (train, validation) split.

    For every split the one-hot matrix is loaded from ``train_log.npz``, the
    requested rows are extracted ``m`` at a time, optional dense feature
    columns are appended, and an L2-penalised logistic regression is fitted.

    Module-level names read by this function: ``y_train`` (always);
    ``features`` and ``train_eng`` when ``DK`` is true; ``features2`` and
    ``train_ex`` when ``FE`` is true.

    Args:
        train_val_set: Iterable of ``(train_index, val_index)`` NumPy arrays.
        sol: Solver name passed to ``LogisticRegression``.
        reg: Value passed as ``C``.
        DK: Append the columns ``features`` of ``train_eng``.
        FE: Append the columns ``features2`` of ``train_ex``.
        m: Number of rows sliced from the sparse matrix per chunk.

    Returns:
        ``(LR_results_acc, LR_results_auc, model, sec_iter)``: four lists with
        one entry per split: validation accuracy, validation ROC AUC, the
        fitted classifier, and the wall-clock time of the whole trial (load,
        fit and scoring) divided by ``clf.n_iter_``.
    """
    def _take_rows(matrix, rows):
        # Slice the sparse matrix in chunks of ``m`` rows. Stepping by ``m``
        # avoids the empty trailing chunk that ``len // m + 1`` created when
        # the number of rows was an exact multiple of ``m``.
        if rows.shape[0] == 0:
            return matrix[rows]
        return vstack([matrix[rows[start:start + m]]
                       for start in range(0, rows.shape[0], m)])

    def _append_columns(matrix, frame, rows, columns):
        # Append all requested dense columns with ONE hstack; stacking them one
        # by one rebuilt the complete sparse matrix for every column.
        columns = list(columns)
        if not columns:
            return matrix
        return hstack((matrix, frame.iloc[rows][columns].to_numpy()))

    LR_results_acc = []
    LR_results_auc = []
    sec_iter= []
    model = []
    print('Starting Logistic Regression \n')
    for counter, (train_index, test_index) in enumerate(train_val_set):

        t = time.perf_counter()
        print('\n Trial {}\n'.format(counter+1))

        # Re-loaded for every split and released below to limit peak memory.
        train = load_npz('train_log.npz')

        x_fit = _take_rows(train, train_index)
        x_val = _take_rows(train, test_index)
        if DK:
            x_fit = _append_columns(x_fit, train_eng, train_index, features)
            x_val = _append_columns(x_val, train_eng, test_index, features)
        if FE:
            x_fit = _append_columns(x_fit, train_ex, train_index, features2)
            x_val = _append_columns(x_val, train_ex, test_index, features2)
        x_fit, x_val = csr_matrix(x_fit, dtype='float32'), csr_matrix(x_val, dtype='float32')
        y_fit, y_val = y_train[train_index], y_train[test_index]

        del train
        gc.collect()
        # Logistic Regression Classifier
        clf = LogisticRegression(
                    C = reg,
                    max_iter = 200,
                    tol = 0.00002,
                    solver = sol,
                    fit_intercept = True,
                    penalty = 'l2',
                    dual = False,
                    verbose = 100)

        clf.fit(x_fit, y_fit)
        n_iter = clf.n_iter_
        val_proba = clf.predict_proba(x_val)[:,1]
        val_pred = clf.predict(x_val)
        LR_results_auc.append(roc_auc_score(y_val,val_proba))
        LR_results_acc.append(accuracy_score(y_val,val_pred))
        model.append(clf)
        del x_fit, x_val, y_fit, y_val, clf
        gc.collect()
        elapsed_time = time.perf_counter() - t
        # Callers index each entry with [0], so keep the division by the
        # ``n_iter_`` attribute itself rather than by a scalar.
        sec_iter.append(elapsed_time/n_iter)

    return LR_results_acc,LR_results_auc,model,sec_iter

In [24]:
def test_set_metrics(best_model,DK=False,FE=False,models=None):
    """Score one fitted classifier on the held-out test matrix.

    The one-hot test matrix is loaded from ``test_log.npz``; optional dense
    columns are appended in the same order ``train_classifier`` uses.

    Module-level names read by this function: ``y_test`` (always);
    ``features`` and ``test_eng`` when ``DK`` is true; ``features2`` and
    ``test_ex`` when ``FE`` is true; ``model`` when ``models`` is not given.

    Args:
        best_model: Index of the classifier to evaluate.
        DK: Append the columns ``features`` of ``test_eng``.
        FE: Append the columns ``features2`` of ``test_ex``.
        models: Sequence of fitted classifiers. Defaults to the global
            ``model`` list; pass it explicitly to stay independent of cells
            that rebind that name.

    Returns:
        ``(test_acc, test_roc)``: accuracy and ROC AUC on the test set.
    """
    if models is None:
        models = model
    clf = models[best_model]

    test = load_npz('test_log.npz')
    # One hstack per feature group instead of one per column.
    if DK and len(features) > 0:
        test = hstack((test, test_eng[list(features)].to_numpy()))
    if FE and len(features2) > 0:
        test = hstack((test, test_ex[list(features2)].to_numpy()))
    test = csr_matrix(test, dtype='float32')

    test_proba = clf.predict_proba(test)[:,1]
    test_pred = clf.predict(test)
    test_roc = roc_auc_score(y_test,test_proba)
    test_acc = accuracy_score(y_test,test_pred)
    return test_acc,test_roc

## Logistic Regression with sag solver and C=0.05 raw data

In [15]:
print('Download Train and Test Data.\n')
train = pd.read_csv('sample_train_with_no_missing_values.csv', low_memory=True)
test  = pd.read_csv('sample_test_with_no_missing_values.csv', low_memory=True)
del train['MachineIdentifier'], test['MachineIdentifier']
gc.collect()

print('Transforing features to categories')
train,test,train_ids,test_ids,y_train,y_test = transform_to_category(train,test)
print('One Hot Encoding the features')
OneHotEncoding(train,test)
print('Creating train and validation sets')
train_val_sets = split_train_val(train_ids,y_train)
# This section reports the sag solver with C=0.05. train_classifier() defaults
# to saga / 0.01, so the configuration has to be passed explicitly.
LR_results_acc,LR_results_auc,model,sec_iter = train_classifier(train_val_sets,sol='sag',reg=0.05)

Download Train and Test Data.




Columns (28) have mixed types. Specify dtype option on import or set low_memory=False.



Transforing features to categories
One Hot Encoding the features
Saving files...
Creating train and validation sets
Fitting Logistic regression
Starting Logistic Regression 


 Trial 1

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
convergence after 42 epochs took 165 seconds
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.8min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.8min finished

 Trial 2

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
convergence after 39 epochs took 155 seconds
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.6min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.6min finished

 Trial 3

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
convergence after 33 epochs took 149 seconds
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.5min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out

In [26]:
# Evaluate the trial with the highest validation accuracy on the test set.
best_model = np.argmax(LR_results_acc)
test_acc,test_roc = test_set_metrics(best_model)
# Loop over the trials that were actually run instead of assuming exactly three.
for i in range(len(LR_results_acc)):
    print('Trial:',i)
    print('Accuracy on val set using the raw data:', LR_results_acc[i])
    print('ROC AUC on val set using the raw data:', LR_results_auc[i])
    # sec_iter entries are elapsed time divided by clf.n_iter_, hence the [0].
    print('Average time per iteration:',sec_iter[i][0],'\n')

print('Mean accuracy is:',np.mean(LR_results_acc),'with',np.std(LR_results_acc)*100,'% standard deviation')
print('Execution time per iteration:',np.mean(sec_iter))
print('Accuracy on test set :', test_acc)
print('ROC AUC on test set using:', test_roc)

Trial: 0
Accuracy on val set using the raw data: 0.6579685972793874
ROC AUC on val set using the raw data: 0.7206921209001405
Average time per iteration: 7.024340467905166 

Trial: 1
Accuracy on val set using the raw data: 0.658318313471247
ROC AUC on val set using the raw data: 0.7207356812653187
Average time per iteration: 7.317196677231457 

Trial: 2
Accuracy on val set using the raw data: 0.6585469740582323
ROC AUC on val set using the raw data: 0.7211792031425327
Average time per iteration: 8.879978860333336 

Mean accuracy is: 0.6582779616029555 with 0.023783906257902034 % standard deviation
Execution time per iteration: 7.740505335156652
Accuracy on test set : 0.621576059015531
ROC AUC on test set using: 0.6763719678890392


## Logistic Regression with saga solver and C=0.01 raw data

In [None]:
print('Creating train and validation sets')
# Fresh splits, then the saga / C=0.01 configuration named in the heading.
train_val_sets = split_train_val(train_ids=train_ids, y_train=y_train)
LR_results_acc,LR_results_auc,model,sec_iter = train_classifier(
    train_val_sets, reg=0.01, sol='saga')

Creating train and validation sets
Starting Logistic Regression 


 Trial 1

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
max_iter reached after 997 seconds
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 16.6min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 16.6min finished



The max_iter was reached which means the coef_ did not converge




 Trial 2

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [9]:
# Evaluate the trial with the highest validation accuracy on the test set.
best_model = np.argmax(LR_results_acc)
test_acc,test_roc = test_set_metrics(best_model)
# Loop over the trials that were actually run instead of assuming exactly three.
for i in range(len(LR_results_acc)):
    print('Trial:',i)
    print('Accuracy on val set using the raw data:', LR_results_acc[i])
    print('ROC AUC on val set using the raw data:', LR_results_auc[i])
    # sec_iter entries are elapsed time divided by clf.n_iter_, hence the [0].
    print('Average time per iteration:',sec_iter[i][0],'\n')

print('Mean accuracy is:',np.mean(LR_results_acc),'with',np.std(LR_results_acc)*100,'% standard deviation')
print('Execution time per iteration:',np.mean(sec_iter))
print('Accuracy on test set :', test_acc)
print('ROC AUC on test set using:', test_roc)

Trial: 0
Accuracy on val set using the raw data: 0.6582600274392705
ROC AUC on val set using the raw data: 0.7202387822011277
Average time per iteration: 6.817834427360013 

Trial: 1
Accuracy on val set using the raw data: 0.6588204700544302
ROC AUC on val set using the raw data: 0.7205736656205185
Average time per iteration: 10.608940491140002 

Trial: 2
Accuracy on val set using the raw data: 0.6589056573319345
ROC AUC on val set using the raw data: 0.720947164429651
Average time per iteration: 9.615234210630005 

Accuracy is: 0.6586620516085451 with 0.02863934270344954 % standard deviation
Execution time per iteration: 9.01400304304334


## Logistic Regression with sag solver and C=0.05 removed features

In [None]:
print('Download Train and Test Data with redudant features removed.\n')
train = pd.read_csv('sample_train_removed_features.csv', low_memory=True)
test  = pd.read_csv('sample_test_removed_features.csv', low_memory=True)
del train['MachineIdentifier'], test['MachineIdentifier']
gc.collect()

print('Transforing features to categories')
train,test,train_ids,test_ids,y_train,y_test = transform_to_category(train,test)
print('One Hot Encoding the features')
OneHotEncoding(train,test)
print('Creating train and validation sets')
train_val_sets = split_train_val(train_ids,y_train)
# This section reports the sag solver with C=0.05. train_classifier() defaults
# to saga / 0.01, so the configuration has to be passed explicitly.
LR_results_acc,LR_results_auc,model,sec_iter = train_classifier(train_val_sets,sol='sag',reg=0.05)

In [24]:
# Evaluate the trial with the highest validation accuracy on the test set.
best_model = np.argmax(LR_results_acc)
test_acc,test_roc = test_set_metrics(best_model)
# Loop over the trials that were actually run instead of assuming exactly three.
for i in range(len(LR_results_acc)):
    print('Trial:',i)
    print('Accuracy on val set removing redundant features:', LR_results_acc[i])
    print('ROC AUC on val set removing redundant features:', LR_results_auc[i])
    # sec_iter entries are elapsed time divided by clf.n_iter_, hence the [0].
    print('Average time per iteration:',sec_iter[i][0],'\n')

print('Mean accuracy is:',np.mean(LR_results_acc),'with',np.std(LR_results_acc)*100,'% standard deviation')
print('Execution time per iteration:',np.mean(sec_iter))
print('Accuracy on test set :', test_acc)
print('ROC AUC on test set using:', test_roc)

Trial: 0
Accuracy on val set using the raw data: 0.6575605950555511
ROC AUC on val set using the raw data: 0.7200691154419181
Average time per iteration: 4.339123708429997 

Trial: 1
Accuracy on val set using the raw data: 0.6576188810875276
ROC AUC on val set using the raw data: 0.7200922955813303
Average time per iteration: 4.217396166159997 

Trial: 2
Accuracy on val set using the raw data: 0.6575650785964724
ROC AUC on val set using the raw data: 0.7205884217466303
Average time per iteration: 4.323433965169988 

Accuracy is: 0.6575815182465171 with 0.00264828491806262 % standard deviation
Execution time per iteration: 4.293317946586661


## Logistic Regression with sag solver and C=0.05 with engineered features using domain knowledge

In [None]:
print('Download Train and Test Data with engineered features.\n')
train_eng = pd.read_csv('sample_train_engineered.csv', low_memory=True)
test_eng  = pd.read_csv('sample_test_engineered.csv', low_memory=True)
# Assign back: chained ``fillna(..., inplace=True)`` on a column selection does
# not update the frame when pandas Copy-on-Write is active.
test_eng['Lag5'] = test_eng['Lag5'].fillna(0)
gc.collect()

#Normalizing engineered features
features = list(train_eng.columns[-6:])
for col in features:
    # Standardise BOTH splits with the training statistics. Scaling the test
    # set with its own mean/std puts the two splits on different scales, so
    # the fitted coefficients would not transfer to the test data.
    col_mean = train_eng[col].mean()
    col_std = train_eng[col].std()
    train_eng[col] = (train_eng[col]-col_mean)/col_std
    test_eng[col] = (test_eng[col]-col_mean)/col_std

# transform_to_category() deletes 'HasDetections' from the frames it receives.
# If train/test were already processed by an earlier section, calling it again
# raises KeyError; in that case the saved one-hot matrices, ids and labels from
# that section are reused as they are.
if 'HasDetections' in train.columns:
    print('Transforing features to categories')
    train,test,train_ids,test_ids,y_train,y_test = transform_to_category(train,test)
    print('One Hot Encoding the features')
    OneHotEncoding(train,test)
print('Creating train and validation sets')
train_val_sets = split_train_val(train_ids,y_train)
# sag / C=0.05 as stated in the heading (the function defaults are saga / 0.01).
LR_results_acc,LR_results_auc,model,sec_iter = train_classifier(train_val_sets,sol='sag',reg=0.05,DK=True)

In [106]:
# Evaluate the trial with the highest validation accuracy on the test set.
best_model = np.argmax(LR_results_acc)
test_acc,test_roc = test_set_metrics(best_model,DK=True)
# Loop over the trials that were actually run instead of assuming exactly three.
for i in range(len(LR_results_acc)):
    print('Trial:',i)
    print('Accuracy on val set using the DK features:', LR_results_acc[i])
    print('ROC AUC on val set using the DK features:', LR_results_auc[i])
    # sec_iter entries are elapsed time divided by clf.n_iter_, hence the [0].
    print('Average time per iteration:',sec_iter[i][0],'\n')

print('Mean accuracy is:',np.mean(LR_results_acc),'with',np.std(LR_results_acc)*100,'% standard deviation')
print('Execution time per iteration:',np.mean(sec_iter))
print('Accuracy on test set :', test_acc)
print('ROC AUC on test set using:', test_roc)

Trial: 0
Accuracy on val set using the raw data: 0.6581793237026874
ROC AUC on val set using the raw data: 0.7203169415767454
Average time per iteration: 4.331263996214984 

Trial: 1
Accuracy on val set using the raw data: 0.6583721159623024
ROC AUC on val set using the raw data: 0.7204620498257785
Average time per iteration: 4.343054847965032 

Trial: 2
Accuracy on val set using the raw data: 0.6585380069763896
ROC AUC on val set using the raw data: 0.7210493407614424
Average time per iteration: 4.337539140820009 

Accuracy is: 0.6583631488804599 with 0.01465690488609836 % standard deviation
Execution time per iteration: 4.337285995000008


## Logistic Regression with sag solver and C=0.05 with DK&FE (domain-knowledge features plus extra engineered features)

In [None]:
print('Download Train and Test Data with engineered features.\n')
train_ex = pd.read_csv('train_extra_13.csv', low_memory=True)
test_ex = pd.read_csv('test_extra_13.csv', low_memory=True)
gc.collect()
# Six columns: the last seven of the file without the very last one.
features2=list(train_ex.columns)[-7:-1]

# transform_to_category() deletes 'HasDetections' from the frames it receives.
# If train/test were already processed by an earlier section, calling it again
# raises KeyError; in that case the saved one-hot matrices, ids and labels from
# that section are reused as they are.
if 'HasDetections' in train.columns:
    print('Transforing features to categories')
    train,test,train_ids,test_ids,y_train,y_test = transform_to_category(train,test)
    print('One Hot Encoding the features')
    OneHotEncoding(train,test)
print('Creating train and validation sets')
train_val_sets = split_train_val(train_ids,y_train)
# sag / C=0.05 as stated in the heading (the function defaults are saga / 0.01).
LR_results_acc,LR_results_auc,model,sec_iter = train_classifier(train_val_sets,sol='sag',reg=0.05,DK=True, FE=True)

In [127]:
# Evaluate the trial with the highest validation accuracy on the test set.
best_model = np.argmax(LR_results_acc)
test_acc,test_roc = test_set_metrics(best_model,DK=True,FE=True)
# Loop over the trials that were actually run instead of assuming exactly three.
for i in range(len(LR_results_acc)):
    print('Trial:',i)
    # Both feature groups are active in this section, so label the lines DK&FE.
    print('Accuracy on val set using the DK&FE features:', LR_results_acc[i])
    print('ROC AUC on val set using the DK&FE features:', LR_results_auc[i])
    # sec_iter entries are elapsed time divided by clf.n_iter_, hence the [0].
    print('Average time per iteration:',sec_iter[i][0],'\n')

print('Mean accuracy is:',np.mean(LR_results_acc),'with',np.std(LR_results_acc)*100,'% standard deviation')
print('Execution time per iteration:',np.mean(sec_iter))
print('Accuracy on test set :', test_acc)
print('ROC AUC on test set using:', test_roc)

Trial: 0
Accuracy on val set using the raw data: 0.6584842044853343
ROC AUC on val set using the raw data: 0.7205153207970998
Average time per iteration: 4.57077757356994 

Trial: 1
Accuracy on val set using the raw data: 0.6583138299303257
ROC AUC on val set using the raw data: 0.7206272527852066
Average time per iteration: 4.575619924209969 

Trial: 2
Accuracy on val set using the raw data: 0.6586814802858706
ROC AUC on val set using the raw data: 0.7212364293113739
Average time per iteration: 4.575491411099938 

Accuracy is: 0.6584931715671769 with 0.015022650099243626 % standard deviation
Execution time per iteration: 4.573962969626615


In [133]:
#Just adding the results to csv so that members of the team can use them
# The lists below use their own names: binding them to ``model``, ``test`` or
# ``std`` would overwrite the fitted classifiers read by test_set_metrics()
# and the test DataFrame.
model_names = ['raw data','raw data 2','removed features','DK_engineered', 'DK&FE']
convergence = ['True','True','True','False','False']
DEV = [0.658542490517311, 0.6589056573319345, 0.6575785292192362,
      0.6585380069763896,0.6586814802858706]
test_scores = [0.6215880151603916, 0.6217188810875276, 0.6213279690096726,
      0.6199500233144825,0.6198364399383063]
time_iter =[7.360175984256411, 9.615234210630005, 4.217396166159997,
           4.337285995000008, 4.573962969626615]
std_pct = [0.02298734939586127,0.02863934270344954,0.00264828491806262,
      0.01465690488609836,0.015022650099243626]
solver = ['sag','saga','sag','sag','sag']
reg_strength= [0.05,0.01, 0.05,0.05,0.05]

# Building the frame from a dict keeps the numeric columns numeric;
# np.column_stack converted every value to a string.
my_results = pd.DataFrame({
    'Model': model_names,
    'Solver': solver,
    'Regularization_L2': reg_strength,
    'Convergence': convergence,
    'DEV set': DEV,
    'std %': std_pct,
    'test set': test_scores,
    'Time/Iteration': time_iter,
})
my_results.to_csv(r'LR_results.csv', index=None, header=True)

# Logistic regression with frequency encoded features

In [11]:
print('Download Train and Test Data with redudant features removed.\n')
t = time.perf_counter()
train = pd.read_csv('freq_train.csv', low_memory=True)
test  = pd.read_csv('freq_test.csv', low_memory=True)
gc.collect()


# Single 90/10 split of the frame; the label split returned by
# train_test_split is ignored because the labels are read from the frames.
train_x, val_x, _, _ = train_test_split(train,train['HasDetections'],test_size=0.1, random_state=2019)
y_fit = train_x['HasDetections']
y_val = val_x['HasDetections']
y_test = test['HasDetections']

# Cleanup
del train_x['HasDetections'], val_x['HasDetections'], test['HasDetections']
gc.collect()

print('Starting Logistic regression')
# Logistic Regression Classifier
clf = LogisticRegression(
            C = 0.05,
            max_iter = 100,
            tol = 0.0001,
            solver = 'sag',
            fit_intercept = True,
            penalty = 'l2',
            dual = False,
            verbose = 100)

clf.fit(train_x, y_fit)

# Training set: these predictions are made on the rows the model was fitted
# on, so the score is labelled as a training score, not a validation score.
fit_preds = clf.predict_proba(train_x)[:, 1]
print('Training AUC score:', roc_auc_score(y_fit, fit_preds),'\n')
# Validation Set
oof_preds = clf.predict_proba(val_x)[:, 1]
print('Validation AUC score: ', roc_auc_score(y_val, oof_preds),'\n')

# Cleanup
del train_x, y_fit, y_val, val_x, fit_preds
gc.collect()

# Test Set
predictions = clf.predict_proba(test)[:, 1]
elapsed_time = time.perf_counter() - t
print('Time elapsed to complete the whole process:',elapsed_time)

Download Train and Test Data with redudant features removed.

Starting Logistic regression
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
convergence after 19 epochs took 55 seconds
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   54.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   54.8s finished
Validation AUC score: 0.6725096665036613 

Validation AUC score:  0.6723269602470521 

Time elapsed to complete the whole process: 105.92784166699994


In [12]:
weights = clf.coef_
save_obj(weights,'freq_weights')
abs_weights = np.abs(weights)
# ``clf`` was fitted on the columns of ``train`` without 'HasDetections', so
# that column order matches the coefficient order.
# A separate name is used on purpose: ``features`` is the global list that
# train_classifier() and test_set_metrics() read when DK=True.
freq_features = [col for col in train.columns if col != 'HasDetections']
# Map every feature name to the absolute value of its coefficient.
my_dict = {col: abs_weights[0,num] for num,col in enumerate(freq_features)}

In [13]:
# Order the (feature, |weight|) pairs from the smallest to the largest weight
# and split them into two parallel lists for plotting.
sorted_dict = sorted(my_dict.items(), key=lambda pair: pair[1], reverse=False)
cols = [name for name, _ in sorted_dict]
weights = [value for _, value in sorted_dict]

In [15]:
# Horizontal bar chart: one bar per feature, bar length = |coefficient|.
trace = go.Bar(y=cols,
               x=weights,
               orientation='h', marker=dict(color='rgb(49,130,189)'), name='train')

layout = go.Layout(
    title='Feature Importance', height=600, width=800,
    xaxis=dict(
        # The x values are absolute logistic-regression weights, not category
        # counts, so the axis is labelled accordingly.
        title='Absolute coefficient value',
        # NOTE(review): newer plotly releases deprecate ``titlefont`` in favour
        # of ``title=dict(text=..., font=...)``; confirm the installed version.
        titlefont=dict(size=16, color='rgb(107, 107, 107)'),
        # Leave the left quarter of the figure free for the feature names.
        domain=[0.25, 1]
    ),
    barmode='group',
    bargap=0.1,
    bargroupgap=0.1
)

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)