In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

## Read the training and test data sets, previously split and stored as separate csv files.
## The last column, labelled '86', is the 0/1 label indicating benign/malware for each sample row.

# Load the pre-split training and test sets from their csv files.
df_train = pd.read_csv("data/train_data.csv")
df_test = pd.read_csv("data/test_data.csv")

# The label sits in the final column, so its positional index is
# (number of training columns - 1).
last_column = len(df_train.columns) - 1

# Every column before the label is a feature; the label column is the target.
X_train, y_train = df_train.iloc[:, :last_column], df_train.iloc[:, last_column]

# The test frame is sliced with the index derived from the training frame.
# NOTE(review): this assumes both csv files share the same column layout - confirm.
X_test, y_test = df_test.iloc[:, :last_column], df_test.iloc[:, last_column]

In [46]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Base random forest; random_state is fixed so the forest's randomness is repeatable.
estimator = RandomForestClassifier(random_state=47)

# Search grid: only the number of trees, the tree depth and the pruning
# strength (ccp_alpha) vary - 3 x 3 x 3 = 27 candidates. The remaining
# settings are pinned to a single value each.
parameters = dict(
    n_estimators=[100, 110, 120],
    max_depth=[20, 22, 24],
    min_samples_split=[2],
    min_samples_leaf=[1],
    max_features=['log2'],
    ccp_alpha=[0.0, 0.005, 0.01],
)

# Exhaustive cross-validated search scored by accuracy; verbose=3 logs the
# score and fit time of every individual fold.
model = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring='accuracy',
    verbose=3,
)

# Left as the last expression so the fitted search object is displayed.
model.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5] END ccp_alpha=0.0, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.968 total time=   0.8s
[CV 2/5] END ccp_alpha=0.0, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.971 total time=   0.8s
[CV 3/5] END ccp_alpha=0.0, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.973 total time=   0.8s
[CV 4/5] END ccp_alpha=0.0, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.971 total time=   0.8s
[CV 5/5] END ccp_alpha=0.0, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.969 total time=   0.8s
[CV 1/5] END ccp_alpha=0.0, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=110;, score=0.967 total time=   0.8s
[CV 2/5] END ccp_alpha=0.0

[CV 5/5] END ccp_alpha=0.005, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=110;, score=0.943 total time=   0.9s
[CV 1/5] END ccp_alpha=0.005, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=120;, score=0.940 total time=   1.0s
[CV 2/5] END ccp_alpha=0.005, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=120;, score=0.931 total time=   1.0s
[CV 3/5] END ccp_alpha=0.005, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=120;, score=0.949 total time=   1.0s
[CV 4/5] END ccp_alpha=0.005, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=120;, score=0.939 total time=   0.9s
[CV 5/5] END ccp_alpha=0.005, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=120;, score=0.944 total time=   0.9s
[CV 1/5] END ccp_alpha=0.005, max_depth=22, max_features=log2, min_samples_l

[CV 4/5] END ccp_alpha=0.01, max_depth=22, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.932 total time=   0.8s
[CV 5/5] END ccp_alpha=0.01, max_depth=22, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.926 total time=   0.8s
[CV 1/5] END ccp_alpha=0.01, max_depth=22, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=110;, score=0.926 total time=   0.9s
[CV 2/5] END ccp_alpha=0.01, max_depth=22, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=110;, score=0.925 total time=   0.9s
[CV 3/5] END ccp_alpha=0.01, max_depth=22, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=110;, score=0.931 total time=   0.9s
[CV 4/5] END ccp_alpha=0.01, max_depth=22, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=110;, score=0.931 total time=   0.9s
[CV 5/5] END ccp_alpha=0.01, max_depth=22, max_features=log2, min_samples_leaf=1, 

GridSearchCV(estimator=RandomForestClassifier(random_state=47),
             param_grid={'ccp_alpha': [0.0, 0.005, 0.01],
                         'max_depth': [20, 22, 24], 'max_features': ['log2'],
                         'min_samples_leaf': [1], 'min_samples_split': [2],
                         'n_estimators': [100, 110, 120]},
             scoring='accuracy', verbose=3)

In [47]:
# Parameter combination from the grid that achieved the best
# cross-validated score (accuracy, per the `scoring` passed to the search).
model.best_params_

{'ccp_alpha': 0.0,
 'max_depth': 22,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 120}

In [48]:
# The forest built with the winning parameters. With GridSearchCV's default
# refit=True this estimator is refit on the full training data after the search.
model.best_estimator_

RandomForestClassifier(max_depth=22, max_features='log2', n_estimators=120,
                       random_state=47)

In [49]:
# Predict on the held-out test set with the fitted search object.
yp = model.predict(X_test)

# Report each metric as "<label> <value>", one per line.
for metric_name, metric_fn in (
    ('acc', accuracy_score),
    ('recall', recall_score),
    ('precision', precision_score),
    ('F1', f1_score),
):
    print(metric_name, metric_fn(y_test, yp))

# Confusion matrix as a frame: rows are the true classes, columns the predicted ones.
pd.DataFrame(confusion_matrix(y_test, yp))

acc 0.9676136363636364
recall 0.9645103324348607
precision 0.9712734675412803
F1 0.9678800856531049


Unnamed: 0,0,1
0,4221,127
1,158,4294
