In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

## Read the training and test data sets previously split and stored as separage csv files
## The last column labelled '86' is the 0/1 label indicationg benign/malware for each sample row.

df_train = pd.read_csv("data/train_data.csv")
df_test = pd.read_csv("data/test_data.csv")

last_column = df_train.shape[1] - 1 ## index of the label column
X_train = df_train.iloc[:,:last_column]
y_train = df_train.iloc[:,last_column]

X_test = df_test.iloc[:,:last_column]
y_test = df_test.iloc[:,last_column]

In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

In [3]:
estimator = GradientBoostingClassifier()
parameters = {'learning_rate': [0.05, 0.075, 0.10, 0.15], 'n_estimators': [100, 150, 200]}
model = GridSearchCV(estimator, parameters ,scoring='accuracy',verbose=3)
model.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END learning_rate=0.05, n_estimators=100;, score=0.957 total time=   2.0s
[CV 2/5] END learning_rate=0.05, n_estimators=100;, score=0.958 total time=   2.0s
[CV 3/5] END learning_rate=0.05, n_estimators=100;, score=0.960 total time=   2.0s
[CV 4/5] END learning_rate=0.05, n_estimators=100;, score=0.962 total time=   2.0s
[CV 5/5] END learning_rate=0.05, n_estimators=100;, score=0.956 total time=   2.0s
[CV 1/5] END learning_rate=0.05, n_estimators=150;, score=0.959 total time=   3.0s
[CV 2/5] END learning_rate=0.05, n_estimators=150;, score=0.960 total time=   3.1s
[CV 3/5] END learning_rate=0.05, n_estimators=150;, score=0.963 total time=   3.0s
[CV 4/5] END learning_rate=0.05, n_estimators=150;, score=0.963 total time=   3.0s
[CV 5/5] END learning_rate=0.05, n_estimators=150;, score=0.959 total time=   3.0s
[CV 1/5] END learning_rate=0.05, n_estimators=200;, score=0.960 total time=   4.0s
[CV 2/5] END learning_rate

GridSearchCV(estimator=GradientBoostingClassifier(),
             param_grid={'learning_rate': [0.05, 0.075, 0.1, 0.15],
                         'n_estimators': [100, 150, 200]},
             scoring='accuracy', verbose=3)

In [5]:
model.best_params_

{'learning_rate': 0.15, 'n_estimators': 200}

In [6]:
model.best_estimator_

GradientBoostingClassifier(learning_rate=0.15, n_estimators=200)

In [7]:
yp = model.predict(X_test)
print('acc', accuracy_score(y_test, yp))
print('recall', recall_score(y_test, yp))
print('precision', precision_score(y_test, yp))
print('F1', f1_score(y_test, yp))
pd.DataFrame(confusion_matrix(y_test,yp))

acc 0.9623863636363637
recall 0.963162623539982
precision 0.9625140291806958
F1 0.9628382171325923


Unnamed: 0,0,1
0,4181,167
1,164,4288
