In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

## Read the training and test data sets, previously split and stored as separate csv files.
## The last column, labelled '86', is the 0/1 label indicating benign/malware for each sample row.

## Load the pre-split train/test sets. In each frame every column except the
## last is treated as a feature and the last column as the label.
df_train = pd.read_csv("data/train_data.csv")
df_test = pd.read_csv("data/test_data.csv")

## Both frames are sliced by position using the same index, so they must have
## the same width; otherwise the test features/labels would silently be taken
## from the wrong columns.
if df_test.shape[1] != df_train.shape[1]:
    raise ValueError(
        "train/test column count mismatch: "
        f"{df_train.shape[1]} (train) vs {df_test.shape[1]} (test)"
    )

last_column = df_train.shape[1] - 1 ## index of the label column
X_train = df_train.iloc[:, :last_column]
y_train = df_train.iloc[:, last_column]

X_test = df_test.iloc[:, :last_column]
y_test = df_test.iloc[:, last_column]

In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [8]:
## Exhaustive grid search over the SVC regularisation strength C and the kernel
## type (4 x 4 = 16 candidates), ranking candidates by cross-validated accuracy.
## verbose=3 prints the score and fit time of every individual fold.
parameters = {
    'C': [1.0, 2.0, 5.0, 10.0],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}
estimator = SVC()
model = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring='accuracy',
    verbose=3,
)
## Kept as the last expression so the fitted search object is displayed.
model.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END ..............C=1.0, kernel=linear;, score=0.956 total time=   3.6s
[CV 2/5] END ..............C=1.0, kernel=linear;, score=0.958 total time=   3.2s
[CV 3/5] END ..............C=1.0, kernel=linear;, score=0.963 total time=   3.3s
[CV 4/5] END ..............C=1.0, kernel=linear;, score=0.961 total time=   3.4s
[CV 5/5] END ..............C=1.0, kernel=linear;, score=0.958 total time=   3.3s
[CV 1/5] END ................C=1.0, kernel=poly;, score=0.960 total time=   2.7s
[CV 2/5] END ................C=1.0, kernel=poly;, score=0.963 total time=   2.7s
[CV 3/5] END ................C=1.0, kernel=poly;, score=0.964 total time=   2.7s
[CV 4/5] END ................C=1.0, kernel=poly;, score=0.962 total time=   2.7s
[CV 5/5] END ................C=1.0, kernel=poly;, score=0.964 total time=   2.8s
[CV 1/5] END .................C=1.0, kernel=rbf;, score=0.964 total time=   3.3s
[CV 2/5] END .................C=1.0, kernel=rbf;

GridSearchCV(estimator=SVC(),
             param_grid={'C': [1.0, 2.0, 5.0, 10.0],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             scoring='accuracy', verbose=3)

In [9]:
## Show the C/kernel combination that the grid search ranked best on its
## cross-validated accuracy score.
model.best_params_

{'C': 5.0, 'kernel': 'rbf'}

In [10]:
## Show the SVC built from the best parameters. Its repr lists only arguments
## that differ from the SVC defaults, so a default kernel will not appear in it.
model.best_estimator_

SVC(C=5.0)

In [11]:
## Score the selected model on the held-out test set.
yp = model.predict(X_test)

## Print each summary metric as "<name> <value>", in a fixed order.
for metric_name, metric_fn in (
    ('acc', accuracy_score),
    ('recall', recall_score),
    ('precision', precision_score),
    ('F1', f1_score),
):
    print(metric_name, metric_fn(y_test, yp))

## Confusion matrix (rows: true class, columns: predicted class), wrapped in a
## DataFrame so the notebook renders it as a table.
pd.DataFrame(confusion_matrix(y_test, yp))

acc 0.96625
recall 0.9597933513027853
precision 0.9731268503757686
F1 0.9664141128576275


Unnamed: 0,0,1
0,4230,118
1,179,4273
