In [20]:
from numpy import genfromtxt
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score, precision_score, recall_score)
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.naive_bayes import GaussianNB


In [14]:
# Helper that prints every imported sklearn metric used to evaluate a fitted classifier
def eval_metrics(y_test, y_pred):
    """Print classification metrics for one set of predictions.

    Writes accuracy, then weighted-average F1, recall and precision (three
    decimals each), followed by the raw confusion matrix and a pandas
    cross-tabulation of true vs. predicted labels with row/column totals.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels, aligned element-wise with ``y_test``.

    Returns
    -------
    None
        All results are written to stdout.
    """
    # Each score is computed immediately before it is printed
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy is %.3f' % accuracy)

    weighted_f1 = f1_score(y_test, y_pred, average='weighted')
    print('F1-measure is: %.3f' % weighted_f1)

    weighted_recall = recall_score(y_test, y_pred, average='weighted')
    print('Recall is %.3f' % weighted_recall)

    weighted_precision = precision_score(y_test, y_pred, average='weighted')
    print('Precision is %.3f' % weighted_precision)

    # Raw count matrix first, then the same counts with labelled axes and margins
    print(confusion_matrix(y_test, y_pred))
    labelled_counts = pd.crosstab(
        y_test,
        y_pred,
        rownames=['True'],
        colnames=['Predicted'],
        margins=True,
    )
    print(labelled_counts)
     

In [3]:
# Load the iris CSV file into two numpy arrays.
# NOTE(review): this is an absolute, machine-specific path; point IRIS_CSV at
# your own copy of iris.csv before running on another machine.
IRIS_CSV = '/users/aahiljivani/documents/py_projects/ml_for_cyber_sec/datasets/iris.csv'

# First four columns are parsed as floats; the header row is skipped
features = genfromtxt(IRIS_CSV, delimiter=',', usecols=(0, 1, 2, 3), dtype=float, skip_header=1)
# Last column is parsed as strings (the class value of each row)
class_value = genfromtxt(IRIS_CSV, delimiter=',', usecols=-1, dtype=str, skip_header=1)





In [4]:
# Encode the string class values as integer labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(class_value)
# Standardize the quantitative features: each column is rescaled to zero mean
# and unit variance (StandardScaler does not map values into the 0-1 range)
feature_scaler = StandardScaler()
features_norm = feature_scaler.fit_transform(features)


In [5]:
# Split the raw features for training and testing (70% / 30%, fixed seed) BEFORE
# scaling, so the held-out rows cannot influence the preprocessing statistics
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    features, labels, test_size=0.30, random_state=0
)
# Fit the scaler on the training rows only and apply that same transform to both
# subsets; fitting on the full dataset would leak test-set mean/variance into training
split_scaler = StandardScaler().fit(x_train_raw)
x_train = split_scaler.transform(x_train_raw)
x_test = split_scaler.transform(x_test_raw)

In [16]:
# Fit a Support Vector Machine classifier (default settings) on the training split
svm1 = svm.SVC().fit(x_train, y_train)
# Predict class labels for the test split
y_pred = svm1.predict(x_test)

In [15]:
# Evaluate metrics for the Support Vector Machine
eval_metrics(y_test, y_pred)


Accuracy is 0.978
F1-measure is: 0.978
Recall is 0.978
Precision is 0.980
[[16  0  0]
 [ 0 17  1]
 [ 0  0 11]]
Predicted   0   1   2  All
True                      
0          16   0   0   16
1           0  17   1   18
2           0   0  11   11
All        16  17  12   45


In [17]:
# Fit a LogisticRegression classifier (default settings) on the training split
log_reg = LogisticRegression().fit(x_train, y_train)
# Predict class labels for the test split
log_predicted = log_reg.predict(x_test)


In [19]:
# Report the metrics for the Logistic Regression predictions
eval_metrics(y_test, log_predicted)

Accuracy is 0.978
F1-measure is: 0.978
Recall is 0.978
Precision is 0.980
[[16  0  0]
 [ 0 17  1]
 [ 0  0 11]]
Predicted   0   1   2  All
True                      
0          16   0   0   16
1           0  17   1   18
2           0   0  11   11
All        16  17  12   45


In [21]:
# Train a Gaussian Naive Bayes classifier on the training split
GB = GaussianNB().fit(x_train, y_train)
# Predict class labels for the test split
GB_pred = GB.predict(x_test)

In [22]:
# Evaluate metrics for Gaussian Naive Bayes
eval_metrics(y_test, GB_pred)

Accuracy is 1.000
F1-measure is: 1.000
Recall is 1.000
Precision is 1.000
[[16  0  0]
 [ 0 18  0]
 [ 0  0 11]]
Predicted   0   1   2  All
True                      
0          16   0   0   16
1           0  18   0   18
2           0   0  11   11
All        16  18  11   45
