In [28]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report

In [29]:
def load_data():
    """Load the comma-separated integer rows of ``PhishingData.arff``.

    Each data line is split on commas and every field is converted with
    ``int``. Blank lines and ARFF header/comment lines (those whose first
    non-blank character is ``@`` or ``%``) are skipped, so the file can be
    read with or without its ARFF header.

    Returns:
        numpy.ndarray: one row per data line (2-D when every row has the same
        number of fields; an empty 1-D array when there are no data rows).

    Raises:
        IOError / OSError: if the file cannot be opened.
        ValueError: if a data row contains a field that is not an integer.
    """
    data = []
    # The ``with`` statement closes the file on exit, so no explicit close().
    with open('PhishingData.arff', 'r') as f:
        for line in f:
            line = line.strip()
            # Skip empty lines, ARFF declarations (@relation/@attribute/@data)
            # and ARFF comments (%), none of which are integer rows.
            if not line or line[0] in '@%':
                continue
            data.append([int(x) for x in line.split(',')])
    return np.array(data)

In [30]:
def svm(x_train, x_test, y_train, y_test):
    """Fit a one-vs-rest ``SVC`` and print its accuracy and per-class report.

    Args:
        x_train: Feature matrix used for fitting.
        x_test: Feature matrix used for evaluation.
        y_train: Labels for ``x_train``; assumed to be encoded as -1, 0, 1 to
            match the hard-coded class names below.
        y_test: Labels for ``x_test``, same encoding.

    Prints the training score, the test score and a classification report for
    the test split. Returns None.
    """
    model = OneVsRestClassifier(SVC(random_state=0))
    model.fit(x_train, y_train)
    print('Training Accuracy', model.score(x_train, y_train))
    print('Test Accuracy', model.score(x_test, y_test))
    # Pin the label order explicitly so it always lines up with target_names,
    # even if one class is absent from both y_test and the predictions.
    class_labels = [-1, 0, 1]
    target_names = ['class -1', 'class 0', 'class 1']
    print(classification_report(y_test, model.predict(x_test),
                                labels=class_labels, target_names=target_names))

In [31]:
def random_forest(x_train, x_test, y_train, y_test):
    """Fit a one-vs-rest random forest and print accuracy plus class report.

    Args:
        x_train: Feature matrix used for fitting.
        x_test: Feature matrix used for evaluation.
        y_train: Labels for ``x_train``; assumed to be encoded as -1, 0, 1 to
            match the hard-coded class names below.
        y_test: Labels for ``x_test``, same encoding.

    Prints the training score, the test score and a classification report for
    the test split. Returns None.
    """
    # Seeded like the SVC / LinearSVC / gradient-boosting evaluators so that
    # repeated runs print the same scores.
    forest = RandomForestClassifier(n_estimators=100, random_state=0)
    model = OneVsRestClassifier(forest)
    model.fit(x_train, y_train)
    print('Training Accuracy', model.score(x_train, y_train))
    print('Test Accuracy', model.score(x_test, y_test))
    # Pin the label order explicitly so it always lines up with target_names,
    # even if one class is absent from both y_test and the predictions.
    class_labels = [-1, 0, 1]
    target_names = ['class -1', 'class 0', 'class 1']
    print(classification_report(y_test, model.predict(x_test),
                                labels=class_labels, target_names=target_names))

In [32]:
def linearsvm(x_train, x_test, y_train, y_test):
    """Fit a one-vs-rest ``LinearSVC`` and print accuracy plus class report.

    Args:
        x_train: Feature matrix used for fitting.
        x_test: Feature matrix used for evaluation.
        y_train: Labels for ``x_train``; assumed to be encoded as -1, 0, 1 to
            match the hard-coded class names below.
        y_test: Labels for ``x_test``, same encoding.

    Prints the training score, the test score and a classification report for
    the test split. Returns None.
    """
    model = OneVsRestClassifier(LinearSVC(random_state=0))
    model.fit(x_train, y_train)
    print('Training Accuracy', model.score(x_train, y_train))
    print('Test Accuracy', model.score(x_test, y_test))
    # Pin the label order explicitly so it always lines up with target_names,
    # even if one class is absent from both y_test and the predictions.
    class_labels = [-1, 0, 1]
    target_names = ['class -1', 'class 0', 'class 1']
    print(classification_report(y_test, model.predict(x_test),
                                labels=class_labels, target_names=target_names))

In [33]:
def naive_bayes(x_train, x_test, y_train, y_test):
    """Fit a one-vs-rest ``GaussianNB`` and print accuracy plus class report.

    Args:
        x_train: Feature matrix used for fitting.
        x_test: Feature matrix used for evaluation.
        y_train: Labels for ``x_train``; assumed to be encoded as -1, 0, 1 to
            match the hard-coded class names below.
        y_test: Labels for ``x_test``, same encoding.

    Prints the training score, the test score and a classification report for
    the test split. Returns None.
    """
    model = OneVsRestClassifier(GaussianNB())
    model.fit(x_train, y_train)
    print('Training Accuracy', model.score(x_train, y_train))
    print('Test Accuracy', model.score(x_test, y_test))
    # Pin the label order explicitly so it always lines up with target_names,
    # even if one class is absent from both y_test and the predictions.
    class_labels = [-1, 0, 1]
    target_names = ['class -1', 'class 0', 'class 1']
    print(classification_report(y_test, model.predict(x_test),
                                labels=class_labels, target_names=target_names))

In [34]:
def gradboost(x_train, x_test, y_train, y_test):
    """Fit a one-vs-rest gradient-boosting model and print its metrics.

    The base estimator uses 100 depth-1 trees with a learning rate of 1.0 and
    a fixed ``random_state``.

    Args:
        x_train: Feature matrix used for fitting.
        x_test: Feature matrix used for evaluation.
        y_train: Labels for ``x_train``; assumed to be encoded as -1, 0, 1 to
            match the hard-coded class names below.
        y_test: Labels for ``x_test``, same encoding.

    Prints the training score, the test score and a classification report for
    the test split. Returns None.
    """
    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                     max_depth=1, random_state=0)
    model = OneVsRestClassifier(clf)
    model.fit(x_train, y_train)
    print('Training Accuracy', model.score(x_train, y_train))
    print('Test Accuracy', model.score(x_test, y_test))
    # Pin the label order explicitly so it always lines up with target_names,
    # even if one class is absent from both y_test and the predictions.
    class_labels = [-1, 0, 1]
    target_names = ['class -1', 'class 0', 'class 1']
    print(classification_report(y_test, model.predict(x_test),
                                labels=class_labels, target_names=target_names))

In [35]:
def kneighbors(x_train, x_test, y_train, y_test):
    """Fit a one-vs-rest 5-nearest-neighbours model and print its metrics.

    Args:
        x_train: Feature matrix used for fitting.
        x_test: Feature matrix used for evaluation.
        y_train: Labels for ``x_train``; assumed to be encoded as -1, 0, 1 to
            match the hard-coded class names below.
        y_test: Labels for ``x_test``, same encoding.

    Prints the training score, the test score and a classification report for
    the test split. Returns None.
    """
    neigh = KNeighborsClassifier(n_neighbors=5)
    model = OneVsRestClassifier(neigh)
    model.fit(x_train, y_train)
    print('Training Accuracy', model.score(x_train, y_train))
    print('Test Accuracy', model.score(x_test, y_test))
    # Pin the label order explicitly so it always lines up with target_names,
    # even if one class is absent from both y_test and the predictions.
    class_labels = [-1, 0, 1]
    target_names = ['class -1', 'class 0', 'class 1']
    print(classification_report(y_test, model.predict(x_test),
                                labels=class_labels, target_names=target_names))

In [36]:
def SGD(x_train, x_test, y_train, y_test):
    """Fit a one-vs-rest hinge-loss ``SGDClassifier`` and print its metrics.

    The base estimator uses an L2 penalty and ``max_iter=50``.

    Args:
        x_train: Feature matrix used for fitting.
        x_test: Feature matrix used for evaluation.
        y_train: Labels for ``x_train``; assumed to be encoded as -1, 0, 1 to
            match the hard-coded class names below.
        y_test: Labels for ``x_test``, same encoding.

    Prints the training score, the test score and a classification report for
    the test split. Returns None.
    """
    # Seeded like the SVC / LinearSVC / gradient-boosting evaluators so that
    # repeated runs print the same scores.
    clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=50, random_state=0)
    model = OneVsRestClassifier(clf)
    model.fit(x_train, y_train)
    print('Training Accuracy', model.score(x_train, y_train))
    print('Test Accuracy', model.score(x_test, y_test))
    # Pin the label order explicitly so it always lines up with target_names,
    # even if one class is absent from both y_test and the predictions.
    class_labels = [-1, 0, 1]
    target_names = ['class -1', 'class 0', 'class 1']
    print(classification_report(y_test, model.predict(x_test),
                                labels=class_labels, target_names=target_names))

In [37]:
def DT(x_train, x_test, y_train, y_test):
    """Fit a one-vs-rest decision tree and print accuracy plus class report.

    Args:
        x_train: Feature matrix used for fitting.
        x_test: Feature matrix used for evaluation.
        y_train: Labels for ``x_train``; assumed to be encoded as -1, 0, 1 to
            match the hard-coded class names below.
        y_test: Labels for ``x_test``, same encoding.

    Prints the training score, the test score and a classification report for
    the test split. Returns None.
    """
    # Seeded like the SVC / LinearSVC / gradient-boosting evaluators so that
    # repeated runs print the same scores.
    clf = tree.DecisionTreeClassifier(random_state=0)
    model = OneVsRestClassifier(clf)
    model.fit(x_train, y_train)
    print('Training Accuracy', model.score(x_train, y_train))
    print('Test Accuracy', model.score(x_test, y_test))
    # Pin the label order explicitly so it always lines up with target_names,
    # even if one class is absent from both y_test and the predictions.
    class_labels = [-1, 0, 1]
    target_names = ['class -1', 'class 0', 'class 1']
    print(classification_report(y_test, model.predict(x_test),
                                labels=class_labels, target_names=target_names))

In [38]:
def MLP(x_train, x_test, y_train, y_test):
    """Fit a one-vs-rest multi-layer perceptron and print its metrics.

    The base estimator uses the ``lbfgs`` solver, ``alpha=0.1`` and two hidden
    layers of 7 and 3 units.

    Args:
        x_train: Feature matrix used for fitting.
        x_test: Feature matrix used for evaluation.
        y_train: Labels for ``x_train``; assumed to be encoded as -1, 0, 1 to
            match the hard-coded class names below.
        y_test: Labels for ``x_test``, same encoding.

    Prints the training score, the test score and a classification report for
    the test split. Returns None.
    """
    # Seeded like the SVC / LinearSVC / gradient-boosting evaluators so that
    # repeated runs print the same scores.
    clf = MLPClassifier(solver='lbfgs', alpha=1e-1, hidden_layer_sizes=(7, 3),
                        random_state=0)
    model = OneVsRestClassifier(clf)
    model.fit(x_train, y_train)
    print('Training Accuracy', model.score(x_train, y_train))
    print('Test Accuracy', model.score(x_test, y_test))
    # Pin the label order explicitly so it always lines up with target_names,
    # even if one class is absent from both y_test and the predictions.
    class_labels = [-1, 0, 1]
    target_names = ['class -1', 'class 0', 'class 1']
    print(classification_report(y_test, model.predict(x_test),
                                labels=class_labels, target_names=target_names))

In [39]:
if __name__ == "__main__":
    # Load the table; columns 0-8 are used as features and column 9 as label.
    data = load_data()
    train_data, train_labels = data[:, :9], data[:, 9]

    # 80/20 hold-out split with a pinned random_state.
    X_train, X_test, y_train, y_test = train_test_split(
        train_data, train_labels, test_size=0.20, random_state=42)

    # Run every evaluator on the same split, in a fixed order; each one prints
    # its own scores and classification report.
    # SGD(X_train, X_test, y_train, y_test) was commented out in the original
    # run and stays disabled here.
    for _evaluate in (random_forest, linearsvm, svm, naive_bayes, gradboost,
                      kneighbors, DT, MLP):
        _evaluate(X_train, X_test, y_train, y_test)

('Training Accuracy', 0.96765249537892795)
('Test Accuracy', 0.87084870848708484)
             precision    recall  f1-score   support

   class -1       0.89      0.90      0.89       132
    class 0       0.85      0.68      0.76        25
    class 1       0.85      0.88      0.87       114

avg / total       0.87      0.87      0.87       271

('Training Accuracy', 0.83733826247689469)
('Test Accuracy', 0.81918819188191883)
             precision    recall  f1-score   support

   class -1       0.83      0.92      0.87       132
    class 0       0.00      0.00      0.00        25
    class 1       0.81      0.89      0.85       114

avg / total       0.75      0.82      0.78       271

('Training Accuracy', 0.88077634011090578)
('Test Accuracy', 0.83763837638376382)
             precision    recall  f1-score   support

   class -1       0.88      0.91      0.90       132
    class 0       0.60      0.12      0.20        25
    class 1       0.80      0.91      0.85       114

avg 