In [3]:
import arff
import numpy as np
import json
from sklearn.model_selection import train_test_split, KFold

In [2]:
!pip install liac-arff

Collecting liac-arff
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: liac-arff
  Building wheel for liac-arff (setup.py) ... [?25l[?25hdone
  Created wheel for liac-arff: filename=liac_arff-2.5.0-py3-none-any.whl size=11716 sha256=aa7a1fc5bc211d5a742109a5383850768a73db2d63f73b9c29c881dda0e22bd9
  Stored in directory: /root/.cache/pip/wheels/5d/2a/9c/3895d9617f8f49a0883ba686326d598e78a1c2f54fe3cae86d
Successfully built liac-arff
Installing collected packages: liac-arff
Successfully installed liac-arff-2.5.0


In [4]:
# Parse the ARFF file inside a context manager so the file handle is closed
# as soon as parsing finishes (the bare open() call leaked it).
with open('dataset.arff', 'r') as arff_file:
    dataset = arff.load(arff_file)

# Rows of the ARFF 'data' section as a 2-D array.
data = np.array(dataset['data'])

In [5]:
# Summarise the loaded array: every column except one is counted as a feature.
n_rows, n_cols = data.shape
print('The dataset has {0} datapoints with {1} features'.format(n_rows, n_cols - 1))

# Each ARFF attribute entry is indexed at [0] for its name.
attribute_names = [attribute[0] for attribute in dataset['attributes']]
print('Features: {0}'.format(attribute_names))

The dataset has 11055 datapoints with 30 features
Features: ['having_IP_Address', 'URL_Length', 'Shortining_Service', 'having_At_Symbol', 'double_slash_redirecting', 'Prefix_Suffix', 'having_Sub_Domain', 'SSLfinal_State', 'Domain_registeration_length', 'Favicon', 'port', 'HTTPS_token', 'Request_URL', 'URL_of_Anchor', 'Links_in_tags', 'SFH', 'Submitting_to_email', 'Abnormal_URL', 'Redirect', 'on_mouseover', 'RightClick', 'popUpWidnow', 'Iframe', 'age_of_domain', 'DNSRecord', 'web_traffic', 'Page_Rank', 'Google_Index', 'Links_pointing_to_page', 'Statistical_report', 'Result']


In [6]:
# Keep columns 0-7, 9-16 and 22, plus column 30 as the final column.
# NOTE: this rebinds `data`, so the cell must not be re-run without reloading.
data = data[:, [*range(8), *range(9, 17), 22, 30]]

In [7]:
# Every column but the last is a feature; the last column is the target.
X, y = data[:, :-1], data[:, -1]
# reshape() returns a new array rather than modifying in place, so the result
# has to be assigned (the original call silently discarded it).
y = y.reshape(y.shape[0])
print('Before spliting')
print('X:{0}, y:{1}'.format(X.shape, y.shape))

# 70/30 hold-out split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print('After spliting')
print('X_train:{0}, y_train:{1}, X_test:{2}, y_test:{3}'.format(X_train.shape, y_train.shape, X_test.shape, y_test.shape))

Before spliting
X:(11055, 17), y:(11055,)
After spliting
X_train:(7738, 17), y_train:(7738,), X_test:(3317, 17), y_test:(3317,)


In [8]:
# Persist the four split arrays, in the same order as before, as .npy files.
for file_name, array in (
    ('X_train.npy', X_train),
    ('X_test.npy', X_test),
    ('y_train.npy', y_train),
    ('y_test.npy', y_test),
):
    np.save(file_name, array)
print('Saved!')

Saved!


In [14]:
import os

# Convert the held-out split to plain lists so it can be serialised as JSON.
test_data = {
    'X_test': X_test.tolist(),
    'y_test': y_test.tolist(),
}

# Output directory for the exported JSON file.
directory_path = '../../static/'

# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(directory_path, exist_ok=True)

with open(os.path.join(directory_path, 'testdata.json'), 'w') as tdfile:
    json.dump(test_data, tdfile)
    print('Test Data written to testdata.json')


Test Data written to testdata.json


In [10]:
from sklearn.tree import _tree

In [11]:
def tree_to_json(tree):
    """Convert a fitted decision tree into a nested, JSON-serialisable dict.

    Args:
        tree: An estimator exposing a ``tree_`` attribute with the arrays
            ``feature``, ``threshold``, ``children_left``, ``children_right``
            and ``value`` indexed by node id.

    Returns:
        A dict describing the root node. Split nodes have the keys
        ``'type'`` (``'split'``), ``'threshold'`` (the string
        ``"<feature index> <= <threshold>"``), ``'left'`` and ``'right'``
        (child dicts). Leaf nodes have ``'type'`` (``'leaf'``) and
        ``'value'`` (the node's value array converted to nested lists).
    """
    tree_ = tree.tree_

    def recurse(node):
        feature_index = tree_.feature[node]

        # Nodes whose feature is TREE_UNDEFINED do not split: they are leaves.
        if feature_index == _tree.TREE_UNDEFINED:
            return {
                'type': 'leaf',
                'value': tree_.value[node].tolist(),
            }

        # Features are labelled by their column index. The index is used
        # directly instead of being looked up in a fixed range(30), which
        # raised IndexError for any split on a feature index >= 30.
        return {
            'type': 'split',
            'threshold': "{} <= {}".format(int(feature_index), tree_.threshold[node]),
            'left': recurse(tree_.children_left[node]),
            'right': recurse(tree_.children_right[node]),
        }

    # Node 0 is the root of the tree.
    return recurse(0)

In [37]:
def forest_to_json(forest):
    """Build a JSON-serialisable description of a fitted forest.

    Args:
        forest: A fitted ensemble exposing ``estimators_``, ``n_classes_``,
            ``classes_``, ``n_outputs_`` and ``n_estimators``.

    Returns:
        A dict with the forest's metadata and, under ``'estimators'``, one
        ``tree_to_json`` dict per member tree.
    """
    # The feature count is read off the first tree's importance vector.
    first_tree = forest.estimators_[0]
    return {
        'n_features': len(first_tree.feature_importances_),
        'n_classes': forest.n_classes_,
        'classes': forest.classes_.tolist(),
        'n_outputs': forest.n_outputs_,
        'n_estimators': forest.n_estimators,
        'estimators': list(map(tree_to_json, forest.estimators_)),
    }


In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import numpy as np
import json
#import dump

In [22]:
# Read the training arrays back from the .npy files in the working directory.
X_train, y_train = (np.load(npy_path) for npy_path in ('X_train.npy', 'y_train.npy'))
print('X_train:{0}, y_train:{1}'.format(X_train.shape, y_train.shape))

X_train:(7738, 17), y_train:(7738,)


In [23]:
# Seed the forest so bootstrap sampling and feature selection are repeatable;
# without random_state every run produces a different model and score.
clf = RandomForestClassifier(random_state=0)

# Mean accuracy over 10 cross-validation folds of the training split.
cv_scores = cross_val_score(clf, X_train, y_train, cv=10)
print('Cross Validation Score: {0}'.format(np.mean(cv_scores)))

Cross Validation Score: 0.9472721134142956


In [24]:
# Train the forest on the full training split; cross_val_score above only
# fitted clones, so `clf` itself is still unfitted at this point.
clf.fit(X=X_train, y=y_train)

In [25]:
# Read the held-out arrays back from the .npy files in the working directory.
X_test, y_test = (np.load(npy_path) for npy_path in ('X_test.npy', 'y_test.npy'))

In [26]:
# Score the fitted forest on the held-out split.
pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, pred)
print('Accuracy: {}'.format(test_accuracy))

Accuracy: 0.9466385287910762


In [38]:
# Output directory for the exported classifier JSON.
directory_path_classifier = '../../static/'

# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(directory_path_classifier, exist_ok=True)

# Serialise the fitted forest (metadata plus every tree) to classifier.json.
with open(os.path.join(directory_path_classifier, 'classifier.json'), 'w') as clf_file:
    json.dump(forest_to_json(clf), clf_file)
    print('Classifier JSON written to classifier.json')


Classifier JSON written to classifier.json
