In [1]:
import pickle

import os
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline

In [2]:
javascript_path = "JavascriptSamples/"
python_path = "PythonSamples/"
powershell_path = "PowerShellSamples/"
csv_path = "csv_dataset/"
xml_path = "xml_dataset/"

In [5]:
corpus = []
labels = []
file_types_and_labels = [(csv_path, 2),(javascript_path,1)]

In [6]:
for files_path, label in file_types_and_labels:
    files = os.listdir(files_path)
    for file in files:
        file_path = files_path + "/" + file
        try:
            with open(file_path, "r") as myfile:
                data = myfile.read().replace("\n", "")
        except:
            pass
        data = str(data)
        corpus.append(data)
        labels.append(label)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    corpus, labels, test_size=0.33, random_state=11
)

In [8]:
text_clf = Pipeline(
    [
        ("vect", HashingVectorizer(input="content", ngram_range=(1, 3))),
        ("tfidf", TfidfTransformer(use_idf=True,)),
        ("rf", RandomForestClassifier(class_weight="balanced")),
    ]
)

In [9]:
text_clf.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('vect', HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 3), non_negative=False,
         norm='l2', pr...tors=10, n_jobs=None, oob_score=False,
            random_state=None, verbose=0, warm_start=False))])

In [10]:
y_test_pred = text_clf.predict(X_test)

In [11]:
print(accuracy_score(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))

0.991980753809142
[[1227    0]
 [  10   10]]


In [15]:
ans = text_clf.predict('abcd')

ValueError: Iterable over raw text documents expected, string object received.

In [14]:
pickle.dump(text_clf, open('model.pkl', 'wb'))

In [16]:
X_test[0]

"'use strict';// This tests that closing a watcher when the underlying handle is// already destroyed will result in a noop instead of a crash.const common = require('../common');const tmpdir = require('../common/tmpdir');const fs = require('fs');const path = require('path');tmpdir.refresh();const root = path.join(tmpdir.path, 'watched-directory');fs.mkdirSync(root);const watcher = fs.watch(root, { persistent: false, recursive: false });// The following listeners may or may not be invoked.watcher.addListener('error', () => {  setTimeout(    () => { watcher.close(); },  // Should not crash if it's invoked    common.platformTimeout(10)  );});watcher.addListener('change', () => {  setTimeout(    () => { watcher.close(); },    common.platformTimeout(10)  );});fs.rmdirSync(root);// Wait for the listener to hitsetTimeout(  common.mustCall(() => {}),  common.platformTimeout(100));"

In [17]:
ans = text_clf.predict("abcd")

ValueError: Iterable over raw text documents expected, string object received.

In [18]:
ans = text_clf.predict("'use strict';// This tests that closing a watcher when the underlying handle is// already destroyed will result in a noop instead of a crash.const common = require('../common');const tmpdir = require('../common/tmpdir');const fs = require('fs');const path = require('path');tmpdir.refresh();const root = path.join(tmpdir.path, 'watched-directory');fs.mkdirSync(root);const watcher = fs.watch(root, { persistent: false, recursive: false });// The following listeners may or may not be invoked.watcher.addListener('error', () => {  setTimeout(    () => { watcher.close(); },  // Should not crash if it's invoked    common.platformTimeout(10)  );});watcher.addListener('change', () => {  setTimeout(    () => { watcher.close(); },    common.platformTimeout(10)  );});fs.rmdirSync(root);// Wait for the listener to hitsetTimeout(  common.mustCall(() => {}),  common.platformTimeout(100));")

ValueError: Iterable over raw text documents expected, string object received.

In [19]:
input = "'use strict';// This tests that closing a watcher when the underlying handle is// already destroyed will result in a noop instead of a crash.const common = require('../common');const tmpdir = require('../common/tmpdir');const fs = require('fs');const path = require('path');tmpdir.refresh();const root = path.join(tmpdir.path, 'watched-directory');fs.mkdirSync(root);const watcher = fs.watch(root, { persistent: false, recursive: false });// The following listeners may or may not be invoked.watcher.addListener('error', () => {  setTimeout(    () => { watcher.close(); },  // Should not crash if it's invoked    common.platformTimeout(10)  );});watcher.addListener('change', () => {  setTimeout(    () => { watcher.close(); },    common.platformTimeout(10)  );});fs.rmdirSync(root);// Wait for the listener to hitsetTimeout(  common.mustCall(() => {}),  common.platformTimeout(100));"


In [20]:
input = [input]

In [21]:
ans = text_clf.predict(input)

In [22]:
print(ans)

[1]
