In [1]:
import pandas as pd
import numpy as np
import re
import sklearn

from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse

In [2]:
def clean_string(query):
    """Normalize a raw payload string before tokenization.

    Parentheses are replaced by spaces, every comma is padded with a space
    on each side, runs of whitespace are collapsed to a single space, and
    leading/trailing whitespace is removed.

    Args:
        query: The payload text to normalize (must be a ``str``).

    Returns:
        The normalized string.
    """
    spaced = query.replace('(', ' ').replace(')', ' ').replace(',', ' , ')
    # Raw string: '\s' in a plain literal is an invalid escape sequence and
    # triggers a DeprecationWarning/SyntaxWarning on recent Python versions.
    return re.sub(r'\s+', ' ', spaced).strip()

In [3]:
# Load the dataset and normalize every payload with clean_string in one chain
df = (
    pd.read_csv('../data/all_data.csv')
    .assign(payload=lambda frame: frame['payload'].apply(clean_string))
)

In [4]:
# Bag of word-level trigrams: each feature is a sequence of three consecutive tokens
count_vect = CountVectorizer(analyzer='word', ngram_range=(3,3))
raw_counts = count_vect.fit_transform(df['payload'])

# Display features
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# .tolist() keeps `features` a plain list, as the old API returned.
if hasattr(count_vect, 'get_feature_names_out'):
    features = count_vect.get_feature_names_out().tolist()
else:
    features = count_vect.get_feature_names()
print('Number of features: ' + str(len(features)))

Number of features: 2551635


In [5]:
# Targets for the classifier, one per row of the dataframe
all_labels = df['label'].values

# Double brackets keep 'length' as a 2-D column so it can be stacked
# horizontally; it becomes the last column, after all n-gram counts
num_feats = df[['length']].values
all_data = sparse.hstack([raw_counts, num_feats])

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable
train_data, test_data, train_labels, test_labels = train_test_split(
    all_data,
    all_labels,
    test_size=0.3,
    random_state=0,
)

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Seeded forest with otherwise default settings, fitted on the training split
classifier = RandomForestClassifier(random_state=0)
classifier.fit(train_data, train_labels)
classifier_type = 'Random Forest'



In [11]:
# Predict a label for every row of the held-out test split
predicted_labels = classifier.predict(test_data)

# Score the predictions against the true test labels
from sklearn import metrics
precision = metrics.precision_score(test_labels, predicted_labels)
recall = metrics.recall_score(test_labels, predicted_labels)
f1measure = metrics.f1_score(test_labels, predicted_labels)
accuracy = metrics.accuracy_score(test_labels, predicted_labels)

# Report the four scores; the labels are pre-padded so the '=' signs line up
for label, value in (
    (' precision = ', precision),
    ('    recall = ', recall),
    ('F1-measure = ', f1measure),
    ('  accuracy = ', accuracy),
):
    print(label + str(value))
print('\n')

 precision = 0.9279373368146214
    recall = 0.8114155251141553
F1-measure = 0.8657734470158345
  accuracy = 0.9086437493091633




In [12]:
# Importances from the fitted forest; columns are the n-gram counts with 'length' appended last
classifier.feature_importances_

array([0.       , 0.       , 0.       , ..., 0.       , 0.       ,
       0.0591272])

In [25]:
# Column indices ordered from most to least important
np.flip(np.argsort(classifier.feature_importances_))

array([2551635, 2540402, 2172133, ..., 1686811, 1686810,       0])

In [27]:
# Importance values in descending order, matching the ranking of indices
np.flip(np.sort(classifier.feature_importances_))

array([0.0591272 , 0.05100889, 0.0179549 , ..., 0.        , 0.        ,
       0.        ])

In [32]:
# 2540402 is the second entry of the importance ranking shown above, i.e. the
# highest-ranked n-gram (the first entry, 2551635, is the appended 'length' column)
features[2540402]

'union all select'

In [35]:
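# Index 2551635 equals len(features), so features[2551635] is out of range: that column is the 'length' value appended after the n-gram counts, not an n-gram.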