# Importing the Libraries

In [2]:
#import nltk
#from nltk.corpus import stopwords
#set(stopwords.words('english'))

In [3]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Reading the Dataset 

In [4]:
dataset = pd.read_csv('train-set.csv', header=0, \
                    delimiter="\t")

### Checking Dataset Shape

In [5]:
dataset.shape

(64, 1)

### Checking the Column Names

In [6]:
dataset.columns.values

array(['id;description;severity;type;family;risk factor;score'],
      dtype=object)

### Converting the Dataset into an Array

In [7]:
dataset_array = np.array(dataset)

### Accessing the Third Row 

In [8]:
dataset_array[2][0]

'3;The version of MySQL running on the remote host is 5.6.x prior to 5.6.39. It is, therefore, affected by multiple vulnerabilities as noted in the January 2018 Critical Patch Update advisory. Please consult the CVRF details for the applicable CVEs for additional information.;medium ;local ;databases;medium ;4.3'

# Extracting the description from the Dataset 

In [9]:
description = []
for i in range(dataset.shape[0]):
    temporary_variable = dataset_array[i][0].split(';')
    temporary_variable=np.array(temporary_variable)
    description.append(temporary_variable[1])    

In [10]:
description = np.array(description)

In [11]:
description.shape

(64,)

### Removing unecessary Details i.e. Stop Words, Non Letters, HTML

In [12]:
from bs4 import BeautifulSoup  
import re
from nltk.corpus import stopwords
def review_to_words( raw_review ):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words )) 

In [13]:
# Get the number of reviews based on the dataframe column size

# Initialize an empty list to hold the clean reviews
clean_description = []

# Loop over each review; create an index i that goes from 0 to the length
# of the movie review list 
for i in range(dataset.shape[0]):
    # Call our function for each one, and add the result to the list of
    # clean reviews
    clean_description.append( review_to_words( description[i] ) )

In [14]:
clean_description_array = np.array(clean_description)

In [15]:
clean_description_array.shape

(64,)

In [16]:
clean_description_array[6]

'plugin runs hydra find http proxy accounts passwords brute force use plugin enter logins file passwords file hydra nasl wrappers options advanced settings block'

# Creating the Bag of Words 

In [17]:
print ("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
description_features = vectorizer.fit_transform(clean_description_array)

# Numpy arrays are easy to work with, so convert the result to an 
# array
description_features = description_features.toarray()

Creating the bag of words...



In [18]:
description_features.shape

(64, 618)

## Creating the Result array as per SEVERITY

In [19]:
severity = []
for i in range(dataset.shape[0]):
    temporary_variable = dataset_array[i][0].split(';')
    temporary_variable=np.array(temporary_variable)
    severity.append(temporary_variable[2])  

In [20]:
severity = np.array(severity)

In [21]:
severity.shape

(64,)

In [22]:
severity[6]

'high'

### Testing with Random Forest (3 Class Classification as per Severity)

In [23]:
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
n_folds = 5
score = 0.0
skf = StratifiedKFold(severity, n_folds)
avg_score = 0

for train_index, test_index in skf:
    X_train, X_test = description_features[train_index], description_features[test_index]
    y_train, y_test = severity[train_index], severity[test_index]
    forest = RandomForestClassifier(n_estimators = 50)
    forest.fit( X_train, y_train )
    score = forest.score(X_test,y_test)
    avg_score += score 
    print(score)
    
print("avg",avg_score/n_folds)



0.25
0.38461538461538464
0.6666666666666666
0.3333333333333333
0.2727272727272727
avg 0.38146853146853144


# Creating the Result array as per SCORE

In [24]:
score_result = []
for i in range(dataset.shape[0]):
    temporary_variable = dataset_array[i][0].split(';')
    temporary_variable=np.array(temporary_variable)
    score_result.append(temporary_variable[6]) 

In [25]:
score_result[12]='7.5'
score_result[54]='7.5'
score_result = np.array(score_result).astype(np.float)

In [26]:
score_result.shape

(64,)

In [27]:
for i in range(dataset.shape[0]):
    print(score_result[i])

7.5
3.5
4.3
10.0
7.5
7.5
7.5
10.0
10.0
6.5
3.3
4.0
7.5
7.8
10.0
10.0
10.0
10.0
10.0
9.3
9.4
7.5
9.7
9.3
6.4
5.6
5.0
5.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
7.5
7.6
7.5
7.5
7.5
9.3
9.0
7.5
7.5
7.5
9.0
4.3
6.8
4.9
4.1
5.0
4.6
7.5
5.8
4.8
2.1
3.7
3.5
3.5
2.1
2.6
2.1


# Testing with Gradient Boosting Regression 

In [28]:
from sklearn import ensemble
from sklearn.metrics import mean_squared_error

n_folds = 5
skf = StratifiedKFold(score_result, n_folds)



for train_index, test_index in skf:
    X_train, X_test = description_features[train_index], description_features[test_index]
    y_train, y_test = score_result[train_index], score_result[test_index]
    params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}
    clf = ensemble.GradientBoostingRegressor(**params)
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    print("MSE: %.4f" % mse)
    


MSE: 8.1312
MSE: 2.9190
MSE: 2.8643
MSE: 9.3840
MSE: 3.1921
