In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
dataset = pd.read_json('cve-2016.json')

In [3]:
description = []
severity = []
scores = []

for i in range(dataset.shape[0]):
    new=dataset.CVE_Items[i]
    if('baseMetricV2' in new['impact'].keys()):
        severity.append(new['impact']['baseMetricV2']['severity'])
        scores.append(new['impact']['baseMetricV2']['cvssV2']['baseScore'])
        description.append(new['cve']['description']['description_data'][0]['value'])


In [4]:
description = np.array(description)
severity = np.array(severity)
scores = np.array(scores)

In [15]:
score_result = np.array(scores).astype(np.float)

In [5]:
from bs4 import BeautifulSoup  
import re
from nltk.corpus import stopwords
def review_to_words( raw_review ):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words )) 

In [6]:
# Get the number of reviews based on the dataframe column size

# Initialize an empty list to hold the clean reviews
clean_description = []

# Loop over each review; create an index i that goes from 0 to the length
# of the movie review list 
for i in range(description.shape[0]):
    # Call our function for each one, and add the result to the list of
    # clean reviews
    clean_description.append( review_to_words( description[i] ) )

In [7]:
clean_description_array = np.array(clean_description)

In [8]:
print ("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 500) 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
description_features = vectorizer.fit_transform(clean_description_array)

# Numpy arrays are easy to work with, so convert the result to an 
# array
description_features = description_features.toarray()

Creating the bag of words...



In [9]:
import os
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler

In [17]:
from sklearn import ensemble
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.cross_validation import StratifiedKFold

n_folds = 5
avg_mad = 0
skf = StratifiedKFold(score_result, n_folds)

for train_index, test_index in skf:
    X_train, X_test = description_features[train_index], description_features[test_index]
    Y_train, Y_test = score_result[train_index], score_result[test_index]
    
    
    # Define model parameters
    learning_rate = 0.001
    training_epochs = 100

    # Define how many inputs and outputs are in our neural network
    number_of_inputs = 500
    number_of_outputs = 1

    # Define how many neurons we want in each layer of our neural network
    layer_1_nodes = 50
    layer_2_nodes = 100
    layer_3_nodes = 50
    
    tf.reset_default_graph()
    
    # Input Layer
    with tf.variable_scope('input'):
        X = tf.placeholder(tf.float32, shape=(None, number_of_inputs))

    # Layer 1
    with tf.variable_scope('layer_1'):
        weights = tf.get_variable("weights1", shape=[number_of_inputs, layer_1_nodes], initializer=tf.contrib.layers.xavier_initializer())
        biases = tf.get_variable(name="biases1", shape=[layer_1_nodes], initializer=tf.zeros_initializer())
        layer_1_output = tf.nn.relu(tf.matmul(X, weights) + biases)

    # Layer 2
    with tf.variable_scope('layer_2'):
        weights = tf.get_variable("weights2", shape=[layer_1_nodes, layer_2_nodes], initializer=tf.contrib.layers.xavier_initializer())
        biases = tf.get_variable(name="biases2", shape=[layer_2_nodes], initializer=tf.zeros_initializer())
        layer_2_output = tf.nn.relu(tf.matmul(layer_1_output, weights) + biases)

    # Layer 3
    with tf.variable_scope('layer_3'):
        weights = tf.get_variable("weights3", shape=[layer_2_nodes, layer_3_nodes], initializer=tf.contrib.layers.xavier_initializer())
        biases = tf.get_variable(name="biases3", shape=[layer_3_nodes], initializer=tf.zeros_initializer())
        layer_3_output = tf.nn.relu(tf.matmul(layer_2_output, weights) + biases)

    # Output Layer
    with tf.variable_scope('output'):
        weights = tf.get_variable("weights4", shape=[layer_3_nodes, number_of_outputs])
        biases = tf.get_variable(name="biases4", shape=[number_of_outputs], initializer=tf.zeros_initializer())
        prediction = tf.matmul(layer_3_output, weights) + biases

    # Section Two: Define the cost function of the neural network that will measure prediction accuracy during training

    with tf.variable_scope('cost'):
        Y = tf.placeholder(tf.float32)
        cost = tf.reduce_mean(abs(prediction-Y))
    
    # Section Three: Define the optimizer function that will be run to optimize the neural network

    with tf.variable_scope('train'):
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
    
    # Initialize a session so that we can run TensorFlow operations
    with tf.Session() as session:

        # Run the global variable initializer to initialize all variables and layers of the neural network
        session.run(tf.global_variables_initializer())

        # Run the optimizer over and over to train the network.
        # One epoch is one full run through the training data set.
        for epoch in range(training_epochs):

            # Feed in the training data and do one step of neural network training
            session.run(optimizer, feed_dict={X: X_train, Y: Y_train})

            # Every 5 training steps, log our progress
            if epoch % 5 == 0:
                training_cost = session.run(cost, feed_dict={X: X_train, Y:Y_train})
                testing_cost = session.run(cost, feed_dict={X: X_test, Y:Y_test})

                print(epoch, training_cost, testing_cost)

        # Training is now complete!
        print("Training is complete!")

        final_training_cost = session.run(cost, feed_dict={X: X_train, Y: Y_train})
        final_testing_cost = session.run(cost, feed_dict={X: X_test, Y: Y_test})

        print("Final Training cost: {}".format(final_training_cost))
        print("Final Testing cost: {}".format(final_testing_cost))
    

0 5.809585 5.8462367
5 5.160324 5.2313447
10 4.2164226 4.246761
15 2.9862418 2.9408264
20 2.5988214 2.5132635
25 2.5496793 2.5486178
30 2.2028866 2.3043842
35 2.2061298 2.329694
40 2.134942 2.26963
45 2.0858686 2.248738
50 2.0794406 2.2617583
55 2.04184 2.2180786
60 2.0302649 2.1936624
65 2.0178003 2.1843233
70 2.0057778 2.1821067
75 1.9960316 2.1751335
80 1.9873366 2.158892
85 1.9803038 2.1441638
90 1.9724272 2.1340103
95 1.9656241 2.1253836
Training is complete!
Final Training cost: 1.9599733352661133
Final Testing cost: 2.116116523742676
0 6.3137636 6.2920585
5 5.868054 5.8787446
10 5.4937167 5.4885683
15 4.9287376 4.952388
20 4.118076 4.2019634
25 3.10795 3.1922522
30 2.6273646 2.5404053
35 2.6899126 2.554878
40 2.353303 2.2635539
45 2.248467 2.1674213
50 2.2123804 2.126564
55 2.135391 2.0603943
60 2.1185682 2.0704641
65 2.09118 2.0492766
70 2.0647955 2.0151448
75 2.0486777 2.00189
80 2.0305424 1.9952497
85 2.017988 1.9920549
90 2.0043027 1.9815844
95 1.9927137 1.9734163
Training i

KeyboardInterrupt: 

In [22]:
from sklearn import ensemble
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.cross_validation import StratifiedKFold


n_folds = 5
avg_mad = 0
skf = StratifiedKFold(score_result, n_folds)


for train_index, test_index in skf:
    X_train, X_test = description_features[train_index], description_features[test_index]
    Y_train, Y_test = score_result[train_index], score_result[test_index]
    
    
    #regularisation
    regularizer = tf.contrib.layers.l2_regularizer(scale=0.01)
    
    # Define model parameters
    learning_rate = 0.001
    training_epochs = 200

    # Define how many inputs and outputs are in our neural network
    number_of_inputs = 500
    number_of_outputs = 1

    # Define how many neurons we want in each layer of our neural network
    layer_1_nodes = 50
    layer_2_nodes = 100
    layer_3_nodes = 50
    
    tf.reset_default_graph()
    
    # Input Layer
    with tf.variable_scope('input'):
        X = tf.placeholder(tf.float32, shape=(None, number_of_inputs))

    # Layer 1
    with tf.variable_scope('layer_1'):
        weights = tf.get_variable("weights1", regularizer=regularizer,shape=[number_of_inputs, layer_1_nodes], initializer=tf.contrib.layers.xavier_initializer())
        biases = tf.get_variable(name="biases1", shape=[layer_1_nodes], initializer=tf.zeros_initializer())
        layer_1_output = tf.nn.relu(tf.matmul(X, weights) + biases)


    # Output Layer
    with tf.variable_scope('output'):
        weights = tf.get_variable("weights4", regularizer=regularizer, shape=[layer_1_nodes, number_of_outputs])
        biases = tf.get_variable(name="biases4", shape=[number_of_outputs], initializer=tf.zeros_initializer())
        prediction = tf.matmul(layer_1_output, weights) + biases

    # Section Two: Define the cost function of the neural network that will measure prediction accuracy during training

    with tf.variable_scope('cost'):
        Y = tf.placeholder(tf.float32)
        cost = tf.reduce_mean(abs(prediction-Y))
    
    # Section Three: Define the optimizer function that will be run to optimize the neural network

    with tf.variable_scope('train'):
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
    
    # Initialize a session so that we can run TensorFlow operations
    with tf.Session() as session:

        # Run the global variable initializer to initialize all variables and layers of the neural network
        session.run(tf.global_variables_initializer())

        # Run the optimizer over and over to train the network.
        # One epoch is one full run through the training data set.
        for epoch in range(training_epochs):

            # Feed in the training data and do one step of neural network training
            session.run(optimizer, feed_dict={X: X_train, Y: Y_train})

            # Every 5 training steps, log our progress
            if epoch % 5 == 0:
                training_cost = session.run(cost, feed_dict={X: X_train, Y:Y_train})
                testing_cost = session.run(cost, feed_dict={X: X_test, Y:Y_test})

                print(epoch, training_cost, testing_cost)

        # Training is now complete!
        print("Training is complete!")

        final_training_cost = session.run(cost, feed_dict={X: X_train, Y: Y_train})
        final_testing_cost = session.run(cost, feed_dict={X: X_test, Y: Y_test})

        print("Final Training cost: {}".format(final_training_cost))
        print("Final Testing cost: {}".format(final_testing_cost))

0 6.3913636 6.330703
5 5.9288187 5.916041
10 5.5377703 5.5372915
15 5.135031 5.1657634
20 4.736758 4.78527
25 4.3379664 4.392524
30 3.9444056 4.0002966
35 3.5720854 3.6296792
40 3.2340565 3.2989545
45 2.943415 3.0183892
50 2.7128747 2.8018372
55 2.5437698 2.6515915
60 2.4343283 2.5563335
65 2.3604124 2.4999278
70 2.3145652 2.4660306
75 2.279439 2.4442096
80 2.2531784 2.4279125
85 2.2313697 2.4141328
90 2.2133071 2.4030728
95 2.1971588 2.3945718
100 2.1838992 2.387891
105 2.1721396 2.382463
110 2.162445 2.3777144
115 2.154373 2.3734365
120 2.1467428 2.3697414
125 2.1401405 2.3663604
130 2.133463 2.3634777
135 2.1269307 2.360345
140 2.121909 2.3576157
145 2.1158257 2.354647
150 2.1107652 2.3515654
155 2.1055086 2.34805
160 2.100851 2.3449965
165 2.097205 2.3415732
170 2.0933006 2.3382661
175 2.0886867 2.3346088
180 2.0846198 2.3310363
185 2.0806203 2.327309
190 2.076965 2.3236957
195 2.0726862 2.3197658
Training is complete!
Final Training cost: 2.0701630115509033
Final Testing cost: 2.3

KeyboardInterrupt: 

In [23]:
n_folds = 5
avg_mad = 0
skf = StratifiedKFold(score_result, n_folds)


for train_index, test_index in skf:
    X_train, X_test = description_features[train_index], description_features[test_index]
    Y_train, Y_test = score_result[train_index], score_result[test_index]
    
    
    # Define model parameters
    learning_rate = 0.001
    training_epochs = 200

    # Define how many inputs and outputs are in our neural network
    number_of_inputs = 500
    number_of_outputs = 1

    # Define how many neurons we want in each layer of our neural network
    layer_1_nodes = 50
    layer_2_nodes = 100
    layer_3_nodes = 50
    
    tf.reset_default_graph()
    
    # Input Layer
    with tf.variable_scope('input'):
        X = tf.placeholder(tf.float32, shape=(None, number_of_inputs))

    # Layer 1
    with tf.variable_scope('layer_1'):
        weights = tf.get_variable("weights1",shape=[number_of_inputs, layer_1_nodes], initializer=tf.contrib.layers.xavier_initializer())
        biases = tf.get_variable(name="biases1", shape=[layer_1_nodes], initializer=tf.zeros_initializer())
        layer_1_output = tf.nn.relu(tf.matmul(X, weights) + biases)


    # Output Layer
    with tf.variable_scope('output'):
        weights = tf.get_variable("weights4", shape=[layer_1_nodes, number_of_outputs])
        biases = tf.get_variable(name="biases4", shape=[number_of_outputs], initializer=tf.zeros_initializer())
        prediction = tf.matmul(layer_1_output, weights) + biases

    # Section Two: Define the cost function of the neural network that will measure prediction accuracy during training

    with tf.variable_scope('cost'):
        Y = tf.placeholder(tf.float32)
        cost = tf.reduce_mean(abs(prediction-Y))
    
    # Section Three: Define the optimizer function that will be run to optimize the neural network

    with tf.variable_scope('train'):
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
    
    # Initialize a session so that we can run TensorFlow operations
    with tf.Session() as session:

        # Run the global variable initializer to initialize all variables and layers of the neural network
        session.run(tf.global_variables_initializer())

        # Run the optimizer over and over to train the network.
        # One epoch is one full run through the training data set.
        for epoch in range(training_epochs):

            # Feed in the training data and do one step of neural network training
            session.run(optimizer, feed_dict={X: X_train, Y: Y_train})

            # Every 5 training steps, log our progress
            if epoch % 5 == 0:
                training_cost = session.run(cost, feed_dict={X: X_train, Y:Y_train})
                testing_cost = session.run(cost, feed_dict={X: X_test, Y:Y_test})

                print(epoch, training_cost, testing_cost)

        # Training is now complete!
        print("Training is complete!")

        final_training_cost = session.run(cost, feed_dict={X: X_train, Y: Y_train})
        final_testing_cost = session.run(cost, feed_dict={X: X_test, Y: Y_test})

        print("Final Training cost: {}".format(final_training_cost))
        print("Final Testing cost: {}".format(final_testing_cost))

0 6.083245 6.080838
5 5.5260377 5.5600276
10 4.971763 5.038488
15 4.4246273 4.5104346
20 3.9139543 3.9918978
25 3.4488997 3.524602
30 3.0483305 3.1278381
35 2.7404468 2.828259
40 2.53088 2.6351495
45 2.4048758 2.5277576
50 2.334103 2.4732242
55 2.2906017 2.4425502
60 2.2589564 2.4197042
65 2.2281182 2.3995092
70 2.202884 2.38372
75 2.1825671 2.3724823
80 2.1671972 2.364674
85 2.1542888 2.3584723
90 2.144201 2.3531485
95 2.1350422 2.348176
100 2.1265638 2.3436902
105 2.1175482 2.3393404
110 2.1106508 2.3347213
115 2.1047485 2.3300486
120 2.098937 2.3253934
125 2.0932546 2.3205848
130 2.0880895 2.315838
135 2.0822582 2.3107066
140 2.0773337 2.3060513
145 2.073586 2.3012295
150 2.068536 2.2963436
155 2.0648196 2.2918763
160 2.0598803 2.2871017
165 2.0558808 2.2826457
170 2.052055 2.2777278
175 2.0484335 2.2728
180 2.0448995 2.2680101
185 2.0410495 2.2634091
190 2.037872 2.258481
195 2.033979 2.2536285
Training is complete!
Final Training cost: 2.031540870666504
Final Testing cost: 2.24980

In [29]:
n_folds = 5
avg_mad = 0
skf = StratifiedKFold(score_result, n_folds)
dropout = 0.5

for train_index, test_index in skf:
    X_train, X_test = description_features[train_index], description_features[test_index]
    Y_train, Y_test = score_result[train_index], score_result[test_index]
    
    
    # Define model parameters
    learning_rate = 0.001
    training_epochs = 300

    # Define how many inputs and outputs are in our neural network
    number_of_inputs = 500
    number_of_outputs = 1

    # Define how many neurons we want in each layer of our neural network
    layer_1_nodes = 50
    layer_2_nodes = 100
    layer_3_nodes = 50
    
    tf.reset_default_graph()
    
    # Input Layer
    with tf.variable_scope('input'):
        X = tf.placeholder(tf.float32, shape=(None, number_of_inputs))

    # Layer 1
    with tf.variable_scope('layer_1'):
        weights = tf.get_variable("weights1",shape=[number_of_inputs, layer_1_nodes], initializer=tf.contrib.layers.xavier_initializer())
        biases = tf.get_variable(name="biases1", shape=[layer_1_nodes], initializer=tf.zeros_initializer())
        layer_1_output = tf.nn.relu(tf.matmul(X, weights) + biases)
    layer_1_output = tf.layers.dropout(layer_1_output, rate=dropout)

    # Output Layer
    with tf.variable_scope('output'):
        weights = tf.get_variable("weights4", shape=[layer_1_nodes, number_of_outputs])
        biases = tf.get_variable(name="biases4", shape=[number_of_outputs], initializer=tf.zeros_initializer())
        prediction = tf.matmul(layer_1_output, weights) + biases
    prediction = tf.layers.dropout(prediction, rate=dropout)
    
    
    # Section Two: Define the cost function of the neural network that will measure prediction accuracy during training

    with tf.variable_scope('cost'):
        Y = tf.placeholder(tf.float32)
        cost = tf.reduce_mean(abs(prediction-Y))
    
    # Section Three: Define the optimizer function that will be run to optimize the neural network

    with tf.variable_scope('train'):
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
    
    # Initialize a session so that we can run TensorFlow operations
    with tf.Session() as session:

        # Run the global variable initializer to initialize all variables and layers of the neural network
        session.run(tf.global_variables_initializer())

        # Run the optimizer over and over to train the network.
        # One epoch is one full run through the training data set.
        for epoch in range(training_epochs):

            # Feed in the training data and do one step of neural network training
            session.run(optimizer, feed_dict={X: X_train, Y: Y_train})

            # Every 5 training steps, log our progress
            if epoch % 5 == 0:
                training_cost = session.run(cost, feed_dict={X: X_train, Y:Y_train})
                testing_cost = session.run(cost, feed_dict={X: X_test, Y:Y_test})

                print(epoch, training_cost, testing_cost)

        # Training is now complete!
        print("Training is complete!")

        final_training_cost = session.run(cost, feed_dict={X: X_train, Y: Y_train})
        final_testing_cost = session.run(cost, feed_dict={X: X_test, Y: Y_test})

        print("Final Training cost: {}".format(final_training_cost))
        print("Final Testing cost: {}".format(final_testing_cost))

0 6.062321 6.0602183
5 5.5669346 5.595575
10 5.0552363 5.1093874
15 4.5413995 4.6046963
20 4.0413485 4.0950303
25 3.5686922 3.6246138
30 3.1437864 3.2121816
35 2.7962208 2.8848507
40 2.553022 2.6630778
45 2.4085565 2.5366564
50 2.330534 2.4762907
55 2.2872834 2.4477959
60 2.2569604 2.4284132
65 2.2283502 2.4105759
70 2.2004101 2.3945217
75 2.1798089 2.3826413
80 2.164123 2.3740633
85 2.1521626 2.367618
90 2.1415398 2.36214
95 2.1327498 2.357028
100 2.1234882 2.3523703
105 2.116745 2.348004
110 2.1101744 2.343452
115 2.1029496 2.338674
120 2.0973334 2.3335829
125 2.091523 2.3282762
130 2.0854359 2.3232312
135 2.080473 2.3177183
140 2.0758843 2.3126848
145 2.0710387 2.3077934
150 2.066506 2.302633
155 2.0620923 2.2976885
160 2.0581756 2.2925594
165 2.054114 2.2874322
170 2.0510647 2.282283
175 2.0470674 2.2772973
180 2.0432103 2.271976
185 2.0395427 2.2668421
190 2.0356562 2.2619812
195 2.0326715 2.2568953
200 2.0296624 2.2519739
205 2.0266175 2.247082
210 2.0233376 2.242167
215 2.020108

KeyboardInterrupt: 