In [13]:
import numpy as np
import pandas as pd
import warnings
import nltk.data
#nltk.download()   
warnings.filterwarnings('ignore')

In [14]:
# Load the 2016 CVE JSON feed into a DataFrame. The path is relative, so the
# file has to sit in the notebook's working directory.
dataset = pd.read_json('cve-2016.json')

In [45]:
# Dimensions of the loaded frame as (rows, columns).
dataset.shape

(9417, 6)

In [15]:
# Column names pandas derived from the top-level keys of the JSON file.
dataset.columns.values

array(['CVE_Items', 'CVE_data_format', 'CVE_data_numberOfCVEs',
       'CVE_data_timestamp', 'CVE_data_type', 'CVE_data_version'],
      dtype=object)

In [16]:
# Walk every CVE item and, for those that carry a CVSS v2 metric block,
# collect the severity label, the numeric base score and the first
# description text. The three lists are appended to together so that
# position k in each of them refers to the same CVE item.
description, severity, scores = [], [], []

for cve_item in dataset['CVE_Items']:
    impact = cve_item['impact']
    # Items without a v2 metric are skipped entirely.
    if 'baseMetricV2' not in impact.keys():
        continue
    metric_v2 = impact['baseMetricV2']
    severity.append(metric_v2['severity'])
    scores.append(metric_v2['cvssV2']['baseScore'])
    description.append(cve_item['cve']['description']['description_data'][0]['value'])

In [17]:
# Turn the three parallel Python lists into NumPy arrays in one step.
description, severity, scores = (
    np.array(column) for column in (description, severity, scores)
)

In [18]:
from bs4 import BeautifulSoup  
import re
from nltk.corpus import stopwords
def review_to_wordlist(raw_review, remove_stopwords=False):
    """Convert one raw text snippet into a list of lower-case words.

    Args:
        raw_review: The text to clean (here: a CVE description or one
            sentence of it). Any HTML markup in it is stripped.
        remove_stopwords: When True, NLTK's English stop words are dropped
            from the result. When False (the default) every word is kept.

    Returns:
        A list of lower-case, letters-only tokens in their original order.
        Note that this is a list of words, not a joined string.
    """
    # 1. Remove HTML. The parser is named explicitly so the result does not
    #    depend on which optional parser packages happen to be installed.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # 2. Replace every non-letter (digits, punctuation, ...) with a space.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Lower-case and split on whitespace into individual words.
    words = letters_only.lower().split()

    # 4. Optionally drop stop words. The stop-word list is turned into a set
    #    because membership tests on a set are much faster than on a list,
    #    and it is only built when it is actually needed.
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    return words

In [19]:

# Load the English punkt sentence tokenizer from the NLTK data directory.
# NOTE(review): the resource has to be available locally already; the import
# cell only carries a commented-out nltk.download() call.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    """Split a piece of text into sentences and tokenize each of them.

    Args:
        review: The text to split; surrounding whitespace is stripped first.
        tokenizer: Object whose ``tokenize(text)`` method returns the
            sentences of ``text``.
        remove_stopwords: Passed through unchanged to ``review_to_wordlist``.

    Returns:
        A list with one entry per non-empty sentence, where each entry is
        the word list produced by ``review_to_wordlist`` (a list of lists).
    """
    trimmed = review.strip()
    return [
        review_to_wordlist(chunk, remove_stopwords)
        for chunk in tokenizer.tokenize(trimmed)
        if len(chunk) > 0  # empty sentences are skipped
    ]

In [20]:
# Tokenize every description into sentences and pool them all into one flat
# list, where each entry is the word list of a single sentence.
# Stop-word removal is requested explicitly so that the training corpus is
# stripped of stop words by intent rather than by a helper's default.
sentences = []

for review in description:
    sentences.extend(
        review_to_sentences(review, tokenizer, remove_stopwords=True)
    )



In [21]:
# Total number of tokenized sentences gathered from all descriptions.
len(sentences)

11870

In [22]:
# Spot-check one tokenized sentence (arbitrary index).
sentences[3400]

['ibm',
 'sametime',
 'meeting',
 'server',
 'vulnerable',
 'cross',
 'site',
 'scripting']

In [23]:
# Number of CVE items that had a CVSS v2 metric and were therefore kept.
scores.shape

(8205,)

In [24]:
# Send log records at INFO level and above to the console so that Word2Vec's
# progress messages show up while it trains.
import logging
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Hyper-parameters handed to Word2Vec below.
num_features = 300    # dimensionality of each word vector
min_word_count = 40   # words seen fewer times than this are dropped
num_workers = 4       # training threads
context = 10          # context window size
downsampling = 1e-3   # down-sampling threshold for frequent words

# Build the vocabulary and train the model on the pooled sentences.
# NOTE(review): `size=` and `init_sims` are the pre-4.0 gensim spellings;
# confirm the installed gensim version before upgrading.
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# No further training is planned, so let init_sims replace the vectors in
# place, which makes the model much more memory-efficient.
model.init_sims(replace=True)

# Persist the model under a descriptive name; it can be restored later with
# Word2Vec.load().
model_name = "300features_40minwords_10context"
model.save(model_name)

2018-07-09 02:03:02,365 : INFO : 'pattern' package not found; tag filters are not available for English
2018-07-09 02:03:02,371 : INFO : collecting all words and their counts
2018-07-09 02:03:02,372 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-07-09 02:03:02,424 : INFO : PROGRESS: at sentence #10000, processed 202304 words, keeping 9499 word types
2018-07-09 02:03:02,434 : INFO : collected 10441 word types from a corpus of 230372 raw words and 11870 sentences
2018-07-09 02:03:02,435 : INFO : Loading a fresh vocabulary
2018-07-09 02:03:02,442 : INFO : min_count=40 retains 705 unique words (6% of original 10441, drops 9736)
2018-07-09 02:03:02,442 : INFO : min_count=40 leaves 189168 word corpus (82% of original 230372, drops 41204)
2018-07-09 02:03:02,447 : INFO : deleting the raw counts dictionary of 10441 items
2018-07-09 02:03:02,448 : INFO : sample=0.001 downsamples 77 most-common words
2018-07-09 02:03:02,449 : INFO : downsampling leaves estimated 

Training model...


2018-07-09 02:03:02,613 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-07-09 02:03:02,618 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-09 02:03:02,621 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-09 02:03:02,624 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-09 02:03:02,625 : INFO : EPOCH - 1 : training on 230372 raw words (126976 effective words) took 0.1s, 936605 effective words/s
2018-07-09 02:03:02,741 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-07-09 02:03:02,743 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-09 02:03:02,744 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-09 02:03:02,749 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-09 02:03:02,750 : INFO : EPOCH - 2 : training on 230372 raw words (127006 effective words) took 0.1s, 1090665 effective words/s
2

In [25]:
# Nearest neighbours of "windows" in the embedding space. The query goes
# through model.wv (the keyed vectors), the same accessor the vocabulary
# cell uses, instead of the deprecated model-level shortcut.
model.wv.most_similar("windows")

[('gold', 0.9094369411468506),
 ('rt', 0.8332555890083313),
 ('loading', 0.827616274356842),
 ('r', 0.7401587963104248),
 ('continuous', 0.7247640490531921),
 ('microsoft', 0.7045366168022156),
 ('os', 0.690426766872406),
 ('sp', 0.6736881732940674),
 ('execute', 0.6600849628448486),
 ('privileges', 0.6469964981079102)]

In [26]:
# Nearest neighbours of "vulnerable", queried via model.wv rather than the
# deprecated model-level shortcut.
model.wv.most_similar("vulnerable")

[('caused', 0.8722426891326904),
 ('forgery', 0.8688178062438965),
 ('cross', 0.855127215385437),
 ('websphere', 0.8501957654953003),
 ('csrf', 0.7897992134094238),
 ('reflected', 0.7893112897872925),
 ('lifecycle', 0.7889853715896606),
 ('tivoli', 0.7777188420295715),
 ('actions', 0.776308536529541),
 ('victim', 0.7574336528778076)]

In [28]:
# One-line summary of the trained model.
print(model)

Word2Vec(vocab=705, size=300, alpha=0.025)


In [36]:
# Shape of a single word vector, looked up through model.wv instead of the
# deprecated model-level indexing.
model.wv['internet'].shape

(300,)

In [34]:
# All words that survived the min_count cut-off, as a plain list.
words = list(model.wv.vocab)
# print(words)  # uncomment to dump the whole vocabulary

In [41]:
# Print the vector of the vocabulary entry at index 704 (the last one for
# the 705-word model reported above), looked up through model.wv instead of
# the deprecated model-level indexing.
print(model.wv[words[704]])

[-0.04629976 -0.03775239 -0.0612707   0.04138781  0.01347958 -0.13033734
  0.03593222 -0.09561654  0.09193933  0.07477549 -0.01045573 -0.06526111
  0.04529662  0.00507673 -0.05801545 -0.08556713 -0.03978465  0.00544046
  0.05104588 -0.06756661  0.0370345  -0.04885554 -0.06992226  0.06885462
 -0.01036942 -0.01555624  0.01590928  0.05672768  0.03949884  0.00494205
  0.09041023  0.00571667  0.09692305  0.00783475 -0.01307637 -0.09706757
  0.01655647 -0.05390599 -0.04951017 -0.0244361  -0.03356093 -0.05495088
 -0.09433108 -0.01117052  0.01180344  0.06742935  0.02205241 -0.01068878
  0.03313516  0.05438668  0.07552817 -0.09990546 -0.08005269  0.04091886
  0.07880871 -0.04165225 -0.00918824  0.11682916  0.01531674  0.03951841
  0.06148017 -0.03348539  0.07114412 -0.05529675  0.02345757  0.05184504
 -0.08638492  0.03308222 -0.04185783  0.06531723 -0.02965119  0.05407631
 -0.08639693  0.01736786  0.0844368   0.03411895 -0.04996516 -0.04844293
 -0.00310361  0.11538444 -0.05378686 -0.02967841  0