## Pre-Processing


In [1]:
#!pip3 install nltk
#!pip3 install gensim
#!pip3 install pyLDAvis

In [2]:
import pandas as pd
import requests
import gensim
import string
import nltk
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.corpus import wordnet as wn
from nltk import word_tokenize, pos_tag
from collections import defaultdict
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [3]:
#nltk.download('popular')
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('omw-1.4')

In [4]:
# Read the raw CVE export; the file is ISO-8859-15 encoded.
df = pd.read_csv("../data/raw/allitems.csv", encoding='iso8859_15')
# 'Unnamed: 0' looks like a row index saved along with the data; it is not needed.
df = df.drop(columns=['Unnamed: 0'])
df

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Name,Status,Description,References,Phase,Votes,Comments
0,CVE-1999-0001,Candidate,ip_input.c in BSD-derived TCP/IP implementatio...,BUGTRAQ:19981223 Re: CERT Advisory CA-98.13 - ...,Modified (20051217),"MODIFY(1) Frech | NOOP(2) Northcutt, W...",Christey> A Bugtraq posting indicates that the...
1,CVE-1999-0002,Entry,Buffer overflow in NFS mountd gives root acces...,BID:121 | URL:http://www.securityfocus.com...,,,
2,CVE-1999-0003,Entry,Execute commands as root via buffer overflow i...,BID:122 | URL:http://www.securityfocus.com...,,,
3,CVE-1999-0004,Candidate,"MIME buffer overflow in email clients, e.g. So...",CERT:CA-98.10.mime_buffer_overflows | MS:M...,Modified (19990621),"ACCEPT(8) Baker, Cole, Collins, Dik, Landfi...","Frech> Extremely minor, but I believe e-mail i..."
4,CVE-1999-0005,Entry,Arbitrary command execution via IMAP buffer ov...,BID:130 | URL:http://www.securityfocus.com...,,,
...,...,...,...,...,...,...,...
227233,CVE-2022-24031,Candidate,** RESERVED ** This candidate has been reserve...,,Assigned (20220126),None (candidate not yet proposed),
227234,CVE-2022-24032,Candidate,** RESERVED ** This candidate has been reserve...,,Assigned (20220126),None (candidate not yet proposed),
227235,CVE-2022-24033,Candidate,** RESERVED ** This candidate has been reserve...,,Assigned (20220126),None (candidate not yet proposed),
227236,CVE-2022-24034,Candidate,** RESERVED ** This candidate has been reserve...,,Assigned (20220126),None (candidate not yet proposed),


We remove every entry whose description starts with "\*\*": these vulnerabilities are flagged to tell us that they are not valid, for reasons that include being reserved for a later vulnerability or being a duplicate of another CVE. We will also remove any CVEs from 2022, because we would be making judgements about the year based on one month's data, which would not be representative of the full year.

In [5]:
# Drop flagged entries: a description starting with '**' marks a CVE that is
# not a usable record (for example '** RESERVED **').
df = df[~df.Description.str.startswith('**')]
# Drop CVEs from 2022. Match the 'CVE-2022-' prefix rather than the substring
# '2022', which would also discard earlier-year CVEs whose sequence number
# happens to contain it (e.g. CVE-2005-2022).
df = df[~df.Name.str.startswith('CVE-2022-')]
# Lower-cased descriptions are the input to the cleaning steps that follow.
desc = df['Description'].str.lower()

In [6]:
# Preview the first ten lower-cased descriptions.
desc.head(10)

0    ip_input.c in bsd-derived tcp/ip implementatio...
1    buffer overflow in nfs mountd gives root acces...
2    execute commands as root via buffer overflow i...
3    mime buffer overflow in email clients, e.g. so...
4    arbitrary command execution via imap buffer ov...
5    buffer overflow in pop servers based on bsd/qu...
6    information from ssl-encrypted sessions via pk...
7    buffer overflow in nis+, in sun's rpc.nisd pro...
8    inverse query buffer overflow in bind 4.9 and ...
9    denial of service vulnerability in bind 8 rele...
Name: Description, dtype: object

Remove Numbers

In [7]:
from string import digits

# Translation table that deletes every ASCII digit.
remove_digits = str.maketrans("", "", digits)
# One digit-free string per description, in the same order as `desc`. The list
# is built without a placeholder first element so that the i-th entry
# corresponds to the i-th description.
# NOTE: `desc1` is reassigned in the In [12] cell, so this list is only an
# intermediate illustration.
desc1 = [line.translate(remove_digits) for line in desc]

Remove stop words, punctuation and lemmatize

In [8]:
# NLTK's English stop words, kept as the list NLTK returns.
stop_words = stopwords.words('english')
# Frozen snapshot used for membership tests: a set lookup is constant time,
# whereas scanning the list for every word of every description is not.
# Re-run this cell if `stop_words` is edited afterwards.
_stop_word_set = frozenset(stop_words)

def remove_stop_words(text):
    """Return `text` with NLTK's English stop words removed.

    The text is split on single spaces only, so a token is dropped only when
    it equals a stop word exactly (punctuation attached to a word prevents a
    match).

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    words = text.split(' ')
    return " ".join(word for word in words if word not in _stop_word_set)

In [9]:
def clean_entry(text):
    """Strip every ASCII punctuation character and digit from `text`.

    Characters are deleted rather than replaced by a space, so the pieces on
    either side of a removed character are joined together. Spaces and all
    other characters are left untouched.
    """
    unwanted = string.punctuation + string.digits
    return text.translate(str.maketrans('', '', unwanted))

In [10]:
# Small hand-written sentence containing contractions, digits and punctuation,
# used to see how the tokeniser and the cleaning helpers behave.
test_string = "hi my name is bill, you'll like to eat 10 beans. it's cold ( 2deg ) outside."
# Tokenise the untouched sentence as a baseline for the comparison that follows.
test_tokens = word_tokenize(test_string)
print(test_tokens)

['hi', 'my', 'name', 'is', 'bill', ',', 'you', "'ll", 'like', 'to', 'eat', '10', 'beans', '.', 'it', "'s", 'cold', '(', '2deg', ')', 'outside', '.']


In [11]:
# Order A: strip punctuation/digits first, then remove stop words.
test_string1 = remove_stop_words(clean_entry(test_string))
print(word_tokenize(test_string1))
# Order B: remove stop words first, then strip punctuation/digits.
test_string2 = clean_entry(remove_stop_words(test_string))
print(word_tokenize(test_string2))

['hi', 'name', 'bill', 'youll', 'like', 'eat', 'beans', 'cold', 'deg', 'outside']
['hi', 'name', 'bill', 'like', 'eat', 'beans', 'cold', 'deg', 'outside']


We see here the correct order to apply our functions is removal of stopwords and then cleaning. This is because words like `"you'll"` are in our stopword list but `"youll"` is not, and thus removing punctuation in the cleaning step messes us up if we do it before removing the stopwords. 
We may now apply our functions to `desc` to clean it up.

In [12]:
# Stop words go first, then punctuation and digits: the order matters because
# stop words such as "you'll" only match while their apostrophe is still there.
desc1 = desc.apply(remove_stop_words).apply(clean_entry)

In [13]:
# Show the entry labelled 0 before and after cleaning.
print(desc[0])
print(desc1[0])

ip_input.c in bsd-derived tcp/ip implementations allows remote attackers to cause a denial of service (crash or hang) via crafted packets.
ipinputc bsdderived tcpip implementations allows remote attackers cause denial service crash hang via crafted packets


I found out after doing the above that `gensim` has a list of stopwords which is much more comprehensive than the list in `nltk`. The code below is therefore almost a complete repeat of our `remove_stop_words` function. We will use both functions to ensure that we cover the largest range of stop words.

In [14]:
# gensim's stop-word collection, used as a second filter after the NLTK list.
all_stop_words = gensim.parsing.preprocessing.STOPWORDS

def remove_all_stop_words(text):
    """Drop every space-separated token of `text` found in `all_stop_words`.

    The text is split on single spaces and the remaining tokens are joined
    back with single spaces.
    """
    kept = []
    for token in text.split(' '):
        if token in all_stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

We can therefore produce a sparser representation of our descriptions (I call it `desc2`) than `desc1` if we want to. We see that it gets rid of stuff like `"via"`.

In [15]:
# Second pass over the already-cleaned text, this time with gensim's stop words.
desc2 = desc1.apply(remove_all_stop_words)
# Display the entry labelled 0 to compare with desc1[0].
desc2[0]

'ipinputc bsdderived tcpip implementations allows remote attackers cause denial service crash hang crafted packets'

Let's start the lemmatising.

In [16]:
# Map the first letter of each POS tag to a WordNet part of speech. Tags that
# do not start with J, V or R fall back to noun.
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

# One lemmatizer serves every description; there is no need to rebuild it per line.
lemma_function = WordNetLemmatizer()

# Tokenise every description first, then tag them all in a single batch call.
# pos_tag_sents tags a list of sentences in one go, instead of invoking
# pos_tag (and whatever setup it performs) once per description.
tokenized = [word_tokenize(line) for line in desc2]

# desc3[i] is the list of lemmas for the i-th description in desc2.
desc3 = [
    [lemma_function.lemmatize(token, tag_map[tag[0]]) for token, tag in tagged]
    for tagged in nltk.pos_tag_sents(tokenized)
]

In [24]:
# Replace the raw text with the lemmatised token lists. assign() returns a new
# frame, which avoids writing a column onto `df` while it is still a filtered
# selection of the loaded data (the pattern pandas reports as
# SettingWithCopyWarning).
df = df.assign(Description=desc3)
# NOTE(review): to_csv stores each token list as its string representation, so
# whoever reads this file back has to parse that column into lists again.
df.to_csv('../data/processed/formatted_df.csv')

KeyboardInterrupt: 

In [25]:
# Sanity check on the element types. Only the value of the last expression in
# a cell is displayed, so the output below belongs to the second line.
type(desc3[1])
type(df['Description'][1])

list