# Dependencies

In [1]:
import random
import ast
import csv
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

# Load Annotated Data from Cybertweet Dataset

In [2]:
# Read the annotated tweets and discard every row that has a missing value.
df = pd.read_csv('./twitter_data.csv').dropna()

In [3]:
# Display the loaded DataFrame (the notebook renders a truncated preview).
df

Unnamed: 0,id,text,annotation,type
0,5b88752abb325e65390af240,Best way to build empathy is through honesty a...,irrelevant,['Vulnerability']
1,5b8875d5bb325e65a22cf81b,Cryptocurrency Scams Replacing Ransomware as A...,business,['ransomware']
2,5b88768dbb325e65fa7e78e3,Cryptocurrency Scams Replacing Ransomware as A...,business,['ransomware']
3,5b8876f9bb325e65fa7e78e4,Protect your customers access Prestashop Ant...,threat,['Ddos']
4,5b8876f9bb325e65fa7e78e5,Data leak from Huazhu Hotels may affect 130 mi...,threat,['leak']
...,...,...,...,...
21363,5b982924bb325e3cadebf9ad,@FuddBot (cont'd) ... Spikes in tweet volume: ...,business,['botnet']
21364,5b982928bb325e3cadebf9ae,@NarutoRomania ransomware content blackmail fo...,threat,['ransomware']
21365,5b982929bb325e3cadebf9af,Make sure you have upgraded to #tor 8.0 #priva...,threat,['general']
21366,5b98294abb325e3cadebf9b0,@JennMGreenberg @MindingTheKings I was thinkin...,irrelevant,['general']


# Preprocess Input Data:

### Convert the `type` column values from strings such as `"['botnet']"` into lists such as `['botnet']`

In [4]:
def _parse_type_labels(raw_type):
    """Return the labels of one `type` cell as a list of lower-case strings.

    `raw_type` is normally the string form of a Python list (e.g. "['Ddos']"),
    which is parsed with `ast.literal_eval` (literals only, no code execution).
    A value that is already a list or tuple is accepted as-is, so re-running
    this cell after the column has been converted does not raise.
    """
    if isinstance(raw_type, (list, tuple)):
        labels = raw_type
    else:
        labels = ast.literal_eval(raw_type)

    # Standardize to lower case: 'vulnerability' and 'Vulnerability' should be the same type.
    return [label.lower() for label in labels]


new_data = [_parse_type_labels(raw_type) for raw_type in df['type']]

df['type'] = new_data

# Encoding Input Data

### One-Hot Encoding

In [5]:
# Turn each row's label list into a 0/1 indicator row, one column per distinct label.
multilabel = MultiLabelBinarizer()
types = df['type']
types_encoded = multilabel.fit_transform(types)

In [6]:
# Show the encoded label matrix with one named column per label class.
pd.DataFrame(types_encoded, columns=multilabel.classes_)

Unnamed: 0,0day,all,botnet,ddos,general,leak,ransomware,vulnerability
0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,1,0
3,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...
21363,0,0,1,0,0,0,0,0
21364,0,0,0,0,0,0,1,0
21365,0,0,0,0,1,0,0,0
21366,0,0,0,0,1,0,0,0


### Vectorize Input Text into a Sparse Matrix

In [7]:
# TfidfVectorizer's max_features argument could cap the vocabulary size
# (smaller representation, faster training); it is deliberately not used here.
# NOTE(review): the vectorizer is fit on every row of df['text'], including the
# rows that the later train/test split assigns to the test set, so the vocabulary
# and IDF weights are learned from test text too. Consider fitting on the
# training rows only.
tfidf = TfidfVectorizer(analyzer='word')
text_vectorized = tfidf.fit_transform(df['text'])

In [8]:
# Show the shapes of the feature matrix and the label matrix side by side.
text_vectorized.shape, types_encoded.shape

((21368, 47959), (21368, 8))

### Create Testing and Training Datasets

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
text_Train, text_Test, type_Train, type_Test = train_test_split(
    text_vectorized,
    types_encoded,
    test_size=0.2,
    random_state=0,
)

# Model Performance Measurement Functions

In [10]:
def j_score(y_true, y_pred, zero_division=1.0):
    """Mean per-sample Jaccard similarity between two label matrices, in percent.

    Each row is treated as a set of labels encoded as 0/1 indicators. A row's
    score is |intersection| / |union|, and the row scores are averaged and
    multiplied by 100.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples, n_labels)
        Ground-truth and predicted 0/1 indicator matrices.
    zero_division : float, default 1.0
        Score given to a row whose union is empty, i.e. neither matrix has a
        label for it. Without this guard such a row evaluates 0/0 = NaN and
        turns the whole mean into NaN. The default counts "no labels
        predicted, no labels expected" as a perfect match.

    Returns
    -------
    float
        Mean Jaccard score scaled to the range 0-100.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    intersection = np.minimum(y_true, y_pred).sum(axis=1)
    union = np.maximum(y_true, y_pred).sum(axis=1)

    # Pre-fill with the fallback value and divide only where the union is non-empty.
    jaccard = np.full(union.shape, float(zero_division))
    np.divide(intersection, union, out=jaccard, where=union != 0)
    return jaccard.mean() * 100

def print_score(y_pred, clf, y_true=None):
    """Print a classifier's class name and its Jaccard score.

    Parameters
    ----------
    y_pred : array-like
        Predicted 0/1 label-indicator matrix.
    clf : object
        Estimator whose class name labels the report.
    y_true : array-like, optional
        Ground-truth indicator matrix to score against. When omitted, the
        notebook-level `type_Test` split is used, which is what this function
        always did before the parameter existed.
    """
    if y_true is None:
        y_true = type_Test

    print('clf: ', clf.__class__.__name__)
    print('Jacard score: {}'.format(j_score(y_true, y_pred)))
    print('----')

# Build and Train Model

In [11]:
# Base estimators to compare; each one is wrapped in OneVsRestClassifier when fitted.
# The estimators that use randomness are seeded (same seed as the train/test split)
# so the reported scores can be reproduced from a fresh kernel.
sgd = SGDClassifier(random_state=0)
lr = LogisticRegression(solver='lbfgs')
svc = LinearSVC(random_state=0)

In [12]:
# Compare performance: fit each base estimator one-vs-rest and score it on the test split.
fitted_models = {}
for classifier in [sgd, lr, svc]:
    model = OneVsRestClassifier(classifier)
    model.fit(text_Train, type_Train)
    pred = model.predict(text_Test)
    print_score(pred, classifier)
    fitted_models[classifier.__class__.__name__] = model

# Later cells predict with `clf`. Bind it explicitly to the LinearSVC model (the
# one the loop used to leave behind as its last iteration) instead of relying on
# whichever estimator happens to be last in the list.
clf = fitted_models['LinearSVC']

clf:  SGDClassifier
Jacard score: 84.40570893776322
----
clf:  LogisticRegression
Jacard score: 79.34019653720168
----
clf:  LinearSVC
Jacard score: 85.11932615816565
----


# Test with Sample Data

In [13]:
# Prompt for text until the user types 'stop', printing the predicted labels each time.
while True:
    try:
        user_input = input('Enter Some Text:')
    except EOFError:
        # No more input is available (stdin closed): end the session instead of crashing.
        break

    # Accept the sentinel regardless of letter case or surrounding whitespace.
    if user_input.strip().lower() == 'stop':
        break

    # transform input into sparse matrix
    input_transform = tfidf.transform([user_input])
    # decode prediction into text classifications
    print('Classification:', multilabel.inverse_transform(clf.predict(input_transform)))

Enter Some Text:I want to steal passwords from Reddit servers.
Classification: [('general',)]
Enter Some Text:I want to ddos YouTube's network tomorrow.
Classification: [('ddos',)]
Enter Some Text:How do you make and deploy ransomware effectively?
Classification: [('ransomware',)]
Enter Some Text: found a new vulnerability in the Google Home device.
Classification: [('vulnerability',)]
Enter Some Text:stop


# Test with CVE Data

In [14]:
# Read the CVE descriptions and discard rows with missing values.
df_2 = pd.read_csv('./cve_data_description_only.csv').dropna()

# Descriptions containing this marker are skipped.
exclude_str = '**'
cve_arr = [cve for cve in df_2['Description'] if exclude_str not in cve]

# Remove duplicates while keeping first-seen order. A set would also de-duplicate,
# but its iteration order for strings can differ between interpreter runs, which
# would make the printed sample and the exported CSV row order non-reproducible.
cve_arr = list(dict.fromkeys(cve_arr))

In [15]:
# Only the first PRINT_LIMIT classifications are echoed; all of them are kept in `results`.
PRINT_LIMIT = 500

results = []
if cve_arr:  # nothing to vectorize when the list is empty
    # Vectorize and classify every description in one batch rather than one at a time.
    cve_vectors = tfidf.transform(cve_arr)
    predictions = multilabel.inverse_transform(clf.predict(cve_vectors))
    # Wrap each label tuple in a list so the stored text keeps the same
    # "[('label',)]" form that a single-sample inverse_transform produces.
    results = [str([labels]) for labels in predictions]

for cve, prediction_str in zip(cve_arr[:PRINT_LIMIT], results):
    print('Input:', cve)
    print('Classification:', prediction_str)
    print('-----------------')


Input: awstats.pl in AWStats 6.5 build 1.857 and earlier allows remote attackers to obtain the installation path via the (1) year, (2) pluginmode or (3) month parameters.
Classification: [('general',)]
-----------------
Input: QEMU 4.1.0 has a memory leak in zrle_compress_data in ui/vnc-enc-zrle.c during a VNC disconnect operation because libz is misused, resulting in a situation where memory allocated in deflateInit2 is not freed in deflateEnd.
Classification: [('general',)]
-----------------
Input: For the printers listed a maliciously crafted print file might cause certain HP Inkjet printers to assert. Under certain circumstances, the printer produces a core dump to a local device.
Classification: [('general',)]
-----------------
Input: WebKit, as used in Apple iOS before 9 and iTunes before 12.3, allows remote attackers to execute arbitrary code or cause a denial of service (memory corruption and application crash) via a crafted web site, a different vulnerability than other WebKit

Input: PAM_extern before 0.2 sends a password as a command line argument, which allows local users to obtain the password by listing the command line arguments, such as ps.  NOTE: the provenance of this information is unknown; the details are obtained solely from third party information.
Classification: [('general',)]
-----------------
Input: Unspecified vulnerability in the Primavera P6 Enterprise Project Portfolio Management component in Oracle Primavera Products Suite 8.4, 15.x, and 16.x allows remote authenticated users to affect confidentiality and integrity via unknown vectors.
Classification: [('vulnerability',)]
-----------------
Input: Lack of authentication in file-viewing components in DDRT Dashcom Live 2019-05-09 allows anyone to remotely access all claim details by visiting easily guessable dashboard/uploads/claim_files/claim_id_ URLs.
Classification: [('general',)]
-----------------
Input: Yii 2.x before 2.0.15 allows remote attackers to execute arbitrary LUA code via a v

Input: win32k.sys in the kernel-mode drivers in Microsoft Windows XP SP2 and SP3, Windows Server 2003 SP2, Windows Vista SP1 and SP2, Windows Server 2008 Gold, SP2, and R2, and Windows 7 does not properly validate user-mode input, which allows local users to gain privileges via a crafted application, aka "Win32k Window Class Pointer Confusion Vulnerability."
Classification: [()]
-----------------
Input: Piwik 1.1 allows remote attackers to obtain sensitive information via a direct request to a .php file, which reveals the installation path in an error message, as demonstrated by plugins/Widgetize/Widgetize.php and certain other files.
Classification: [('general',)]
-----------------
Input: The CentralAuth extension for MediaWiki before 1.19.10, 1.2x before 1.21.4, and 1.22.x before 1.22.1 allows remote attackers to obtain usernames via vectors related to writing the names to the DOM of a page.
Classification: [('general',)]
-----------------
Input: projectContents.jsp in the Developer 

Input: Memory leak in decode_line_info in dwarf2.c in the Binary File Descriptor (BFD) library (aka libbfd), as distributed in GNU Binutils 2.29, allows remote attackers to cause a denial of service (memory consumption) via a crafted ELF file.
Classification: [('general',)]
-----------------
Input: Cross-site scripting (XSS) vulnerability in PmWiki before 2.1.18 allows remote attackers to inject arbitrary web script or HTML via unspecified vectors involving "table markups".
Classification: [('vulnerability',)]
-----------------
Input: Apache OpenOffice.org (OOo) before 4.0 allows remote attackers to cause a denial of service (memory corruption) or possibly have unspecified other impact via a crafted element in an OOXML document file.
Classification: [('general',)]
-----------------
Input: Novell Netware NWFTPD 5.06.05 allows remote attackers to cause a denial of service (ABEND) via an MDTM command that uses a long path for the target file, possibly due to a buffer overflow.
Classificat

Input: A buffer overflow was discovered in the URL-authentication backend of the Icecast before 2.4.4. If the backend is enabled, then any malicious HTTP client can send a request for that specific resource including a crafted header, leading to denial of service and potentially remote code execution.
Classification: [('general',)]
-----------------
Input: SQL injection vulnerability in the Weblinks (com_weblinks) component in Joomla! allows remote attackers to execute arbitrary SQL commands via the id parameter in a view action to index.php.
Classification: [('vulnerability',)]
-----------------
Input: A vulnerability in the web framework of Cisco Unified Communications Manager could allow an unauthenticated, remote attacker to conduct a cross-site scripting (XSS) attack against a user of the web interface of the affected software. More Information: CSCvb95951. Known Affected Releases: 12.0(0.99999.2). Known Fixed Releases: 11.0(1.23064.1) 11.5(1.12031.1) 11.5(1.12900.21) 11.5(1.12900

Input: Lack of input sanitization in AceManager of ALEOS before 4.12.0, 4.9.5 and 4.4.9 allows disclosure of sensitive system information.
Classification: [('vulnerability',)]
-----------------
Input: Multiple PHP remote file inclusion vulnerabilities in TalkBack 2.2.7 allow remote attackers to execute arbitrary PHP code via a URL in the (1) language_file parameter to (a) comments-display-tpl.php and (b) addons/separate-comments-mod/my-comments-display-tpl.php and the (2) config[comments_form_tpl] parameter to comments-display-tpl.php.
Classification: [('general',)]
-----------------
Input: OpenNMS Horizon and Meridian allows HQL Injection in element/nodeList.htm (aka the NodeListController) via snmpParm or snmpParmValue to addCriteriaForSnmpParm. This affects Horizon before 25.2.1, Meridian 2019 before 2019.1.4, Meridian 2018 before 2018.1.16, and Meridian 2017 before 2017.1.21.
Classification: [('general',)]
-----------------
Input: Format string vulnerability in xv before 3.10a allo

Input: An information disclosure issue was discovered GitLab versions < 12.1.2, < 12.0.4, and < 11.11.6 in the security dashboard which could result in disclosure of vulnerability feedback information.
Classification: [('vulnerability',)]
-----------------
Input: Buffer overflow in errpt in AIX 4.3.3 allows local users to execute arbitrary code as root.
Classification: [('general',)]
-----------------
Input: Unspecified vulnerability in the PeopleSoft Enterprise FSCM component in Oracle PeopleSoft Products 9.0, Bundle, #36, 9.1, Bundle, and #13 allows remote authenticated users to affect confidentiality and integrity via unknown vectors related to eProcurement.
Classification: [('general',)]
-----------------
Input: The SIP channel driver in Asterisk Open Source 1.4.x before 1.4.17, Business Edition before C.1.0-beta8, AsteriskNOW before beta7, Appliance Developer Kit before Asterisk 1.4 revision 95946, and Appliance s800i 1.0.x before 1.0.3.4 allows remote attackers to cause a denial 

Input: Multiple integer overflows in Common Unix Printing System (CUPS) 1.1.14 through 1.1.17 allow remote attackers to execute arbitrary code via (1) the CUPSd HTTP interface, as demonstrated by vanilla-coke, and (2) the image handling code in CUPS filters, as demonstrated by mksun.
Classification: [('general',)]
-----------------
Input: The BIOS in Intel Compute Stick systems based on 6th Gen Intel Core processors prior to version CC047 may allow an attacker with physical access to the system to gain access to personal information.
Classification: [('general',)]
-----------------
Input: Unquoted Windows search path vulnerability in the panda_url_filtering service in Panda Global Protection 17.0.1 allows local users to gain privileges via a malicious artefact.
Classification: [('vulnerability',)]
-----------------
Input: CoreText in Apple iOS before 8.4 and OS X before 10.10.4 allows remote attackers to execute arbitrary code or cause a denial of service (memory corruption) via a craf

In [16]:
# Export every CVE description together with its predicted classification.
fields = ['Description', 'Classification']
output_data = [[description, classification]
               for description, classification in zip(cve_arr, results)]

# newline='' lets the csv module control line endings itself (otherwise blank
# rows can appear on platforms that translate '\n'); an explicit UTF-8 encoding
# avoids depending on the platform default when descriptions contain non-ASCII text.
with open('cve_classified.csv', 'w', newline='', encoding='utf-8') as f:
    write = csv.writer(f)
    write.writerow(fields)
    write.writerows(output_data)