# Import Dependencies

In [None]:
import random
import ast
import csv
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

# Load Annotated Data from Cybertweet Dataset

In [None]:
# Load the annotated tweets and keep only the rows that have no missing value.
df = pd.read_csv('./twitter_data.csv').dropna()

In [None]:
# Notebook display: render the loaded DataFrame as this cell's output.
df

# Preprocess Input Data:

### Convert the `type` column values from strings such as `"['botnet']"` into lists such as `['botnet']`

In [None]:
# Each 'type' cell holds a stringified list (e.g. "['Botnet']"). Parse it into
# a real list and lower-case every label so that 'vulnerability' and
# 'Vulnerability' are treated as the same type.
new_data = [
    [label.lower() for label in ast.literal_eval(raw_labels)]
    for raw_labels in df['type']
]

df['type'] = new_data

# Encoding Input Data

### One-Hot Encoding

In [None]:
# Fit the binarizer on the label lists and encode them as an indicator matrix
# (one column per distinct label, see multilabel.classes_).
multilabel = MultiLabelBinarizer()
types = df['type']
types_encoded = multilabel.fit_transform(types)

In [None]:
# Notebook display: show the encoded label matrix as a table whose column
# headers are the binarizer's class names.
pd.DataFrame(types_encoded, columns=multilabel.classes_)

### Vectorize Input Text into a Sparse Matrix

In [None]:
# TfidfVectorizer's max_features argument could cap the vocabulary size
# (smaller feature space, faster training); it is deliberately left unset here.
# NOTE(review): the vectorizer is fitted on every row of df['text'], i.e. before
# any train/test split, so held-out text contributes to the vocabulary/IDF weights.
tweet_texts = df['text']
tfidf = TfidfVectorizer(analyzer='word')
text_vectorized = tfidf.fit_transform(tweet_texts)

In [None]:
# Notebook display: (feature-matrix shape, label-matrix shape) for visual inspection.
text_vectorized.shape, types_encoded.shape

### Create Testing and Training Datasets

In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# identical from run to run.
text_Train, text_Test, type_Train, type_Test = train_test_split(
    text_vectorized,
    types_encoded,
    test_size=0.2,
    random_state=0,
)

# Model Performance Measurement Functions

In [None]:
def j_score(y_true, y_pred, empty_score=1.0):
    """Return the mean per-row Jaccard similarity, expressed as a percentage.

    For every row the score is ``sum(min(true, pred)) / sum(max(true, pred))``,
    i.e. |intersection| / |union| for binary indicator rows.

    Args:
        y_true: 2-D array-like of ground-truth indicators (one row per sample).
        y_pred: 2-D array-like of predicted indicators, same shape as ``y_true``.
        empty_score: Score in [0, 1] given to a row whose union is empty (no
            label set in either input). Such a row would otherwise be 0/0 and
            turn the whole mean into NaN. Defaults to 1.0 because two empty
            label sets agree exactly.

    Returns:
        The mean of the per-row scores multiplied by 100.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    intersection = np.minimum(y_true, y_pred).sum(axis=1)
    union = np.maximum(y_true, y_pred).sum(axis=1)

    # Pre-fill with the empty-row score, then divide only where the union is
    # non-zero so that no 0/0 is ever evaluated.
    jaccard = np.full(union.shape, empty_score, dtype=float)
    np.divide(intersection, union, out=jaccard, where=union > 0)
    return jaccard.mean() * 100

def print_score(y_pred, clf):
    """Print an estimator's class name and the Jaccard score of its predictions.

    Args:
        y_pred: Predicted indicator matrix; it is scored against the
            module-level ``type_Test`` labels.
        clf: Estimator object; only its class name is used, as a label.
    """
    estimator_name = type(clf).__name__
    print('clf: ', estimator_name)

    score = j_score(type_Test, y_pred)
    print('Jacard score: {}'.format(score))

    print('----')

# Build and Train Model

In [None]:
# Candidate base estimators to compare.
# SGDClassifier and LinearSVC accept a random_state that seeds their internal
# data shuffling; fixing it makes the reported scores reproducible from run to
# run, in line with the seeded train/test split.
sgd = SGDClassifier(random_state=0)
lr = LogisticRegression(solver='lbfgs')
svc = LinearSVC(random_state=0)

In [None]:
# Compare performance: wrap each base estimator in a one-vs-rest scheme, fit it
# on the training split and report its score on the test split.
# `clf` stays bound to the last fitted model after the loop; later cells use it.
for base_estimator in (sgd, lr, svc):
    clf = OneVsRestClassifier(base_estimator).fit(text_Train, type_Train)
    pred = clf.predict(text_Test)
    print_score(pred, base_estimator)

# Test with Sample Data

In [None]:
# Keep prompting until the user types exactly 'stop' (the sentinel ends the loop).
for user_input in iter(lambda: input('Enter Some Text:'), 'stop'):
    # Vectorize the text with the already-fitted TF-IDF vocabulary.
    input_transform = tfidf.transform([user_input])
    # Predict the indicator row and decode it back into label names.
    predicted_labels = multilabel.inverse_transform(clf.predict(input_transform))
    print('Classification:', predicted_labels)

# Test with CVE Data

In [None]:
# Load the CVE descriptions and drop rows with missing values.
df_2 = pd.read_csv('./cve_data_description_only.csv')
df_2.dropna(inplace=True)

# Descriptions containing this marker are skipped
# (presumably placeholder entries such as "** RESERVED **" -- TODO confirm).
exclude_str = '**'

# Filter, then remove duplicates. dict.fromkeys() keeps the first occurrence of
# each description in file order, whereas list(set(...)) yields a hash-dependent
# order that changes between interpreter runs (and with it the printed sample
# and the row order of the exported CSV).
cve_arr = list(dict.fromkeys(
    cve for cve in df_2['Description'] if exclude_str not in cve
))

In [None]:
# Classify every CVE description with the fitted model and keep the results as
# strings, aligned index-by-index with cve_arr.
results = []

# Skip the model entirely for an empty batch (scikit-learn estimators generally
# reject zero-sample input).
if cve_arr:
    # One vectorized transform/predict call for the whole batch instead of one
    # call per description.
    cve_features = tfidf.transform(cve_arr)
    cve_predictions = multilabel.inverse_transform(clf.predict(cve_features))
    # Wrap each label tuple in a list before str() so the text keeps the
    # "[('label', ...)]" form produced by classifying one description at a time.
    results = [str([labels]) for labels in cve_predictions]

# Echo only the first 500 classifications to keep the cell output readable.
for cve, prediction_str in zip(cve_arr[:500], results):
    print('Input:', cve)
    print('Classification:', prediction_str)
    print('-----------------')


In [None]:
# Export each description with its classification string to a CSV file.
fields = ['Description', 'Classification']
output_data = [
    [description, classification]
    for description, classification in zip(cve_arr, results)
]

# newline='' lets the csv module control line endings itself (otherwise extra
# blank rows appear on platforms that translate '\n', and embedded newlines in
# quoted fields can be mangled). An explicit UTF-8 encoding avoids
# platform-dependent failures on non-ASCII description text.
with open('cve_classified.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(fields)
    writer.writerows(output_data)