# Keyword extraction (supervised model)

In [1]:
import pandas as pd

# Load the per-document tag table; the `name` and `keywords` columns are used
# further down to attach keyword labels to each parsed report.
tags = pd.read_csv(filepath_or_buffer='tags.csv')
tags.head(5)

Unnamed: 0,name,topics,keywords
0,csirt-setting-up-guide-in-estonian,"['CSIRT Cooperation', 'CSIRTs in Europe']",['CSIRTs']
1,csirt-setting-up-guide-in-russian,"['CSIRT Cooperation', 'CSIRTs in Europe']",[]
2,standards-and-tools-for-exchange-and-processin...,"['CSIRT Services', 'Reactive Services']",['CSIRTs']
3,csirt-setting-up-guide-in-irish,"['CSIRT Cooperation', 'CSIRTs in Europe']",['CSIRTs']
4,cyber-europe-2012-key-findings-report,"['Cyber Exercises', 'Cyber Europe']",['Crisis Management']


In [2]:
import ast
import json
import re
from pathlib import Path

# One record per parsed report; the frame is built once after the loop instead
# of being grown row by row with `df.loc[idx] = ...`.
records = []

# Sort the listing: iterdir() yields entries in arbitrary, filesystem-dependent
# order, and the row order feeds the seeded train/test split further down.
for path in sorted(Path("../pdf-reports/plaintext").iterdir()):
    if path.suffix != '.json':
        continue

    # Read as UTF-8 explicitly rather than relying on the platform default.
    with path.open(encoding="utf-8") as json_file:
        parsed = json.load(json_file)
    filename = path.stem

    # Some documents do not have title among metadata.
    title = parsed["metadata"].get("title", filename)

    # The `keywords` cell holds the repr of a Python list (e.g. "['CSIRTs']").
    # literal_eval parses that repr directly; swapping quotes and feeding the
    # result to json.loads fails as soon as a keyword contains an apostrophe.
    # `.iloc[0]` takes the first matching row (as `get_values()[0]` did; that
    # method was removed in pandas 1.0) and still raises IndexError when the
    # file has no entry in `tags`.
    raw_keywords = tags.loc[tags.name == filename, "keywords"].iloc[0]
    keywords = ast.literal_eval(raw_keywords)

    records.append([filename, title, keywords, parsed["content"], parsed["metadata"]])

df = pd.DataFrame(records, columns=["filename", "title", "keywords", "text", "metadata"])

df.head()

Unnamed: 0,filename,title,keywords,text,metadata
0,8th-enisa-workshop-certs-in-europe-report,8th ENISA Workshop ‘CERTs in Europe’,[CSIRTs],\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"{'dc:creator': 'M.A.C. Dekker', 'Last-Save-Dat..."
1,a-collection-of-good-practice-for-cert-quality...,,[],\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"{'dc:creator': 'Henk Bronk', 'Last-Save-Date':..."
2,a-security-analysis-of-next-generation-web-sta...,a-security-analysis-of-next-generation-web-sta...,[],\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"{'dc:creator': 'Lieven Desmet', 'Last-Save-Dat..."
3,actionable-information-for-security,actionable-information-for-security,[CSIRTs],\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,{'resourceName': 'actionable-information-for-s...
4,algorithms-key-size-and-parameters-report-2014,algorithms-key-size-and-parameters-report-2014,[Privacy],\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,{'access_permission:assemble_document': 'true'...


In [3]:
# Number of documents loaded (rows of `df`).
df.shape[0]

351

In [4]:
# Count the keywords attached to each document and tabulate the distribution.
df['num_keywords'] = df['keywords'].map(len)
df['num_keywords'].value_counts()
# Per the output of this cell, 30 documents carry no keyword at all.

1    214
2     61
3     35
0     30
5      7
4      4
Name: num_keywords, dtype: int64

In [5]:
# Keep only the documents that carry at least one keyword label.
has_keywords = df['num_keywords'] > 0
df_keywords = df[has_keywords]
df_keywords.head()

Unnamed: 0,filename,title,keywords,text,metadata,num_keywords
0,8th-enisa-workshop-certs-in-europe-report,8th ENISA Workshop ‘CERTs in Europe’,[CSIRTs],\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"{'dc:creator': 'M.A.C. Dekker', 'Last-Save-Dat...",1
3,actionable-information-for-security,actionable-information-for-security,[CSIRTs],\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,{'resourceName': 'actionable-information-for-s...,1
4,algorithms-key-size-and-parameters-report-2014,algorithms-key-size-and-parameters-report-2014,[Privacy],\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,{'access_permission:assemble_document': 'true'...,1
5,algorithms-key-sizes-and-parameters-report,algorithms-key-sizes-and-parameters-report,[Privacy],\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,{'resourceName': 'algorithms-key-sizes-and-par...,1
6,an-evaluation-framework-for-cyber-security-str...,An evaluation Framework for National Cyber Sec...,[Cyber Security],\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,{'dc:creator': 'Dimitra.Liveri@enisa.europa.eu...,1


In [6]:
from sklearn.preprocessing import MultiLabelBinarizer

# Learn the set of distinct keywords; each one becomes a column of the
# binary label matrix produced by `mlb.transform`.
mlb = MultiLabelBinarizer().fit(df_keywords['keywords'])
print(len(mlb.classes_))
mlb.classes_

51


array(['Big Data', 'CSIRTs', 'Certification', 'Cloud Computing Security',
       'Crisis Management',
       'Critical Information Infrastructure Protection (CIIP)',
       'Cryptography', 'Cyber Security', 'Cyber Threat Intelligence',
       'Cyber Threats', 'Cyber crisis cooperation',
       'Cyber crisis procedures', 'Data protection', 'Digital Skills',
       'ENISA events', 'European Union Institutions', 'Exercises',
       'Finance', 'Good Practice', 'Health', 'Identity & Trust',
       'Incident Reporting', 'Incident Response', 'Internet of things',
       'Managed Services: Metrics', 'Mobile Applications',
       'Mobile Security', 'National Cyber Security Strategies',
       'Network and Information Security Awareness', 'Online Safety',
       'Personal Data', 'Privacy', 'Privacy Tools', 'Procurement',
       'Public Private Partnership', 'Resilience', 'Risk Management',
       'SCADA', 'SMEs', 'Smart Cars', 'Smart Cities', 'Smart Grids',
       'Standards', 'Threat Intelligen

In [7]:
# Encode every document's keyword list as one binary indicator row.
y = mlb.transform(df_keywords['keywords'])
y

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0]])

In [8]:
def remove_url(s):
    """Remove http(s) URLs from ``s``.

    A URL is ``http://`` or ``https://`` followed by every character up to the
    next whitespace. Requiring the ``://`` separator keeps ordinary tokens
    that merely start with "http" (e.g. "httpd", "http-only") in the text.

    Parameters
    ----------
    s : str
        Text to clean.

    Returns
    -------
    str
        ``s`` with each URL replaced by the empty string; surrounding
        whitespace is left untouched.
    """
    return re.sub(r'https?://\S+', '', s)

# Demo: the URL disappears while the words on either side are kept.
url_sample = "Hello https://a72d304c643b011e8b84c061e85c3662-550367788.eu-west-3.elb.amazonaws.com/ world!"
print(remove_url(url_sample))

Hello  world!


In [9]:
def strip_tags(s):
    """Basic regexp based HTML / XML tag stripper function.

    Every ``<...>`` tag and every character entity reference (``&name;``,
    ``&#NNN;`` or ``&#xHH;``) in ``s`` is replaced by a single space.

    For serious HTML/XML preprocessing you should rather use an external
    library such as lxml or BeautifulSoup.

    Parameters
    ----------
    s : str
        Text that may contain markup.

    Returns
    -------
    str
        ``s`` with tags and entities replaced by spaces.
    """
    s = re.sub(r"<[^>]+>", " ", s, flags=re.UNICODE)
    # Match exactly one entity at a time. A greedy "any run of non-space
    # characters between '&' and ';'" pattern would also swallow the text
    # sitting between two entities, e.g. the "b" in "a&nbsp;b&nbsp;c".
    s = re.sub(r"&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);", " ", s)
    return s

# Demo: the tags and both the named and the numeric entity become spaces.
html_sample = "<p>Hello &nbsp; &#039; world!</p>"
print(strip_tags(html_sample))

 Hello     world! 


In [10]:
def preproc(s):
    """Normalise raw document text before tokenisation.

    URLs are removed first, then tags/entities are stripped, and finally the
    result is lower-cased.
    """
    without_urls = remove_url(s)
    without_markup = strip_tags(without_urls)
    return without_markup.lower()

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Two bag-of-words vectorizers with identical preprocessing and the English
# stop-word list: cv1 counts unigrams, cv2 counts bigrams.
cv1, cv2 = (
    CountVectorizer(ngram_range=(n, n), preprocessor=preproc, stop_words='english')
    for n in (1, 2)
)

In [12]:
# Fit each vectorizer on the labelled documents and build its count matrix
# (cv1 first, then cv2).
X1, X2 = (vectorizer.fit_transform(df_keywords['text']) for vectorizer in (cv1, cv2))

In [13]:
# Sanity check: the feature matrices and the label matrix are all derived
# from `df_keywords`, so their row counts should agree.
for matrix in (X1, X2, y):
    print(matrix.shape)

(321, 130085)
(321, 1455436)
(321, 51)


In [14]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

# Train and evaluate one binary naive Bayes classifier per keyword.
for idx, keyword in enumerate(mlb.classes_):
    label = y[:, idx]

    # Skip rare keywords: at least 5 positives are required, which is meant to
    # leave at least 2 positive labels in the train set and the test set each.
    if label.sum() < 5:
        continue

    banner = '-' * 80
    print(banner)
    print("Keyword: " + keyword)
    print(banner)

    # Stratified split, so both parts keep the proportion of positive labels.
    # The bigram counts (X2) are used: the bi-grams work better than the
    # uni-grams (X1) for the naive Bayes model.
    X_train, X_test, y_train, y_test = train_test_split(
        X2, label, test_size=0.33, random_state=42, stratify=label
    )
    print("Number of positive labels in the train (test) set: {} ({})".format(np.sum(y_train), np.sum(y_test)))

    # Train a model.
    clf = MultinomialNB()
    clf.fit(X_train, y_train)

    # Report the same metrics on the training set first, then on the test set;
    # after the loop `y_pred` / `y_prob` hold the test-set predictions.
    for accuracy_msg, X_split, y_split in (
        ("Accuracy on the train set: {:.3f}", X_train, y_train),
        ("Accuracy on the test set: {:.3f}", X_test, y_test),
    ):
        y_pred = clf.predict(X_split)
        # Column 1 of predict_proba is the probability of the positive class.
        y_prob = clf.predict_proba(X_split)[:, 1]
        print(accuracy_msg.format(np.mean(y_pred == y_split)))
        print("Area under the ROC curve: {:.3f}".format(roc_auc_score(y_split, y_prob)))
        print(confusion_matrix(y_split, y_pred))

--------------------------------------------------------------------------------
Keyword: CSIRTs
--------------------------------------------------------------------------------
Number of positive labels in the train (test) set: 48 (24)
Accuracy on the train set: 1.000
Area under the ROC curve: 1.000
[[167   0]
 [  0  48]]
Accuracy on the test set: 0.972
Area under the ROC curve: 0.986
[[80  2]
 [ 1 23]]
--------------------------------------------------------------------------------
Keyword: Cloud Computing Security
--------------------------------------------------------------------------------
Number of positive labels in the train (test) set: 9 (4)
Accuracy on the train set: 1.000
Area under the ROC curve: 1.000
[[206   0]
 [  0   9]]
Accuracy on the test set: 0.877
Area under the ROC curve: 0.799
[[90 12]
 [ 1  3]]
--------------------------------------------------------------------------------
Keyword: Crisis Management
------------------------------------------------------------

Number of positive labels in the train (test) set: 5 (2)
Accuracy on the train set: 1.000
Area under the ROC curve: 1.000
[[210   0]
 [  0   5]]
Accuracy on the test set: 0.877
Area under the ROC curve: 0.938
[[91 13]
 [ 0  2]]
--------------------------------------------------------------------------------
Keyword: Trust service providers
--------------------------------------------------------------------------------
Number of positive labels in the train (test) set: 15 (7)
Accuracy on the train set: 0.995
Area under the ROC curve: 0.997
[[199   1]
 [  0  15]]
Accuracy on the test set: 0.830
Area under the ROC curve: 0.813
[[83 16]
 [ 2  5]]
--------------------------------------------------------------------------------
Keyword: eID
--------------------------------------------------------------------------------
Number of positive labels in the train (test) set: 14 (7)
Accuracy on the train set: 0.995
Area under the ROC curve: 0.998
[[200   1]
 [  0  14]]
Accuracy on the test set: 0