# Keyword extraction (supervised model)

In [1]:
# These are the keywords extracted from the metadata provided by the Tika parser.
# The list is certainly incomplete (check against https://www.enisa.europa.eu/publications)
# Some of the keywords do not make sense...
KEYWORDS =\
[['Article 13a'],
 ['Big Data'],
 ['Business Continuity'],
 ['Charalampos Koutsouris'],
 ['DRAFT'],
 ['Data Protection'],
 ['ENISA', 'European Network and Information Security Agency'],
 ['GDPR'],
 ['IT Continuity'],
 ['IT Service Continuity'],
 ['Industrial Control Systems'],
 ['Louis Marinos'],
 ['Micro-Enterprises'],
 ['Obrela Security Industries'],
 ['Personal Data'],
 ['Privacy'],
 ['Report on Cyber Security Information Sharing in the Energy Sector'],
 ['Resilience'],
 ['Resilient technologies'],
 ['Risk'],
 ['Security'],
 ['Simone Bal'],
 ['Small and Medium Enterprises', 'SME', 'SMEs'],
 ['Template'],
 ['Training'],
 ['Trust service', 'Trust services'],
 ['e-Government'],
 ['e-delivery'],
 ['e-document'],
 ['e-signature'],
 ['enisa botnets detection measurement disinfection defence'],
 ['honeypots'],
 ['time stamping']]

In [2]:
KEYWORDS = [list(map(str.lower, x)) for x in KEYWORDS]
len(KEYWORDS)

33

In [3]:
def unique_keyword(keyword):
    for group in KEYWORDS:
        if keyword.strip().lower() in group:
            return group[0]

In [4]:
import pandas as pd
from pathlib import Path
import json
import re

df = pd.DataFrame(columns=["filename", "title", "keywords", "text", "metadata"])

idx = 0
for path in Path("../pdf-reports/plaintext").iterdir():
    if path.suffix == '.json':
        with path.open() as json_file:
            parsed = json.load(json_file)
            filename = path.stem
            
            # Some documents do not have title among metadata.
            title = parsed["metadata"]["title"] if "title" in parsed["metadata"].keys() else filename
            
            # Some documents do not have keywords among metadata.
            if "meta:keyword" in parsed["metadata"].keys():
                keywords = [unique_keyword(x) for x in re.split(",|;", parsed["metadata"]["meta:keyword"])
                           if unique_keyword(x)]
            else:
                keywords = []
                        
            df.loc[idx] = [filename, title, keywords, parsed["content"], parsed["metadata"]]
            idx += 1

df.head()

Unnamed: 0,filename,title,keywords,text,metadata
0,8th-enisa-workshop-certs-in-europe-report,8th ENISA Workshop ‘CERTs in Europe’,[],\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,{'X-Parsed-By': ['org.apache.tika.parser.Defau...
1,a-collection-of-good-practice-for-cert-quality...,,[],\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,{'X-Parsed-By': ['org.apache.tika.parser.Defau...
2,a-security-analysis-of-next-generation-web-sta...,a-security-analysis-of-next-generation-web-sta...,[],\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,{'X-Parsed-By': ['org.apache.tika.parser.Defau...
3,actionable-information-for-security,actionable-information-for-security,[],\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,{'X-Parsed-By': ['org.apache.tika.parser.Defau...
4,algorithms-key-size-and-parameters-report-2014,algorithms-key-size-and-parameters-report-2014,[],\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,{'X-Parsed-By': ['org.apache.tika.parser.Defau...


In [5]:
len(df)

351

In [6]:
df = df
df['num_keywords'] = df.keywords.apply(lambda x: len(x))
df['num_keywords'].value_counts()
# 292 documents are without a keyword!!!

0     292
2      43
1      10
11      2
8       1
6       1
5       1
3       1
Name: num_keywords, dtype: int64

In [7]:
df_keywords = df[df.num_keywords > 0]
df_keywords.head()

Unnamed: 0,filename,title,keywords,text,metadata,num_keywords
7,annex-i,Protecting Industrial Control Systems,"[enisa, industrial control systems]",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,{'X-Parsed-By': ['org.apache.tika.parser.Defau...,2
8,annex-ii,Protecting Industrial Control Systems,"[enisa, industrial control systems]",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,{'X-Parsed-By': ['org.apache.tika.parser.Defau...,2
9,annex-iii,Protecting Industrial Control Systems,"[enisa, industrial control systems]",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,{'X-Parsed-By': ['org.apache.tika.parser.Defau...,2
10,annex-iv,Protecting Industrial Control Systems,"[enisa, industrial control systems]",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,{'X-Parsed-By': ['org.apache.tika.parser.Defau...,2
11,annex-v,Protecting Industrial Control Systems,"[enisa, industrial control systems]",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,{'X-Parsed-By': ['org.apache.tika.parser.Defau...,2


In [8]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
mlb.fit(df_keywords.keywords)
mlb.classes_

array(['article 13a', 'big data', 'business continuity',
       'charalampos koutsouris', 'data protection', 'draft', 'e-delivery',
       'e-document', 'e-government', 'e-signature', 'enisa',
       'enisa botnets detection measurement disinfection defence', 'gdpr',
       'honeypots', 'industrial control systems', 'it continuity',
       'it service continuity', 'louis marinos', 'micro-enterprises',
       'obrela security industries', 'personal data', 'privacy',
       'report on cyber security information sharing in the energy sector',
       'resilience', 'resilient technologies', 'risk', 'security',
       'simone bal', 'small and medium enterprises', 'template',
       'time stamping', 'training', 'trust service'], dtype=object)

In [9]:
y = mlb.transform(df_keywords.keywords)
y

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 1]])

In [10]:
def remove_url(s):
    return re.sub(r'http\S+', '', s, flags=re.MULTILINE)

print(remove_url("Hello https://a72d304c643b011e8b84c061e85c3662-550367788.eu-west-3.elb.amazonaws.com/ world!"))

Hello  world!


In [11]:
def strip_tags(s):
    """Basic regexp based HTML / XML tag stripper function
    For serious HTML/XML preprocessing you should rather use an external
    library such as lxml or BeautifulSoup.
    """
    s = re.compile(r"<([^>]+)>", flags=re.UNICODE).sub(" ", s)
    #s = re.compile(r"\&\w+\;").sub(" ", s)
    s = re.compile(r"\&[^ ]+\;").sub(" ", s)
    return s

print(strip_tags("<p>Hello &nbsp; &#039; world!</p>"))

 Hello     world! 


In [12]:
def preproc(s):
    return strip_tags(remove_url(s)).lower()

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

cv1 = CountVectorizer(ngram_range=(1, 1), preprocessor=preproc, stop_words='english')
cv2 = CountVectorizer(ngram_range=(2, 2), preprocessor=preproc, stop_words='english')

In [14]:
X1 = cv1.fit_transform(df_keywords['text'])
X2 = cv2.fit_transform(df_keywords['text'])

In [15]:
#from sklearn.neighbors import KNeighborsClassifier

#neigh = KNeighborsClassifier(n_neighbors=10)
#neigh.fit(X_train, y_train)
#y_pred = neigh.predict(X_test)

In [16]:
print(X1.shape)
print(X2.shape)
print(y.shape)

(59, 22678)
(59, 318874)
(59, 33)


In [17]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

# Build a model for each keyword
for idx, keyword in enumerate(mlb.classes_):
    label = y[:, idx]
    # Make sure there is at least 1 positive label for the training set and the test set each.
    if np.sum(label) < 2:
        continue
    print('-' * 80)
    print("Keyword: " + keyword)
    print('-' * 80)
    
    # Split the dataset into training and test set while preserving the proportion of labels.
    # The uni-grams work better than bi-grams for the naive Bayes model.
    X_train, X_test, y_train, y_test = train_test_split(X1, label, test_size=0.33, random_state=42, stratify=label)
    #X_train, X_test, y_train, y_test = train_test_split(X2, label, test_size=0.33, random_state=42, stratify=label)
    print("Number of positive labels in the train (test) set: {} ({})".format(np.sum(y_train), np.sum(y_test)))
    
    # Train a model.
    clf = MultinomialNB().fit(X_train, y_train)
    
    # Print the performance metrics on the training set.
    y_pred = clf.predict(X_train)
    y_prob = clf.predict_proba(X_train)[:, 1]
    print("Accuracy on the train set: {:.3f}".format(np.mean(y_pred == y_train)))
    print("Area under the ROC curve: {:.3f}".format(roc_auc_score(y_train, y_prob)))
    print(confusion_matrix(y_train, y_pred))
    
    # Print the performance metrics on the test set.
    y_pred = clf.predict(X_test)
    y_prob = clf.predict_proba(X_test)[:, 1]
    print("Accuracy on the test set: {:.3f}".format(np.mean(y_pred == y_test)))
    print("Area under the ROC curve: {:.3f}".format(roc_auc_score(y_test, y_prob)))
    print(confusion_matrix(y_test, y_pred))

--------------------------------------------------------------------------------
Keyword: business continuity
--------------------------------------------------------------------------------
Number of positive labels in the train (test) set: 2 (1)
Accuracy on the train set: 1.000
Area under the ROC curve: 1.000
[[37  0]
 [ 0  2]]
Accuracy on the test set: 1.000
Area under the ROC curve: 1.000
[[19  0]
 [ 0  1]]
--------------------------------------------------------------------------------
Keyword: charalampos koutsouris
--------------------------------------------------------------------------------
Number of positive labels in the train (test) set: 1 (1)
Accuracy on the train set: 1.000
Area under the ROC curve: 1.000
[[38  0]
 [ 0  1]]
Accuracy on the test set: 0.950
Area under the ROC curve: 0.500
[[19  0]
 [ 1  0]]
--------------------------------------------------------------------------------
Keyword: enisa
-----------------------------------------------------------------------