# Cyberspacy demo
## This notebook serves to give a brief example of some of the current capabilities of cyberspacy.
## Please note that there are many ideas for this package, so stay tuned for more capabilities powered by text processing, LLMs, etc.


In [5]:
# Setup: make the local `cyberspacy` package importable from this notebook.
import sys

# NOTE(review): `displacy` is not used in any of the cells visible here —
# confirm whether it is still needed.
from spacy import displacy

# Add the parent directory to the module search path. The entry is relative,
# so it resolves against the kernel's current working directory.
sys.path.append('..')

# Kept after the `sys.path` change so the package can be found one directory up
# (presumably the repository root — verify).
from cyberspacy.pipelines import PipelineFactory

# Simple confirmation that the import above succeeded.
print('cyberspacy imported...')

cyberspacy imported...


In [2]:
# Once you have cyberspacy and its dependencies,
# you can either use each of its modular pieces directly
# or use the `PipelineFactory` class to select from a number of "out of the box" pipelines.

# This cell only constructs the `PipelineFactory`; a later cell asks it for a
# pipeline that detects URLs and predicts whether they may be malicious.

factory = PipelineFactory()

In [3]:
# Inspect the factory to see which pipeline-building methods it offers.
# `help()` writes its report straight to stdout and returns None, so it is
# called bare: wrapping it in `print()` would emit a stray "None" afterwards.
help(factory)

Help on PipelineFactory in module cyberspacy.pipelines object:

class PipelineFactory(builtins.object)
 |  Methods defined here:
 |
 |  __init__(self)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |
 |  create_malicious_url_classifier_pipeline(self)
 |      Creates a simple pipeline which processes URLs it encounters and applies
 |      a predictive model to determine if the URL might be malicious.
 |      The training of this model was performed with two datasets from Kaggle
 |
 |  create_url_parser_pipeline(self)
 |
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |
 |  __dict__
 |      dictionary for instance variables
 |
 |  __weakref__
 |      list of weak references to the object

None


In [4]:
# Ask the factory for the malicious-URL classifier pipeline; the cells below
# run their example texts through it.

nlp = factory.create_malicious_url_classifier_pipeline()

# Show the names of the components in the pipeline.
print(nlp.pipe_names)

['cyberspacy_url_parser', 'cyberspacy_malicious_url_classifier']


In [12]:
# Process a sentence containing an ordinary URL to see whether the pipeline
# finds it and what it predicts.

doc_A = nlp('My project is also on http://www.github.com as well')

# Iterate over the tokens directly (no index is needed). For every token that
# looks like a URL, print the value of `token._.URL_malicious_classification`.
for token in doc_A:
    print(f'Token: [{token.text}]')

    if token.like_url:
        print('\tToken is a URL... is it malicious???')

        print(f'\ttoken._.URL_malicious_classification: {token._.URL_malicious_classification}')

Token: [My]
Token: [project]
Token: [is]
Token: [also]
Token: [on]
Token: [http://www.github.com]
	Token is a URL... is it malicious???
	token._.URL_malicious_classification: False
Token: [as]
Token: [well]


In [14]:
# A second example, this time with a suspicious-looking URL, to compare the
# pipeline's behavior.

bad_url = 'DO NOT CLICK THIS: http://www.hjlaw.biz/index.php?option=com_user&view=remind'

doc_B = nlp(bad_url)

# Iterate over the tokens directly (no index is needed). For every token that
# looks like a URL, print the value of `token._.URL_malicious_classification`.
for token in doc_B:
    print(f'Token: [{token.text}]')

    if token.like_url:
        print('\tToken is a URL... is it malicious???')

        print(f'\ttoken._.URL_malicious_classification: {token._.URL_malicious_classification}')

Token: [DO]
Token: [NOT]
Token: [CLICK]
Token: [THIS]
Token: [:]
Token: [http://www.hjlaw.biz/index.php?option=com_user&view=remind]
	Token is a URL... is it malicious???
	token._.URL_malicious_classification: True
