In [55]:
# Parse STIX 1.x data and convert it into a common format
import xml.etree.ElementTree as ET
import json

# Load the STIX 1.x sample document. `root` is the top-level element that the
# rest of this cell reads attributes from, searches, and iterates over.
tree = ET.parse("./dataset/stix1_sample.xml")
root = tree.getroot()

# Known STIX 1.x element names mapped to the field names associated with them.
# The validation step checks the keys against the document's tag names and the
# listed fields against the document's attribute names.
stix1_schema = {
    "STIX_Package": [
        "version", "id", "timestamp",
        "Indicators", "Incidents", "Campaigns", "Threat_Actors",
    ],
    "Indicator": ["id", "timestamp", "Type", "Description", "Observable"],
    "Observable": ["id", "Object", "Title", "Description"],
    "Object": ["Properties"],
    "URIObject": ["Value", "condition", "apply_condition", "type"],
    "Incident": ["id", "timestamp", "Description", "Time"],
    "TTP": ["id", "Title", "Description"],
    "Campaign": ["id", "Title", "Description"],
    "ThreatActor": ["id", "Title", "Description"],
}

# Prefix -> namespace URI map handed to ElementTree's find()/findall() so the
# XPath expressions in this cell can use short prefixes.
ns = dict(
    stix="http://docs.oasis-open.org/cti/ns/stix/core-1",
    indicator="http://docs.oasis-open.org/cti/ns/stix/indicator-1",
    URIObject="http://docs.oasis-open.org/cti/ns/cybox/objects/uri-2",
)

def strip_ns(tag):
    """Return the text of ``tag`` that follows its last ``}``.

    A tag without any ``}`` is returned unchanged.
    """
    _, _, local_name = tag.rpartition("}")
    return local_name

def _escape_pattern_literal(value):
    """Escape backslashes and single quotes so ``value`` is safe inside a quoted pattern string."""
    return value.replace("\\", "\\\\").replace("'", "\\'")


# Package-level identifiers. Only the part of the id after the last ":" is
# kept; a missing or empty id falls back to the placeholder instead of raising.
top_idn = (root.get("id") or "").split(":")[-1] or "ID not found"
version = root.get("version")

# Collect every tag name (namespace stripped) and attribute name in the
# document. This does not depend on the indicator being converted, so it is
# done once rather than once per indicator.
valid_tags = set()
valid_attributes = set()
for node in root.iter():
    valid_tags.add(strip_ns(node.tag))
    valid_attributes.update(node.attrib.keys())

# Same verdict as the per-indicator check it replaces: the document counts as
# valid when any schema element is present, or when the last field examined is
# a known attribute.
# NOTE(review): this passes as soon as one schema tag exists; confirm whether a
# stricter per-element check is wanted.
tags_val = False
attr_val = False
for key, attributes in stix1_schema.items():
    if key not in valid_tags:
        continue
    tags_val = True
    for at in attributes:
        attr_val = at in valid_attributes
document_is_valid = tags_val or attr_val

all_outputs = []

for indicator_elem in root.findall(".//stix:Indicator", ns):
    # A missing id attribute yields None rather than an AttributeError.
    idn = (indicator_elem.get("id") or "").split(":")[-1] or None
    timestamp = indicator_elem.get("timestamp") or ""

    type_elem = indicator_elem.find(".//indicator:Type", ns)
    desc_elem = indicator_elem.find(".//indicator:Description", ns)
    value_elem = indicator_elem.find(".//URIObject:Value", ns)

    # An element that exists but has no text has .text == None; treat it the
    # same as a missing element.
    indicator_type = (type_elem.text or "") if type_elem is not None else ""
    description = (desc_elem.text or "") if desc_elem is not None else ""
    raw_values = value_elem.text if value_elem is not None else None
    if raw_values:
        # Multiple values are packed into one text node separated by "##comma##".
        values = [part.strip() for part in raw_values.split("##comma##") if part.strip()]
    else:
        values = []

    modified_dict = {
        "stix_version": version,
        # The entity type is the tag of the element being converted, not the
        # last matching tag found anywhere in the document.
        "entity_type": strip_ns(indicator_elem.tag),
        "indicator_type": indicator_type,
        "description": description,
        "timestamp": timestamp,
        "values": values,
        "validation": document_is_valid,
    }

    pattern_parts = [f"url:value = '{_escape_pattern_literal(u)}'" for u in values]
    pattern = "[ " + " OR ".join(pattern_parts) + " ]"

    stix2_output = {
        "type": "bundle",
        "spec_version": modified_dict["stix_version"],
        "id": top_idn,
        "objects": [
            {
                "type": modified_dict["entity_type"],
                "id": idn,
                "created": timestamp,
                "modified": timestamp,
                "labels": [indicator_type],
                "description": description,
                "pattern": pattern
            }
        ]
    }

    all_outputs.append(stix2_output)

print(json.dumps(all_outputs, indent=4))


[
    {
        "type": "bundle",
        "spec_version": "stix-1.2.1",
        "id": "package-4cc56b6b-748f-440b-9f01-03bcf3ce7c68",
        "objects": [
            {
                "type": "Indicator",
                "id": "Indicator-db4a6ffe-61f0-488d-85a1-20bd5e360f37",
                "created": "2015-05-15T09:00:00.000000Z",
                "modified": "2015-05-15T09:00:00.000000Z",
                "labels": [
                    "URL Watchlist"
                ],
                "description": "Sample URL Indicator for this watchlist",
                "pattern": "[ url:value = 'http://example.com/foo/malicious1.html' OR url:value = 'http://example.com/foo/malicious2.html' OR url:value = 'http://example.com/foo/malicious3.html' ]"
            }
        ]
    }
]


In [47]:
# STIX 2.x parsing
import json
# Read the STIX 2.x sample bundle. The encoding is pinned to UTF-8 so the
# result does not depend on the platform's locale default.
with open("./dataset/stix20_sample.json", encoding="utf-8") as f:
    item = json.load(f)

# Required property names per STIX 2.x object type. The validation step looks
# an object's "type" up here and checks that each listed property is present.
stix2_validation_schema = {
    object_type: {"required": required_fields}
    for object_type, required_fields in (
        ("bundle", ["type", "id", "objects"]),
        ("common", ["type", "id", "created"]),
        ("malware", ["name", "labels"]),
        ("indicator", ["pattern", "valid_from", "labels"]),
        ("relationship", ["source_ref", "target_ref", "relationship_type"]),
        ("attack-pattern", ["name"]),
        ("tool", ["name", "labels"]),
        ("identity", ["name", "identity_class"]),
        ("report", ["name", "published", "object_refs"]),
        ("course-of-action", ["name"]),
    )
}

# Bundle-level validation. Each flag covers one phase and `validation` is
# True only when all phases pass:
#   ph1 - the document is a "bundle" with a non-empty list of objects
#   ph0 - the bundle carries every property the schema requires of a bundle
#   ph2 - every object is a mapping with both "type" and "id"
#   ph3 - every object whose type appears in the schema has all of that
#         type's required properties (other types are not checked)
# The object-level flags are computed over ALL objects, so a later valid
# object cannot mask an earlier invalid one.
_bundle_is_mapping = isinstance(item, dict)

# .get() avoids a KeyError when the document has no "type" at all.
root_obj = item.get("type") if _bundle_is_mapping else None
_bundle_objects = item.get("objects") if _bundle_is_mapping else None
is_field = True if _bundle_objects else False
typeof_obj = isinstance(_bundle_objects, list)

ph1 = (root_obj == "bundle") and is_field and typeof_obj

ph0 = ph1 and all(
    field in item for field in stix2_validation_schema["bundle"]["required"]
)

ph2 = False
ph3 = False
if ph0:
    ph2 = all(
        isinstance(stix_object, dict) and "type" in stix_object and "id" in stix_object
        for stix_object in _bundle_objects
    )
    if ph2:
        ph3 = all(
            field in stix_object
            for stix_object in _bundle_objects
            if stix_object["type"] in stix2_validation_schema
            for field in stix2_validation_schema[stix_object["type"]]["required"]
        )

validation = ph0 and ph1 and ph2 and ph3

# Accumulates one common-format record per converted STIX 2.x object.
arr_of_data = []

def version_definition(data, is_valid=None):
    """Convert every object of a STIX 2.x bundle into the common record format.

    Parameters
    ----------
    data : dict
        Parsed bundle; its ``"objects"`` list is iterated in order.
    is_valid : bool, optional
        Validation verdict for the bundle. When omitted, the module-level
        ``validation`` flag is used.

    Returns
    -------
    list of dict
        The records created by this call, in object order. They are also
        appended to the module-level ``arr_of_data``. Empty when the bundle
        is not valid.

    Notes
    -----
    ``stix_version`` is 2.1 only when the object (or, failing that, the
    bundle) declares ``spec_version`` "2.1"; anything else is reported as 2.0.
    Malformed input is reported with a printed message; records built before
    the failure are kept.
    """
    if is_valid is None:
        is_valid = validation

    records = []
    if is_valid:
        try:
            bundle_version = data.get("spec_version")

            for stix_object in data["objects"]:
                # Read the declared version instead of inferring 2.1 from the
                # mere presence of a version-like key.
                declared_version = stix_object.get("spec_version") or bundle_version

                labels = stix_object.get("labels", [])
                if not isinstance(labels, list):
                    labels = [labels]

                name = [stix_object.get("name")] if stix_object.get("name") else []

                alias = stix_object.get("x_mitre_aliases") or []
                if not isinstance(alias, list):
                    # A single alias given as a scalar would break the list
                    # concatenation below.
                    alias = [alias]

                refs = stix_object.get("external_references") or []
                er = []
                for r in refs:
                    # Prefer the external id; fall back to the URL.
                    if "external_id" in r:
                        er.append(r["external_id"])
                    elif "url" in r:
                        er.append(r["url"])

                records.append({
                    "stix_version": 2.1 if str(declared_version) == "2.1" else 2.0,
                    "entity_type": stix_object.get("type", "Couldn't find entity type"),
                    "indicator_type": labels,
                    "description": stix_object.get("description", "Description not found"),
                    "timestamp": stix_object.get("created", None),
                    "values": name + alias + er,
                    "validation": is_valid,
                })
        except (KeyError, TypeError, AttributeError) as e:
            print("Error accessing STiX data!", e)

    arr_of_data.extend(records)
    return records

version_definition(item)

idn = item.get("id")

if len(arr_of_data) > 0:
    # Only the most recently converted object is rendered. Records are
    # appended in object order without gaps, so the last record and its source
    # object share the same index. The id is read from the source object here
    # because no module-level `nest_id` is ever assigned.
    last_index = len(arr_of_data) - 1
    modified_dict = arr_of_data[last_index]
    nest_id = item["objects"][last_index].get("id")

    urls = modified_dict["values"] or []
    pattern_parts = []
    for u in urls:
        # Escape backslashes and single quotes so the value cannot terminate
        # the quoted pattern literal early.
        escaped = str(u).replace("\\", "\\\\").replace("'", "\\'")
        pattern_parts.append(f"url:value = '{escaped}'")
    pattern = "[ " + " OR ".join(pattern_parts) + " ]"
    stix2_output = {
        "type": "bundle",
        "spec_version": modified_dict["stix_version"],
        "id": idn,
        "objects": [
            {
                "type": modified_dict["entity_type"],
                "id": nest_id,
                "created": modified_dict["timestamp"],
                "modified": modified_dict["timestamp"],
                "labels": modified_dict["indicator_type"],
                "description": modified_dict["description"],
                "pattern": pattern
            }
        ]
    }
    print(json.dumps(stix2_output, indent=4))

{
    "type": "bundle",
    "spec_version": 2.0,
    "id": "bundle--9ed7099a-63b8-4e49-92c7-547d39aa29e0",
    "objects": [
        {
            "type": "malware",
            "id": "malware--00806466-754d-44ea-ad6f-0caf59cb8556",
            "created": "2018-10-17T00:14:20.652Z",
            "modified": "2018-10-17T00:14:20.652Z",
            "labels": [
                "malware"
            ],
            "description": "[TrickBot](https://attack.mitre.org/software/S0266) is a Trojan spyware program written in C++ that first emerged in September 2016 as a possible successor to [Dyre](https://attack.mitre.org/software/S0024). [TrickBot](https://attack.mitre.org/software/S0266) was developed and initially used by [Wizard Spider](https://attack.mitre.org/groups/G0102) for targeting banking sites in North America, Australia, and throughout Europe; it has since been used against all sectors worldwide as part of \"big game hunting\" ransomware campaigns.(Citation: S2 Grupo TrickBot June 2

In [53]:
import pandas as pd

# `modified_dict` and `stix2_output` are both reassigned by the STIX 2.x cell,
# so the STIX 1.x rows are taken from `all_outputs`, which only the STIX 1.x
# cell writes. Each entry is a bundle; its "objects" become the rows.
stix_1_df = pd.DataFrame(
    [stix_object for bundle in all_outputs for stix_object in bundle["objects"]]
)
stix_2_df = pd.DataFrame(data=stix2_output["objects"])
stix_1_df

Unnamed: 0,type,id,created,modified,labels,description,pattern
0,Indicator,Indicator-db4a6ffe-61f0-488d-85a1-20bd5e360f37,2015-05-15T09:00:00.000000Z,2015-05-15T09:00:00.000000Z,[URL Watchlist],Sample URL Indicator for this watchlist,[ url:value = 'http://example.com/foo/maliciou...


In [54]:
# Display the frame built from the STIX 2.x output's "objects" list.
stix_2_df

Unnamed: 0,type,id,created,modified,labels,description,pattern
0,malware,malware--00806466-754d-44ea-ad6f-0caf59cb8556,2018-10-17T00:14:20.652Z,2018-10-17T00:14:20.652Z,[malware],[TrickBot](https://attack.mitre.org/software/S...,[ url:value = 'TrickBot' OR url:value = 'Trick...
