## 1. Import the given Xview Labels
***Ref: https://github.com/DIUx-xView/data_utilities/blob/master/xview_class_labels.txt***

In [1]:
# Ensure label.txt file is downloaded into the working directory 
file_name = 'xview_class_labels.txt'

In [2]:
# Create the Xview Data dictionary for comparison
xview_label_dict = {}

with open('xview_class_labels.txt') as file:
    for line in file:
        line = line.rstrip('\n')
        (key, value) = line.split(':')
        xview_label_dict[int(key)] = value

In [3]:
# print each dictionary item
print("" + "\n".join("{!r}: {!r},".format(k, v) for k, v in xview_label_dict.items()) + "}")

11: 'Fixed-wing Aircraft',
12: 'Small Aircraft',
13: 'Passenger/Cargo Plane',
15: 'Helicopter',
17: 'Passenger Vehicle',
18: 'Small Car',
19: 'Bus',
20: 'Pickup Truck',
21: 'Utility Truck',
23: 'Truck',
24: 'Cargo Truck',
25: 'Truck Tractor w/ Box Trailer',
26: 'Truck Tractor',
27: 'Trailer',
28: 'Truck Tractor w/ Flatbed Trailer',
29: 'Truck Tractor w/ Liquid Tank',
32: 'Crane Truck',
33: 'Railway Vehicle',
34: 'Passenger Car',
35: 'Cargo/Container Car',
36: 'Flat Car',
37: 'Tank car',
38: 'Locomotive',
40: 'Maritime Vessel',
41: 'Motorboat',
42: 'Sailboat',
44: 'Tugboat',
45: 'Barge',
47: 'Fishing Vessel',
49: 'Ferry',
50: 'Yacht',
51: 'Container Ship',
52: 'Oil Tanker',
53: 'Engineering Vehicle',
54: 'Tower crane',
55: 'Container Crane',
56: 'Reach Stacker',
57: 'Straddle Carrier',
59: 'Mobile Crane',
60: 'Dump Truck',
61: 'Haul Truck',
62: 'Scraper/Tractor',
63: 'Front loader/Bulldozer',
64: 'Excavator',
65: 'Cement Mixer',
66: 'Ground Grader',
71: 'Hut/Tent',
72: 'Shed',
73: 'Buil

## 2. Generate Labels with Text Embedding for the Xview Data

In [4]:
# create a function to find the intersection of 2 lists
# Reference: https://www.geeksforgeeks.org/python-intersection-two-lists/
def intersection(lst1, lst2):
    lst3 = [value for value in lst1 if value in lst2]
    return lst3

def parent_labels(label_dict, target, omit_list = []):
    '''
    parent_labels takes a data dictionary with labels as keys and a list of features and returns a list containing all
    common features between the keys excluding the target and any key on the omit list.
    
    INPUTS:
    label_dict: the dictionary containing all labels and features in the sub class
    target: a string containing the target label
    omit_list: a list of keys to exclude from the feature selection
    
    OUTPUT:
    feature_list: a list conainting all common features in the data dicitonary excluding the target and the omit_list
    '''
    # create the exclude list
    exclude_list = [target]
    exclude_list.extend(omit_list)
    
    # create a list of keys in the label dictionary
    key_list = []
    for key in label_dict.keys():
        key_list.append(key)
        
    # remove keys on the exclude list
    for label in exclude_list:
        key_list.remove(label)
    
    # polulate the feature list
    feature_list = []
    if len(key_list) == 1:
        feature_list = label_dict[key_list[0]]
    elif len(key_list) == 2:
        feature_list = intersection(label_dict[key_list[0]], label_dict[key_list[1]])
    else:
        i = 0
        while i < (len(key_list) - 1):
            if i == 0:
                feature_list = intersection(label_dict[key_list[i]], label_dict[key_list[i+1]])
                i += 1
            else:
                feature_list = intersection(label_dict[key_list[i]], feature_list)
                i += 1
    
    # return the final feature list
    return feature_list

In [5]:
# Create the dictionary of aircraft labels
civ_aircraft_sub_dict = {'Fixed-wing Aircraft' : [],
                         'Small Aircraft' : ['Fixed-wing Aircraft', 'wings', 'elevators', 'rudder', 'tail', 'cockpit', 'single engine', 'white', 'red', 'blue', 'silver', 'fuselage'],
                         'Passenger/Cargo Plane' : ['Fixed-wing Aircraft', 'wings', 'elevators', 'rudder', 'tail',  'cockpit', 'multi engine', 'white', 'red', 'blue', 'silver', 'fuselage'],
                         'Helicopter' : ['main rotor', 'tail rotor', 'rudder', 'tail',  'cockpit', 'red', 'white', 'blue', 'fuselage']}

# Create the parent label (common attributes between sub labels)
omit_list = ['Helicopter']
civ_aircraft_sub_dict['Fixed-wing Aircraft'] = parent_labels(civ_aircraft_sub_dict, 'Fixed-wing Aircraft', omit_list)

In [6]:
# Create the dictionary of passenger vehicle labels
passenger_sub_dict = {'Passenger Vehicle' : [],
                      'Small Car' : ['1:3 ratio', 'Passenger Vehicle', 'wheels', 'windows', 'single cab', 'full cab', '2 axles', 'doors', 'hood', 'trunk', 'white', 'black', 'silver', 'red', 'blue', 'small'],
                      'Bus' : ['1:3 ratio', 'Passenger Vehicle', 'wheels', 'windows', 'single cab', 'full cab', 'doors', '3 axles', 'long', 'rectangle', 'white', 'silver', 'red', 'blue']}

# Create the parent lable (common attributes between sub labels)
passenger_sub_dict['Passenger Vehicle'] = parent_labels(passenger_sub_dict, 'Passenger Vehicle')

In [7]:
# Create the dictionary of truck labels
truck_sub_dict = {'Pickup Truck' : ['1:3 ratio', 'Truck', 'wheels', 'windows', 'single cab', 'full cab', '2 axles', 'doors', 'hood', 'bed', 'white', 'black', 'silver', 'red', 'blue'],
                  'Utility Truck' : ['1:3 ratio', 'Truck', 'wheels', 'windows', 'single cab', 'full cab', '2 axles', 'doors', 'hood', 'bed', 'box', 'crane', 'winch', 'white'],
                  'Truck' : [],
                  'Cargo Truck' : ['Truck', 'wheels', 'windows', 'single cab', 'full cab', '2 axles', 'doors', 'bed', 'box', 'Shipping Container', 'tractor', 'trailer', 'long', 'rectangle'],
                  'Truck Tractor w/ Box Trailer' : ['Truck', 'wheels', 'windows', 'single cab', 'full cab', '5 axles', 'doors', 'bed', 'box', 'trailer', 'Shipping Container'],
                  'Truck Tractor' : ['Truck', 'Shipping Container', 'wheels', 'windows', 'single cab', 'full cab', '3 axles', 'doors'],
                  'Trailer' : ['Shipping Container', 'wheels', 'bed', 'box', 'trailer', 'flat', '2 axles'],
                  'Truck Tractor w/ Flatbed Trailer' : ['Truck', 'wheels', 'windows', 'single cab', 'full cab', '5 axles', 'doors', 'bed', 'flat', 'trailer'],
                  'Truck Tractor w/ Liquid Tank' : ['Truck', 'wheels', 'windows', 'single cab', 'full cab', '5 axles', 'doors', 'bed', 'tube', 'tank', 'trailer'],
                  'Crane Truck' : ['Truck', 'wheels', 'windows', 'single cab', 'full cab', '3 axles', 'doors', 'boom', 'crane', 'outriggers', 'carrier', 'red', 'blue', 'white', 'yellow', 'orange', 'rectangle']}

# Create the parent label (common attributes between sub labels)
omit_list = ['Trailer']
truck_sub_dict['Truck'] = parent_labels(truck_sub_dict, 'Truck', omit_list)

In [8]:
# Create the dictionary of truck labels
rail_sub_dict = {'Railway Vehicle' : [],
                 'Passenger Car' : ['Railway Vehicle', 'wheelset', 'rail', 'long', 'windows', 'rectangle'],
                 'Cargo/Container Car' : ['Railway Vehicle', 'wheelset', 'rail', 'long', 'box', 'Shipping Container', 'rectangle'],
                 'Flat Car' : ['Railway Vehicle', 'wheelset', 'rail', 'long', 'bed', 'flat', 'rectangle'],
                 'Tank car' : ['Railway Vehicle', 'wheelset', 'rail', 'long', 'tube', 'cylinder', 'rectangle'],
                 'Locomotive' : ['Railway Vehicle', 'wheelset', 'rail', 'long', 'engine', 'stack', 'rectangle']}

# Create the parent label (common attributes between sub labels)
rail_sub_dict['Railway Vehicle'] = parent_labels(rail_sub_dict, 'Railway Vehicle')

In [9]:
maritime_sub_dict = {'Maritime Vessel' : [],
                     'Motorboat': ['Maritime Vessel', 'water', 'small', 'wake', 'engine', 'sharp', 'white'],
                     'Sailboat': ['Maritime Vessel', 'water', 'small', 'sails', 'sharp', 'white'],
                     'Tugboat': ['Maritime Vessel', 'water', 'small', 'wake', 'engine', 'round', 'red', 'yellow', 'orange', 'white', 'blue', 'fender', 'wide', 'bridge', 'stack'],
                     'Barge': ['Maritime Vessel', 'water', 'medium', 'rectangle', 'red', 'yellow', 'orange', 'blue', 'grey', 'wide'],
                     'Fishing Vessel': ['Maritime Vessel', 'water', 'medium', 'wake', 'engine', 'round', 'red', 'yellow', 'orange', 'white', 'blue', 'green', 'gantry', 'masthead', 'bridge', 'stack'],
                     'Ferry': ['Maritime Vessel', 'water', 'medium', 'wake', 'engine', 'round', 'windows', 'yellow', 'orange', 'white', 'bridge', 'stack', 'wide'],
                     'Yacht': ['Maritime Vessel', 'water', 'medium', 'wake', 'engine', 'sharp', 'white'],
                     'Container Ship': ['Maritime Vessel', 'water', 'large', 'wake', 'engine', 'rectangle', 'white', 'blue', 'red', 'Shipping Container', 'stack', 'bridge'],
                     'Oil Tanker': ['Maritime Vessel', 'water', 'large', 'wake', 'engine', 'round', 'white', 'green', 'red', 'flat', 'stack', 'bridge']}

# Create the parent label (common attributes between sub labels)
maritime_sub_dict['Maritime Vessel'] = parent_labels(maritime_sub_dict, 'Maritime Vessel')

In [10]:
engineering_sub_dict = {'Engineering Vehicle' : [],
                        'Tower crane' : ['Engineering Vehicle', 'fixed', 'counter weight', 'crane', 'boom', 'single cab', 'half cab', 'narrow', 'long', 'thin'],
                        'Container Crane' : ['Engineering Vehicle', 'Shipping Container','fixed', 'counter weight', 'crane', 'boom', 'half cab', 'spreader', 'wide', 'long', '2 axles'],
                        'Reach Stacker' : ['Engineering Vehicle', 'crane', 'boom', 'single cab', 'half cab', '2 axles', 'spreader', 'wheels', 'windows', 'doors', 'red', 'blue', 'white', 'yellow', 'orange'],
                        'Straddle Carrier' : ['Engineering Vehicle', 'single cab', 'half cab', 'spreader', 'wheels', 'windows', 'doors', 'red', 'blue', 'white', 'yellow', 'orange', 'square', 'flat'],
                        'Mobile Crane' : ['Engineering Vehicle', 'wheels', 'tracks', 'windows', 'multi cab', 'half cab', 'full cab', 'doors', 'boom', 'crane', 'outriggers', 'carrier', 'red', 'blue', 'white', 'yellow', 'orange', 'rectangle'],
                        'Dump Truck' : ['Engineering Vehicle', 'wheels', 'windows', 'full cab', 'single cab', '3 axles', 'doors', 'haul bed', 'large', 'square', 'rectangle', 'red', 'blue', 'white', 'yellow', 'orange'],
                        'Haul Truck' : ['Engineering Vehicle', 'wheels', 'windows', 'full cab', 'single cab', '3 axles', 'doors', 'haul bed', 'large', 'square', 'rectangle', 'red', 'blue', 'white', 'yellow', 'orange'],
                        'Scraper/Tractor' : ['Engineering Vehicle', 'wheels', 'windows', 'half cab', 'single cab', '2 axles', 'doors', 'bowl', 'tractor', 'scraper', 'yellow', 'rectangle', 'trailer'],
                        'Front loader/Bulldozer' : ['Engineering Vehicle', 'wheels', 'windows', 'full cab', 'single cab', 'blade', 'tracks', '2 axles', 'doors', 'scoop', 'yellow', 'orange', 'green'],
                        'Excavator' : ['Engineering Vehicle', 'tracks', 'windows', 'half cab', 'single cab', 'doors', 'boom', 'bucket', 'arm', 'square', 'yellow', 'orange', 'red', 'white', 'blade'],
                        'Cement Mixer' : ['Engineering Vehicle', 'wheels', 'windows', 'full cab', 'single cab', '3 axles', 'doors', 'hood', 'water tank', 'barrel', 'hopper', 'rectangle', 'red', 'blue', 'white', 'yellow', 'orange'],
                        'Ground Grader' : ['Engineering Vehicle', 'wheels', 'windows', 'full cab', 'single cab', '3 axles', 'doors', 'blade', 'frame', 'yellow', 'red', 'orange', 'rectangle']}

# Create the parent lable (common attributes between sub labels)
engineering_sub_dict['Engineering Vehicle'] = parent_labels(engineering_sub_dict, 'Engineering Vehicle')

In [11]:
building_sub_dict = {'Hut/Tent' : ['Building', 'canvas', 'curved', 'small'],
                     'Shed' : ['Building', 'walls', 'small', 'roof', 'flat', 'corners', 'straight'],
                     'Building' : ['Building', 'medium', 'roof', 'flat', 'corners', 'straight', 'walls'],
                     'Aircraft Hangar' : ['Building', 'walls', 'curved', 'white', 'grey'],
                     'Damaged Building' : ['Building', 'walls', 'debris', 'damage'],
                     'Facility' : ['Building', 'walls', 'roof', 'flat', 'corners', 'straight']}

In [12]:
other_struct_sub_dict = {'Construction Site': ['walls', 'engineering Vehicle', 'dirt', 'debris', 'fence'],
                         'Vehicle Lot'  : ['Passenger Vehicle', 'truck', 'wheels', 'tracks', 'lights', 'fence', 'flat', 'concrete', 'asphalt', 'dirt'],
                         'Helipad' : ['H', 'green', 'red', 'yellow', 'circle', 'flat', 'lights'],
                         'Storage Tank' : ['round', 'cylinder', 'tube', 'white', 'yellow', 'blue'],
                         'Shipping container lot' : ['Shipping Container', 'flat', 'concrete', 'dirt', 'asphalt', 'lights', 'fence', 'truck'],
                         'Shipping Container' : ['rectangle', 'box', 'small', 'red', 'green', 'yellow', 'blue'],
                         'Pylon' : ['lines', 'grey', 'tall'],
                         'Tower' : ['tall', 'narrow', 'stack']}

In [13]:
# Changed dictionary format for ease of tokenizing
'''
xview_embed_dict = {'Civilian Aircraft' : civ_aircraft_sub_dict,
                    'Civilian Passenger Vehicle' : passenger_sub_dict,
                    'Civilian Truck' : truck_sub_dict,
                    'Railway Vehicle' : rail_sub_dict,
                    'Maritime Vessels' : maritime_sub_dict,
                    'Civilian Engineering' : engineering_sub_dict,
                    'Buildings' : building_sub_dict,
                    'Other Structures' : other_struct_sub_dict}
'''

xview_embed_dict = {**civ_aircraft_sub_dict,
                    **passenger_sub_dict,
                    **truck_sub_dict,
                    **rail_sub_dict,
                    **maritime_sub_dict,
                    **engineering_sub_dict,
                    **building_sub_dict,
                    **other_struct_sub_dict}

## 3. Ensure the given Xview Labels and the Generated Embedding Dictionary Match

In [14]:
# Extract the labels from the Xview Embedding Dictionary
xview_labels = [] # create a list to store possible lables

for key in xview_embed_dict.keys():
    xview_labels.append(key)

In [15]:
# Ensure the Embedded dictionary is the same length as the Xview Labels

if len(xview_label_dict) == len(xview_embed_dict):
    print('The Embedded Dictionary and Label Dictionary are the same size.')
else:
    print('ERROR: The Embedded Dictionary and Label Dictionary are NOT the same size.')

The Embedded Dictionary and Label Dictionary are the same size.


In [16]:
# Ensure the Labels in Embedded dictionary match the given Xview Labels

i = 0

for key in xview_label_dict.keys():
    label_value = xview_label_dict[key]
    if label_value == xview_labels[i]:
        i += 1
    else:
        print(label_value + ' has no match in the embedded dictionary')
        print('Embedded dictionary returns : ' + xview_labels[i] + '\n')
        i += 1

## 4. Generate Labels with Text Embedding for Military Vehicles

***References:***

 ***Worldwide Equipment Guide: https://odin.tradoc.army.mil/WEG***
 
 ***TC 7-100.4 Hybrid Threat Force Structure Organization Guide (2015)***

In [17]:
'''
Fixed Wing : B52, F16, E3
Rotary Wing : UH60, CH47, AH64
UAV : Shadow, predator
'''

aircraft_sub_dict = {'Military Fixed-wing' : ['wings', 'elevators', 'rudder', 'tail', 'cockpit', 'multi engine', 'single engine', 'weapons', 'external fuel tank', 'grey', 'brown', 'green', 'black', 'blue', 'fuselage'],
                     'Military Helicopter' : ['main rotor', 'tail rotor', 'rudder', 'tail',  'cockpit', 'weapons', 'external fuel tank', 'grey', 'brown', 'green', 'black', 'skids', 'wheels', 'fuselage'],
                     'UAV' : ['trailer', 'small', 'wings', 'tail', 'rudder']}

In [18]:
'''
Short Range Anti-Air : Roland-2 French SHORAD, HQ-7 (FM-80) Chinese SHORAD, CSA-41 (FM-80) Iranian SHORAD
Point Defense : LD 2000 (LuDun-2000) Chinese 8x8 Mobile Air Defense Gun Missile System
Long Range Missile : HQ-22 Chinese Long-Range Air Defense Missile System, S-300P (SA-10 Grumble) Russian 8x8 Long-Range Surface-to-Air Missile System
Radar System : P-40 (Long Track) Russian Mobile 3-D UHF Radar System, 96L6E (Cheese Board) Russian Early-Warning and Acquisition Radar
'''

ada_sub_dict = {'General Anti-Air' : ['launch tube', 'turret', 'gun', 'missile', 'rectangle', 'cab', 'hood', 'wheels', 'tracks', 'trailer', 'platform', 'green', 'tan', 'radar', 'array', 'antenna'],
                'Radar System' : ['wheels', 'cab', 'windows', 'doors', 'radar', 'array', 'antenna', 'square', 'green', 'tan', 'Shipping Container']}

In [19]:
'''
Towed Cannon : M777, various field howitzers
Multiple-Rocket Launchers : BM-21 122mm MLR, and various other MLRS
Artillery Command and Reconnaissance : 
'''

artillery_sub_dict = {'Towed Cannon' : ['cannon', 'carriage', 'wheels', 'tan', 'green', 'square'],
                      'Multiple-Rocket Launchers' : ['wheels', 'tracks', 'cab', 'launch tube', 'windows', 'doors', 'tan', 'green', 'rectangle']}

In [20]:
'''
Minelaying systems: GBL-130 Chinese Armored Mine Dispenser, UMZ Russian 6x6 Scatterable Minelaying System
Mine-Clearing Systems : IMR-2M CEV, MR-2 CEV 
Counter Mobility Systems : excavator, scoop loader, backhoe, skid-steer
'''

engineer_sub_dict = {'Minelaying systems' : ['tracks', 'wheels', 'dispenser', 'canister', 'tan', 'green'],
                     'Mine-Clearing Systems' : ['tracks', 'wheels', 'blade', 'rollers', 'crane', 'hydraulic arm', 'tan', 'green'],
                     'Gap-Crossing Systems' : ['bridge', 'crane', 'rectangle', 'tracks', 'tan', 'green'],
                     'Counter Mobility Systems' : ['Excavator', 'Scraper/Tractor', 'Front Loader', 'Ground Grader', 'Dump Truck', 'hydraulic arm', 'scoop', 'blade', 'frontloader', 'cab', 'doors', 'windshield', 'tan', 'green']}

In [21]:
'''
Tank : Abrams, T90
Infantry Carrier : Bradley, BMP
Support Vehicle : M113, M88, other armored recovery vehicles
Self Propelled Artillery : Palidin
'''

armor_sub_dict = {'Main Battle Tank' : ['2:3 ratio', 'tracks', 'turret', 'cannon', 'rectangle', 'tan', 'green', 'hatch'],
                  'Infantry Carrier' : ['2:3 ratio', 'tracks', 'square', 'tan', 'green', 'hatch'],
                  'Support Vehicle' : ['2:3 ratio', 'tracks', 'blade', 'tow arm', 'square', 'tan', 'green', 'hatch'],
                  'Self Propelled Artillery'  : ['2:3 ratio', 'tracks', 'turret', 'cannon', 'square', 'tan', 'green', 'hatch']}

In [22]:
'''
Infantry Carriers : Stryker, various wheeled APCs
Light Tactical Vehicles : HMMWV, JLTV, MAT-V
Combat Support Vehicles : M997 Ambulance
Mobile Gun Systems : wheeled howitzer, Stryker MGS, various antitank vehicles
'''

light_sub_dict = {'Infantry Carriers' : ['1:3 ratio', 'wheels', 'turret', 'rectangle', 'tan', 'green', 'hatch', 'gun'],
                  'Light Tactical Vehicles' : ['1:3 ratio', 'wheels', 'turret', 'windows', 'cab', 'doors', 'rectangle', 'hood', 'trunk', 'tan', 'green'],
                  'Combat Support Vehicles' : ['1:3 ratio', 'wheels', 'windows', 'cab', 'doors', 'rectangle', 'hood', 'trunk', 'tan', 'green'],
                  'Mobile Gun Systems' : ['1:3 ratio', 'wheels', 'cannon', 'turret', 'rectangle', 'tan', 'green']}

In [23]:
'''
Supply Vehicle : M977 HEMTT, M1094
Recovery Vehicle : Wrecker, various other tow truck type vehicles
Tanker : fuel truck, hippo
'''

logistics_sub_dict = {'Military Supply Vehicle' : ['1:3 ratio', 'wheels', 'windows', 'cab', 'doors', 'hood', 'Shipping Container', 'bed', 'trailer', 'tan', 'green'],
                      'Military Recovery Vehicle' : ['1:3 ratio', 'wheels', 'windows', 'cab', 'doors', 'winch', 'tow arm', 'tan', 'green'],
                      'Military Fueler' : ['1:3 ratio', 'wheels', 'windows', 'cab', 'doors', 'tank', 'tube', 'hood', 'tan', 'green']}

In [24]:
other_sub_dict = {'Command and Control Systems' : ['antenna', 'square', 'green', 'tan'],
                  'Communications Systems' : ['radar', 'array', 'antenna', 'square', 'green', 'tan'],
                  'Electronic Warfare Systems' : ['radar', 'array', 'antenna', 'square', 'green', 'tan']}

In [25]:
# Changed dictionary format for ease of tokenizing
'''
military_embed_dict = {'Military Aircraft' : aircraft_sub_dict,
                       'Air Defense' : ada_sub_dict,
                       'Armor' :armor_sub_dict,
                       'Artillery' : artillery_sub_dict,
                       'Military Engineer and CBRN' : engineer_sub_dict,
                       'Light Military Vehicles' : light_sub_dict,
                       'Military Logistics' : logistics_sub_dict,
                       'Other Military' : other_sub_dict}
'''

military_embed_dict = {**aircraft_sub_dict,
                       **ada_sub_dict,
                       **armor_sub_dict,
                       **artillery_sub_dict,
                       **engineer_sub_dict,
                       **light_sub_dict,
                       **logistics_sub_dict,
                       **other_sub_dict}

## 5. Merge Embedding Dictionaries (if needed)

In [26]:
#main_dict = {**xview_embed_dict , **military_embed_dict}
main_dict = xview_embed_dict

## 6. Create a Dictionary of tokens for Embedding

In [27]:
# Import needed packages

import nltk
import re

from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jingr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [28]:
# Create a helper function to tokenize the values in the dictionary

def tokenize_list(word_list):
    token_list = []
    for text in word_list:
        # Remove spaces
        text = re.sub(r'([^\s])\s([^\s])', r'\1_\2',text)
    
        # Remove case
        text = text.lower() 
        
        # Store the text in the token list
        token_list.append(text)
        
    return(token_list)

In [29]:
# Tokenize the Xview dataset for labeling

# Create an empty dictionary to store labels
xview_label_dict_embedded = {}


# Match embedded text to labels provided with the Xview data
for key in xview_label_dict.keys():
    xview_label_dict_embedded[key] = set(tokenize_list(xview_embed_dict[xview_label_dict[key]]))
    
# Tokenize the main dictionary
for key in main_dict.keys():
    main_dict[key] = set(tokenize_list(main_dict[key]))

In [30]:
# Print the embedded Xview Data
print("" + "\n".join("{!r}: {!r},".format(k, v) for k, v in xview_label_dict_embedded.items()) + "}")

11: {'red', 'blue', 'tail', 'fuselage', 'silver', 'cockpit', 'wings', 'fixed-wing_aircraft', 'white', 'elevators', 'rudder'},
12: {'red', 'blue', 'tail', 'fuselage', 'silver', 'cockpit', 'wings', 'fixed-wing_aircraft', 'single_engine', 'white', 'elevators', 'rudder'},
13: {'red', 'blue', 'tail', 'fuselage', 'silver', 'multi_engine', 'cockpit', 'wings', 'fixed-wing_aircraft', 'white', 'elevators', 'rudder'},
15: {'red', 'blue', 'tail', 'fuselage', 'tail_rotor', 'cockpit', 'main_rotor', 'white', 'rudder'},
17: {'red', 'blue', 'full_cab', '1:3_ratio', 'doors', 'silver', 'passenger_vehicle', 'wheels', 'single_cab', 'white', 'windows'},
18: {'red', '2_axles', 'blue', 'full_cab', '1:3_ratio', 'doors', 'silver', 'passenger_vehicle', 'wheels', 'single_cab', 'black', 'small', 'hood', 'white', 'windows', 'trunk'},
19: {'red', 'blue', 'full_cab', '1:3_ratio', 'doors', 'silver', 'passenger_vehicle', 'wheels', 'rectangle', 'single_cab', 'white', '3_axles', 'windows', 'long'},
20: {'red', '2_axles',

## 7. Dictionary Analyis

In [31]:
features = [] # create a list to store possible features
labels = [] # create a list to store possible lables

for entity in main_dict:
    features.extend(main_dict[entity]) # ectract the features from the data dictionary
    labels.append(entity)

feature_dic = {feature : features.count(feature) for feature in set(features)} # create a dictionary that counts the number of occurances of a given feature

In [32]:
print('There are ' + str(len(main_dict)) + ' possible labels')
print('There are ' + str(len(feature_dic)) + ' possible features to describe the labels')
print('They are:\n')

number = 1
for label in labels:
    print(str(number) + '.\t' + label)
    number +=1

There are 60 possible labels
There are 107 possible features to describe the labels
They are:

1.	Fixed-wing Aircraft
2.	Small Aircraft
3.	Passenger/Cargo Plane
4.	Helicopter
5.	Passenger Vehicle
6.	Small Car
7.	Bus
8.	Pickup Truck
9.	Utility Truck
10.	Truck
11.	Cargo Truck
12.	Truck Tractor w/ Box Trailer
13.	Truck Tractor
14.	Trailer
15.	Truck Tractor w/ Flatbed Trailer
16.	Truck Tractor w/ Liquid Tank
17.	Crane Truck
18.	Railway Vehicle
19.	Passenger Car
20.	Cargo/Container Car
21.	Flat Car
22.	Tank car
23.	Locomotive
24.	Maritime Vessel
25.	Motorboat
26.	Sailboat
27.	Tugboat
28.	Barge
29.	Fishing Vessel
30.	Ferry
31.	Yacht
32.	Container Ship
33.	Oil Tanker
34.	Engineering Vehicle
35.	Tower crane
36.	Container Crane
37.	Reach Stacker
38.	Straddle Carrier
39.	Mobile Crane
40.	Dump Truck
41.	Haul Truck
42.	Scraper/Tractor
43.	Front loader/Bulldozer
44.	Excavator
45.	Cement Mixer
46.	Ground Grader
47.	Hut/Tent
48.	Shed
49.	Building
50.	Aircraft Hangar
51.	Damaged Building
52.	Facility


In [33]:
import pandas as pd

features_df = pd.DataFrame(list(feature_dic.items()))

features_df.rename(columns = {0:'Feature', 1 : 'Count'}, inplace = True)

features_df = features_df.sort_values(by='Count', ascending=False)

features_df['Frequency'] = features_df['Count']/len(labels)

In [34]:
pd.set_option('display.max_rows', None)

print(features_df)

                 Feature  Count  Frequency
71                 white     27   0.450000
15               windows     24   0.400000
86                   red     24   0.400000
28                wheels     23   0.383333
33                 doors     22   0.366667
78            single_cab     22   0.366667
87                  blue     21   0.350000
37             rectangle     18   0.300000
60                yellow     18   0.300000
26              full_cab     18   0.300000
46                orange     14   0.233333
47   engineering_vehicle     14   0.233333
50                  flat     11   0.183333
44                 truck     11   0.183333
43                 water     10   0.166667
0        maritime_vessel     10   0.166667
6                   long     10   0.166667
75               2_axles      9   0.150000
68                engine      8   0.133333
76                   bed      8   0.133333
18    shipping_container      8   0.133333
83                silver      7   0.116667
96         

In [35]:
# consider making colors bright vs subdued instead of listing individual colors?

## 8. OHE the Tokens to be Embedded in the Model

In [36]:
# Import required packages
import numpy as np

In [37]:
# create the bag of word to OHE
corpus = np.array(features_df['Feature'].to_list())  # create the corpus from the analysis df
s_corpus = len(corpus)  # store the length of the corpus

corpus_dict = {}
i = 0
# Convert the corpus into a dictionary {word : int} format
for word in corpus:
    corpus_dict[word] = i 
    i += 1

In [38]:
# One hot encode the data
for key in xview_label_dict.keys():
    int_list = []
    binary_list = [0] * s_corpus
    for value in xview_label_dict_embedded[key]:
        int_list.append(corpus_dict[value])  # convert words in the dictionary to an int using the corpus dict
    for num in int_list:
        binary_list[num] = 1  # use the ints to change the 0's to 1's in the binary list
    xview_label_dict_embedded[key] = binary_list  # add the binary list to the data dictionary

In [39]:
print("" + "\n".join("{!r}: {!r},".format(k, v) for k, v in xview_label_dict_embedded.items()) + "}")

11: [1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
12: [1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
13: [1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
15: [1, 0, 1, 0, 0,