In [11]:
import gensim
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import re

import helpers as h

## 1) Finding relations from public data (Google newsgroups)

In [2]:
# Loading the data

unigram_model = gensim.models.KeyedVectors.load_word2vec_format
('/Users/barak/Desktop/GoogleNews-vectors-negative300.bin', binary=True)

First, we'll manually set up a collection of pairs of states/countries and their capitals. Then, we'll use these to establish the relation of "is the capital of" and test it on two examples...

In [3]:
# Set up a training set for "is the capital of" by hand

capitals_train = [
    ("California", "Sacramento"),
    ("Texas", "Austin"),
    ("Canada", "Ottawa"),
    ("France", "Paris"),
    ("Russia", "Moscow"),
    ("Massachusetts", "Boston"),
    ("Italy", "Rome"),
    ("India", "Delhi"),
    ("China", "Beijing"),
    ("Australia", "Canberra"),
    ("England", "London"),
    ("Peru", "Lima"),
    ("Florida", "Tallahassee"),
    ("Japan", "Tokyo")
]

In [4]:
# This collects the difference of all state - capital vectors and averages them.

diffs = [unigram_model[state] - unigram_model[city] for state, city in capitals_train]
mean_diffs_vector = np.average(diffs, axis=0)

How do we know that we've discovered the "is the capital of" relationship? We can use it to find out what various capitals are!!

In [5]:
# this take the difference of our test case ("vermont") and the vector that represents "is the capital of"

transform = unigram_model["Vermont"] - mean_diffs_vector
results = unigram_model.similar_by_vector(transform)
results[1:]

[(u'Montpelier', 0.7260538339614868),
 (u'New_Hampshire', 0.674058198928833),
 (u'Brattleboro', 0.6718513369560242),
 (u'Maine', 0.6640110015869141),
 (u'Rutland', 0.6482397317886353),
 (u'St._Johnsbury', 0.6425949335098267),
 (u'Bennington', 0.6381337642669678),
 (u'Bellows_Falls', 0.6248161196708679),
 (u'UVM', 0.6161313056945801)]

To be fair, it's not always the first hit, here, it's the second one...

In [6]:
transform = unigram_model["Portugal"] - mean_diffs_vector
results = unigram_model.similar_by_vector(transform)
results[1:]

[(u'Portuguese', 0.6647554636001587),
 (u'Lisbon', 0.6610859632492065),
 (u'Porto', 0.6567690372467041),
 (u'Portugual', 0.6197739839553833),
 (u'Spain', 0.6073086261749268),
 (u'Madrid', 0.6043921709060669),
 (u'Benfica', 0.5990990400314331),
 (u'Oporto', 0.5915431976318359),
 (u'Barcelona', 0.5897266864776611)]

We can also try to discover the "is a kind of relation" in the same way

In [7]:
is_kind_of = [
    ("cat", "pet"),
    ("doctor", "profession"),
    ("sofa", "furniture"),
    ("baseball", "sport"),
    ("ford", "car"),
    ("guitar", "instrument"),
    ("dime", "coin"),
    ("dog", "animal"),
    ("mac", "computer"),
    ("pasta", "food"),
    ("bird", "animal")
]

In [8]:
kind_diffs = [unigram_model[thing] - unigram_model[type_] for thing, type_ in is_kind_of]
k_diffs = np.average(kind_diffs, axis=0)

In [9]:
transform = unigram_model["lizard"] - k_diffs
results = unigram_model.similar_by_vector(transform)
results[1:]

[(u'reptile', 0.699384868144989),
 (u'lizards', 0.6936733722686768),
 (u'reptiles', 0.6410720348358154),
 (u'snake', 0.6338107585906982),
 (u'creature', 0.6250758171081543),
 (u'toad', 0.6105209589004517),
 (u'frog', 0.6086322665214539),
 (u'snakes', 0.5942181348800659),
 (u'mammal', 0.5934826135635376)]

And, of course, the first hit is the right answer...

We can also do this as a kid of entity recognition. For instance, here we're going to find out some more animals, given an initial seeding set of just 10 animals...

In [10]:
# animals
animals = [
    'dog',
    'cat',
    'lion',
    'giraffe',
    'horse',
    'duck',
    'bird',
    'fish',
    'whale',
    'mouse'
]
animal_vecs = [unigram_model[animal] for animal in animals]
avg_animals = np.average(animal_vecs, axis=0)
results = unigram_model.similar_by_vector(avg_animals, topn=20)
results = [res for res in results if res not in animals]
results

[(u'cat', 0.7806287407875061),
 (u'bird', 0.7411798238754272),
 (u'rabbit', 0.7326385974884033),
 (u'dog', 0.7189368605613708),
 (u'critter', 0.6959779262542725),
 (u'animal', 0.6861838698387146),
 (u'turtle', 0.6853160262107849),
 (u'pup', 0.6818647980690002),
 (u'whale', 0.6802307367324829),
 (u'giraffe', 0.6801244020462036),
 (u'otter', 0.675615131855011),
 (u'squirrel', 0.6697624921798706),
 (u'mammal', 0.6691585779190063),
 (u'cats', 0.668297290802002),
 (u'fox', 0.6656603217124939),
 (u'reptile', 0.663225531578064),
 (u'dolphin', 0.6631218194961548),
 (u'animals', 0.6623847484588623),
 (u'frog', 0.6570603847503662),
 (u'lizard', 0.6523070335388184)]

## Using Cisco Data

Now, we can move on to a business use case. Cisco wants to understand a collection of manuals about their ASA firewalls. We'll look to see if we can build out the relationship of "support" for various security protocols...

Here, we're just loading the data and preparing our bigram detector...

In [16]:

cols = ["title", "id", "tech_solution", "protocol", "concept", "body"]
df = h.parse_jsons("/Users/barak/projects/cisco/data/", cols)
df = pd.DataFrame(df)

phrases = gensim.models.phrases.Phrases(h.processtxt(df.body), min_count=2000, threshold=2)
bigram_detector = gensim.models.phrases.Phraser(phrases)

In [17]:
# builds the bigram model

bigram_model = gensim.models.Word2Vec(bigram_detector[h.processtxt(df.body)], min_count=30, workers=4)

Now, we're going to pull a list of security protocols from our dataset...

In [20]:

protocols = h.get_protocols(df.protocol)
protocols = [protocol.lower() for protocol in protocols]

# determine the vector for protocols.

protocol_vecs = [bigram_model[protocol] for protocol in protocols]
mean_protocols = np.average(protocol_vecs, axis = 0)
protocols

[u'ipsec', u'ike', u'ikev2', u'ikev1', u'ssl']

In [21]:
# give it a list of products

products = ["asa5540", "asa5505", "asa5510", "asa5520", "asa5500", "asa_5505"]
product_vecs = [bigram_model[prod] for prod in products]
mean_products = np.average(product_vecs, axis = 0)

In [22]:
# the "product to protocol" vector
prod_to_protocol = mean_products - mean_protocols

Now, what we're going to do is see how closely the "supports" relation holds for each of the possible protocols for some of our products...

What's cool about this is that the ASA 5540 supports IKEv1 but not v2, and that the ASA 5505 supports IKEv2 but not v1. The model shows us that this holds, just looking at the similarity score for these protocols.

In [23]:
diff = bigram_model['asa5540'] - prod_to_protocol
results = bigram_model.similar_by_vector(diff, topn=10000)
prot_results = [result for result in results if result[0] in protocols]
prot_results

[(u'ike', 0.315812349319458),
 (u'ikev1', 0.30333074927330017),
 (u'ipsec', 0.26640862226486206),
 (u'ssl', 0.24321871995925903),
 (u'ikev2', 0.16395549476146698)]

In [24]:
diff = bigram_model['asa5505'] - prod_to_protocol
results = bigram_model.similar_by_vector(diff, topn=10000)
prot_results = [result for result in results if result[0] in protocols]
prot_results

[(u'ikev2', 0.303021103143692),
 (u'ike', 0.2829524278640747),
 (u'ssl', 0.27673107385635376),
 (u'ikev1', 0.16295677423477173),
 (u'ipsec', 0.1360032707452774)]