In [26]:
import pandas as pd
import numpy as np
import pymongo
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup
import os
import re
import string
import pickle

import nltk
import gensim

from keras.layers import Embedding

In [3]:
#Get pretrained vectors for word2vec

!wget https://s3.amazonaws.com/mordecai-geo/GoogleNews-vectors-negative300.bin.gz

--2017-10-27 14:14:11--  https://s3.amazonaws.com/mordecai-geo/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.161.157
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.161.157|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/octet-stream]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2017-10-27 14:27:37 (1.95 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



## Load pretrained word2vec model

In [33]:
gnews_w2v = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [34]:
gnews_w2v.most_similar(positive=['woman', 'king'], negative=['man'])

[('queen', 0.7118192911148071),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431607246399),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321243286133),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.5181134343147278),
 ('sultan', 0.5098593235015869),
 ('monarchy', 0.5087411999702454)]

In [35]:
gnews_w2v.most_similar(positive=['apartment'])

[('apartment_complex', 0.7893964052200317),
 ('townhouse', 0.7727404236793518),
 ('apartments', 0.7152601480484009),
 ('bedroom', 0.7045778632164001),
 ('Apartments', 0.6661036014556885),
 ('house', 0.6628996133804321),
 ('duplex', 0.6575543880462646),
 ('rooming_house', 0.6564425230026245),
 ('townhome', 0.6522176265716553),
 ('fourplex', 0.648613691329956)]

In [36]:
cosine_similarity(gnews_w2v['big'].reshape(1, -1), gnews_w2v['small'].reshape(1, -1))

array([[ 0.49586788]], dtype=float32)

## Load pre-trained GloVe model

In [28]:
wiki_w2v = gensim.models.KeyedVectors.load_word2vec_format('./data/glove_w2v_small.txt')

In [29]:
cosine_similarity(wiki_w2v['small'].reshape(1, -1), wiki_w2v['big'].reshape(1, -1))

array([[ 0.70083666]], dtype=float32)

In [30]:
wiki_w2v['dog']

array([ 0.11008   , -0.38780999, -0.57615   , -0.27713999,  0.70520997,
        0.53994   , -1.07860005, -0.40145999,  1.15040004, -0.56779999,
        0.0038977 ,  0.52877998,  0.64560997,  0.47262001,  0.48548999,
       -0.18407001,  0.18009999,  0.91396999, -1.19790006, -0.57779998,
       -0.37985   ,  0.33605999,  0.77200001,  0.75555003,  0.45506001,
       -1.76709998, -1.0503    ,  0.42566001,  0.41892999, -0.68326998,
        1.56729996,  0.27684999, -0.61707997,  0.64638001, -0.076996  ,
        0.37118   ,  0.13079999, -0.45137   ,  0.25398001, -0.74392003,
       -0.086199  ,  0.24067999, -0.64819002,  0.83548999,  1.25020003,
       -0.51379001,  0.04224   , -0.88117999,  0.71579999,  0.38519001], dtype=float32)

## Generate dog vectors

In [31]:
def generate_text_from_num(dogtime_html, synonyms, antonyms):
    """
    Given a BeautifulSoup object dogtime_html generated from the dogtime website,
    generate text from numeric features using synonym and antonym dictionaries
    """
    dog_text = ''
    
    char_dict = dict()
    for characteristic in dogtime_html.find_all(class_="characteristic item-trigger-title"):
        char_dict[characteristic.text.strip()] =\
                int(characteristic.find_next().find_next()['class'][1].split('-')[-1])
    
    for trait, value in char_dict.items():
        if value > 3:
            factor = value - 3

            dog_text += factor*(synonyms[trait])
            
        elif value < 3:
            factor = 3 - value
            
            dog_text += factor*(antonyms[trait])
    
    return dog_text

In [32]:
client = pymongo.MongoClient("mongodb://54.67.82.182/dogbreeds")
db = client.dogbreeds

In [81]:
trait_synonyms = dict()
trait_synonyms['Adaptability'] = 'adaptable '
trait_synonyms['Adapts Well to Apartment Living'] = 'apartment '
trait_synonyms['Affectionate with Family'] = 'cuddly '
trait_synonyms['All Around Friendliness'] = 'friendly '
trait_synonyms['Amount Of Shedding'] = ''
trait_synonyms['Dog Friendly'] = 'dogs '
trait_synonyms['Drooling Potential'] = ''
trait_synonyms['Easy To Groom'] = 'grooming '
trait_synonyms['Easy To Train'] = 'obedient '
trait_synonyms['Energy Level'] = 'energetic '
trait_synonyms['Exercise Needs'] = 'active '
trait_synonyms['Friendly Toward Strangers'] = 'friendly '
trait_synonyms['General Health'] = 'healthy '
trait_synonyms['Good For Novice Owners'] = 'novice '
trait_synonyms['Health Grooming'] = 'healthy '
trait_synonyms['Incredibly Kid Friendly Dogs'] = 'children '
trait_synonyms['Intelligence'] = 'intelligent '
trait_synonyms['Intensity'] = ''
trait_synonyms['Potential For Mouthiness'] = 'fetch '
trait_synonyms['Potential For Playfulness'] = 'playful '
trait_synonyms['Potential For Weight Gain'] = ''
trait_synonyms['Prey Drive'] = 'hunting '
trait_synonyms['Sensitivity Level'] = ''
trait_synonyms['Size'] = 'big '
trait_synonyms['Tendency To Bark Or Howl'] = ''
trait_synonyms['Tolerates Being Alone'] = 'alone '
trait_synonyms['Tolerates Cold Weather'] = 'cold '
trait_synonyms['Tolerates Hot Weather'] = 'hot '
trait_synonyms['Trainability'] = 'obedient '
trait_synonyms['Wanderlust Potential'] = ''

In [82]:
trait_antonyms = dict()
trait_antonyms['Adaptability'] = ''
trait_antonyms['Adapts Well to Apartment Living'] = 'yard '
trait_antonyms['Affectionate with Family'] = ''
trait_antonyms['All Around Friendliness'] = ''
trait_antonyms['Amount Of Shedding'] = 'clean '
trait_antonyms['Dog Friendly'] = 'protective '
trait_antonyms['Drooling Potential'] = 'clean '
trait_antonyms['Easy To Groom'] = ''
trait_antonyms['Easy To Train'] = ''
trait_antonyms['Energy Level'] = 'calm '
trait_antonyms['Exercise Needs'] = 'lazy '
trait_antonyms['Friendly Toward Strangers'] = 'security '
trait_antonyms['General Health'] = ''
trait_antonyms['Good For Novice Owners'] = ''
trait_antonyms['Health Grooming'] = ''
trait_antonyms['Incredibly Kid Friendly Dogs'] = ''
trait_antonyms['Intelligence'] = ''
trait_antonyms['Intensity'] = 'relaxed '
trait_antonyms['Potential For Mouthiness'] = 'safe '
trait_antonyms['Potential For Playfulness'] = 'aloof '
trait_antonyms['Potential For Weight Gain'] = 'thin '
trait_antonyms['Prey Drive'] = ''
trait_antonyms['Sensitivity Level'] = 'adaptable '
trait_antonyms['Size'] = 'small '
trait_antonyms['Tendency To Bark Or Howl'] = 'quiet '
trait_antonyms['Tolerates Being Alone'] = ''
trait_antonyms['Tolerates Cold Weather'] = ''
trait_antonyms['Tolerates Hot Weather'] = ''
trait_antonyms['Trainability'] = ''
trait_antonyms['Wanderlust Potential'] = 'homebody '

In [83]:
data_dir = os.path.join(os.path.curdir, "data")
image_dir = os.path.join(data_dir, 'Images', 'test')

In [84]:
dog_dirs = [direct for direct in os.listdir(image_dir)\
            if os.path.isdir(os.path.join(image_dir, direct))]

In [85]:
dog_dirs.remove('not_dog')

In [86]:
dog_breeds = [dog.split('-', 1)[1].lower() for dog in dog_dirs]

In [87]:
breed_text = dict()
for breed, dog_dir in zip(dog_breeds, dog_dirs):
    dog_content = db.dogbreeds.find_one({"breed" : breed})
    dogtime_html = BeautifulSoup(dog_content["dogtime_content"], "lxml")
    breed_text[dog_dir] = generate_text_from_num(dogtime_html, trait_synonyms, trait_antonyms)

In [88]:
breed_vecs = dict()
for breed, text in breed_text.items():
    trait_list = text.split(' ')[:-2]
    breed_vecs[breed] = np.zeros_like(wiki_w2v['dog'])
    for trait in trait_list:
        breed_vecs[breed] += wiki_w2v[trait]
        

In [89]:
cosine_similarity(breed_vecs['n02094433-Yorkshire_terrier'].reshape(1, -1), wiki_w2v['apartment'].reshape(1, -1))

array([[ 0.58356476]], dtype=float32)

In [90]:
breed_mat = np.zeros((115, 50))

In [91]:
breed_vecs['not_dog'] = np.zeros_like(breed_vecs['n02085620-Chihuahua'])

In [92]:
breeds = np.load('breed_indices.npy').tolist()

In [93]:
for breed, index in breeds.items():
    breed_mat[index] += breed_vecs[breed]

In [94]:
breed_mat

array([[  6.69011927,  -0.36988768,  -3.14270592, ...,  -9.92434216,
         10.33327675,   6.50213337],
       [  5.20151949,  -0.77531481,  -2.08489585, ...,  -5.72092247,
          5.49588871,   4.27674007],
       [  7.06528807,  -2.00259781,  -1.57315409, ...,  -6.34839916,
          8.38027191,   5.27300692],
       ..., 
       [  6.75482035,  -0.32357758,  -3.92642164, ..., -11.23333645,
          7.89738274,   8.98905754],
       [  5.86392879,  -0.26254541,  -0.15559515, ...,  -3.0988152 ,
          7.37815475,   7.0153985 ],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.        ]])

In [95]:
np.save('breed_glove_matrix.npy', breed_mat)

In [97]:
np.argsort(cosine_similarity((wiki_w2v['apartment']).reshape(1, -1), breed_mat)[0])

array([114,  63,  82,  37,  79,  70,  84,  13,  56,  83,  97,  29,  46,
       105,  64,  59,  30,  54,  32,  66,  57,  73,  14,  91,  67,  23,
        98,  74,  60,  40,  62,  33,  27,  61,  94, 104,  90,  86,  58,
        55,  15,  28,  16,  34,  24, 103,  95, 108,  76,  87,  75,  89,
        81, 110,  52,  45,  11,  26,  17,  12,  47,  71,  25,   8,  80,
        88,  96,  19,  92,  78, 102,  41, 111,  65,  31,  48,  68,  39,
       112,  22,  38,  69,   5,  42,  18,  35,  21,  10,  44,   7,  51,
        72, 107,   6, 100, 113,  49,   9,  20, 106,  93,  77,   3,  50,
         4,  43,   0, 109, 101,   2,  36,  99,  85,  53,   1])

In [99]:
np.sort(cosine_similarity((wiki_w2v['apartment']).reshape(1, -1), breed_mat)[0])

array([ 0.        ,  0.29916795,  0.32994977,  0.36641484,  0.36856291,
        0.36914406,  0.37304946,  0.37386154,  0.37935054,  0.37950443,
        0.3811247 ,  0.38220287,  0.38444102,  0.38475004,  0.38527763,
        0.38735825,  0.38977064,  0.39056499,  0.39115811,  0.39195703,
        0.3943784 ,  0.39608334,  0.39713538,  0.39808329,  0.3998926 ,
        0.40137057,  0.40152298,  0.40282064,  0.40472544,  0.40571255,
        0.40723348,  0.40762689,  0.40783887,  0.40903474,  0.41225996,
        0.4131129 ,  0.41560581,  0.41707098,  0.41776998,  0.42061047,
        0.42081472,  0.42117322,  0.4222803 ,  0.42446258,  0.42474214,
        0.42621602,  0.42714755,  0.42735732,  0.42768688,  0.4371028 ,
        0.43778829,  0.43786065,  0.43997885,  0.44172749,  0.44261593,
        0.44282651,  0.44413139,  0.45022266,  0.45134529,  0.45254394,
        0.4533395 ,  0.45509903,  0.45523257,  0.4562121 ,  0.45761025,
        0.45991338,  0.46163454,  0.46303357,  0.46632928,  0.46

In [115]:
np.sort(cosine_similarity((wiki_w2v['apartment']).reshape(1, -1), breed_mat)[0] / \
        np.max(cosine_similarity((wiki_w2v['apartment']).reshape(1, -1), breed_mat)[0]))

array([ 0.        ,  0.4590567 ,  0.50628971,  0.56224334,  0.56553944,
        0.56643118,  0.57242381,  0.57366989,  0.58209247,  0.5823286 ,
        0.58481481,  0.5864692 ,  0.58990352,  0.59037769,  0.59118725,
        0.59437985,  0.59808152,  0.59930042,  0.60021053,  0.60143643,
        0.60515189,  0.60776802,  0.60938231,  0.61083683,  0.61361312,
        0.61588098,  0.61611485,  0.61810604,  0.62102885,  0.62254351,
        0.62487728,  0.62548095,  0.62580622,  0.62764123,  0.63259014,
        0.63389894,  0.63772417,  0.63997239,  0.64104497,  0.64540355,
        0.64571695,  0.64626705,  0.64796581,  0.65131439,  0.65174337,
        0.65400495,  0.65543433,  0.65575621,  0.6562619 ,  0.67071011,
        0.67176196,  0.67187299,  0.67512325,  0.67780645,  0.67916971,
        0.67949284,  0.68149509,  0.69084181,  0.69256443,  0.69440369,
        0.69562443,  0.69832433,  0.69852924,  0.70003228,  0.70217766,
        0.70571169,  0.70835272,  0.71049945,  0.71555653,  0.71

In [98]:
breeds

{'n02085620-Chihuahua': 0,
 'n02085782-Japanese_spaniel': 1,
 'n02085936-Maltese_dog': 2,
 'n02086079-Pekinese': 3,
 'n02086240-Shih-Tzu': 4,
 'n02086646-Blenheim_spaniel': 5,
 'n02086910-papillon': 6,
 'n02087046-toy_terrier': 7,
 'n02087394-Rhodesian_ridgeback': 8,
 'n02088094-Afghan_hound': 9,
 'n02088238-basset': 10,
 'n02088364-beagle': 11,
 'n02088466-bloodhound': 12,
 'n02088632-bluetick': 13,
 'n02089078-black-and-tan_coonhound': 14,
 'n02089867-Walker_hound': 15,
 'n02089973-English_foxhound': 16,
 'n02090379-redbone': 17,
 'n02090622-borzoi': 18,
 'n02090721-Irish_wolfhound': 19,
 'n02091032-Italian_greyhound': 20,
 'n02091134-whippet': 21,
 'n02091244-Ibizan_hound': 22,
 'n02091467-Norwegian_elkhound': 23,
 'n02091635-otterhound': 24,
 'n02091831-Saluki': 25,
 'n02092002-Scottish_deerhound': 26,
 'n02092339-Weimaraner': 27,
 'n02093256-Staffordshire_bullterrier': 28,
 'n02093428-American_Staffordshire_terrier': 29,
 'n02093647-Bedlington_terrier': 30,
 'n02093754-Border_terr

## Test potential user input

In [117]:
sentence = "Hello: this is a sentence with punctuation! Doesn't it look great?"

In [88]:
nltk.word_tokenize(sentence)

['Hello',
 ':',
 'this',
 'is',
 'a',
 'sentence',
 'with',
 'punctuation',
 '!',
 'Does',
 "n't",
 'it',
 'look',
 'great',
 '?']

In [89]:
re.compile('\w+').findall(sentence)

['Hello',
 'this',
 'is',
 'a',
 'sentence',
 'with',
 'punctuation',
 'Doesn',
 't',
 'it',
 'look',
 'great']

In [90]:
nltk.tokenize.regexp.WordPunctTokenizer().tokenize(sentence)

['Hello',
 ':',
 'this',
 'is',
 'a',
 'sentence',
 'with',
 'punctuation',
 '!',
 'Doesn',
 "'",
 't',
 'it',
 'look',
 'great',
 '?']

In [91]:
wiki_w2v.similarity('big', 'small')

0.71795842658489328

In [92]:
wiki_w2v["n't"]

array([  1.57309994e-01,   3.95300001e-01,   6.35860026e-01,
        -1.09749997e+00,  -9.57679987e-01,  -1.38410004e-02,
        -1.98530003e-01,   2.54180014e-01,   3.67309988e-01,
        -1.74860001e-01,   2.76849985e-01,   3.19429994e-01,
         3.00779998e-01,   6.85309991e-02,  -1.59170002e-01,
        -2.19439998e-01,   6.40970021e-02,   8.47450018e-01,
        -6.19889975e-01,   5.41729987e-01,   2.79210001e-01,
         5.03830016e-01,   2.14600004e-02,  -2.05709994e-01,
         7.79939964e-02,   3.22290003e-01,  -4.91829991e-01,
        -1.14110005e+00,   2.33329996e-01,  -5.43579996e-01,
         9.22849998e-02,   8.68600011e-01,   6.91270009e-02,
         1.92289993e-01,   2.83740014e-01,   4.60139990e-01,
        -2.83199996e-01,   4.53839988e-01,   3.52090001e-01,
        -4.91730005e-01,  -1.47709996e-01,  -7.17670023e-02,
        -2.43550003e-01,  -6.30890012e-01,  -6.77969992e-01,
        -1.31640002e-01,   3.59739989e-01,  -7.52919972e-01,
         3.82039994e-02,

In [139]:
def tokenize_input(sentence):
    """
    Tokenizes a sentence, removes punctuation, and converts to lowercase letters.
    """
    translate_table = dict((ord(char), None) for char in string.punctuation)
    sentence_list = sentence.split()
    return [x.translate(translate_table).lower() for x in sentence_list]

In [138]:
word_vec = np.zeros(100)

In [140]:
words = tokenize_input(sentence)

In [142]:
words[0]

'hello'

In [99]:
for word in words:
    word_vec += wiki_w2v[word]

In [102]:
np.argsort(cosine_similarity(word_vec.reshape(1, -1), breed_mat))

array([[114,  63,  82,  84,  59,  68,  56,  65,  29,  93,  90,  67,  40,
         79,  80, 106,  57,  66,  17,  52,  13, 110, 109,  70,  37,  46,
         83, 108,  60, 113,  55,  50,  10,  28,  54,  32,  30,  96,  27,
         74,  64,  62,  45,   6,  23, 101,  76,   2,   5, 112, 105,  11,
         97, 111,  16,  73,  42,  14,  34,  61,  58,  51,  98, 103,  44,
          3,  99,  15,  94,   1,   0,  41,  22,   7,  21,  35,  33,  20,
         81,  89,  85,  12,  78,  69,  88,  77,  31,  87,   8,  49,   4,
         86,  75,  39,  43, 100,  48, 104,  18,  95,  25,  38,  47,  91,
         24,  72,  53,  19,  26,  36,  92,   9, 107, 102,  71]])

In [106]:
words = tokenize_input("A sentence that may contain a word you've not seen before: l'cie.")

In [112]:
def vectorize_words(model, words):
    """
    Take a list of words, and convert it into the sum of the word vectors
    for the model, ignoring out of vocabulary words
    """
    word_vec = np.zeros(len(model['you']))
    for word in words:
        try:
            word_vec += wiki_w2v[word]
        except:
            pass
    
    return word_vec

In [124]:
dog_desc = "playful, affectionate children"

In [125]:
dog_vec = vectorize_words(wiki_w2v, tokenize_input(dog_desc))

In [126]:
np.argsort(cosine_similarity(dog_vec.reshape(1, -1), breed_mat))

array([[114, 107,   1,  76,  50,  71,  36,   3,  45,   2,  42,  32, 106,
         58,  97,  75,  99,  47, 113,  46,  18,  68,   9,  38,  81,  15,
        104, 102,  37,  25,   6,  53,  26,  85,  19,  52,  34,  40,  43,
        111,  70,  74,  88,  65,  20,  16,  63,  48,   4,  23,  69, 110,
        103,  87,  78,   7,  39,   8,  73, 112, 100,   0,  80,  24,  67,
         86,  77,  66,  35,  14,  72, 101,  60,  31,  30,  21,  44,  33,
         59, 105,  49,  84,  27,  79,  11,  51,  22,  92,  62,  61,  96,
         41,  12,  94,  89,  95,  10,  93, 109,  54,  55,   5,  91,  64,
         83,  98, 108,  90,  13,  82,  57,  28,  29,  56,  17]])

## Use first 100,000 words in GloVe

In [107]:
embeddings_index = dict()
f = open(os.path.join('data', 'glove.6B.50d.txt'))
i = 0
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
    i += 1 
    if i >= 100000:
        break
f.close()

In [108]:
word_index = dict()
for idx, word in enumerate(embeddings_index.keys()):
    word_index[word] = idx

In [109]:
len(word_index)

100000

In [111]:
embedding_matrix = np.zeros((len(word_index)+1, 50))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [18]:
embedding_layer = Embedding(len(word_index)+1,
                            100,
                            weights=[embedding_matrix],
                            trainable=False)

In [112]:
embeddings_index['small']

array([ 1.14189994,  0.21620999,  0.05988   , -0.22087   ,  0.72100002,
        0.55394   , -0.85043001, -0.27485001,  0.0788    , -0.54447001,
       -0.16458   , -0.39355001,  0.94567001,  0.3132    , -0.57388002,
        0.006172  ,  0.51563001,  0.59235001, -0.54390001, -1.10969996,
        0.13733   , -0.66900998, -0.16584   ,  0.44869   , -0.42888999,
       -1.12249994, -0.43607   ,  0.55676001,  0.39962   , -0.25964001,
        3.85430002, -0.33091   ,  0.38144001,  0.059943  ,  0.19653   ,
        0.50616997, -0.41123   ,  0.16168   ,  0.1503    , -0.061063  ,
       -0.0063997 ,  0.1881    , -0.037663  ,  0.29605001,  0.46123999,
        0.0066039 , -0.30737999, -0.64137   , -0.058311  , -0.45848   ], dtype=float32)

In [113]:
with open('glove_vec_dict.p', 'wb') as file:
    pickle.dump(embeddings_index, file)

In [66]:
word_index['bark']

14924

In [70]:
word_index['quiet']

3742

In [56]:
index_word = {v: k for k, v in word_index.items()}

In [60]:
index_word[10000]

'persecution'

In [100]:
for trait in trait_antonyms.values():
    try:
        print(word_index[trait[:-1]])
    except:
        pass

4758
2431
7935
2431
3841
16531
194
8213
1876
29986
4474
38707
357
3742
102547


In [79]:
word_index['obedient']

44400