In [84]:
import pandas as pd
import numpy as np
import pymongo
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup
import os
import re
import string

import nltk
import gensim

In [3]:
#Get pretrained vectors for word2vec

!wget https://s3.amazonaws.com/mordecai-geo/GoogleNews-vectors-negative300.bin.gz

--2017-10-27 14:14:11--  https://s3.amazonaws.com/mordecai-geo/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.161.157
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.161.157|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/octet-stream]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2017-10-27 14:27:37 (1.95 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



## Load pretrained word2vec model

In [33]:
gnews_w2v = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [34]:
gnews_w2v.most_similar(positive=['woman', 'king'], negative=['man'])

[('queen', 0.7118192911148071),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431607246399),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321243286133),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.5181134343147278),
 ('sultan', 0.5098593235015869),
 ('monarchy', 0.5087411999702454)]

In [35]:
gnews_w2v.most_similar(positive=['apartment'])

[('apartment_complex', 0.7893964052200317),
 ('townhouse', 0.7727404236793518),
 ('apartments', 0.7152601480484009),
 ('bedroom', 0.7045778632164001),
 ('Apartments', 0.6661036014556885),
 ('house', 0.6628996133804321),
 ('duplex', 0.6575543880462646),
 ('rooming_house', 0.6564425230026245),
 ('townhome', 0.6522176265716553),
 ('fourplex', 0.648613691329956)]

In [36]:
cosine_similarity(gnews_w2v['big'].reshape(1, -1), gnews_w2v['small'].reshape(1, -1))

array([[ 0.49586788]], dtype=float32)

## Load pre-trained GloVe model

In [37]:
wiki_w2v = gensim.models.KeyedVectors.load_word2vec_format('./data/glove_w2v_100d.txt')

In [38]:
cosine_similarity(wiki_w2v['small'].reshape(1, -1), wiki_w2v['big'].reshape(1, -1))

array([[ 0.71795845]], dtype=float32)

In [39]:
wiki_w2v['dog']

array([ 0.30816999,  0.30937999,  0.52802998, -0.92543   , -0.73671001,
        0.63475001,  0.44196999,  0.10262   , -0.09142   , -0.56607002,
       -0.5327    ,  0.2013    ,  0.77039999, -0.13982999,  0.13727   ,
        1.1128    ,  0.89301002, -0.17869   , -0.0019722 ,  0.57288998,
        0.59478998,  0.50427997, -0.28990999, -1.34909999,  0.42756   ,
        1.27479994, -1.16129994, -0.41084   ,  0.042804  ,  0.54865998,
        0.18897   ,  0.3759    ,  0.58034998,  0.66974998,  0.81155998,
        0.93864   , -0.51005   , -0.070079  ,  0.82819003, -0.35346001,
        0.21086   , -0.24412   , -0.16553999, -0.78358001, -0.48482001,
        0.38968   , -0.86356002, -0.016391  ,  0.31984001, -0.49246001,
       -0.069363  ,  0.018869  , -0.098286  ,  1.31260002, -0.12116   ,
       -1.23989999, -0.091429  ,  0.35293999,  0.64644998,  0.089642  ,
        0.70293999,  1.12440002,  0.38639   ,  0.52083999,  0.98786998,
        0.79952002, -0.34625   ,  0.14094999,  0.80167001,  0.20

## Generate dog vectors

In [40]:
def generate_text_from_num(dogtime_html, synonyms, antonyms):
    """
    Given a BeautifulSoup object dogtime_html generated from the dogtime website,
    generate text from numeric features using synonym and antonym dictionaries
    """
    dog_text = ''
    
    char_dict = dict()
    for characteristic in dogtime_html.find_all(class_="characteristic item-trigger-title"):
        char_dict[characteristic.text.strip()] =\
                int(characteristic.find_next().find_next()['class'][1].split('-')[-1])
    
    for trait, value in char_dict.items():
        if value > 3:
            factor = value - 3

            dog_text += factor*(synonyms[trait])
            
        elif value < 3:
            factor = 3 - value
            
            dog_text += factor*(antonyms[trait])
    
    return dog_text

In [41]:
client = pymongo.MongoClient("mongodb://54.67.82.182/dogbreeds")
db = client.dogbreeds

In [42]:
trait_synonyms = dict()
trait_synonyms['Adaptability'] = 'adaptable '
trait_synonyms['Adapts Well to Apartment Living'] = 'apartment '
trait_synonyms['Affectionate with Family'] = 'cuddly '
trait_synonyms['All Around Friendliness'] = 'friendly '
trait_synonyms['Amount Of Shedding'] = ''
trait_synonyms['Dog Friendly'] = 'dogs '
trait_synonyms['Drooling Potential'] = ''
trait_synonyms['Easy To Groom'] = 'grooming '
trait_synonyms['Easy To Train'] = 'trainable '
trait_synonyms['Energy Level'] = 'energetic '
trait_synonyms['Exercise Needs'] = 'active '
trait_synonyms['Friendly Toward Strangers'] = 'friendly '
trait_synonyms['General Health'] = 'healthy '
trait_synonyms['Good For Novice Owners'] = 'novice '
trait_synonyms['Health Grooming'] = 'healthy '
trait_synonyms['Incredibly Kid Friendly Dogs'] = 'children '
trait_synonyms['Intelligence'] = 'intelligent '
trait_synonyms['Intensity'] = ''
trait_synonyms['Potential For Mouthiness'] = 'fetch '
trait_synonyms['Potential For Playfulness'] = 'playful '
trait_synonyms['Potential For Weight Gain'] = ''
trait_synonyms['Prey Drive'] = 'hunting '
trait_synonyms['Sensitivity Level'] = ''
trait_synonyms['Size'] = 'big '
trait_synonyms['Tendency To Bark Or Howl'] = ''
trait_synonyms['Tolerates Being Alone'] = 'alone '
trait_synonyms['Tolerates Cold Weather'] = 'cold '
trait_synonyms['Tolerates Hot Weather'] = 'hot '
trait_synonyms['Trainability'] = 'trainable '
trait_synonyms['Wanderlust Potential'] = ''

In [43]:
trait_antonyms = dict()
trait_antonyms['Adaptability'] = ''
trait_antonyms['Adapts Well to Apartment Living'] = 'yard '
trait_antonyms['Affectionate with Family'] = ''
trait_antonyms['All Around Friendliness'] = ''
trait_antonyms['Amount Of Shedding'] = 'clean '
trait_antonyms['Dog Friendly'] = 'protective '
trait_antonyms['Drooling Potential'] = 'clean '
trait_antonyms['Easy To Groom'] = ''
trait_antonyms['Easy To Train'] = ''
trait_antonyms['Energy Level'] = 'calm '
trait_antonyms['Exercise Needs'] = 'lazy '
trait_antonyms['Friendly Toward Strangers'] = 'security '
trait_antonyms['General Health'] = ''
trait_antonyms['Good For Novice Owners'] = ''
trait_antonyms['Health Grooming'] = ''
trait_antonyms['Incredibly Kid Friendly Dogs'] = ''
trait_antonyms['Intelligence'] = ''
trait_antonyms['Intensity'] = 'relaxed '
trait_antonyms['Potential For Mouthiness'] = 'safe '
trait_antonyms['Potential For Playfulness'] = 'aloof '
trait_antonyms['Potential For Weight Gain'] = 'thin '
trait_antonyms['Prey Drive'] = ''
trait_antonyms['Sensitivity Level'] = 'adaptable '
trait_antonyms['Size'] = 'small '
trait_antonyms['Tendency To Bark Or Howl'] = 'quiet '
trait_antonyms['Tolerates Being Alone'] = ''
trait_antonyms['Tolerates Cold Weather'] = ''
trait_antonyms['Tolerates Hot Weather'] = ''
trait_antonyms['Trainability'] = ''
trait_antonyms['Wanderlust Potential'] = 'homebody '

In [44]:
data_dir = os.path.join(os.path.curdir, "data")
image_dir = os.path.join(data_dir, 'Images', 'test')

In [45]:
dog_dirs = [direct for direct in os.listdir(image_dir)\
            if os.path.isdir(os.path.join(image_dir, direct))]

In [46]:
dog_dirs.remove('not_dog')

In [47]:
dog_breeds = [dog.split('-', 1)[1].lower() for dog in dog_dirs]

In [48]:
breed_text = dict()
for breed, dog_dir in zip(dog_breeds, dog_dirs):
    dog_content = db.dogbreeds.find_one({"breed" : breed})
    dogtime_html = BeautifulSoup(dog_content["dogtime_content"], "lxml")
    breed_text[dog_dir] = generate_text_from_num(dogtime_html, trait_synonyms, trait_antonyms)

In [49]:
breed_vecs = dict()
for breed, text in breed_text.items():
    trait_list = text.split(' ')[:-2]
    breed_vecs[breed] = np.zeros_like(wiki_w2v['dog'])
    for trait in trait_list:
        breed_vecs[breed] += wiki_w2v[trait]
        

In [53]:
cosine_similarity(breed_vecs['n02094433-Yorkshire_terrier'].reshape(1, -1), wiki_w2v['apartment'].reshape(1, -1))

array([[ 0.5506736]], dtype=float32)

In [63]:
breed_mat = np.zeros((115, 100))

In [59]:
breed_vecs['not_dog'] = np.zeros_like(breed_vecs['n02085620-Chihuahua'])

In [64]:
breeds = np.load('breed_indices.npy').tolist()

In [65]:
for breed, index in breeds.items():
    breed_mat[index] += breed_vecs[breed]

In [66]:
breed_mat

array([[ -1.01540565e+00,   5.45926809e+00,   3.44888043e+00, ...,
         -3.10276031e+00,   3.75666428e+00,   7.80975819e-03],
       [ -1.77221918e+00,   1.87089062e+00,   4.58829021e+00, ...,
         -1.02567804e+00,  -1.15763593e+00,   1.68762589e+00],
       [ -2.43179893e+00,   6.07515049e+00,   5.99154949e+00, ...,
          1.17739439e-02,   1.68751764e+00,   4.56869990e-01],
       ..., 
       [ -4.00478983e+00,   1.32845678e+01,   4.60762930e+00, ...,
         -9.75337982e-01,   2.10804796e+00,  -2.04442978e-01],
       [ -6.40680075e-01,   3.68850780e+00,   2.27188468e+00, ...,
         -4.15260410e+00,   5.04354334e+00,   6.00267887e-01],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00]])

In [67]:
np.save('breed_glove_matrix.npy', breed_mat)

In [83]:
np.argsort(cosine_similarity((wiki_w2v['novice']).reshape(1, -1), breed_mat)[0])

array([114,  28,  76, 107,  32,  48,  98, 105,  85,  35,  19,  95, 109,
        97, 103,  81,  33,  11,  26,  29,  78,  68,  12,  87,   9,  86,
        70,  31,  75,  61,  91,  50,  38,  45,  49, 113,  54,  60,  22,
        21,  51,  18,  56,  94,  58,  71, 102,  62,  92,  64,  66,  25,
        14,  42,  47, 108,  90,  96,  73,   7,  46,   1,   8,  16,  63,
        88,  36,  57,  74,  89,  82,   3, 104,  55,  10,  84, 100,  37,
        30,  17,  83,  99,  40,  24,  44,  79,   0,  13,  80,  43,  41,
        77,  23,  69, 110, 106,  59,  15,   4,  52,   5,  34, 111,  39,
        27,  72,  20,   6, 112,  67, 101,  65,   2,  93,  53])

In [76]:
breeds

{'n02085620-Chihuahua': 0,
 'n02085782-Japanese_spaniel': 1,
 'n02085936-Maltese_dog': 2,
 'n02086079-Pekinese': 3,
 'n02086240-Shih-Tzu': 4,
 'n02086646-Blenheim_spaniel': 5,
 'n02086910-papillon': 6,
 'n02087046-toy_terrier': 7,
 'n02087394-Rhodesian_ridgeback': 8,
 'n02088094-Afghan_hound': 9,
 'n02088238-basset': 10,
 'n02088364-beagle': 11,
 'n02088466-bloodhound': 12,
 'n02088632-bluetick': 13,
 'n02089078-black-and-tan_coonhound': 14,
 'n02089867-Walker_hound': 15,
 'n02089973-English_foxhound': 16,
 'n02090379-redbone': 17,
 'n02090622-borzoi': 18,
 'n02090721-Irish_wolfhound': 19,
 'n02091032-Italian_greyhound': 20,
 'n02091134-whippet': 21,
 'n02091244-Ibizan_hound': 22,
 'n02091467-Norwegian_elkhound': 23,
 'n02091635-otterhound': 24,
 'n02091831-Saluki': 25,
 'n02092002-Scottish_deerhound': 26,
 'n02092339-Weimaraner': 27,
 'n02093256-Staffordshire_bullterrier': 28,
 'n02093428-American_Staffordshire_terrier': 29,
 'n02093647-Bedlington_terrier': 30,
 'n02093754-Border_terr

## Test potential user input

In [87]:
sentence = "Hello: this is a sentence with punctuation! Doesn't it look great?"

In [88]:
nltk.word_tokenize(sentence)

['Hello',
 ':',
 'this',
 'is',
 'a',
 'sentence',
 'with',
 'punctuation',
 '!',
 'Does',
 "n't",
 'it',
 'look',
 'great',
 '?']

In [89]:
re.compile('\w+').findall(sentence)

['Hello',
 'this',
 'is',
 'a',
 'sentence',
 'with',
 'punctuation',
 'Doesn',
 't',
 'it',
 'look',
 'great']

In [90]:
nltk.tokenize.regexp.WordPunctTokenizer().tokenize(sentence)

['Hello',
 ':',
 'this',
 'is',
 'a',
 'sentence',
 'with',
 'punctuation',
 '!',
 'Doesn',
 "'",
 't',
 'it',
 'look',
 'great',
 '?']

In [91]:
wiki_w2v.similarity('big', 'small')

0.71795842658489328

In [92]:
wiki_w2v["n't"]

array([  1.57309994e-01,   3.95300001e-01,   6.35860026e-01,
        -1.09749997e+00,  -9.57679987e-01,  -1.38410004e-02,
        -1.98530003e-01,   2.54180014e-01,   3.67309988e-01,
        -1.74860001e-01,   2.76849985e-01,   3.19429994e-01,
         3.00779998e-01,   6.85309991e-02,  -1.59170002e-01,
        -2.19439998e-01,   6.40970021e-02,   8.47450018e-01,
        -6.19889975e-01,   5.41729987e-01,   2.79210001e-01,
         5.03830016e-01,   2.14600004e-02,  -2.05709994e-01,
         7.79939964e-02,   3.22290003e-01,  -4.91829991e-01,
        -1.14110005e+00,   2.33329996e-01,  -5.43579996e-01,
         9.22849998e-02,   8.68600011e-01,   6.91270009e-02,
         1.92289993e-01,   2.83740014e-01,   4.60139990e-01,
        -2.83199996e-01,   4.53839988e-01,   3.52090001e-01,
        -4.91730005e-01,  -1.47709996e-01,  -7.17670023e-02,
        -2.43550003e-01,  -6.30890012e-01,  -6.77969992e-01,
        -1.31640002e-01,   3.59739989e-01,  -7.52919972e-01,
         3.82039994e-02,

In [103]:
def tokenize_input(sentence):
    """
    Tokenizes a sentence, removes punctuation, and converts to lowercase letters.
    """
    l = nltk.word_tokenize(sentence)
    return [x.lower() for x in l if not re.fullmatch('[' + string.punctuation + ']+', x)]

In [98]:
word_vec = np.zeros(100)

In [104]:
words = tokenize_input(sentence)

In [99]:
for word in words:
    word_vec += wiki_w2v[word]

In [102]:
np.argsort(cosine_similarity(word_vec.reshape(1, -1), breed_mat))

array([[114,  63,  82,  84,  59,  68,  56,  65,  29,  93,  90,  67,  40,
         79,  80, 106,  57,  66,  17,  52,  13, 110, 109,  70,  37,  46,
         83, 108,  60, 113,  55,  50,  10,  28,  54,  32,  30,  96,  27,
         74,  64,  62,  45,   6,  23, 101,  76,   2,   5, 112, 105,  11,
         97, 111,  16,  73,  42,  14,  34,  61,  58,  51,  98, 103,  44,
          3,  99,  15,  94,   1,   0,  41,  22,   7,  21,  35,  33,  20,
         81,  89,  85,  12,  78,  69,  88,  77,  31,  87,   8,  49,   4,
         86,  75,  39,  43, 100,  48, 104,  18,  95,  25,  38,  47,  91,
         24,  72,  53,  19,  26,  36,  92,   9, 107, 102,  71]])

In [106]:
words = tokenize_input("A sentence that may contain a word you've not seen before: l'cie.")

In [112]:
def vectorize_words(model, words):
    """
    Take a list of words, and convert it into the sum of the word vectors
    for the model, ignoring out of vocabulary words
    """
    word_vec = np.zeros(len(model['you']))
    for word in words:
        try:
            word_vec += wiki_w2v[word]
        except:
            pass
    
    return word_vec

In [124]:
dog_desc = "playful, affectionate children"

In [125]:
dog_vec = vectorize_words(wiki_w2v, tokenize_input(dog_desc))

In [126]:
np.argsort(cosine_similarity(dog_vec.reshape(1, -1), breed_mat))

array([[114, 107,   1,  76,  50,  71,  36,   3,  45,   2,  42,  32, 106,
         58,  97,  75,  99,  47, 113,  46,  18,  68,   9,  38,  81,  15,
        104, 102,  37,  25,   6,  53,  26,  85,  19,  52,  34,  40,  43,
        111,  70,  74,  88,  65,  20,  16,  63,  48,   4,  23,  69, 110,
        103,  87,  78,   7,  39,   8,  73, 112, 100,   0,  80,  24,  67,
         86,  77,  66,  35,  14,  72, 101,  60,  31,  30,  21,  44,  33,
         59, 105,  49,  84,  27,  79,  11,  51,  22,  92,  62,  61,  96,
         41,  12,  94,  89,  95,  10,  93, 109,  54,  55,   5,  91,  64,
         83,  98, 108,  90,  13,  82,  57,  28,  29,  56,  17]])