In [15]:
import os

import gensim
import nltk
import numpy as np
import pandas as pd
import pymongo
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
#Get pretrained vectors for word2vec

!wget https://s3.amazonaws.com/mordecai-geo/GoogleNews-vectors-negative300.bin.gz

--2017-10-27 14:14:11--  https://s3.amazonaws.com/mordecai-geo/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.161.157
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.161.157|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/octet-stream]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2017-10-27 14:27:37 (1.95 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



## Load pretrained word2vec model

In [6]:
# Read the downloaded GoogleNews archive (binary word2vec format) into KeyedVectors.
gnews_w2v = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

In [8]:
# Analogy sanity check: which words are closest to (king + woman - man)?
gnews_w2v.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
)

[('queen', 0.7118192911148071),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431607246399),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321243286133),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.5181134343147278),
 ('sultan', 0.5098593235015869),
 ('monarchy', 0.5087411999702454)]

In [22]:
# Words most similar to 'apartment' according to the GoogleNews vectors.
gnews_w2v.most_similar(positive=['apartment'])

[('apartment_complex', 0.7893964052200317),
 ('townhouse', 0.7727404236793518),
 ('apartments', 0.7152601480484009),
 ('bedroom', 0.7045778632164001),
 ('Apartments', 0.6661036014556885),
 ('house', 0.6628996133804321),
 ('duplex', 0.6575543880462646),
 ('rooming_house', 0.6564425230026245),
 ('townhome', 0.6522176265716553),
 ('fourplex', 0.648613691329956)]

In [34]:
# cosine_similarity expects 2-D inputs, so lift each word vector to a single-row matrix.
cosine_similarity(
    np.atleast_2d(gnews_w2v['dog']),
    np.atleast_2d(gnews_w2v['small']),
)

array([[ 0.11135036]], dtype=float32)

## Load pretrained GloVe model

In [2]:
# Read the vectors from a plain-text file in word2vec format.
wiki_w2v = gensim.models.KeyedVectors.load_word2vec_format(
    './data/glove_w2v_small.txt',
    binary=False,
)

In [12]:
# cosine_similarity expects 2-D inputs, so lift each word vector to a single-row matrix.
cosine_similarity(
    np.atleast_2d(wiki_w2v['small']),
    np.atleast_2d(wiki_w2v['big']),
)

array([[ 0.70083666]], dtype=float32)

In [4]:
# Display the raw embedding vector stored for the word 'dog'.
wiki_w2v['dog']

array([ 0.11008   , -0.38780999, -0.57615   , -0.27713999,  0.70520997,
        0.53994   , -1.07860005, -0.40145999,  1.15040004, -0.56779999,
        0.0038977 ,  0.52877998,  0.64560997,  0.47262001,  0.48548999,
       -0.18407001,  0.18009999,  0.91396999, -1.19790006, -0.57779998,
       -0.37985   ,  0.33605999,  0.77200001,  0.75555003,  0.45506001,
       -1.76709998, -1.0503    ,  0.42566001,  0.41892999, -0.68326998,
        1.56729996,  0.27684999, -0.61707997,  0.64638001, -0.076996  ,
        0.37118   ,  0.13079999, -0.45137   ,  0.25398001, -0.74392003,
       -0.086199  ,  0.24067999, -0.64819002,  0.83548999,  1.25020003,
       -0.51379001,  0.04224   , -0.88117999,  0.71579999,  0.38519001], dtype=float32)

## Generate dog vectors

In [14]:
def generate_text_from_num(dogtime_html, synonyms, antonyms):
    """
    Turn the numeric trait ratings of a dogtime breed page into descriptive text.

    Every element with class "characteristic item-trigger-title" is treated as a
    trait title. Its rating is the integer after the last '-' in the second
    entry of the 'class' attribute of the element reached by two find_next()
    steps from the title.

    A rating above 3 emits the trait's synonym (rating - 3) times, a rating
    below 3 emits its antonym (3 - rating) times, and a rating of exactly 3
    emits nothing.

    Parameters
    ----------
    dogtime_html : BeautifulSoup object generated from the dogtime website
    synonyms : dict mapping trait title -> word(s) to emit for a high rating
    antonyms : dict mapping trait title -> word(s) to emit for a low rating

    Returns
    -------
    str
        The emitted words, each followed by a single space. A trait whose
        word is '' (or that is absent from the relevant dictionary)
        contributes nothing, so the result never contains runs of bare spaces.
    """
    neutral = 3

    # Trait title -> integer rating; a repeated title keeps its last rating.
    ratings = {}
    for title in dogtime_html.find_all(class_="characteristic item-trigger-title"):
        rating_classes = title.find_next().find_next()['class']
        ratings[title.text.strip()] = int(rating_classes[1].split('-')[-1])

    pieces = []
    for trait, rating in ratings.items():
        if rating == neutral:
            continue

        vocabulary = synonyms if rating > neutral else antonyms
        word = vocabulary.get(trait, '')
        if not word:
            # No word for this direction: skip it rather than padding the text
            # with whitespace or failing on an unmapped trait.
            continue

        # Repeat the word once per rating step away from neutral.
        pieces.append(abs(rating - neutral) * (word + ' '))

    return ''.join(pieces)

In [6]:
# Connect to MongoDB and select the `dogbreeds` database. The connection URI can
# be overridden with the DOGBREEDS_MONGO_URI environment variable; when that is
# unset, the original hard-coded server address is used.
mongo_uri = os.environ.get("DOGBREEDS_MONGO_URI", "mongodb://54.67.82.182/dogbreeds")
client = pymongo.MongoClient(mongo_uri)
db = client.dogbreeds

In [11]:
# Trait title -> word(s) standing for a high rating of that trait.
# An empty string means no word is assigned for the high end of the trait.
trait_synonyms = {
    'Adaptability': 'adaptable',
    'Adapts Well to Apartment Living': 'apartment',
    'Affectionate with Family': 'cuddly',
    'All Around Friendliness': 'friendly',
    'Amount Of Shedding': '',
    'Dog Friendly': 'other dogs',
    'Drooling Potential': '',
    'Easy To Groom': 'grooming',
    'Easy To Train': 'trainable',
    'Energy Level': 'energetic',
    'Exercise Needs': 'active',
    'Friendly Toward Strangers': 'friendly',
    'General Health': 'healthy',
    'Good For Novice Owners': 'novice',
    'Health Grooming': 'healthy',
    'Incredibly Kid Friendly Dogs': 'children',
    'Intelligence': 'intelligent',
    'Intensity': '',
    'Potential For Mouthiness': 'fetch',
    'Potential For Playfulness': 'playful',
    'Potential For Weight Gain': '',
    'Prey Drive': 'hunting',
    'Sensitivity Level': '',
    'Size': 'big',
    'Tendency To Bark Or Howl': '',
    'Tolerates Being Alone': 'alone',
    'Tolerates Cold Weather': 'cold',
    'Tolerates Hot Weather': 'hot',
    'Trainability': 'trainable',
    'Wanderlust Potential': '',
}

In [13]:
# Trait title -> word(s) standing for a low rating of that trait.
# An empty string means no word is assigned for the low end of the trait.
trait_antonyms = {
    'Adaptability': '',
    'Adapts Well to Apartment Living': 'yard',
    'Affectionate with Family': '',
    'All Around Friendliness': '',
    'Amount Of Shedding': 'clean',
    'Dog Friendly': 'protective',
    'Drooling Potential': 'clean',
    'Easy To Groom': '',
    'Easy To Train': '',
    'Energy Level': 'calm',
    'Exercise Needs': 'lazy',
    'Friendly Toward Strangers': 'security',
    'General Health': '',
    'Good For Novice Owners': '',
    'Health Grooming': '',
    'Incredibly Kid Friendly Dogs': '',
    'Intelligence': '',
    'Intensity': 'relaxed',
    'Potential For Mouthiness': 'safe',
    'Potential For Playfulness': 'aloof',
    'Potential For Weight Gain': 'thin',
    'Prey Drive': '',
    'Sensitivity Level': 'adaptable',
    'Size': 'small',
    'Tendency To Bark Or Howl': 'quiet',
    'Tolerates Being Alone': '',
    'Tolerates Cold Weather': '',
    'Tolerates Hot Weather': '',
    'Trainability': '',
    'Wanderlust Potential': 'homebody',
}