In [86]:
import glob
import os
import re
import string

import numpy as np
import pandas as pd
import multiprocessing

from gensim.models import word2vec, Word2Vec
from gensim.models.phrases import Phrases, Phraser
import spacy

import logging  # Setting up the loggings to monitor gensim

pd.set_option('display.max_rows', 4000)
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [235]:
N_CORES = multiprocessing.cpu_count()

In [218]:
nlp = spacy.load('en', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 0:
        return ' '.join(txt)

In [234]:
# cleaning(nlp("Yeah"))
cleaned = cleaning(nlp("I couldn't hurt, because I thought I should not."))

In [232]:
cleaned

'-PRON- not hurt , -PRON- think -PRON- .'

## Wikiart

In [247]:
LABEL_SENTENCES_FILE = "../data/wikiart/label_sentences.txt"

In [248]:
## Read lines
with open(LABEL_SENTENCES_FILE, 'r') as f:
    lines = f.readlines()
all_labels = list(map(lambda s: s.strip(), lines))

In [250]:
all_labels

['still life flower painting post impressionism flowers plants',
 'nude painting nu expressionism female nude',
 'portrait cubism male portraits',
 'portrait cubism male portraits',
 'self portrait cubism male portraits famous people',
 'nude painting nu cubism female nude',
 'design constructivism designs sketches',
 'design constructivism designs sketches',
 'nude painting nu post impressionism female nude',
 'portrait post impressionism female portraits',
 'flower painting post impressionism flowers plants',
 'still life post impressionism food beverages',
 'vanitas expressionism books letters skeletons skulls',
 'flower painting post impressionism flowers plants',
 'design cubism designs sketches',
 'interior contemporary realism furniture decoration',
 'landscape contemporary realism countryside spain',
 'landscape contemporary realism countryside spain',
 'vanitas contemporary realism skeletons skulls',
 'marina contemporary realism seas oceans spain',
 'cityscape contemporary re

In [272]:
## Stemming and Lemmatization
labels_dict = {}
labels_txt = [cleaning(doc) for doc in nlp.pipe(all_labels, batch_size=5000, n_threads=-1)]
for i, filename in enumerate(filenames):
    labels_dict[filename] = labels_txt[i]
    
label_corpus = list(map(lambda s:str(s).split(), labels_txt))

In [242]:
labels_dict

{'0000001.jpeg': 'life flower paint post - impressionism flower plant',
 '0000002.jpeg': 'nude painting nu expressionism female nude',
 '0000003.jpeg': 'portrait cubism male portrait',
 '0000004.jpeg': 'portrait cubism male portrait',
 '0000005.jpeg': 'self portrait cubism male portrait famous people',
 '0000006.jpeg': 'nude painting nu cubism female nude',
 '0000007.jpeg': 'design constructivism design sketch',
 '0000008.jpeg': 'design constructivism design sketch',
 '0000009.jpeg': 'nude painting nu post - impressionism female nude',
 '0000010.jpeg': 'portrait post - impressionism female portrait',
 '0000011.jpeg': 'flower painting post - impressionism flower plant',
 '0000012.jpeg': 'life post - impressionism food beverage',
 '0000013.jpeg': 'vanita expressionism book letters skeleton skull',
 '0000014.jpeg': 'flower painting post - impressionism flower plant',
 '0000015.jpeg': 'design cubism design sketch',
 '0000016.jpeg': 'interior contemporary realism furniture decoration',
 '00

In [82]:
labels_dict

{'0000001.jpeg': 'life flower painting post impressionism flower plant',
 '0000002.jpeg': 'nude painting nu expressionism female nude',
 '0000003.jpeg': 'portrait cubism male portrait',
 '0000004.jpeg': 'portrait cubism male portrait',
 '0000005.jpeg': 'self portrait cubism male portrait famous people',
 '0000006.jpeg': 'nude painting nu cubism female nude',
 '0000007.jpeg': 'design constructivism design sketch',
 '0000008.jpeg': 'design constructivism design sketch',
 '0000009.jpeg': 'nude painting nu post impressionism female nude',
 '0000010.jpeg': 'portrait post impressionism female portrait',
 '0000011.jpeg': 'flower painting post impressionism flower plant',
 '0000012.jpeg': 'life post impressionism food beverage',
 '0000013.jpeg': 'vanita expressionism book letters skeleton skull',
 '0000014.jpeg': 'flower painting post impressionism flower plant',
 '0000015.jpeg': 'design cubism design sketch',
 '0000016.jpeg': 'interior contemporary realism furniture decoration',
 '0000017.jpe

In [271]:
## Phraser (Find common phrases for low memory consumption at training phase)
## NOT REALLY GOOD
# sent = [str(row).split() for row in all_labels]
phrases = Phrases(label_corpus, min_count=30, progress_per=10000)
bigram = Phraser(phrases)
label_corpus_alt = bigram[label_corpus]
print(len(label_corpus_alt.obj.phrasegrams), "phrases found.")

314 phrases found.


In [192]:
model = Word2Vec(min_count=1,
                 window=2,
                 size=1000,
                 sample=6e-5,
                 alpha=0.025,
                 min_alpha=0.0001,
                 negative=50,
                 workers=N_CORES - 1
                )

model.build_vocab(label_corpus,
                  corpus_file=None,
                  update=False,
                  progress_per=1000,
                  keep_raw_vocab=False,
                  trim_rule=None,)

print("Vocabulary shape:", model.wv.vectors.shape)
print("Training..")

model.train(label_corpus,
            corpus_file=None,
            total_examples=model.corpus_count,
            total_words=None,
            epochs=50,
            start_alpha=None,
            end_alpha=None,
            word_count=0,
            queue_factor=2,
            report_delay=1.0,
            compute_loss=False,
            callbacks=(),
            )

Vocabulary shape: (4410, 1000)
Training..


(5348953, 23817100)

In [262]:
keyword = 'cat'
similar_words = model.wv.most_similar(keyword, topn=100)

In [263]:
similar_words

[('bury', 0.7519315481185913),
 ('mouse', 0.7112663388252258),
 ('kitchen', 0.6628247499465942),
 ('dog', 0.6593775749206543),
 ('monkey', 0.6338068246841431),
 ('commonplac', 0.6147680282592773),
 ('dining', 0.598753809928894),
 ('rabbit', 0.5985159873962402),
 ('intimism', 0.5849064588546753),
 ('regionalism', 0.5844130516052246),
 ('decoration', 0.5794298648834229),
 ('fox', 0.5745127201080322),
 ('furniture', 0.5736602544784546),
 ('sweden', 0.5617462992668152),
 ('rooster', 0.5352712869644165),
 ('mealtime', 0.5310050249099731),
 ('fireplace', 0.5308372974395752),
 ('newspaper', 0.5274323225021362),
 ('kitsch', 0.5255146026611328),
 ('plants', 0.5254404544830322),
 ('pigeon', 0.5195714235305786),
 ('read', 0.516494631767273),
 ('home', 0.5158487558364868),
 ('parrot', 0.514686107635498),
 ('rat', 0.5134881734848022),
 ('mule', 0.5112684965133667),
 ('stork', 0.5105884075164795),
 ('dov', 0.510399341583252),
 ('tubism', 0.5103386044502258),
 ('polynesia', 0.508610188961029),
 ('cou

In [195]:
model.save("../models/wikiart_word2vec.model")

In [209]:
model[keyword]

  """Entry point for launching an IPython kernel.


array([ 3.91526431e-01, -1.59340903e-01,  1.45600587e-01,  1.11114001e-02,
       -8.48038569e-02,  2.29631484e-01, -3.10617778e-02,  2.82474726e-01,
       -1.49108628e-02, -1.03743292e-01,  1.48680598e-01,  1.47360057e-01,
       -1.94631405e-02, -1.69294953e-01,  1.53501093e-01, -2.82825589e-01,
        1.54670328e-01,  7.44245648e-02,  7.65438005e-02, -4.79385644e-01,
        2.05496132e-01,  3.22900802e-01, -1.69366688e-01,  4.99262549e-02,
        1.81401968e-01,  1.20202027e-01, -2.04425752e-01, -1.61321491e-01,
       -1.95385572e-02, -2.53311783e-01, -7.06729442e-02,  4.97344472e-02,
        1.93979293e-01, -1.51906088e-01, -9.54204127e-02, -7.29699358e-02,
        6.28556535e-02, -1.03304848e-01, -3.03664748e-02, -2.61823267e-01,
        1.60339102e-02, -4.65798751e-02, -1.17743045e-01, -1.54017005e-02,
       -2.99165905e-01,  1.03178598e-01, -2.93966830e-01, -9.83623117e-02,
       -1.79889664e-01,  5.63098630e-03, -1.34482324e-01,  2.34982967e-01,
        1.17922001e-01,  

In [None]:
# model = word2vec.Word2Vec(tokens,
#                           corpus_file=None,
#                           size=1000,
#                           alpha=0.025,
#                           window=1,
#                           min_count=100,
#                           max_vocab_size=None,
#                           sample=0.001,
#                           seed=1,
#                           workers=3,
#                           min_alpha=0.0001,
#                           sg=0, ## skip-gram 0,1
#                           hs=1, ## hierarchical softmax 0,1 
#                           negative=5,
#                           ns_exponent=0.75,
#                           cbow_mean=1,
#                           iter=5,
#                           null_word=0,
#                           trim_rule=None,
#                           sorted_vocab=1,
#                           batch_words=10000,
#                           compute_loss=False,
#                           callbacks=(),
#                           max_final_vocab=None)

In [286]:
s = "portrait realism famous people children 3 portraits i b kustodieva"
s = re.sub("\b[a-zA-Z0-9]\b", "", s)
s = re.sub("   ", " ", s)
re.sub("  ", " ", s)

'portrait realism famous people children 3 portraits i b kustodieva'