In [3]:
import glob
import os
import re
import string

import numpy as np
import pandas as pd
import multiprocessing

from gensim.models import word2vec, Word2Vec
from gensim.models.phrases import Phrases, Phraser
import spacy

import logging  # Setting up the loggings to monitor gensim

pd.set_option('display.max_rows', 4000)
%load_ext autoreload
%autoreload 2

In [4]:
N_CORES = multiprocessing.cpu_count() # Count the number of cores in a computer

In [5]:
nlp = spacy.load('en', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)

## Deviantart

In [6]:
LABELS_FILE = "../data/deviantart/labels.txt"

In [34]:
## Read lines
with open(LABELS_FILE, 'r') as f:
    lines = f.readlines()
lines = list(map(lambda s: s.strip(), lines))
    
## Set filename-labels dict and collect all unique labels
filenames = []
all_labels_org = []
all_labels = []
for line in lines:
    row = line.strip().split(',')
    filename = row[0]
    all_labels_org.append(row[1:])
    labels = ' '.join(row[1:])
    labels = re.sub("[^A-Za-z0-9']+", ' ', str(labels)).lower()
    filenames.append(filename)
    all_labels.append(labels)

In [35]:
all_labels

['ink inktober',
 'ink day inktober',
 'ink inktober drawing',
 'ink inktober drawing',
 'art tiger digital',
 'lion painting',
 'lion',
 'portrait cat loaf art pet animal',
 'starry cat art painting digital',
 'weasel starry art stoat animal',
 'snow leopard painting',
 'study lion',
 'art tiger digital',
 'starry space tiger nebula sky',
 'space tiger art nebula stars sunset',
 'big cheetah alien cat',
 'bird rain art secretary digital stars',
 'starry drawing bird eagle art painting harpy stars',
 'starry rain animal digital nebula ibex stars',
 'big cat art animal digital lions',
 'art leopard digital',
 'big experimental cat tiger art digital',
 'art tiger digital',
 'snow leopard',
 'rain wash snow leopard away',
 'starry night animal digital leopard sky',
 'cat rain art visitor stars',
 'from starry cat friend visit old cats stars',
 'starry cloudy clouded art leopard skies',
 'big bobcat art cat',
 'art tiger digital',
 'art tiger digital',
 'water tiger digital swimming',
 'pe

In [36]:
labels_dict = {}
labels_txt = [cleaning(doc) for doc in nlp.pipe(all_labels, batch_size=5000, n_threads=-1)]
for i, filename in enumerate(filenames):
    labels_dict[filename] = labels_txt[i]
    
label_corpus = list(map(lambda s:str(s).split(), list(labels_dict.values())))

In [37]:
labels_dict

{'0000001.jpeg': None,
 '0000002.jpeg': 'ink day inktober',
 '0000003.jpeg': 'ink inktober drawing',
 '0000004.jpeg': 'ink inktober drawing',
 '0000005.jpeg': 'art tiger digital',
 '0000006.jpeg': None,
 '0000007.jpeg': None,
 '0000008.jpeg': 'portrait cat loaf art pet animal',
 '0000009.jpeg': 'starry cat art painting digital',
 '0000010.jpeg': 'weasel starry art stoat animal',
 '0000011.jpeg': 'snow leopard painting',
 '0000012.jpeg': None,
 '0000013.jpeg': 'art tiger digital',
 '0000014.jpeg': 'starry space tiger nebula sky',
 '0000015.jpeg': 'space tiger art nebula star sunset',
 '0000016.jpeg': 'big cheetah alien cat',
 '0000017.jpeg': 'bird rain art secretary digital star',
 '0000018.jpeg': 'starry draw bird eagle art paint harpy star',
 '0000019.jpeg': 'starry rain animal digital nebula ibex star',
 '0000020.jpeg': 'big cat art animal digital lion',
 '0000021.jpeg': 'art leopard digital',
 '0000022.jpeg': 'big experimental cat tiger art digital',
 '0000023.jpeg': 'art tiger digi

In [38]:
model = Word2Vec(min_count=1,
                 window=2,
                 size=1000,
                 sample=6e-5,
                 alpha=0.025,
                 min_alpha=0.0001,
                 negative=50,
                 workers=N_CORES - 1
                )

model.build_vocab(label_corpus,
                  corpus_file=None,
                  update=False,
                  progress_per=1000,
                  keep_raw_vocab=False,
                  trim_rule=None,)

print("Vocabulary shape:", model.wv.vectors.shape)
print("Training..")

model.train(label_corpus,
            corpus_file=None,
            total_examples=model.corpus_count,
            total_words=None,
            epochs=50,
            start_alpha=None,
            end_alpha=None,
            word_count=0,
            queue_factor=2,
            report_delay=1.0,
            compute_loss=False,
            callbacks=(),
            )

Vocabulary shape: (8185, 1000)
Training..


(4116910, 9363400)

In [48]:
keyword = 'boy'
similar_words = model.wv.most_similar(keyword, topn=20)

In [49]:
similar_words

[('bitch', 0.81988924741745),
 ('balloon', 0.8144482970237732),
 ('rotten', 0.8011647462844849),
 ('bang', 0.7864789962768555),
 ('bulb', 0.7816920876502991),
 ('wtf', 0.7747443318367004),
 ('androgynous', 0.769142746925354),
 ('cousin', 0.7688477039337158),
 ('sibling', 0.7632150650024414),
 ('pic', 0.7591559290885925),
 ('apprentice', 0.7560466527938843),
 ('misfit', 0.7552689909934998),
 ('gameplay', 0.7398802042007446),
 ('young', 0.7344935536384583),
 ('bangtan', 0.7337987422943115),
 ('phone', 0.7328349947929382),
 ('face', 0.7260253429412842),
 ('babysitter', 0.7260212898254395),
 ('carry', 0.7216712236404419),
 ('sticker', 0.7211924195289612)]

In [33]:
similar_words

[('bellied', 0.914106547832489),
 ('curvy', 0.8198636174201965),
 ('twitter', 0.8148061633110046),
 ('vixen', 0.7992647290229797),
 ('affinity', 0.7986941337585449),
 ('breast', 0.7887340784072876),
 ('tit', 0.7833642959594727),
 ('thicc', 0.7631590366363525),
 ('lard', 0.760961651802063),
 ('luka', 0.7598403096199036)]

In [43]:
model.save("../models/deviantart_word2vec.model")

In [None]:
# model = word2vec.Word2Vec(tokens,
#                           corpus_file=None,
#                           size=1000,
#                           alpha=0.025,
#                           window=1,
#                           min_count=100,
#                           max_vocab_size=None,
#                           sample=0.001,
#                           seed=1,
#                           workers=3,
#                           min_alpha=0.0001,
#                           sg=0, ## skip-gram 0,1
#                           hs=1, ## hierarchical softmax 0,1 
#                           negative=5,
#                           ns_exponent=0.75,
#                           cbow_mean=1,
#                           iter=5,
#                           null_word=0,
#                           trim_rule=None,
#                           sorted_vocab=1,
#                           batch_words=10000,
#                           compute_loss=False,
#                           callbacks=(),
#                           max_final_vocab=None)