In [1]:
# Dataset keywords to process. Each keyword selects "<keyword>_w2v.csv" in the
# processed-data folder and names its own model / embedding-log subfolder.
keyword_list = [
    'BASKET_COMM',
    'BASKET_SUM_COMM',
    'BASKET',
]

In [2]:
import os
import ast
import math
import subprocess
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorboard.plugins import projector
from gensim.models.word2vec import Word2Vec

# Root of the on-disk dataset, resolved against the current working directory.
# The trailing slash is intentional: later code builds file names by plain
# string concatenation.
data_root = os.path.abspath(os.getcwd()) + '/data/'

# raw_data_path = data_root + 'raw/'

# Folder holding the pre-processed CSV inputs. `data_root` already ends with a
# slash, so no leading slash is added here (avoids a redundant "//").
processed_data_path = data_root + 'processed/'

# Output folder for the trained models and the TensorBoard embedding logs.
model_path = './models/'

if not os.path.isdir(model_path):
    os.makedirs(model_path, exist_ok=True)
    print("made folder:", model_path)

# Derived from model_path so the two locations cannot drift apart;
# exist_ok makes re-running the cell a no-op.
os.makedirs(model_path + 'embedding_log/', exist_ok=True)

# For every keyword:
#   1. train a CBOW and a skip-gram Word2Vec model on the keyword's corpus and
#      save both under "<model_path><keyword>/";
#   2. export the CBOW vectors of words seen more than 4 times as a TensorBoard
#      projector log (checkpoint + metadata.tsv + projector config) under
#      "<model_path>embedding_log/<keyword>_embedding_log/".
# A keyword whose training or saving fails is reported and skipped.
for keyword in keyword_list:

    # ---- Parameter definition ----

    csv_path = processed_data_path + keyword + '.csv'  # not read in this loop
    w2v_csv_path = processed_data_path + keyword + '_w2v' + '.csv'
    model_name = keyword + 'w2v_model_all.md'

    # ---- Train & save W2V models ----

    pd_corpus = pd.read_csv(w2v_csv_path)
    # Shuffle the rows, then drop duplicate entries (still in string form).
    corpus = pd.DataFrame(pd_corpus.words.sample(frac=1).unique(),
                          columns=['words'])

    # Each cell is parsed as a Python literal (presumably a list of tokens --
    # verify against the CSV producer). literal_eval does not execute code.
    corpus.words = corpus.words.apply(ast.literal_eval)

    model_keyword_path = model_path + keyword + '/'

    if not os.path.isdir(model_keyword_path):
        os.makedirs(model_keyword_path, exist_ok=True)
        print("made folder:", model_keyword_path)

    model_path_cbow = model_keyword_path + model_name + '.' + 'cbow'
    model_path_sg = model_keyword_path + model_name + '.' + 'sg'

    # Best-effort training: a failing keyword is skipped, but the reason is
    # reported instead of being silently swallowed. `Exception` (not a bare
    # except) lets KeyboardInterrupt / SystemExit propagate.
    try:
        model = Word2Vec(corpus.words)
        model.save(model_path_cbow)
    except Exception as exc:
        print("skipping {}: CBOW training/saving failed: {!r}".format(keyword, exc))
        continue

    try:
        model = Word2Vec(corpus.words, sg=1)
        model.save(model_path_sg)
    except Exception as exc:
        print("skipping {}: skip-gram training/saving failed: {!r}".format(keyword, exc))
        continue

    # ---- Embedding log ----

    model_dir = model_path

    log_dir = model_dir + 'embedding_log/' + keyword + '_embedding_log' + '/'
    os.makedirs(log_dir, exist_ok=True)

    metadata_name = 'metadata.tsv'

    word2vec = Word2Vec.load(model_path_cbow)

    # Collect every word whose corpus count exceeds the limit, together with
    # its count and its vector.
    wv_dict = {'words': [], 'counts': [], 'wv': []}

    for word, vocab_entry in word2vec.wv.vocab.items():
        if vocab_entry.count > 4:                      # Word count limit
            wv_dict['words'].append(word)
            wv_dict['counts'].append(vocab_entry.count)
            # Vectors live on the KeyedVectors object; indexing the model
            # itself is deprecated.
            wv_dict['wv'].append(word2vec.wv[word])

    pd_wv = pd.DataFrame(wv_dict)

    # Write labels and counts, and fill the embedding matrix row by row so
    # that row i of the matrix matches data line i of the metadata file.
    embedding_matrix = np.empty((len(wv_dict['words']), word2vec.vector_size),
                                dtype=np.float32)
    # Explicit UTF-8 so non-ASCII words do not depend on the platform default.
    with open(os.path.join(log_dir, metadata_name), 'w', encoding='utf-8') as f:
        f.write('word' + '\t' + 'vol_lv' + '\t' + 'count' + '\n')
        rows = zip(wv_dict['words'], wv_dict['counts'], wv_dict['wv'])
        for i, (word, count, vector) in enumerate(rows):
            # vol_lv is the rounded base-10 logarithm of the word count.
            f.write("{}\t{}\t{}\n".format(word, round(math.log(count, 10)), count))
            embedding_matrix[i] = vector

    embeddings_vectors = embedding_matrix
    tf.compat.v1.disable_eager_execution()
    # Start from an empty graph for every keyword. Otherwise each iteration
    # adds another variable to the same default graph: the new one is renamed
    # ("word_embeddings_1", ...) and the checkpoint would also contain the
    # embeddings of all previously processed keywords.
    tf.compat.v1.reset_default_graph()

    # Variable holding the embedding matrix.
    emb = tf.compat.v1.Variable(embeddings_vectors, name='word_embeddings')

    # Op that initializes the variable.
    init_op = tf.compat.v1.global_variables_initializer()

    # Saver restricted to the embedding variable of this keyword.
    saver = tf.compat.v1.train.Saver([emb])

    with tf.compat.v1.Session() as sess:
        sess.run(init_op)
        # Save the variable to disk.
        save_path = saver.save(sess, os.path.join(log_dir, "model.ckpt"))
        print("Model saved in path: %s" % save_path)

    # Set up the projector config (one can add multiple embeddings).
    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = emb.name
    # Link this tensor to its metadata file (path relative to log_dir).
    embedding.metadata_path = metadata_name

    # Write the config file that TensorBoard reads at startup. The FileWriter
    # is kept so an events file is still created in log_dir, and is closed
    # explicitly so one writer is not leaked per keyword.
    writer = tf.compat.v1.summary.FileWriter(log_dir)
    try:
        projector.visualize_embeddings(writer, config)
    finally:
        writer.close()

made folder: ./models/BASKET_COMM/




Model saved in path: ./models/embedding_log/BASKET_COMM_embedding_log/model.ckpt
made folder: ./models/BASKET_SUM_COMM/




Model saved in path: ./models/embedding_log/BASKET_SUM_COMM_embedding_log/model.ckpt
made folder: ./models/BASKET/




Model saved in path: ./models/embedding_log/BASKET_embedding_log/model.ckpt
