In [1]:
## import packages
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import numpy as np
import pandas as pd
import tensorflow as tf
import collections
import random
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
import nltk
import itertools

In [2]:
## import data 
train_raw = pd.read_csv('/home/bsong/Python_Stuff/Data/Kaggle_Mercari/train.tsv',delimiter= '\t')
#train_raw = train_raw.iloc[0:10000,] # just a bit
# standardize price here because may as well
normalized_price = np.log1p(train_raw['price'].values)
train_raw['price'] = (normalized_price - np.mean(normalized_price))/ np.std(normalized_price) 

In [39]:
## define functions to use

######## General functions

def rmsle(h, y): 
    log_h = np.log(h+1) # the +1 is to prevent 0 
    log_y = np.log(y+1) # writing these to prevent memoryerror
    sq_logs = np.square(log_h - log_y)
    score_ = np.sqrt(np.mean(sq_logs))
    return score_

######## Basic text manipulation functions (some specific to Mercari Kaggle Competition) 

def split_cat(text): # this one is to reduce the categoriy_name into three subcategories
    try: return text.split("/")
    except: return ("No Label", "No Label", "No Label")

def handle_missing_inplace(dataset):  # this one is to put placeholders in place of missing values (NaN)
    dataset['cat1'].fillna(value='No Label', inplace=True)
    dataset['cat2'].fillna(value='No Label', inplace=True)
    dataset['cat3'].fillna(value='No Label', inplace=True)
    dataset['brand_name'].fillna(value='missing', inplace=True)
    dataset['item_description'].fillna(value='No description yet', inplace=True)
     
def build_dictionary(words, n_words): # dictionary that maps words to indices. this function should be modular.
    #input is [['a','b','c'],['a','b','c']]
    """Process raw inputs into a dataset."""
    count = [['UNK', -1]] # word indexed as "unknown" if not one of the top #n_words (popular/common) words
    count.extend(Counter(words).most_common(n_words - 1)) # most_common returns the top (n_words-1) ['word',count]
    dictionary = dict()
    for word, _ in count: # the 'word, _' is writted because count is a list of list(2), so defining 'word' as the first term per
        dictionary[word] = len(dictionary) # {'word': some number incrementing by one. fyi, no repeats because from most_common)}
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys())) # {ind. : 'word'} I guess for looking up if needed?
    return dictionary, reversed_dictionary

def clean_and_tokenize(dataset_col): # input is a column of strings
    pattern = '\w+' # does this only keep words
    list_of_lists = list()
    tokenizer = RegexpTokenizer(pattern)
    for word in dataset_col:
        list_of_words = list()
        tokenized = tokenizer.tokenize(word)
        for i in tokenized:
            if (len(i) > 2 ): #ignore words of length 2 or less
                list_of_words.append(i.lower()) # append all words to one list
        list_of_lists.append(list_of_words)
    list_as_series = pd.Series(list_of_lists)
    return list_as_series

def convert_word_to_ind(dataset_col,dictionary): # input the pandas column of texts and dictionary. This should be modular
    # each input should be a string of cleaned words tokenized into a list (ex. ['this', 'is', 'an', 'item'])
    # dictionary should be the dictionary obtained from build_dictionary
    list_of_lists = []
    unk_count = 0 # total 'unknown' words counted
    for word_or_words in dataset_col: # words is the list of all words
        list_of_inds = []
        for word in word_or_words:
            if word in dictionary:
                index = np.int(dictionary[word]) # dictionary contains top words, so if in, it gets an index
            else:
                index = 0  #  or dictionary['UNK']? can figure out later
                unk_count += 1
            list_of_inds.append(index)
        list_of_lists.append(list_of_inds)

    # make list_of_lists into something that can be put into pd.DataFrame
    #list_as_series = pd.Series(list_of_lists)
    list_as_series = np.array(list_of_lists)
    return list_as_series, unk_count

def pad_word_indices(col_of_indices, pad_length): # col_of_indices can be a pd series. 
    temp_series = [] # append vectors into here
    for list_inds in col_of_indices:
        len_list = len(list_inds)
        if len_list >= pad_length:
            temp_series.append(np.array(list_inds[(len_list-pad_length):]))
        else:
            padded_vec = [0]*(pad_length-len_list)
            padded_vec.extend(list_inds)
            temp_series.append(np.array(padded_vec))
    return temp_series

def convert_word_to_padded(dataset_col,dictionary,pad_length): # input the pandas column of texts and dictionary. This should be modular
    # each input should be a string of cleaned words tokenized into a list (ex. ['this', 'is', 'an', 'item'])
    # dictionary should be the dictionary obtained from build_dictionary
    # use this function when you know how long you want your pad_length
    #   - otherwise, use convert_word_to_ind, and pad_word_indices
    #   - eventually, will look into cleaning these three functions up.
    list_of_lists = []
    unk_count = 0 # total 'unknown' words counted
    for word_or_words in dataset_col: # words is the list of all words
        list_of_inds = []
        count_inds = 0
        for word in word_or_words:
            if word in dictionary:
                index = np.int(dictionary[word]) # dictionary contains top words, so if in, it gets an index
            else:
                index = 0  #  or dictionary['UNK']? can figure out later
                unk_count += 1
            count_inds +=1
            list_of_inds.append(index) 
        if count_inds >= pad_length:
            asdf = list_of_inds[(count_inds-pad_length):]
        else: 
            asdf = [0]*(pad_length-count_inds)
            asdf.extend(list_of_inds)
        temp = np.array(asdf)
        list_of_lists.append(temp)
    list_as_series = np.array(list_of_lists)
    return list_as_series, unk_count

######## Word Embedding (this is after strings are transformed into vectors of indices)

# generate batch data (for feeding into word embedding)
# used http://adventuresinmachinelearning.com/word2vec-tutorial-tensorflow/ for reference
def generate_batch(data, batch_size, num_skips): 
    # data should be [[3,7,9],[7,4,5,9],...] kinda format
    # num_skips configures number of context words to draw. skip_window defines size of window to draw context words from
    assert batch_size % num_skips == 0 # if batch_size was 10, and num_skips was 3, then [cat,cat,cat,sat,sat,sat,...] wont equal
    batch = np.ndarray(shape=(batch_size), dtype=np.int32) # initialize batch variable (input word go in here)
    context = np.ndarray(shape=(batch_size, 1), dtype=np.int32) # initialize context variable
    counter = 0
    rand_dat_ind = random.sample(range(0,len(data)-1),int(batch_size/num_skips))
    for i in data[rand_dat_ind]:
        while len(i) <= num_skips:
            rnd_again = random.randint(0,len(data)-1)
            i = data[rnd_again]
        target = random.randint(0,len(i)-1) 
        targets_to_avoid = [target] # avoid this index when selecting rando words
        for j in range(num_skips):
            while target in targets_to_avoid: # this is to choose an index that isnt the index of the batch word
                target = random.randint(0, len(i)-1) # target is a context word
            targets_to_avoid.append(target) # so next time, this loop won't select this context word again 
            batch[counter] = i[targets_to_avoid[0]]  # this is the input word (same word repeated i*num_skips+j times)
            context[counter, 0] = i[targets_to_avoid[j+1]]  # these are the context words to the batch word
            counter += 1
    return batch, context # batch is input, context is target variable(s)

def generate_batch_general(x, y, batch_size):
    # this is to generate batches for word2vec comparing against numeric values 
    # in this case, 'brand_name' and cat1/2/3 are compared against 'price'
    rand_dat_ind = random.sample(range(0,len(data)-1),int(batch_size))
    return x[rand_dat_ind], y[rand_dat_ind]
    
    
######## word vector RNN # i dont think im using these yet lol 1/5/2018
# used http://adventuresinmachinelearning.com/recurrent-neural-networks-lstm-tutorial-tensorflow/ for reference
def batch_producer(raw_data, batch_size, num_steps): # produces input/target batches for word vector rnn
    raw_data = tf.convert_to_tensor(raw_data, name="raw_data", dtype=tf.int32) # make tensor out of csv

    data_len = tf.size(raw_data)
    batch_len = data_len // batch_size # determines number of equally sized batches available from input data
    data = tf.reshape(raw_data[0: batch_size * batch_len], # reshape to contain all batches (and exclude the remaining data)  
                      [batch_size, batch_len])

    epoch_size = (batch_len - 1) // num_steps # number of iterations in each epoch

    i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue() # extract asynchronous batches of data 
    x = data[:, i * num_steps:(i + 1) * num_steps] # x may be the input variaables
    x.set_shape([batch_size, num_steps]) # shape for feeding
    y = data[:, i * num_steps + 1: (i + 1) * num_steps + 1] # y would then be the target variable based on x (in descriptions)
    y.set_shape([batch_size, num_steps])
    return x, y

class Input(object): # class for utilizing batch_producer (neater with class)
    def __init__(self, batch_size, num_steps, data):
        self.batch_size = batch_size
        self.num_steps = num_steps
        self.epoch_size = ((len(data) // batch_size) - 1) // num_steps
        self.input_data, self.targets = batch_producer(data, batch_size, num_steps)
        
class Model(object): # rnn + LSTM model 
    def __init__(self, input, is_training, hidden_size, vocab_size, num_layers,
                 dropout=0.5, init_scale=0.05):
        self.is_training = is_training
        self.input_obj = input
        self.batch_size = input.batch_size
        self.num_steps = input.num_steps

In [4]:
## clean "category_name" and make numeric indicies for one-worded features (brand_name, cat1/2/3)

train_raw['cat1'],train_raw['cat2'],train_raw['cat3'] = \
zip(*train_raw['category_name'].apply(lambda x: split_cat(x))) # split the categories into three new columns
train_raw.drop('category_name',axis = 1, inplace = True) # remove the column that isn't needed anymore

handle_missing_inplace(train_raw) # replaces NaN with a string placeholder

#le = LabelEncoder() # use this to change categorical names into index numbers (0 1 2 3 or something)
#train_raw.brand_name = le.fit_transform(train_raw.brand_name)
#train_raw.cat1 = le.fit_transform(train_raw.cat1)
#train_raw.cat2 = le.fit_transform(train_raw.cat2)
#train_raw.cat3 = le.fit_transform(train_raw.cat3)

# maybe look into exporting this updated train_raw cuz LabelEncoder takes way too long

In [5]:
## convert name and item_desc to indices, then configure a bit more

all_name_desc = np.hstack((train_raw['name'],train_raw['item_description'])) # get all dem words
all_name_desc = clean_and_tokenize(all_name_desc)
all_name_desc = [item for sublist in all_name_desc for item in sublist]
train_raw['name'] = clean_and_tokenize(train_raw['name'])
train_raw['item_description'] = clean_and_tokenize(train_raw['item_description'])

vocabulary_size = 100000 # keeping 100000 words in the dictionary. can adjust later. will use variable elsewhere
word2vec_dict, reverse_dict = build_dictionary(all_name_desc,vocabulary_size) 
train_raw['name_inds'], count_unk_name = convert_word_to_ind(train_raw['name'],word2vec_dict) 
train_raw['item_desc_inds'], count_unk_item_desc = convert_word_to_ind(train_raw['item_description'], word2vec_dict)  

# delete 'name' and 'item_description' if not needed anymore
#train_raw.drop('name',axis = 1, inplace = True) # remove the column that isn't needed anymore
#train_raw.drop('item_description',axis = 1, inplace = True) # remove the column that isn't needed anymore

print("total words (with repeats): " + str(len(all_name_desc)))
print("total unassigned words in name and item_description: "+ str(count_unk_name) +  ' ' + str(count_unk_item_desc))
# 100k vocab: 88k "UNK" and 161k "UNK" in name and item desc, respectively. Out of 36mil tokens. 

# combine name and item_description for easier modeling
# technically not correct because it might potentially match words between name and item_desc. 
train_raw['name_and_desc'] = train_raw['name_inds'] + train_raw['item_desc_inds']

# padding the name and description column to have equally lengthed lists
pad_length = 25
#train_raw['padded_name_desc'] = pad_word_indices(train_raw['name_and_desc'],pad_length)

total words (with repeats): 36559364
total unassigned words in name and item_description: 36739 66485


In [6]:
## alternative to above block

#name_inds, _ = convert_word_to_padded(train_raw['name'],word2vec_dict,25) 
#item_desc_inds, _ = convert_word_to_padded(train_raw['item_description'], word2vec_dict,25)  
name_and_desc, _  = convert_word_to_padded(list(map(lambda x,y: x+y,train_raw['name'], train_raw['item_description'])),word2vec_dict,pad_length) 
#name_and_desc = [np.append(name_inds[i],item_desc_inds[i]) for i in range(len(name_inds))]


In [7]:
dict_brand_len = 3000
dict_cat1_len = 12
dict_cat2_len= 100
dict_cat3_len = 700

brand_name_dict, brand_name_dict_rev = build_dictionary(train_raw['brand_name'], dict_brand_len)
train_raw['brand_name_inds'], count_unk_brand = convert_word_to_ind(train_raw['brand_name'].values.reshape((-1,1)), brand_name_dict)
cat1_dict ,cat1_rev_dict= build_dictionary(train_raw['cat1'],dict_cat1_len)
train_raw['cat1_inds'], count_unk_cat1 = convert_word_to_ind(train_raw['cat1'].values.reshape((-1,1)), cat1_dict)
cat2_dict ,cat2_rev_dict= build_dictionary(train_raw['cat2'],dict_cat2_len)
train_raw['cat2_inds'], count_unk_cat2 = convert_word_to_ind(train_raw['cat2'].values.reshape((-1,1)), cat2_dict)
cat3_dict ,cat3_rev_dict= build_dictionary(train_raw['cat3'],dict_cat3_len)
train_raw['cat3_inds'], count_unk_cat3 = convert_word_to_ind(train_raw['cat3'].values.reshape((-1,1)), cat3_dict)

print(str(count_unk_brand) + ' ' + str(count_unk_cat1) + ' '+ str(count_unk_cat2) + " " + str(count_unk_cat3))

2382 0 1696 615


In [16]:
train_raw.head(3)


Unnamed: 0,train_id,name,item_condition_id,brand_name,price,shipping,item_description,cat1,cat2,cat3,name_inds,item_desc_inds,brand_name_inds,cat1_inds,cat2_inds,cat3_inds,name_and_desc
0,0,"[mlb, cincinnati, reds, shirt, size]",3,missing,-0.775703,1,"[description, yet]",Men,Tops,T-shirts,"[3578, 8211, 6439, 61, 5]","[45, 54]",1,5,19,24,"[3578, 8211, 6439, 61, 5, 45, 54]"
1,1,"[razer, blackwidow, chroma, keyboard]",3,Razer,1.323039,0,"[this, keyboard, great, condition, and, works,...",Electronics,Computers & Tablets,Components & Parts,"[10018, 22659, 14897, 2441]","[16, 2441, 22, 10, 1, 162, 34, 1005, 42, 2, 37...",586,4,44,201,"[10018, 22659, 14897, 2441, 16, 2441, 22, 10, ..."
2,2,"[ava, viv, blouse]",1,Target,-0.775703,1,"[adorable, top, with, hint, lace, and, key, ho...",Women,Tops & Blouses,Blouse,"[7073, 9791, 588]","[533, 41, 6, 4993, 150, 1, 791, 1172, 2, 81, 2...",82,1,3,13,"[7073, 9791, 588, 533, 41, 6, 4993, 150, 1, 79..."


In [11]:
## Embed word ind vectors to be used for NN (set up for this dataset) 

# Initialize batch parameters for generate_batch()

embedding_size = 15  # Dimension of the embedding vector.
num_skips = 3         # How many times to reuse an input to generate a context.
batch_size = 300

# Initialize validation parameters #
# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 20     # Random set of words to evaluate similarity on.
valid_window = 1000  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False) # random word indices to use for validation 

# Set up tensorflow placeholders for inputs (batch words) and labels (context words)
train_inputs = tf.placeholder(tf.int32, shape=[batch_size]) # input words
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1]) # context words
valid_dataset = tf.constant(valid_examples, dtype=tf.int32) # validation words from dictionary

# Look up embeddings for inputs.
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)) # each word will have embedding
embed = tf.nn.embedding_lookup(embeddings, train_inputs) # api that optimizes looking up hidden layer vector for word indices

# Setup the y = xw + b equation for training the embedding vector 
# Construct the variables for the NCE loss (special method to improve time through reduced searching computation)
num_sampled = 1000
nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / np.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
nce_loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                       biases=nce_biases,
                       labels=train_labels,
                       inputs=embed,
                       num_sampled=num_sampled,
                       num_classes=vocabulary_size))
optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(nce_loss)

# Utilize cosine similarity to measure "distances" between various word embedding vectors for the different words
# This is to use the validation set and obtain the n most similar words to the validation words
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True)) # obtain A/||A||_2 for all words
normalized_embeddings = embeddings / norm                               # this makes it easy to obtain cosine similarity
valid_embeddings = tf.nn.embedding_lookup( # look up table 
      normalized_embeddings, valid_dataset)
similarity = tf.matmul(                                                 # this is to obtain all A*B/||A||_2*||B|||_2
      valid_embeddings, normalized_embeddings, transpose_b=True)        # between validation and vocab list 

In [14]:
## Running the Embedding vector training model
# this should result in a trained dictionary for "name_and_desc" column. can also be used if i were to keep name and item_desc
# separated. training with 50k steps seemed sufficient, but odd word pairings did occur, so can be improved here


# initialize some variables
num_steps = 50001

data = train_raw['name_and_desc']
init = tf.global_variables_initializer()


with tf.Session() as session: #(graph=graph)
    
    # We must initialize all variables before we use them.
    init.run() # initialize variables (resets them too if previously trained)
    print('Initialized')

    average_loss = 0 # initialize loss 
    for step in range(num_steps):
        
        batch_inputs, batch_context = generate_batch(data,
            batch_size, num_skips)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_context}
        
       
        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run()
        _, loss_val = session.run([optimizer, nce_loss], feed_dict=feed_dict)
        average_loss += loss_val # adding up for calculating average every 2000 steps

        if step % 2000 == 0: # calculating average loss
            if step > 0:
                average_loss /= 2000 # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0
            
        if step % 10000 == 0: # printing out 8 "most similar" words for validation (subjectively i guess)
            sim = similarity.eval() 
            for i in range(valid_size):
                valid_word = reverse_dict[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reverse_dict[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
                
    final_embeddings = normalized_embeddings.eval() # this should be the dense matrix

Initialized
Average loss at step  0 :  3235.2734375
Nearest to rate: ryk, highlighters, atr, marshmallowy, gentally, dignity, mariscal, 12p,
Nearest to wii: lbr, hitchens, 1btexturestraightcan, lalaloopsys, leyvinid, galleria, nwtfree, wildkin,
Nearest to stone: metric, quadcore, sshops, bohnanza, dashund, meechie, 4cm, koltov,
Nearest to photos: 679, shocker, anarkali, fireman, jewell, missle, officiall, entire,
Nearest to adidas: f20, society, biopeel, backstrom, 98pcs, magicsuit, stanford, you,
Nearest to workout: proform, rlpc, erectile, redistribute, straightend, twinpower917, lrgeans, considered,
Nearest to show: gist, islandragz, holden, lga1151, primark, rotate, icure, jhfb,
Nearest to pure: 42d, pr2, 176kbps, indomitable, xb2, teaser, chindi, lularoeleggings,
Nearest to year: loot, babolat, akihabara, artemisia, twentieth, tees, draggle, firmenich,
Nearest to deep: swwet, aerospace, bigosh, samurai, ghoul, mk5165, kwan, megs,
Nearest to bras: mechanix, coconverse, keeley, invi

In [80]:
'''## make an embedding vector for brand_name, cat1/2/3
# make vector representation based on price i guess
# not optimized, but really, what is

# Initialize batch parameters for generate_batch()
batch_size = 300
embedding_size = 15  # Dimension of the embedding vector.
num_skips = 3         # How many times to reuse an input to generate a context.

# initialize some variables
num_steps = 50001

data1 = train_raw['brand_name_inds']
data2 = train_raw['cat1_inds']
data3 = train_raw['cat2_inds']
data4 = train_raw['cat3_inds']
price = train_raw['price']

vocab_sizes = [dict_brand_len, dict_cat1_len, dict_cat2_len, dict_cat3_len]
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.float32, shape=[batch_size, 1]) # context words
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)) # each word will have embedding
embed = tf.nn.embedding_lookup(embeddings, train_inputs) # api that optimizes looking up hidden layer vector for word indices



num_sampled = 1000
weights_ = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / np.sqrt(embedding_size)))
biases_ = tf.Variable(tf.zeros([batch_size]))
weights2_ = tf.Variable(
        tf.truncated_normal([vocabulary_size, 1],
                            stddev=1.0 / np.sqrt(embedding_size)))
biases2_ = tf.Variable(tf.zeros([batch_size,1]))

alayer = tf.nn.bias_add(tf.matmul(weights_,tf.transpose(embed)),biases_)
blayer = tf.matmul(tf.transpose(alayer),weights2_) + biases2_

loss_ = tf.sqrt(tf.losses.mean_squared_error(train_labels, blayer))
optimizer_regress = tf.train.AdamOptimizer(1.0).minimize(loss_) # optimizer for regression 

with tf.Session() as sess: #(graph=graph)
    
    # We must initialize all variables before we use them.
    sess.run(init) # initialize variables (resets them too if previously trained)
    print('Initialized')

    average_loss = 0 # initialize loss 
    counter = 0
    for data_ in [data1, data2, data3, data4]:
        
        vocabulary_size = vocab_sizes[counter]
        for step in range(num_steps):
            batch_inputs, batch_context = generate_batch_general(data_, price,
            batch_size)
            batch_context = batch_context.reshape(-1,1)
            feed_dict = {train_inputs: batch_inputs.astype(np.int32).values, train_labels: batch_context.astype(np.float32)}
            # We perform one update step by evaluating the optimizer op (including it
            # in the list of returned values for session.run()
        
            _, loss_val = sess.run([optimizer_regress, loss_], feed_dict=feed_dict) 
            average_loss += loss_val # adding up for calculating average every 2000 steps

            if step % 2000 == 0: # calculating average loss
                if step > 0:
                    average_loss /= 2000 # The average loss is an estimate of the loss over the last 2000 batches.
                print('Average loss at step ', step, ': ', average_loss)
                average_loss = 0
        counter += 1
        
    

'''

Initialized


FailedPreconditionError: Attempting to use uninitialized value Variable_89
	 [[Node: Variable_89/read = Identity[T=DT_FLOAT, _class=["loc:@Variable_89"], _device="/job:localhost/replica:0/task:0/device:GPU:0"](Variable_89)]]
	 [[Node: Sqrt_11/_11 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_343_Sqrt_11", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

Caused by op 'Variable_89/read', defined at:
  File "/home/bsong/anaconda/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/bsong/anaconda/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/bsong/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/home/bsong/anaconda/lib/python3.5/site-packages/traitlets/config/application.py", line 653, in launch_instance
    app.start()
  File "/home/bsong/anaconda/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "/home/bsong/anaconda/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 162, in start
    super(ZMQIOLoop, self).start()
  File "/home/bsong/anaconda/lib/python3.5/site-packages/tornado/ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "/home/bsong/anaconda/lib/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/bsong/anaconda/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/bsong/anaconda/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/bsong/anaconda/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/bsong/anaconda/lib/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/bsong/anaconda/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/bsong/anaconda/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/bsong/anaconda/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "/home/bsong/anaconda/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/bsong/anaconda/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/bsong/anaconda/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/bsong/anaconda/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/bsong/anaconda/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-80-6060fbb63523>", line 32, in <module>
    stddev=1.0 / np.sqrt(embedding_size)))
  File "/home/bsong/anaconda/lib/python3.5/site-packages/tensorflow/python/ops/variables.py", line 213, in __init__
    constraint=constraint)
  File "/home/bsong/anaconda/lib/python3.5/site-packages/tensorflow/python/ops/variables.py", line 356, in _init_from_args
    self._snapshot = array_ops.identity(self._variable, name="read")
  File "/home/bsong/anaconda/lib/python3.5/site-packages/tensorflow/python/ops/array_ops.py", line 125, in identity
    return gen_array_ops.identity(input, name=name)
  File "/home/bsong/anaconda/lib/python3.5/site-packages/tensorflow/python/ops/gen_array_ops.py", line 2071, in identity
    "Identity", input=input, name=name)
  File "/home/bsong/anaconda/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/bsong/anaconda/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2956, in create_op
    op_def=op_def)
  File "/home/bsong/anaconda/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1470, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

FailedPreconditionError (see above for traceback): Attempting to use uninitialized value Variable_89
	 [[Node: Variable_89/read = Identity[T=DT_FLOAT, _class=["loc:@Variable_89"], _device="/job:localhost/replica:0/task:0/device:GPU:0"](Variable_89)]]
	 [[Node: Sqrt_11/_11 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_343_Sqrt_11", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]


In [89]:
## regular feed forward neural network function define here
# This is to use for the simpler columns (brand_name, item_condition, cat1/2/3)
# note to self: maybe separating dropout is better for manipulation purposes (and pooling and dropout lol.)

def RegNN(x, dropout_keep_prob, W_shape, b_shape, embed_size, batch_len):
  
    # x should be of size [batch_len,embed_size] 
    #x_expand is now of length [batch_len, embed_size,1]
    #x_expand = tf.expand_dims(x,-1) #channel dimension which i still dont know what it does
    
    # set up some weights/bias stuff
    W1 = tf.Variable(tf.truncated_normal( [W_shape,batch_len], stddev=0.1))
    b1 = tf.Variable(tf.constant(0.1, shape=[b_shape,1]))    
    print(W1.shape)
    print(b1.shape)
    print(x.shape)
    # xW + b 
    NN_layer = tf.matmul(W1,x) + b1
    
    # ReLU layer
    h = tf.nn.relu(NN_layer)
    
    # Drop Layer
    h_drop = tf.nn.dropout(h, dropout_keep_prob)
    
    # Dense layer (is this necessary?)
    
    return h_drop

def embed(inputs, size, dim):
    std = np.sqrt(2 / dim)
    emb = tf.Variable(tf.random_uniform([size, dim], -std, std))
    lookup = tf.nn.embedding_lookup(emb, inputs)
    #print(lookup.shape)
    return lookup


In [91]:
brand_name_emb = embed(train_raw.brand_name_inds,dict_brand_len, 15)
cat1_emb = embed(train_raw.cat1_inds,dict_cat1_len,10)
cat2_emb = embed(train_raw.cat2_inds,dict_cat2_len,10)
cat3_emb = embed(train_raw.cat3_inds,dict_cat3_len,10)

print(brand_name_emb)



Tensor("embedding_lookup_46:0", shape=(1482535, 15), dtype=float32)


In [None]:
## Start here to rerun NN with different fitting and o/w

# shuffle and split to train/val/test
train_raw = train_raw.sample(frac=1).reset_index(drop=True) # shuffle the data 
nrow_train = round(train_raw.shape[0]*0.95) # index to split train/val with test (basically last 5% is test)

dtest = train_raw.iloc[nrow_train:, ] # the last 5% will be test
dtrain, dvalid = train_test_split(train_raw.iloc[:nrow_train, ], train_size=0.7, test_size = 0.3) # train and validation set

In [65]:
# test block for CNN 
# based on http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/
# note to self: maybe separating dropout is better for manipulation purposes (and pooling and dropout lol.)

def test_cnn(input_x,input_y,W_shape,b_shape,dropout_keep_prob): 
    embedded_chars = tf.nn.embedding_lookup(embeddings, input_x)
    embedded_chars_expanded = tf.expand_dims(embedded_chars, -1) # [batch,pad,embed,1]
    W1 = tf.Variable(tf.truncated_normal(W_shape, stddev=0.1), name="W1")
    b1 = tf.Variable(tf.constant(0.1, shape=[b_shape]), name="b1")
    conv = tf.nn.conv2d(
        embedded_chars_expanded,
        W1,
        strides = [1,1,1,1],
        padding="VALID",
        name="conv")
    print('shape of CNN output:' + str(conv.shape))
    h = tf.nn.relu(tf.nn.bias_add(conv, b1), name="relu")
    print('shape after ReLU: ' + str(h.shape))
    pooled = tf.nn.max_pool(
                h,
                ksize=[1, pad_length, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID',
                name="pool")
    print('shape after max pooling: ' + str(pooled.shape))
    pool_flat = tf.reshape(pooled, [-1, out_nodes])
    print("shape after flattening:" + str(pool_flat.shape))
    # Add dropout
    #with tf.name_scope("dropout"):
    h_drop = tf.nn.dropout(pool_flat, dropout_keep_prob)
    print('shape after dropout: ' + str(h_drop.shape))
    # output
    #with tf.name_scope("output"):
    W2 = tf.Variable(tf.truncated_normal([out_nodes, 1], stddev=0.1), name="W2")
    b2 = tf.Variable(tf.constant(0.1, shape=[batch_len,1]), name="b2")
    predictions = tf.matmul(h_drop,W2) + b2
    print('shape of predictions: ' + str(predictions.shape))
    #predictions = tf.nn.xw_plus_b(h_drop, W2, b2, name="scores")  
    return predictions

In [None]:
x = tf.placeholder(tf.float32, [None,1])


out = RegNN(x, .5, 10, 10, 10,batch_len)
feed_x = train_raw['brand_name'][:1000].values.reshape((1000,-1))
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init) 
sess.run(out,{x: feed_x})

In [66]:
input_x = tf.placeholder(tf.int64,[None, pad_length], name = "input_x") # pad_length = 25 or something defined earlier
input_y = tf.placeholder(tf.float32,[None,1], name = "input_y") # train agianst this
#batch_len = len(name_and_desc)
#batch = name_and_desc[:batch_len]
temp_set = np.concatenate((name_and_desc,train_raw['price'].values.reshape((-1,1))),axis = 1)
 
batch_len = 1000

out_nodes = 10
dropout_keep_prob = .5

W_shape = [1,embedding_size,1,out_nodes] #figure this out if it works
b_shape = out_nodes # same as last dimension in W

y_out = test_cnn(input_x,input_y,W_shape,b_shape,dropout_keep_prob)
#with tf.name_scope("loss"):
loss = tf.sqrt(tf.losses.mean_squared_error(input_y, y_out))        
train_step = tf.train.AdamOptimizer(learning_rate = .001).minimize(loss)

init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init) 
#sess.run(train_step,{input_x: batch, input_y: batch_resp})

losses = []
for i in range(10): 
    print('running step: ' + str(i))
    np.random.shuffle(temp_set) # shuffle the data
    batch = temp_set[:batch_len,0:25]
    batch_resp = temp_set[:batch_len,25].reshape((-1,1))
    loss_ = sess.run(loss,{input_x: batch, input_y: batch_resp})
    print(loss_)
    sess.run(train_step,{input_x: batch, input_y: batch_resp})


shape of CNN output:(?, 25, 1, 10)
shape after ReLU: (?, 25, 1, 10)
shape after max pooling: (?, 1, 1, 10)
shape after flattening:(?, 10)
shape after dropout: (?, 10)
shape of predictions: (1000, 1)
running step: 0
1.02222
running step: 1
1.03364
running step: 2
0.946861
running step: 3
1.06321
running step: 4
1.00374
running step: 5
1.01493
running step: 6
1.00195
running step: 7
1.02219
running step: 8
0.97089
running step: 9
1.02509


In [None]:
## set up all the input (feature column) information for tensorflow

max_brand = np.max(train_raw['brand_name']) + 1 # to set num_buckets in feature_columns 
max_cat1 = np.max(train_raw['cat1']) + 1 # these +1 are because of 0 index
max_cat2 = np.max(train_raw['cat2']) + 1
max_cat3 = np.max(train_raw['cat3']) + 1

item_cond = tf.feature_column.categorical_column_with_identity(key = 'item_condition_id',num_buckets = 5,default_value= 3)
brand_name = tf.feature_column.categorical_column_with_identity(key = 'brand_name',num_buckets = max_brand)
shipping = tf.feature_column.categorical_column_with_identity(key = 'shipping',num_buckets = 2)
cat1 = tf.feature_column.categorical_column_with_identity(key = 'cat1', num_buckets = max_cat1)
cat2 = tf.feature_column.categorical_column_with_identity(key = 'cat2', num_buckets = max_cat2)
cat3 = tf.feature_column.categorical_column_with_identity(key = 'cat3', num_buckets = max_cat3)
#names = tf.feature_column.categorical_column_with_identity(key = 'name_inds', num_buckets = vocabulary_size)
#item_desc = tf.feature_column.categorical_column_with_identity(key = 'item_desc_inds',num_buckets = vocabulary_size)
name_and_desc = tf.feature_column.categorical_column_with_identity(key = 'padded_name_desc', num_buckets = vocabulary_size)

price = tf.feature_column.numeric_column(key = 'price', dtype = tf.float32) # LABEL

embed_ = tf.feature_column.embedding_column
#feature_columns = [item_cond, brand_name, shipping, cat1, cat2, cat3] #, names, item_desc
feature_columns = [embed_(item_cond,5), embed_(brand_name,max_brand), 
                   embed_(shipping,2), embed_(cat1,max_cat1), embed_(cat2, max_cat2), 
                   embed_(cat3,max_cat3), embed_(name_and_desc,25)] #, names, item_desc
feature_names = ['item_condition_id','brand_name','shipping','cat1','cat2','cat3','name_and_desc'] #,'name_inds','item_desc_inds'
label_name = 'price'

#with tf.device("/cpu:0"): # used to convert input word index vectors to meaningful vectors for neural network 
#    embedding = tf.Variable(tf.random_uniform([vocab_size, self.hidden_size], -init_scale, init_scale))
#    inputs = tf.nn.embedding_lookup(embedding, self.input_obj.input_data)

In [None]:
tf.feature_column.embedding_column(item_cond,10)

In [None]:
## build an input_fn to feed data into tensorflow properly

def input_fn(data_file, num_epochs=1, batch_size = 1000): 
    # make columns into dictionary
    feat_dict = dict()
    for col_name in feature_names:
        feat_dict[col_name] = data_file[col_name]
    label = tf.constant(data_file[label_name].values,dtype = tf.float32)

    dataset = tf.data.Dataset.from_tensor_slices((feat_dict, label))

    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size) 
    
    iterator = dataset.make_one_shot_iterator()
    features, label = iterator.get_next()
    return features, label

In [None]:
## high level api for basic deep feed forward neural network

regressor = tf.estimator.DNNRegressor( 
    model_dir = '/home/bsong/Python_Stuff/Data/Kaggle_Mercari/model/',
    feature_columns = feature_columns, 
    hidden_units = [256,128,64],
    )

In [None]:
## train the model

regressor.train(input_fn = lambda: input_fn(dtrain), steps = 5000)

In [None]:
## evaluate the model on validation set (assuming this is useful for cross-entropy or something)

ev = regressor.evaluate(input_fn = lambda: input_fn(dvalid))
loss_score = ev['loss']
print('Loss: {0:f}'.format(loss_score))

In [None]:
## prediction on test set

y = regressor.predict(input_fn = lambda: input_fn(dtest))

predictions = [i for i in y] # y is a generator, so have to manipulate to extract
preds = list()
for i in predictions:
    temp = list(i.values())
    preds.append(temp[0])
preds = np.array(preds)
preds = np.squeeze(preds, axis=1)
preds = np.expm1(preds) # revert the 1+log(x) transformation

In [None]:
## calculate the RMSLE score to compare to online submissions

rmsle_pred = rmsle(preds,np.expm1(dtest['price'].values)) 
print(rmsle_pred)