# Mercari Price Suggestion Challenge
* url: https://www.kaggle.com/c/mercari-price-suggestion-challenge

* tensorflow ref: https://github.com/tensorflow/tensorflow/blob/r1.4/tensorflow/examples/learn/wide_n_deep_tutorial.py

### Import modules

In [None]:
import pandas as pd
import numpy as np
from pandas import DataFrame, Series
from collections import Counter, defaultdict
import tensorflow as tf
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
import scipy

import gc

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
import seaborn as sns

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

import helper_function as helper

In [None]:
# Read the tab-separated train/test samples in one pass, then the
# comma-separated sample submission file.
train, test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('sample_data/train.tsv', 'sample_data/test.tsv')
)
submission = pd.read_csv('sample_data/sample_submission.csv', sep=',')

In [16]:
# Disabled alternative to the cell above: reads the same file names from data/
# instead of sample_data/ (presumably the full competition files -- TODO confirm).
# train = pd.read_csv('data/train.tsv', sep='\t')
# test = pd.read_csv('data/test.tsv', sep='\t')

### Data Preprocessing

In [4]:
# A brand keeps its own label only if it occurs strictly more than this many
# times in the combined train+test frame; every other brand becomes "Other".
BrandMinNum = 2
# Size of a top-N brand cut-off; currently only referenced by a commented-out
# line in the preprocessing cell, so it has no effect on the results.
NUM_BRANDS = 250

In [5]:
# Stack train and test row-wise so that the preprocessing below sees the same
# vocabulary for both. `axis` is passed by keyword: pandas >= 2.0 rejects it as
# a positional argument (TypeError).
df = pd.concat([train, test], axis=0)

# The first `nrow_train` rows of `df` are the training rows; the rest are test.
nrow_train = train.shape[0]

# Regression target: log1p(price), so predictions must be mapped back with expm1.
Y_train = np.log1p(train["price"])

# `train` is no longer needed once it has been copied into `df`.
# NOTE: because of this `del`, re-running this cell requires re-running the
# data-loading cell first.
del train
gc.collect() # release usage memory (df_train)

print(df.memory_usage(deep = True))

Index                 15984
brand_name           103607
category_name        174313
item_condition_id     15984
item_description     459804
name                 167342
price                 15984
shipping              15984
test_id               15984
train_id              15984
dtype: int64


In [6]:
# Give every missing text field an explicit placeholder string.
missing_placeholders = {
    "category_name": "Other",
    "brand_name": "unknown",
    "item_description": "None",
}
for column, placeholder in missing_placeholders.items():
    df[column] = df[column].fillna(placeholder)

# Brands seen more than BrandMinNum times keep their label; the long tail is
# folded into a single "Other" bucket.
# (Alternative kept for reference: take the NUM_BRANDS most frequent brands.)
brand_count = df["brand_name"].value_counts()
is_frequent = brand_count > BrandMinNum
pop_brands = brand_count[is_frequent].index
is_rare_brand = ~df["brand_name"].isin(pop_brands)
df.loc[is_rare_brand, "brand_name"] = "Other"

# Note: no category-dtype cast is applied to category_name, brand_name or
# item_condition_id here; the columns keep the dtypes they were read with.

print(df.memory_usage(deep = True))

Index                 15984
brand_name           135849
category_name        174691
item_condition_id     15984
item_description     459804
name                 167342
price                 15984
shipping              15984
test_id               15984
train_id              15984
dtype: int64


### Tensorflow

In [7]:
# Undo the train/test stacking: the first nrow_train rows are training rows.
X_test = df.iloc[nrow_train:]

# Hold out 10% of the training rows for evaluation (fixed seed for repeatability).
X_train, X_eval, y_train, y_eval = train_test_split(
    df.iloc[:nrow_train],
    Y_train,
    test_size=0.1,
    random_state=42,
)

In [8]:
# Preview the first three rows of the training split (all columns are shown).
X_train.head(3)

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,test_id,train_id
599,unknown,Women/Dresses/Full-Length,2,Super cute its long and sexy got online but by...,Striped Sexy Dress,6.0,1,,599.0
432,unknown,Men/Tops/T-shirts,3,A black tee with white screen print g and garb...,"'Garbage"" tee",7.0,1,,432.0
221,unknown,Women/Shoes/Sandals,3,Brown faux leather with back zipper. Size 7,Faux leather sandals,12.0,1,,221.0


In [9]:

# Columns the model currently consumes. "item_description", "name",
# "category_name" and "price" are deliberately not listed.
TRAIN_CSV_COLUMNS = ["brand_name", "item_condition_id", "shipping"]


def _vocabulary_column(key):
    """Return a categorical column whose vocabulary is every distinct value of df[key]."""
    return tf.feature_column.categorical_column_with_vocabulary_list(
        key, list(df[key].unique()))


brand_name = _vocabulary_column("brand_name")
item_condition_id = _vocabulary_column("item_condition_id")
shipping = _vocabulary_column("shipping")

# Low-cardinality columns are one-hot encoded; brand_name gets a learned
# 50-dimensional embedding. (An item_condition_id x shipping crossed column
# with hash_bucket_size=1000 was tried and is not enabled.)
feature_columns = [
    tf.feature_column.indicator_column(item_condition_id),
    tf.feature_column.indicator_column(shipping),
    tf.feature_column.embedding_column(brand_name, dimension=50),
]

# Layer widths of the DNN, from input side to output side.
num_hidden_units = [500, 150, 30, 150, 500]


In [27]:
# Both input functions read rows in frame order on a single thread.
_ordered_single_thread = {"shuffle": False, "num_threads": 1}

# Training: batches of 128, repeating indefinitely (num_epochs=None); the
# number of steps passed to train() decides when to stop.
train_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=X_train,
    y=y_train,
    batch_size=128,
    num_epochs=None,
    **_ordered_single_thread)

# Evaluation: exactly one pass over the held-out split.
eval_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=X_eval,
    y=y_eval,
    num_epochs=1,
    **_ordered_single_thread)


In [12]:
# Adagrad variant with an L1 penalty on the weights.
proximal_adagrad = tf.train.ProximalAdagradOptimizer(
    learning_rate=0.1,
    l1_regularization_strength=0.001)

# Checkpoints are written to ./checkpoints/. If that directory already holds
# checkpoints, later train/evaluate/predict calls restore from them (see the
# "Restoring parameters" log lines), so results depend on its prior contents.
model = tf.estimator.DNNRegressor(
    feature_columns=feature_columns,
    hidden_units=num_hidden_units,
    activation_fn=tf.nn.relu,
    optimizer=proximal_adagrad,
    model_dir="./checkpoints/")

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_session_config': None, '_model_dir': './checkpoints/', '_save_summary_steps': 100, '_log_step_count_steps': 100, '_save_checkpoints_secs': 600, '_keep_checkpoint_max': 5, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_tf_random_seed': 1}


In [13]:
# Run 2000 further training steps. The logged output shows the estimator
# restoring the latest checkpoint from its model_dir first, so the global step
# continues from the restored value rather than starting at 0.
model.train(input_fn=train_input_fn, steps=2000)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from ./checkpoints/model.ckpt-2000
INFO:tensorflow:Saving checkpoints for 2001 into ./checkpoints/model.ckpt.
INFO:tensorflow:step = 2001, loss = 39.4737
INFO:tensorflow:global_step/sec: 76.7893
INFO:tensorflow:step = 2101, loss = 57.1608 (1.303 sec)
INFO:tensorflow:global_step/sec: 81.835
INFO:tensorflow:step = 2201, loss = 50.0518 (1.223 sec)
INFO:tensorflow:global_step/sec: 82.7162
INFO:tensorflow:step = 2301, loss = 51.5346 (1.208 sec)
INFO:tensorflow:global_step/sec: 72.7011
INFO:tensorflow:step = 2401, loss = 46.8756 (1.376 sec)
INFO:tensorflow:global_step/sec: 83.0398
INFO:tensorflow:step = 2501, loss = 35.6746 (1.204 sec)
INFO:tensorflow:global_step/sec: 62.6844
INFO:tensorflow:step = 2601, loss = 57.9133 (1.596 sec)
INFO:tensorflow:global_step/sec: 82.1677
INFO:tensorflow:step = 2701, loss = 50.2511 (1.216 sec)
INFO:tensorflow:global_step/sec: 78.7354
INFO:tensorflow:step = 2801, loss = 42.7808 (1

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x104831828>

In [28]:
# Score the model on the held-out 10% split. The labels are log1p(price), so
# the reported losses are measured in log-price space, not in currency units.
result = model.evaluate(input_fn=eval_input_fn)

INFO:tensorflow:Starting evaluation at 2017-12-27-09:39:55
INFO:tensorflow:Restoring parameters from ./checkpoints/model.ckpt-4000
INFO:tensorflow:Finished evaluation at 2017-12-27-09:39:56
INFO:tensorflow:Saving dict for global step 4000: average_loss = 0.494589, global_step = 4000, loss = 49.4589


In [29]:
# Display the metrics dictionary returned by evaluate().
result

{'average_loss': 0.4945893, 'global_step': 4000, 'loss': 49.458931}

In [30]:
# Take the first nine evaluation rows as a quick sanity-check sample.
some_data = X_eval.iloc[:9]

# No labels are passed: this input function is for prediction only.
predict_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=some_data, num_epochs=1, shuffle=False, num_threads=1)

In [31]:
# predict() yields one dict per input row; keep only its 'predictions' entry.
predictions = model.predict(input_fn=predict_input_fn)
pred = []
for row_output in predictions:
    pred.append(row_output['predictions'])

INFO:tensorflow:Restoring parameters from ./checkpoints/model.ckpt-4000


In [32]:
# Flatten the per-row prediction arrays into one float32 vector.
pred = np.squeeze(np.array(pred, dtype='float32'))

# Matching targets for the same nine evaluation rows (still log1p(price)).
real_values = list(y_eval[0:9])

print("Predict Result: {}\n".format(pred))
print("Real: {}".format(real_values))

Predict Result: [ 2.97574806  4.24902058  2.83420682  1.93588972  2.19689965  2.5487318
  2.90255642  2.9406414   2.72847986]

Real: [2.5649493574615367, 2.3978952727983707, 3.1354942159291497, 1.9459101490553132, 2.8903717578961645, 2.3978952727983707, 2.8332133440562162, 3.5553480614894135, 1.9459101490553132]


<br>

### Test

In [36]:
# One ordered, single-threaded pass over every test row, so predictions come
# back in the same order as the rows of X_test.
_test_predict_options = dict(num_epochs=1, shuffle=False, num_threads=1)
predict_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=X_test, **_test_predict_options)

In [37]:
# Collect the 'predictions' entry of every per-row output dict ...
predictions = model.predict(input_fn=predict_input_fn)
pred = [row_output['predictions'] for row_output in predictions]

# ... and flatten them into a single float32 vector (one value per test row).
pred_result = np.squeeze(np.array(pred, dtype='float32'))

INFO:tensorflow:Restoring parameters from ./checkpoints/model.ckpt-4000


In [42]:
# The model was fit on log1p(price); expm1 maps predictions back to prices.
predicted_price = np.expm1(pred_result)
test["price"] = predicted_price

# Write only the id and predicted price columns, without the frame index.
submission_columns = ["test_id", "price"]
test[submission_columns].to_csv("dnn_result.csv", index = False)

In [43]:
# Spot-check the first five rows that were written to the submission file.
test[["test_id", "price"]].head(5)

Unnamed: 0,test_id,price
0,0,14.309597
1,1,14.309597
2,2,58.838428
3,3,17.220665
4,4,11.790873
