# Mercari Price Suggestion Challenge
* url: https://www.kaggle.com/c/mercari-price-suggestion-challenge

* tensorflow ref: https://github.com/tensorflow/tensorflow/blob/r1.4/tensorflow/examples/learn/wide_n_deep_tutorial.py

### import module

In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame, Series
from collections import Counter, defaultdict
import tensorflow as tf
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
import scipy
from itertools import chain


import gc

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
import seaborn as sns

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

import helper_function as helper

In [2]:
train = pd.read_csv('sample_data/train.tsv', sep='\t')
test = pd.read_csv('sample_data/test.tsv', sep='\t')
submission = pd.read_csv('sample_data/sample_submission.csv', sep=',')

In [3]:
# train = pd.read_csv('data/train.tsv', sep='\t')
# test = pd.read_csv('data/test.tsv', sep='\t')

### Data Preprocessing

In [4]:
BrandMinNum = 2
NUM_BRANDS = 250

In [5]:
df = pd.concat([train, test], 0)
nrow_train = train.shape[0]
Y_train = np.log1p(train["price"])

del train
gc.collect() # release usage memory (df_train)

print(df.memory_usage(deep = True))

Index                 15984
brand_name           103607
category_name        174313
item_condition_id     15984
item_description     459804
name                 167342
price                 15984
shipping              15984
test_id               15984
train_id              15984
dtype: int64


In [6]:
df["category_name"] = df["category_name"].fillna("Other")#.astype("category")
df["brand_name"] = df["brand_name"].fillna("unknown")

# pop_brands = df["brand_name"].value_counts().index[:NUM_BRANDS]
brand_count = df["brand_name"].value_counts()
pop_brands = brand_count[brand_count > BrandMinNum].index
df.loc[~df["brand_name"].isin(pop_brands), "brand_name"] = "Other"

df["item_description"] = df["item_description"].fillna("None")
df["item_condition_id"] = df["item_condition_id"]#.astype("category")
df["brand_name"] = df["brand_name"]#.astype("category")

print(df.memory_usage(deep = True))

Index                 15984
brand_name           135849
category_name        174691
item_condition_id     15984
item_description     459804
name                 167342
price                 15984
shipping              15984
test_id               15984
train_id              15984
dtype: int64


In [7]:
df.head(3)

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,test_id,train_id
0,unknown,Men/Tops/T-shirts,3,No description yet,MLB Cincinnati Reds T Shirt Size XL,10.0,1,,0.0
1,Other,Electronics/Computers & Tablets/Components & P...,3,This keyboard is in great condition and works ...,Razer BlackWidow Chroma Keyboard,52.0,0,,1.0
2,Target,Women/Tops & Blouses/Blouse,1,Adorable top with a hint of lace and a key hol...,AVA-VIV Blouse,10.0,1,,2.0


### Tensorflow

In [8]:
def split_cat(l):
    for text in l:
        yield text.split("/")

In [9]:
categories = list(split_cat(df['category_name']))
df_categories = DataFrame(categories)
df['general_cat'], df['subcat_1'], df['subcat_2'] = \
df_categories[0].fillna("No Label"), df_categories[1].fillna("No Label"), df_categories[2].fillna("No Label")


In [10]:
df.head(3)

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,test_id,train_id,general_cat,subcat_1,subcat_2
0,unknown,Men/Tops/T-shirts,3,No description yet,MLB Cincinnati Reds T Shirt Size XL,10.0,1,,0.0,Men,Tops,T-shirts
1,Other,Electronics/Computers & Tablets/Components & P...,3,This keyboard is in great condition and works ...,Razer BlackWidow Chroma Keyboard,52.0,0,,1.0,Electronics,Computers & Tablets,Components & Parts
2,Target,Women/Tops & Blouses/Blouse,1,Adorable top with a hint of lace and a key hol...,AVA-VIV Blouse,10.0,1,,2.0,Women,Tops & Blouses,Blouse


In [11]:
X_train = df[:nrow_train]
X_test = df[nrow_train:]

X_train, X_eval, y_train, y_eval = train_test_split(
    X_train, Y_train, test_size=0.1, random_state=42)

In [12]:
X_train.head(3)

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,test_id,train_id,general_cat,subcat_1,subcat_2
599,unknown,Women/Dresses/Full-Length,2,Super cute its long and sexy got online but by...,Striped Sexy Dress,6.0,1,,599.0,Women,Dresses,Full-Length
432,unknown,Men/Tops/T-shirts,3,A black tee with white screen print g and garb...,"'Garbage"" tee",7.0,1,,432.0,Men,Tops,T-shirts
221,unknown,Women/Shoes/Sandals,3,Brown faux leather with back zipper. Size 7,Faux leather sandals,12.0,1,,221.0,Women,Shoes,Sandals


In [14]:
unique_categories = pd.Series("/".join(df["category_name"].unique().astype("str")).split("/")).unique()

TRAIN_CSV_COLUMNS = [
    "brand_name", "item_condition_id", 
    #"item_description", "name", "category_name"
    #"price", 
    "general_cat", "subcat_1", "subcat_2",
    "shipping"
]

brand_name = tf.feature_column.categorical_column_with_vocabulary_list(
    "brand_name", list(df.brand_name.unique()))
item_condition_id = tf.feature_column.categorical_column_with_vocabulary_list(
    "item_condition_id", list(df.item_condition_id.unique()))
shipping = tf.feature_column.categorical_column_with_vocabulary_list(
    "shipping", list(df.shipping.unique()))
category_name = tf.feature_column.categorical_column_with_vocabulary_list(
    "category_name", unique_categories)

general_cat = tf.feature_column.categorical_column_with_vocabulary_list(
    "general_cat", list(df.general_cat.unique()))
subcat_1 = tf.feature_column.categorical_column_with_vocabulary_list(
    "subcat_1", list(df.subcat_1.unique()))
subcat_2 = tf.feature_column.categorical_column_with_vocabulary_list(
    "subcat_2", list(df.subcat_2.unique()))

feature_columns = [
    #tf.feature_column.crossed_column(
    #    [item_condition_id, shipping], hash_bucket_size=1000),
    tf.feature_column.indicator_column(item_condition_id),
    tf.feature_column.indicator_column(shipping),
    tf.feature_column.embedding_column(brand_name, dimension=50),
#     tf.feature_column.crossed_column(
#        [general_cat, subcat_1, subcat_2], hash_bucket_size=1000),
    tf.feature_column.indicator_column(general_cat),
    tf.feature_column.indicator_column(subcat_1),
    tf.feature_column.indicator_column(subcat_2),
    ]

num_hidden_units = [500, 150, 30, 150, 500]


In [15]:
train_input_fn = tf.estimator.inputs.pandas_input_fn(
      x=X_train,
      y=y_train,
      batch_size=128,
      num_epochs=None,
      shuffle=False,
      num_threads=1)

eval_input_fn = tf.estimator.inputs.pandas_input_fn(
      x=X_eval,
      y=y_eval,
      num_epochs=1,
      shuffle=False,
      num_threads=1)


In [16]:
model = tf.estimator.DNNRegressor(feature_columns=feature_columns,
                                   hidden_units=num_hidden_units,
                                   activation_fn=tf.nn.relu,
                                   optimizer=tf.train.ProximalAdagradOptimizer(
                                             learning_rate=0.1,
                                             l1_regularization_strength=0.001),
                                   model_dir="./checkpoints/")

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_session_config': None, '_log_step_count_steps': 100, '_keep_checkpoint_every_n_hours': 10000, '_tf_random_seed': 1, '_save_checkpoints_secs': 600, '_save_summary_steps': 100, '_keep_checkpoint_max': 5, '_save_checkpoints_steps': None, '_model_dir': './checkpoints/'}


In [18]:
model.train(input_fn=train_input_fn, steps=2000)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into ./checkpoints/model.ckpt.
INFO:tensorflow:step = 1, loss = 1176.97
INFO:tensorflow:global_step/sec: 44.056
INFO:tensorflow:step = 101, loss = 36.1813 (2.273 sec)
INFO:tensorflow:global_step/sec: 50.145
INFO:tensorflow:step = 201, loss = 26.6057 (1.993 sec)
INFO:tensorflow:global_step/sec: 51.7663
INFO:tensorflow:step = 301, loss = 13.9416 (1.932 sec)
INFO:tensorflow:global_step/sec: 51.15
INFO:tensorflow:step = 401, loss = 9.59312 (1.954 sec)
INFO:tensorflow:global_step/sec: 37.0341
INFO:tensorflow:step = 501, loss = 10.0672 (2.701 sec)
INFO:tensorflow:global_step/sec: 33.5913
INFO:tensorflow:step = 601, loss = 16.8563 (2.977 sec)
INFO:tensorflow:global_step/sec: 39.7696
INFO:tensorflow:step = 701, loss = 8.37662 (2.514 sec)
INFO:tensorflow:global_step/sec: 29.6102
INFO:tensorflow:step = 801, loss = 7.46242 (3.382 sec)
INFO:tensorflow:global_step/sec: 49.7788
INFO:tensorflow:step = 901, loss = 11.

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x1040d4048>

In [19]:
result = model.evaluate(input_fn=eval_input_fn)

INFO:tensorflow:Starting evaluation at 2017-12-27-11:29:12
INFO:tensorflow:Restoring parameters from ./checkpoints/model.ckpt-2000
INFO:tensorflow:Finished evaluation at 2017-12-27-11:29:13
INFO:tensorflow:Saving dict for global step 2000: average_loss = 0.603985, global_step = 2000, loss = 60.3985


In [20]:
some_data = X_eval[:9]

predict_input_fn = tf.estimator.inputs.pandas_input_fn(
      x=some_data,
      num_epochs=1,
      shuffle=False,
      num_threads=1)

In [21]:
predictions = model.predict(input_fn=predict_input_fn)
pred = [p['predictions'] for p in predictions]

pred = np.array(pred, dtype='float32').squeeze()
print("Predict Result: {}\n".format(pred))
print("Real: {}".format(list(y_eval[0:9])))

INFO:tensorflow:Restoring parameters from ./checkpoints/model.ckpt-2000
Predict Result: [ 2.22325563  3.67049122  3.03620696  2.70529366  2.72866988  2.43480134
  3.42344379  3.23485661  2.27796173]

Real: [2.5649493574615367, 2.3978952727983707, 3.1354942159291497, 1.9459101490553132, 2.8903717578961645, 2.3978952727983707, 2.8332133440562162, 3.5553480614894135, 1.9459101490553132]


<br>
### Test

In [22]:
predict_input_fn = tf.estimator.inputs.pandas_input_fn(
      x=X_test,
      num_epochs=1,
      shuffle=False,
      num_threads=1)

In [23]:
predictions = model.predict(input_fn=predict_input_fn)
pred = [p['predictions'] for p in predictions]
pred_result = np.array(pred, dtype='float32').squeeze()

test["price"] = np.expm1(pred_result)
test[["test_id", "price"]].to_csv("dnn_result.csv", index = False)

INFO:tensorflow:Restoring parameters from ./checkpoints/model.ckpt-2000


In [24]:
test[["test_id", "price"]].head(5)

Unnamed: 0,test_id,price
0,0,13.029753
1,1,16.99262
2,2,16.717607
3,3,12.465987
4,4,32.506908
