# Mercari Price Suggestion Challenge
* url: https://www.kaggle.com/c/mercari-price-suggestion-challenge

* tensorflow ref: https://github.com/tensorflow/tensorflow/blob/r1.4/tensorflow/examples/learn/wide_n_deep_tutorial.py

### import module

In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame, Series
from collections import Counter, defaultdict
import tensorflow as tf
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
import scipy
from itertools import chain


import gc

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
import seaborn as sns

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

import helper_function as helper

In [2]:
# train = pd.read_csv('sample_data/train.tsv', sep='\t')
# test = pd.read_csv('sample_data/test.tsv', sep='\t')
# submission = pd.read_csv('sample_data/sample_submission.csv', sep=',')

In [3]:
train = pd.read_csv('data/train.tsv', sep='\t')
test = pd.read_csv('data/test.tsv', sep='\t')

### Data Preprocessing

In [4]:
BrandMinNum = 2
NUM_BRANDS = 250

In [5]:
df = pd.concat([train, test], 0)
nrow_train = train.shape[0]
Y_train = np.log1p(train["price"])

del train
gc.collect() # release usage memory (df_train)

print(df.memory_usage(deep = True))

Index                 17407152
brand_name           112334308
category_name        190129667
item_condition_id     17407152
item_description     491094704
name                 181881898
price                 17407152
shipping              17407152
test_id               17407152
train_id              17407152
dtype: int64


In [6]:
df["category_name"] = df["category_name"].fillna("Other")#.astype("category")
df["brand_name"] = df["brand_name"].fillna("unknown")

# pop_brands = df["brand_name"].value_counts().index[:NUM_BRANDS]
brand_count = df["brand_name"].value_counts()
pop_brands = brand_count[brand_count > BrandMinNum].index
df.loc[~df["brand_name"].isin(pop_brands), "brand_name"] = "Other"

df["item_description"] = df["item_description"].fillna("None")
df["item_condition_id"] = df["item_condition_id"]#.astype("category")
df["brand_name"] = df["brand_name"]#.astype("category")

print(df.memory_usage(deep = True))

Index                 17407152
brand_name           142081528
category_name        190636457
item_condition_id     17407152
item_description     491094820
name                 181881898
price                 17407152
shipping              17407152
test_id               17407152
train_id              17407152
dtype: int64


In [7]:
df.head(3)

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,test_id,train_id
0,unknown,Men/Tops/T-shirts,3,No description yet,MLB Cincinnati Reds T Shirt Size XL,10.0,1,,0.0
1,Razer,Electronics/Computers & Tablets/Components & P...,3,This keyboard is in great condition and works ...,Razer BlackWidow Chroma Keyboard,52.0,0,,1.0
2,Target,Women/Tops & Blouses/Blouse,1,Adorable top with a hint of lace and a key hol...,AVA-VIV Blouse,10.0,1,,2.0


### Tensorflow

In [8]:
def split_cat(l):
    for text in l:
        yield text.split("/")

In [9]:
categories = list(split_cat(df['category_name']))
df_categories = DataFrame(categories)
df['general_cat'], df['subcat_1'], df['subcat_2'] = \
df_categories[0].fillna("No Label"), df_categories[1].fillna("No Label"), df_categories[2].fillna("No Label")

print("There are {} uniq general categories.".format(len(df['general_cat'].unique())))
print("There are {} uniq sub1 categories.".format(len(df['subcat_1'].unique())))
print("There are {} uniq sub2 categories.".format(len(df['subcat_2'].unique())))
print('')
print("There are {} uniq brand name.".format(len(df['brand_name'].unique())))

There are 10 uniq general categories.
There are 114 uniq sub1 categories.
There are 871 uniq sub2 categories.

There are 3441 uniq brand name.


In [10]:
df.head(3)

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,test_id,train_id,general_cat,subcat_1,subcat_2
0,unknown,Men/Tops/T-shirts,3,No description yet,MLB Cincinnati Reds T Shirt Size XL,10.0,1,,0.0,Men,Tops,T-shirts
1,Razer,Electronics/Computers & Tablets/Components & P...,3,This keyboard is in great condition and works ...,Razer BlackWidow Chroma Keyboard,52.0,0,,1.0,Electronics,Computers & Tablets,Components & Parts
2,Target,Women/Tops & Blouses/Blouse,1,Adorable top with a hint of lace and a key hol...,AVA-VIV Blouse,10.0,1,,2.0,Women,Tops & Blouses,Blouse


In [11]:
# from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# 
# tv = TfidfVectorizer(max_features=100,
#                          ngram_range=(1, 3),
#                          stop_words='english')
# X_description = tv.fit_transform(df['item_description'])


In [12]:
# X_description.toarray().shape

In [13]:
X_train = df[:nrow_train]
X_test = df[nrow_train:]

X_train, X_eval, y_train, y_eval = train_test_split(
    X_train, Y_train, test_size=0.1, random_state=42)

In [14]:
X_train.head(3)

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,test_id,train_id,general_cat,subcat_1,subcat_2
44148,Victoria's Secret,Beauty/Makeup/Lips,3,These are new however the tip is cut off the l...,I have 25 vs matte lippys,24.0,0,,44148.0,Beauty,Makeup,Lips
943660,unknown,"Electronics/Cell Phones & Accessories/Cases, C...",1,Black Waterproof Waist Punch Cover For Apple i...,IPhone Case Fanny Pack Waist Carrier New,11.0,1,,943660.0,Electronics,Cell Phones & Accessories,"Cases, Covers & Skins"
1312383,American Eagle,Women/Tops & Blouses/Blouse,3,American eagle black mesh and lace blouse. Has...,Mesh/ Lace Blouse,9.0,0,,1312383.0,Women,Tops & Blouses,Blouse


In [15]:
unique_categories = pd.Series("/".join(df["category_name"].unique().astype("str")).split("/")).unique()

TRAIN_CSV_COLUMNS = [
    "brand_name", "item_condition_id", 
    #"item_description", "name"
    #"price", 
    "general_cat", "subcat_1", "subcat_2",
    "shipping"
]

brand_name = tf.feature_column.categorical_column_with_vocabulary_list(
    "brand_name", list(df.brand_name.unique()))
item_condition_id = tf.feature_column.categorical_column_with_vocabulary_list(
    "item_condition_id", list(df.item_condition_id.unique()))
shipping = tf.feature_column.categorical_column_with_vocabulary_list(
    "shipping", list(df.shipping.unique()))
category_name = tf.feature_column.categorical_column_with_vocabulary_list(
    "category_name", unique_categories)

general_cat = tf.feature_column.categorical_column_with_vocabulary_list(
    "general_cat", list(df.general_cat.unique()))
subcat_1 = tf.feature_column.categorical_column_with_vocabulary_list(
    "subcat_1", list(df.subcat_1.unique()))
subcat_2 = tf.feature_column.categorical_column_with_vocabulary_list(
    "subcat_2", list(df.subcat_2.unique()))

feature_columns = [
    #tf.feature_column.crossed_column(
    #    [item_condition_id, shipping], hash_bucket_size=1000),
    tf.feature_column.indicator_column(item_condition_id),
    tf.feature_column.indicator_column(shipping),
    tf.feature_column.embedding_column(brand_name, dimension=50),
#     tf.feature_column.crossed_column(
#        [general_cat, subcat_1, subcat_2], hash_bucket_size=1000),
    tf.feature_column.embedding_column(general_cat, dimension=20),
    tf.feature_column.embedding_column(subcat_1, dimension=30),
    tf.feature_column.embedding_column(subcat_2, dimension=50),
]

# num_hidden_units = [500, 150, 50, 10, 50, 150, 300]
num_hidden_units = [500, 150, 10, 150, 500]


In [16]:
train_input_fn = tf.estimator.inputs.pandas_input_fn(
      x=X_train,
      y=y_train,
      batch_size=128,
      num_epochs=None,
      shuffle=False,
      num_threads=1)

eval_input_fn = tf.estimator.inputs.pandas_input_fn(
      x=X_eval,
      y=y_eval,
      num_epochs=1,
      shuffle=False,
      num_threads=1)


In [17]:
model = tf.estimator.DNNRegressor(feature_columns=feature_columns,
                                   hidden_units=num_hidden_units,
                                   activation_fn=tf.nn.relu,
                                   dropout=0.1,
#                                    optimizer=tf.train.AdamOptimizer(
#                                              learning_rate=0.1),
                                   optimizer=tf.train.ProximalAdagradOptimizer(
                                             learning_rate=0.1,
                                             l1_regularization_strength=0.001),
                                   model_dir="./checkpoints/")

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_keep_checkpoint_every_n_hours': 10000, '_save_checkpoints_secs': 600, '_keep_checkpoint_max': 5, '_save_checkpoints_steps': None, '_model_dir': './checkpoints/', '_save_summary_steps': 100, '_session_config': None, '_tf_random_seed': 1, '_log_step_count_steps': 100}


In [18]:
model.train(input_fn=train_input_fn, steps=2000)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from ./checkpoints/model.ckpt-2001
INFO:tensorflow:Saving checkpoints for 2002 into ./checkpoints/model.ckpt.
INFO:tensorflow:step = 2002, loss = 66.7232
INFO:tensorflow:global_step/sec: 47.4611
INFO:tensorflow:step = 2102, loss = 92.7131 (2.107 sec)
INFO:tensorflow:global_step/sec: 43.4968
INFO:tensorflow:step = 2202, loss = 76.558 (2.301 sec)
INFO:tensorflow:global_step/sec: 34.8268
INFO:tensorflow:step = 2302, loss = 79.9633 (2.869 sec)
INFO:tensorflow:global_step/sec: 40.0479
INFO:tensorflow:step = 2402, loss = 77.529 (2.496 sec)
INFO:tensorflow:global_step/sec: 43.5757
INFO:tensorflow:step = 2502, loss = 66.9966 (2.294 sec)
INFO:tensorflow:global_step/sec: 40.8758
INFO:tensorflow:step = 2602, loss = 91.0421 (2.447 sec)
INFO:tensorflow:global_step/sec: 32.6886
INFO:tensorflow:step = 2702, loss = 71.4971 (3.061 sec)
INFO:tensorflow:global_step/sec: 37.2695
INFO:tensorflow:step = 2802, loss = 56.892 (2.6

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x11407ab70>

In [19]:
result = model.evaluate(input_fn=eval_input_fn)

INFO:tensorflow:Starting evaluation at 2017-12-28-03:57:27
INFO:tensorflow:Restoring parameters from ./checkpoints/model.ckpt-4001
INFO:tensorflow:Finished evaluation at 2017-12-28-03:57:40
INFO:tensorflow:Saving dict for global step 4001: average_loss = 0.564098, global_step = 4001, loss = 72.1569


In [20]:
some_data = X_eval[:9]

predict_input_fn = tf.estimator.inputs.pandas_input_fn(
      x=some_data,
      num_epochs=1,
      shuffle=False,
      num_threads=1)

In [21]:
predictions = model.predict(input_fn=predict_input_fn)
pred = [p['predictions'] for p in predictions]

pred = np.array(pred, dtype='float32').squeeze()
print("Predict Result: {}\n".format(pred))
print("Real: {}".format(list(y_eval[0:9])))

INFO:tensorflow:Restoring parameters from ./checkpoints/model.ckpt-4001
Predict Result: [ 2.97339249  2.97339249  2.97339249  2.97339249  2.97339249  2.97339249
  2.97339249  2.97339249  2.97339249]

Real: [2.5649493574615367, 4.3438054218536841, 2.5649493574615367, 1.3862943611198906, 3.4339872044851463, 5.4205349992722862, 2.8332133440562162, 2.5649493574615367, 2.7725887222397811]


<br>
### Test

In [22]:
predict_input_fn = tf.estimator.inputs.pandas_input_fn(
      x=X_test,
      num_epochs=1,
      shuffle=False,
      num_threads=1)

In [23]:
predictions = model.predict(input_fn=predict_input_fn)
pred = [p['predictions'] for p in predictions]
pred_result = np.array(pred, dtype='float32').squeeze()

test["price"] = np.expm1(pred_result)
test[["test_id", "price"]].to_csv("dnn_result.csv", index = False)

INFO:tensorflow:Restoring parameters from ./checkpoints/model.ckpt-4001


In [24]:
test[["test_id", "price"]].head(5)

Unnamed: 0,test_id,price
0,0,18.558159
1,1,18.558159
2,2,18.558159
3,3,18.558159
4,4,18.558159
