# Loading dependencies and Mercari dataset.

In [1]:
# Pandas holds our original dataset, but we'll transfer numerical
# encoding into a sparse numpy array.
import pandas as pd
import numpy as np

# Sparse array tools that work with numpy.
from scipy.sparse import csr_matrix, hstack

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Consider for on `name` and `item_description` columns during second pipeline iteration.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Measure error on X_test.
# Searches through tuning parameters within one model and chooses the best
# set of tuning parameters.
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# Tools we'll consider for LDA topic modeling in second pipeline iteration.
#from gensim.models import LdaModel
#from gensim.corpora import Dictionary

# For plotting data. Not used right now.
import matplotlib.pyplot as plt

# System utilities.
import time
import sys

# Currently using this model.
from xgboost import XGBRegressor

import lightgbm as lgb

# Load dataset `train.tsv` into pandas DataFrame `df` and print number of records.

In [2]:
df = pd.read_csv("train.tsv",sep='\t')
print("Number of records:", len(df))

Number of records: 1482535


# Preview the dataset with df.head().

In [3]:
df.head(5)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


# Preprocessing: Separate the `category_name` column by "/".

In [4]:
df[['subcat_A', 'subcat_B', 'subcat_C']] = df["category_name"].str.split(pat = "/", expand = True, n = 2)
df.head(5)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,subcat_A,subcat_B,subcat_C
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet,Men,Tops,T-shirts
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,Electronics,Computers & Tablets,Components & Parts
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,Women,Tops & Blouses,Blouse
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...,Home,Home Décor,Home Décor Accents
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity,Women,Jewelry,Necklaces


# Preprocessing: Drop `category_name` and `train_id` columns.

In [5]:
df.drop(columns = ["train_id", "category_name"], inplace = True)

df.head(5)

Unnamed: 0,name,item_condition_id,brand_name,price,shipping,item_description,subcat_A,subcat_B,subcat_C
0,MLB Cincinnati Reds T Shirt Size XL,3,,10.0,1,No description yet,Men,Tops,T-shirts
1,Razer BlackWidow Chroma Keyboard,3,Razer,52.0,0,This keyboard is in great condition and works ...,Electronics,Computers & Tablets,Components & Parts
2,AVA-VIV Blouse,1,Target,10.0,1,Adorable top with a hint of lace and a key hol...,Women,Tops & Blouses,Blouse
3,Leather Horse Statues,1,,35.0,1,New with tags. Leather horses. Retail for [rm]...,Home,Home Décor,Home Décor Accents
4,24K GOLD plated rose,1,,44.0,0,Complete with certificate of authenticity,Women,Jewelry,Necklaces


# Preprocessing: Look for empty data in subcat cols, `price` <= 0 or `price` = NaN.

In [6]:

df.loc[df['subcat_A'].isna()]


df.loc[df['subcat_B'].isna()]


df.loc[df['subcat_C'].isna()]


df.loc[df['price'].isna()]


df.loc[df['price'] <= 0] 

Unnamed: 0,name,item_condition_id,brand_name,price,shipping,item_description,subcat_A,subcat_B,subcat_C
1325,Alabama Crimson Tide Quality Lanyard,1,,0.0,1,TOP QUALITY THICK LANYARD Reversible sides wit...,Sports & Outdoors,Fan Shop,NCAA
2544,Levi leggings,3,Levi's®,0.0,0,Light wash. Hardly worn. Size 24 but fits like...,Women,Jeans,"Slim, Skinny"
2712,Simple Red Lace Lingerie Top,3,,0.0,0,"Very simple but cute, a little form fitting as...",Women,Underwear,G-Strings & Thongs
3576,Turquoise 15 Pc Makeup Brush Set,1,,0.0,1,1 left!,Beauty,Tools & Accessories,Makeup Brushes & Tools
3761,Girls size 7/8,3,,0.0,1,5 pair gap jean excellent used condition 3 chi...,Kids,Girls (4+),Tops & T-Shirts
...,...,...,...,...,...,...,...,...,...
1474172,Homecoming or Dama ivory dress,2,,0.0,0,Ivory formal dress. I wore this dress once for...,Women,Dresses,"Above Knee, Mini"
1474198,Tie up flannel top,3,,0.0,1,I got this last year and just didnt wear it mu...,Women,Tops & Blouses,Blouse
1477958,Rae Dunn Measuring Pear,3,,0.0,0,The dash lid appears it was dropped and a piec...,Home,Home Décor,Home Décor Accents
1478519,4 pairs women's wool boots socks,1,,0.0,1,✔️SHARE YOUR SOCKS WITH FRIENDS AND FAMILY! ✔️...,Women,Shoes,Boots


# Preprocessing: Fill in blanks for NaN entries.

In [7]:
df.fillna('', inplace = True)

# Preprocessing: Convert `item_condition_id` to numpy array. One-hot encode `brand_name`, `shipping` and `subcat` columns.

In [8]:
item_cond_arr = np.reshape(df["item_condition_id"].to_numpy(), (-1,1))


print("-"*80)
print("item_cond_arr type and dimensions are: ")
print(type(item_cond_arr))
print(item_cond_arr.shape)


enc = OneHotEncoder(sparse=True)

onehot_arr = enc.fit_transform(df[['brand_name', 'shipping', 'subcat_A', 'subcat_B', 'subcat_C']])

print("-"*80)
print("Type and dimensions of one-hot encoded variables are: ")
print(type(onehot_arr))
print(onehot_arr.shape)
print("-"*80)

--------------------------------------------------------------------------------
item_cond_arr type and dimensions are: 
<class 'numpy.ndarray'>
(1482535, 1)
--------------------------------------------------------------------------------
Type and dimensions of one-hot encoded variables are: 
<class 'scipy.sparse.csr.csr_matrix'>
(1482535, 5809)
--------------------------------------------------------------------------------


In [9]:

vector = CountVectorizer()
Z = vector.fit_transform(df['name'])

In [10]:

vector2 = TfidfVectorizer()
ZZ = vector2.fit_transform(df['item_description'])

# Preprocessing: Concatenate `item_cond_arr`,`onehot_arr` together into `X`. Create array `y` of labels. Display type and dimensions.

In [11]:
y = np.reshape(df["price"].to_numpy(), (-1,1))

print(X.shape)
print(type(X))

print(y.shape)
print(type(y))

print(sys.getsizeof(X) / (1024*1024))
print(X.dtype)
print(X.nnz)

(1482535, 273599)
<class 'scipy.sparse.coo.coo_matrix'>
(1482535, 1)
<class 'numpy.ndarray'>
4.57763671875e-05
float64
45717124


# Train-test split: Perform 80-20 train-test split. Stratify along `shipping` variable.

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify = df["shipping"])

# Print sparsity and file size statistics for train_test_split outputs.

In [13]:
print("-" * 80)
print("Dimensions of X_train, X_test, y_train and y_test ...")
print("")

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

print("-" * 80)
print("Types of of X_train, X_test, y_train and y_test ...")
print("")

print(type(X_train))
print(type(X_test))
print(type(y_train))
print(type(y_test))

print("-" * 80)
print("Sparsity of X_train and X_test ...")
print("")

print(X_train.nnz / (X_train.shape[0] * X_train.shape[1]))
print(X_test.nnz / (X_test.shape[0] * X_test.shape[1]))

print("-" * 80)
print("Disk size of X_train, X_test, y_train and y_test ...")
print("")

print(sys.getsizeof(X_train) / (1024*1024))
print(sys.getsizeof(X_test) / (1024*1024))
print(sys.getsizeof(y_train) / (1024*1024))
print(sys.getsizeof(y_test) / (1024*1024))

print("-" * 80)

--------------------------------------------------------------------------------
Dimensions of X_train, X_test, y_train and y_test ...

(1111901, 273599)
(370634, 273599)
(1111901, 1)
(370634, 1)
--------------------------------------------------------------------------------
Types of of X_train, X_test, y_train and y_test ...

<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
--------------------------------------------------------------------------------
Sparsity of X_train and X_test ...

0.00011272327402818125
0.00011266705449430578
--------------------------------------------------------------------------------
Disk size of X_train, X_test, y_train and y_test ...

4.57763671875e-05
4.57763671875e-05
8.483238220214844
2.82781982421875
--------------------------------------------------------------------------------


# Log transform target variable `y`.

In [14]:
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

# XGBoost: Set regressor model tuning parameters. 

In [15]:
#Inferior Model commented out, but still functional.

#XGB_regressor = XGBRegressor(
#    n_estimators = 150,
#   reg_lambda = 2,
#   gamma = 0.3,
#    max_depth = 8,
#    learning_rate = 0.1,
#    reg_alpha = 0,
#    subsample = 1,
#    colsample_bytree = 1
#)



#params = {
#    'gamma' : [0.3, 0.5, 0.7],
#    'max_depth' : [4, 8, 16]
# }

#XGB_gsc = GridSearchCV(
#     estimator = XGBRegressor(n_estimators = 150, colsample_bytree = 1),
#     param_grid = params,
#     cv = 3,
#     scoring = 'neg_mean_squared_error',
#     verbose = 10,
#     n_jobs = 3
# )
#XX = XGB_gsc.best_params_
#print(XX)

# Alternative Model?: Set regressor model tuning parameters. 

In [16]:
my_LGB_regressor = lgb.LGBMRegressor(
  n_estimators = 500,
  max_depth = 15,
  learning_rate = 0.1,
  colsample_bytree = 1,
  num_leaves = 75,
  application = 'regression',
  metric = 'RMSE',
  boosting_type = 'gbdt'
  )


# XGBoost: Train model. Use `X_train`.

In [17]:
#start_time = time.time()

#XGB_gsc.fit(X_train, y_train_log)

#end_time = time.time()
#print("Time to fit XGBoost regressor model in minutes: ", (end_time - start_time) / 60.0)

# Alternative Model?: Train model. Use `X_train`.

In [18]:

start_time = time.time()


my_LGB_regressor.fit(X_train, y_train_log.ravel(), verbose = True)

end_time = time.time()
print("Time to fit LightGBM regressor model in minutes: ", (end_time - start_time) / 60.0)

Time to fit LightGBM regressor model in minutes:  10.9158997575442


# XGBoost: Perform predictions with regressor model. Use `X_test`.

In [19]:
#y_pred_xgb_log = XGB_gsc.predict(X_test)

# Alternative Model?: Perform predictions with regressor model. Use `X_test`.

In [20]:
y_pred_lbm_log = my_LGB_regressor.predict(X_test)

# Get RMSLE for regressor model.

In [21]:
np.sqrt(mean_squared_error(y_test_log, y_pred_lbm_log))

0.47795895812578004