# MLPS - Mercari Price ML

In [None]:
import pandas as pd 
import numpy as np
from sklearn.metrics import mean_squared_error

# Show plain decimals instead of scientific notation in numpy output, and
# render pandas floats with exactly three decimal places.
np.set_printoptions(suppress=True)
pd.set_option('display.float_format', '{:.3f}'.format)

### Perform data cleaning

In [None]:
# Load the raw tab-separated training data; the first column becomes the index.
data = pd.read_csv("Data/train.tsv", sep="\t", index_col=0)

# Keep only the items that have a price.
data = data[data["price"].notna()]

# Treat both the placeholder text and missing descriptions as empty strings.
descriptions = data["item_description"].replace("No description yet", "")
data["item_description"] = descriptions.fillna("")

# Split the "a/b/c/..." category path into its first level, its second level,
# and everything from the third level on (re-joined with "/").
category_parts = data["category_name"].fillna('').str.split('/')

data["category_name_1"] = category_parts.str.get(0)
data["category_name_2"] = category_parts.str.get(1)
data["category_name_3"] = category_parts.str.slice(2).str.join("/")

### Implement porter stemming in count vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.stem.porter import *
import string

class StemmerTokenizer(object):
    """Callable tokenizer: strip non-ASCII, punctuation and digits, then stem.

    Intended to be passed as the ``tokenizer`` of a ``CountVectorizer``.
    """

    def __init__(self):
        # Porter stemmer applied to every whitespace-separated token.
        self.ps = PorterStemmer()
        # Translation table that deletes all punctuation and digit characters.
        self.translator = str.maketrans('', '', string.punctuation + string.digits)

    def __call__(self, doc):
        """Return the list of stemmed tokens of ``doc``."""
        # Drop every character that cannot be represented in ASCII.
        ascii_only = doc.encode('ascii', errors='ignore').decode('ascii')
        # Punctuation and digits are removed (not replaced by spaces) before
        # splitting on whitespace.
        words = ascii_only.translate(self.translator).split()
        return list(map(self.ps.stem, words))

# Bag-of-words vectorizer using the stemming tokenizer. Terms that appear in
# more than 50% or in fewer than 0.1% of the documents are discarded.
vectorizer = CountVectorizer(
    tokenizer=StemmerTokenizer(),
    stop_words='english',
    lowercase=True,
    min_df=.001,
    max_df=.5,
)

### Count vectorize the data

In [None]:
%%time
# Learn the vocabulary from the item descriptions and build the document-term
# count matrix in one pass.
tfm = vectorizer.fit_transform(data["item_description"])

In [None]:
# Fitted mapping of term -> column index in the count matrix.
vocab = vectorizer.vocabulary_ 
# Terms the fitted vectorizer ignored (scikit-learn's stop_words_ attribute).
removed_words = vectorizer.stop_words_

In [None]:
%%time
# Re-weight the raw term counts in tfm with TF-IDF (default transformer settings).
tfidf_vectorizer = TfidfTransformer()
tfidf_transformed = tfidf_vectorizer.fit_transform(tfm)

### Save results for time savings

In [None]:
from scipy.sparse import save_npz

# Persist both sparse matrices so the expensive vectorization can be skipped.
# They are written under Data/ because that is where the "Load files" cell
# reads them back from (Data/tfm.npz and Data/tfidf_transformed.npz).
save_npz("Data/tfm.npz", tfm)
save_npz("Data/tfidf_transformed.npz", tfidf_transformed)

## Vocab: write vectorized words to txt

In [None]:
import operator

# Write one vocabulary term per line, ordered by the term's column index so
# that line k of the file corresponds to column k of the count/TF-IDF matrices.
# The file goes under Data/ because it is later read back from
# Data/vocabulary.txt.
terms_by_column = sorted(vectorizer.vocabulary_.items(), key=operator.itemgetter(1))
with open("Data/vocabulary.txt", "w", encoding="utf-8") as f:
    f.writelines(term + '\n' for term, _ in terms_by_column)

<br><br><br><hr><hr>

# Load files as necessary for time savings

In [97]:
import pandas as pd 
import numpy as np
from sklearn.metrics import mean_squared_error

# Show plain decimals instead of scientific notation in numpy output, and
# render pandas floats with exactly three decimal places.
np.set_printoptions(suppress=True)
pd.set_option('display.float_format', '{:.3f}'.format)
from scipy.sparse import load_npz

# Cleaned listing table (tab-separated, first line is the header row).
cleaned_categorical = pd.read_csv('Data/train_clean.tsv', delimiter='\t', header=0)

# Previously saved sparse matrices: TF-IDF weights and raw term counts.
tfidf_transformed = load_npz("Data/tfidf_transformed.npz")
tfm = load_npz("Data/tfm.npz")

# cleaned_categorical.loc[cleaned_categorical['price'] < cleaned_categorical['price'].std()*2]

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,category_name_1,category_name_2,category_name_3
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.000,1,,Men,Tops,T-shirts
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.000,0,This keyboard is in great condition and works ...,Electronics,Computers & Tablets,Components & Parts
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.000,1,Adorable top with a hint of lace and a key hol...,Women,Tops & Blouses,Blouse
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.000,1,New with tags. Leather horses. Retail for [rm]...,Home,Home Décor,Home Décor Accents
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.000,0,Complete with certificate of authenticity,Women,Jewelry,Necklaces
5,5,Bundled items requested for Ruie,3,Women/Other/Other,,59.000,0,"Banana republic bottoms, Candies skirt with ma...",Women,Other,Other
6,6,Acacia pacific tides santorini top,3,Women/Swimwear/Two-Piece,Acacia Swimwear,64.000,0,Size small but straps slightly shortened to fi...,Women,Swimwear,Two-Piece
7,7,Girls cheer and tumbling bundle of 7,3,Sports & Outdoors/Apparel/Girls,Soffe,6.000,1,You get three pairs of Sophie cheer shorts siz...,Sports & Outdoors,Apparel,Girls
8,8,Girls Nike Pro shorts,3,Sports & Outdoors/Apparel/Girls,Nike,19.000,0,Girls Size small Plus green. Three shorts total.,Sports & Outdoors,Apparel,Girls
9,9,Porcelain clown doll checker pants VTG,3,Vintage & Collectibles/Collectibles/Doll,,8.000,0,I realized his pants are on backwards after th...,Vintage & Collectibles,Collectibles,Doll


In [3]:
# Dimensions of the loaded term-count matrix.
tfm.shape

(1482535, 1793)

In [4]:
# Dimensions of the loaded TF-IDF matrix.
tfidf_transformed.shape

(1482535, 1793)

In [5]:
# Dimensions of the cleaned table; the row count should match the two matrices.
cleaned_categorical.shape

(1482535, 11)

## Split into train and test

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Regression target: the listing price as float.
y_categorical = cleaned_categorical['price'].astype('float', copy=False)

# Remove the target, the identifier and the free-text / already-split columns
# so only the categorical predictors remain.
cleaned_categorical.drop(["train_id","name","category_name","item_description","price"], axis=1, inplace=True)

# The TSV was saved with its row index, which read_csv brings back as an
# "Unnamed: 0" column. It is only a row counter and must not become a model
# feature; errors="ignore" keeps this a no-op when the column is absent.
cleaned_categorical.drop(columns=["Unnamed: 0"], inplace=True, errors="ignore")

# Store the condition id as a string so get_dummies one-hot encodes it rather
# than passing it through as a number.
cleaned_categorical["item_condition_id"] = cleaned_categorical["item_condition_id"].astype('str', copy=False)

In [96]:
# Standard deviation of the price target.
y_categorical.std()

38.586066313339806

In [7]:
# One-hot encode the remaining columns with pandas' get_dummies defaults.
# NOTE(review): this builds a dense frame; with this many rows and dummy
# columns it may need a lot of memory — confirm, or consider sparse=True.
ohe_cleaned_categorical = pd.get_dummies(cleaned_categorical)

In [None]:
# #Test Train split
# y_cat_train, y_cat_test, X_cat_train, X_cat_test, tfidf_train, tfidf_test, tfm_train, tfm_test = train_test_split(
#     y_categorical, ohe_cleaned_categorical, tfidf_transformed, tfm, test_size=0.2, random_state=95)

In [15]:
# Number of target values (one per listing).
y_categorical.shape

(1482535,)

In [109]:
#Test Train sample size
import numpy as np

# Work on a random subset of rows to keep the experiments fast.
n_sample = 100000

# Row positions are drawn from the actual data length instead of a hard-coded
# row count, and from a seeded generator so the sample (and therefore the
# seeded train/test split built from it) is reproducible between runs.
sample_rng = np.random.RandomState(95)
sample = sample_rng.permutation(len(y_categorical))[:n_sample]

In [110]:
# `sample` holds row positions, so every object is indexed positionally to
# keep the target, the one-hot features and the TF-IDF rows aligned.
ohe_cleaned_categorical_sample = ohe_cleaned_categorical.iloc[sample]
tfidf_transformed_sample = tfidf_transformed[sample]

# Plain float array for the target: casting prices to int would silently
# truncate any fractional price.
y_categorical_sample = np.asarray(y_categorical.iloc[sample], dtype=float)

In [111]:
# split into train and test
# Split target, one-hot features and TF-IDF rows with one call so the three
# stay row-aligned. The former `.reindex()` calls were removed: without
# arguments they only returned a discarded copy of each frame.
# NOTE(review): test_size=0.7 keeps just 30% of the sample for training —
# confirm this is intended rather than train_size=0.7.
yTrain, yTest, XTrain, XTest, tfidfTrain, tfidfTest = train_test_split(
    y_categorical_sample,
    ohe_cleaned_categorical_sample,
    tfidf_transformed_sample,
    test_size=0.7,
    random_state=95,
)

Unnamed: 0,shipping,item_condition_id_1,item_condition_id_2,item_condition_id_3,item_condition_id_4,item_condition_id_5,brand_name_!iT Jeans,brand_name_% Pure,brand_name_10.Deep,brand_name_191 Unlimited,...,category_name_3_Wool,category_name_3_Work & Safety,category_name_3_Wrap,category_name_3_Wristlet,category_name_3_Writing,category_name_3_Yarn,category_name_3_Yoga & Pilates,category_name_3_Zipper,category_name_3_iPad/Tablet/eBook Access,category_name_3_iPad/Tablet/eBook Readers
1281457,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1396300,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1340513,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
983327,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900816,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
617680,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1012321,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
172666,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1202226,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1005803,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [113]:
# specify number of folds
from sklearn.model_selection import KFold
# Same number of splits for the outer (non-text) and inner (text) fold
# generators; two separate KFold objects are kept under the original names.
folds = 2
kf, kf_inner = KFold(n_splits=folds), KFold(n_splits=folds)

In [None]:
%time
# Test train and validate
alpha_values = [0.01,0.1, 1, 10]
from sklearn.linear_model import Lasso
import copy

mseOuterTrain = []
mseOuterVal = []
mseInnerTrain = []
mseInnerVal = []
mseOverallTrain = []
mseOverallVal= []


for i, (train_index, val_index) in enumerate(kf.split(yTrain)):
    print(i)
    yTrain_fold = yTrain[train_index]
    yVal_fold = yTrain[val_index]

    XTrain_fold = XTrain.iloc[train_index]
    XVal_fold = XTrain.iloc[val_index]
    

    for a_outer in alpha_values:
        print(i, a_outer)
        lasso = Lasso(alpha=a_outer, random_state=111, max_iter=1000).fit(XTrain_fold, yTrain_fold)
        predict_train = lasso.predict(XTrain_fold)
        train_residuals = yTrain_fold - predict_train
        mseOuterTrain.append(copy.deepcopy((i, None, a_outer , np.square(train_residuals).mean())))
        
        predict_val = lasso.predict(XVal_fold)
        val_residuals = yVal_fold - predict_val
        mseOuterVal.append(copy.deepcopy((i, None, a_outer , np.square(val_residuals).mean())))
        
        outer_residuals = yTrain - lasso.predict(XTrain)
            
        for j, (train_inner_index, val_inner_index) in enumerate(kf_inner.split(tfidfTrain)):
            tfidfTrain_fold = tfidfTrain[train_inner_index]
            tfidfVal_fold = tfidfTrain[val_inner_index]
            
            outer_residuals_train = outer_residuals[train_inner_index]
            outer_residuals_val = outer_residuals[val_inner_index]
        
            for a_inner in alpha_values:
                print(j, a_outer, a_inner)
                lasso_inner = Lasso(alpha=a_inner, random_state=111, max_iter=1000).fit(tfidfTrain_fold, outer_residuals_train)
                predict_inner_train = lasso_inner.predict(tfidfTrain_fold)
                inner_train_residuals = train_residuals - predict_inner_train
                mseInnerTrain.append(copy.deepcopy((i, j, a_inner, np.square(inner_train_residuals).mean())))

                predict_inner_val = lasso_inner.predict(tfidfVal_fold)
                inner_val_residuals = val_residuals - predict_inner_val
                mseInnerVal.append(copy.deepcopy((i, j, a_inner, np.square(inner_val_residuals).mean())))
            
                residuals_overall_train = yTrain_fold - predict_train + predict_inner_train
                mseOverallTrain.append((i, j, a_outer, a_inner, np.square(residuals_overall_train).mean()))
            
                residuals_overall_val = yVal_fold - predict_val + predict_inner_val
                mseOverallVal.append((i, j, a_outer, a_inner, np.square(residuals_overall_val).mean()))

    

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 6.91 µs
0
0 0.01
0 0.01 0.01
0 0.01 0.1
0 0.01 1
0 0.01 10
1 0.01 0.01
1 0.01 0.1
1 0.01 1
1 0.01 10
0 0.1


In [None]:
import operator

# Show the combined-model validation records ordered by the outer alpha
# (element 2 of each record).
by_outer_alpha = operator.itemgetter(2)
sorted(mseOverallVal, key=by_outer_alpha)

## Lasso: Category (non-text) data

In [14]:
# One term per line, no header. keep_default_na=False stops pandas from
# turning tokens such as "na", "nan" or "null" into NaN, which would otherwise
# corrupt the term labels used later as column names.
vocab = pd.read_csv('Data/vocabulary.txt', sep=" ", header=None, keep_default_na=False)

In [None]:
%%time
from sklearn.linear_model import LassoCV, Lasso

#alpha_values = np.logspace(-4, 2, 6)
alpha_values = [0.001,0.01,0.1]

mseRow = []
mseAll = []
mseOuterTrain = []
mseOuterValidate = []
mseInner = []

#manually grid search over alpha
for alphaOut in alpha_values:
    
    #non-Text data
    lasso = Lasso(alpha=alphaOut, random_state=111, max_iter=1000).fit(XTrain, yTrain)
    yPredOut = lasso.predict(XValid)
    residuals = yValid - yPredOut
    
    mseRow = []
    
    #can't figure out how to implement a validation or test accuracy as the training of the inner model is dependent 
    # on the outermodel's predictions.  If we predict on XValid, then we need to train the inner model on XValid.
    # not sure if this makes sense
    
    #GridSearch manually
    for alphaIn in alpha_values:
        lassoTF = Lasso(alpha=alphaIn, random_state=777, max_iter=1000).fit(tfidfValid, residuals)
        yPredIn = lassoTF.predict(tfidfValid)
        
        #add predictions from non-Text and Text regressions
        yPredFull = yPredOut + yPredIn
        
        #calc mse between full predictions & actuals
        mseRow.append(mean_squared_error(yValid, yPredFull))
    
    #store for of mse's
    mseAll.append(mseRow)
    
mseAll = np.array(mseAll)

In [None]:
#get bestAlphaOut & In based on the smallest mse from gridsearch
# Locate the smallest MSE in the grid: the row index selects the outer
# (non-text) alpha, the column index the inner (text) alpha.
bestAlpha_idx = np.unravel_index(mseAll.argmin(), mseAll.shape)
outer_pos, inner_pos = bestAlpha_idx
bestAlphaOut = alpha_values[outer_pos]
bestAlphaIn = alpha_values[inner_pos]

print()
print("Best Alpha for Outer Lasso (non-text): {}".format(bestAlphaOut))
print("Best Alpha for Linner Lasso (text): {}".format(bestAlphaIn))

## Run final model with bestAlphas

In [None]:
# Stage 1: non-text Lasso with the selected alpha, fit on the training rows.
lasso = Lasso(alpha=bestAlphaOut, random_state=111).fit(XTrain, yTrain)
yPredOut = lasso.predict(XValid)

# Stage 2 learns the stage-1 residuals. They must come from the TRAINING rows
# so they line up row-for-row with tfidfTrain.
residualsBest = yTrain - lasso.predict(XTrain)
lassoTF = Lasso(alpha=bestAlphaIn, random_state=111).fit(tfidfTrain, residualsBest)

# The text prediction has to come from the text model (lassoTF), not from the
# non-text model.
yPredIn = lassoTF.predict(tfidfValid)

# Combined prediction on the validation rows and its MSE.
yPredFull = yPredOut + yPredIn

mse_Final = mean_squared_error(yValid, yPredFull)
print("The MSE for the final tuned models is: {}".format(mse_Final))

## Pickle trained models

In [None]:
# sklearn.externals.joblib was removed from recent scikit-learn releases;
# prefer the standalone package and fall back to the bundled copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist both fitted stages (non-text model and text model).
joblib.dump(lasso, 'lasso_cat.pkl')
joblib.dump(lassoTF, 'lasso_tfidf.pkl')
# To restore: clf = joblib.load('lasso_cat.pkl')  (only load pickles you trust)

# Combine non-text & text data and run a single Lasso to see whether the predictions differ

In [None]:
#combine non-Text and Text to test it out
# tfDTrain = tfidfTrain.todense()
# comboTrain = pd.concat([XTrain.reset_index(), pd.DataFrame(tfDTrain)], axis=1)
# comboTrain.shape

In [None]:
#save lasso on category data as DataFrame, write to csv
# lasso_cat_coef = pd.DataFrame(lasso.coef_, index=XTrain.columns)
# lasso_cat_coef.to_csv("lasso_cat_coef.csv")

# residualsDF = pd.DataFrame(residuals)
# residualsDF.to_csv("residuals.csv")

## Lasso: Text data on residuals

In [None]:
#save lasso on TFIDF data as DataFrame, write to csv
# lasso_tfidf_coef = pd.DataFrame(lassoTF.coef_, index=vocab.iloc[:,0].values)
# lasso_tfidf_coef.to_csv("lasso_tfidf_coef.csv")

## Lasso: Text data on residuals (per category1)

In [None]:
#get the category1 column headers
# Select the one-hot columns of the top-level category by their get_dummies
# prefix instead of by hard-coded column positions, which silently shift
# whenever the set of feature columns changes.
cat1_cols = XTrain.columns[XTrain.columns.str.startswith("category_name_1_")]

# Give the residuals a fresh 0..n-1 index so they can be filtered with row
# positions; going through np.asarray also works when they are a plain array.
residuals = pd.Series(np.asarray(residuals))

In [None]:
%%time
cat1_coefs = []
tfidfTrain_df = pd.DataFrame(tfidfTrain.todense())  #I think can remove this and uncomment tocsr() line below

for cat1 in cat1_cols:
    #get indices where this category=1
    idx = np.where(XTrain[cat1] == 1)[0]
   
    #filter data with the indices
    X_tf = tfidfTrain_df.iloc[idx,:]   #Don't think need to use this, uncomment next time to deal with sparse
    #X_tf = tfidfTrain.tocsr()[idx,:]  #way to filter sparse matrx with row indices
    y_tf = residuals[idx]
    
    #run Lasso, fit, store coef array
    lassoCat = Lasso(alpha=bestAlphaIn, random_state=10)
    lassoCat.fit(X_tf, y_tf)

    #add this cat1 coefs to a list
    cat1_coefs.append(lassoCat.coef_)

In [None]:
# Convert to a DataFrame: one row per category-1 column, one column per
# vocabulary term, then write it to CSV.
vocab_terms = vocab.iloc[:, 0].to_numpy()
lasso_allCat1_coef = pd.DataFrame(cat1_coefs, index=cat1_cols, columns=vocab_terms)
lasso_allCat1_coef.to_csv("lass_allCat1_coef.csv")

<hr><hr><br>

## Recurrent Neural Network

In [None]:
# from keras.models import Sequential
# from keras.layers.core import Dense, Dropout, Activation, Reshape
# from keras.layers.recurrent import LSTM, RNN, GRU, SimpleRNN
# from keras.models import load_model
# import keras
# import h5py



In [None]:
# #default layers = 6, neurons list always should equal 6
# def build_NN(inShape, nnType='RNN'):
    
#     model = Sequential()
#     model.add(Embedding(inShape[1], 64, input_length=maxLen))  #input layer
#     if nnType == 'RNN':
#         model.add(SimpleRNN(32, return_sequences=True))
#         model.add(SimpleRNN(32))
#         model.add(Dense(16, activation='relu'))
#         model.add(Dense(8, activation='relu'))
#     elif nnType == 'GRU':
#         #model.add(Dense(64, input_shape=(inShape[1])))  #input layer
#         model.add(GRU(32, return_sequences=True))
#         model.add(GRU(32))
#         model.add(Dense(16, activation='relu'))
#         model.add(Dense(8, activation='relu'))
#     elif nnType == 'LSTM':
#         #model.add(Dense(64, input_shape=(inShape[1])))  #input layer
#         model.add(LSTM(32, return_sequences=True)) 
#         model.add(LSTM(32))
#         model.add(Dense(16, activation='relu'))      
#         model.add(Dense(8, activation='relu'))
#     elif nnType == 'BASE':
#         #model.add(Dense(64, input_shape=(inShape[1],)))  #input layer
#         model.add(Dense(32, activation='relu'))
#         model.add(Dense(32, activation='relu'))
#         model.add(Dense(16, activation='relu'))
#         model.add(Reshape((-1,)))
#         model.add(Dense(8, activation='relu'))
#     else:
#         print('should not be here')
    
#     model.add(Dense(1, activation='linear', kernel_initializer="uniform")) #output layer
#     model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
#     #model.summary() 

#     return model

In [None]:
# #params for NN
# epochs = 10
# batchSize = 3000

# #Need to prep the data for the NN

# X = tokenizer.texts_to_sequences(train5k)
# data = pad_sequences(sequences, maxlen=max_len)


# modelRNN = build_NN(tfidf_train.shape, nnType='RNN')
# modelRNN.summary()