# MLPS - Mercari Price ML

In [2]:
import pandas as pd 
import numpy as np

### Perform data cleaning

In [2]:
data = pd.read_csv("Data/train.tsv", delimiter="\t", index_col=0)

# remove items with out a price
data = data[pd.notna(data["price"])]

data["item_description"] = data["item_description"].replace("No description yet", "")
data["item_description"] = data["item_description"].replace(np.nan, "")

temp = data["category_name"].fillna('').str.split('/')
              
data["category_name_1"] = temp.str[0]
data["category_name_2"] = temp.str[1]
data["category_name_3"] = temp.str[2:].str.join("/")

  mask |= (ar1 == a)


### Implement porter stemming in count vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.stem.porter import *
import string

class StemmerTokenizer(object):
    def __init__(self):
        self.ps = PorterStemmer()
        self.translator = str.maketrans('', '', string.punctuation + string.digits)
    def __call__(self, doc):
        return [self.ps.stem(w) for w in doc
                .encode('ascii', errors='ignore')
                .decode('ascii')
                .translate(self.translator)
                .split()]

vectorizer = CountVectorizer(lowercase = True,
                             max_df = .5,
                             min_df = .001,
                             tokenizer = StemmerTokenizer(),
                             stop_words='english')

### Count vectorize the data

In [62]:
%%time
tfm = vectorizer.fit_transform(data["item_description"])

Wall time: 32min 16s


In [63]:
vocab = vectorizer.vocabulary_ 
removed_words = vectorizer.stop_words_

Wall time: 1.45 s


In [63]:
%%time
tfidf_vectorizer = TfidfTransformer()
tfidf_transformed = tfidf_vectorizer.fit_transform(tfm)

Wall time: 1.45 s


### Save results for time savings

In [64]:
from scipy.sparse import save_npz
save_npz("tfm.npz", tfm)
save_npz("tfidf_transformed.npz", tfidf_transformed)

### Load files as necessary for time savings

In [3]:
from scipy.sparse import load_npz

tfm = load_npz("Data/tfm.npz")
tfidf_transformed = load_npz("Data/tfidf_transformed.npz")
cleaned_categorical = pd.read_csv('Data/train_clean.tsv', sep='\t', header=0)

In [4]:
tfm.shape

(1482535, 1793)

In [5]:
tfidf_transformed.shape

(1482535, 1793)

In [6]:
cleaned_categorical.shape

(1482535, 11)

## Vocab: write vectorized words to txt

In [None]:
import operator
with open("vocabulary.txt", "w") as f:
    f.write("".join([k + '\n' for k, v in sorted(vectorizer.vocabulary_.items(), key=operator.itemgetter(1))]))

## Split into train and test

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

y_categorical = cleaned_categorical['price'].astype('float', copy=False)
cleaned_categorical.drop(["train_id","name","category_name","item_description","price"], axis=1, inplace=True)
cleaned_categorical["item_condition_id"] = cleaned_categorical["item_condition_id"].astype('str', copy=False)

In [8]:
ohe_cleaned_categorical = pd.get_dummies(cleaned_categorical)

In [9]:
#Test Train split
y_cat_train, y_cat_test, X_cat_train, X_cat_test, tfidf_train, tfidf_test, tfm_train, tfm_test = train_test_split(
    y_categorical, ohe_cleaned_categorical, tfidf_transformed, tfm, test_size=0.2, random_state=95)

In [12]:
#subset the data to test models using train_test_split function
yTrain, yTest, XTrain, XTest, tfidfTrain, tfidfTest, tfmTrain, tfmTest = train_test_split(
    y_cat_test, X_cat_test, tfidf_test, tfm_test, test_size=0.3, random_state=10)

## Lasso: Category (non-text) data

In [14]:
%%time
from sklearn.linear_model import LassoCV, Lasso

lasso = LassoCV(cv=5, n_jobs=-1).fit(XTrain, yTrain)

In [15]:
yPred = lasso.predict(XTrain)
residuals = yTrain - yPred

In [26]:
#save lasso on category data as DataFrame, write to csv
lasso_cat_coef = pd.DataFrame(lasso.coef_, index=XTrain.columns)
lasso_cat_coef.to_csv("lasso_cat_coef.csv")

## Lasso: Text data on residuals

In [40]:
alpha = lasso.alpha_

In [45]:
%%time
lassoTF = Lasso(alpha=alpha, random_state=10)
lassoTF.fit(tfidfTrain, residuals)

CPU times: user 7.22 s, sys: 30 ms, total: 7.25 s
Wall time: 7.28 s


In [47]:
vocab = pd.read_csv('Data/vocabulary.txt', sep=" ", header=None)

In [56]:
#save lasso on TFIDF data as DataFrame, write to csv
lasso_tfidf_coef = pd.DataFrame(lassoTF.coef_, index=vocab.iloc[:,0].values)
lasso_tfidf_coef.to_csv("lasso_tfidf_coef.csv")

## Lasso: Text data on residuals (per category1)

In [141]:
#get the category1 column headers
cat1_cols = XTrain.iloc[:,4815:4825].columns

#reset index so can filter using list of indices
residuals.reset_index(drop=True, inplace=True)

In [142]:
%%time
cat1_coefs = []
tfidfTrain_df = pd.DataFrame(tfidfTrain.todense())  #I think can remove this and uncomment tocsr() line below

for cat1 in cat1_cols:
    #get indices where this category=1
    idx = np.where(XTrain[cat1] == 1)[0]
   
    #filter data with the indices
    X_tf = tfidfTrain_df.iloc[idx,:]   #Don't think need to use this, uncomment next time to deal with sparse
    #X_tf = tfidfTrain.tocsr()[idx,:]  #way to filter sparse matrx with row indices
    y_tf = residuals[idx]
    
    #run Lasso, fit, store coef array
    lassoCat = Lasso(alpha=alpha, random_state=10)
    lassoCat.fit(X_tf, y_tf)

    #add this cat1 coefs to a list
    cat1_coefs.append(lassoCat.coef_)

CPU times: user 58.8 s, sys: 10.2 s, total: 1min 9s
Wall time: 40.8 s


In [147]:
#covert to DF with cols=vocab and index=category1 names
lasso_allCat1_coef = pd.DataFrame(cat1_coefs, columns=vocab.iloc[:,0].values, index=cat1_cols)
lasso_allCat1_coef.to_csv("lass_allCat1_coef.csv")

<hr><hr><br>

## Recurrent Neural Network

In [148]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Reshape
from keras.layers.recurrent import LSTM, RNN, GRU, SimpleRNN
from keras.models import load_model
import keras
import h5py



Using TensorFlow backend.


In [179]:
#default layers = 6, neurons list always should equal 6
def build_NN(inShape, nnType='RNN'):
    
    model = Sequential()
    model.add(Embedding(inShape[1], 64, input_length=maxLen))  #input layer
    if nnType == 'RNN':
        model.add(SimpleRNN(32, return_sequences=True))
        model.add(SimpleRNN(32))
        model.add(Dense(16, activation='relu'))
        model.add(Dense(8, activation='relu'))
    elif nnType == 'GRU':
        #model.add(Dense(64, input_shape=(inShape[1])))  #input layer
        model.add(GRU(32, return_sequences=True))
        model.add(GRU(32))
        model.add(Dense(16, activation='relu'))
        model.add(Dense(8, activation='relu'))
    elif nnType == 'LSTM':
        #model.add(Dense(64, input_shape=(inShape[1])))  #input layer
        model.add(LSTM(32, return_sequences=True)) 
        model.add(LSTM(32))
        model.add(Dense(16, activation='relu'))      
        model.add(Dense(8, activation='relu'))
    elif nnType == 'BASE':
        #model.add(Dense(64, input_shape=(inShape[1],)))  #input layer
        model.add(Dense(32, activation='relu'))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(16, activation='relu'))
        model.add(Reshape((-1,)))
        model.add(Dense(8, activation='relu'))
    else:
        print('should not be here')
    
    model.add(Dense(1, activation='linear', kernel_initializer="uniform")) #output layer
    model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
    #model.summary() 

    return model

In [180]:
#params for NN
epochs = 10
batchSize = 3000

In [None]:
#Need to prep the data for the NN

X = tokenizer.texts_to_sequences(train5k)
data = pad_sequences(sequences, maxlen=max_len)

In [181]:

modelRNN = build_NN(tfidf_train.shape, nnType='RNN')
modelRNN.summary()

1793
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_22 (Dense)             (None, 1793, 64)          128       
_________________________________________________________________
simple_rnn_13 (SimpleRNN)    (None, 1793, 32)          3104      
_________________________________________________________________
simple_rnn_14 (SimpleRNN)    (None, 32)                2080      
_________________________________________________________________
dense_23 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_24 (Dense)             (None, 8)                 136       
_________________________________________________________________
dense_25 (Dense)             (None, 1)                 9         
Total params: 5,985
Trainable params: 5,985
Non-trainable params: 0
_________________________________________________________________


In [183]:
#modelRNN.fit(tfidf_train, y_cat_train, batch_size=batchSize, epochs=epochs, validation_split=0.2, verbose=1)