# MLPS - Mercari Price ML

In [None]:
import pandas as pd 
import numpy as np
from sklearn.metrics import mean_squared_error

# Notebook-wide display settings (they affect printing only, not the data).
# NumPy: print floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)
# pandas: render every float with exactly three decimal places.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

### Perform data cleaning

In [None]:
# Load the raw tab-separated listings; the first column becomes the index.
data = pd.read_csv("Data/train.tsv", delimiter="\t", index_col=0)

# Drop listings without a price: they cannot be used as regression targets.
data = data.dropna(subset=["price"])

# Treat the placeholder text and missing descriptions alike: both become "".
descriptions = data["item_description"].replace("No description yet", "")
data["item_description"] = descriptions.fillna("")

# Split "a/b/c/..." category paths into three levels. Anything beyond the
# second separator is re-joined with "/" and kept together as level 3.
temp = data["category_name"].fillna('').str.split('/')

data["category_name_1"] = temp.str.get(0)
data["category_name_2"] = temp.str.get(1)
data["category_name_3"] = temp.str.slice(start=2).str.join("/")

### Implement porter stemming in count vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.stem.porter import *
import string

class StemmerTokenizer(object):
    """Callable tokenizer: whitespace-split a document and Porter-stem each token.

    Before splitting, non-ASCII characters are dropped and every ASCII
    punctuation character and digit is deleted (not replaced by a space, so
    "a,b" becomes the single token "ab").
    """

    def __init__(self):
        # Translation table that deletes punctuation and digits in one pass.
        self.translator = str.maketrans('', '', string.punctuation + string.digits)
        self.ps = PorterStemmer()

    def __call__(self, doc):
        """Return the list of stemmed tokens for the string ``doc``."""
        # Round-trip through ASCII to discard characters outside that range.
        ascii_only = doc.encode('ascii', errors='ignore').decode('ascii')
        cleaned = ascii_only.translate(self.translator)
        stem = self.ps.stem
        return list(map(stem, cleaned.split()))

# Bag-of-words settings for the item descriptions.
# max_df / min_df are given as fractions of documents: terms appearing in
# more than 50% or in fewer than 0.1% of the descriptions are discarded.
# NOTE(review): the built-in 'english' stop-word list is not stemmed while the
# tokenizer output is, so stemmed forms of stop words may slip through --
# confirm whether that matters for this vocabulary.
vectorizer_options = dict(
    lowercase=True,
    max_df=.5,
    min_df=.001,
    tokenizer=StemmerTokenizer(),
    stop_words='english',
)
vectorizer = CountVectorizer(**vectorizer_options)

### Count vectorize the data

In [None]:
%%time
# Learn the vocabulary from the cleaned item descriptions and build the
# document-term count matrix in a single pass.
tfm = vectorizer.fit_transform(data["item_description"])

In [None]:
# Mapping of each kept term to its column index in `tfm`.
vocab = vectorizer.vocabulary_ 
# Terms the fitted vectorizer ignored (scikit-learn stores here the terms cut
# by the document-frequency limits).
removed_words = vectorizer.stop_words_

In [None]:
%%time
# Re-weight the raw term counts with TF-IDF using the transformer's defaults.
tfidf_vectorizer = TfidfTransformer()
tfidf_transformed = tfidf_vectorizer.fit_transform(tfm)

### Save results for time savings

In [None]:
from scipy.sparse import save_npz
# Cache both sparse matrices so the slow vectorization step can be skipped.
# They are written under Data/ because that is where the reload section of
# this notebook reads "tfm.npz" and "tfidf_transformed.npz" back from; saving
# to the working directory left the two halves of the round trip pointing at
# different locations.
save_npz("Data/tfm.npz", tfm)
save_npz("Data/tfidf_transformed.npz", tfidf_transformed)

## Vocab: write vectorized words to txt

In [None]:
import operator
# Write one vocabulary term per line, ordered by the term's column index, so
# that line i of the file names column i of the count / tf-idf matrices.
# The file goes under Data/ because the Lasso section later reads
# 'Data/vocabulary.txt'; writing to the working directory did not match.
vocabulary = vectorizer.vocabulary_
with open("Data/vocabulary.txt", "w") as f:
    # Sorting the keys by their mapped index gives the same order as sorting
    # the (term, index) pairs on the index.
    f.writelines(term + '\n' for term in sorted(vocabulary, key=vocabulary.get))

<br><br><br><hr><hr>

# Load files as necessary for time savings

In [3]:
import pandas as pd 
import numpy as np
from sklearn.metrics import mean_squared_error

# Same display settings as the top of the notebook, repeated so this section
# can be run on its own: fixed-point NumPy output, three-decimal pandas floats.
np.set_printoptions(suppress=True)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
from scipy.sparse import load_npz

# Reload the cached sparse count and tf-idf matrices from Data/.
tfm = load_npz("Data/tfm.npz")
tfidf_transformed = load_npz("Data/tfidf_transformed.npz")
# Tab-separated table whose first row is the header.
# NOTE(review): presumably the cleaned listings produced by the cleaning step
# above -- the code that writes this file is not shown in this notebook.
cleaned_categorical = pd.read_csv('Data/train_clean.tsv', sep='\t', header=0)

In [4]:
# Sanity check: (rows, columns) of the reloaded count matrix.
tfm.shape

(1482535, 1793)

In [5]:
# Sanity check: the tf-idf matrix should have the same shape as the count matrix.
tfidf_transformed.shape

(1482535, 1793)

In [6]:
# Sanity check: the row count should match the two text matrices above.
cleaned_categorical.shape

(1482535, 11)

## Split into train and test

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Regression target: the listing price as floats.
y_categorical = cleaned_categorical['price'].astype('float', copy=False)

# Remove identifiers, free-text fields, the unsplit category path and the
# target itself so that only the non-text predictors remain.
non_feature_cols = ["train_id", "name", "category_name", "item_description", "price"]
cleaned_categorical.drop(columns=non_feature_cols, inplace=True)

# Cast the condition id to string so it is one-hot encoded as a category
# rather than treated as a number.
condition_as_text = cleaned_categorical["item_condition_id"].astype('str', copy=False)
cleaned_categorical["item_condition_id"] = condition_as_text

In [8]:
# One-hot encode the remaining string-valued columns; numeric columns pass
# through unchanged. The result is the non-text feature matrix.
ohe_cleaned_categorical = pd.get_dummies(cleaned_categorical)

In [None]:
# #Test Train split
# y_cat_train, y_cat_test, X_cat_train, X_cat_test, tfidf_train, tfidf_test, tfm_train, tfm_test = train_test_split(
#     y_categorical, ohe_cleaned_categorical, tfidf_transformed, tfm, test_size=0.2, random_state=95)

In [9]:
# Test/train split.
# test_size=0.78 keeps only 22% of the rows as the working sample (*Train).
# The target, the one-hot features and the tf-idf matrix are split together so
# their rows stay aligned. The 78% partitions bound to yTest / XTest /
# tfidfTest here are rebound by the later "~50K in Test" split.
yTrain, yTest, XTrain, XTest, tfidfTrain, tfidfTest = train_test_split(
    y_categorical, ohe_cleaned_categorical, tfidf_transformed, test_size=0.78, random_state=95)

In [10]:
# Number of rows kept in the 22% working sample.
yTrain.shape

(326157,)

In [11]:
# Carve validation and test sets out of the working sample. Target, one-hot
# features and tf-idf rows are always split together to stay aligned.

# ~50K in Valid (just over 200K left in Train after both splits):
# 15% of the working sample becomes the validation set.
yTrain, yValid, XTrain, XValid, tfidfTrain, tfidfValid = train_test_split(
    yTrain, XTrain, tfidfTrain, test_size=0.15, random_state=10)

# ~50K in Test: 18% of what remains becomes the test set. This rebinds
# yTest / XTest / tfidfTest, discarding the 78% partitions of the first split.
yTrain, yTest, XTrain, XTest, tfidfTrain, tfidfTest = train_test_split(
    yTrain, XTrain, tfidfTrain, test_size=0.18, random_state=10)

## Lasso: Category (non-text) data

In [12]:
# Load the vocabulary as a single-column frame, one term per row (no header).
# keep_default_na=False stops pandas from turning literal terms such as "nan"
# or "null" into NaN, which would corrupt the labels built from this column.
vocab = pd.read_csv('Data/vocabulary.txt', sep=" ", header=None, keep_default_na=False)

In [13]:
%%time
from sklearn.linear_model import LassoCV, Lasso

#alpha_values = np.logspace(-4, 2, 6)
alpha_values = [0.001,0.01,0.1]

mseRow = []
mseAll = []

#manually grid search over alpha
for alphaOut in alpha_values:
    
    #non-Text data
    lasso = Lasso(alpha=alphaOut, random_state=111, max_iter=1000).fit(XTrain, yTrain)
    yPredOut = lasso.predict(XValid)
    residuals = yValid - yPredOut
    
    mseRow = []
    
    #can't figure out how to implement a validation or test accuracy as the training of the inner model is dependent 
    # on the outermodel's predictions.  If we predict on XValid, then we need to train the inner model on XValid.
    # not sure if this makes sense
    
    #GridSearch manually
    for alphaIn in alpha_values:
        lassoTF = Lasso(alpha=alphaIn, random_state=777, max_iter=1000).fit(tfidfValid, residuals)
        yPredIn = lassoTF.predict(tfidfValid)
        
        #add predictions from non-Text and Text regressions
        yPredFull = yPredOut + yPredIn
        
        #calc mse between full predictions & actuals
        mseRow.append(mean_squared_error(yValid, yPredFull))
    
    #store for of mse's
    mseAll.append(mseRow)
    
mseAll = np.array(mseAll)

CPU times: user 40min 55s, sys: 6min, total: 46min 55s
Wall time: 20min 28s


In [14]:
# Locate the smallest MSE in the grid: the row position selects the outer
# (non-text) alpha and the column position selects the inner (text) alpha.
flat_best = np.argmin(mseAll)
bestAlpha_idx = np.unravel_index(flat_best, mseAll.shape)
outer_pos, inner_pos = bestAlpha_idx
bestAlphaOut = alpha_values[outer_pos]
bestAlphaIn = alpha_values[inner_pos]

print()
print("Best Alpha for Outer Lasso (non-text): {}".format(bestAlphaOut))
print("Best Alpha for Linner Lasso (text): {}".format(bestAlphaIn))


Best Alpha for Outer Lasso (non-text): 0.001
Best Alpha for Linner Lasso (text): 0.001


## Run final model with bestAlphas

In [None]:
# Refit the two-stage model with the selected alphas. Both stages are fitted
# on the training rows only, so the scores below are genuinely out-of-sample.

# Stage 1: non-text features -> price.
lasso = Lasso(alpha=bestAlphaOut, random_state=111).fit(XTrain, yTrain)
# Training residuals of stage 1 (row-aligned with tfidfTrain).
residualsBest = yTrain - lasso.predict(XTrain)

# Stage 2: tf-idf features -> stage-1 residuals.
lassoTF = Lasso(alpha=bestAlphaIn, random_state=111).fit(tfidfTrain, residualsBest)

# Validation score: sum of both stages' predictions on the validation rows.
yPredOut = lasso.predict(XValid)
yPredIn = lassoTF.predict(tfidfValid)

yPredFull = yPredOut + yPredIn

mse_Final = mean_squared_error(yValid, yPredFull)
print("The MSE for the final tuned models is: {}".format(mse_Final))

# The alphas were chosen on the validation rows, so also report the score on
# the test split, which took no part in fitting or tuning.
yPredTest = lasso.predict(XTest) + lassoTF.predict(tfidfTest)
mse_Test = mean_squared_error(yTest, yPredTest)
print("The MSE on the held-out test set is: {}".format(mse_Test))

## Pickle trained models

In [None]:
# Persist both fitted models to the working directory.
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; the standalone joblib package is the supported import. The fallback
# keeps the cell working on old environments that only ship the vendored copy.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(lasso, 'lasso_cat.pkl')
joblib.dump(lassoTF, 'lasso_tfidf.pkl')
#clf = joblib.load('lasso_cat.pkl') 

In [None]:
# Persist both fitted models into the project's Data folder.
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; the standalone joblib package is the supported import. The fallback
# keeps the cell working on old environments that only ship the vendored copy.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# NOTE(review): these paths have no leading "/", so they are resolved relative
# to the current working directory -- presumably an absolute path was
# intended; confirm before relying on where the files end up.
joblib.dump(lasso, 'Users/Arobster/Google Drive/CMU/Courses/95-828 - Machine Learning/MLPSProject/Data/lasso_cat.pkl')
joblib.dump(lassoTF, 'Users/Arobster/Google Drive/CMU/Courses/95-828 - Machine Learning/MLPSProject/Data/lasso_tfidf.pkl')
#clf = joblib.load('lasso_cat.pkl') 

# Combine non-text & text data and run a single Lasso to see whether the predictions differ

In [None]:
#combine non-Text and Text to test it out
# tfDTrain = tfidfTrain.todense()
# comboTrain = pd.concat([XTrain.reset_index(), pd.DataFrame(tfDTrain)], axis=1)
# comboTrain.shape

In [None]:
#save lasso on category data as DataFrame, write to csv
# lasso_cat_coef = pd.DataFrame(lasso.coef_, index=XTrain.columns)
# lasso_cat_coef.to_csv("lasso_cat_coef.csv")

# residualsDF = pd.DataFrame(residuals)
# residualsDF.to_csv("residuals.csv")

## Lasso: Text data on residuals

In [None]:
#save lasso on TFIDF data as DataFrame, write to csv
# lasso_tfidf_coef = pd.DataFrame(lassoTF.coef_, index=vocab.iloc[:,0].values)
# lasso_tfidf_coef.to_csv("lasso_tfidf_coef.csv")

## Lasso: Text data on residuals (per category1)

In [None]:
# Column labels of the category-1 indicator features, picked by position.
# NOTE(review): 4815:4825 is a hard-coded position range; it presumably covers
# the ten category_name_1 dummy columns -- re-check if the feature set changes.
cat1_cols = XTrain.columns[4815:4825]

# Give the residuals a fresh 0..n-1 index (in place) so they can be selected
# with lists of row positions.
residuals.reset_index(inplace=True, drop=True)

In [None]:
%%time
cat1_coefs = []
tfidfTrain_df = pd.DataFrame(tfidfTrain.todense())  #I think can remove this and uncomment tocsr() line below

for cat1 in cat1_cols:
    #get indices where this category=1
    idx = np.where(XTrain[cat1] == 1)[0]
   
    #filter data with the indices
    X_tf = tfidfTrain_df.iloc[idx,:]   #Don't think need to use this, uncomment next time to deal with sparse
    #X_tf = tfidfTrain.tocsr()[idx,:]  #way to filter sparse matrx with row indices
    y_tf = residuals[idx]
    
    #run Lasso, fit, store coef array
    lassoCat = Lasso(alpha=bestAlphaIn, random_state=10)
    lassoCat.fit(X_tf, y_tf)

    #add this cat1 coefs to a list
    cat1_coefs.append(lassoCat.coef_)

In [None]:
# Convert to a DataFrame with one row per category-1 column and one column per
# vocabulary term, then write it to CSV.
vocab_terms = vocab.iloc[:, 0].values
lasso_allCat1_coef = pd.DataFrame(cat1_coefs, index=cat1_cols, columns=vocab_terms)
lasso_allCat1_coef.to_csv("lass_allCat1_coef.csv")

<hr><hr><br>

## Recurrent Neural Network

In [None]:
# from keras.models import Sequential
# from keras.layers.core import Dense, Dropout, Activation, Reshape
# from keras.layers.recurrent import LSTM, RNN, GRU, SimpleRNN
# from keras.models import load_model
# import keras
# import h5py



In [None]:
# #default layers = 6, neurons list always should equal 6
# def build_NN(inShape, nnType='RNN'):
    
#     model = Sequential()
#     model.add(Embedding(inShape[1], 64, input_length=maxLen))  #input layer
#     if nnType == 'RNN':
#         model.add(SimpleRNN(32, return_sequences=True))
#         model.add(SimpleRNN(32))
#         model.add(Dense(16, activation='relu'))
#         model.add(Dense(8, activation='relu'))
#     elif nnType == 'GRU':
#         #model.add(Dense(64, input_shape=(inShape[1])))  #input layer
#         model.add(GRU(32, return_sequences=True))
#         model.add(GRU(32))
#         model.add(Dense(16, activation='relu'))
#         model.add(Dense(8, activation='relu'))
#     elif nnType == 'LSTM':
#         #model.add(Dense(64, input_shape=(inShape[1])))  #input layer
#         model.add(LSTM(32, return_sequences=True)) 
#         model.add(LSTM(32))
#         model.add(Dense(16, activation='relu'))      
#         model.add(Dense(8, activation='relu'))
#     elif nnType == 'BASE':
#         #model.add(Dense(64, input_shape=(inShape[1],)))  #input layer
#         model.add(Dense(32, activation='relu'))
#         model.add(Dense(32, activation='relu'))
#         model.add(Dense(16, activation='relu'))
#         model.add(Reshape((-1,)))
#         model.add(Dense(8, activation='relu'))
#     else:
#         print('should not be here')
    
#     model.add(Dense(1, activation='linear', kernel_initializer="uniform")) #output layer
#     model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
#     #model.summary() 

#     return model

In [None]:
# #params for NN
# epochs = 10
# batchSize = 3000

# #Need to prep the data for the NN

# X = tokenizer.texts_to_sequences(train5k)
# data = pad_sequences(sequences, maxlen=max_len)


# modelRNN = build_NN(tfidf_train.shape, nnType='RNN')
# modelRNN.summary()