# MLPS - Mercari Price ML

In [2]:
import pandas as pd 
import numpy as np

### Perform data cleaning

In [None]:
data = pd.read_csv("Data/train.tsv", delimiter="\t", index_col=0)

# remove items with out a price
data = data[pd.notna(data["price"])]

data["item_description"] = data["item_description"].replace("No description yet", "")
data["item_description"] = data["item_description"].replace(np.nan, "")

temp = data["category_name"].fillna('').str.split('/')
              
data["category_name_1"] = temp.str[0]
data["category_name_2"] = temp.str[1]
data["category_name_3"] = temp.str[2:].str.join("/")

### Implement porter stemming in count vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.stem.porter import *
import string

class StemmerTokenizer(object):
    def __init__(self):
        self.ps = PorterStemmer()
        self.translator = str.maketrans('', '', string.punctuation + string.digits)
    def __call__(self, doc):
        return [self.ps.stem(w) for w in doc.split()]

vectorizer = CountVectorizer(lowercase = True,
                             max_df = .5,
                             min_df = .001,
                             tokenizer = StemmerTokenizer(),
                             stop_words='english')

### Count vectorize the data

In [None]:
%%time
tfm = vectorizer.fit_transform(data["item_description"])

In [None]:
%%time
tfidf_vectorizer = TfidfTransformer()
tfidf_transformed = tfidf_vectorizer.fit_transform(tfm)

### Save results for time savings

In [None]:
from scipy.sparse import save_npz
save_npz("tfm.npz", tfm)
save_npz("tfidf_transformed.npz", tfidf_transformed)

### Load files as necessary for time savings

In [4]:
from scipy.sparse import load_npz

tfm = load_npz("Data/tfm.npz")
tfidf_transformed = load_npz("Data/tfidf_transformed.npz")
cleaned_categorical = pd.read_csv('Data/train_clean.tsv', sep='\t', header=0)

In [5]:
tfm.shape

(1482535, 1798)

In [6]:
tfidf_transformed.shape

(1482535, 1798)

In [7]:
cleaned_categorical.shape

(1482535, 11)

# Split into train and test

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

y_categorical = cleaned_categorical["price"].astype('float', copy=False)
cleaned_categorical.drop(["train_id", "name", "category_name", "item_description", "price"], axis=1, inplace=True)
cleaned_categorical["item_condition_id"] = cleaned_categorical["item_condition_id"].astype('str', copy=False)

In [9]:
ohe_cleaned_categorical = pd.get_dummies(cleaned_categorical)

y_cat_train, y_cat_test, X_cat_train, X_cat_test, tfidf_train, tfidf_test, tfm_train, tfm_test = train_test_split(
    y_categorical, ohe_cleaned_categorical, tfidf_transformed, tfm, test_size=0.2, random_state=95)

# Split into X and Y and remove unused columns

In [10]:
ohe_cleaned_categorical.head()

Unnamed: 0,shipping,item_condition_id_1,item_condition_id_2,item_condition_id_3,item_condition_id_4,item_condition_id_5,brand_name_!iT Jeans,brand_name_% Pure,brand_name_10.Deep,brand_name_191 Unlimited,...,category_name_3_Wool,category_name_3_Work & Safety,category_name_3_Wrap,category_name_3_Wristlet,category_name_3_Writing,category_name_3_Yarn,category_name_3_Yoga & Pilates,category_name_3_Zipper,category_name_3_iPad/Tablet/eBook Access,category_name_3_iPad/Tablet/eBook Readers
0,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV 

# def modelvalidation(n_split, rand_state):
#     estimator

search_range = {"max_depth": [1, 2]}
folds = 2

y_catSm_train, extra, X_catSm_train, extra2 = train_test_split(y_cat_train, X_cat_train,  test_size=0.9, random_state=95)
#y_catSm_train, y_catSm_test, X_catSm_train, X_catSm_test = train_test_split(y_cat_train, y_cat_test, X_cat_train, X_cat_test,  test_size=0.9, random_state=95)

y_catSm_train.shape

(118602,)

In [15]:
X_catSm_train.shape

(118602, 5809)

In [16]:
cv = GridSearchCV(estimator=DecisionTreeRegressor(),  cv= folds, param_grid=search_range,  refit=True, n_jobs= -1, verbose=1, return_train_score=True)
cv.fit(X_catSm_train, y_catSm_train)  

Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   53.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   53.3s finished


GridSearchCV(cv=2, error_score='raise',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [1, 2]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=1)

In [19]:
X_catSm_train[0]

KeyError: 0

In [18]:
cv.predict(X_catSm_train[0])

0.037550773509564705

In [38]:
X_categorical_train.head()

Unnamed: 0,item_condition_id,brand_name,shipping,category_name_1,category_name_2,category_name_3
66980,1,Maybelline,1,Beauty,Makeup,Makeup Palettes
87674,2,Pokemon,1,Vintage & Collectibles,Trading Cards,Vintage
333597,3,Adidas,1,Sports & Outdoors,Golf,Golf Shoes
741182,3,,0,Women,Women's Accessories,Watches
1272239,2,Lululemon,0,Women,Athletic Apparel,Jackets
