# MLPS - Mercari Price ML

In [None]:
import pandas as pd 
import numpy as np

### Perform data cleaning

In [None]:
# Load the raw training data; the first column of the TSV becomes the index.
data = pd.read_csv("Data/train.tsv", delimiter="\t", index_col=0)

# Remove items without a price. The explicit .copy() makes `data` an
# independent frame, so the column assignments below do not write into a
# filtered slice of the loaded table (avoids pandas' SettingWithCopyWarning).
data = data[data["price"].notna()].copy()

# The placeholder text and missing descriptions are treated alike: both
# become the empty string.
data["item_description"] = (
    data["item_description"].replace("No description yet", "").fillna("")
)

# Split the "/"-separated category path into its levels. Missing categories
# are turned into "" first so the string accessors work on every row.
temp = data["category_name"].fillna("").str.split("/")

data["category_name_1"] = temp.str[0]
# Rows with no second level get NaN here (out-of-range list access).
data["category_name_2"] = temp.str[1]
# Everything from the third level on is re-joined, so deeper paths stay intact.
data["category_name_3"] = temp.str[2:].str.join("/")

### Implement Porter stemming in the count vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.stem.porter import *
import string

class StemmerTokenizer(object):
    """Callable tokenizer that strips punctuation and digits, then stems words.

    Instances are passed as the ``tokenizer`` argument of ``CountVectorizer``.
    """

    def __init__(self):
        self.ps = PorterStemmer()
        # Translation table that deletes every ASCII punctuation mark and digit.
        self.translator = str.maketrans('', '', string.punctuation + string.digits)

    def __call__(self, doc):
        """Return the stemmed, whitespace-separated tokens of ``doc``.

        Args:
            doc: The document text to tokenize.

        Returns:
            A list of stemmed tokens in document order; empty when nothing
            remains after punctuation and digits are removed.
        """
        # The table is an instance attribute, so it has to be reached through
        # ``self``; a bare ``translator`` lookup raises NameError.
        cleaned = doc.translate(self.translator)
        return [self.ps.stem(word) for word in cleaned.split()]

# Bag-of-words settings for the item descriptions: text is lowercased,
# tokenized by the stemming tokenizer, English stop words are dropped, and
# terms are kept only if their document frequency lies between min_df and
# max_df (floats are read as proportions of documents by scikit-learn).
vectorizer = CountVectorizer(
    tokenizer=StemmerTokenizer(),
    stop_words='english',
    lowercase=True,
    min_df=.001,
    max_df=.5,
)

### Count vectorize the data

In [None]:
%%time
# Fit the vectorizer on the item descriptions and build the document-term
# count matrix in the same call.
tfm = vectorizer.fit_transform(data["item_description"])

In [None]:
# Keep the fitted vectorizer's learned attributes around for inspection.
# Per the scikit-learn docs, `vocabulary_` maps each kept term to its column
# index and `stop_words_` holds the terms that were ignored (e.g. by the
# max_df / min_df cut-offs).
vocab = vectorizer.vocabulary_ 
removed_words = vectorizer.stop_words_

In [None]:
%%time
# Re-weight the raw term counts in `tfm` with TF-IDF. The fitted transformer
# stays available as `tfidf_vectorizer`.
tfidf_vectorizer = TfidfTransformer()
tfidf_transformed = tfidf_vectorizer.fit_transform(tfm)

### Save results for time savings

In [None]:
from scipy.sparse import save_npz
# Persist both sparse matrices so the vectorizing cells need not be re-run.
save_npz(file="tfm.npz", matrix=tfm)
save_npz(file="tfidf_transformed.npz", matrix=tfidf_transformed)

### Load files as necessary for time savings

In [None]:
from scipy.sparse import load_npz

# Restore the cached sparse matrices from disk.
tfm = load_npz(file="tfm.npz")
tfidf_transformed = load_npz(file="tfidf_transformed.npz")

# Pre-cleaned table, tab-separated, with column names on the first row.
cleaned_categorical = pd.read_csv(
    filepath_or_buffer='Data/train_clean.tsv',
    header=0,
    sep='\t',
)

In [None]:
# Display the count matrix dimensions (rows x columns).
tfm.shape

In [None]:
# Display the TF-IDF matrix dimensions (rows x columns).
tfidf_transformed.shape

In [None]:
# Display the categorical table dimensions.
# NOTE(review): its row count presumably has to equal that of `tfm` and
# `tfidf_transformed`, since all three are later split together - confirm.
cleaned_categorical.shape

# Separate the target, drop unused columns, and split into train and test

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Pull the target out before its column is dropped from the feature table.
y_categorical = cleaned_categorical["price"].astype('float', copy=False)

# Remove the identifier, free-text and target columns in place; what is left
# is the feature table.
cleaned_categorical.drop(
    columns=["train_id", "name", "category_name", "item_description", "price"],
    inplace=True,
)

# Store the condition id as text - presumably so that pd.get_dummies one-hot
# encodes it instead of passing it through as a number.
cleaned_categorical["item_condition_id"] = (
    cleaned_categorical["item_condition_id"].astype('str', copy=False)
)

In [None]:
# One-hot encode the feature table.
ohe_cleaned_categorical = pd.get_dummies(cleaned_categorical)

# Split the target, the encoded features, the TF-IDF matrix and the count
# matrix in a single call so all four share the same row partition
# (20% held out for testing, fixed seed for reproducibility).
(
    y_cat_train, y_cat_test,
    X_cat_train, X_cat_test,
    tfidf_train, tfidf_test,
    tfm_train, tfm_test,
) = train_test_split(
    y_categorical,
    ohe_cleaned_categorical,
    tfidf_transformed,
    tfm,
    test_size=0.2,
    random_state=95,
)

# Inspect the encoded features and grid-search a decision tree regressor

In [None]:
# Preview the first rows of the one-hot encoded feature table.
ohe_cleaned_categorical.head()

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV 

# def modelvalidation(n_split, rand_state):
#     estimator

# Only the tree depth is tuned, over two shallow settings, with 2-fold CV.
search_range = {"max_depth": [1, 2]}
folds = 2

# Cross-validated grid search over a decision tree regressor. Training scores
# are recorded alongside validation scores, and all available cores are used.
cv = GridSearchCV(
    estimator=DecisionTreeRegressor(),
    param_grid=search_range,
    cv=folds,
    refit=True,
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)
cv.fit(X_cat_train, y_cat_train)

In [None]:
# Preview the categorical training features. The train/test split binds this
# frame to `X_cat_train`; `X_categorical_train` is not assigned by any visible
# cell, so referencing it raises NameError on a fresh kernel.
X_cat_train.head()