# MLPS - Mercari Price ML

In [1]:
import pandas as pd 
import numpy as np

### Perform data cleaning

In [2]:
# Load the raw training listings; the first column of the TSV becomes the index.
data = pd.read_csv("Data/train.tsv", delimiter="\t", index_col=0)

# Remove items without a price. Take an explicit copy so the column
# assignments below write to an independent frame instead of a filtered
# selection of the raw data (this is what triggers SettingWithCopyWarning).
data = data[data["price"].notna()].copy()

# Treat both the "No description yet" placeholder and missing descriptions
# as empty text so the vectorizer later receives only strings.
data["item_description"] = (
    data["item_description"]
    .replace("No description yet", "")
    .fillna("")
)

# Break "level1/level2/level3/..." category paths into a list of levels.
# A missing category becomes "" first, which splits into [""].
temp = data["category_name"].fillna('').str.split('/')

data["category_name_1"] = temp.str[0]
# .str[1] yields NaN when a category has fewer than two levels; fill it with
# "" so all three level columns represent "missing" the same way.
data["category_name_2"] = temp.str[1].fillna('')
# Everything past the second level is re-joined, so deeper paths are kept
# intact in the third column ("" when there is no third level).
data["category_name_3"] = temp.str[2:].str.join("/")

  mask |= (ar1 == a)


### Implement Porter stemming in the count vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.stem.porter import *
import string

class StemmerTokenizer(object):
    """Callable tokenizer that cleans a document and Porter-stems each word.

    Calling an instance with a string:
      1. drops every non-ASCII character,
      2. deletes ASCII punctuation and digits (they are removed, not replaced
         by a space, so "brand-new" becomes the single token "brandnew"),
      3. splits the remainder on whitespace,
      4. stems each token with ``self.ps``.

    Stems are memoized per instance: the same word shows up many times across
    a corpus, and looking it up in a dict is much cheaper than re-running the
    stemmer. The returned tokens are the same as without the cache.
    """

    def __init__(self):
        self.ps = PorterStemmer()
        # Translation table whose third argument lists characters to delete.
        self.translator = str.maketrans('', '', string.punctuation + string.digits)
        # token -> stem, filled lazily by _stem().
        self._stem_cache = {}

    def _stem(self, token):
        """Return the stem of ``token``, computing it only on first sight."""
        try:
            return self._stem_cache[token]
        except KeyError:
            stem = self._stem_cache[token] = self.ps.stem(token)
            return stem

    def __call__(self, doc):
        """Tokenize ``doc`` (a str) and return the list of stemmed tokens."""
        cleaned = (doc
                   .encode('ascii', errors='ignore')
                   .decode('ascii')
                   .translate(self.translator))
        return [self._stem(w) for w in cleaned.split()]

# Bag-of-words counts over stemmed tokens.
#   max_df = .5    -> ignore terms that appear in more than half of the documents
#   min_df = .001  -> ignore terms that appear in fewer than 0.1% of the documents
# token_pattern is set to None because the custom tokenizer replaces the
# regex tokenization entirely; leaving the default pattern in place makes
# scikit-learn warn that the parameter is unused.
# NOTE(review): the built-in 'english' stop list is applied to the tokens the
# tokenizer returns, i.e. to stems, so stop words whose stem differs from
# their spelling are presumably not filtered - confirm this is acceptable.
vectorizer = CountVectorizer(lowercase = True,
                             max_df = .5,
                             min_df = .001,
                             tokenizer = StemmerTokenizer(),
                             token_pattern = None,
                             stop_words='english')

### Count vectorize the data

In [62]:
%%time
# Learn the vocabulary from the item descriptions and build the sparse
# document-term count matrix in a single pass. This is the slow step of the
# notebook (about 32 minutes in the recorded run).
item_descriptions = data["item_description"]
tfm = vectorizer.fit_transform(item_descriptions)

Wall time: 32min 16s


In [63]:
# Keep handles on what the fitted vectorizer learned:
#   vocab         - the term -> column-index mapping behind tfm
#   removed_words - the terms the vectorizer recorded as ignored
vocab, removed_words = vectorizer.vocabulary_, vectorizer.stop_words_

Wall time: 1.45 s


In [63]:
%%time
# Re-weight the raw counts with TF-IDF: fit the transformer on the count
# matrix, then apply it to that same matrix.
tfidf_vectorizer = TfidfTransformer().fit(tfm)
tfidf_transformed = tfidf_vectorizer.transform(tfm)

Wall time: 1.45 s


### Save results for time savings

In [64]:
from scipy.sparse import save_npz
# Persist both sparse matrices so later sessions can skip the slow
# vectorization step and load them from disk instead.
for npz_path, matrix in (("tfm.npz", tfm),
                         ("tfidf_transformed.npz", tfidf_transformed)):
    save_npz(npz_path, matrix)

### Load files as necessary for time savings

In [None]:
from scipy.sparse import load_npz

# Restore the cached sparse matrices (count matrix first, TF-IDF second).
tfm, tfidf_transformed = (
    load_npz(npz_path) for npz_path in ("tfm.npz", "tfidf_transformed.npz")
)

# Read the cleaned table; the first line of the file supplies the column names.
cleaned_categorical = pd.read_csv("Data/train_clean.tsv", delimiter="\t", header=0)

In [None]:
# Sanity check: dimensions of the count matrix loaded from tfm.npz.
tfm.shape

In [None]:
# Sanity check: dimensions of the TF-IDF matrix loaded from tfidf_transformed.npz.
tfidf_transformed.shape

In [None]:
# Sanity check: dimensions of the table read from Data/train_clean.tsv.
# NOTE(review): presumably its row count should match the two matrices - confirm.
cleaned_categorical.shape

# Split into train and test

In [None]:
import operator
# Dump the fitted vocabulary to a text file, one term per line, ordered by
# the integer each term maps to in vectorizer.vocabulary_.
with open("vocabulary.txt", "w") as f:
    by_mapped_index = operator.itemgetter(1)
    ordered_entries = sorted(vectorizer.vocabulary_.items(), key=by_mapped_index)
    f.writelines(term + '\n' for term, _ in ordered_entries)

In [4]:
from scipy.sparse import load_npz

# Reload the cached sparse matrices from disk (count matrix first, TF-IDF second).
tfm, tfidf_transformed = (
    load_npz(npz_path) for npz_path in ("tfm.npz", "tfidf_transformed.npz")
)
