# Mercari Price Prediction Challenge 

### Import Libraries

In [22]:
import warnings
warnings.filterwarnings('ignore')

import joblib
from scipy.sparse import csr_matrix, hstack
import pandas as pd 
import numpy as np
import string
import re
from sklearn import preprocessing
import keras
import tensorflow as tf

from keras import optimizers, callbacks
from keras.models import Model
from keras.layers import Input, Dropout, Dense

from sklearn.metrics import mean_squared_log_error

### User Defined Input

In [23]:
user_input = [{
    'name' : 'Levis Black Leggings, Womens. Size L.',
    'item_condition_id' : 1,
    'category_name' : 'Women/Athletic Apparel/Pants, Tights, Leggings',
    'brand_name' : 'Levis',
    'shipping' : 1,
    'item_description' : 'Adorable gym wear from a well known brand. In great condition. Size L. Black and stretchable.'
}, 
{
    'name' : 'Coach bag',
    'item_condition_id' : 1,
    'category_name' : 'Vintage & Collectibles/Bags and Purses/Handbag',
    'brand_name' : 'Coach',
    'shipping' : 1,
    'item_description' : 'Brand new coach bag. Bought for [rm] at a Coach outlet.'
}]

input_data = pd.DataFrame.from_dict(user_input)

price = [10, 45]

### Initialize Files

In [24]:
enc_c0 = joblib.load('/kaggle/input/picklefiles/vect-1.pickle')
enc_c1 = joblib.load('/kaggle/input/picklefiles/vect-2.pickle')
enc_c2 = joblib.load('/kaggle/input/picklefiles/vect-3.pickle')

enc_n = joblib.load('/kaggle/input/picklefiles/vect-name.pickle')
enc_t = joblib.load('/kaggle/input/picklefiles/vect-text.pickle')

scaler = joblib.load('/kaggle/input/picklefiles/scaler.pickle') 

### Processing Data

In [25]:
#stopwords without no, not, etc
STOPWORDS = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"]

In [26]:
def handle_missing_values(data):
    
    data.fillna({'name':'missing', 'item_description':'missing'}, inplace=True)
    
    return data

In [27]:
def remove_emoji(sentence):
    
    pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"
                           u"\U0001F300-\U0001F5FF"
                           u"\U0001F680-\U0001F6FF"
                           u"\U0001F1E0-\U0001F1FF"
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    
    return pattern.sub(r'', sentence)

In [28]:
def remove_punctuation(sentence):
    
    regular_punct = list(string.punctuation)
    
    for punc in regular_punct:
        if punc in sentence:
            sentence = sentence.replace(punc, ' ')

    return sentence.strip()

In [29]:
def decontracted(phrase):
    
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    
    return phrase


In [30]:
def process_text(data, cols):
    
    for col in cols:
        
        processed_data = []
        
        for sentence in data[col].values:
            
            sent = decontracted(sentence)
            sent = sentence
            sent = sent.replace('\\r', ' ')
            sent = sent.replace('\\"', ' ')
            sent = sent.replace('\\n', ' ')
            sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
            sent = remove_emoji(sent)
            sent = remove_punctuation(sent)
            sent = ' '.join(e for e in sent.split() if e not in STOPWORDS)
            processed_data.append(sent.lower().strip())
            
        data[col] = processed_data
        
    return data

In [31]:
def process_category(data):
    
    for i in range(3):
        
        def get_part(x):
            
            if type(x) != str:
                return np.nan
        
            parts = x.split('/')
            
            if i >= len(parts):
                return np.nan
            else:
                return parts[i]

        field_name = 'category_' + str(i)
        
        data[field_name] = data['category_name'].apply(get_part)
    
    return data

### Featurerize Data

In [32]:
def get_features(data):
    
    luxury_brands = ["MCM", "MCM Worldwide", "Louis Vuitton", "Burberry", "Burberry London", "Burberry Brit", "HERMES", "Tieks",
                     "Rolex", "Apple", "Gucci", "Valentino", "Valentino Garavani", "RED Valentino", "Cartier", "Christian Louboutin",
                     "Yves Saint Laurent", "Saint Laurent", "YSL Yves Saint Laurent", "Georgio Armani", "Armani Collezioni", "Emporio Armani"]
    
    data['is_luxurious'] = (data['brand_name'].isin(luxury_brands)).astype(np.int8)

    expensive_brands = ["Michael Kors", "Louis Vuitton", "Lululemon", "LuLaRoe", "Kendra Scott", "Tory Burch", "Apple", "Kate Spade",
                  "UGG Australia", "Coach", "Gucci", "Rae Dunn", "Tiffany & Co.", "Rock Revival", "Adidas", "Beats", "Burberry",
                  "Christian Louboutin", "David Yurman", "Ray-Ban", "Chanel"]

    data['is_expensive'] = (data['brand_name'].isin(expensive_brands)).astype(np.int8)
    
    return data

In [33]:
def preprocess(data):

    #handle missing values
    data = handle_missing_values(data)
    
    data = process_category(data)
    
    data = process_text(data, ['name', 'item_description', 'category_name'])
    
    data = get_features(data)
    
    data.fillna({'brand_name': ' ', 'category_0': 'other', 'category_1': 'other', 'category_2': 'other'}, inplace = True)
    
    #concat columns
    data['name'] = data['name'] + ' ' + data['brand_name'] + ' ' + data['category_name'] 
    data['text'] = data['name'] + ' ' + data['item_description']
    
    #drop columns which are not required
    data = data.drop(columns = ['brand_name', 'item_description', 'category_name'], axis = 1)

    return data

In [34]:
#encode features
def get_encodings(data):
    
    category_0 = enc_c0.transform(data['category_0'].values)

    category_1 = enc_c1.transform(data['category_1'].values)

    category_2 = enc_c2.transform(data['category_2'].values)
    
    nums = csr_matrix(pd.get_dummies(data[['shipping', 'item_condition_id', 'is_expensive', 'is_luxurious']], sparse=True).values)

    name = enc_n.transform(data['name'].values)
    
    text = enc_t.transform(data['text'].values)

    data = hstack((category_0, category_1, category_2, nums, name, text)).tocsr().astype('float32')

    return data

In [35]:
def f_regr():

    model_in = Input(shape = (363097,), dtype = 'float32', sparse = True)
    out = Dense(256, activation = 'relu')(model_in)
    out = Dropout(0.2)(out)
    out = Dense(128, activation = 'relu')(out)
    out = Dense(64, activation = 'relu')(out)
    out = Dense(32, activation = 'relu')(out)
    out = Dense(16, activation = 'relu')(out)
    model_out = Dense(1)(out)
    
    model = Model(model_in, model_out)
    
    model.compile(loss = 'mean_squared_error', optimizer = keras.optimizers.Adam(learning_rate=0.001))
    
    return model

In [36]:
def s_regr():
    
    model_in = Input(shape = (363097,), dtype = 'float32', sparse = True)
    out = Dense(1024, activation='relu')(model_in)
    out = Dropout(0.2)(out)
    out = Dense(512, activation='relu')(out)
    out = Dropout(0.2)(out)
    out = Dense(256, activation='relu')(out)
    out = Dense(128, activation='relu')(out)
    out = Dense(64, activation='relu')(out)
    out = Dense(32, activation='relu')(out)
    out = Dense(16, activation='relu')(out)
    model_out = Dense(1)(out)
    
    model = Model(model_in, model_out)
    
    model.compile(loss = 'mean_squared_error', optimizer = keras.optimizers.Adam(learning_rate=0.001))
    
    return model

In [37]:
def final_fun_1(input_data):
    
    print('Processing Data...')
    processed_data = preprocess(input_data)

    print('Generating Encodings...')
    encoded_data = get_encodings(processed_data)

    print('Making Predictions...')
    model = f_regr()
    model.load_weights('/kaggle/input/picklefiles/model-1.h5')
    pred = model.predict(encoded_data)
    pred_nn_1 = np.expm1(scaler.inverse_transform(pred.reshape(-1, 1))[:,0])

    model = s_regr()
    model.load_weights('/kaggle/input/picklefiles/model-2.h5')
    pred = model.predict(encoded_data)
    pred_nn_2 = np.expm1(scaler.inverse_transform(pred.reshape(-1, 1))[:,0])
    
    w = 0.35
    prediction = (w * pred_nn_1) + ((1 - w) * pred_nn_2)
    
    return prediction

### Function 1

#### Returns the predicted price

In [38]:
def main():
    
    prediction = final_fun_1(input_data)
    
    for pred in prediction:
        print('Predicted price is ${:.2f}'.format(pred))

In [39]:
%%time
if __name__ == "__main__":
    main()

Processing Data...
Generating Encodings...
Making Predictions...
Predicted price is $9.79
Predicted price is $38.10
CPU times: user 8.1 s, sys: 12 s, total: 20.1 s
Wall time: 8.01 s


### Function 2

#### Returns the RMSLE score

In [40]:
def final_fun_2(X, y):
    
    return get_rmsle(y, final_fun_1(X))
    
    
def get_rmsle(y_true, y_pred):
    
    return np.sqrt(mean_squared_log_error(y_true, y_pred))


def main():
    
    rmsle = final_fun_2(input_data, price)
    
    print('Computed RMSLE is {:.3f}'.format(rmsle))

In [41]:
%%time
if __name__ == "__main__":
    main()

Processing Data...
Generating Encodings...
Making Predictions...
Computed RMSLE is 0.636
CPU times: user 6.47 s, sys: 7.43 s, total: 13.9 s
Wall time: 6.11 s


In [42]:
####

In [43]:
####

In [None]:
####