In [8]:
# import packages here
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from keras import backend as K
import gc
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences

In [9]:
# import data

# Directory that holds the Mercari TSV files. Set the MERCARI_DATA_DIR
# environment variable to run the notebook on another machine; the fallback is
# the location the notebook was originally written against.
DATA_DIR = os.environ.get('MERCARI_DATA_DIR', '/home/bsong/Python_Stuff/Data/Kaggle_Mercari')

train_raw = pd.read_csv(os.path.join(DATA_DIR, 'train.tsv'), sep='\t')
#test_raw = pd.read_csv(os.path.join(DATA_DIR, 'test.tsv'), sep='\t') # too lazy to submit kaggle kernel

In [10]:
# define methods here

def split_cat(text): # reduce category_name into exactly three subcategory labels
    """Split a '/'-delimited category string into three levels.

    Always returns a 3-tuple so the caller can unpack the result into the
    cat1/cat2/cat3 columns:
      * fewer than three levels are padded with "No Label";
      * levels beyond the third are dropped;
      * a non-string value (e.g. NaN for a missing category) yields three
        "No Label" entries.
    """
    placeholder = "No Label"
    try:
        parts = text.split("/")
    except (AttributeError, TypeError):
        # Not a str: pandas represents a missing category as float NaN.
        return (placeholder, placeholder, placeholder)
    parts = parts[:3]
    return tuple(parts) + (placeholder,) * (3 - len(parts))

def handle_missing_inplace(dataset):  # put placeholders in place of missing values (NaN)
    """Replace NaN in the category, brand and description columns of `dataset`.

    Mutates `dataset` and returns None. Each column is filled and assigned back
    rather than calling `dataset[col].fillna(..., inplace=True)`: the latter
    operates on an intermediate Series, which pandas deprecates and which does
    not update the frame when Copy-on-Write is enabled.

    Raises KeyError if any of the expected columns is absent.
    """
    placeholders = {
        'cat1': 'No Label',
        'cat2': 'No Label',
        'cat3': 'No Label',
        'brand_name': 'missing',
        'item_description': 'No description yet',
    }
    for column, placeholder in placeholders.items():
        dataset[column] = dataset[column].fillna(placeholder)
    
def to_categorical(dataset): # store the three category-level columns with pandas 'category' dtype
    """Cast cat1, cat2 and cat3 of `dataset` to the 'category' dtype.

    Mutates `dataset` and returns None; other columns are left untouched.
    """
    for level in ('cat1', 'cat2', 'cat3'):
        dataset[level] = dataset[level].astype('category')
    
def raw_text(dataset):    # turn the free-text columns into integer token sequences
    """Tokenize `item_description` and `name` with one shared vocabulary.

    Adds three columns to `dataset` (mutated in place, returns None):
      * "seq_item_description" - token ids for the lower-cased description
      * "seq_name"             - token ids for the lower-cased name
      * "Raw Text Combined"    - name ids followed by description ids, per row
    """
    desc_lower = dataset.item_description.str.lower()
    name_lower = dataset.name.str.lower()

    # Fit a single tokenizer on both columns stacked together so a word gets
    # the same id whether it appears in a name or in a description.
    corpus = np.hstack([desc_lower, name_lower])
    tok_raw = Tokenizer(
        num_words=20000,  # vocabulary cap passed to the Keras tokenizer
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
        lower=True,
        split=" ",
        char_level=False,  # word-level tokens, not characters
    )
    tok_raw.fit_on_texts(corpus)

    # Replace each text by its list of vocabulary ids.
    dataset["seq_item_description"] = tok_raw.texts_to_sequences(desc_lower)
    dataset["seq_name"] = tok_raw.texts_to_sequences(name_lower)
    # Element-wise list concatenation; not read by any other visible cell.
    dataset["Raw Text Combined"] = dataset.seq_name + dataset.seq_item_description
    
def get_keras_data(dataset): # build the dict of named input arrays that is passed to the Keras model
    """Return a dict mapping model input names to arrays taken from `dataset`.

    The keys match the `name=` of the Input layers created in get_model().
    Token sequences are padded/truncated to fixed lengths (10 for the name,
    75 for the description); the remaining columns are passed through as
    plain arrays.
    """
    X = {
        'name': pad_sequences(dataset.seq_name, maxlen=10),
        'item_desc': pad_sequences(dataset.seq_item_description, maxlen=75),
    }

    # (model input name, source column) for the pass-through features
    passthrough = (
        ('brand_name', 'brand_name'),
        ('cat1', 'cat1'),
        ('cat2', 'cat2'),
        ('cat3', 'cat3'),
        ('item_condition', 'item_condition_id'),
        ('num_vars', 'shipping'),
    )
    for input_name, column in passthrough:
        X[input_name] = np.array(dataset[column])
    return X

def cutting(dataset, num_brands=None, num_categories=None):
    """Collapse infrequent brands and categories into a placeholder label.

    For `brand_name`, the `num_brands` most frequent values are kept and every
    other row is set to 'missing'. For `cat1`/`cat2`/`cat3`, the
    `num_categories` most frequent values are kept and every other row is set
    to 'No Label'. The placeholder itself never counts towards the limit.

    The category columns use 'No Label' because that is the value
    split_cat() / handle_missing_inplace() write for a missing category; the
    previous code used the brand placeholder 'missing' there, so 'No Label'
    was ranked as a real category and rare categories ended up in a second,
    different placeholder.

    Parameters
    ----------
    dataset : DataFrame with brand_name, cat1, cat2, cat3 columns (mutated in place).
    num_brands : int, optional
        Number of brands to keep. Defaults to the module-level NUM_BRANDS.
    num_categories : int, optional
        Number of values to keep per category level. Defaults to the
        module-level NUM_CATEGORIES.
    """
    if num_brands is None:
        num_brands = NUM_BRANDS
    if num_categories is None:
        num_categories = NUM_CATEGORIES

    _keep_most_frequent(dataset, 'brand_name', num_brands, 'missing')
    for level in ('cat1', 'cat2', 'cat3'):
        _keep_most_frequent(dataset, level, num_categories, 'No Label')


def _keep_most_frequent(dataset, column, limit, placeholder):
    """Keep the `limit` most frequent values of `column`; set the rest to `placeholder`."""
    counts = dataset[column].value_counts()  # sorted by descending frequency
    popular = counts[counts.index != placeholder].index[:limit]
    dataset.loc[~dataset[column].isin(popular), column] = placeholder

# Limits read by cutting(): how many distinct values survive before the rest
# are collapsed into a placeholder label.
NUM_CATEGORIES = 1000  # per category level (cat1, cat2, cat3)
NUM_BRANDS = 4000      # brand_name



In [11]:
# Work on a copy: the steps below add and drop columns, and mutating train_raw
# itself would make this cell fail on a second run (category_name already gone).
merge = train_raw.copy()
#submission = test_raw[['test_id']]

# split category_name into three new columns, then drop the original column
merge['cat1'], merge['cat2'], merge['cat3'] = zip(*merge['category_name'].apply(split_cat))
merge = merge.drop('category_name', axis=1)

handle_missing_inplace(merge) # replaces NaN with a string placeholder

cutting(merge) # keeps only the most frequent brands/categories; everything else becomes a placeholder label

raw_text(merge) # adds the seq_name / seq_item_description token-id columns

# Encode each categorical column as integer ids (0..n-1). One encoder per
# column is kept in `label_encoders` so ids can be mapped back with
# inverse_transform; `le` ends up bound to the cat3 encoder, as before.
label_encoders = {}
for column in ('brand_name', 'cat1', 'cat2', 'cat3'):
    le = LabelEncoder()
    merge[column] = le.fit_transform(merge[column])
    label_encoders[column] = le

In [17]:
#EXTRACT DEVELOPMENT TEST
# `nrow_train` and `Y_test` were used but never defined in this notebook (they
# came from a cell that no longer exists), so a fresh kernel failed here.
# The saved output below (985885 + 422523 = 1408408 rows before the 70/30
# split) is consistent with holding out the last 5% of train.tsv as a labelled
# test set - TODO confirm the intended fraction.
TEST_FRACTION = 0.05
nrow_train = int((1 - TEST_FRACTION) * merge.shape[0])

dtest = merge.iloc[nrow_train:, ]
dtrain, dvalid = train_test_split(merge.iloc[:nrow_train, ], random_state=123, train_size=0.7)
print(dtrain.shape)
print(dvalid.shape)


X_train = get_keras_data(dtrain)
X_valid = get_keras_data(dvalid)
X_test = get_keras_data(dtest)

# The network is trained on log1p(price) ...
Y_train = np.log1p(np.array(dtrain.price))
Y_valid = np.log1p(np.array(dvalid.price))
# ... while the hold-out target stays on the price scale, because predictions
# are mapped back with expm1 before being scored by rmsle().
Y_test = np.array(dtest.price)



(985885, 13)
(422523, 13)


In [18]:
# Vocabulary sizes for the Embedding layers. Each must be strictly greater than
# the largest integer id that will be fed to the corresponding layer.

def _largest_token_id(sequences):
    """Largest token id found in a Series of token-id lists (0 if all lists are empty)."""
    return max((max(seq) for seq in sequences if len(seq)), default=0)

# Series.max() on a column of lists returns the lexicographically largest list,
# which need not contain the largest id, so every sequence is scanned instead.
# +1 would be the minimum offset for zero-based ids; the original +2 is kept
# (one spare row in the embedding table).
MAX_TEXT = max(_largest_token_id(merge.seq_name),
               _largest_token_id(merge.seq_item_description)) + 2
# Label-encoded columns hold ids 0..max, hence max + 1 distinct values.
MAX_cat1 = merge.cat1.max() + 1
MAX_cat2 = merge.cat2.max() + 1
MAX_cat3 = merge.cat3.max() + 1
MAX_BRAND = merge.brand_name.max() + 1

In [None]:
#KERAS MODEL DEFINITION
from keras.layers import Input, Dropout, Dense, BatchNormalization, Activation, concatenate, Conv1D, GlobalMaxPooling1D, Embedding, Flatten, BatchNormalization
from keras.models import Model
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping
from keras import backend as K

def get_callbacks(filepath, patience=2):
    """Build the list of Keras training callbacks.

    Returns [EarlyStopping, ModelCheckpoint]:
      * early stopping watches 'val_loss' (lower is better) with the given
        `patience`;
      * the checkpoint writes to `filepath` with save_best_only=True.

    Meant to be passed as `callbacks=` to model.fit(); none of the visible
    cells calls it.
    """
    early_stop = EarlyStopping(monitor='val_loss', patience=patience, mode="min")
    checkpoint = ModelCheckpoint(filepath, save_best_only=True)
    return [early_stop, checkpoint]

def rmsle_cust(y_true, y_pred):
    """Keras metric: root mean squared difference of clipped logs.

    Each tensor is clipped from below at K.epsilon(), offset by 1e-5 and
    logged; the result is sqrt(mean((log_pred - log_true)^2)) over the last
    axis.

    NOTE(review): this is not the textbook RMSLE, which uses log(x + 1). In
    this notebook the targets given to fit() are already log1p(price), so the
    metric takes a log of log-scale values - confirm that is intended before
    comparing it with the numpy rmsle() score.
    """
    def _clipped_log(tensor):
        # clip keeps the argument of log strictly positive
        return K.log(K.clip(tensor, K.epsilon(), None) + 1e-5)

    log_gap = _clipped_log(y_pred) - _clipped_log(y_true)
    return K.sqrt(K.mean(K.square(log_gap), axis=-1))

def get_model():
    """Build and compile the price-regression network.

    Architecture, as wired below:
      * `name` and `item_desc` token sequences -> Embedding -> Conv1D ->
        GlobalMaxPooling1D
      * `brand_name`, `cat1`, `cat2`, `cat3` ids -> Embedding -> Flatten
      * `item_condition` and `num_vars` are fed in as raw single values
      * everything is concatenated and passed through three Dense+Dropout
        blocks (256, 128, 64 units) to a single linear output.

    Reads module-level state: X_train (for the padded text lengths) and
    MAX_TEXT / MAX_BRAND / MAX_cat1 / MAX_cat2 / MAX_cat3 (embedding sizes).

    Returns a Model compiled with MSE loss, the Adam optimizer, and the
    metrics MAE and rmsle_cust.
    """
    #params
    dr_r = 0.5  # dropout rate applied after each hidden Dense layer

    #Inputs: one per key of the dict built by get_keras_data (names must match those keys)
    name = Input(shape=[X_train["name"].shape[1]], name="name") # length = padded name sequence length
    item_desc = Input(shape=[X_train["item_desc"].shape[1]], name="item_desc")
    brand_name = Input(shape=[1], name="brand_name") # a single integer id per row, so shape = [1]
    cat1 = Input(shape=[1], name="cat1") 
    cat2 = Input(shape=[1], name="cat2")
    cat3 = Input(shape=[1], name="cat3")
    item_condition = Input(shape=[1], name="item_condition")
    num_vars = Input(shape=[1], name="num_vars")

    #Embedding layers: map integer ids to trainable dense vectors.
    #name and item_desc each get their own table (two separate Embedding instances).
    emb_name = Embedding(MAX_TEXT, 10)(name) # Embedding(input_dim, output_dim)
    emb_item_desc = Embedding(MAX_TEXT, 10)(item_desc)
    emb_brand_name = Embedding(MAX_BRAND, 50)(brand_name)
    emb_cat1 = Embedding(MAX_cat1, 10)(cat1)
    emb_cat2 = Embedding(MAX_cat2, 20)(cat2)
    emb_cat3 = Embedding(MAX_cat3, 30)(cat3)

    #1D convolutions over the embedded text (no recurrent layer is used here)
    cnn_layer1 = Conv1D(filters=16, kernel_size=3, activation='relu') (emb_item_desc) # text cnn for 'item_description'
    cnn_layer2 = Conv1D(filters=8, kernel_size=3, activation='relu')(emb_name) # text cnn for 'name'

    # max over the sequence axis: one value per filter
    cnn_layer1 = GlobalMaxPooling1D()(cnn_layer1) 
    cnn_layer2 = GlobalMaxPooling1D()(cnn_layer2)

    #main layer: concatenate all feature branches into one vector per row
    main_l = concatenate([
        Flatten() (emb_brand_name) # a length-1 id embeds to (batch, 1, dim); Flatten drops the length axis -> (batch, dim)
        , Flatten() (emb_cat1)     # https://stackoverflow.com/questions/43237124/role-of-flatten-in-keras
        , Flatten() (emb_cat2)
        , Flatten() (emb_cat3)
        , cnn_layer1
        , cnn_layer2
        , num_vars
        , item_condition
    ])

    # three fully connected blocks, each followed by dropout
    main_l = Dropout(dr_r) (Dense(256, activation="relu") (main_l))
    main_l = Dropout(dr_r) (Dense(128, activation="relu") (main_l))
    main_l = Dropout(dr_r) (Dense(64, activation="relu") (main_l))


    #output: single unbounded regression value
    output = Dense(1, activation="linear") (main_l)

    #model
    model = Model([name, item_desc, brand_name, cat1, cat2, cat3, item_condition, num_vars], output)

    model.compile(loss="mse", optimizer="adam", metrics=["mae", rmsle_cust])

    return model

    
# Build the network once here to print its layer table; the training cell
# calls get_model() again and therefore starts from a fresh model.
model = get_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
item_desc (InputLayer)          (None, 75)           0                                            
__________________________________________________________________________________________________
name (InputLayer)               (None, 10)           0                                            
__________________________________________________________________________________________________
brand_name (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
cat1 (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
cat2 (Inpu

In [None]:
# FITTING THE MODEL (keras method)
BATCH_SIZE = 20000
epochs = 25

# Rebuild so training always starts from freshly initialised weights,
# then fit on the log1p targets while tracking the validation split.
model = get_model()
model.fit(
    x=X_train,
    y=Y_train,
    batch_size=BATCH_SIZE,
    epochs=epochs,
    verbose=1,
    validation_data=(X_valid, Y_valid),
)



Train on 985885 samples, validate on 422523 samples
Epoch 1/25


In [None]:
# The network outputs log1p(price); expm1 maps the predictions back to the
# price scale and squeeze removes the singleton output axis.
preds = np.squeeze(np.expm1(model.predict(X_test, batch_size=BATCH_SIZE)))

#correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))    
#accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))


In [None]:
# sanity check: predictions and targets should have the same shape before scoring
print(*(arr.shape for arr in (preds, Y_test)))

In [None]:
def rmsle(h, y):
    """Root mean squared logarithmic error between predictions `h` and targets `y`.

    Computes sqrt(mean((log(1 + h) - log(1 + y))^2)).

    Both inputs may be scalars or any array-like (list, ndarray, Series) and
    are flattened before comparison. This prevents a column vector (n, 1)
    paired with a flat vector (n,) from silently broadcasting into an n x n
    matrix, which yields a wrong score and, for large n, a MemoryError.

    Raises
    ------
    ValueError
        If the inputs hold a different number of elements and neither is a
        single value (a single value is still broadcast against the other).
    """
    h = np.asarray(h, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()
    if h.size != y.size and 1 not in (h.size, y.size):
        raise ValueError(
            "rmsle: size mismatch between predictions (%d) and targets (%d)"
            % (h.size, y.size)
        )
    # log1p(x) == log(1 + x); the +1 keeps zero values finite
    sq_logs = np.square(np.log1p(h) - np.log1p(y))
    return np.sqrt(np.mean(sq_logs))

In [None]:
# hold-out score: price-scale predictions against the hold-out targets
rmsle_score = rmsle(h=preds, y=Y_test)
print(rmsle_score)