In [1]:
import csv, time
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

# Neural Network
import keras
import keras.backend as K
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.callbacks import EarlyStopping

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Wall-clock timestamp taken at the top of the notebook; the last cell subtracts
# it from the current time to report the total notebook run time.
notebook_start = time.time()

In [3]:
print('Parsing data from TSV files...')
start_time = time.time()

# Both files are tab-separated; each one's id column becomes the frame index.
tsv_options = {'delimiter': '\t'}
train_df = pd.read_csv('train.tsv', index_col=['train_id'], **tsv_options)
X_test_df = pd.read_csv('test.tsv', index_col=['test_id'], **tsv_options)

elapsed = time.time() - start_time
print(f'Done. Took {elapsed:.2f}s')

Parsing data from TSV files...


  mask |= (ar1 == a)


Done. Took 8.37s


## Data preparation

For data exploration, refer to the file "Data Exploration.ipynb"

In [4]:
# Remove training rows whose price is exactly zero, and report how many were removed
initial_size = len(train_df)
zero_price_ids = train_df.index[train_df['price'] == 0]
train_df = train_df.drop(zero_price_ids)
num_dropped = initial_size - len(train_df)
print(f'Dropped {num_dropped} zero-price entries (train)')

Dropped 874 zero-price entries (train)


In [5]:
# X/y split: split the training df into X (features/inputs) and y (target variable, price)
y_train_df = pd.to_numeric(train_df.loc[:, 'price'])

# Build X as a *new* frame without the target column. Binding X_train_df to
# train_df and deleting the column would also strip 'price' from train_df
# (both names would refer to the same object), so re-running this cell would
# fail with a KeyError on 'price'.
X_train_df = train_df.drop('price', axis=1)

In [6]:
# Stack the training rows on top of the test rows (row-wise concatenation),
# so the feature transforms below see both sets at once.
frames = [X_train_df, X_test_df]
X_df = pd.concat(frames, axis=0)

X_df.head()

Unnamed: 0,name,item_condition_id,category_name,brand_name,shipping,item_description
0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,1,No description yet
1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,0,This keyboard is in great condition and works ...
2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,1,Adorable top with a hint of lace and a key hol...
3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,1,New with tags. Leather horses. Retail for [rm]...
4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,0,Complete with certificate of authenticity


### Deal with missing data

In [7]:
# Missing brands get the placeholder 'unknown' (not an empty string)
X_df['brand_name'] = X_df['brand_name'].fillna('unknown')

def _clean_description(text):
    """Return '' for non-string values (e.g. NaN) and for the placeholder
    'No description yet'; any other description is returned unchanged."""
    if type(text) != str or text == 'No description yet':
        return ''
    return text

# Replace NaN and 'No description yet' with ''
X_df['item_description'] = X_df['item_description'].apply(_clean_description)

In [8]:
# Recover missing brands: for every row whose brand is 'unknown', try to guess the
# brand from the item name and category. start_time is used to time this whole cell.
start_time = time.time()

def category_set(categories):
    """Collapse an iterable of category values into a set of its distinct values.

    Used as the groupby aggregation that collects the categories of each brand.
    """
    return {category for category in categories}

def recover_brands(name, category):
    """Guess the brand of a listing from its name and category.

    Walks the module-level ``brands_descending`` list in order and returns the
    first brand that is a (case-sensitive) substring of ``name`` and that is
    known to appear in ``category`` according to ``brand_categories``.

    Parameters
    ----------
    name : str
        Item name to search for a brand. Any non-string value (e.g. a NaN
        coming from a missing name) yields 'unknown' instead of raising.
    category : object
        Category value of the item; it must be a member of the brand's
        category set for the brand to be accepted.

    Returns
    -------
    str
        The recovered brand, or 'unknown' when no brand matches.
    """
    # `brand in name` raises TypeError on floats/None, so bail out early.
    if not isinstance(name, str):
        return 'unknown'
    for brand in brands_descending:
        if brand in name and category in brand_categories[brand]:
            return brand
    return 'unknown'

# Map every known brand to the set of category strings it appears in
known_brand_rows = X_df[X_df['brand_name'] != 'unknown']
brand_category_pairs = known_brand_rows[['brand_name', 'category_name']].astype(str)
categories_per_brand = brand_category_pairs.groupby('brand_name').agg(category_set)
brand_categories = dict(categories_per_brand.reset_index().values.tolist())

# Candidate brands of at least 3 characters, longest first, so that longer
# brand names take priority over shorter ones when recovering
brands_descending = sorted((brand for brand in brand_categories if len(brand) >= 3),
                           key=len, reverse=True)

print('Recovering brands...')
# Rows with an unknown brand, reduced to their 'name' and 'category_name'
# columns and converted to a numpy array
unknown_brand_mask = X_df['brand_name'] == 'unknown'
X_unknown_brand = np.array(X_df.loc[unknown_brand_mask, ['name', 'category_name']])

# Guess a brand for each of those rows and write the guesses back into the dataframe
recovered_brands = [recover_brands(name, category) for name, category in X_unknown_brand]
X_df.loc[unknown_brand_mask, 'brand_name'] = recovered_brands

# Recovered = unknown before minus unknown after
num_still_unknown = len(X_df[X_df['brand_name'] == 'unknown'])
num_recovered = len(X_unknown_brand) - num_still_unknown
print(f'Done. Time taken: {time.time() - start_time:.2f}s')
print(f'Recovered {num_recovered} brands out of {len(X_unknown_brand)} missing brands')

Recovering brands...
Done. Time taken: 257.68s
Recovered 133056 brands out of 927861 missing brands


### Split categories

In [9]:
def split_categories(category_name):
    """Split a '/'-separated category string into exactly three levels.

    Parameters
    ----------
    category_name : str or any
        Category path such as 'Men/Tops/T-shirts'. Non-string values (e.g. the
        NaN pandas uses for a missing category) are treated as missing.

    Returns
    -------
    list of str or tuple of str
        For a string: a list holding the first three levels; extra levels are
        dropped and missing levels are padded with ''. For a non-string input:
        the tuple ('', '', ''). (The list/tuple difference mirrors the original
        behaviour; both always have length 3.)
    """
    # Explicit type check instead of a bare `except`, which would also hide
    # unrelated errors (including KeyboardInterrupt).
    if not isinstance(category_name, str):
        return '', '', ''
    categories = category_name.split('/')[:3]
    # Pad short paths so every row yields three values; otherwise transposing
    # the results with zip(*...) is truncated to the shortest row.
    categories += [''] * (3 - len(categories))
    return categories
    
start_time = time.time()
print('Splitting categories into subcategories...')

# One entry per row holding that row's category levels; transposing them
# yields one sequence per level, each stored in its own new column.
category_levels = X_df['category_name'].apply(split_categories)
level1, level2, level3 = zip(*category_levels)
X_df['category1'] = level1
X_df['category2'] = level2
X_df['category3'] = level3

print(f'Done. Time taken: {time.time() - start_time:.2f}s')

Splitting categories into subcategories...
Done. Time taken: 6.45s


### Create dummy variables (one-hot encoding)

In [10]:
start_time = time.time()

def _one_hot_csr(column, prefix):
    """One-hot encode X_df[column] (dummy columns named with `prefix`) and
    return the result as a scipy CSR matrix."""
    return csr_matrix(pd.get_dummies(X_df[column], sparse=True, prefix=prefix))

print('Creating dummy variables and storing as sparse matrices...')
X_brand = _one_hot_csr('brand_name', 'brand')
X_cat1 = _one_hot_csr('category1', 'cat1')
X_cat2 = _one_hot_csr('category2', 'cat2')
X_cat3 = _one_hot_csr('category3', 'cat3')
X_cond = _one_hot_csr('item_condition_id', 'cond')
X_ship = _one_hot_csr('shipping', 'ship')

print(f'Done. Time taken: {time.time() - start_time:.2f}s')

Creating dummy variables and storing as sparse matrices...
Done. Time taken: 292.44s


### Transform name and item_description

In [11]:
# Name: raw counts of single words. Description: TF-IDF over words and word pairs.
# Both keep at most 20000 features and tokenise on runs of word characters.
# The pattern is a raw string: in a normal string literal '\w' is an invalid
# escape sequence, which Python warns about.
cv_name = CountVectorizer(max_features=20000, lowercase=True, token_pattern=r'\w+', ngram_range=(1,1))
tv_desc = TfidfVectorizer(max_features=20000, lowercase=True, token_pattern=r'\w+', ngram_range=(1,2))

# Vectorise name
start_time = time.time()
print('Vectorising name...')
X_name = cv_name.fit_transform(X_df['name'])
print(f'Done. Time taken: {time.time() - start_time:.2f}s\n')

# Vectorise item description
start_time = time.time()
print('Vectorising item description...')
X_desc = tv_desc.fit_transform(X_df['item_description'])
print(f'Done. Time taken: {time.time() - start_time:.2f}s')

Vectorising name...
Done. Time taken: 12.04s

Vectorising item description...
Done. Time taken: 132.08s


### Combine features and prepare for training

In [12]:
# Show the (rows, columns) shape of every feature block before they are combined
print('Feature dimensions:')
feature_blocks = (X_name, X_cond, X_ship, X_desc, X_brand, X_cat1, X_cat2, X_cat3)
for feature in feature_blocks:
    print(feature.shape)

Feature dimensions:
(2175020, 20000)
(2175020, 5)
(2175020, 2)
(2175020, 20000)
(2175020, 5288)
(2175020, 11)
(2175020, 114)
(2175020, 883)


In [13]:
start_time = time.time()
print('Creating overall input data...')

# Place all feature blocks side by side in one sparse CSR matrix
feature_blocks = (X_name, X_cond, X_ship, X_desc, X_brand, X_cat1, X_cat2, X_cat3)
X_csr = hstack(feature_blocks).tocsr()

# The first len(X_train_df) rows are the training rows, the rest are the test rows
split = len(X_train_df)
X_input, X_test = X_csr[:split], X_csr[split:]

# Target as a column vector, transformed to log(y+1)
y_input = np.log1p(np.array(y_train_df).reshape(-1,1))

print(f'Done. Time taken: {time.time() - start_time:.2f}s')

Creating overall input data...
Done. Time taken: 8.40s


In [14]:
# Dimensions of our model inputs
print(X_input.shape)
print(y_input.shape)

(1481661, 46303)
(1481661, 1)


In [15]:
start_time = time.time()

print('Creating train/validation split...')
# Hold out 10% of the training rows for validation. The seed is fixed so the
# split, and therefore the reported validation metrics, is the same on every
# Restart & Run All.
X_train, X_val, y_train, y_val = train_test_split(X_input, y_input, test_size=0.1, random_state=42)
print(f'Done. Time taken: {time.time() - start_time:.2f}s\n')

print(f'Train shapes\nX: {X_train.shape}\nY: {y_train.shape}\n')
print(f'Validation shapes\nX: {X_val.shape}\nY: {y_val.shape}')

Creating train/validation split...
Done. Time taken: 2.52s

Train shapes
X: (1333494, 46303)
Y: (1333494, 1)

Validation shapes
X: (148167, 46303)
Y: (148167, 1)


## Fully-connected Neural Network

In [16]:
## Keras functions
def r2_metric(y, y_hat):
    """Coefficient of determination (R squared) written with Keras backend ops.

    Computes 1 - RSS / (TSS + epsilon), where RSS is the sum of squared
    differences between `y` and `y_hat`, TSS is the sum of squared deviations
    of `y` from its mean, and epsilon is the backend's K.epsilon(), added to
    keep the denominator away from zero.
    """
    residual_ss = K.sum(K.square(y - y_hat))
    total_ss = K.sum(K.square(y - K.mean(y)))
    return 1 - residual_ss / (total_ss + K.epsilon())

def RMSLE_metric(y, y_hat):
    """Root of the mean squared difference between `y_hat` and `y`, written
    with Keras backend ops.

    No logarithm is taken here: the value is a root mean squared *logarithmic*
    error only if the inputs are already on a log scale.
    """
    squared_error = K.square(y_hat - y)
    return K.sqrt(K.mean(squared_error))

## Numpy functions
def calc_r2(y, y_hat):
    """Coefficient of determination (R squared) of predictions `y_hat` against
    targets `y`, computed with numpy as 1 - RSS / TSS."""
    residual_ss = np.sum(np.square(y - y_hat))
    total_ss = np.sum(np.square(y - np.mean(y)))
    return 1 - residual_ss / total_ss

def calc_RMSLE(y, y_hat):
    """Root mean squared logarithmic error between targets and predictions.

    Parameters
    ----------
    y, y_hat : array-like
        True and predicted values on the original (non-log) scale. Anything
        numpy can convert to an array is accepted, including plain lists.

    Returns
    -------
    float
        sqrt(mean((log(1 + y_hat) - log(1 + y)) ** 2)).
    """
    y = np.asarray(y)
    y_hat = np.asarray(y_hat)
    # log1p(x) keeps precision for tiny x, where log(x + 1) rounds to 0; it is
    # also the exact inverse of the expm1 applied to the predictions.
    log_error = np.log1p(y_hat) - np.log1p(y)
    return np.sqrt(np.mean(log_error ** 2))

In [17]:
# Fully-connected regression network on the sparse feature matrix:
# 192 -> 64 -> 64 ReLU units, each followed by 30% dropout, then one linear output.
model_input = keras.Input(shape=(X_input.shape[1],), dtype='float32', sparse=True)

hidden1 = Dropout(0.3)(Dense(192, activation='relu')(model_input))
hidden2 = Dropout(0.3)(Dense(64, activation='relu')(hidden1))
hidden3 = Dropout(0.3)(Dense(64, activation='relu')(hidden2))
model_output = Dense(1)(hidden3)

model = keras.Model(model_input, model_output)
# Use the Adam class rather than the lowercase `adam` alias.
model.compile(loss='mean_squared_error', optimizer=keras.optimizers.Adam(lr=3e-3), metrics=[RMSLE_metric])
# summary() prints the table itself and returns None, so wrapping it in
# print() would append a stray "None" line.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 46303)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 192)               8890368   
_________________________________________________________________
dropout_1 (Dropout)          (None, 192)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                12352     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
__________

In [18]:
training_start = time.time()
# NOTE(review): the original note here read "brand recovery - 0.4196" —
# presumably a validation score from a run with brand recovery; confirm.

# Stop once val_loss has not improved for one epoch and roll the model back
# to the best weights seen.
early_stopping = EarlyStopping(patience=1, monitor='val_loss', restore_best_weights=True)
history = model.fit(X_train, y_train,
                    batch_size=2048,
                    epochs=10,
                    validation_data=(X_val, y_val),
                    verbose=1,
                    callbacks=[early_stopping])

training_elapsed = time.time() - training_start
print(f'Training finished. Time taken: {training_elapsed:.2f}s')

Train on 1333494 samples, validate on 148167 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Training finished. Time taken: 462.79s


In [19]:
# Model predictions for the training set, then for the validation set
y_hat_train, y_hat_val = (model.predict(features, verbose=1)
                          for features in (X_train, X_val))



In [20]:
# Apply expm1 (the inverse of the log(y+1) target transform) once per array
price_train, price_hat_train = np.expm1(y_train), np.expm1(y_hat_train)
price_val, price_hat_val = np.expm1(y_val), np.expm1(y_hat_val)

# Root mean squared logarithmic error
RMSLE_train = calc_RMSLE(price_train, price_hat_train)
RMSLE_val = calc_RMSLE(price_val, price_hat_val)
print(f'RMSLE\nTrain: {RMSLE_train:.4f}\nVal: {RMSLE_val:.4f}\n')

# R squared (fraction of variance explained)
r2_train = calc_r2(price_train, price_hat_train)
r2_val = calc_r2(price_val, price_hat_val)
print(f'r2\nTrain: {r2_train:.4f}\nVal: {r2_val:.4f}\n')

RMSLE
Train: 0.3233
Val: 0.4211

r2
Train: 0.6774
Val: 0.5426



## Get predictions for test data

In [21]:
# Predictions for the test rows. The model was fitted on log(y+1) targets, so
# these values are on that log scale until expm1 is applied to them.
y_hat_test = model.predict(X_test, verbose=1)



In [22]:
start_time = time.time()
print('Creating submission file...')

# Submission: one row per test item, with the prediction mapped back from
# log(y+1) to a price via expm1.
# - test_id comes from the index of X_test_df (the ids read from test.tsv)
#   rather than a fabricated 0..n-1 range, so the ids are right even if the
#   file's ids are not a plain running count.
# - predict() returns an (n, 1) array; ravel() flattens it to the 1-D shape a
#   dataframe column expects.
submission = pd.DataFrame({'test_id': X_test_df.index,
                           'price': np.expm1(y_hat_test).ravel()},
                          columns=['test_id', 'price'])
submission.to_csv('submission.csv', index=False)

print(f'Done. Time taken: {time.time() - start_time:.2f}s')

Creating submission file...
Done. Time taken: 1.84s


In [23]:
# Seconds elapsed since notebook_start was recorded at the top of the notebook
total_runtime = time.time() - notebook_start
print(f'Total notebook run time: {total_runtime:.2f}s')

Total notebook run time: 1268.27s
