In [1]:
import csv, time
import numpy as np
import pandas as pd
from collections import defaultdict

from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

# Neural Network
import keras
import keras.backend as K
from keras.layers import Dense
from keras.models import Sequential
from keras.callbacks import EarlyStopping

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Record total notebook run time
notebook_start = time.time()

In [3]:
print('Parsing data from TSV files...')
start_time = time.time()
df_train = pd.read_csv('train.tsv', delimiter='\t', index_col=['train_id'])
df_test = pd.read_csv('test.tsv', delimiter='\t', index_col=['test_id'])
print(f'Done. Took {time.time() - start_time:.2f}s')

Parsing data from TSV files...


  mask |= (ar1 == a)


Done. Took 7.89s


In [4]:
# Drop entries where price is zero
initial_size = df_train.shape[0]
df_train = df_train.drop(df_train[(df_train['price'] == 0)].index)
print(f'Dropped {initial_size - df_train.shape[0]} zero-price entries')

Dropped 874 zero-price entries


## X/y split

In [5]:
# We will now split the dataframe into X (features/inputs) and y (target variable).
y = df_train.loc[:, 'price']
y = pd.to_numeric(y)

X = df_train
del X['price']

X.head()

Unnamed: 0_level_0,name,item_condition_id,category_name,brand_name,shipping,item_description
train_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,1,No description yet
1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,0,This keyboard is in great condition and works ...
2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,1,Adorable top with a hint of lace and a key hol...
3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,1,New with tags. Leather horses. Retail for [rm]...
4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,0,Complete with certificate of authenticity


## Exploring data

In [6]:
# Count of unique brands
brand_counts = X.groupby('brand_name').apply(lambda x: x.index.nunique())
print(f'Total brands: {len(brand_counts)}\n')

# Count of category lengths
num_categories_dict = defaultdict(int)
for i, category in enumerate(X['category_name']):
    if type(category) != str:
        num_categories_dict['null'] += 1
        continue
    else:
        categories = category.split('/')
        num_categories = len(categories)
        num_categories_dict[num_categories] += 1
        
for k, v in num_categories_dict.items():
    print(f'Number of entries with {k} categories: {v}')

Total brands: 4807

Number of entries with 3 categories: 1470962
Number of entries with null categories: 6314
Number of entries with 5 categories: 3055
Number of entries with 4 categories: 1330


## Data preparation

### Deal with missing data

In [7]:
# Replace NaN with ''
X['brand_name'] = X['brand_name'].fillna('')

# Replace NaN and 'No description yet' with ''
X['item_description'] = X['item_description'].apply(lambda x: '' if x=='No description yet' or type(x) != str else x)

### Split categories

In [8]:
# Split category into 3 columns (cat1/cat2/cat3 -> separate columns). NaN returns ''
def split_categories(category, level):
    if type(category) != str:
        return ''
    else:
        categories = category.split('/')
        return categories[level]
        
X['category1'] = X['category_name'].apply(lambda x: split_categories(x, 0))
X['category2'] = X['category_name'].apply(lambda x: split_categories(x, 1))
X['category3'] = X['category_name'].apply(lambda x: split_categories(x, 2))

X.head()

Unnamed: 0_level_0,name,item_condition_id,category_name,brand_name,shipping,item_description,category1,category2,category3
train_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,1,,Men,Tops,T-shirts
1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,0,This keyboard is in great condition and works ...,Electronics,Computers & Tablets,Components & Parts
2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,1,Adorable top with a hint of lace and a key hol...,Women,Tops & Blouses,Blouse
3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,1,New with tags. Leather horses. Retail for [rm]...,Home,Home Décor,Home Décor Accents
4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,0,Complete with certificate of authenticity,Women,Jewelry,Necklaces


### Create dummy variables (one-hot encoding)

In [9]:
start_time = time.time()

print('Creating dummy variables and storing as sparse matrices...')
X_brand = csr_matrix(pd.get_dummies(X['brand_name'], sparse=True, prefix='brand'))
X_cat1 = csr_matrix(pd.get_dummies(X['category1'], sparse=True, prefix='cat1'))
X_cat2 = csr_matrix(pd.get_dummies(X['category2'], sparse=True, prefix='cat2'))
X_cat3 = csr_matrix(pd.get_dummies(X['category3'], sparse=True, prefix='cat3'))
X_cond = csr_matrix(pd.get_dummies(X['item_condition_id'], sparse=True, prefix='cond'))
X_ship = csr_matrix(pd.get_dummies(X['shipping'], sparse=True, prefix='ship'))

print(f'Done. Time taken: {time.time() - start_time:.2f}s')

Creating dummy variables and storing as sparse matrices...
Done. Time taken: 160.85s


### Transform name and item_description

In [10]:
cv_name = CountVectorizer(max_features=20000, lowercase=True, token_pattern='\w+', ngram_range=(1,1))
tv_desc = TfidfVectorizer(max_features=20000, lowercase=True, token_pattern='\w+', ngram_range=(1,2))

# Vectorise name
start_time = time.time()
print('Vectorising name...')
X_name = cv_name.fit_transform(X['name'])
print(f'Done. Time taken: {time.time() - start_time:.2f}s\n')

# Vectorise item description
start_time = time.time()
print('Vectorising item description...')
X_desc = tv_desc.fit_transform(X['item_description'])
print(f'Done. Time taken: {time.time() - start_time:.2f}s')

Vectorising name...
Done. Time taken: 7.61s

Vectorising item description...
Done. Time taken: 86.35s


### Combine features and prepare for training

In [11]:
print('Feature dimensions:')
for feature in (X_name,
                X_cond,
                X_ship,
                X_desc,
                X_brand,
                X_cat1,
                X_cat2,
                X_cat3):
    print(feature.shape)

Feature dimensions:
(1481661, 20000)
(1481661, 5)
(1481661, 2)
(1481661, 20000)
(1481661, 4808)
(1481661, 11)
(1481661, 114)
(1481661, 871)


In [12]:
start_time = time.time()
print('Creating overall input data...')

# Create final sparse matrix for X
X_input = hstack((X_name,
                  X_cond,
                  X_ship,
                  X_desc,
                  X_brand,
                  X_cat1,
                  X_cat2,
                  X_cat3)
                ).tocsr()

# y -> numpy array, then log(y+1)
y_input = np.array(y).reshape(-1,1)
y_input = np.log1p(y_input)

print(f'Done. Time taken: {time.time() - start_time:.2f}s')

Creating overall input data...
Done. Time taken: 4.27s


In [13]:
print(X_input.shape)
print(y_input.shape)

(1481661, 45811)
(1481661, 1)


In [14]:
start_time = time.time()
print('Creating train/validation split...')

X_train, X_val, y_train, y_val = train_test_split(X_input, y_input, test_size=0.1)

print(f'Done. Time taken: {time.time() - start_time:.2f}s\n')

print(f'Train shapes\nX: {X_train.shape}\nY: {y_train.shape}\n')
print(f'Validation shapes\nX: {X_val.shape}\nY: {y_val.shape}')

Creating train/validation split...
Done. Time taken: 2.36s

Train shapes
X: (1333494, 45811)
Y: (1333494, 1)

Validation shapes
X: (148167, 45811)
Y: (148167, 1)


## Neural Network

In [15]:
## Keras functions
def r2_metric(y, y_hat):
    RSS = K.sum(K.square(y - y_hat))
    TSS = K.sum(K.square(y - K.mean(y)))
    r2 = 1 - RSS / (TSS + K.epsilon())
    return r2

def RMSLE_metric(y, y_hat):    
    RMSLE = K.sqrt(K.mean(K.square(y_hat - y)))
    return RMSLE

## Numpy functions
def calc_r2(y, y_hat):
    RSS = np.sum((y - y_hat)**2)
    TSS = np.sum((y - np.mean(y))**2)
    r2 = 1 - RSS / TSS
    return r2

def calc_RMSLE(y, y_hat):
    RMSLE = np.sqrt(np.mean((np.log(y_hat + 1) - np.log(y + 1))**2))
    return RMSLE

In [16]:
model_input = keras.Input(shape=(X_input.shape[1],), dtype='float32', sparse=True)
hidden1 = keras.layers.Dense(192, activation='relu')(model_input)
hidden2 = keras.layers.Dense(64, activation='relu')(hidden1)
hidden3 = keras.layers.Dense(64, activation='relu')(hidden2)
model_output = keras.layers.Dense(1)(hidden3)

model = keras.Model(model_input, model_output)
model.compile(loss='mean_squared_error', optimizer=keras.optimizers.adam(lr=3e-3), metrics=[RMSLE_metric])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 45811)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 192)               8795904   
_________________________________________________________________
dense_2 (Dense)              (None, 64)                12352     
_________________________________________________________________
dense_3 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
Total params: 8,812,481
Trainable params: 8,812,481
Non-trainable params: 0
_________________________________________________________________
None


In [17]:
epochs = 2
training_start = time.time() # 0.5036,0.4439 / 0.4071,0.4332

# Epochs done via for loop so we can increase batch size over time
for i in range(1, epochs+1): # verbose ~3.5% slower
    epoch_start = time.time()
    # print(f'Epoch {i}/{epochs}')
    model.fit(X_train, y_train, batch_size=2048*i, epochs=1, validation_data=(X_val, y_val), verbose=1,
              callbacks=[EarlyStopping(patience=1, monitor='val_loss', restore_best_weights=True)])
    print()
    # print(f'Finished. Time taken: {time.time() - epoch_start:.2f}s')
    
print(f'Training finished. Time taken: {time.time() - training_start:.2f}s')

Train on 1333494 samples, validate on 148167 samples
Epoch 1/1

Train on 1333494 samples, validate on 148167 samples
Epoch 1/1

Training finished. Time taken: 90.80s


In [18]:
# Get train/validation predictions
y_hat_train = model.predict(X_train, verbose=1)
y_hat_val = model.predict(X_val, verbose=1)



In [19]:
# Root mean squared logarithmic error
RMSLE_train = calc_RMSLE(np.expm1(y_train), np.expm1(y_hat_train))
RMSLE_val = calc_RMSLE(np.expm1(y_val), np.expm1(y_hat_val))
print(f'RMSLE\nTrain: {RMSLE_train:.4f}\nVal: {RMSLE_val:.4f}\n')
# Best | Train: 0.3272 | Val: 0.4254 | 20k/20k CV/TV

# R squared (fraction of variance explained)
# Non-log
r2_train = calc_r2(np.expm1(y_train), np.expm1(y_hat_train))
r2_val = calc_r2(np.expm1(y_val), np.expm1(y_hat_val))
print(f'r2\nTrain: {r2_train:.4f}\nVal: {r2_val:.4f}\n')

# Log
log_r2_train = calc_r2(y_train, y_hat_train)
log_r2_val = calc_r2(y_val, y_hat_val)
print(f'log r2\nTrain: {log_r2_train:.4f}\nVal: {log_r2_val:.4f}')

RMSLE
Train: 0.3623
Val: 0.4279

r2
Train: 0.6531
Val: 0.5291

log r2
Train: 0.7641
Val: 0.6706


In [20]:
print(f'Total notebook run time: {time.time() - notebook_start:.2f}s')

Total notebook run time: 418.64s


## Bonus stuff

### Ridge regression (for comparison)

In [21]:
from sklearn.linear_model import Ridge

start_time = time.time()
model2 = Ridge(solver='sag', fit_intercept=False)
print('Fitting ridge regression...')
model2.fit(X_train, y_train)
print(f'Done. Time taken: {time.time() - start_time:.2f}s')

Fitting ridge regression...
Done. Time taken: 164.31s


In [22]:
y_hat_train2 = model2.predict(X_train)
y_hat_val2 = model2.predict(X_val)

RMSLE_train2 = calc_RMSLE(np.exp(y_train), np.exp(y_hat_train2))
RMSLE_val2 = calc_RMSLE(np.exp(y_val), np.exp(y_hat_val2))

print(f'RMSLE\nTrain: {RMSLE_train2:.4f}\nVal: {RMSLE_val2:.4f}\n')

r2_train2 = calc_r2(np.exp(y_train), np.exp(y_hat_train2))
r2_val2 = calc_r2(np.exp(y_val), np.exp(y_hat_val2))

print(f'r2\nTrain: {r2_train2:.4f}\nVal: {r2_val2:.4f}\n')

log_r2_train2 = calc_r2(y_train, y_hat_train2)
log_r2_val2 = calc_r2(y_val, y_hat_val2)

print(f'log r2\nTrain: {log_r2_train2:.4f}\nVal: {log_r2_val2:.4f}')

RMSLE
Train: 0.4239
Val: 0.4404

r2
Train: 0.4833
Val: 0.4551

log r2
Train: 0.6424
Val: 0.6137
