In [1]:
import csv, string, re, time, nltk, sys
import numpy as np
import pandas as pd
from collections import defaultdict

from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
stopwords = nltk.corpus.stopwords.words('english')

# Neural Network
import keras
import keras.backend as K
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from keras.models import Sequential

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Record total notebook run time
notebook_start = time.time()

In [3]:
print('Parsing data from TSV files...')
start_time = time.time()
df_train = pd.read_csv('train.tsv', delimiter='\t', index_col=['train_id'])
df_test = pd.read_csv('test.tsv', delimiter='\t', index_col=['test_id'])
print(f'Done. Took {time.time()-start_time:.2f}s')

Parsing data from TSV files...


  mask |= (ar1 == a)


Done. Took 8.04s


In [4]:
df_train.head()

Unnamed: 0_level_0,name,item_condition_id,category_name,brand_name,price,shipping,item_description
train_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [5]:
# Drop price
initial_size = df_train.shape[0]
df_train = df_train.drop(df_train[(df_train['price'] == 0)].index)
print(f'Dropped {initial_size - df_train.shape[0]} zero-price entries')

Dropped 874 zero-price entries


## X/y split

In [6]:
# We will now split the dataframe into X (features/inputs) and y (target variable).
y = df_train.loc[:, 'price']
y = pd.to_numeric(y)

X = df_train
del X['price']

In [7]:
X.head()

Unnamed: 0_level_0,name,item_condition_id,category_name,brand_name,shipping,item_description
train_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,1,No description yet
1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,0,This keyboard is in great condition and works ...
2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,1,Adorable top with a hint of lace and a key hol...
3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,1,New with tags. Leather horses. Retail for [rm]...
4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,0,Complete with certificate of authenticity


## Category

In [8]:
# Exploring data
num_categories_dict = defaultdict(int)
for i, category in enumerate(X['category_name']):
    if type(category) != str:
        num_categories_dict['null'] += 1
        continue
    else:
        categories = category.split('/')
        num_categories = len(categories)
        num_categories_dict[num_categories] += 1
        
for k, v in num_categories_dict.items():
    print(f'Number of entries with {k} categories: {v}')

Number of entries with 3 categories: 1470962
Number of entries with null categories: 6314
Number of entries with 5 categories: 3055
Number of entries with 4 categories: 1330


In [9]:
# Split category into 3 columns (cat1/cat2/cat3 -> separate columns)
def split_categories(category, level):
    if type(category) != str:
        return 'null'
    else:
        categories = category.split('/')
        return categories[level]
        
X['category1'] = X['category_name'].apply(lambda x: split_categories(x, 0))
X['category2'] = X['category_name'].apply(lambda x: split_categories(x, 1))
X['category3'] = X['category_name'].apply(lambda x: split_categories(x, 2))

#X.head()

## Brand name

In [10]:
threshold = 5
brand_counts = X.groupby('brand_name').apply(lambda x: x.index.nunique())
low_brand_counts = brand_counts[brand_counts < threshold]
print(f'Total brands: {len(brand_counts)}\nBrands with fewer than {threshold} instances: {len(low_brand_counts)}')

# Convert missing data - and brands with less than 'threshold' instances - to brand "Other"
X['brand_name'] = X['brand_name'].apply(lambda x: 'Other' if (x in low_brand_counts or type(x) != str) else x)

Total brands: 4807
Brands with fewer than 5 instances: 2418


## Dummy variables (one-hot encoding)

In [11]:
start_time = time.time()

print('Creating dummy variables and storing as sparse matrices...')
X_brand = csr_matrix(pd.get_dummies(X['brand_name'], sparse=True, prefix='brand'))
X_cat1 = csr_matrix(pd.get_dummies(X['category1'], sparse=True, prefix='cat1'))
X_cat2 = csr_matrix(pd.get_dummies(X['category2'], sparse=True, prefix='cat2'))
X_cat3 = csr_matrix(pd.get_dummies(X['category3'], sparse=True, prefix='cat3'))
X_cond = csr_matrix(pd.get_dummies(X['item_condition_id'], sparse=True, prefix='cond'))
X_ship = csr_matrix(pd.get_dummies(X['shipping'], sparse=True, prefix='ship'))

print(f'Done. Time taken: {time.time() - start_time:.2f}s')

Creating dummy variables and storing as sparse matrices...
Done. Time taken: 88.15s


## Cleaning name and item description

In [12]:
def format_string(s):
    """
    Takes a string as input and 'cleans' it.
    """
    # Lowercase
    s = s.lower()
    # Remove hyperlinks
    s = re.sub(r'https?:\/\/.*\/\w*', '', s)
    # Replace hyphens with a space
    s = s.replace('-', ' ')
    # Remove punctuation (important that we removed hyperlinks before this stage)
    s = re.sub(r'[^\w\s]', '', s)
    # Remove words with <=2 letters
    s = re.sub(r'\b\w{1,2}\b', '', s)
    # Replace excess whitespace with a single space, then strip the whitespace on the edges of the string
    s = re.sub(r'\s\s+', ' ', s)
    s = s.strip()
    
    # Break sentence into tokens, remove stopwords and stem the string
    s = ' '.join([stemmer.stem(token) for token in s.split(' ') if token not in stopwords])
    return s

In [13]:
stemmer = nltk.stem.porter.PorterStemmer()

start_time = time.time()

print('Cleaning name and item description...')
X['name'] = X['name'].apply(lambda x: format_string(x))
X['item_description'] = X['item_description'].apply(lambda x: 'None' if x=='No description yet' or type(x) != str else format_string(x))

print(f'Done. Time taken: {time.time() - start_time:.2f}s')
X.head()

Cleaning name and item description...
Done. Time taken: 677.94s


Unnamed: 0_level_0,name,item_condition_id,category_name,brand_name,shipping,item_description,category1,category2,category3
train_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,mlb cincinnati red shirt size,3,Men/Tops/T-shirts,Other,1,,Men,Tops,T-shirts
1,razer blackwidow chroma keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,0,keyboard great condit work like came box port ...,Electronics,Computers & Tablets,Components & Parts
2,ava viv blous,1,Women/Tops & Blouses/Blouse,Target,1,ador top hint lace key hole back pale pink als...,Women,Tops & Blouses,Blouse
3,leather hors statu,1,Home/Home Décor/Home Décor Accents,Other,1,new tag leather hors retail stand foot high so...,Home,Home Décor,Home Décor Accents
4,24k gold plate rose,1,Women/Jewelry/Necklaces,Other,0,complet certif authent,Women,Jewelry,Necklaces


## Transform name and item_description

In [14]:
cv = CountVectorizer(max_features=5000, ngram_range=(1,1))
tv = TfidfVectorizer(max_features=5000, ngram_range=(1,2))

# Vectorise name
start_time = time.time()
print('Vectorising name...')
X_name = cv.fit_transform(X['name'])
print(f'Done. Time taken: {time.time() - start_time:.2f}s\n')

# Vectorise item description
start_time = time.time()
print('Vectorising item description...')
X_desc = tv.fit_transform(X['item_description'])
print(f'Done. Time taken: {time.time() - start_time:.2f}s')

Vectorising name...
Done. Time taken: 14.70s

Vectorising item description...
Done. Time taken: 77.06s


In [15]:
print('Feature dimensions:')
for feature in (X_name,
                X_cond,
                X_ship,
                X_desc,
                X_brand,
                X_cat1,
                X_cat2,
                X_cat3):
    print(feature.shape)

Feature dimensions:
(1481661, 5000)
(1481661, 5)
(1481661, 2)
(1481661, 5000)
(1481661, 2390)
(1481661, 11)
(1481661, 114)
(1481661, 871)


In [16]:
start_time = time.time()
print('Creating overall input data...')

# Create final sparse matrix for X
X_input = hstack((X_name,
                  X_cond,
                  X_ship,
                  X_desc,
                  X_brand,
                  X_cat1,
                  X_cat2,
                  X_cat3)
                ).tocsr()

# y -> numpy array, then log(y+1)
y_input = np.array(y).reshape(-1,1)
y_input = np.log1p(y_input)

print(f'Done. Time taken: {time.time() - start_time:.2f}s')

Creating overall input data...
Done. Time taken: 2.80s


In [17]:
print(X_input.shape)
print(y_input.shape)

(1481661, 13393)
(1481661, 1)


In [18]:
start_time = time.time()
print('Creating train/val split...')

X_train, X_val, y_train, y_val = train_test_split(X_input, y_input, test_size=0.1, random_state=8)

print(f'Done. Time taken: {time.time() - start_time:.2f}s\n')

print(f'Train shapes\nX: {X_train.shape}\nY: {y_train.shape}\n')
print(f'Validation shapes\nX: {X_val.shape}\nY: {y_val.shape}')

Creating train/val split...
Done. Time taken: 1.60s

Train shapes
X: (1333494, 13393)
Y: (1333494, 1)

Validation shapes
X: (148167, 13393)
Y: (148167, 1)


## Neural Network

In [19]:
## Keras functions
def r2_metric(y, y_hat):
    RSS = K.sum(K.square(y - y_hat))
    TSS = K.sum(K.square(y - K.mean(y)))
    r2 = 1 - RSS / (TSS + K.epsilon())
    return r2

def RMSLE_metric(y, y_hat):    
    RMSLE = K.sqrt(K.mean(K.square(y_hat - y)))
    return RMSLE

## Numpy functions
def calc_r2(y, y_hat):
    RSS = np.sum((y - y_hat)**2)
    TSS = np.sum((y - np.mean(y))**2)
    r2 = 1 - RSS / TSS
    return r2

def calc_RMSLE(y, y_hat):
    RMSLE = np.sqrt(np.mean((np.log(y_hat + 1) - np.log(y + 1))**2))
    return RMSLE

In [20]:
# Fully connected network
model = Sequential()
model.add(Dense(50, input_dim=X_input.shape[1], activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam', metrics=[r2_metric])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 50)                669700    
_________________________________________________________________
dense_2 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 51        
Total params: 672,301
Trainable params: 672,301
Non-trainable params: 0
_________________________________________________________________
None


In [21]:
history = model.fit(X_train, y_train, batch_size=32, epochs=3, validation_data=(X_val, y_val), # 16,1
                    callbacks=[EarlyStopping(patience=1, monitor='val_loss', restore_best_weights=True)])

Train on 1333494 samples, validate on 148167 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [22]:
# Get train/validation predictions
y_hat_train = model.predict(X_train, verbose=1)
print(y_hat_train.shape)

y_hat_val = model.predict(X_val, verbose=1)
print(y_hat_val.shape)

(1333494, 1)
(148167, 1)


In [23]:
RMSLE_train = calc_RMSLE(np.expm1(y_train), np.expm1(y_hat_train))
RMSLE_val = calc_RMSLE(np.expm1(y_val), np.expm1(y_hat_val))

print(f'RMSLE\nTrain: {RMSLE_train:.4f}\nVal: {RMSLE_val:.4f}\n') # Best: 0.4577 / 0.4701
# 0.3900 / 0.4400 | runtime=1605.62s | 5k feature limit, <5 brands, (1,1)/(1,2) ngrams -> CV categories & BL brands
# 0.3962 / 0.4384 | runtime=1402.01s | 5k feature limit, <5 brands, (1,1)/(1,2) ngrams -> cat/brands as dummies

r2_train = calc_r2(np.expm1(y_train), np.expm1(y_hat_train))
r2_val = calc_r2(np.expm1(y_val), np.expm1(y_hat_val))

print(f'r2\nTrain: {r2_train:.4f}\nVal: {r2_val:.4f}\n')

log_r2_train = calc_r2(y_train, y_hat_train)
log_r2_val = calc_r2(y_val, y_hat_val)

print(f'log r2\nTrain: {log_r2_train:.4f}\nVal: {log_r2_val:.4f}')

RMSLE
Train: 0.3980
Val: 0.4395

r2
Train: 0.5676
Val: 0.5175

log r2
Train: 0.7153
Val: 0.6539


In [24]:
print(f'Total notebook run time: {time.time() - notebook_start:.2f}s')

Total notebook run time: 1877.89s


## Bonus stuff

### Ridge regression (for comparison)

In [25]:
from sklearn.linear_model import Ridge

start_time = time.time()
model2 = Ridge(solver='sag', fit_intercept=False)
print('Fitting ridge regression...')
model2.fit(X_train, y_train)
print(f'Done. Time taken: {time.time() - start_time:.2f}s')

Fitting ridge regression...
Done. Time taken: 120.08s


In [26]:
y_hat_train2 = model2.predict(X_train)
y_hat_val2 = model2.predict(X_val)

RMSLE_train2 = calc_RMSLE(np.exp(y_train), np.exp(y_hat_train2))
RMSLE_val2 = calc_RMSLE(np.exp(y_val), np.exp(y_hat_val2))

print(f'RMSLE\nTrain: {RMSLE_train2:.4f}\nVal: {RMSLE_val2:.4f}\n')

r2_train2 = calc_r2(np.exp(y_train), np.exp(y_hat_train2))
r2_val2 = calc_r2(np.exp(y_val), np.exp(y_hat_val2))

print(f'r2\nTrain: {r2_train2:.4f}\nVal: {r2_val2:.4f}\n')

log_r2_train2 = calc_r2(y_train, y_hat_train2)
log_r2_val2 = calc_r2(y_val, y_hat_val2)

print(f'log r2\nTrain: {log_r2_train2:.4f}\nVal: {log_r2_val2:.4f}')

RMSLE
Train: 0.4561
Val: 0.4610

r2
Train: 0.4138
Val: 0.4174

log r2
Train: 0.5861
Val: 0.5783


### Generate N-grams (manually)

In [27]:
# def generate_ngrams(string, n=2):
#     if type(string) != str:
#         return []
#     # Break sentence into tokens
#     tokens = [token for token in string.split(' ')]
#     if len(tokens) < n:
#         return []
#     return list(nltk.ngrams(tokens, n))

In [28]:
# start_time = time.time()
# print('Creating ngrams...')

# name_ngrams = list(X_train['name'].apply(lambda x: generate_ngrams(x, 2)))
# desc_ngrams = list(X_train['item_description'].apply(lambda x: generate_ngrams(x, 2)))
# for i in name_ngrams:
#     ngrams += i  
# for i in desc_ngrams:
#     ngrams += i  

# print(f'Done. Time taken: {time.time() - start_time:.2f}s')
    
# top_ngrams = pd.Series(ngrams).value_counts().head(200).keys()
# print(list(top_ngrams))

In [29]:
# ngram_counts = pd.Series(ngrams).value_counts()

# # point_one_pct = int(len(X_train) * 0.001)
# # point_one_ngrams = list(ngram_counts[ngram_counts > point_one_pct].keys())
# # print(len(point_one_ngrams))

# one_pct = int(len(X_train) * 0.01)
# one_ngrams = list(ngram_counts[ngram_counts > one_pct].keys())
# print(len(one_ngrams))

In [30]:
# start_time = time.time()
# print('Creating new columns for ngrams...')
# for ngram in point_one_ngrams:
#     col_name = f'desc_{ngram[0]}_{ngram[1]}'
#     print(col_name)
#     #X_train[col_name] =
#     temp1 = X_train['item_description'].apply(lambda x: 1 if ngram in generate_ngrams(x) else 0)
#     temp2 = X_train['name'].apply(lambda x: 1 if ngram in generate_ngrams(x) else 0)
#     #print(temp1 | temp2)
#     break
    
# print(f'Done. Time taken: {time.time() - start_time:.2f}s')