In [1]:
# Author: Alastair Hamilton
# Date: May/June 2018
# Title: Model for Home-depot Kaggle Competition

# Model for Home-depot Kaggle Competition

## Imports

In [2]:
## Data Wrangling
import numpy as np
import pandas as pd

In [3]:
## Misc
import os
import re
from pprint import pprint as pp
import time

In [4]:
## NLP
import nltk
from nltk.stem.porter import *
from nltk.stem.snowball import *
from nltk.tokenize import word_tokenize, wordpunct_tokenize, RegexpTokenizer
from nltk.corpus import stopwords

In [124]:
## ML
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tpot import TPOTRegressor
from xgboost import XGBRegressor

## Config

In [8]:
# # Set path to data
data_path = "../data/"

In [9]:
# # Processing features
proc_feat = ['search_term', 'product_title', 'product_description', 'attributes']

## Functions

In [10]:
# Tokenise a pandas Series
def tokenise(s, tokeniser, tokenise_fn=False):
    if tokenise_fn:
        return s.apply(tokeniser.tokenize)
    else:
        return s.apply(tokeniser)

In [11]:
def stem(s, stemmer):
    return s.apply(lambda x: tuple(map(stemmer.stem, x)))

In [12]:
# Remove punctuation from a pandas Series
def rmv_punc(s):
    return s.apply(lambda x: tuple(filter(lambda y: not y.is_punct, x)))

In [13]:
# Remove stop words
def rmv_stop(s, stops):
    return s.apply(lambda x: tuple(filter(lambda y: y not in stops, x)))

In [14]:
# Apply function on rows of data frame (2 cols max)
def func_row(df, func):
    return df.apply(lambda row: func(row[0], row[1]), axis=1)

In [63]:
# Find number of words in one document (doc1) that are in another document (doc2)
def common_words(l1, l2):
    return sum(int(word in l1) for word in l2)

## Imports Data

In [109]:
print('Importing data...')

# # Get all zipped files in data path
zips = [f for f in os.listdir(data_path) if re.search(".zip$", f)]

# # Unzip all files and put into dictionary, keyed by file stem
data_dict = {}
for zipped in zips:
    print('- Importing {}...'.format(zipped))
    data_dict[zipped.split('.')[0]] = pd.read_csv(data_path+zipped, compression='zip', encoding='ISO-8859-1')

# # Set dataframe to piece in data dictionary
train_df = data_dict['train']
test_df = data_dict['test']
prod_desc = data_dict['product_descriptions']
attributes = data_dict['attributes']

# # Clean up
del data_dict

Importing data...
- Importing attributes.csv.zip...
- Importing train.csv.zip...
- Importing sample_submission.csv.zip...
- Importing test.csv.zip...
- Importing product_descriptions.csv.zip...


## Process Data

In [32]:
print("Processing data...")

# # Process attributes data
print("- Handling attributes data...")

# # # Deal with N/As in attributes data (drop empty records and fill in name and values with empty string)
attr = attributes.dropna(how='all')
attr[['name','value']] = attr[['name','value']].fillna('')

# # # Ensure UID is int
attr['product_uid'] = attr['product_uid'].apply(lambda x: int(x))

# # # If "bullet" in attribute name then asserting name is meaningless - make an empty string
attr['name'] = attr['name'].apply(lambda x: '' if "Bullet" in x else x)

# # # Group name and value in attributes into single column, separated by a tab and ending in newline (for grouping stage next)
attr['attributes'] = attr['name'] + '\t' + attr['value'] + '\n'

# # # Drop name and values, groupby UID and sum grouped values, reset index...
# # # ...(ie. all attributes in single cell now, separated by newlines as set up above)
attr = attr.drop(['name','value'], axis=1).groupby('product_uid').sum().reset_index()

Processing data...
- Handling attributes data...


In [33]:
# # Create master data frame
print("- Creating master data frame...")

# # # Append train and test data
data = train_df.append(test_df, sort=False)

# # # Merge all data into one master dataframe by merging descriptions and attributes onto training data on UID...
# # # ...Fill any NAs with empty string
data = pd.merge(data, prod_desc, how='left', on='product_uid')
data = data.drop('id', axis=1).merge(attr, on='product_uid', how='left').fillna('')

# # # Finally create a master index column, which will be used to reference individual search terms
data = data.reset_index(drop=True)

- Creating master data frame...


In [34]:
data.head()

Unnamed: 0,product_uid,product_title,search_term,relevance,product_description,attributes
0,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0,"Not only do angles make joints stronger, they ...",\tVersatile connector for various 90Â° connect...
1,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5,"Not only do angles make joints stronger, they ...",\tVersatile connector for various 90Â° connect...
2,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0,BEHR Premium Textured DECKOVER is an innovativ...,"Application Method\tBrush,Roller,Spray\nAssemb..."
3,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33,Update your bathroom with the Delta Vero Singl...,Bath Faucet Type\tCombo Tub and Shower\nBuilt-...
4,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67,Update your bathroom with the Delta Vero Singl...,Bath Faucet Type\tCombo Tub and Shower\nBuilt-...


In [35]:
# # Clean up
del train_df, test_df
del prod_desc
del attr

## Pre-processing

data has ```74067``` rows.
- Whole pipeline takes ~16s/column (pipeline column which isn't as rich as some of the others) [9/5/19]

In [36]:
text_cols = ['product_title', 'search_term', 'product_description', 'product_description', 'attributes']

In [37]:
data_proc = data.loc[:, text_cols]

### Tokenisation
Using the regexp tokeniser in NLTK with ```r'\w+'``` was significantly faster and got rid of punctuation, which was intended.

- Need to add it to ignore the weird a^ character

|Tokeniser|Time taken/200 cells in product title (ms)|
|---|---|
|Regexp|1.28|
|wordpunct|1.57|
|wordtokenise|48.5|

In [38]:
tokenise_anon = lambda x: tokenise(x, RegexpTokenizer(r'\w+'), tokenise_fn=True)
data_proc = data_proc.apply(tokenise_anon, axis=0)

### Remove Stop Words
Caching the stop words corpus was ridiculously faster (ie loading ```stopwords.words('english')``` once).

|Remover|Time taken/200 cells in product title (ms)|
|---|---|
|nltk stop corpus filter (no cache)|435|
|nltk stop corpus filter (cache)|6.71|

In [22]:
# stopwords_anon = lambda x: rmv_stop(x, stopwords.words('english'))
# data_proc = data_proc.apply(stopwords_anon, axis=0)

### Stemming
Worth seeing if this affects the model further down the line. 
- Choosing snowball as faster and seen it be used for this problem before [9/5/19]

|Stemmer|Time taken/200 cells in product title (ms)|
|---|---|
|Porter|45|
|Snowball|37.9|

In [42]:
stemmer = SnowballStemmer('english')
stemming_anon = lambda x: [stemmer.stem(word) for word in x]
data_proc = data_proc.applymap(stemming_anon)

### Check nans

In [43]:
data_proc.isna().sum()

product_title          0
search_term            0
product_description    0
product_description    0
attributes             0
dtype: int64

## Feature Generation

In [47]:
data_proc['product_uid'] = data['product_uid']

In [48]:
data_proc['relevance'] = data['relevance']

In [49]:
data_proc.head(2)

Unnamed: 0,product_title,search_term,product_description,product_description.1,attributes,product_uid,relevance
0,"[simpson, strong, tie, 12, gaug, angl]","[angl, bracket]","[not, onli, do, angl, make, joint, stronger, t...","[not, onli, do, angl, make, joint, stronger, t...","[versatil, connector, for, various, 90â, conne...",100001,3.0
1,"[simpson, strong, tie, 12, gaug, angl]","[l, bracket]","[not, onli, do, angl, make, joint, stronger, t...","[not, onli, do, angl, make, joint, stronger, t...","[versatil, connector, for, various, 90â, conne...",100001,2.5


In [50]:
data_feat = data_proc.loc[:, ['product_uid', 'relevance']]

In [51]:
# # Len of query
data_feat['q_len'] = data_proc['search_term'].apply(len).astype('int64')

In [72]:
# # Get common words between query and returned product title
com_word_anon = lambda row: common_words(row['search_term'], row['product_title'])
data_feat['com_title'] = data_proc.loc[:, ['search_term', 'product_title']].apply(com_word_anon, axis=1)

In [67]:
# # Get common words between query and returned product description
com_word_anon = lambda row: common_words(row['search_term'], row['product_description'])
data_feat['com_desc'] = data_proc.loc[:, ['search_term', 'product_description']].apply(com_word_anon, axis=1)

In [68]:
# # Get common words between query and returned attributes
com_word_anon = lambda row: common_words(row['search_term'], row['attributes'])
data_feat['com_attr'] = data_proc.loc[:, ['search_term', 'attributes']].apply(com_word_anon, axis=1)

In [74]:
data_feat.head(2)

Unnamed: 0,product_uid,relevance,q_len,com_title,com_desc,com_attr
0,100001,3.0,2,1,0,1
1,100001,2.5,2,0,0,0


## ML Model

### Data

In [76]:
data_ml = data_feat.drop('product_uid', axis=1)

In [89]:
train, test = data_ml[~(data_ml['relevance'] == '')], data_ml[data_ml['relevance'] == ''].drop('relevance', axis=1).reset_index(drop=True)

In [93]:
X, y = train[[x for x in train.columns if x != 'relevance']], train['relevance']

In [96]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

### Models

In [117]:
scorer = lambda true, pred: np.sqrt(mean_squared_error(true, pred))

In [128]:
# tpot = TPOTRegressor(generations=5, population_size=20, verbosity=2)
# tpot.fit(X_train, y_train)
# print(tpot.score(X_test, y_test))
# preds = tpot.predict(X_test)
# scorer(y_test, preds)

0.49118202139283473

In [129]:
mdl = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.5, max_delta_step=0,
       max_depth=2, min_child_weight=13, missing=None, n_estimators=100,
       n_jobs=1, nthread=1, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=0.8500000000000001, verbosity=1)

In [123]:
y_pred = mdl.fit(X,)

0.49118202139283473

### Make Submission

In [113]:
res = pd.DataFrame(y_pred, index=test_df['id'], columns=['relevance'])

In [114]:
res.head()

Unnamed: 0_level_0,relevance
id,Unnamed: 1_level_1
1,2.061226
4,2.061226
5,2.204339
6,2.617579
7,2.327509


In [115]:
res.to_csv('../data/submission.csv')