In [1]:
# Author: Alastair Hamilton
# Date: May/June 2018
# Title: Model for Home-depot Kaggle Competition

# Model for Home-depot Kaggle Competition

## Imports

In [1]:
## Data Wrangling
import numpy as np
import pandas as pd

In [2]:
## Misc
import os
import re
from pprint import pprint as pp
import time

In [83]:
## NLP
import nltk
from nltk.stem.porter import *
from nltk.stem.snowball import *
from nltk.tokenize import word_tokenize, wordpunct_tokenize, RegexpTokenizer
from nltk.corpus import stopwords

## Config

In [7]:
# # Pandas error display OFF
pd.options.mode.chained_assignment = None

In [8]:
# # Set path to data
data_path = "../data/"

In [10]:
# # Processing features
proc_feat = ['search_term', 'product_title', 'product_description', 'attributes']

## Functions

In [75]:
# Tokenise a pandas Series
def tokenise(s, tokeniser, tokenise_fn=False):
    if tokenise_fn:
        return s.apply(tokeniser.tokenize)
    else:
        return s.apply(tokeniser)

In [49]:
def stem(s, stemmer):
    return s.apply(lambda x: tuple(map(stemmer.stem, x)))

In [36]:
# Remove punctuation from a pandas Series
def rmv_punc(s):
    return s.apply(lambda x: tuple(filter(lambda y: not y.is_punct, x)))

In [80]:
# Remove stop words
def rmv_stop(s, stops):
    return s.apply(lambda x: tuple(filter(lambda y: y not in stops, x)))

In [16]:
# Apply function on rows of data frame (2 cols max)
def func_row(df, func):
    return df.apply(lambda row: func(row[0], row[1]), axis=1)

In [17]:
# Find number of words in one document (doc1) that are in another document (doc2)
def common_words_doc(doc1, doc2):
    tot = 0
    for w1 in doc1:
        for w2 in doc2:
            if w2.find(w1) >= 0:
                tot += 1
                break
    return tot

## Imports Data

In [30]:
print('Importing data...')

# # Get all zipped files in data path
zips = [f for f in os.listdir(data_path) if re.search(".zip$", f)]

# # Unzip all files and put into dictionary, keyed by file stem
data_dict = {}
for zipped in zips:
    print('- Importing {}...'.format(zipped))
    data_dict[zipped.split('.')[0]] = pd.read_csv(data_path+zipped, compression='zip', encoding='latin1')

# # Set dataframe to piece in data dictionary
train_df = data_dict['train']
prod_desc = data_dict['product_descriptions']
attributes = data_dict['attributes']

# # Clean up
del data_dict

Importing data...
- Importing attributes.csv.zip...
- Importing test.csv.zip...
- Importing train.csv.zip...
- Importing product_descriptions.csv.zip...
- Importing sample_submission.csv.zip...


## Process Data

In [31]:
print("Processing data...")

# # Process attributes data
print("- Handling attributes data...")

# # # Deal with N/As in attributes data (drop empty records and fill in name and values with empty string)
attr = attributes.dropna(how='all')
attr[['name','value']] = attr[['name','value']].fillna('')

# # # Ensure UID is int
attr['product_uid'] = attr['product_uid'].apply(lambda x: int(x))

# # # If "bullet" in attribute name then asserting name is meaningless - make an empty string
attr['name'] = attr['name'].apply(lambda x: '' if "Bullet" in x else x)

# # # Group name and value in attributes into single column, separated by a tab and ending in newline (for grouping stage next)
attr['attributes'] = attr['name'] + '\t' + attr['value'] + '\n'

# # # Drop name and values, groupby UID and sum grouped values, reset index...
# # # ...(ie. all attributes in single cell now, separated by newlines as set up above)
attr = attr.drop(['name','value'], axis=1).groupby('product_uid').sum().reset_index()

Processing data...
- Handling attributes data...


In [32]:
# # Create master data frame
print("- Creating master data frame...")

# # # Merge all data into one master dataframe by merging descriptions and attributes onto training data on UID...
# # # ...Fill any NAs with empty string
data = pd.merge(train_df, prod_desc, how='left',
                on='product_uid').drop('id', axis=1).merge(attr, on='product_uid', how='left').fillna('')

# # # Finally create a master index column, which will be used to reference individual search terms
data = data.reset_index(drop=True).reset_index(drop=True)

- Creating master data frame...


In [33]:
# # Clean up
del train_df
del prod_desc
del attr

## Pre-processing

data has ```74067``` rows.
- Whole pipeline takes ~16s/column (pipeline column which isn't as rich as some of the others) [9/5/19]

In [123]:
text_cols = ['product_title', 'search_term', 'product_description', 'product_description', 'attributes']

In [129]:
data_proc = data.copy(deep=True)

### Tokenisation
Using the regexp tokeniser in NLTK with ```r'\w+'``` was significantly faster and got rid of punctuation, which was intended.

- Need to add it to ignore the weird a^ character

|Tokeniser|Time taken/200 cells in product title (ms)|
|---|---|
|Regexp|1.28|
|wordpunct|1.57|
|wordtokenise|48.5|

In [130]:
tokenise_anon = lambda x: tokenise(x, RegexpTokenizer(r'\w+'), tokenise_fn=True)
data_proc.loc[:, text_cols] = data_proc.loc[:, text_cols].apply(tokenise_anon, axis=0)

### Remove Stop Words
Caching the stop words corpus was ridiculously faster (ie loading ```stopwords.words('english')``` once).

|Remover|Time taken/200 cells in product title (ms)|
|---|---|
|nltk stop corpus filter (no cache)|435|
|nltk stop corpus filter (cache)|6.71|

In [131]:
stopwords_anon = lambda x: rmv_stop(x, stopwords.words('english'))
data_proc.loc[:, text_cols] = data_proc.loc[:, text_cols].apply(stopwords_anon, axis=0)

### Stemming
Worth seeing if this affects the model further down the line. 
- Choosing snowball as faster and seen it be used for this problem before [9/5/19]

|Stemmer|Time taken/200 cells in product title (ms)|
|---|---|
|Porter|45|
|Snowball|37.9|

In [132]:
stemming_anon = lambda x: stem(x, SnowballStemmer('english'))
data_proc.loc[:, text_cols] = data_proc.loc[:, text_cols].apply(stemming_anon, axis=0)

## Feature Generation

In [136]:
data_proc.head(2)

Unnamed: 0,product_uid,product_title,search_term,relevance,product_description,attributes
0,100001,"(simpson, strong, tie, 12, gaug, angl)","(angl, bracket)",3.0,"(not, angl, make, joint, stronger, also, provi...","(versatil, connector, various, 90â, connect, h..."
1,100001,"(simpson, strong, tie, 12, gaug, angl)","(l, bracket)",2.5,"(not, angl, make, joint, stronger, also, provi...","(versatil, connector, various, 90â, connect, h..."


In [163]:
data_feat = pd.DataFrame(data_proc['product_uid'])

In [165]:
# # Len of query
data_feat['q_len'] = data_proc['search_term'].apply(len)

In [167]:
# # Get common words between query and returned product title
data_feat['com_title'] = func_row(data_proc[['search_term', 'product_title']], common_words_doc)

In [168]:
# # Get common words between query and returned product description
data_feat['com_desc'] = func_row(data_proc[['search_term', 'product_description']], common_words_doc)

In [169]:
# # Get common words between query and returned attributes
data_feat['com_attr'] = func_row(data_proc[['search_term', 'attributes']], common_words_doc)

In [170]:
data_feat.head(2)

Unnamed: 0,product_uid,q_len,com_title,com_desc,com_attr
0,100001,2.0,1.0,1.0,1.0
1,100001,2.0,1.0,1.0,1.0


In [171]:
# # Write data to file
data_feat.to_csv(data_path+'features.csv')