In [1]:
# Author: Alastair Hamilton
# Date: May/June 2018
# Title: Model for Home-depot Kaggle Competition

# Imports

In [2]:
## Data Wrangling
import numpy as np
import pandas as pd

In [3]:
## Misc
import os
import re
from pprint import pprint as pp
import time

In [4]:
## NLP
import nltk
from nltk.stem.porter import *
from nltk.stem.snowball import *
from nltk.tokenize import word_tokenize, wordpunct_tokenize, RegexpTokenizer
from nltk.corpus import stopwords

# Config

In [5]:
# # Set path to data
data_path = "../.data/"

In [6]:
# # Processing features
proc_feat = ['search_term', 'product_title', 'product_description', 'attributes']

# Functions

In [7]:
# Tokenise a pandas Series
def tokenise(s, tokeniser, tokenise_fn=False):
    if tokenise_fn:
        return s.apply(tokeniser.tokenize)
    else:
        return s.apply(tokeniser)

In [8]:
def stem(s, stemmer):
    return s.apply(lambda x: tuple(map(stemmer.stem, x)))

In [9]:
# Remove punctuation from a pandas Series
def rmv_punc(s):
    return s.apply(lambda x: tuple(filter(lambda y: not y.is_punct, x)))

In [10]:
# Remove stop words
def rmv_stop(s, stops):
    return s.apply(lambda x: tuple(filter(lambda y: y not in stops, x)))

In [11]:
# Apply function on rows of data frame (2 cols max)
def func_row(df, func):
    return df.apply(lambda row: func(row[0], row[1]), axis=1)

In [12]:
# Find number of words in one document (doc1) that are in another document (doc2)
def common_words_doc(doc1, doc2):
    tot = 0
    for w1 in doc1:
        for w2 in doc2:
            if w2.find(w1) >= 0:
                tot += 1
                break
    return tot

# Get Data

In [170]:
print('Importing data...')

# # Get all zipped files in data path
zips = [f for f in os.listdir(data_path) if re.search(".zip$", f)]

# # Unzip all files and put into dictionary, keyed by file stem
data_dict = {}
for zipped in zips:
    print('- Importing {}...'.format(zipped))
    if zipped == 'attributes.csv.zip':
        encoding = 'utf-8'
    else:
        encoding = 'latin1'
    data_dict[zipped.split('.')[0]] = pd.read_csv(data_path+zipped, compression='zip', encoding=encoding)

# # Set dataframe to piece in data dictionary
train_df = data_dict['train']
prod_desc = data_dict['product_descriptions']
attributes = data_dict['attributes']

# # Clean up
del data_dict

Importing data...
- Importing attributes.csv.zip...
- Importing test.csv.zip...
- Importing train.csv.zip...
- Importing product_descriptions.csv.zip...
- Importing sample_submission.csv.zip...


# Pre-processing

## Cleaning

In [186]:
print("Processing data...")

# # Process attributes data
print("- Handling attributes data...")

# # # Deal with N/As in attributes data
attr = attributes.dropna(how='all')
attr = attributes.dropna(how='all', subset=['value'])

# # # Ensure UID is int
attr.loc[:, 'product_uid'] = attr['product_uid'].apply(lambda x: int(x))

# # # If starts with Bullet followed by 2 digits then a bullet point
# # # If bullet then replace with '*'
def is_match(pattern, string):
    return bool(re.match(pattern, string))
bullet_point_regex_pattern = "Bullet\\d{2}"
attr.loc[:, 'name'] = attr['name'].apply(lambda s: '*' if is_match(bullet_point_regex_pattern, s) else s)

# # # Group name and value in attributes into single column, marking name and value
attr = attr.assign(
    attributes =
        attr.apply(
            lambda row:
                    row['name'] + ' ' + row['value'] if row['name'] == '*'
                    else row['name'] + ':' + row['value'],
            axis=1
        )
)

# # # Drop name and values, groupby UID and sum grouped values, reset index...
# # # ...(ie. all attributes in single cell now)
attr = attr.drop(['name','value'], axis=1).groupby('product_uid').sum().reset_index()

Processing data...
- Handling attributes data...


In [187]:
# # Create master data frame
print("- Creating main dataframe...")

# # # Merge all data into one master dataframe by merging descriptions and attributes onto training data on UID...
# # # ...Fill any NAs with empty string
data = pd.merge(train_df, prod_desc, how='left',
                on='product_uid').drop('id', axis=1).merge(attr, on='product_uid', how='left').fillna('')

# # # Finally create a master index column, which will be used to reference individual search terms
data = data.reset_index(drop=True).reset_index(drop=True)

- Creating main dataframe...


In [188]:
text_cols = ['product_title', 'search_term', 'product_description', 'attributes']

In [189]:
data_proc = data.copy(deep=True)

## Tokenisation
Using the regexp tokeniser in NLTK with ```r'\w+'``` was significantly faster and got rid of punctuation, which was intended.

- Need to add it to ignore the weird a^ character

|Tokeniser|Time taken/200 cells in product title (ms)|
|---|---|
|Regexp|1.28|
|wordpunct|1.57|
|wordtokenise|48.5|

In [190]:
tokenise_anon = lambda x: tokenise(x, RegexpTokenizer(r'\w+'), tokenise_fn=True)
data_proc.loc[:, text_cols] = data_proc.loc[:, text_cols].apply(tokenise_anon, axis=0)

## Remove Stop Words
Caching the stop words corpus was ridiculously faster (ie loading ```stopwords.words('english')``` once).

|Remover|Time taken/200 cells in product title (ms)|
|---|---|
|nltk stop corpus filter (no cache)|435|
|nltk stop corpus filter (cache)|6.71|

In [191]:
stopwords_anon = lambda x: rmv_stop(x, stopwords.words('english'))
data_proc.loc[:, text_cols] = data_proc.loc[:, text_cols].apply(stopwords_anon, axis=0)

## Stemming
Worth seeing if this affects the model further down the line. 
- Choosing snowball as faster and seen it be used for this problem before [9/5/19]

|Stemmer|Time taken/200 cells in product title (ms)|
|---|---|
|Porter|45|
|Snowball|37.9|

In [192]:
stemming_anon = lambda x: stem(x, SnowballStemmer('english'))
data_proc.loc[:, text_cols] = data_proc.loc[:, text_cols].apply(stemming_anon, axis=0)

## Check nans

In [193]:
data_proc.isna().sum()

product_uid            0
product_title          0
search_term            0
relevance              0
product_description    0
attributes             0
dtype: int64

# Feature Engineering

In [194]:
data_proc.head(2)

Unnamed: 0,product_uid,product_title,search_term,relevance,product_description,attributes
0,100001,"(simpson, strong, tie, 12, gaug, angl)","(angl, bracket)",3.0,"(not, angl, make, joint, stronger, also, provi...","(versatil, connector, various, 90, connect, ho..."
1,100001,"(simpson, strong, tie, 12, gaug, angl)","(l, bracket)",2.5,"(not, angl, make, joint, stronger, also, provi...","(versatil, connector, various, 90, connect, ho..."


In [195]:
data_feat = pd.DataFrame(data_proc[['product_uid', 'relevance']])

In [196]:
# # Len of query
data_feat['q_len'] = data_proc['search_term'].apply(len)

In [197]:
# # Get common words between query and returned product title
data_feat['com_title'] = func_row(data_proc[['search_term', 'product_title']], common_words_doc)

In [198]:
# # Get common words between query and returned product description
data_feat['com_desc'] = func_row(data_proc[['search_term', 'product_description']], common_words_doc)

In [199]:
# # Get common words between query and returned attributes
data_feat['com_attr'] = func_row(data_proc[['search_term', 'attributes']], common_words_doc)

In [200]:
data_feat.head(2)

Unnamed: 0,product_uid,relevance,q_len,com_title,com_desc,com_attr
0,100001,3.0,2,1,1,1
1,100001,2.5,2,1,1,1


# Modelling