# Imports

In [22]:
import os
import re
import time
import sys

import numpy as np
import pandas as pd

import spacy
from spellchecker import SpellChecker

from multiprocessing import Pool

In [2]:
%matplotlib inline

# Config

In [3]:
data_path = "../data/"

# Main

## Get data

In [4]:
print('Importing data...')

# Get all zipped files in data path
zips = [f for f in os.listdir(data_path) if re.search(".zip$", f)]

# Unzip all files and put into dictionary, keyed by file stem
data_dict = {}
for zipped in zips:
    print('- Importing {}...'.format(zipped))
    data_dict[zipped.split('.')[0]] = pd.read_csv(data_path+zipped, compression='zip', encoding='latin1')
    
# Set dataframe to piece in data dictionary
train_df = data_dict['train']
test_df = data_dict['test']
prod_desc = data_dict['product_descriptions']
attributes = data_dict['attributes']

del data_dict

Importing data...
- Importing attributes.csv.zip...
- Importing train.csv.zip...
- Importing sample_submission.csv.zip...
- Importing test.csv.zip...
- Importing product_descriptions.csv.zip...


## Preprocess

In [5]:
# Deal with N/As in attributes data (drop empty records and fill in name and values with empty string)
attr = attributes.dropna(how='all')
attr[['name','value']] = attr[['name','value']].fillna('')

# Ensure UID is int
attr['product_uid'] = attr['product_uid'].apply(lambda x: int(x))

# If "bullet" in attribute name then asserting name is meaningless - make an empty string
attr['name'] = attr['name'].apply(lambda x: '' if "Bullet" in x else x)

# Group name and value in attributes into single column, separated by a tab and ending in newline (for grouping stage next)
attr['attributes'] = attr['name'] + '\t' + attr['value'] + '\n'

# Drop name and values, groupby UID and sum grouped values, reset index...
# ...(ie. all attributes in single cell now, separated by newlines as set up above)
attr = attr.drop(['name','value'], axis=1).groupby('product_uid').sum().reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/i

In [6]:
# # Create master data frame
print("- Creating master data frame...")

# # # Merge all data into one master dataframe by merging descriptions and attributes onto training data on UID...
# # # ...Fill any NAs with empty string
data = pd.merge(data, prod_desc, how='left', on='product_uid').drop('id', axis=1).merge(attr, on='product_uid', how='left').fillna('')

# # # Finally create a master index column, which will be used to reference individual search terms
data = data.reset_index().drop('index', axis=1).reset_index()

- Creating master data frame...


In [7]:
# # Clean up
del train_df
del prod_desc
del attr
del attributes

In [8]:
data.tail()['relevance'][240759]

''

In [9]:
data['relevance'] = data['relevance'].apply(lambda x: int(np.round(x)) if x != '' else x)

In [10]:
data.head()

Unnamed: 0,index,product_uid,product_title,search_term,relevance,product_description,attributes
0,0,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3,"Not only do angles make joints stronger, they ...",\tVersatile connector for various 90Â° connect...
1,1,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2,"Not only do angles make joints stronger, they ...",\tVersatile connector for various 90Â° connect...
2,2,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3,BEHR Premium Textured DECKOVER is an innovativ...,"Application Method\tBrush,Roller,Spray\nAssemb..."
3,3,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2,Update your bathroom with the Delta Vero Singl...,Bath Faucet Type\tCombo Tub and Shower\nBuilt-...
4,4,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,3,Update your bathroom with the Delta Vero Singl...,Bath Faucet Type\tCombo Tub and Shower\nBuilt-...


## Prepare

In [43]:
spacy.prefer_gpu()
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
spell = SpellChecker(distance=1)

In [44]:
def text_filter(x):
    return (x.is_alpha or x.is_digit) and not x.is_stop

def text_process(string):
    return ' '.join([x.lemma_.lower() for x in filter(text_filter, nlp(string))])

def spell_correction(sentence):
    corrected_sentence = sentence.split()
    misspelled = spell.unknown(corrected_sentence)
    misspelled = list(zip(set(map(lambda word: corrected_sentence.index(word), misspelled)), misspelled))
    for idx, word in misspelled:
        corrected_sentence[idx] = spell.correction(word)
    return ' '.join(corrected_sentence)

In [45]:
process_cols = ['product_title', 'search_term', 'product_description', 'attributes']

In [46]:
def process_row(row):
    return row[1].loc[process_cols].apply(text_process)

def spellcheck(row):
    return row[1].loc[process_cols].apply(spell_correction)

In [47]:
p = Pool(6)

start = time.time()
print('Processing...')
data_proc = pd.DataFrame(p.map(process_row, data.loc[:,:].iterrows()))
delta_time = (time.time() - start) / 60
print('%sm' % round(delta_time, 4))

Processing...
12.4441m


In [48]:
p = Pool(6)

start = time.time()
print('Spellcheck...')

data_proc = pd.DataFrame(p.map(spellcheck, data_proc.loc[:,:].iterrows()))
delta_time = (time.time() - start) / 60
print('%sm' % round(delta_time, 2))

Spellcheck...
10.07m


In [49]:
data_proc.head()

Unnamed: 0,product_title,search_term,product_description,attributes
0,simpson strong tie angle,angle bracket,angle joint strong provide consistent straight...,versatile connector connection home repair pro...
1,simpson strong tie angle,l bracket,angle joint strong provide consistent straight...,versatile connector connection home repair pro...
2,behr premium texture deckover tugboat wood con...,deck,behr premium textured finishfor innovative sol...,application method brush roller spray assemble...
3,delta vero shower faucet trim kit chrome valve...,rain shower head,update bathroom delta vero single handle showe...,bath faucet type combo tub shower built water ...
4,delta vero shower faucet trim kit chrome valve...,shower faucet,update bathroom delta vero single handle showe...,bath faucet type combo tub shower built water ...


## Features

In [50]:
def common_words(series):
    all_words = ' '.join(series).split()
    return len(set(all_words))

In [51]:
start = time.time()
data_proc['q_len'] = data_proc['search_term'].apply(len)
data_proc['com_title'] = data_proc.loc[:, ['search_term', 'product_title']].apply(common_words, axis=1)
data_proc['com_desc'] = data_proc.loc[:, ['search_term', 'product_description']].apply(common_words, axis=1)
data_proc['com_attr'] = data_proc.loc[:, ['search_term', 'attributes']].apply(common_words, axis=1)
processed = data_proc.drop(process_cols, axis=1)
delta_time = (time.time() - start) / 60
print('%sm' % round(delta_time, 2))

0.22m


In [52]:
processed = processed.join(data['relevance'])

In [53]:
processed.head()

Unnamed: 0,q_len,com_title,com_desc,com_attr,relevance
0,13,5,62,47,3
1,9,6,63,48,2
2,4,9,86,117,3
3,16,11,56,84,2
4,13,9,54,83,3


In [54]:
len(processed)

240760

In [58]:
train = processed[processed['relevance'] != '']

In [69]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mu, f1_score, accuracy_score, precision_score

In [135]:
X = train[train.columns[train.columns != 'relevance']]
y = pd.get_dummies(train['relevance'].apply(int))

In [136]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [137]:
mdl = RandomForestClassifier(n_estimators=100, max_depth=20)

In [138]:
mdl.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [139]:
preds = mdl.predict(X_test)

In [142]:
preds

array([[0., 0., 1.],
       [0., 0., 0.],
       [0., 1., 0.],
       ...,
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 0.]])

In [143]:
confusion_matrix(y_test.values.argmax(axis=1), preds.argmax(axis=1))

array([[ 421,  860,  454],
       [2151, 5427, 3772],
       [1766, 4429, 5163]])

In [144]:
f1_score(y_test.values.argmax(axis=1), preds.argmax(axis=1), average='micro')

0.45047661907294523