# Imports

In [1]:
##############
# basic libs #
##############

from subprocess import call
from tqdm import *
from time import time
import warnings
warnings.filterwarnings('ignore')
import os, sys, time, datetime, json, string, glob, re, random
random.seed(2016)

###########
# science #
###########

import scipy as sp
import numpy as np
import pandas as pd

#######
# ML #
######

import theano as thno
import theano.tensor as T
from sklearn import preprocessing
from sklearn import metrics
from sklearn import cross_validation
from sklearn import grid_search
from sklearn import pipeline
from sklearn import feature_selection
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import *
stemmer = PorterStemmer()

# matplotlib
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

# seaborn
import seaborn as sns
sns.set(style="dark", palette="muted")
sns.set_context("notebook",
                font_scale=1.5,
                rc={"lines.linewidth": 2.5})

In [2]:
%reload_ext watermark
%watermark -a "Ken Cavagnolo" -n -u -v -m -h -p numpy,scipy,pandas,seaborn,scikit-learn,joblib

Ken Cavagnolo 
Last updated: Wed Feb 17 2016 

CPython 2.7.10
IPython 4.0.3

numpy 1.10.4
scipy 0.17.0
pandas 0.17.1
seaborn 0.7.0
scikit-learn 0.17
joblib 0.9.4

compiler   : GCC 5.2.1 20151010
system     : Linux
release    : 4.2.0-23-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 4
interpreter: 64bit
host name  : ubuntu


# Functions

In [3]:
def str_stemmer(s):
    slist = [stemmer.stem(w) for w in s]
    s = " ".join(slist)
    return s

In [4]:
def str_stem(s):
    if isinstance(s, basestring):
        s = s.lower()
        s = s.replace("'","in.")
        s = s.replace("inches","in.")
        s = s.replace("inch","in.")
        s = s.replace(" in ","in. ")
        s = s.replace(" in.","in.")
        s = s.replace("''","ft.")
        s = s.replace(" feet ","ft. ")
        s = s.replace("feet","ft.")
        s = s.replace("foot","ft.")
        s = s.replace(" ft ","ft. ")
        s = s.replace(" ft.","ft.")
        s = s.replace(" pounds ","lb. ")
        s = s.replace(" pound ","lb. ")
        s = s.replace("pound","lb.")
        s = s.replace(" lb ","lb. ")
        s = s.replace(" lb.","lb.")
        s = s.replace(" lbs ","lb. ")
        s = s.replace("lbs.","lb.")
        s = s.replace(" x "," xby ")
        s = s.replace("*"," xby ")
        s = s.replace(" by "," xby")
        s = s.replace("x0"," xby 0")
        s = s.replace("x1"," xby 1")
        s = s.replace("x2"," xby 2")
        s = s.replace("x3"," xby 3")
        s = s.replace("x4"," xby 4")
        s = s.replace("x5"," xby 5")
        s = s.replace("x6"," xby 6")
        s = s.replace("x7"," xby 7")
        s = s.replace("x8"," xby 8")
        s = s.replace("x9"," xby 9")
        s = s.replace("0x","0 xby ")
        s = s.replace("1x","1 xby ")
        s = s.replace("2x","2 xby ")
        s = s.replace("3x","3 xby ")
        s = s.replace("4x","4 xby ")
        s = s.replace("5x","5 xby ")
        s = s.replace("6x","6 xby ")
        s = s.replace("7x","7 xby ")
        s = s.replace("8x","8 xby ")
        s = s.replace("9x","9 xby ")
        s = s.replace(" sq ft","sq.ft. ")
        s = s.replace("sq ft","sq.ft. ")
        s = s.replace("sqft","sq.ft. ")
        s = s.replace(" sqft ","sq.ft. ")
        s = s.replace("sq. ft","sq.ft. ")
        s = s.replace("sq ft.","sq.ft. ")
        s = s.replace("sq feet","sq.ft. ")
        s = s.replace("square feet","sq.ft. ")
        s = s.replace(" gallons ","gal. ")
        s = s.replace(" gallon ","gal. ")
        s = s.replace("gallons","gal.")
        s = s.replace("gallon","gal.")
        s = s.replace(" gal ","gal. ")
        s = s.replace(" gal","gal.")
        s = s.replace("ounces","oz.")
        s = s.replace("ounce","oz.")
        s = s.replace(" oz.","oz. ")
        s = s.replace(" oz ","oz. ")
        s = s.replace("centimeters","cm.")
        s = s.replace(" cm.","cm.")
        s = s.replace(" cm ","cm. ")
        s = s.replace("milimeters","mm.")
        s = s.replace(" mm.","mm.")
        s = s.replace(" mm ","mm. ")
        s = s.replace(u"\u00b0","deg. ")
        s = s.replace("degrees","deg. ")
        s = s.replace("degree","deg. ")
        s = s.replace("volts","volt. ")
        s = s.replace("volt","volt. ")
        s = s.replace("watts","watt. ")
        s = s.replace("watt","watt. ")
        s = s.replace("ampere","amp. ")
        s = s.replace("amps","amp. ")
        s = s.replace(" amp ","amp. ")
        s = s.replace("whirpool","whirlpool")
        s = s.replace("whirlpoolga", "whirlpool")
        s = s.replace("whirlpoolstainless","whirlpool stainless")
        s = s.replace("  "," ")
        s = str_stemmer(s.lower().split())
        return s.lower()
    else:
        return "null"

In [5]:
def str_common_word(str1, str2):
    return sum(int(str2.find(word) >= 0) for word in str1.split())

In [6]:
def str_whole_word(str1, str2, i_):
    cnt = 0
    while i_ < len(str2):
        i_ = str2.find(str1, i_)
        if i_ == -1:
            return cnt
        else:
            cnt += 1
            i_ += len(str1)
    return cnt

In [7]:
def fmean_squared_error(ground_truth, predictions):
    fmean_squared_error_ = metrics.mean_squared_error(ground_truth, predictions)**0.5
    return fmean_squared_error_

In [8]:
class cust_regression_vals(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self
    def transform(self, hd_searches):
        d_col_drops=['id','relevance','search_term',
                     'product_title','product_description',
                     'product_info','attr','brand']
        hd_searches = hd_searches.drop(d_col_drops,axis=1).values
        return hd_searches

In [9]:
class cust_txt_col(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
    def fit(self, x, y=None):
        return self
    def transform(self, data_dict):
        return data_dict[self.key]

# Preprocess

In [37]:
datadir = '/Users/cavagnolo/ml_fun/home_depot/data/'

df_train = pd.read_csv(datadir + 'train.csv', encoding="ISO-8859-1")
df_test = pd.read_csv(datadir + 'test.csv', encoding="ISO-8859-1")
df_pro_desc = pd.read_csv(datadir + 'product_descriptions.csv', encoding="ISO-8859-1")
df_attr = pd.read_csv(datadir + 'attributes.csv', encoding="ISO-8859-1")

df_brand = df_attr[df_attr.name == "MFG Brand Name"][["product_uid", "value"]].rename(columns={"value": "brand"})
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
df_all = pd.merge(df_all, df_pro_desc, how='left', on='product_uid')
df_all = pd.merge(df_all, df_brand, how='left', on='product_uid')

df_all.to_csv('df_all.csv', encoding="ISO-8859-1")

In [38]:
df_all = pd.read_csv('data/df_all.csv', encoding="ISO-8859-1")
df_all.drop('Unnamed: 0', 1, inplace=True)
df_all.head()

Unnamed: 0,id,product_title,product_uid,relevance,search_term,product_description,brand
0,2,Simpson Strong-Tie 12-Gauge Angle,100001,3.0,angle bracket,"Not only do angles make joints stronger, they ...",Simpson Strong-Tie
1,3,Simpson Strong-Tie 12-Gauge Angle,100001,2.5,l bracket,"Not only do angles make joints stronger, they ...",Simpson Strong-Tie
2,9,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,100002,3.0,deck over,BEHR Premium Textured DECKOVER is an innovativ...,BEHR Premium Textured DeckOver
3,16,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.33,rain shower head,Update your bathroom with the Delta Vero Singl...,Delta
4,17,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.67,shower only faucet,Update your bathroom with the Delta Vero Singl...,Delta


In [39]:
df_all['search_term'] = df_all['search_term'].map(lambda x: str_stem(x))
df_all['product_title'] = df_all['product_title'].map(lambda x: str_stem(x))
df_all['product_description'] = df_all['product_description'].map(lambda x: str_stem(x))
df_all['brand'] = df_all['brand'].map(lambda x: str_stem(x))

df_all['len_of_query'] = df_all['search_term'].map(lambda x: len(x.split())).astype(np.int64)
df_all['len_of_title'] = df_all['product_title'].map(lambda x: len(x.split())).astype(np.int64)
df_all['len_of_description'] = df_all['product_description'].map(lambda x: len(x.split())).astype(np.int64)
df_all['len_of_brand'] = df_all['brand'].map(lambda x: len(x.split())).astype(np.int64)

df_all['product_info'] = df_all['search_term'] + "\t" + df_all['product_title'] + "\t" + df_all['product_description']
df_all['attr'] = df_all['search_term'] + "\t" + df_all['brand']

df_all['query_in_title'] = df_all['product_info'].map(lambda x: str_whole_word(x.split('\t')[0],x.split('\t')[1],0))
df_all['query_in_description'] = df_all['product_info'].map(lambda x: str_whole_word(x.split('\t')[0],x.split('\t')[2],0))

df_all['word_in_title'] = df_all['product_info'].map(lambda x: str_common_word(x.split('\t')[0],x.split('\t')[1]))
df_all['word_in_description'] = df_all['product_info'].map(lambda x: str_common_word(x.split('\t')[0],x.split('\t')[2]))
df_all['word_in_brand'] = df_all['attr'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[1]))

df_all['ratio_title'] = df_all['word_in_title'] / df_all['len_of_query']
df_all['ratio_description'] = df_all['word_in_description'] / df_all['len_of_query']
df_all['ratio_brand'] = df_all['word_in_brand'] / df_all['len_of_brand']

df_brand = pd.unique(df_all.brand.ravel())
d={}
i = 1
for s in df_brand:
    d[s]=i
    i+=1
df_all['brand_feature'] = df_all['brand'].map(lambda x: d[x])
df_all['search_term_feature'] = df_all['search_term'].map(lambda x: len(x))

In [None]:
df_all['is_train'] = False
num_train = df_train.shape[0]
df_all['is_train'].iloc[:num_train] = True

In [119]:
# save to hdf5 for easier loading later
hdf = pd.HDFStore(hdf_file)
hdf.put('df_test', df_test)
hdf.put('df_train', df_train)
hdf.put('df_all', df_all)
hdf.close()

# Reload Point

In [12]:
# get datadir
import platform
uname = platform.uname()[0]
if uname == 'Linux':
    datadir = '/home/kcavagnolo/ml_fun/home_depot/data/'
elif uname == 'Darwin':
    datadir = '/Users/cavagnolo/ml_fun/home_depot/data/'
else:
    raise OSError("Unknown system: "+str(uname))

# get files
files = sorted(glob.glob(datadir + '*.csv'))
hdf_file = datadir + 'features.h5'

# reopen hdf store
hdf = pd.HDFStore(hdf_file)

# reload df's
print hdf
df_all = hdf['df_all']
hdf.close()

<class 'pandas.io.pytables.HDFStore'>
File path: /home/kcavagnolo/ml_fun/home_depot/data/features.h5
/df_all              frame        (shape->[240760,3])
/df_test             frame        (shape->[166693,3])
/df_train            frame        (shape->[74067,3]) 


# Modeling

In [26]:
df_train = df_all[(df_all.is_train == True)].copy().drop('is_train', 1)
df_test = df_all[(df_all.is_train == False)].copy().drop('is_train', 1)

In [None]:
# in data

id_test  = df_test['id']
y_train = df_train['relevance'].values
X_train = df_train[:]
X_test  = df_test[:]

In [92]:
# random forest
rf = RandomForestRegressor(n_estimators = 1000,
                           random_state = 2016,
                           n_jobs = -1,
                           verbose = 1)

# 
tfidf = TfidfVectorizer(ngram_range=(1, 1),
                        stop_words='english')

tsvd = TruncatedSVD(n_components=12,
                    random_state = 2016)

combined_features = pipeline.FeatureUnion([('cst',  cust_regression_vals()),
                                  ('txt1', pipeline.Pipeline([('s1', cust_txt_col(key='search_term')),
                                                              ('tfidf1', tfidf),
                                                              ('tsvd1', tsvd)])),
                                  ('txt2', pipeline.Pipeline([('s2', cust_txt_col(key='product_title')),
                                                              ('tfidf2', tfidf),
                                                              ('tsvd2', tsvd)])),
                                  ('txt3', pipeline.Pipeline([('s3', cust_txt_col(key='product_description')),
                                                              ('tfidf3', tfidf),
                                                              ('tsvd3', tsvd)])),
                                  ('txt4', pipeline.Pipeline([('s4', cust_txt_col(key='brand')),
                                                              ('tfidf4', tfidf),
                                                              ('tsvd4', tsvd)]))
                                 ],
                                 n_jobs = 1,
                                 transformer_weights= {'cst': 1.0,
                                                       'txt1': 0.5,
                                                       'txt2': 0.25,
                                                       'txt3': 0.0,
                                                       'txt4': 0.5},
                                )

clf = pipeline.Pipeline([('features', combined_features), ('rf', rf)])

param_grid = {'rf__max_features': [12],
              'rf__max_depth': [20],
              'features__txt3__tsvd3__n_components': [6, 10, 12, 16]
             }

RMSE = metrics.make_scorer(fmean_squared_error, greater_is_better=False)

grid_model = grid_search.GridSearchCV(estimator = clf,
                                      param_grid = param_grid,
                                      cv = 5,
                                      scoring = RMSE,
                                      n_jobs = -1,
                                      verbose = 1)

In [93]:
grid_model.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


ImportError: [joblib] Attempting to do parallel computing without protecting your import on a system that does not support forking. To use parallel-computing in a script, you must protect your main loop using "if __name__ == '__main__'". Please see the joblib documentation on Parallel for more information

In [85]:
print("\nBest score: {:0.5f}".format(grid_model.best_score_))
print("\nBest parameters set:")
best_parameters = grid_model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t{}: {}".format(param_name, best_parameters[param_name]))


Best score: -0.47121

Best parameters set:
	rf__max_depth: 20
	rf__max_features: 12


All the below use:
n_comps = 12,
cst: 1.0,
txt1: 0.5,
txt2: 0.25,
txt3: 0.0,
txt4: 0.5

N_comps | Max Feature | Max Depth | Local   | KLB
:--     | :--         | :--       | :--:    | :--:
10*     | 10*, 20     | 10*, 20   | 0.47551 | 0.47415
12*     | 12*         | 20*       | 0.47133 | 0.47373
12*     | 12*, 16, 18 | None*     | 0.47102 | 0.47423

All the below use:
cst: 1.0,
txt1: 0.5,
txt2: 0.15,
txt3: 0.1,
txt4: 0.5

N_comps | Max Feature | Max Depth | Local   | KLB
:--     | :--         | :--       | :--:    | :--:
12*     | 12*         | 20*       | 0.47121 | 0.47423

In [86]:
y_pred = model.predict(X_test)
pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('submission.csv',index=False)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    5.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   15.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   36.5s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:  1.5min finished


# Scratch Space

In [None]:
# save to hdf5 for easier loading later
call(["rm", "-rf", hdf_file])
hdf = pd.HDFStore(hdf_file)
hdf.put('df_test', df_test)
hdf.put('df_train', df_train)
hdf.put('df_all', df_all)
hdf.close()
call(["lrztar", "-zf", hdf_file])