# Imports

In [1]:
##############
# basic libs #
##############

from subprocess import call
from tqdm import *
from time import time
import warnings
warnings.filterwarnings('ignore')
import os, sys, time, datetime, json, string, glob, re, random
random.seed(2016)

###########
# science #
###########

import scipy as sp
import numpy as np
import pandas as pd

#######
# ML #
######

from sklearn import preprocessing
from sklearn import metrics
from sklearn import cross_validation
from sklearn import grid_search
from sklearn import pipeline
from sklearn import feature_selection
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import *
stemmer = PorterStemmer()

# matplotlib
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

# seaborn
import seaborn as sns
sns.set(style="dark", palette="muted")
sns.set_context("notebook",
                font_scale=1.5,
                rc={"lines.linewidth": 2.5})

In [2]:
%reload_ext watermark
%watermark -a "Ken Cavagnolo" -n -u -v -m -h -p numpy,scipy,pandas,seaborn,scikit-learn,joblib

Ken Cavagnolo 
last updated: Mon Feb 22 2016 

CPython 2.7.10
IPython 4.1.1

numpy 1.10.4
scipy 0.16.1
pandas 0.17.0
seaborn 0.6.0
scikit-learn 0.17
joblib 0.8.4

compiler   : GCC 4.2.1 Compatible Apple LLVM 7.0.0 (clang-700.0.72)
system     : Darwin
release    : 15.3.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit
host name  : DrGonzo.local


# Functions

In [2]:
def str_stemmer(s):
    slist = [stemmer.stem(w) for w in s]
    s = " ".join(slist)
    return s

In [3]:
def str_stem(s):
    if isinstance(s, basestring):
        s = s.lower()
        s = s.replace("''","inch")
        s = s.replace("inches","inch")
        s = s.replace("' ","feet ")
        s = s.replace("foot","feet")
        s = s.replace(" ft "," feet ")
        s = s.replace(" ft.","feet")
        s = s.replace("pound","pounds")
        s = s.replace(" lb "," pounds ")
        s = s.replace(" lb."," pounds")
        s = s.replace(" lbs "," pounds ")
        s = s.replace("lbs.","pounds")
        s = s.replace("x0"," xby 0")
        s = s.replace("x1"," xby 1")
        s = s.replace("x2"," xby 2")
        s = s.replace("x3"," xby 3")
        s = s.replace("x4"," xby 4")
        s = s.replace("x5"," xby 5")
        s = s.replace("x6"," xby 6")
        s = s.replace("x7"," xby 7")
        s = s.replace("x8"," xby 8")
        s = s.replace("x9"," xby 9")
        s = s.replace("0x","0 xby ")
        s = s.replace("1x","1 xby ")
        s = s.replace("2x","2 xby ")
        s = s.replace("3x","3 xby ")
        s = s.replace("4x","4 xby ")
        s = s.replace("5x","5 xby ")
        s = s.replace("6x","6 xby ")
        s = s.replace("7x","7 xby ")
        s = s.replace("8x","8 xby ")
        s = s.replace("9x","9 xby ")
        s = s.replace("sq ft","sqft")
        s = s.replace("sq. ft","sqft")
        s = s.replace("sq ft.","sqft")
        s = s.replace("sq feet","sqft")
        s = s.replace("square feet","sqft")
        s = s.replace("gallon","gallons")
        s = s.replace(" gal. "," gallons ")
        s = s.replace(" gal "," gallons ")
        s = s.replace("ounce","ounces")
        s = s.replace(" oz."," ounces")
        s = s.replace(" oz "," ounces ")
        s = s.replace(" cm."," centimeters ")
        s = s.replace(" cm "," centimeters ")
        s = s.replace(" mm."," millimeters")
        s = s.replace(" mm "," millimeters ")
        s = s.replace(u"\u00b0","degrees")
        s = s.replace("degree","degrees")
        s = s.replace("volt","volts")
        s = s.replace("watt","watts")
        s = s.replace("amps","ampere")
        s = s.replace(" amp "," ampere ")
        s = s.replace("whirpool","whirlpool")
        s = s.replace("whirlpoolga", "whirlpool")
        s = s.replace("whirlpoolstainless","whirlpool stainless")
        s = s.replace("  "," ")
        s = str_stemmer(s.lower().split())
        return s.lower()
    else:
        return "null"

In [4]:
def str_common_word(str1, str2):
    return sum(int(str2.find(word) >= 0) for word in str1.split())

In [5]:
def str_whole_word(str1, str2, i_):
    cnt = 0
    while i_ < len(str2):
        i_ = str2.find(str1, i_)
        if i_ == -1:
            return cnt
        else:
            cnt += 1
            i_ += len(str1)
    return cnt

In [6]:
def fmean_squared_error(ground_truth, predictions):
    fmean_squared_error_ = metrics.mean_squared_error(ground_truth, predictions)**0.5
    return fmean_squared_error_

In [7]:
class cust_regression_vals(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self
    def transform(self, hd_searches):
        d_col_drops=['id','relevance','search_term',
                     'product_title','product_description',
                     'product_info','attr','brand']
        hd_searches = hd_searches.drop(d_col_drops,axis=1).values
        return hd_searches

In [8]:
class cust_txt_col(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
    def fit(self, x, y=None):
        return self
    def transform(self, data_dict):
        return data_dict[self.key]

# Preprocess

In [18]:
# get datadir
import platform
uname = platform.uname()[0]
if uname == 'Linux':
    datadir = '/home/kcavagnolo/ml_fun/home_depot/Data/'
elif uname == 'Darwin':
    datadir = '/Users/cavagnolo/ml_fun/home_depot/Data/'
else:
    raise OSError("Unknown system: "+str(uname))

# get files
files = sorted(glob.glob(datadir + '*.csv'))
hdf_file = datadir + 'features.h5'

In [19]:
# read data from csv's
df_train = pd.read_csv(datadir + 'train_orig.csv', encoding="ISO-8859-1")
df_test = pd.read_csv(datadir + 'test_orig.csv', encoding="ISO-8859-1")
df_pro_desc = pd.read_csv(datadir + 'product_descriptions.csv', encoding="ISO-8859-1")
df_attr = pd.read_csv(datadir + 'attributes.csv', encoding="ISO-8859-1")

# build new df's w/ merged data
df_brand = df_attr[df_attr.name == "MFG Brand Name"][["product_uid", "value"]].rename(columns={"value": "brand"})
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
df_all = pd.merge(df_all, df_pro_desc, how='left', on='product_uid')
df_all = pd.merge(df_all, df_brand, how='left', on='product_uid')

In [None]:
df_all['search_term'] = df_all['search_term'].map(lambda x: str_stem(x))
df_all['product_title'] = df_all['product_title'].map(lambda x: str_stem(x))
df_all['product_description'] = df_all['product_description'].map(lambda x: str_stem(x))
df_all['brand'] = df_all['brand'].map(lambda x: str_stem(x))

df_all['len_of_query'] = df_all['search_term'].map(lambda x: len(x.split())).astype(np.int64)
df_all['len_of_title'] = df_all['product_title'].map(lambda x: len(x.split())).astype(np.int64)
df_all['len_of_description'] = df_all['product_description'].map(lambda x: len(x.split())).astype(np.int64)
df_all['len_of_brand'] = df_all['brand'].map(lambda x: len(x.split())).astype(np.int64)

df_all['product_info'] = df_all['search_term'] + "\t" + df_all['product_title'] + "\t" + df_all['product_description']
df_all['attr'] = df_all['search_term'] + "\t" + df_all['brand']

df_all['query_in_title'] = df_all['product_info'].map(lambda x: str_whole_word(x.split('\t')[0],x.split('\t')[1],0))
df_all['query_in_description'] = df_all['product_info'].map(lambda x: str_whole_word(x.split('\t')[0],x.split('\t')[2],0))

df_all['word_in_title'] = df_all['product_info'].map(lambda x: str_common_word(x.split('\t')[0],x.split('\t')[1]))
df_all['word_in_description'] = df_all['product_info'].map(lambda x: str_common_word(x.split('\t')[0],x.split('\t')[2]))
df_all['word_in_brand'] = df_all['attr'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[1]))

df_all['ratio_title'] = df_all['word_in_title'] / df_all['len_of_query']
df_all['ratio_description'] = df_all['word_in_description'] / df_all['len_of_query']
df_all['ratio_brand'] = df_all['word_in_brand'] / df_all['len_of_brand']

df_brand = pd.unique(df_all.brand.ravel())
d={}
i = 1
for s in df_brand:
    d[s] = i
    i += 1
df_all['brand_feature'] = df_all['brand'].map(lambda x: d[x])
df_all['search_term_feature'] = df_all['search_term'].map(lambda x: len(x))

In [15]:
# designate train/test in one df
df_all['is_train'] = False
num_train = df_train.shape[0]
df_all['is_train'].iloc[:num_train] = True

In [18]:
# save to hdf5 for easier loading later
hdf_file = datadir + 'features.h5'
hdf = pd.HDFStore(hdf_file)
hdf.put('df_test', df_test)
hdf.put('df_train', df_train)
hdf.put('df_all', df_all)
hdf.close()

# Reload Point

In [24]:
# get datadir
import platform
uname = platform.uname()[0]
if uname == 'Linux':
    datadir = '/home/kcavagnolo/ml_fun/home_depot/data/'
elif uname == 'Darwin':
    datadir = '/Users/cavagnolo/ml_fun/home_depot/data/'
else:
    raise OSError("Unknown system: "+str(uname))

# get files
files = sorted(glob.glob(datadir + '*.csv'))
hdf_file = datadir + 'features.h5'

# reopen hdf store
hdf = pd.HDFStore(hdf_file)

# reload df's
print hdf
df_all = hdf['df_all']
hdf.close()

<class 'pandas.io.pytables.HDFStore'>
File path: /home/kcavagnolo/ml_fun/home_depot/data/features.h5
/df_all              frame        (shape->[240760,4])
/df_test             frame        (shape->[166693,2])
/df_train            frame        (shape->[74067,3]) 


# Modeling

In [26]:
# in data
df_train = df_all[(df_all.is_train == True)].copy().drop('is_train', 1)
df_test  = df_all[(df_all.is_train == False)].copy().drop('is_train', 1)

y_train  = df_train['relevance'].values
X_train  = df_train[:]

X_test   = df_test[:]
id_test  = df_test['id']

In [30]:
# random forest
rf = RandomForestRegressor(n_estimators = 1000,
                           random_state = 2016,
                           n_jobs = -1,
                           verbose = 1)

# 
tfidf = TfidfVectorizer(ngram_range=(1, 1),
                        stop_words='english')

tsvd = TruncatedSVD(n_components=12,
                    random_state = 2016)

combined_features = pipeline.FeatureUnion([('cst',  cust_regression_vals()),
                                  ('txt1', pipeline.Pipeline([('s1', cust_txt_col(key='search_term')),
                                                              ('tfidf1', tfidf),
                                                              ('tsvd1', tsvd)])),
                                  ('txt2', pipeline.Pipeline([('s2', cust_txt_col(key='product_title')),
                                                              ('tfidf2', tfidf),
                                                              ('tsvd2', tsvd)])),
                                  ('txt3', pipeline.Pipeline([('s3', cust_txt_col(key='product_description')),
                                                              ('tfidf3', tfidf),
                                                              ('tsvd3', tsvd)])),
                                  ('txt4', pipeline.Pipeline([('s4', cust_txt_col(key='brand')),
                                                              ('tfidf4', tfidf),
                                                              ('tsvd4', tsvd)]))
                                 ],
                                 n_jobs = 1,
                                 transformer_weights= {'cst': 1.0,
                                                       'txt1': 0.5,
                                                       'txt2': 0.25,
                                                       'txt3': 0.0,
                                                       'txt4': 0.5},
                                )

clf = pipeline.Pipeline([('features', combined_features), ('rf', rf)])

param_grid = {'rf__max_features': [12],
              'rf__max_depth': [20],
              'features__txt3__tsvd3__n_components': [6, 10, 12, 16]
             }

RMSE = metrics.make_scorer(fmean_squared_error, greater_is_better=False)

grid_model = grid_search.GridSearchCV(estimator = clf,
                                      param_grid = param_grid,
                                      cv = 5,
                                      scoring = RMSE,
                                      n_jobs = 1,
                                      verbose = 1)

In [31]:
grid_model.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   37.0s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  3.3min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    3.1s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    3.9s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   38.4s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  3.3min finished
[Parallel(n_jobs=4)]: Done  42 tasks      

Fitting 5 folds for each of 4 candidates, totalling 20 fits


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('cst', cust_regression_vals()), ('txt1', Pipeline(steps=[('s1', cust_txt_col(key='search_term')), ('tfidf1', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf...ators=1000, n_jobs=-1, oob_score=False,
           random_state=2016, verbose=1, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'rf__max_depth': [20], 'features__txt3__tsvd3__n_components': [6, 10, 12, 16], 'rf__max_features': [12]},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=make_scorer(fmean_squared_error, greater_is_better=False),
       verbose=1)

In [32]:
print("\nBest score: {:0.5f}".format(grid_model.best_score_))
print("\nBest parameters set:")
best_parameters = grid_model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t{}: {}".format(param_name, best_parameters[param_name]))


Best score: -0.46373

Best parameters set:
	features__txt3__tsvd3__n_components: 16
	rf__max_depth: 20
	rf__max_features: 12


All the below use:
n_comps = 12,
cst: 1.0,
txt1: 0.5,
txt2: 0.25,
txt3: 0.0,
txt4: 0.5

N_comps | Max Feature | Max Depth | Local   | KLB
:--     | :--         | :--       | :--:    | :--:
10*     | 10*, 20     | 10*, 20   | 0.47551 | 0.47415
12*     | 12*         | 20*       | 0.47133 | 0.47373
12*     | 12*, 16, 18 | None*     | 0.47102 | 0.47423

All the below use:
cst: 1.0,
txt1: 0.5,
txt2: 0.15,
txt3: 0.1,
txt4: 0.5

N_comps | Max Feature | Max Depth | Local   | KLB
:--     | :--         | :--       | :--:    | :--:
12*     | 12*         | 20*       | 0.47121 | 0.47423

In [34]:
y_pred = grid_model.predict(X_test)
pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('submission.csv',index=False)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.8s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    8.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   19.2s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   35.4s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:   45.8s finished


# CF Formating

In [17]:
df_all.columns

Index([                 u'id',       u'product_title',         u'product_uid',
          u'median_relevance', u'product_description',               u'query',
        u'relevance_variance',            u'is_train'],
      dtype='object')

In [20]:
#"id","query","product_title","product_description","median_relevance","relevance_variance"
df_all.drop('product_uid', 1, inplace=True)
df_all['query'] = df_all['search_term'].map(lambda x: str_stem(x))
df_all.drop('search_term', 1, inplace=True)
df_all['product_title'] = df_all['brand'].map(lambda x: str_stem(x)) + " " + df_all['product_title'].map(lambda x: str_stem(x))
df_all.drop('brand', 1, inplace=True)
df_all['product_description'] = df_all['product_description'].map(lambda x: str_stem(x))
df_all.rename(columns={'relevance': 'median_relevance'}, inplace=True)
df_all['relevance_variance'] = df_all['median_relevance'].apply(np.sqrt)

In [21]:
# designate train/test in one df
df_all['is_train'] = False
num_train = df_train.shape[0]
df_all['is_train'].iloc[:num_train] = True
df_all[(df_all.is_train == True)].copy().drop('is_train', 1).to_csv('train_mod.csv', encoding="ISO-8859-1", index=False)
df_all[(df_all.is_train == False)].copy().drop('is_train', 1).to_csv('test_mod.csv', encoding="ISO-8859-1", index=False)