# Imports

In [29]:
import time
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn import pipeline, grid_search
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, make_scorer
from nltk.stem.porter import *
import re
import random

stemmer = PorterStemmer()
random.seed(2016)

In [None]:
def str_stemmer(s):
    slist = [stemmer.stem(w) for w in s]
    s = " ".join(slist)
    return s

In [3]:
def str_stem(s):
    if isinstance(s, basestring):
        s = s.lower()
        s = s.replace("'","in.")
        s = s.replace("inches","in.")
        s = s.replace("inch","in.")
        s = s.replace(" in ","in. ")
        s = s.replace(" in.","in.")
        s = s.replace("''","ft.")
        s = s.replace(" feet ","ft. ")
        s = s.replace("feet","ft.")
        s = s.replace("foot","ft.")
        s = s.replace(" ft ","ft. ")
        s = s.replace(" ft.","ft.")
        s = s.replace(" pounds ","lb. ")
        s = s.replace(" pound ","lb. ")
        s = s.replace("pound","lb.")
        s = s.replace(" lb ","lb. ")
        s = s.replace(" lb.","lb.")
        s = s.replace(" lbs ","lb. ")
        s = s.replace("lbs.","lb.")
        s = s.replace(" x "," xby ")
        s = s.replace("*"," xby ")
        s = s.replace(" by "," xby")
        s = s.replace("x0"," xby 0")
        s = s.replace("x1"," xby 1")
        s = s.replace("x2"," xby 2")
        s = s.replace("x3"," xby 3")
        s = s.replace("x4"," xby 4")
        s = s.replace("x5"," xby 5")
        s = s.replace("x6"," xby 6")
        s = s.replace("x7"," xby 7")
        s = s.replace("x8"," xby 8")
        s = s.replace("x9"," xby 9")
        s = s.replace("0x","0 xby ")
        s = s.replace("1x","1 xby ")
        s = s.replace("2x","2 xby ")
        s = s.replace("3x","3 xby ")
        s = s.replace("4x","4 xby ")
        s = s.replace("5x","5 xby ")
        s = s.replace("6x","6 xby ")
        s = s.replace("7x","7 xby ")
        s = s.replace("8x","8 xby ")
        s = s.replace("9x","9 xby ")
        s = s.replace(" sq ft","sq.ft. ")
        s = s.replace("sq ft","sq.ft. ")
        s = s.replace("sqft","sq.ft. ")
        s = s.replace(" sqft ","sq.ft. ")
        s = s.replace("sq. ft","sq.ft. ")
        s = s.replace("sq ft.","sq.ft. ")
        s = s.replace("sq feet","sq.ft. ")
        s = s.replace("square feet","sq.ft. ")
        s = s.replace(" gallons ","gal. ")
        s = s.replace(" gallon ","gal. ")
        s = s.replace("gallons","gal.")
        s = s.replace("gallon","gal.")
        s = s.replace(" gal ","gal. ")
        s = s.replace(" gal","gal.")
        s = s.replace("ounces","oz.")
        s = s.replace("ounce","oz.")
        s = s.replace(" oz.","oz. ")
        s = s.replace(" oz ","oz. ")
        s = s.replace("centimeters","cm.")
        s = s.replace(" cm.","cm.")
        s = s.replace(" cm ","cm. ")
        s = s.replace("milimeters","mm.")
        s = s.replace(" mm.","mm.")
        s = s.replace(" mm ","mm. ")
        s = s.replace(u"\u00b0","deg. ")
        s = s.replace("degrees","deg. ")
        s = s.replace("degree","deg. ")
        s = s.replace("volts","volt. ")
        s = s.replace("volt","volt. ")
        s = s.replace("watts","watt. ")
        s = s.replace("watt","watt. ")
        s = s.replace("ampere","amp. ")
        s = s.replace("amps","amp. ")
        s = s.replace(" amp ","amp. ")
        s = s.replace("whirpool","whirlpool")
        s = s.replace("whirlpoolga", "whirlpool")
        s = s.replace("whirlpoolstainless","whirlpool stainless")
        s = s.replace("  "," ")
        s = str_stemmer(s.lower().split())
        return s.lower()
    else:
        return "null"

In [4]:
def str_common_word(str1, str2):
    return sum(int(str2.find(word) >= 0) for word in str1.split())

In [5]:
def str_whole_word(str1, str2, i_):
    cnt = 0
    while i_ < len(str2):
        i_ = str2.find(str1, i_)
        if i_ == -1:
            return cnt
        else:
            cnt += 1
            i_ += len(str1)
    return cnt

In [6]:
def fmean_squared_error(ground_truth, predictions):
    fmean_squared_error_ = mean_squared_error(ground_truth, predictions)**0.5
    return fmean_squared_error_

In [7]:
class cust_regression_vals(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self
    def transform(self, hd_searches):
        d_col_drops=['id','relevance','search_term',
                     'product_title','product_description',
                     'product_info','attr','brand']
        hd_searches = hd_searches.drop(d_col_drops,axis=1).values
        return hd_searches

In [58]:
class cust_txt_col(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
    def fit(self, x, y=None):
        return self
    def transform(self, data_dict):
        return data_dict[self.key]#.apply(str)

# Preprocess

In [37]:
datadir = '/Users/cavagnolo/ml_fun/home_depot/data/'

df_train = pd.read_csv(datadir + 'train.csv', encoding="ISO-8859-1")
df_test = pd.read_csv(datadir + 'test.csv', encoding="ISO-8859-1")
df_pro_desc = pd.read_csv(datadir + 'product_descriptions.csv', encoding="ISO-8859-1")
df_attr = pd.read_csv(datadir + 'attributes.csv', encoding="ISO-8859-1")

df_brand = df_attr[df_attr.name == "MFG Brand Name"][["product_uid", "value"]].rename(columns={"value": "brand"})
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
df_all = pd.merge(df_all, df_pro_desc, how='left', on='product_uid')
df_all = pd.merge(df_all, df_brand, how='left', on='product_uid')

df_all.to_csv('df_all.csv', encoding="ISO-8859-1")

In [38]:
df_all = pd.read_csv('data/df_all.csv', encoding="ISO-8859-1")
df_all.drop('Unnamed: 0', 1, inplace=True)
df_all.head()

Unnamed: 0,id,product_title,product_uid,relevance,search_term,product_description,brand
0,2,Simpson Strong-Tie 12-Gauge Angle,100001,3.0,angle bracket,"Not only do angles make joints stronger, they ...",Simpson Strong-Tie
1,3,Simpson Strong-Tie 12-Gauge Angle,100001,2.5,l bracket,"Not only do angles make joints stronger, they ...",Simpson Strong-Tie
2,9,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,100002,3.0,deck over,BEHR Premium Textured DECKOVER is an innovativ...,BEHR Premium Textured DeckOver
3,16,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.33,rain shower head,Update your bathroom with the Delta Vero Singl...,Delta
4,17,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.67,shower only faucet,Update your bathroom with the Delta Vero Singl...,Delta


In [39]:
df_all['search_term'] = df_all['search_term'].map(lambda x:str_stem(x))
df_all['product_title'] = df_all['product_title'].map(lambda x:str_stem(x))
df_all['product_description'] = df_all['product_description'].map(lambda x:str_stem(x))
df_all['brand'] = df_all['brand'].map(lambda x:str_stem(x))
df_all['len_of_query'] = df_all['search_term'].map(lambda x:len(x.split())).astype(np.int64)
df_all['len_of_title'] = df_all['product_title'].map(lambda x:len(x.split())).astype(np.int64)
df_all['len_of_description'] = df_all['product_description'].map(lambda x:len(x.split())).astype(np.int64)
df_all['len_of_brand'] = df_all['brand'].map(lambda x:len(x.split())).astype(np.int64)
df_all['product_info'] = df_all['search_term']+"\t"+df_all['product_title'] +"\t"+df_all['product_description']
df_all['query_in_title'] = df_all['product_info'].map(lambda x:str_whole_word(x.split('\t')[0],x.split('\t')[1],0))
df_all['query_in_description'] = df_all['product_info'].map(lambda x:str_whole_word(x.split('\t')[0],x.split('\t')[2],0))
df_all['word_in_title'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[1]))
df_all['word_in_description'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[2]))
df_all['ratio_title'] = df_all['word_in_title']/df_all['len_of_query']
df_all['ratio_description'] = df_all['word_in_description']/df_all['len_of_query']
df_all['attr'] = df_all['search_term']+"\t"+df_all['brand']
df_all['word_in_brand'] = df_all['attr'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[1]))
df_all['ratio_brand'] = df_all['word_in_brand']/df_all['len_of_brand']
df_brand = pd.unique(df_all.brand.ravel())
d={}
i = 1
for s in df_brand:
    d[s]=i
    i+=1
df_all['brand_feature'] = df_all['brand'].map(lambda x:d[x])
df_all['search_term_feature'] = df_all['search_term'].map(lambda x:len(x))

In [40]:
df_all.to_csv('df_clean.csv', encoding="ISO-8859-1")

# Reload Point

In [41]:
df_all = pd.read_csv('data/df_clean.csv', encoding="ISO-8859-1")
df_all.drop('Unnamed: 0', 1, inplace=True)
df_train = pd.read_csv('data/train.csv', encoding="ISO-8859-1")
df_test = pd.read_csv('data/test.csv', encoding="ISO-8859-1")
num_train = df_train.shape[0]

In [42]:
df_train = df_all.iloc[:num_train]
df_test  = df_all.iloc[num_train:]
id_test  = df_test['id']

y_train = df_train['relevance'].values
X_train = df_train[:]
X_test  = df_test[:]

In [None]:
model = joblib.load('grid_rf_tfidf_tsvd.pkl') 

# Modeling

In [69]:
rf = RandomForestRegressor(n_estimators = 1000,
                           random_state = 2016,
                           n_jobs = -1,
                           verbose = 1)

tfidf = TfidfVectorizer(ngram_range=(1, 1),
                        stop_words='english')

tsvd = TruncatedSVD(n_components=12,
                    random_state = 2016)

combined_features = FeatureUnion([('cst',  cust_regression_vals()),
                                  ('txt1', pipeline.Pipeline([('s1', cust_txt_col(key='search_term')),
                                                              ('tfidf1', tfidf),
                                                              ('tsvd1', tsvd)])),
                                  ('txt2', pipeline.Pipeline([('s2', cust_txt_col(key='product_title')),
                                                              ('tfidf2', tfidf),
                                                              ('tsvd2', tsvd)])),
                                  ('txt3', pipeline.Pipeline([('s3', cust_txt_col(key='product_description')),
                                                              ('tfidf3', tfidf),
                                                              ('tsvd3', tsvd)])),
                                  ('txt4', pipeline.Pipeline([('s4', cust_txt_col(key='brand')),
                                                              ('tfidf4', tfidf),
                                                              ('tsvd4', tsvd)]))
                                 ],
                                 n_jobs = 1,
                                 transformer_weights= {'cst': 1.0,
                                                       'txt1': 0.5,
                                                       'txt2': 0.25,
                                                       'txt3': 0.0,
                                                       'txt4': 0.5},
                                )

clf = pipeline.Pipeline([('features', combined_features), ('rf', rf)])

param_grid = {'rf__max_features': [12, 16, 18]}#,
              #'rf__max_depth': [20]}

RMSE = make_scorer(fmean_squared_error, greater_is_better=False)

model = grid_search.GridSearchCV(estimator = clf,
                                 param_grid = param_grid,                                 
                                 cv = 2,
                                 scoring = RMSE,
                                 n_jobs = 1,
                                 verbose = 10)

In [None]:
model.fit(X_train, y_train)

Fitting 2 folds for each of 3 candidates, totalling 6 fits
[CV] rf__max_features=12 .............................................

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   58.3s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  2.2min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    2.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    4.9s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    8.5s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:   10.7s finished



[CV] ................... rf__max_features=12, score=-0.470257 - 2.7min
[CV] rf__max_features=12 .............................................

[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   29.4s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  2.3min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    2.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    4.8s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   10.1s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:   13.3s finished



[CV] ................... rf__max_features=12, score=-0.471016 - 2.8min
[CV] rf__max_features=16 .............................................

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   30.0s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  3.0min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.9s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    4.3s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    8.3s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:   10.3s finished



[CV] ................... rf__max_features=16, score=-0.471313 - 3.4min
[CV] rf__max_features=16 .............................................

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   42.1s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  3.2min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    2.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    5.2s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    9.1s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:   11.5s finished



[CV] ................... rf__max_features=16, score=-0.471343 - 3.6min
[CV] rf__max_features=18 .............................................

[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   35.8s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  3.1min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    2.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    4.7s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   10.1s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:   12.3s finished



[CV] ................... rf__max_features=18, score=-0.471354 - 3.6min
[CV] rf__max_features=18 .............................................

Max Feature | Max Depth | Local   | KLB
:--         | :--       | :--:    | :--:
10, 20      | 10, 20    | 0.47551 | 0.47415
12          | 20        | 0.47133 | 0.47373
12, 16, 18  | None      | ------- | -------

In [66]:
print("Best parameters found by grid search:")
print(model.best_params_)
print("Best CV score:")
print(model.best_score_)

Best parameters found by grid search:
{'rf__max_depth': 20, 'rf__max_features': 12}
Best CV score:
-0.471332341701


In [62]:
from sklearn.externals import joblib
joblib.dump(model, 'grid_rf_tfidf_tsvd.pkl') 

['grid_rf_tfidf_tsvd.pkl',
 'grid_rf_tfidf_tsvd.pkl_01.npy',
 'grid_rf_tfidf_tsvd.pkl_02.npy',
 'grid_rf_tfidf_tsvd.pkl_03.npy',
 'grid_rf_tfidf_tsvd.pkl_04.npy',
 'grid_rf_tfidf_tsvd.pkl_05.npy',
 'grid_rf_tfidf_tsvd.pkl_06.npy',
 'grid_rf_tfidf_tsvd.pkl_07.npy',
 'grid_rf_tfidf_tsvd.pkl_08.npy',
 'grid_rf_tfidf_tsvd.pkl_09.npy',
 'grid_rf_tfidf_tsvd.pkl_10.npy',
 'grid_rf_tfidf_tsvd.pkl_11.npy',
 'grid_rf_tfidf_tsvd.pkl_12.npy',
 'grid_rf_tfidf_tsvd.pkl_13.npy',
 'grid_rf_tfidf_tsvd.pkl_14.npy',
 'grid_rf_tfidf_tsvd.pkl_15.npy',
 'grid_rf_tfidf_tsvd.pkl_16.npy',
 'grid_rf_tfidf_tsvd.pkl_17.npy',
 'grid_rf_tfidf_tsvd.pkl_18.npy',
 'grid_rf_tfidf_tsvd.pkl_19.npy',
 'grid_rf_tfidf_tsvd.pkl_20.npy',
 'grid_rf_tfidf_tsvd.pkl_21.npy',
 'grid_rf_tfidf_tsvd.pkl_22.npy',
 'grid_rf_tfidf_tsvd.pkl_23.npy',
 'grid_rf_tfidf_tsvd.pkl_24.npy',
 'grid_rf_tfidf_tsvd.pkl_25.npy',
 'grid_rf_tfidf_tsvd.pkl_26.npy',
 'grid_rf_tfidf_tsvd.pkl_27.npy',
 'grid_rf_tfidf_tsvd.pkl_28.npy',
 'grid_rf_tfidf_tsvd.

In [67]:
y_pred = model.predict(X_test)
pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('submission.csv',index=False)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    8.8s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   22.5s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   42.5s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:   51.3s finished
