# Final Project - Presentation Version

## Group Name: Lambda

### Student Names
1. Jian Wang
2. Chong Geng
3. Alan Perry
4. Divya Bhargavi
5. Robert Sandor

## Feature Engineering

One of the major tasks of this project was to generate numerical features based upon the text we were provided. After doing some research, we decided upon a number of features that we thought might work.

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
from Feature_Engineering.custom_estimators import *

  from numpy.core.umath_tests import inner1d


In [3]:
products = pd.read_csv('Data/product_descriptions.csv')
train = pd.read_csv('Data/train.csv', encoding='ISO-8859-1')
attributes = pd.read_csv('Data/attributes.csv', encoding='ISO-8859-1')

In [4]:
# BEWARE: this takes ~1 min to run
attrib_per_product = attrib_stack(attributes, 'Data/attrib_per_product.csv')

In [5]:
train, attrib_per_product = join_attrib(train, attrib_per_product)
train = search_term_in_attrib(train)

In [6]:
train = color_df(attributes, train)

In [7]:
glove_file = 'Data/glove.6B.300d.txt'
glove_dic = make_dictionary(glove_file)

In [8]:
modified_train = train.set_index('product_uid').join(
        products.set_index('product_uid'))
modified_train = modified_train.reset_index()

In [9]:
modified_train['total_description'] = modified_train['product_title'] + \
        modified_train['product_description']

In [10]:
test_pipeline = Pipeline([
    ('features', FeatureUnion([
        ('cleaned_pipeline', Pipeline([
            ('cleaned_terms', CleanedTerms()),
            ('secondary_cleaned_features', FeatureUnion([
                ('cleaned_search_term_length', Length(
                    'cleaned_search_term_length', 'cleaned_terms')),
                ('title_entropy', Entropy('product_title')),
                ('search_terms_entropy', Entropy('cleaned_terms')),
                ('jscore_title', Jaro('search_term', 'product_title')),
                ('jscore_desc', Jaro('search_term', 'product_description')),
                ('clean_terms_in_title', FindTermsInCorpus(
                    'cleaned_terms', 'product_title')),
                ('clean_terms_in_desc', FindTermsInCorpus(
                    'cleaned_terms', 'product_description')),
                ('jaccard_index_title', JaccardIndex('product_title',
                                                     'cleaned_terms')),
                ('jaccard_index_desc', JaccardIndex('product_description',
                                                    'cleaned_terms')),
                ('lcs_title', LCS('cleaned_terms',
                                  'product_title')),
                ('lcs_desc', LCS('cleaned_terms',
                                 'product_description'))
            ])),
        ])),
        ('neighbours_pipeline', Pipeline([
            ('cleaned_terms', CleanedTerms()),
            ('terms_neighbour', FindNeighbors(
                'cleaned_terms', glove_dic, glove_file)),
            ('neighbors_numerical_features', FeatureUnion([
                ('neighbors_in_title', FindNeighborsInCorpus('terms_neighbour',
                                                             'product_title')),
                ('neighbors_in_desc', FindNeighborsInCorpus('terms_neighbour',
                                                            'product_description'))
            ]))
        ])),
        ('stemmed_pipeline', Pipeline([
            ('stemmed_terms', StemmedTerms('stemmed_terms', 'search_term')),
            ('stemmed_title', StemmedTerms('stemmed_title', 'product_title')),
            ('stemmed_desc', StemmedTerms('stemmed_desc', 'product_description')),
            ('secondary_stemmed_features', FeatureUnion([
                ('stemmed_terms_in_title', FindTermsInCorpus(
                    'stemmed_terms', 'stemmed_title')),
                ('stemmed_terms_in_desc', FindTermsInCorpus(
                    'stemmed_terms', 'stemmed_desc'))
            ]))
        ])),
        ('lemmatized_pipeline', Pipeline([
            ('lemmatized_terms', LemmatizedTerms(
                'lemmatized_terms', 'search_term')),
            ('lemmatized_title', LemmatizedTerms(
                'lemmatized_title', 'product_title')),
            ('lemmatized_desc', LemmatizedTerms(
                'lemmatized_desc', 'product_description')),
            ('lemmatized_numerical_features', FeatureUnion([
                ('lemmatized_terms_in_title', FindTermsInCorpus('lemmatized_terms',
                                                                'lemmatized_title')),
                ('lemmatized_terms_in_desc', FindTermsInCorpus('lemmatized_terms',
                                                               'lemmatized_desc'))
            ]))
        ])),
        ('miscellaneous_pipeline', Pipeline([
            ('primary_misc_features', FeatureUnion([
                ('num_words_in_description', CountWords(
                    'total_description', lambda x: len(tokenize(x)))),
                ('num_stop_words', CountWords('search_term',
                                              lambda x: num_stop_words(x.split(' ')))),
                ('num_search_words', CountWords(
                    'search_term', lambda x: len(x.split(' ')))),
                ('tfidf_search_common', TFIDFSearchIntersection()),
                ('num_attributes', CountAttributes(attributes)),
                ('title_length', Length('product_title_length', 'product_title')),
                ('desc_length', Length('product_desc_length', 'product_description')),
                ('min_levenstein_dist_title', MinLevensteinDistTitle()),
                ('min_levenstein_dist_brand', MinLevensteinDistBrand(attributes)),
                ('color_in_search_term', FindColorInSearchTerm(attributes)),
                ('search_title_SW', SW_Score(
                    'search_term', 'product_title')),  
                ('search_desc_SW', SW_Score(
                    'search_term', 'product_description')),
                ('NCD_query_title', NCD('product_title', 'search_term'))
            ]))
        ]))
    ]))
])

In [11]:
modified_train.fillna(0, inplace=True)

In [12]:
X_train = modified_train[['product_title', 'search_term',
                          'name', 'value', 'search_term_split', 'search_term_in_attrib',
                          'product_description', 'product_uid', 'total_description']]
y_train = modified_train[['relevance']]

In [13]:
train_data, test_data, train_target, test_target = train_test_split(X_train,
                                                                    y_train)

In [14]:
# BEWARE: takes ~4 hr to run (with all features)
features = test_pipeline.fit_transform(train_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  X['cleaned_terms'] = cleaned_terms
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  X['cleaned_terms'] = cleaned_terms
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  k_dict, list(X[self.terms]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentat

## Load Data

Becaues of the time-consuming nature of the feature engineering, we saved the data into a csv. If you don't want to run all of the previous functions (which takes ~4 hours to run), just start with the cells below.

In [22]:
zip_data = zipfile.ZipFile('Data/final_dataset.csv.zip')
zip_data.extractall('Data/')
modified_train = pd.read_csv('Data/final_dataset.csv')
modified_train = modified_train.drop('Unnamed: 0', axis=1)
modified_train.fillna(0, inplace=True)
assert(len(modified_train.columns) == 49)

In [23]:
X_train = modified_train[['clean_length', 'title_length',
                          'desc_length', 'clean_terms_in_title',
                          'clean_terms_in_desc', 
                          'min_levenstein_dist_title', 'min_levenstein_dist_brand',
                          'stemmed_terms_in_title', 'stemmed_terms_in_desc',
                          'lemmatized_terms_in_title', 'lemmatized_terms_in_desc',
                          'neighbours_in_title', 'neighbours_in_desc', 'search_terms_entropy',
                          'title_entropy', 'jaccard_index_title', 'jaccard_index_desc', 'lcs_title',
                          'lcs_desc', 'jscore_query_desc', 'jscore_query_title', 'search_title_SW',
                          'search_desc_SW', 'NCD_query_title', 'num_words_in_description', 'num_stop_words',
                          'num_search_words', 'tfidf_search_common', 'num_attrib']]
y_train = modified_train[['relevance']]

Since we can't see the relevancy scores of the test set, we decided to split the training set further into our own training and test set.

In [24]:
train_data, test_data, train_target, test_target = train_test_split(X_train,
                                                                    y_train)

In [25]:
assert(int(math.floor(len(X_train) * .75)) == len(train_data))
assert(len(train_data) == len(train_target))
assert(int(math.ceil(len(X_train) * .25)) == len(test_data))
assert(len(test_data) == len(test_target))

## Baseline

As a simple baseline, we considered the RMSE of a completely random model.

In [26]:
print(len(y_train)) # no empty relevancy score.

74071


Here we perform a randomization test on the relevancy scores where we permute the relevancy scores. In that way, we can keep the same distribution and effectively get a completely random model

In [27]:
random_score = np.random.permutation(y_train)
random_chance_performance = math.sqrt(mean_squared_error(random_score, y_train))
print(random_chance_performance)

0.7544671945558852


# Model Fitting

## Linear Models

For our simplest model, we decided to use linear regression.

In [28]:
lin_reg_model = LinearRegression()
lin_reg_model.fit(train_data, train_target)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [29]:
predicted = lin_reg_model.predict(test_data)
print(predicted[:5])
print(test_target[:5])

[[2.31324209]
 [2.4665354 ]
 [2.87651038]
 [2.77161773]
 [2.2364893 ]]
       relevance
107         2.67
52975       2.67
17575       3.00
71718       2.67
23030       2.33


After some exploration, we realized that some of the predictions produced by linear regression exceeded the bounds of the relevancy score metric. To account for that, we performed a min-max scaling to get the predictions within the bounds.

In [30]:
scaler = MinMaxScaler(feature_range=(1.0, 3.0))
scaled_linear_predicted = scaler.fit_transform(predicted)
print(scaled_linear_predicted[:5])
print(test_target[:5])

[[2.20801308]
 [2.34930012]
 [2.72716496]
 [2.63048773]
 [2.13727174]]
       relevance
107         2.67
52975       2.67
17575       3.00
71718       2.67
23030       2.33


For good measure, we considered alternative linear models that have regularization like Lasso, Ridge and ElasticNet to evaluate if there was overfitting. 

In [31]:
# BEWARE: this takes ~11m to run
alt_linear_models = [Pipeline([('ls_reg', Lasso())]),
                     Pipeline([('ridge_reg', Ridge())]),
                     Pipeline([('en_reg' ,ElasticNet())])]

alt_linear_grid_params = [{'ls_reg__alpha': np.linspace(start=0.25, stop=1.0, num=4),
                           'ls_reg__normalize': [False, True],
                           'ls_reg__selection': ['cyclic', 'random']},
                          {'ridge_reg__alpha': np.linspace(start=0.5, stop=2.0, num=4),
                           'ridge_reg__normalize': [False, True],
                           'ridge_reg__solver': ['svd', 'lsqr', 'sag', 'saga']},
                          {'en_reg__alpha': np.linspace(start=0.5, stop=2.0, num=4),
                           'en_reg__l1_ratio': np.linspace(start=0.25, stop=1.0, num=4),
                           'en_reg__normalize': [False, True],
                           'en_reg__selection': ['cyclic', 'random']}]

best_models_alt_linear = grid_search_models_rmse(alt_linear_models, alt_linear_grid_params, train_data, train_target)

print(best_models_alt_linear)

[(0.5270565244813905, {'ls_reg__alpha': 0.25, 'ls_reg__normalize': False, 'ls_reg__selection': 'random'}, Pipeline(memory=None,
     steps=[('ls_reg', Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))])), (0.4909474416460913, {'ridge_reg__alpha': 0.5, 'ridge_reg__normalize': False, 'ridge_reg__solver': 'svd'}, Pipeline(memory=None,
     steps=[('ridge_reg', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))])), (0.5143959951062845, {'en_reg__alpha': 0.5, 'en_reg__l1_ratio': 0.25, 'en_reg__normalize': False, 'en_reg__selection': 'cyclic'}, Pipeline(memory=None,
     steps=[('en_reg', ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', to

In [32]:
best_model_alt_linear = change_best_params_keys(best_models_alt_linear, 
                                                ['ridge_reg__alpha', 'ridge_reg__normalize', 'ridge_reg__solver'],
                                                ['alpha', 'normalize', 'solver'])

In [33]:
alt_linear_predicted = fit_best_model(best_model_alt_linear, train_data, train_target, test_data)
print(alt_linear_predicted[:5])
print(test_target[:5])

[[2.31352757]
 [2.46523882]
 [2.87508328]
 [2.77329945]
 [2.23622066]]
       relevance
107         2.67
52975       2.67
17575       3.00
71718       2.67
23030       2.33


## Tree-based Models

After considering various linear models, we decided to test if tree-based models could improve upon the performance of the linear models.

In [34]:
# BEWARE: this takes ~26m to run
tree_models = [Pipeline([('rf_reg', RandomForestRegressor())]),
               Pipeline([('ab_reg', AdaBoostRegressor())])]

grid_params_tree = [{'rf_reg__n_estimators': range(1, 30, 5),
                     'rf_reg__max_features': ['auto', 'sqrt', 'log2', None]},
                    {'ab_reg__n_estimators': range(1, 30, 5),
                     'ab_reg__loss': ['linear', 'square'],
                     'ab_reg__learning_rate': np.linspace(start=0.5, stop=1.5, num=5)}]

best_models_tree = grid_search_models_rmse(tree_models, grid_params_tree, train_data, train_target)

print(best_models_tree)

[(0.4754440798246978, {'rf_reg__max_features': 'sqrt', 'rf_reg__n_estimators': 26}, Pipeline(memory=None,
     steps=[('rf_reg', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))])), (0.4997047751670171, {'ab_reg__learning_rate': 0.5, 'ab_reg__loss': 'linear', 'ab_reg__n_estimators': 11}, Pipeline(memory=None,
     steps=[('ab_reg', AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
         n_estimators=50, random_state=None))]))]


In [35]:
best_tree_model = change_best_params_keys(best_models_tree, 
                                          ['rf_reg__n_estimators', 'rf_reg__max_features'],
                                          ['n_estimators', 'max_features'])

In [36]:
# here we chose the best tree model
tree_predicted = fit_best_model(best_tree_model, train_data, train_target, test_data)
print(tree_predicted[:5])
print(test_target[:5])

  best_model.fit(train_data, train_target)


[2.25653846 2.31384615 2.91038462 2.55269231 2.39692308]
       relevance
107         2.67
52975       2.67
17575       3.00
71718       2.67
23030       2.33


In [37]:
# BEWARE : this takes ~17m to run
rf_models = [Pipeline([('rf_reg', RandomForestRegressor())])]

grid_params_rf = [{'rf_reg__n_estimators': range(25, 151, 10),
                   'rf_reg__max_features': ['log2']}]

best_models_rf = grid_search_models_rmse(rf_models, grid_params_rf, train_data, train_target)

print(best_models_rf)

[(0.46847941511634583, {'rf_reg__max_features': 'log2', 'rf_reg__n_estimators': 125}, Pipeline(memory=None,
     steps=[('rf_reg', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]))]


In [38]:
best_rf_model = change_best_params_keys(best_models_rf, 
                                        ['rf_reg__n_estimators', 'rf_reg__max_features'],
                                        ['n_estimators', 'max_features'])

In [39]:
rf_predicted = fit_best_model(best_rf_model, train_data, train_target, test_data)
print(rf_predicted[:5])
print(test_target[:5])

  best_model.fit(train_data, train_target)


[2.24134667 2.35872    2.8776     2.57264    2.40992   ]
       relevance
107         2.67
52975       2.67
17575       3.00
71718       2.67
23030       2.33


To be thorough, we also considered a model that used boosting (Gradient Boosting) to see how boosting would compare to all of our models thus far.

In [40]:
# BEWARE: this takes ~14m to run
gb_model_pipeline = [Pipeline([('gb_reg', GradientBoostingRegressor())])]

grid_params_gb = [{'gb_reg__loss': ['ls', 'huber'],
                   'gb_reg__n_estimators': range(1, 26, 5),
                   'gb_reg__learning_rate': [0.1, 0.25, 0.5, 0.75, 0.9]}]

best_gb_models = grid_search_models_rmse(
    gb_model_pipeline, grid_params_gb, train_data, train_target)

print(best_gb_models)

[(0.48007125970513725, {'gb_reg__learning_rate': 0.5, 'gb_reg__loss': 'ls', 'gb_reg__n_estimators': 21}, Pipeline(memory=None,
     steps=[('gb_reg', GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False))]))]


In [41]:
best_gb_model = change_best_params_keys(best_gb_models, 
                                          ['gb_reg__learning_rate', 'gb_reg__n_estimators', 'gb_reg__loss'],
                                          ['learning_rate', 'n_estimators', 'loss'])

In [42]:
gb_predicted = fit_best_model(best_gb_model, train_data, train_target, test_data)
print(gb_predicted[:5])
print(test_target[:5])

  y = column_or_1d(y, warn=True)


[2.43141544 2.37561736 2.69874211 2.64194775 2.47703813]
       relevance
107         2.67
52975       2.67
17575       3.00
71718       2.67
23030       2.33


In [43]:
# BEWARE: this takes ~11m to run
refined_gb_model_pipeline = [
    Pipeline([('gb_reg', GradientBoostingRegressor())])]

grid_params_refined_gb = [{'gb_reg__loss': ['ls'],
                           'gb_reg__n_estimators': range(25, 151, 25),
                           'gb_reg__learning_rate': [0.4, 0.5, 0.6]}]

best_refined_gb_models = grid_search_models_rmse(
    refined_gb_model_pipeline, grid_params_refined_gb, train_data, train_target)

print(best_refined_gb_models)

[(0.4759104270479097, {'gb_reg__learning_rate': 0.4, 'gb_reg__loss': 'ls', 'gb_reg__n_estimators': 150}, Pipeline(memory=None,
     steps=[('gb_reg', GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False))]))]


We even considered XGBoost to see if that would get us the best RMSE.

In [44]:
import xgboost as xgb

data_dmatrix = xgb.DMatrix(data=X_train, label=y_train)
xg_reg = xgb.XGBRegressor(objective='reg:linear', booster='gblinear', reg_lambda=0.01)
xg_reg.fit(train_data, train_target)
xgb_predictions = xg_reg.predict(test_data)

In [45]:
xgb_rmse = sqrt(mean_squared_error(xgb_predictions, test_target))
print(xgb_rmse)

0.5158811009929469


In [46]:
data_dmatrix = xgb.DMatrix(data=X_train, label=y_train)
xg_reg = xgb.XGBRegressor(objective='reg:linear', learning_rate=0.1, booster='gbtree', gamma=0.5, max_depth=5)
xg_reg.fit(train_data, train_target)
xgb_predictions = xg_reg.predict(test_data)

## Multi-Layer Perceptron (aka Neural Net)

Here, we tested out a Multi-Layer Perceptron, or `sklearn`'s version of neural networks, for good measure.

In [50]:
# BEWARE: this takes ~7m to run
nn_model_pipeline = [Pipeline([('nn_reg', MLPRegressor())])]

grid_params_nn = [{'nn_reg__activation': ['logistic', 'tanh', 'relu'],
                   'nn_reg__solver': ['lbfgs', 'sgd', 'adam']}]

best_nn_models = grid_search_models_rmse(nn_model_pipeline, grid_params_nn, train_data, train_target)

print(best_nn_models)

[(0.49174152911273955, {'nn_reg__activation': 'logistic', 'nn_reg__solver': 'adam'}, Pipeline(memory=None,
     steps=[('nn_reg', MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False))]))]


Based on a cursory look at the RMSE for the training portion, it didn't perform even close to what the tree-based models did and we decided not to use it.

## Tree-Based Models - Reduced Dimensionality

After evaluating performance on Random Forest models, we wondered how applying PCA (Principal Component Analysis) to reduce the dimensionality of our data would affect performance.

In [51]:
# BEWARE: this takes ~6m to run
transformed_rf_models = [Pipeline([('scale', StandardScaler()),
                                   ('pca', PCA(n_components=int(
                                       math.log2(len(X_train.columns))))),
                                   ('rf_reg', RandomForestRegressor())])]

grid_params_transformed_rf = [{'rf_reg__n_estimators': range(138, 143, 2),
                               'rf_reg__max_features': ['log2']}]

best_models_transformed_rf = grid_search_models_rmse(
    transformed_rf_models, grid_params_transformed_rf, train_data, train_target)

print(best_models_transformed_rf)

[(0.512190454365941, {'rf_reg__max_features': 'log2', 'rf_reg__n_estimators': 138}, Pipeline(memory=None,
     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=4, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('rf_reg', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_...timators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]))]


In [52]:
best_transformed_rf_model = change_best_params_keys(best_models_transformed_rf, 
                                          ['rf_reg__n_estimators', 'rf_reg__max_features'],
                                          ['n_estimators', 'max_features'])

In [53]:
transformed_rf_predicted = fit_best_model(best_transformed_rf_model, train_data, train_target, test_data)
print(transformed_rf_predicted[:5])
print(test_target[:5])

  best_model.fit(train_data, train_target)


[2.17403986 2.37243659 2.82630435 2.55514493 2.37086957]
       relevance
107         2.67
52975       2.67
17575       3.00
71718       2.67
23030       2.33


## Testing Smaller Feature Space

We decided to also test out how using only certain 'categories' of our features would affect our model. We grouped our model into similarity features (Jaro index, Jaccard index, etc.),  count features, and length/entropy features. 

We then tested out the best models using only those subsets of features to compare to the performance of the model with the full set of features. This is different than simply using PCA or other dimensionality reduction techniques because PCA may choose a few select features from each of the categories whereas this tests only features from one particular category.

In [17]:
all_num_features = modified_train[getAllNumericalCols(modified_train)]
all_num_features.head(3)

Unnamed: 0,stemmed_terms,clean_terms_in_title,clean_terms_in_desc,stemmed_terms_in_title,stemmed_terms_in_desc,lemmatized_terms_in_title,lemmatized_terms_in_desc,neighbours_in_title,neighbours_in_desc,search_terms_entropy,...,jscore_query_desc,jscore_query_title,search_title_SW,search_desc_SW,NCD_query_title,num_words_in_description,num_stop_words,num_search_words,tfidf_search_common,num_attrib
0,angl bracket,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,3.251629,...,2.833333,0.866667,1.0,4.0,0.107077,79,0,2,1,15.0
1,bracket,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.807355,...,0.0,0.0,0.0,0.0,0.107077,79,0,2,0,15.0
2,deck,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,2.0,...,2.711111,0.0,0.0,3.0,0.109091,109,1,2,1,35.0


In [25]:
all_similarity_features = modified_train[getSimilarityCols(all_num_features)]
all_similarity_features.head(3)

Unnamed: 0,neighbours_in_title,neighbours_in_desc,jaccard_index_title,jaccard_index_desc,lcs_title,lcs_desc,jscore_query_desc,jscore_query_title,search_title_SW,search_desc_SW,NCD_query_title
0,0.0,1.0,0.166667,0.0,6,13,2.833333,0.866667,1.0,4.0,0.107077
1,0.0,0.0,0.0,0.0,3,7,0.0,0.0,0.0,0.0,0.107077
2,0.0,1.0,0.0,0.012048,4,4,2.711111,0.0,0.0,3.0,0.109091


In [26]:
count_cols, len_h_cols = getCountAndOtherCols(
    all_similarity_features, all_num_features)
all_count_features = modified_train[count_cols]
len_entropy_features = modified_train[len_h_cols]

In [27]:
train_data_numerical_subset, test_data_numerical_subset, train_target_numerical_subset, test_target_numerical_subset = train_test_split(all_num_features,
                                                                                                                                        y_train)

In [262]:
best_transformed_rf_model.fit(
    train_data_numerical_subset, train_target_numerical_subset)
transformed_rf_predicted_numerical_subset = best_transformed_rf_model.predict(
    test_data_numerical_subset)
print(transformed_rf_predicted_numerical_subset[:5])
print(test_target[:5])

  """Entry point for launching an IPython kernel.


[2.24535714 2.49071429 2.64038095 2.60203571 2.58490476]
       relevance
64115       2.33
55409       2.33
57088       2.00
38621       3.00
50452       2.33


In [266]:
best_rf_model.fit(train_data_numerical_subset, train_target_numerical_subset)
rf_predicted_numerical_subset = best_rf_model.predict(
    test_data_numerical_subset)
print(rf_predicted_numerical_subset[:5])
print(test_target[:5])

  """Entry point for launching an IPython kernel.


[2.22330935 2.45896043 2.65258993 2.58966427 2.46791367]
       relevance
64115       2.33
55409       2.33
57088       2.00
38621       3.00
50452       2.33


In [263]:
train_data_similarity_subset, test_data_similarity_subset, train_target_similarity_subset, test_target_similarity_subset = train_test_split(all_similarity_features,
                                                                                                                                            y_train)

In [264]:
best_transformed_rf_model.fit(
    train_data_similarity_subset, train_target_similarity_subset)
transformed_rf_predicted_similarity_subset = best_transformed_rf_model.predict(
    test_data_similarity_subset)
print(transformed_rf_predicted_similarity_subset[:5])
print(test_target[:5])

  """Entry point for launching an IPython kernel.


[2.11857143 2.27058673 2.1015     2.44828571 2.39452381]
       relevance
64115       2.33
55409       2.33
57088       2.00
38621       3.00
50452       2.33


In [268]:
best_rf_model.fit(train_data_similarity_subset, train_target_similarity_subset)
rf_predicted_similarity_subset = best_rf_model.predict(
    test_data_similarity_subset)
print(rf_predicted_similarity_subset[:5])
print(test_target[:5])

  """Entry point for launching an IPython kernel.


[2.02188849 2.34651079 2.10170264 2.58411871 2.42079137]
       relevance
64115       2.33
55409       2.33
57088       2.00
38621       3.00
50452       2.33


In [273]:
train_data_count_subset, test_data_count_subset, train_target_count_subset, test_target_count_subset = train_test_split(all_count_features,
                                                                                                                        y_train)

In [274]:
best_transformed_rf_model.fit(
    train_data_count_subset, train_target_count_subset)
transformed_rf_predicted_count_subset = best_transformed_rf_model.predict(
    test_data_count_subset)
print(transformed_rf_predicted_count_subset[:5])
print(test_target[:5])

  """Entry point for launching an IPython kernel.


[2.62299464 2.61285714 2.37335278 2.22497168 2.24638285]
       relevance
64115       2.33
55409       2.33
57088       2.00
38621       3.00
50452       2.33


In [275]:
best_rf_model.fit(train_data_count_subset, train_target_count_subset)
rf_predicted_count_subset = best_rf_model.predict(test_data_count_subset)
print(rf_predicted_count_subset[:5])
print(test_target[:5])

  """Entry point for launching an IPython kernel.


[2.62617557 2.52879496 2.36541988 2.20950156 2.27060029]
       relevance
64115       2.33
55409       2.33
57088       2.00
38621       3.00
50452       2.33


In [278]:
train_data_len_entropy_subset, test_data_len_entropy_subset, train_target_len_entropy_subset, test_target_len_entropy_subset = train_test_split(len_entropy_features,
                                                                                                                                                y_train)

In [279]:
best_transformed_rf_model.fit(
    train_data_len_entropy_subset, train_target_len_entropy_subset)
transformed_rf_predicted_len_entropy_subset = best_transformed_rf_model.predict(
    test_data_len_entropy_subset)
print(transformed_rf_predicted_len_entropy_subset[:5])
print(test_target[:5])

  """Entry point for launching an IPython kernel.


[2.54094286 2.59914286 2.36814286 2.59955051 2.06421429]
       relevance
64115       2.33
55409       2.33
57088       2.00
38621       3.00
50452       2.33


In [280]:
best_rf_model.fit(train_data_len_entropy_subset,
                  train_target_len_entropy_subset)
rf_predicted_len_entropy_subset = best_rf_model.predict(
    test_data_len_entropy_subset)
print(rf_predicted_len_entropy_subset[:5])
print(test_target[:5])

  """Entry point for launching an IPython kernel.


[2.51521223 2.61115108 2.4346283  2.65931655 1.94434053]
       relevance
64115       2.33
55409       2.33
57088       2.00
38621       3.00
50452       2.33


## Stacking

We also considered stacking and tested out a few stacked models to see how much of an improvement, if any, there was over using individual models.

In [None]:
stacked_regression = StackedRegressor([RandomForestRegressor(n_estimators=140)], LinearRegression())

In [None]:
stacked_regression.fit(train_data, train_target)

In [None]:
stacked_predictions = stacked_regression.predict(test_data)

In [None]:
stacked_regression_v2 = StackedRegressor([RandomForestRegressor(n_estimators=140, max_features='log2', max_depth=9, min_samples_split=5),
                                       LinearRegression(),
                                      GradientBoostingRegressor(loss='ls', learning_rate=0.4, n_estimators=150),
                                      AdaBoostRegressor(n_estimators=6, loss='linear', learning_rate=0.25)], 
                                      RandomForestRegressor(n_estimators=140, max_features='log2', max_depth=9, min_samples_split=5))

In [None]:
stacked_regression_v2.fit(train_data, train_target)

In [None]:
stacked_predictions_v2 = stacked_regression_v2.predict(test_data)

## Evaluation Metric

The benchmark was ~rank 1681 on the Kaggle leaderboard for this competition with an RMSE of .51049

1st place had an RMSE of .43192

https://www.kaggle.com/c/home-depot-product-search-relevance/leaderboard

In [9]:
rmse_lin_reg = sqrt(mean_squared_error(predicted, test_target))

print(f"{rmse_lin_reg:.4f}")

NameError: name 'predicted' is not defined

In [None]:
rmse_lin_reg_scaled = sqrt(mean_squared_error(scaled_linear_predicted, test_target))

print(f"{rmse_lin_reg_scaled:.4f}")

In [None]:
rmse_tree = sqrt(mean_squared_error(tree_predicted, test_target))

print(f"{rmse_tree:.4f}")

In [None]:
rmse_alt_linear = sqrt(mean_squared_error(alt_linear_predicted, test_target))

print(f"{rmse_alt_linear:.4f}")

In [None]:
rmse_rf = sqrt(mean_squared_error(rf_predicted, test_target))

print(f"{rmse_rf:.4f}")

In [10]:
rmse_transformed_rf = sqrt(mean_squared_error(
    transformed_rf_predicted, test_target))

print(f"{rmse_transformed_rf:.4f}")

0.4658


In [None]:
rmse_gb = sqrt(mean_squared_error(
    gb_predicted, test_target))

print(f"{rmse_gb:.4f}")

In [None]:
xgb_rmse = sqrt(mean_squared_error(xgb_predictions, test_target))

print(f"{xgb_rmse:.4f}")

In [None]:
rmse_stacked = sqrt(mean_squared_error(
    stacked_predictions, test_target))

print(f"{rmse_stacked:.4f}")

In [None]:
rmse_stacked_v2 = sqrt(mean_squared_error(
    stacked_predictions_v2, test_target))

print(f"{rmse_stacked:.4f}")

### Reduced Subset RMSE - Random Forest after PCA

In [240]:
rmse_transformed_rf_numerical_subset = sqrt(mean_squared_error(
    transformed_rf_predicted_numerical_subset, test_target))

print(f"{rmse_transformed_rf_numerical_subset:.4f}")

0.5992


In [265]:
rmse_transformed_rf_predicted_similarity_subset = sqrt(
    mean_squared_error(transformed_rf_predicted_similarity_subset, test_target))

print(f"{transformed_rf_predicted_similarity_subset:.4f}")

0.5953


In [276]:
rmse_transformed_rf_count_subset = sqrt(mean_squared_error(
    transformed_rf_predicted_count_subset, test_target))

print(f"{rmse_transformed_rf_count_subset:.4f}")

0.6039


In [281]:
rmse_transformed_rf_len_entropy_subset = sqrt(mean_squared_error(
    transformed_rf_predicted_len_entropy_subset, test_target))

print(f"{rmse_transformed_rf_len_entropy_subset:.4f}")

0.5788


### Reduced Subset RMSE - Regular Random Forest

In [267]:
rmse_rf_numerical_subset = sqrt(mean_squared_error(
    rf_predicted_numerical_subset, test_target))

print(f"{rmse_rf_numerical_subset:.4f}")

0.5988


In [269]:
rmse_rf_similarity_subset = sqrt(mean_squared_error(
    rf_predicted_similarity_subset, test_target))

print(f"{rmse_rf_similarity_subset:.4f}")

0.5950


In [277]:
rmse_rf_count_subset = sqrt(mean_squared_error(
    rf_predicted_count_subset, test_target))

print(f"{rmse_rf_count_subset:.4f}")

0.6036


In [282]:
rmse_rf_len_entropy_subset = sqrt(mean_squared_error(
    rf_predicted_len_entropy_subset, test_target))

print(f"{rmse_rf_len_entropy_subset:.4f}")

0.5786


## Test Data Transformation

Here, we performed the same transformation used upon the training data on the test data. If you would prefer not to go through the transformation process, there is a cell below where you can load the saved results of the transformations.

In [124]:
products = pd.read_csv('Data/product_descriptions.csv')
test = pd.read_csv('Data/test.csv', encoding='ISO-8859-1')
attributes = pd.read_csv('Data/attributes.csv', encoding='ISO-8859-1')

In [4]:
# BEWARE: this takes ~1 min to run
attrib_per_product_test = attrib_stack(attributes, 'Data/attrib_per_product.csv')

In [5]:
test, attrib_per_product_test = join_attrib(test, attrib_per_product_test)
test = search_term_in_attrib(test)

In [6]:
test = color_df(attributes, test)

In [8]:
modified_test = test.set_index('product_uid').join(
        products.set_index('product_uid'))
modified_test = modified_test.reset_index()

In [9]:
modified_test['total_description'] = modified_test['product_title'] + \
        modified_test['product_description']

In [14]:
glove_file = 'Data/glove.6B.300d.txt'
glove_dic = make_dictionary(glove_file)

In [15]:
modified_test = test.set_index('product_uid').join(
        products.set_index('product_uid'))
modified_test = modified_test.reset_index()
modified_test = create_cleaned_terms_col(modified_test)

In [None]:
modified_test.fillna(0, inplace=True)

In [None]:
X_test = modified_test[['product_title', 'search_term',
                        'name', 'value', 'search_term_split', 'search_term_in_attrib',
                        'product_description', 'product_uid', 'total_description']]

In [None]:
test_features = test_pipeline.fit_transform(X_test)

After saving the file, we then use our best model thus far to perform the transformation upon the test set to submit to Kaggle.

In [131]:
modified_test = pd.read_csv('./Data/feature_engineered_test.csv')

In [132]:
X_test = modified_test[['clean_length', 'title_length',
                          'desc_length', 'clean_terms_in_title',
                          'clean_terms_in_desc', 
                          'min_levenstein_dist_title', 'min_levenstein_dist_brand',
                          'stemmed_terms_in_title', 'stemmed_terms_in_desc',
                          'lemmatized_terms_in_title', 'lemmatized_terms_in_desc',
                          'neighbours_in_title', 'neighbours_in_desc', 'search_terms_entropy',
                          'title_entropy', 'jaccard_index_title', 'jaccard_index_desc', 'lcs_title',
                          'lcs_desc', 'jscore_query_desc', 'jscore_query_title', 'search_title_SW',
                          'search_desc_SW', 'NCD_query_title', 'num_words_in_description', 'num_stop_words',
                          'num_search_words', 'tfidf_search_common', 'num_attrib']]

In [191]:
# BEWARE : this takes ~17m to run
rf_models = [Pipeline([('rf_reg', RandomForestRegressor())])]

grid_params_rf = [{'rf_reg__n_estimators': [143],
                   'rf_reg__max_features': ['log2'],
                   'rf_reg__max_depth': [9],
                   'rf_reg__min_samples_split': [5]}]
best_models_rf = []
for model in zip(rf_models, grid_params_rf):
    gs = GridSearchCV(estimator=model[0],
                      param_grid=model[1],
                      scoring='neg_mean_squared_error',
                      cv=5)
    if type(y_train) != np.ndarray:
        y = y_train.values.ravel()
        y_train = np.array(y).astype(float)
    gs.fit(X_train, y_train.ravel())
    best_models_rf.append(
        (sqrt(-1 * gs.best_score_), gs.best_params_, model[0]))

print(best_models_rf)

[(0.4829405264940038, {'rf_reg__max_depth': 9, 'rf_reg__max_features': 'log2', 'rf_reg__min_samples_split': 5, 'rf_reg__n_estimators': 143}, Pipeline(memory=None,
     steps=[('rf_reg', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]))]


In [192]:
best_rf_model = change_best_params_keys(best_models_rf, 
                                        ['rf_reg__n_estimators', 'rf_reg__max_features', 
                                         'rf_reg__max_depth', 'rf_reg__min_samples_split'],
                                        ['n_estimators', 'max_features', 'max_depth', 'min_samples_split'])

In [193]:
best_rf_model = best_rf_model[2].steps[0][1].__class__(
    **best_rf_model[1])
best_rf_model.fit(X_train, y_train)
rf_predicted = best_rf_model.predict(X_test)
print(rf_predicted[:5])

[1.94516864 1.97368064 2.14306728 2.52168675 2.45696723]


In [38]:
print(rf_predicted.shape)

(166693,)


In [194]:
output = 'id,relevance\n'
predictions = '\n'.join([str(test['id'][idx]) + ',' +str(prediction) for idx, prediction in enumerate(rf_predicted)])
output += predictions
    
prediction_file = open('home_depot_search_relevancy_test_predictions.csv', 'w')
prediction_file.write(output)
prediction_file.close()