## Imports

In [1]:
# Data
import pandas as pd
import numpy as np

In [2]:
# Misc
import os
import re
from pprint import pprint as pp

In [3]:
# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
%matplotlib inline

In [4]:
# NLP
import spacy

In [125]:
# ML
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

## Config

In [6]:
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): this hides a warning, not an error display; re-enable it when
# debugging unexpected values after column assignments.
pd.options.mode.chained_assignment = None

# Data Path
# Directory that holds the pickled columns. Set the HOME_DEPOT_DATA_PATH
# environment variable to point the notebook at another machine's copy; the
# default is the original author's local checkout.
# Earlier location: "/Users/alastairhamilton/Documents/Kag-home_depot-data/"
data_path = os.environ.get(
    "HOME_DEPOT_DATA_PATH",
    "/Users/alastairhamilton/Dropbox/Development/kaggle/competitions/KAG_home-depot/data/",
)
# Cells below build file names with `data_path + name`, so make sure the
# directory always ends with a path separator.
data_path = os.path.join(data_path, "")

# NLP Model
# Tagger-only pipeline (parser and NER disabled).
# NOTE(review): the 'en' shortcut name only resolves on spaCy v2 with a linked
# model; spaCy v3 needs a full package name such as 'en_core_web_sm' - confirm
# against the installed version.
nlp_tag = spacy.load('en', disable=['parser', 'ner'])

## Functions

In [7]:
def read_pickled_col(fp, f_stem):
    """Load every pickled chunk of one column and stack them into a DataFrame.

    Parameters
    ----------
    fp : str
        Directory containing the pickle files (with or without a trailing
        path separator).
    f_stem : str
        Substring identifying the column's files, e.g. 'product_description'
        matches 'product_description1.pickle', 'product_description2.pickle'.

    Returns
    -------
    pandas.DataFrame
        The chunks concatenated row-wise with their original index preserved.
        An empty DataFrame when no file matches.

    Notes
    -----
    Files are read in sorted name order so the result does not depend on the
    arbitrary ordering of ``os.listdir``.
    Only use this on trusted files: unpickling can execute arbitrary code.
    """
    names = sorted(x for x in os.listdir(fp) if (f_stem in x and ".pickle" in x))

    frames = []
    for i, name in enumerate(names, start=1):
        print("Reading file {} of {}...".format(i, len(names)))
        # Passing the path lets pandas open *and close* the file itself.
        frames.append(pd.read_pickle(os.path.join(fp, name)))

    if not frames:
        # pd.concat refuses an empty list; keep the "empty frame" contract.
        return pd.DataFrame()
    # One concat at the end instead of growing a frame inside the loop.
    return pd.concat(frames)

In [8]:
def rmv_stoppunc(s):
    """Drop stop-word and punctuation tokens from every document in a Series.

    Each element of ``s`` is an iterable of tokens exposing ``is_stop`` and
    ``is_punct``; the result is a Series of tuples holding the surviving
    tokens in their original order.
    """
    def _content_tokens(doc):
        return tuple(tok for tok in doc if not tok.is_stop and not tok.is_punct)

    return s.apply(_content_tokens)

In [9]:
def rmv_punc(df, col):
    """Strip punctuation tokens from one column of token sequences.

    The column ``col`` of ``df`` is overwritten in place with tuples of the
    tokens whose ``is_punct`` flag is false; the same ``df`` object is
    returned so the call can be nested.
    """
    def _without_punct(doc):
        return tuple(tok for tok in doc if not tok.is_punct)

    cleaned = df.loc[:, col].apply(_without_punct)
    df.loc[:, col] = cleaned
    return df

In [10]:
def func_2series(s1, s2, func):
    """Apply a two-argument function to index-aligned pairs from two Series.

    Parameters
    ----------
    s1, s2 : pandas.Series
        Inputs; they are placed side by side with ``pd.concat(axis=1)``, so
        values are paired by index label.
    func : callable
        Called as ``func(value_from_s1, value_from_s2)`` for every row.

    Returns
    -------
    pandas.Series
        One result per row of the aligned pair.
    """
    paired = pd.concat([s1, s2], axis=1)
    # Use positional access: the column labels are the Series' names, so
    # row[0]/row[1] would be a *label* lookup (wrong order if a Series is
    # named 0 or 1) and a deprecated positional fallback otherwise.
    return paired.apply(lambda row: func(row.iloc[0], row.iloc[1]), axis=1)

In [82]:
def common_words_doc(doc1, doc2):
    """Count the tokens of ``doc1`` whose lemma occurs inside a ``doc2`` lemma.

    A ``doc1`` token scores one point as soon as its ``lemma_`` is found as a
    substring of any ``doc2`` token's ``lemma_`` (so 'bracket' matches
    'brackets'). Each ``doc1`` token is counted at most once; repeated tokens
    in ``doc1`` are counted separately.

    An earlier variant required exact lemma equality; substring containment
    is the rule in use.
    """
    def _found_in_doc2(lemma):
        return any(lemma in other.lemma_ for other in doc2)

    return sum(1 for word in doc1 if _found_in_doc2(word.lemma_))

## Import Data

In [12]:
# Get UID and relevance
# Base table that every feature column below is attached to.
# NOTE: read_pickle must only be used on trusted files (unpickling can run code).
data = pd.read_pickle(data_path + 'uid_relevance.pickle')

In [13]:
# Sanity check: number of rows, then a look at the first row.
print(len(data))
data.head(1)

74067


Unnamed: 0,index,relevance
0,0,3.0


In [14]:
# Show which pickled column chunks are available in the data directory.
os.listdir(data_path)

['product_description4.pickle',
 'attributes.csv.zip',
 'uid_relevance.pickle',
 '.DS_Store',
 'attribute5.pickle',
 'attribute1.pickle',
 'product_description2.pickle',
 'attribute3.pickle',
 'search_term.pickle',
 'attribute4.pickle',
 'product_description3.pickle',
 'test.csv.zip',
 'train.csv.zip',
 'product_descriptions.csv.zip',
 'product_title.pickle',
 'sample_submission.csv.zip',
 'product_description1.pickle',
 'attribute2.pickle']

In [15]:
# Load the pickled search-term column and strip punctuation tokens.
# Stop words are kept: rmv_punc is applied here, not rmv_stoppunc.
print('Reading in searches...')
searches = rmv_punc(read_pickled_col(data_path, 'search_term'), 'search_term')
print('Done!')

Reading in searches...
Reading file 1 of 1...
Done!


## Feature Generation

In [16]:
# Feature: query length, i.e. the number of tokens left in each search term
# after punctuation removal.
data['q_len'] = searches['search_term'].apply(len)

In [84]:
# Feature: number of search-term tokens whose lemma is found in the product title.

# Load the pickled title column and strip punctuation tokens.
# Stop words are kept: rmv_punc is applied here, not rmv_stoppunc.
print('Reading in titles...')
titles = rmv_punc(read_pickled_col(data_path, 'product_title'), 'product_title')

# Pair each search with its title and count the matching tokens.
data['com_title'] = func_2series(searches['search_term'], titles['product_title'], common_words_doc)

# Drop the reference to the parsed titles once the feature is computed.
del titles
print('Done!')

Reading in titles...
Reading file 1 of 1...
Done!


In [85]:
# Feature: number of search-term tokens whose lemma is found in the product description.

# Load the pickled description chunks and strip punctuation tokens.
# Stop words are kept: rmv_punc is applied here, not rmv_stoppunc.
print('Reading in descriptions...')
descriptions = rmv_punc(read_pickled_col(data_path, 'product_description'), 'product_description')

# Pair each search with its description and count the matching tokens.
data['com_desc'] = func_2series(searches['search_term'], descriptions['product_description'], common_words_doc)

# Drop the reference to the parsed descriptions once the feature is computed.
del descriptions
print('Done!')

Reading in descriptions...
Reading file 1 of 4...
Reading file 2 of 4...
Reading file 3 of 4...
Reading file 4 of 4...
Done!


In [86]:
# Feature: number of search-term tokens whose lemma is found in the product attributes.

# Load the pickled attribute chunks and strip punctuation tokens.
# Stop words are kept: rmv_punc is applied here, not rmv_stoppunc.
# Note the column is named 'Attributes' while the files use the stem 'attribute'.
print('Reading in attributes...')
attributes = rmv_punc(read_pickled_col(data_path, 'attribute'), 'Attributes')

# Pair each search with its attributes and count the matching tokens.
data['com_attr'] = func_2series(searches['search_term'], attributes['Attributes'], common_words_doc)

# Drop the reference to the parsed attributes once the feature is computed.
del attributes
print('Done!')

Reading in attributes...
Reading file 1 of 5...
Reading file 2 of 5...
Reading file 3 of 5...
Reading file 4 of 5...
Reading file 5 of 5...
Done!


In [None]:
# # Compound Noun matches


In [None]:
# # # Similarity score?
# data['in_title?'] = data['com_title'].apply(lambda x: 1 if x>0 else 0)
# data['in_desc?'] = data['com_desc'].apply(lambda x: 1 if x>0 else 0)
# data['in_attr?'] = data['com_attr'].apply(lambda x: 1 if x>0 else 0)
# data['in_all?'] = data[['in_title?','in_desc?','in_attr?']].apply(sum, axis=1)

In [116]:
# Preview the feature table (relevance plus the generated features).
data.head()

Unnamed: 0,index,relevance,q_len,com_title,com_desc,com_attr
0,0,3.0,2,1,1,1
1,1,2.5,2,1,1,1
2,2,3.0,2,2,2,2
3,3,2.33,3,1,1,3
4,4,2.67,3,3,3,3


In [126]:
# Build the modelling frame from the feature table so this cell works on a
# fresh kernel (previously `df` was never defined before this point).
# NOTE(review): the column selection mirrors the commented-out split in the
# Regression section - confirm these are the intended features.
df = data[['relevance', 'q_len', 'com_title', 'com_desc', 'com_attr']].copy()

# Collapse the relevance score to whole numbers.
# Python's round() rounds halves to the nearest even integer, so 2.5 -> 2.
df['relevance'] = df['relevance'].apply(round).apply(int)

## Regression

In [131]:
# Split
# Hold out 33% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
# Alternative kept for reference: split directly on `data` with an explicit
# feature list and the un-rounded relevance.
# X_train, X_test, y_train, y_test = train_test_split(data[['q_len','com_title','com_desc','com_attr']],
#                                                     data['relevance'],
#                                                     test_size=0.33, random_state=42)
# Active split: every column of `df` except the target is used as a feature.
X_train, X_test, y_train, y_test = train_test_split(df.drop('relevance',axis=1),
                                                    df['relevance'],
                                                    test_size=0.33, random_state=42)

In [136]:
# Gradient Boosted Tree Regressor Model
# `params` is also read by the deviance cells further down (n_estimators).
params = {'n_estimators':150, 'learning_rate': 0.4, 'loss': 'huber', 'alpha':0.9}
gbr = GradientBoostingRegressor(**params)

# Bagging Regressor Model (with random forest)
# `rf` is only the template passed to the bagging ensemble; despite its name,
# `clf` is a regressor.
rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)

# Random Forest Classifier
rf_cl = RandomForestClassifier(n_jobs=2, random_state=0)

# Chosen
# Only the estimator bound to `mdl` is fitted and evaluated below; the other
# candidates stay unfitted unless `mdl` is pointed at them.
mdl = clf

In [137]:
# Fit
# Train the chosen estimator on the training split.
mdl.fit(X_train, y_train)

BaggingRegressor(base_estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=0.1, n_estimators=45, n_jobs=1, oob_score=False,
         random_state=25, verbose=0, warm_start=False)

In [None]:
# Feature importances of the fitted model, largest first.
# `rf` itself is never fitted: BaggingRegressor fits clones of its base
# estimator, so the importances have to come from the fitted `mdl`.
if hasattr(mdl, 'feature_importances_'):
    # Estimators that expose importances directly (e.g. gbr, rf_cl).
    importance_values = mdl.feature_importances_
else:
    # Bagging ensemble: average over the fitted sub-estimators. This lines up
    # with X_train.columns as long as every sub-estimator sees all features
    # (max_features=1.0, bootstrap_features=False).
    importance_values = np.mean([est.feature_importances_ for est in mdl.estimators_], axis=0)

importances = pd.Series(data=importance_values, index=X_train.columns).sort_values(ascending=False)
importances.plot(kind='bar')

In [138]:
# Predictions...
# Score the held-out split; `pred` is reused by the metric and residual cells.
pred = mdl.predict(X_test)

## How'd we do?

In [139]:
# Metrics
# The format string has three fields; the former fourth argument
# (mdl.score(X_test, y_test)) was silently ignored by str.format while still
# costing an extra prediction pass, so it is no longer computed.
print('MSE: {:.4f}\nMAE: {:.4f}\nR2: {:.4f}'.format(
    mean_squared_error(y_test, pred),
    mean_absolute_error(y_test, pred),
    r2_score(y_test, pred),
))

MSE: 0.3313
MAE: 0.4876
R2: 0.1297


In [None]:
# Cross-validate the chosen estimator on the training split only
# (default folds and default scorer).
print('Cross Val Score')
print(cross_val_score(mdl, X_train, y_train))

In [None]:
# Deviance (over fitting?)
# Test-set loss after each boosting stage. Only applicable when `mdl` is the
# gradient-boosted model (set `mdl = gbr` above): staged_predict is a
# boosting-specific API that random forests and bagging ensembles lack.
# NOTE(review): `loss_` is not available in recent scikit-learn releases -
# confirm against the installed version.

test_score = np.zeros((params['n_estimators'],), dtype=np.float64)

for i, y_pred in enumerate(mdl.staged_predict(X_test)):
    # Score the prediction of *this* stage, not the final `pred`.
    test_score[i] = mdl.loss_(y_test, y_pred)

In [None]:
# Collect train and test deviance per boosting iteration (1-based) for plotting.
# NOTE(review): train_score_ is presumably only present when `mdl` is the
# gradient-boosted model - confirm before running with another estimator.
dev = pd.DataFrame({'Boosting_Iterations': np.arange(params['n_estimators'])+1,
              'Train_Deviance':mdl.train_score_,
              'Test_Deviance': test_score}).set_index('Boosting_Iterations')

In [None]:
# Train vs. test deviance curves; a widening gap suggests over-fitting.
dev.plot()

In [None]:
# Residuals as prediction minus truth (positive = over-prediction).
resids = pred - y_test.values

In [None]:
# Residuals next to their predictions, for the plots below.
# NOTE(review): this rebinds `df`, which the modelling cells above use for the
# feature frame; re-running those cells after this one will fail. Consider a
# distinct name.
df = pd.DataFrame({"resids":resids, "preds":pred})

In [None]:
# Distribution of the residuals.
df['resids'].hist(bins=5)

In [None]:
# Residuals against predictions, to look for structure the model missed.
df.plot(kind='scatter', x='preds', y='resids', style='o')

In [42]:
# Scratch comparison: load a second feature table to check against `data`.
# NOTE(review): moomin.csv does not appear in the directory listing shown
# earlier in this notebook - confirm where it comes from.
moomin = pd.read_csv(data_path+'moomin.csv')

In [45]:
# Discard the 'Unnamed: 0' column that came in with the CSV.
moomin = moomin.drop('Unnamed: 0', axis=1)

In [48]:
# Key both tables on 'index' so they can be compared row for row.
t1 = data.set_index('index')
m = moomin.set_index('index')

In [52]:
# Cast every column to float in one vectorised call (replaces the elementwise
# applymap, which is deprecated in recent pandas).
m = m.astype(float)

In [55]:
# Cast every column to float in one vectorised call (replaces the elementwise
# applymap, which is deprecated in recent pandas).
t1 = t1.astype(float)

In [65]:
# First rows of the comparison table.
m.head()

Unnamed: 0_level_0,relevance,query_len,word_in_title,word_in_description
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,3.0,2.0,1.0,1.0
1,2.5,2.0,1.0,1.0
2,3.0,2.0,1.0,1.0
3,2.33,3.0,1.0,1.0
4,2.67,3.0,3.0,2.0


In [67]:
# First rows of this notebook's features, to compare with `m` above.
t1.head()

Unnamed: 0_level_0,relevance,q_len,com_title,com_desc,com_attr
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,3.0,2.0,1.0,1.0,1.0
1,2.5,2.0,0.0,0.0,0.0
2,3.0,2.0,0.0,1.0,1.0
3,2.33,3.0,1.0,1.0,2.0
4,2.67,3.0,3.0,3.0,3.0


In [74]:
# Re-load the titles (deleted after feature generation) to inspect single rows.
titles = rmv_punc(read_pickled_col(data_path, 'product_title'), 'product_title')

Reading file 1 of 1...


In [77]:
# Spot check: tokens of the search term at index label 1.
searches.loc[1,'search_term']

(l, bracket)

In [79]:
# Spot check: tokens of the product title at index label 1.
titles.loc[1,'product_title']

(Simpson, Strong, Tie, 12-Gauge, Angle)