In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
df_train = pd.read_csv('train.csv', encoding="ISO-8859-1")

In [4]:
df_test = pd.read_csv('test.csv', encoding="ISO-8859-1")

In [5]:
from nltk.stem.snowball import SnowballStemmer 

In [6]:
stemmer = SnowballStemmer('english')

In [7]:
%%time
df_pro_desc = pd.read_csv('product_descriptions.csv')

num_train = df_train.shape[0]

def str_stemmer(s):
    return " ".join([stemmer.stem(word) for word in s.lower().split()])

def str_common_word(str1, str2):
    return sum(int(str2.find(word)>=0) for word in str1.split())


df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)

df_all = pd.merge(df_all, df_pro_desc, how='left', on='product_uid')

df_all['search_term'] = df_all['search_term'].map(lambda x:str_stemmer(x))
df_all['product_title'] = df_all['product_title'].map(lambda x:str_stemmer(x))
df_all['product_description'] = df_all['product_description'].map(lambda x:str_stemmer(x))

CPU times: user 11min 46s, sys: 13.1 s, total: 11min 59s
Wall time: 13min 16s


In [8]:
df_all['len_of_query'] = df_all['search_term'].map(lambda x:len(x.split())).astype(np.int64)

df_all['product_info'] = df_all['search_term']+"\t"+df_all['product_title']+"\t"+df_all['product_description']

df_all['word_in_title'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[1]))
df_all['word_in_description'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[2]))

In [31]:
df_attr = pd.read_csv('attributes.csv')
df_brand = df_attr[df_attr.name == "MFG Brand Name"][["product_uid", "value"]].rename(columns={"value": "brand"})


In [34]:
df_all = pd.merge(df_all, df_brand, how='left', on='product_uid')

In [41]:
df_all['brand'] = df_all['brand'].fillna('unbranded')

In [42]:
%%time
df_all['brand'] = df_all['brand'].map(lambda x:str_stemmer(x))

UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 8: ordinal not in range(128)

In [30]:
df_all['len_of_title'] = df_all['product_title'].map(lambda x:len(x.split())).astype(np.int64)
df_all['len_of_description'] = df_all['product_description'].map(lambda x:len(x.split())).astype(np.int64)
df_all['len_of_brand'] = df_all['brand'].map(lambda x:len(x.split())).astype(np.int64)

KeyError: 'brand'

In [9]:
def jackard_similarity_index(x, y):
    x = set(x)
    y = set(y)
    z = x.intersection(y)
    return len(z)/float(len(x)+len(y)-len(z))

In [10]:
x = df_all['search_term'][0]
y = df_all['product_title'][0]

In [11]:
%%time
df_all['jackard_similarity'] = df_all.apply(lambda x: jackard_similarity_index(x['search_term'], x['product_title']), axis=1)

CPU times: user 6.75 s, sys: 194 ms, total: 6.94 s
Wall time: 7.43 s


In [12]:
%%time
df_new = df_all.drop(['search_term','product_title','product_description','product_info'],axis=1)

CPU times: user 3 ms, sys: 15 ms, total: 18 ms
Wall time: 33.4 ms


In [13]:
%%time
df_train = df_new.iloc[:num_train]
df_test = df_new.iloc[num_train:]
id_test = df_test['id']

y_train = df_train['relevance'].values
X_train = df_train.drop(['id','relevance'],axis=1).values
X_test = df_test.drop(['id','relevance'],axis=1).values

CPU times: user 20 ms, sys: 14 ms, total: 34 ms
Wall time: 42.4 ms


In [29]:
df_train

Unnamed: 0,id,product_uid,relevance,len_of_query,word_in_title,word_in_description,jackard_similarity
0,2,100001,3.00,2,1,1,0.350000
1,3,100001,2.50,2,1,1,0.250000
2,9,100002,3.00,2,1,1,0.296296
3,16,100005,2.33,3,1,1,0.478261
4,17,100005,2.67,3,3,2,0.652174
5,18,100006,3.00,2,1,2,0.347826
6,20,100006,2.67,3,2,2,0.521739
7,21,100006,3.00,1,1,1,0.347826
8,23,100007,2.67,2,2,2,0.409091
9,27,100009,3.00,2,2,2,0.318182


#Training a random forest

In [14]:
%%time
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('submission.csv',index=False)

CPU times: user 13.8 s, sys: 157 ms, total: 14 s
Wall time: 15 s


#Training a extreme gradient boosted tree

In [15]:
import xgboost as xgb

In [16]:
from sklearn.cross_validation import train_test_split

In [17]:
XTrain, XTest, yTrain, yTest = train_test_split(X_train, y_train, test_size=0.25)

In [18]:
dTrain = xgb.DMatrix(XTrain, label=yTrain)
dTest = xgb.DMatrix(XTest, label=yTest)
param = {'bst:max_depth':6, 'bst:eta':0.1, 'silent':0, 'objective':'reg:linear' }
param['nthread'] = 4
evallist  = [(dTest,'eval'), (dTrain,'train')]

In [19]:
num_round=200

In [20]:
bst = xgb.train( param, dTrain, num_round, evallist )

[0]	eval-rmse:1.774484	train-rmse:1.772379
[1]	eval-rmse:1.610963	train-rmse:1.608918
[2]	eval-rmse:1.465164	train-rmse:1.463176
[3]	eval-rmse:1.335447	train-rmse:1.333421
[4]	eval-rmse:1.220322	train-rmse:1.218213
[5]	eval-rmse:1.118354	train-rmse:1.116156
[6]	eval-rmse:1.028357	train-rmse:1.026053
[7]	eval-rmse:0.949180	train-rmse:0.946753
[8]	eval-rmse:0.879850	train-rmse:0.877205
[9]	eval-rmse:0.819323	train-rmse:0.816514
[10]	eval-rmse:0.766741	train-rmse:0.763651
[11]	eval-rmse:0.721296	train-rmse:0.717929
[12]	eval-rmse:0.682246	train-rmse:0.678548
[13]	eval-rmse:0.648910	train-rmse:0.644912
[14]	eval-rmse:0.620470	train-rmse:0.616148
[15]	eval-rmse:0.596412	train-rmse:0.591773
[16]	eval-rmse:0.576215	train-rmse:0.571299
[17]	eval-rmse:0.559225	train-rmse:0.554065
[18]	eval-rmse:0.545086	train-rmse:0.539698
[19]	eval-rmse:0.533179	train-rmse:0.527540
[20]	eval-rmse:0.523371	train-rmse:0.517496
[21]	eval-rmse:0.515209	train-rmse:0.509138
[22]	eval-rmse:0.508641	train-rmse:0.50229

In [21]:
pred = bst.predict(xgb.DMatrix(X_test))
pred[pred>3] = 3
pred[pred<1] = 1

In [22]:
pd.DataFrame({"id": id_test, "relevance": pred}).to_csv('submission_xgb_02_29_2016.csv',index=False)

In [27]:
avg_pred = (y_pred+pred)/2

In [28]:
pd.DataFrame({"id": id_test, "relevance": pred}).to_csv('submission_xgb_rfr_averaged.csv',index=False)