In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
# Read the four input tables. train/test are decoded with an explicit
# ISO-8859-1 encoding; attributes and descriptions use the pandas default.
_latin1_options = {"encoding": "ISO-8859-1"}
train_data, test_data = (
    pd.read_csv(csv_path, **_latin1_options)
    for csv_path in ("input/train.csv", "input/test.csv")
)
att_data, descriptions = (
    pd.read_csv(csv_path)
    for csv_path in ("input/attributes.csv", "input/product_descriptions.csv")
)

In [3]:
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
stop = stopwords.words('english')
# Hashed copy of the stop-word list: stm() tests every token for membership,
# and testing against the list itself would scan it linearly each time.
_stop_lookup = frozenset(stop)
stemmer = SnowballStemmer('english')
def stm(s):
    """Return `s` with stop words removed and the remaining words stemmed.

    `s` is converted with str() first, so non-string values are accepted.
    Tokens come from whitespace splitting. The stop-word test is an exact
    (case-sensitive) comparison on the token as written, before stemming.
    The surviving stems are joined back together with single spaces.
    """
    return ' '.join(stemmer.stem(word) for word in str(s).split() if word not in _stop_lookup)

In [4]:
# Normalise the free-text columns with stm() (stop-word removal + stemming).
train_data['search_term'] = train_data['search_term'].map(stm)
descriptions['product_description'] = descriptions['product_description'].map(stm)


def _attribute_values(attribute_name):
    """Return the (product_uid, value) rows of att_data for one attribute name.

    Only the first row per product_uid is kept: a product listed more than
    once for the same attribute would otherwise multiply rows when the table
    is left-merged onto train/test on product_uid.
    """
    # Single .loc selection instead of chained df[cols][mask] indexing.
    selected = att_data.loc[att_data['name'] == attribute_name, ['product_uid', 'value']]
    return selected.drop_duplicates(subset='product_uid')


# NOTE(review): these values are taken as-is, i.e. they are not passed through
# stm() the way search_term is - confirm that is intended.
brands = _attribute_values('MFG Brand Name')
materials = _attribute_values('Material')
functionalities = _attribute_values('Functionality')

In [5]:
# Stem every attribute value in place (stm() always returns a str).
att_data['value'] = att_data['value'].map(stm)

# ser_att maps product_uid -> all of that product's attribute values joined by
# spaces. It is built with a single groupby pass instead of enlarging an empty
# Series one key at a time in a Python loop.
#   * sort=False keeps products in order of first appearance.
#   * The leading ' ' reproduces the string the former loop produced.
#   * Rows whose product_uid is missing are skipped (groupby drops NaN keys).
ser_att = ' ' + att_data.groupby('product_uid', sort=False)['value'].agg(' '.join)

In [6]:
# Count how many occurrences of the search terms appear in a string s.
def search_in_str(search, s):
    """Return the total number of occurrences in `s` of the terms in `search`.

    `search` is split on whitespace; each term's non-overlapping substring
    occurrences in `s` (str.count) are added together.
    """
    occurrences = 0
    for term in search.split():
        occurrences += s.count(term)
    return occurrences

# Count how many terms of the search string are present in a string s.
def count_search(search, s):
    """Return how many whitespace-separated terms of `search` occur in `s`.

    Each term counts at most once, however often it appears in `s`; a term
    repeated inside `search` is counted once per repetition.
    """
    hits = 0
    for term in search.split():
        # str.find returns -1 when the term is absent.
        if s.find(term) != -1:
            hits += 1
    return hits

In [7]:
# Name the value column of each attribute table after what it holds.
brands.columns = ['product_uid', 'brand']
materials.columns = ['product_uid', 'material']
functionalities.columns = ['product_uid', 'functionality']

# Left-join brand, material, functionality and description onto each row.
for lookup in (brands, materials, functionalities, descriptions):
    train_data = train_data.merge(lookup, how='left', on='product_uid')

# (feature suffix, source column) pairs whose text goes through str() before
# matching; str() turns a missing (NaN) entry into the text 'nan'.
stringified_sources = [
    ('brand', 'brand'),
    ('mat', 'material'),
    ('fun', 'functionality'),
    ('desc', 'product_description'),
]

# search_in_*: total occurrences of the search terms in each text field.
# The title is passed to the matcher as-is; the other fields are stringified.
train_data['search_in_title'] = [
    search_in_str(query, title)
    for query, title in zip(train_data['search_term'], train_data['product_title'])
]
for suffix, column in stringified_sources:
    train_data['search_in_' + suffix] = [
        search_in_str(str(query), str(text))
        for query, text in zip(train_data['search_term'], train_data[column])
    ]
# Joined attribute text for the row's product ('' when ser_att has no entry).
train_data['attr'] = train_data['product_uid'].map(lambda uid: ser_att.get(uid, ''))
train_data['search_in_att'] = [
    search_in_str(str(query), str(text))
    for query, text in zip(train_data['search_term'], train_data['attr'])
]

# c_search_in_*: number of search terms found at least once in each field.
train_data['c_search_in_title'] = [
    count_search(query, title)
    for query, title in zip(train_data['search_term'], train_data['product_title'])
]
for suffix, column in stringified_sources + [('att', 'attr')]:
    train_data['c_search_in_' + suffix] = [
        count_search(str(query), str(text))
        for query, text in zip(train_data['search_term'], train_data[column])
    ]

#del train_data['attr']
#del train_data['product_title']
#del train_data['brand']
#del train_data['product_description']

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_squared_error
try:
    # scikit-learn >= 0.18 provides cross_val_score in model_selection; the
    # legacy sklearn.cross_validation module was removed in 0.20. The module is
    # bound to the old name so `cross_validation.cross_val_score` keeps working.
    from sklearn import model_selection as cross_validation
except ImportError:
    from sklearn import cross_validation
import math

# Explicit negated-MSE scorer. It replaces the 'mean_squared_error' scoring
# string, which newer scikit-learn releases no longer accept. Scores are
# returned as -MSE, hence the sign flip before the square root below.
neg_mse = make_scorer(mean_squared_error, greater_is_better=False)

rf = RandomForestRegressor(n_estimators=15, max_depth=6)

# Occurrence-count features (search_in_*).
predictors=['search_in_title','search_in_brand','search_in_desc','search_in_att','search_in_mat','search_in_fun']
scores = cross_validation.cross_val_score(
    rf, train_data[predictors], train_data['relevance'], cv=5, scoring=neg_mse)
# Mean RMSE over the 5 folds.
print(np.mean([math.sqrt(-x) for x in scores]))

0.522222236308


In [9]:
from sklearn.metrics import make_scorer, mean_squared_error

c_rf = RandomForestRegressor(n_estimators=15, max_depth=6)

# Matched-term-count features (c_search_in_*).
c_predictors=['c_search_in_title','c_search_in_brand','c_search_in_desc','c_search_in_att','c_search_in_mat','c_search_in_fun']
# Evaluate c_rf, the estimator created for this feature set (previously `rf`
# was passed here and c_rf was never used). The explicit scorer returns -MSE
# and does not depend on which scoring strings the installed scikit-learn
# accepts. `cross_validation` and `math` come from the preceding
# cross-validation cell.
c_scores = cross_validation.cross_val_score(
    c_rf, train_data[c_predictors], train_data['relevance'], cv=5,
    scoring=make_scorer(mean_squared_error, greater_is_better=False))
# Mean RMSE over the 5 folds.
print(np.mean([math.sqrt(-x) for x in c_scores]))

0.52135979632


In [10]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import make_scorer, mean_squared_error

# AdaBoost over depth-6 regression trees, scored on the c_search_in_* features.
abr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=6), n_estimators=15)
# Explicit negated-MSE scorer instead of the 'mean_squared_error' string, which
# newer scikit-learn releases reject. `cross_validation` and `math` come from
# the first cross-validation cell.
abr_scores = cross_validation.cross_val_score(
    abr, train_data[c_predictors], train_data['relevance'], cv=5,
    scoring=make_scorer(mean_squared_error, greater_is_better=False))
# Mean RMSE over the 5 folds.
print(np.mean([math.sqrt(-x) for x in abr_scores]))

0.527469060161


In [11]:
# Re-read the raw test set and stem its search terms.
test_data = pd.read_csv("input/test.csv", encoding="ISO-8859-1")
test_data['search_term'] = test_data['search_term'].map(lambda term: stm(term))

# Left-join brand, material, functionality and description onto each row.
# The attribute tables must already carry their renamed value columns.
for lookup in (brands, materials, functionalities, descriptions):
    test_data = test_data.merge(lookup, how='left', on='product_uid')

# search_in_*: total occurrences of the search terms in each text field.
# The title is passed to the matcher as-is; the other fields go through str().
test_data['search_in_title'] = [
    search_in_str(query, title)
    for query, title in zip(test_data['search_term'], test_data['product_title'])
]
for suffix, column in [('brand', 'brand'), ('mat', 'material'),
                       ('fun', 'functionality'), ('desc', 'product_description')]:
    test_data['search_in_' + suffix] = [
        search_in_str(str(query), str(text))
        for query, text in zip(test_data['search_term'], test_data[column])
    ]
# Joined attribute text for the row's product ('' when ser_att has no entry).
test_data['attr'] = test_data['product_uid'].map(lambda uid: ser_att.get(uid, ''))
test_data['search_in_att'] = [
    search_in_str(str(query), str(text))
    for query, text in zip(test_data['search_term'], test_data['attr'])
]

In [12]:
# Design matrices for the occurrence-count (search_in_*) features.
id_test = test_data['id']
X_train = train_data[predictors].values
X_test = test_data[predictors].values
y_train = train_data['relevance'].values

from sklearn.ensemble import RandomForestRegressor
# NOTE(review): the cross-validated forest used max_depth=6 while this one
# uses max_depth=3 - confirm the difference is intended.
rf_for_sub = RandomForestRegressor(max_depth=3, n_estimators=15)
y_pred = rf_for_sub.fit(X_train, y_train).predict(X_test)

# One row per test id with its predicted relevance.
submission = pd.DataFrame({"id": id_test, "relevance": y_pred})
submission.to_csv('submission.csv', index=False)

This submission (random forest on the `search_in_*` occurrence-count features) scored 0.52235 on Kaggle.

In [28]:
# c_search_in_*: number of search terms found at least once in each field.
# The title is passed to the matcher as-is; the other fields go through str().
test_data['c_search_in_title'] = [
    count_search(query, title)
    for query, title in zip(test_data['search_term'], test_data['product_title'])
]
for suffix, column in [('brand', 'brand'), ('mat', 'material'),
                       ('fun', 'functionality'), ('desc', 'product_description'),
                       ('att', 'attr')]:
    test_data['c_search_in_' + suffix] = [
        count_search(str(query), str(text))
        for query, text in zip(test_data['search_term'], test_data[column])
    ]

In [29]:
# Design matrices for the matched-term-count (c_search_in_*) features.
id_test = test_data['id']
X_train = train_data[c_predictors].values
X_test = test_data[c_predictors].values
y_train = train_data['relevance'].values

from sklearn.ensemble import RandomForestRegressor
c_rf_for_sub = RandomForestRegressor(max_depth=3, n_estimators=15)
y_pred = c_rf_for_sub.fit(X_train, y_train).predict(X_test)

# NOTE(review): this writes to the same 'submission.csv' path as the earlier
# submission cell, so that file is replaced.
submission = pd.DataFrame({"id": id_test, "relevance": y_pred})
submission.to_csv('submission.csv', index=False)

This submission (random forest on the `c_search_in_*` matched-term-count features) scored 0.52168 on Kaggle.