In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from nltk.stem.snowball import SnowballStemmer

# One English Snowball stemmer instance, shared by str_stemmer below.
stemmer = SnowballStemmer('english')

# train.csv and test.csv are both decoded as ISO-8859-1.
df_train, df_test = (
	pd.read_csv(csv_name, encoding="ISO-8859-1")
	for csv_name in ('train.csv', 'test.csv')
)
# attributes.csv is not loaded in this notebook:
# df_attr = pd.read_csv('../input/attributes.csv')
# Product descriptions are read with pandas' default encoding.
df_pro_desc = pd.read_csv('product_descriptions.csv')
# Number of training rows; used later to split the combined frame back apart.
num_train = len(df_train)

In [2]:
def str_stemmer(s):
	"""Return ``s`` lower-cased, with every whitespace-separated token stemmed.

	The string is lower-cased, split on whitespace, each token is passed
	through the module-level ``stemmer``, and the stems are re-joined with
	single spaces.
	"""
	stemmed_tokens = map(stemmer.stem, s.lower().split())
	return " ".join(stemmed_tokens)
def str_common_word(str1, str2):
	"""Count the whitespace-separated tokens of ``str1`` that occur in ``str2``.

	Matching is by substring (``str2.find``), not by whole word, so a token
	such as ``"l"`` matches any ``str2`` containing that letter. A token that
	appears several times in ``str1`` is counted each time.

	Returns an int; 0 when ``str1`` has no tokens.
	"""
	hits = 0
	for token in str1.split():
		if str2.find(token) != -1:
			hits += 1
	return hits


In [3]:
# Stack train on top of test (fresh 0..n-1 index) so that text features are
# engineered once for both sets.
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
rows_before_merge = len(df_all)

# Left join: every train/test row is kept and gains its product description.
df_all = pd.merge(df_all, df_pro_desc, how='left', on='product_uid')

# The frame is later split back into train/test by position (first num_train
# rows), so the merge must not have added rows. A left join only adds rows
# when the right side has duplicate keys.
assert len(df_all) == rows_before_merge, (
	"merge on product_uid changed the row count; product_descriptions.csv "
	"must have at most one row per product_uid"
)


In [4]:
# Row count of the combined train+test frame after the merge.
df_all.shape[0]

240760

In [5]:
# Preview rows 1 through 9 (by position) of the merged frame.
df_all.iloc[1:10]

Unnamed: 0,id,product_title,product_uid,relevance,search_term,product_description
1,3,Simpson Strong-Tie 12-Gauge Angle,100001,2.5,l bracket,"Not only do angles make joints stronger, they ..."
2,9,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,100002,3.0,deck over,BEHR Premium Textured DECKOVER is an innovativ...
3,16,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.33,rain shower head,Update your bathroom with the Delta Vero Singl...
4,17,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.67,shower only faucet,Update your bathroom with the Delta Vero Singl...
5,18,Whirlpool 1.9 cu. ft. Over the Range Convectio...,100006,3.0,convection otr,Achieving delicious results is almost effortle...
6,20,Whirlpool 1.9 cu. ft. Over the Range Convectio...,100006,2.67,microwave over stove,Achieving delicious results is almost effortle...
7,21,Whirlpool 1.9 cu. ft. Over the Range Convectio...,100006,3.0,microwaves,Achieving delicious results is almost effortle...
8,23,Lithonia Lighting Quantum 2-Light Black LED Em...,100007,2.67,emergency light,The Quantum Adjustable 2-Light LED Black Emerg...
9,27,House of Fara 3/4 in. x 3 in. x 8 ft. MDF Flut...,100009,3.0,mdf 3/4,Get the House of Fara 3/4 in. x 3 in. x 8 ft. ...


In [6]:
# Pre-processing: replace each free-text column with its lower-cased, stemmed form.
for text_col in ('search_term', 'product_title', 'product_description'):
	df_all[text_col] = df_all[text_col].map(str_stemmer)

In [7]:
# Feature 1: number of whitespace-separated tokens in the search query.
df_all['len_of_query'] = df_all['search_term'].map(lambda query: len(query.split())).astype(np.int64)

# Pack query, title and description into one tab-delimited string per row.
df_all['product_info'] = df_all['search_term'] + "\t" + df_all['product_title'] + "\t" + df_all['product_description']

# Split the packed string once per row: [query, title, description].
info_fields = df_all['product_info'].map(lambda info: info.split('\t'))

# Features 2 and 3: how many query tokens occur in the title / in the description.
df_all['word_in_title'] = info_fields.map(lambda fields: str_common_word(fields[0], fields[1]))
df_all['word_in_description'] = info_fields.map(lambda fields: str_common_word(fields[0], fields[2]))
df_all.iloc[1:10]

Unnamed: 0,id,product_title,product_uid,relevance,search_term,product_description,len_of_query,product_info,word_in_title,word_in_description
1,3,simpson strong-ti 12-gaug angl,100001,2.5,l bracket,"not onli do angl make joint stronger, they als...",2,l bracket\tsimpson strong-ti 12-gaug angl\tnot...,1,1
2,9,behr premium textur deckov 1-gal. #sc-141 tugb...,100002,3.0,deck over,behr premium textur deckov is an innov solid c...,2,deck over\tbehr premium textur deckov 1-gal. #...,1,1
3,16,delta vero 1-handl shower onli faucet trim kit...,100005,2.33,rain shower head,updat your bathroom with the delta vero single...,3,rain shower head\tdelta vero 1-handl shower on...,1,1
4,17,delta vero 1-handl shower onli faucet trim kit...,100005,2.67,shower onli faucet,updat your bathroom with the delta vero single...,3,shower onli faucet\tdelta vero 1-handl shower ...,3,2
5,18,whirlpool 1.9 cu. ft. over the rang convect mi...,100006,3.0,convect otr,achiev delici result is almost effortless with...,2,convect otr\twhirlpool 1.9 cu. ft. over the ra...,1,2
6,20,whirlpool 1.9 cu. ft. over the rang convect mi...,100006,2.67,microwav over stove,achiev delici result is almost effortless with...,3,microwav over stove\twhirlpool 1.9 cu. ft. ove...,2,2
7,21,whirlpool 1.9 cu. ft. over the rang convect mi...,100006,3.0,microwav,achiev delici result is almost effortless with...,1,microwav\twhirlpool 1.9 cu. ft. over the rang ...,1,1
8,23,lithonia light quantum 2-light black led emerg...,100007,2.67,emerg light,the quantum adjust 2-light led black emerg lig...,2,emerg light\tlithonia light quantum 2-light bl...,2,2
9,27,hous of fara 3/4 in. x 3 in. x 8 ft. mdf flute...,100009,3.0,mdf 3/4,get the hous of fara 3/4 in. x 3 in. x 8 ft. m...,2,mdf 3/4\thous of fara 3/4 in. x 3 in. x 8 ft. ...,2,2


In [8]:
# Only the numeric columns are kept from here on; the text columns and the
# packed helper column are dropped.
text_columns = ['search_term', 'product_title', 'product_description', 'product_info']
df_all = df_all.drop(text_columns, axis=1)
df_all.iloc[1:10]

Unnamed: 0,id,product_uid,relevance,len_of_query,word_in_title,word_in_description
1,3,100001,2.5,2,1,1
2,9,100002,3.0,2,1,1
3,16,100005,2.33,3,1,1
4,17,100005,2.67,3,3,2
5,18,100006,3.0,2,1,2
6,20,100006,2.67,3,2,2
7,21,100006,3.0,1,1,1
8,23,100007,2.67,2,2,2
9,27,100009,3.0,2,2,2


In [9]:
# The first num_train rows form the training set; the remaining rows form the test set.
df_train, df_test = df_all.iloc[:num_train], df_all.iloc[num_train:]
id_test = df_test['id']

# 'id' and 'relevance' are excluded from the feature matrices;
# 'relevance' is the training target.
non_feature_cols = ['id', 'relevance']
y_train = df_train['relevance'].values
X_train = df_train.drop(non_feature_cols, axis=1).values
X_test = df_test.drop(non_feature_cols, axis=1).values

In [10]:
# Base learner: a random forest of 15 trees, each limited to depth 6.
rf = RandomForestRegressor(
	n_estimators=15,
	max_depth=6,
	random_state=0,
)
# Bagging ensemble of 45 such forests, with max_samples=0.1; seeded for repeatability.
clf = BaggingRegressor(
	rf,
	n_estimators=45,
	max_samples=0.1,
	random_state=25,
)
# fit() returns the fitted estimator, so fitting and predicting can be chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)

In [11]:
# Pair each test id with its predicted relevance and write the result without the index column.
submission = pd.DataFrame({"id": id_test, "relevance": y_pred})
submission.to_csv('submission.csv', index=False)

In [12]:
# Print the test-set predictions for a quick visual check.
print(y_pred)

[ 2.07215887  2.22417453  2.22417453 ...,  2.12862001  2.42032514
  2.31581996]


In [13]:
# Number of predictions produced for the test set.
y_pred.shape[0]

166693

In [14]:
# Load the solution file and attach the predictions by row position.
# NOTE(review): this assumes solution.csv rows are in the same order as the
# rows that produced y_pred — confirm.
df_sol = pd.read_csv('solution.csv', encoding="ISO-8859-1")
df_sol['pred'] = y_pred

# Keep only the rows whose Usage is "Public".
is_public = df_sol['Usage'] == "Public"
df_fliter = df_sol[is_public]

In [15]:
from sklearn.metrics import mean_squared_error, make_scorer

# RMSE on the "Public" rows: mean squared error (default weighting and
# averaging), then its square root as the cell's displayed value.
mse = mean_squared_error(df_fliter['relevance'], df_fliter['pred'])
np.sqrt(mse)

0.4872118744835135