In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from nltk.stem.snowball import SnowballStemmer


# English Snowball stemmer shared by str_stemmer() below.
stemmer = SnowballStemmer('english')

# Directory holding the input CSV files. Kept in one place so the notebook can
# be pointed at another location by editing a single line instead of every
# read_csv call.
DATA_DIR = 'D:/datamining'

# train.csv / test.csv are read as ISO-8859-1; product_descriptions.csv is
# read with the pandas default encoding, exactly as before.
df_train = pd.read_csv(DATA_DIR + '/train.csv', encoding="ISO-8859-1")
df_test = pd.read_csv(DATA_DIR + '/test.csv', encoding="ISO-8859-1")
# df_attr = pd.read_csv('../input/attributes.csv')
df_pro_desc = pd.read_csv(DATA_DIR + '/product_descriptions.csv')
# Number of training rows; used later to split the combined frame back into
# its train and test parts.
num_train = df_train.shape[0]

In [2]:
def str_stemmer(s):
	"""Lower-case ``s``, stem every whitespace-separated word, and re-join.

	The words are stemmed with the module-level ``stemmer`` and joined back
	together with single spaces, so the result contains no tabs, newlines or
	repeated spaces.
	"""
	stems = map(stemmer.stem, s.lower().split())
	return " ".join(stems)

def str_common_word(str1, str2):
	"""Count the words of ``str1`` that appear as a substring of ``str2``.

	``str1`` is split on whitespace. Each resulting word is looked up in
	``str2`` with a plain substring search (not a whole-word match), and
	repeated words in ``str1`` are counted once per occurrence.

	Returns the number of matching words as an ``int``.
	"""
	hits = 0
	for token in str1.split():
		# str.find returns -1 when the token is absent.
		if str2.find(token) != -1:
			hits += 1
	return hits

In [3]:
# Stack the training rows on top of the test rows (fresh 0..n-1 index), then
# attach the columns of df_pro_desc to every row via a left join on
# product_uid, so rows without a match in df_pro_desc are kept.
df_all = pd.concat([df_train, df_test], axis=0, ignore_index=True).merge(
	df_pro_desc,
	how='left',
	on='product_uid',
)


In [4]:
# Inspect the combined train+test frame after the description merge.
# NOTE(review): this prints the whole frame; df_all.head() would keep the output short.
print(df_all)

            id                                      product_title  \
0            2                  Simpson Strong-Tie 12-Gauge Angle   
1            3                  Simpson Strong-Tie 12-Gauge Angle   
2            9  BEHR Premium Textured DeckOver 1-gal. #SC-141 ...   
3           16  Delta Vero 1-Handle Shower Only Faucet Trim Ki...   
4           17  Delta Vero 1-Handle Shower Only Faucet Trim Ki...   
5           18  Whirlpool 1.9 cu. ft. Over the Range Convectio...   
6           20  Whirlpool 1.9 cu. ft. Over the Range Convectio...   
7           21  Whirlpool 1.9 cu. ft. Over the Range Convectio...   
8           23  Lithonia Lighting Quantum 2-Light Black LED Em...   
9           27  House of Fara 3/4 in. x 3 in. x 8 ft. MDF Flut...   
10          34       Valley View Industries Metal Stakes (4-Pack)   
11          35  Toro Personal Pace Recycler 22 in. Variable Sp...   
12          37  Toro Personal Pace Recycler 22 in. Variable Sp...   
13          38  Toro Personal Pace

In [5]:
# Replace each free-text column with its lower-cased, stemmed version so the
# query and the product texts are compared on word stems.
for text_column in ('search_term', 'product_title', 'product_description'):
	df_all[text_column] = df_all[text_column].map(str_stemmer)

In [6]:
def _token_count(text):
	"""Number of whitespace-separated tokens in ``text``."""
	return len(text.split())


def _query_hits(info, field_index):
	"""Count the query words found in one tab-separated field of ``info``.

	Field 0 of ``info`` is the query; ``field_index`` picks the field that is
	searched.
	"""
	fields = info.split('\t')
	return str_common_word(fields[0], fields[field_index])


# Number of words in the query.
df_all['len_of_query'] = df_all['search_term'].map(_token_count).astype(np.int64)

# Pack query, title and description into one tab-separated string per row:
# field 0 = query, field 1 = title, field 2 = description.
df_all['product_info'] = "\t".join(['', '']).join([]) or (
	df_all['search_term'] + "\t" + df_all['product_title'] + "\t" + df_all['product_description']
)

# How many query words occur in the title and in the description.
df_all['word_in_title'] = df_all['product_info'].map(lambda info: _query_hits(info, 1))
df_all['word_in_description'] = df_all['product_info'].map(lambda info: _query_hits(info, 2))

In [7]:
# The raw text columns and the packed helper column are no longer needed once
# the numeric features have been derived from them.
text_columns = ['search_term', 'product_title', 'product_description', 'product_info']
df_all = df_all.drop(text_columns, axis=1)

In [57]:
# The first num_train rows of df_all came from the training file, the rest
# from the test file; undo the concatenation by position.
df_train, df_test = df_all.iloc[:num_train], df_all.iloc[num_train:]
id_test = df_test['id']

# 'id' and 'relevance' are excluded from the feature matrices; 'relevance' is
# the regression target.
non_feature_columns = ['id', 'relevance']
y_train = df_train['relevance'].values
X_train = df_train.drop(non_feature_columns, axis=1).values
X_test = df_test.drop(non_feature_columns, axis=1).values

In [60]:
# Base learner: a small random forest with a fixed seed.
rf = RandomForestRegressor(
	n_estimators=15,
	max_depth=6,
	random_state=0,
)
# Bag 45 copies of the forest, each fitted on a 10% sample of the training rows.
clf = BaggingRegressor(
	rf,
	n_estimators=45,
	max_samples=0.1,
	random_state=25,
)
# fit() returns the fitted estimator, so the prediction can be chained.
y_predrf = clf.fit(X_train, y_train).predict(X_test)

In [61]:
# Neural-network (MLP) regressor part

In [63]:
# Feature matrices for the neural network: the same features as before minus
# the first column.
# They are rebuilt from df_train / df_test instead of slicing X_train / X_test
# in place, so re-running this cell always yields the same matrices rather
# than stripping one more column on every execution.
# NOTE(review): column 0 is presumably the product_uid identifier - confirm
# the column order before relying on the positional slice.
X_train = df_train.drop(['id','relevance'],axis=1).values[:,1:]
X_test = df_test.drop(['id','relevance'],axis=1).values[:,1:]

In [65]:
from sklearn.neural_network import MLPRegressor

In [66]:
# Small multi-layer perceptron: two hidden layers (10 and 3 units), light L2
# penalty, mini-batches of 100 rows, fixed seed.
nn = MLPRegressor(
	hidden_layer_sizes=(10, 3),
	alpha=1e-5,
	random_state=1,
	batch_size=100,
)
# fit() returns the fitted estimator, so the prediction can be chained.
y_prednn = nn.fit(X_train, y_train).predict(X_test)

In [67]:
# Show the neural-network predictions for the test rows.
print(y_prednn)

[ 2.00580212  2.08847471  2.08847471 ...,  2.48738294  2.66612812
  2.29046084]


In [69]:
# Show the bagged random-forest predictions for the test rows.
print(y_predrf)

[ 2.07215887  2.22417453  2.22417453 ...,  2.12862001  2.42032514
  2.31581996]


In [70]:
# Build the ensemble: combine the two models' predictions

In [71]:
# Equal-weight blend of the bagged random-forest and neural-network predictions.
rf_weight, nn_weight = 0.5, 0.5
y_pred = rf_weight * y_predrf + nn_weight * y_prednn

In [72]:
# Show the blended (ensemble) predictions for the test rows.
print(y_pred)

[ 2.0389805   2.15632462  2.15632462 ...,  2.30800147  2.54322663
  2.3031404 ]
