In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from nltk.stem.snowball import SnowballStemmer

In [2]:
df_train = pd.read_csv('train.csv', encoding="ISO-8859-1")
df_test = pd.read_csv('test.csv', encoding="ISO-8859-1")

In [3]:
df_train.iloc[1:5,]

Unnamed: 0,id,product_uid,product_title,search_term,relevance
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67


In [4]:
df_test.loc[1:5,]

Unnamed: 0,id,product_uid,product_title,search_term
1,4,100001,Simpson Strong-Tie 12-Gauge Angle,metal l brackets
2,5,100001,Simpson Strong-Tie 12-Gauge Angle,simpson sku able
3,6,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong ties
4,7,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong tie hcc668
5,8,100001,Simpson Strong-Tie 12-Gauge Angle,wood connectors


In [5]:
df_pro_desc = pd.read_csv('product_descriptions.csv')
df_pro_desc.iloc[1:5,]

Unnamed: 0,product_uid,product_description
1,100002,BEHR Premium Textured DECKOVER is an innovativ...
2,100003,Classic architecture meets contemporary design...
3,100004,The Grape Solar 265-Watt Polycrystalline PV So...
4,100005,Update your bathroom with the Delta Vero Singl...


In [6]:
num_train = df_train.shape[0]
num_train

74067

In [7]:
stemmer = SnowballStemmer('english')

In [8]:
def str_stemmer(s):
    return " ".join([stemmer.stem(word) for word in s.lower().split()])

def str_common_word(str1, str2):
    return sum(int(str2.find(word)>=0) for word in str1.split())

In [9]:
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
df_all = pd.merge(df_all, df_pro_desc, how='left', on='product_uid')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [10]:
df_all.iloc[0:5,]

Unnamed: 0,id,product_title,product_uid,relevance,search_term,product_description
0,2,Simpson Strong-Tie 12-Gauge Angle,100001,3.0,angle bracket,"Not only do angles make joints stronger, they ..."
1,3,Simpson Strong-Tie 12-Gauge Angle,100001,2.5,l bracket,"Not only do angles make joints stronger, they ..."
2,9,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,100002,3.0,deck over,BEHR Premium Textured DECKOVER is an innovativ...
3,16,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.33,rain shower head,Update your bathroom with the Delta Vero Singl...
4,17,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.67,shower only faucet,Update your bathroom with the Delta Vero Singl...


In [11]:
df_all['search_term'] = df_all['search_term'].map(lambda x:str_stemmer(x))
df_all['product_title'] = df_all['product_title'].map(lambda x:str_stemmer(x))

In [12]:
df_all = df_all.iloc[1:1000,]

In [28]:
df_all.iloc[0:5,]

Unnamed: 0,id,product_title,product_uid,relevance,search_term,product_description
1,3,simpson strong-ti 12-gaug angl,100001,2.5,l bracket,"Not only do angles make joints stronger, they ..."
2,9,behr premium textur deckov 1-gal. #sc-141 tugb...,100002,3.0,deck over,BEHR Premium Textured DECKOVER is an innovativ...
3,16,delta vero 1-handl shower onli faucet trim kit...,100005,2.33,rain shower head,Update your bathroom with the Delta Vero Singl...
4,17,delta vero 1-handl shower onli faucet trim kit...,100005,2.67,shower onli faucet,Update your bathroom with the Delta Vero Singl...
5,18,whirlpool 1.9 cu. ft. over the rang convect mi...,100006,3.0,convect otr,Achieving delicious results is almost effortle...


In [29]:
df_all['product_description'] = df_all['product_description'].map(lambda x:str_stemmer(x))

In [46]:
df_all.iloc[0:5,]

Unnamed: 0,id,product_title,product_uid,relevance,search_term,product_description,len_of_query,product_info,word_in_title,word_in_description
1,3,simpson strong-ti 12-gaug angl,100001,2.5,l bracket,"not onli do angl make joint stronger, they als...",2,l bracket\tsimpson strong-ti 12-gaug angl\tnot...,1,1
2,9,behr premium textur deckov 1-gal. #sc-141 tugb...,100002,3.0,deck over,behr premium textur deckov is an innov solid c...,2,deck over\tbehr premium textur deckov 1-gal. #...,1,1
3,16,delta vero 1-handl shower onli faucet trim kit...,100005,2.33,rain shower head,updat your bathroom with the delta vero single...,3,rain shower head\tdelta vero 1-handl shower on...,1,1
4,17,delta vero 1-handl shower onli faucet trim kit...,100005,2.67,shower onli faucet,updat your bathroom with the delta vero single...,3,shower onli faucet\tdelta vero 1-handl shower ...,3,2
5,18,whirlpool 1.9 cu. ft. over the rang convect mi...,100006,3.0,convect otr,achiev delici result is almost effortless with...,2,convect otr\twhirlpool 1.9 cu. ft. over the ra...,1,2


In [31]:
df_all['len_of_query'] = df_all['search_term'].map(lambda x:len(x.split())).astype(np.int64)

In [33]:
df_all['product_info'] = df_all['search_term']+"\t"+df_all['product_title']+"\t"+df_all['product_description']

In [35]:
df_all['word_in_title'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[1]))
df_all['word_in_description'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[2]))

In [47]:
df_all = df_all.drop(['search_term','product_title','product_description','product_info'],axis=1)

In [48]:
df_train = df_all.iloc[:num_train]
df_test = df_all.iloc[num_train:]
id_test = df_test['id']

In [53]:
y_train = df_train['relevance'].values
X_train = df_train.drop(['id','relevance'],axis=1).values
X_test = df_test.drop(['id','relevance'],axis=1).values

In [54]:
X_train

array([[100001,      2,      1,      1],
       [100002,      2,      1,      1],
       [100005,      3,      1,      1],
       ...,
       [100550,      5,      2,      3],
       [100551,      2,      2,      2],
       [100552,      3,      2,      2]])

In [None]:
clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)