## Import

In [84]:
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingRegressor
from nltk.stem.snowball import SnowballStemmer
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error

import csv

## Load data and preprocess

In [3]:
# Read the four CSV inputs; all of them are decoded as ISO-8859-1.
csv_encoding = "ISO-8859-1"

df_train = pd.read_csv('/Users/zhangyiman/Desktop/train.csv', encoding=csv_encoding)
df_test = pd.read_csv('/Users/zhangyiman/Desktop/test.csv', encoding=csv_encoding)
df_attr = pd.read_csv('/Users/zhangyiman/Desktop/attributes.csv', encoding=csv_encoding)
df_pro_desc = pd.read_csv('/Users/zhangyiman/Desktop/product_descriptions.csv', encoding=csv_encoding)

In [4]:
# English Snowball stemmer instance; str_stemmer below calls its .stem method.
stemmer = SnowballStemmer('english')

def str_stemmer(s):
    """Lower-case ``s``, split it on whitespace and stem every token.

    Uses the module-level ``stemmer`` and returns the stemmed tokens
    joined back together with single spaces.
    """
    stemmed_tokens = map(stemmer.stem, s.lower().split())
    return " ".join(stemmed_tokens)

def str_common_word(str1, str2):
    """Count the whitespace-separated words of ``str1`` found inside ``str2``.

    The check is a case-sensitive substring search (``str.find``): a word
    counts when it occurs anywhere in ``str2``, including inside a longer
    word, and a word repeated in ``str1`` is counted once per repetition.
    """
    hits = 0
    for token in str1.split():
        if str2.find(token) != -1:
            hits += 1
    return hits

In [5]:
# Preview the first four rows of the training table.
df_train[0:4]

Unnamed: 0,id,product_uid,product_title,search_term,relevance
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33


In [6]:
# Row count of the training table, recorded before train and test are stacked;
# it is used later to slice the combined frame back into its two parts.
num_train = df_train.shape[0]

## Concatenate the datasets

In [7]:
# Stack train on top of test with a fresh 0..n-1 index, then attach each
# product's description through a left join on product_uid.
stacked = pd.concat([df_train, df_test], axis=0, ignore_index=True)
df_all = stacked.merge(df_pro_desc, how='left', on='product_uid')

In [8]:
# Preview the first three rows of the combined table after the merge.
df_all[0:3]

Unnamed: 0,id,product_title,product_uid,relevance,search_term,product_description
0,2,Simpson Strong-Tie 12-Gauge Angle,100001,3.0,angle bracket,"Not only do angles make joints stronger, they ..."
1,3,Simpson Strong-Tie 12-Gauge Angle,100001,2.5,l bracket,"Not only do angles make joints stronger, they ..."
2,9,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,100002,3.0,deck over,BEHR Premium Textured DECKOVER is an innovativ...


In [9]:
# One sequential integer per row of df_all (0, 1, 2, ...). The length is taken
# from the frame itself instead of a hard-coded row count, so the cell keeps
# working if the input files change size.
insert_id = list(range(df_all.shape[0]))

In [10]:
# Store the sequential ids as a 'call_id' column, aligned on df_all's own index.
df_all['call_id'] = pd.Series(insert_id ,index = df_all.index)

## Add features and encode the search term

In [11]:
# Number of whitespace-separated words in each search query.
df_all['len_of_query'] = df_all['search_term'].map(lambda x:len(x.split())).astype(np.int64)

# Tab-joined "query<TAB>title<TAB>description" string per row. It is still
# created because a later cell drops this column by name, but the word counts
# below no longer depend on parsing it.
df_all['product_info'] = df_all['search_term']+"\t"+df_all['product_title']+"\t"+df_all['product_description']

# Count how many query words occur in the title / description. Reading the
# source columns directly (instead of splitting product_info on "\t") keeps
# the counts correct even if a title or description itself contains a tab,
# and avoids splitting every row's string twice.
df_all['word_in_title'] = [
    str_common_word(query, title)
    for query, title in zip(df_all['search_term'], df_all['product_title'])
]
df_all['word_in_description'] = [
    str_common_word(query, description)
    for query, description in zip(df_all['search_term'], df_all['product_description'])
]

In [12]:
# Remove the free-text columns now that the word-count features exist.
df_all = df_all.drop(['product_title','product_description','product_info'],axis=1)

In [13]:
# Preview the first three rows with the new feature columns.
df_all[0:3]

Unnamed: 0,id,product_uid,relevance,search_term,call_id,len_of_query,word_in_title,word_in_description
0,2,100001,3.0,angle bracket,0,2,0,1
1,3,100001,2.5,l bracket,1,2,1,1
2,9,100002,3.0,deck over,2,2,0,2


In [14]:
# Seed NumPy's global random number generator.
np.random.seed(12)
# Label encoder used to turn the query strings into integer codes.
label_encoder = preprocessing.LabelEncoder()


# Replace every distinct search_term string with an integer code. The encoder
# is fitted on df_all, i.e. on the train and test rows together.
df_all["search_term"] = label_encoder.fit_transform(df_all["search_term"])

In [15]:
# Preview the first three rows; search_term now holds integer codes.
df_all[0:3]

Unnamed: 0,id,product_uid,relevance,search_term,call_id,len_of_query,word_in_title,word_in_description
0,2,100001,3.0,4809,0,2,0,1
1,3,100001,2.5,13903,1,2,1,1
2,9,100002,3.0,8486,2,2,0,2


In [20]:
# Total number of rows in the combined (train + test) table.
df_all.shape[0]

240760

In [16]:
# Spot-check: show the 2nd and 3rd rows whose encoded search_term is 4809.
df_all[df_all['search_term']==4809][1:3]

Unnamed: 0,id,product_uid,relevance,search_term,call_id,len_of_query,word_in_title,word_in_description
1212,3784,100664,2.67,4809,1212,2,0,1
1229,3826,100672,2.33,4809,1229,2,0,1


In [17]:
# Per-document feature columns.
# NOTE(review): `features` is reassigned in the modelling section before this
# list is used anywhere in the visible notebook — confirm whether it is still needed.
features = ["product_uid", "search_term", "len_of_query", "word_in_title", "word_in_description"]

In [71]:
# Cut the combined frame back into its two parts by position: the first
# num_train rows are the training rows, everything after is the test rows.
df_train, df_test = df_all.iloc[:num_train], df_all.iloc[num_train:]

## Find pairs of documents for the same query

In [75]:
# Accumulators filled by the pair-enumeration loop in the next cell:
# search terms already processed, and the call_id of the first / second
# document of every pair (the two call_id lists are kept index-aligned).
exsited_search_term = []
first_call_id = []
second_call_id = []

In [78]:
# Enumerate every ordered pair of distinct training rows that share a query.
#
# Rows are grouped once by search_term instead of re-filtering the whole frame
# for every row, and the loop covers all of df_train rather than a hard-coded
# row count. groupby(sort=False) yields the queries in order of first
# appearance and keeps the original row order inside each group, so the pairs
# are appended in the same order as a row-by-row scan would produce.
already_paired = set(exsited_search_term)  # set gives O(1) "seen before?" checks

for search_term, same_query_rows in df_train.groupby('search_term', sort=False):
    # Skip queries recorded by a previous run of this cell.
    if search_term in already_paired:
        continue
    exsited_search_term.append(search_term)
    already_paired.add(search_term)

    # call_ids of all training rows issued for this query
    list_all_in_same_query = same_query_rows['call_id'].values

    # Both orders are emitted: (a, b) and (b, a); a row is never paired with itself.
    for first_document_call_id in list_all_in_same_query:
        for second_document_call_id in list_all_in_same_query:
            if first_document_call_id != second_document_call_id:
                first_call_id.append(first_document_call_id)
                second_call_id.append(second_document_call_id)

## Build a dataset of document pairs

In [81]:
# Column buffers for the pair dataset; each list receives one entry per pair.
# Query shared by both documents of the pair.
search_term_list = []

# Values taken from the first document of each pair.
id_list = []
product_id_list = []
relevance_list = []
len_of_query_list = []
word_in_title_list = []
word_in_description_list = []

# Values taken from the second document of each pair (suffix _2).
id_list_2 = []
product_id_list_2 = []
relevance_list_2 = []
len_of_query_list_2 = []
word_in_title_list_2 = []
word_in_description_list_2 = []

In [82]:
# Number of ordered document pairs collected (first_call_id has the same length).
len(second_call_id)

529310

In [83]:
# Fill the column buffers for every (first, second) pair.
#
# The rows of both documents are fetched with two label-based selections
# instead of roughly a dozen scalar Series lookups per pair, which is far
# faster on hundreds of thousands of pairs. As with the scalar lookups, this
# relies on df_train's index labels being equal to the call_id values.
first_rows = df_train.loc[first_call_id]
second_rows = df_train.loc[second_call_id]

# The query is shared by both documents, so it is read from the first one only.
search_term_list.extend(first_rows['search_term'].tolist())

# (source column, buffer for first document, buffer for second document)
paired_columns = [
    ('id', id_list, id_list_2),
    ('product_uid', product_id_list, product_id_list_2),
    ('relevance', relevance_list, relevance_list_2),
    ('len_of_query', len_of_query_list, len_of_query_list_2),
    ('word_in_title', word_in_title_list, word_in_title_list_2),
    ('word_in_description', word_in_description_list, word_in_description_list_2),
]
for column, first_values, second_values in paired_columns:
    first_values.extend(first_rows[column].tolist())
    second_values.extend(second_rows[column].tolist())
    

In [86]:
# Write the pair dataset to a new CSV file: one header row, then one row per pair.
# newline='' is what the csv module requires for files handed to csv.writer;
# without it, platforms that translate line endings emit an extra blank line
# after every row.
with open('/Users/zhangyiman/Desktop/dataset_new.csv', 'w', newline='') as datacsv:
    writer = csv.writer(datacsv, dialect="excel")
    writer.writerow(['search_term','id_first_document','product_id_first_document','relevance_first_document','len_of_query_first_document','word_in_title_first_document','word_in_description_first_document','id_second_document','product_id_second_document','relevance_second_document','len_of_query_second_document','word_in_title_second_document','word_in_description_second_document'])

    # The buffers are index-aligned, so zipping them yields one record per pair.
    writer.writerows(zip(
        search_term_list,
        id_list, product_id_list, relevance_list,
        len_of_query_list, word_in_title_list, word_in_description_list,
        id_list_2, product_id_list_2, relevance_list_2,
        len_of_query_list_2, word_in_title_list_2, word_in_description_list_2,
    ))
    

In [87]:
# Load the pair dataset back from the CSV file written in the previous cell.
df_pair = pd.read_csv('/Users/zhangyiman/Desktop/dataset_new.csv')

In [91]:
# Preview the first five document pairs.
df_pair[0:5]

Unnamed: 0,search_term,id_first_document,product_id_first_document,relevance_first_document,len_of_query_first_document,word_in_title_first_document,word_in_description_first_document,id_second_document,product_id_second_document,relevance_second_document,len_of_query_second_document,word_in_title_second_document,word_in_description_second_document
0,4809,2,100001,3.0,2,0,1,3784,100664,2.67,2,0,1
1,4809,2,100001,3.0,2,0,1,3826,100672,2.33,2,0,1
2,4809,2,100001,3.0,2,0,1,4319,100739,3.0,2,0,1
3,4809,2,100001,3.0,2,0,1,6030,101036,1.33,2,0,1
4,4809,2,100001,3.0,2,0,1,8105,101370,3.0,2,0,1


## Set a label in {-1, 0, 1} indicating which document of the pair ranks higher for the query

In [95]:
# Number of rows (document pairs) in the pair table.
df_pair.shape[0]

529310

In [96]:
# Number of document pairs, kept for the labelling step.
num_pairs = df_pair.shape[0]

In [98]:
# Pairwise rank label, one per row of df_pair:
#    1 -> the first document has the higher relevance
#    0 -> both documents have the same relevance
#   -1 -> the second document has the higher relevance
# The sign of the relevance difference gives exactly these three values and is
# computed for the whole column at once. A missing relevance makes the integer
# cast raise instead of silently producing a label list shorter than df_pair.
relevance_gap = df_pair['relevance_first_document'] - df_pair['relevance_second_document']
rank_label = np.sign(relevance_gap).astype(int).tolist()


In [99]:
# Attach the labels as a 'rank_label' column, aligned on df_pair's index.
df_pair['rank_label'] = pd.Series(rank_label ,index = df_pair.index)

In [100]:
# Preview the first three pairs together with their rank_label.
df_pair[0:3]

Unnamed: 0,search_term,id_first_document,product_id_first_document,relevance_first_document,len_of_query_first_document,word_in_title_first_document,word_in_description_first_document,id_second_document,product_id_second_document,relevance_second_document,len_of_query_second_document,word_in_title_second_document,word_in_description_second_document,rank_label
0,4809,2,100001,3.0,2,0,1,3784,100664,2.67,2,0,1,1
1,4809,2,100001,3.0,2,0,1,3826,100672,2.33,2,0,1,1
2,4809,2,100001,3.0,2,0,1,4319,100739,3.0,2,0,1,0


In [102]:
# Drop the two relevance columns; rank_label was derived from them.
df_pair = df_pair.drop(['relevance_first_document','relevance_second_document'],axis=1)

In [103]:
# Preview the first three pairs after the relevance columns were removed.
df_pair[0:3]

Unnamed: 0,search_term,id_first_document,product_id_first_document,len_of_query_first_document,word_in_title_first_document,word_in_description_first_document,id_second_document,product_id_second_document,len_of_query_second_document,word_in_title_second_document,word_in_description_second_document,rank_label
0,4809,2,100001,2,0,1,3784,100664,2,0,1,1
1,4809,2,100001,2,0,1,3826,100672,2,0,1,1
2,4809,2,100001,2,0,1,4319,100739,2,0,1,0


## Predict the pairwise rank label for each pair of documents

In [114]:
# Initialize the Random Forest model: a regressor with 1000 trees.
# No random_state is passed to the estimator here.
# NOTE(review): the target (rank_label) only takes the values -1, 0 and 1, yet
# a regressor is used — confirm that a continuous score is what is wanted.

rf_model = RandomForestRegressor(n_estimators=1000)

In [116]:
# Feature columns fed to the model: the encoded query plus per-document
# columns for both documents of the pair.
# NOTE(review): "product_id_first_document" is absent although
# "product_id_second_document" is present, and both raw id columns are used as
# features — confirm this asymmetry is intended.
features = ["search_term", "id_first_document", "len_of_query_first_document","word_in_title_first_document",
            "word_in_description_first_document", "id_second_document", "product_id_second_document", "len_of_query_second_document",
            "word_in_title_second_document", "word_in_description_second_document"]

# `df_pair_train` is not created by any cell of this notebook, so a fresh
# kernel would stop here with a NameError. The labelled pair table is the only
# pair dataset available, so it is used as the training frame.
df_pair_train = df_pair

# Feature matrix and target vector as NumPy arrays.
X = df_pair_train[features].values
y = df_pair_train["rank_label"].values

In [123]:
# Random split of the pairs with a fixed seed (random_state=0).
# NOTE(review): test_size=0.80 puts 80% of the pairs in the test part and
# leaves only 20% for training — confirm the ratio is not swapped.
# NOTE(review): every pair exists in both orders, (a, b) and (b, a), so a
# random split can put mirrored pairs on opposite sides — verify this is acceptable.
# NOTE(review): train_test_split is imported from sklearn.cross_validation at
# the top of the notebook; newer scikit-learn releases provide it from
# sklearn.model_selection instead.
train_x, test_x, train_y, test_y = train_test_split(X, 
                                                    y, 
                                                    test_size=0.80, 
                                                    random_state=0)

In [124]:
# Fit the random forest on the training part, then predict a score for every
# held-out pair.
rf_model.fit(train_x, train_y)
predict_y = rf_model.predict(test_x)

In [129]:
# Show the predicted scores for the held-out pairs.
predict_y

array([ 0.039, -0.166, -0.012, ...,  0.674,  0.   , -0.268])

## Evaluation

In [126]:
# Calculate the root mean square error (RMSE).
def rmse(predictions, targets):
    """Return the root mean square error between two sets of values.

    Parameters
    ----------
    predictions, targets : array-like
        Numeric values of matching (or broadcastable) shape. Both are
        converted to float arrays first, so plain Python lists are accepted
        and small or unsigned integer dtypes cannot wrap around when the
        values are subtracted and squared.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((predictions - targets) ** 2))``. The result is NaN when
        the inputs are empty.
    """
    predictions = np.asarray(predictions, dtype=float)
    targets = np.asarray(targets, dtype=float)

    differences = predictions - targets           # element-wise errors
    mean_squared_error_val = np.mean(differences ** 2)

    return np.sqrt(mean_squared_error_val)

In [127]:
# Report the RMSE between the held-out labels and the model's predictions,
# rounded to two decimals.
test_rmse = rmse(test_y, predict_y)
print("RMSE on test dataset = %.2f" % test_rmse)

RMSE on test dataset = 0.82
