## Data Cleaning and Shape Examining 


In [8]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')

training_data = pd.read_csv("input/train.csv.zip", encoding="ISO-8859-1")
testing_data = pd.read_csv("input/test.csv.zip", encoding="ISO-8859-1")
attribute_data = pd.read_csv('input/attributes.csv.zip')
descriptions = pd.read_csv('input/product_descriptions.csv.zip')

training_data = pd.merge(training_data, descriptions, 
                         on="product_uid", how="left")

In [3]:
import spacy
nlp = spacy.load('en')



In [4]:
training_data['length_search_terms'] = training_data.search_term.str.len()

In [7]:
training_data

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description,length_search_terms
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.00,"Not only do angles make joints stronger, they ...",13
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.50,"Not only do angles make joints stronger, they ...",9
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.00,BEHR Premium Textured DECKOVER is an innovativ...,9
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33,Update your bathroom with the Delta Vero Singl...,16
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67,Update your bathroom with the Delta Vero Singl...,18
5,18,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,convection otr,3.00,Achieving delicious results is almost effortle...,14
6,20,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,microwave over stove,2.67,Achieving delicious results is almost effortle...,20
7,21,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,microwaves,3.00,Achieving delicious results is almost effortle...,10
8,23,100007,Lithonia Lighting Quantum 2-Light Black LED Em...,emergency light,2.67,The Quantum Adjustable 2-Light LED Black Emerg...,15
9,27,100009,House of Fara 3/4 in. x 3 in. x 8 ft. MDF Flut...,mdf 3/4,3.00,Get the House of Fara 3/4 in. x 3 in. x 8 ft. ...,7


As in the dataset page has mentioned that it might contains some embedded html tags, let's plot and see how many in percentage, more prececily the fields 'product_description'

In [None]:
%matplotlib inline
total_length = len(descriptions['product_description'] )
has_tag = sum([1 for _ in descriptions['product_description'] if '<br' in _])
no_tags = total_length - has_tag

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.hist(x=[has_tag])
ax.ticklabel_format(useOffset=False)
_ = plt.xlabel('number of phrase which has html tags in')

plt.show()
print('has html tags in ',has_tag)
print('doesn\'t have html tags in ', no_tags)


Now let's see what is the frequency of search query which include digits in it with respect to product_title which 
includes words. As you can see most of them includes digits in search bar. 


In [None]:
(training_data.search_term.str.count("\\w+") + 1).hist(bins=30) #plot number of words in search therms
(training_data.search_term.str.count("\\d+") + 1).hist(bins=30) #plot number of digits in search terms
# (training_data.product_title.str.count("\\d+") + 1).hist(bins=30)#plot number of digits in title






In [None]:
(training_data.product_title.str.count("\\w+") + 1).hist(bins=30)#plot number of words in title
(training_data.search_term.str.count("\\w+") + 1).hist(bins=30) #plot number of words in search query





In [None]:
(training_data.product_title.str.count("\\d+") + 1).hist(bins=30)#plot number of words in title
(training_data.search_term.str.count("\\d+") + 1).hist(bins=30) #plot number of words in search query






In [None]:
(training_data.product_description.str.count("\\d+") + 1).hist(bins=30)
(training_data.product_description.str.count("\\d+\W+\d+") + 1).hist(bins=30)



let's plot at histogram following number of words in search query, and on the other hand relevancy score

In [None]:
(training_data.search_term.str.count("\\w+") + 1).hist(bins=30)
(training_data.relevance + 1).hist(bins=30)



let's take a look how does the persistence of digits in the search query influence the relevancy score, from below plot it clearly that most of the search query must have between 2.0 and 3.0 

In [None]:
(training_data.search_term.str.count("\\d+")).hist(bins=30)
(training_data.relevance ).hist(bins=30)

let’s assume that there are zero response for null query search term


In [None]:
training_data[training_data.search_term.str.count('\\w+') < 1]
# training_data[training_data.search_term.str.contains('^\d+') < 1]

In [None]:
# an interest case can be see below, unfortunattly we cannot get rid of this element since it will make a bad impact on model
training_data[training_data.product_uid==100030]

In [None]:
training_data[training_data.product_description.str.contains('.* x .*')].head(4) # at first it looks like nothing unsual 


Unfortunately, it is kind ambiguous to figure out the meaning of digits in the search context like an example below, it can mean anything. we should take care of this when cleaning context. It looks that most of the case the meaning of X is denoted the unit of measure like fit/inch/or something by something


In [None]:
# training_data[training_data.search_term.str.contains("^\\d+ . \\d+$")].head(4)
training_data[(training_data.search_term.str.contains("^\\d+ . \\d+$") )& (training_data.relevance > 2)].head(4)

In [None]:
# exception_number = training_data[training_data.search_term.str.contains("^\\d+ . \\d+$") ]['search_term'].values
# training_data['test'] = training_data[training_data.search_term.str.contains("^\\d+ . \\d+$") ]['search_term'].str.split(' ').values
# training_data[['product_title', 'product_description', 'relevance', ]].corr()
training_data.head(10)

In order to apply any standard method for analysis we have to standardize metric for text fields, which we define as follows

- split into tokens by white space
- remove punctuation from each token
- remove remaining tokens that are not alphabetic
- filter out stop words
- filter out short tokens

and lets also create new feature in the same time which will denote our hypothesis.

\begin{equation*}
H_1 = \{\ \frac{ card(search query)}{ card(product title)} = high\ relevance\ score\} \\
H_2 = \{ length(search query)\ influence\ relevance\ score \} \\
H_3 = \{ card(common\_words(search\ query,product\ title,product\ description)) = influence\ relevance\ score \} \\
\end{equation*}

In [9]:
from bs4 import BeautifulSoup
import lxml
import re
import nltk
from nltk.corpus import stopwords # Import the stop word list
from nltk.metrics import edit_distance
from string import punctuation
from collections import Counter
import nltk



def remove_html_tag(text):
    soup = BeautifulSoup(text, 'lxml')
    text = soup.get_text().replace('Click here to review our return policy for additional information regarding returns', '')
    return text

def str_stemmer(doc):
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each token
    table = str.maketrans('', '', punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return ' '.join(tokens)

def str_stemmer_title(s):
    return " ".join([stemmer.stem(word) for word in s.lower().split()])

def str_common_word(str1, str2):
    whole_set = set(str1.split())
    return sum(int(str2.find(word)>=0) for word in whole_set)

 



Now let's build feature accordingly to 
\begin{equation*}
H_1, H_2, H_3\\
\end{equation*}

In [10]:
num_train = training_data.shape[0]
############### cleaning html tags ##################
has_tag_in = training_data.product_description.str.contains('<br')
training_data.loc[has_tag_in, 'product_description'] = training_data.loc[has_tag_in, 'product_description'].map(lambda x:remove_html_tag(x))
###############

############## apply stemming #####################
training_data['search_term'] = training_data['search_term'].map(lambda x:str_stemmer_title(x))
training_data['product_title'] = training_data['product_title'].map(lambda x:str_stemmer(x))

training_data['product_description'] = training_data['product_description'].map(lambda x:str_stemmer(x))
############## end stemming #####################

############## building custome feature, let's build a few of them before compare which one is the best ###########
training_data['len_of_query'] = training_data['search_term'].map(lambda x:len(x.split())).astype(np.int64)
training_data['shared_words'] = training_data[['search_term','product_description', 'product_title']].apply(lambda row:sum([str_common_word(*row[:-1]), str_common_word(*row[1:])]), axis=1)

# training_data['frequency_digits_in_sq']=training_data.product_description.str.count("\\d+")
training_data['frequency_words_in_sq'] = training_data.product_description.str.count("\\w+")
training_data["distance"] = training_data.loc[:, ["search_term","product_title"]].apply(lambda x: edit_distance(*x), axis=1)


In [13]:
#let's take a look if there is not empty search query now
# empty_search_query = training_data[training_data.search_term.str.count('\\w+') < 1].values
# print('data frame of empty seach query along with products',empty_search_query)
# # training_data[training_data.product_uid==100030]
# is_anything_none = training_data.isnull().values.any()
# print('presence of Nan values',  is_anything_none)

# training_data['H_1'] = training_data.loc[:, ["product_title","search_term"]].apply(lambda x: len(x[1])/len(x[0]), axis=1)

# training_data
training_data.corr()


Unnamed: 0,id,product_uid,relevance,len_of_query,shared_words,frequency_words_in_sq,distance
id,1.0,0.986738,-0.116426,0.1929,-0.036805,-0.067528,0.044136
product_uid,0.986738,1.0,-0.130656,0.206109,-0.033058,-0.062033,0.045716
relevance,-0.116426,-0.130656,1.0,-0.072279,0.054164,0.040146,-0.034573
len_of_query,0.1929,0.206109,-0.072279,1.0,0.221767,0.063501,0.017572
shared_words,-0.036805,-0.033058,0.054164,0.221767,1.0,0.339178,0.320273
frequency_words_in_sq,-0.067528,-0.062033,0.040146,0.063501,0.339178,1.0,0.317115
distance,0.044136,0.045716,-0.034573,0.017572,0.320273,0.317115,1.0


In [12]:
training_data

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description,len_of_query,shared_words,frequency_words_in_sq,distance
0,2,100001,Simpson StrongTie Angle,angl bracket,3.00,Not angles make joints stronger also provide c...,2,3,74,20
1,3,100001,Simpson StrongTie Angle,l bracket,2.50,Not angles make joints stronger also provide c...,2,3,74,20
2,9,100002,BEHR Premium Textured DeckOver Tugboat Wood Co...,deck over,3.00,BEHR Premium Textured DECKOVER innovative soli...,2,6,116,53
3,16,100005,Delta Vero Shower Only Faucet Trim Kit Chrome ...,rain shower head,2.33,Update bathroom Delta Vero SingleHandle Shower...,3,7,64,52
4,17,100005,Delta Vero Shower Only Faucet Trim Kit Chrome ...,shower onli faucet,2.67,Update bathroom Delta Vero SingleHandle Shower...,3,7,64,50
5,18,100006,Whirlpool cu ft Over Range Convection Microwav...,convect otr,3.00,Achieving delicious results almost effortless ...,2,6,317,67
6,20,100006,Whirlpool cu ft Over Range Convection Microwav...,microwav over stove,2.67,Achieving delicious results almost effortless ...,3,7,317,65
7,21,100006,Whirlpool cu ft Over Range Convection Microwav...,microwav,3.00,Achieving delicious results almost effortless ...,1,6,317,71
8,23,100007,Lithonia Lighting Quantum Black LED Emergency ...,emerg light,2.67,The Quantum Adjustable LED Black Emergency Lig...,2,8,77,51
9,27,100009,House Fara ft MDF Fluted Casing,mdf 3/4,3.00,Get House Fara ft MDF Fluted Casing add elegan...,2,6,46,29


In [None]:
#lets create new feature which will denote 
# training_data['test'] = training_data[['shared_words', 'frequency_words_in_sq']].apply(lambda row:row[1]+row[0],axis=1)


In [None]:
# training_data[['product_title','len_of_query','shared_words','frequency_words_in_sq','relevance', 'test']].corr()
training_data[['product_title','len_of_query','shared_words','frequency_words_in_sq','relevance', 'test']].corr()´

In [None]:
# %matplotlib inline
# r = training_data[training_data.search_term.str.contains('^\d+\s+?\w\s+?\d+$')]
# (r.relevance ).hist(bins=30)
# r.describe()
training_data.head(3)

In [None]:
df_all = training_data.drop(['search_term','product_description','product_title','test'],axis=1)

In [None]:
df_train = df_all.iloc[:num_train]
df_test = df_all.iloc[num_train:]
id_test = df_test['id']

y_train = df_train['relevance'].values
X_train = df_train.drop(['id','relevance'],axis=1).values
X_test = df_test.drop(['id','relevance'],axis=1).values


In [None]:
df_train.head(3)

In [None]:
rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
clf = BaggingRegressor(rf, n_estimators=4, max_samples=0.1, random_state=25)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('submission.csv',index=False)


Analysis of the Model


In [None]:
from nltk.metrics import edit_distance
from sklearn.preprocessing import StandardScaler

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')


df_train = pd.read_csv("../input/train.csv", encoding="ISO-8859-1")
df_test = pd.read_csv("../input/test.csv", encoding="ISO-8859-1")
attribute_data = pd.read_csv('../input/attributes.csv')
df_pro_desc = pd.read_csv('../input/product_descriptions.csv')



df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)

num_train = df_train.shape[0]

def str_stemmer(s):
    return " ".join([stemmer.stem(word) for word in s.lower().split()])

def str_common_word(str1, str2):
    return sum(int(str2.find(word)>=0) for word in str1.split())


df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)

df_all = pd.merge(df_all, df_pro_desc, how='left', on='product_uid')


############## apply stemming #####################
training_data['search_term'] = training_data['search_term'].map(lambda x:str_stemmer_title(x))
training_data['product_title'] = training_data['product_title'].map(lambda x:str_stemmer(x))

training_data['product_description'] = training_data['product_description'].map(lambda x:str_stemmer(x))
############## end stemming #####################

############## building custome feature, let's build a few of them before compare which one is the best ###########
training_data['len_of_query'] = training_data['search_term'].map(lambda x:len(x.split())).astype(np.int64)
training_data['shared_words'] = training_data[['search_term','product_description', 'product_title']].apply(lambda row:sum([str_common_word(*row[:-1]), str_common_word(*row[1:])]), axis=1).astype(np.int64)

# training_data['frequency_digits_in_sq']=training_data.product_description.str.count("\\d+")
training_data['frequency_words_in_sq'] = training_data.product_description.str.count("\\w+").astype(np.int64)
training_data["distance_levistein"] = training_data.loc[:, ["search_term","product_title"]].apply(lambda x: edit_distance(*x), axis=1).astype(np.int64)

training_data['length in product info'] = training_data[['product_title','product_description']].apply(lambda row:sum([len(*row[:-1]), len(*row[1:])]), axis=1).astype(np.int64)

df_all = df_all.drop(['search_term','product_title','product_description'],axis=1)


df_train = df_all.iloc[:num_train]
print('df_train',df_train)
df_test = df_all.iloc[num_train:]
print('df_test',df_test)
id_test = df_test['id']

y_train = df_train['relevance'].values
X_train = df_train.drop(['id','relevance'],axis=1).values
X_test = df_test.drop(['id','relevance'],axis=1).values

#### Feature to the same scale
scX = StandardScaler()
X_train = scX.fit_transform(X_train)
X_test = scX.fit_transform(X_test)

rf = RandomForestRegressor(n_estimators=4, max_depth=6, random_state=0)
clf = BaggingRegressor(rf, n_estimators=4, max_samples=0.1, random_state=25)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('submission.csv',index=False)

In [None]:
training_data[training_data.search_term.str.count('\\w+')<1]

In [None]:
training_data[training_data.relevance < 2]['product_title']

In [None]:
import numpy as np
 
 
# Get a text from the Brown Corpus
sentences = brown.sents('ca01')
 
print(sentences)
# [[u'The', u'Fulton', u'County', u'Grand', u'Jury', u'said', u'Friday', u'an', u'investigation', u'of', u"Atlanta's", u'recent', u'primary', u'election', u'produced', u'``', u'no', u'evidence', u"''", u'that', u'any', u'irregularities', u'took', u'place', u'.'], [u'The', u'jury', u'further', u'said', u'in', u'term-end', u'presentments', u'that', u'the', u'City', u'Executive', u'Committee', u',', u'which', u'had', u'over-all', u'charge', u'of', u'the', u'election', u',', u'``', u'deserves', u'the', u'praise', u'and', u'thanks', u'of', u'the', u'City', u'of', u'Atlanta', u"''", u'for', u'the', u'manner', u'in', u'which', u'the', u'election', u'was', u'conducted', u'.'], ...]
 
print(len(sentences))  #  98
 
# get the english list of stopwords
stop_words = stopwords.words('english')
 
def build_similarity_matrix(sentences, stopwords=None):
    # Create an empty similarity matrix
    S = np.zeros((len(sentences), len(sentences)))
 
 
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
#             if idx1 == idx2:
#                 continue
            if idx1 != idx2:
                S[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)
 
    # normalize the matrix row-wise
    for idx in range(len(S)):
        S[idx] /= S[idx].sum()
 
    return S
 
# S = build_similarity_matrix(sentences, stop_words)    
# print(S)

In [None]:
training_data[training_data.text_rank_sp > 0.01]

In [None]:
(training_data.text_rank_sp+1).hist(bins=30)
(training_data.relevance).hist(bins=30)
# (training_data.distance / training_data.frequency_words_in_sq).hist(bins=30)




In [None]:
training_data['test']=training_data['distance'] / training_data['frequency_words_in_sq']

In [None]:
import sklearn

In [None]:
training_data