In [45]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')

training_data = pd.read_csv("input/train.csv.zip", encoding="ISO-8859-1")
testing_data = pd.read_csv("input/test.csv.zip", encoding="ISO-8859-1")
attribute_data = pd.read_csv('input/attributes.csv.zip')
descriptions = pd.read_csv('input/product_descriptions.csv.zip')

training_data = pd.merge(training_data, descriptions, 
                         on="product_uid", how="left")

num_train = training_data.shape[0]



In [46]:
from bs4 import BeautifulSoup
import lxml
import re
import nltk
from nltk.corpus import stopwords # Import the stop word list
from nltk.metrics import edit_distance
from nltk.corpus import brown, stopwords
from nltk.cluster.util import cosine_distance
from string import punctuation
from collections import Counter
import nltk

from nltk.corpus import brown, stopwords
from nltk.cluster.util import cosine_distance


def remove_html_tag(text):
    soup = BeautifulSoup(text, 'lxml')
    text = soup.get_text().replace('Click here to review our return policy for additional information regarding returns', '')
    return text

def str_stemmer(doc):
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each token
    table = str.maketrans('', '', punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return ' '.join(tokens)

def str_stemmer_title(s):
    return " ".join([stemmer.stem(word) for word in s.lower().split()])

def str_common_word(str1, str2):
    whole_set = set(str1.split())
    return sum(int(str2.find(word)>=0) for word in whole_set)




In [34]:
training_data['search_term'] = training_data['search_term'].map(lambda x:str_stemmer_title(x))
training_data['product_title'] = training_data['product_title'].map(lambda x:str_stemmer(x))

training_data['product_description'] = training_data['product_description'].map(lambda x:str_stemmer(x))
############## end stemming #####################

############## building custome feature, let's build a few of them before compare which one is the best ###########
training_data['len_of_query'] = training_data['search_term'].map(lambda x:len(x.split())).astype(np.int64)
training_data['shared_words'] = training_data[['search_term','product_description', 'product_title']].apply(lambda row:sum([str_common_word(*row[:-1]), str_common_word(*row[1:])]), axis=1)

# training_data['frequency_digits_in_sq']=training_data.product_description.str.count("\\d+")
training_data['frequency_words_in_sq'] = training_data.product_description.str.count("\\w+")
training_data["distance"] = training_data.loc[:, ["search_term","product_title"]].apply(lambda x: edit_distance(*x), axis=1)

In [35]:
from nltk.corpus import brown, stopwords
from nltk.cluster.util import cosine_distance
 
def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
    sent1 = sent1.split(' ')
    sent2 = sent2.split(' ')
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
 
    all_words = list(set(sent1 + sent2))
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
 
    return 1 - cosine_distance(vector1, vector2)

In [36]:
import numpy as np
 
 
# Get a text from the Brown Corpus
sentences = brown.sents('ca01')
 
print(sentences)
# [[u'The', u'Fulton', u'County', u'Grand', u'Jury', u'said', u'Friday', u'an', u'investigation', u'of', u"Atlanta's", u'recent', u'primary', u'election', u'produced', u'``', u'no', u'evidence', u"''", u'that', u'any', u'irregularities', u'took', u'place', u'.'], [u'The', u'jury', u'further', u'said', u'in', u'term-end', u'presentments', u'that', u'the', u'City', u'Executive', u'Committee', u',', u'which', u'had', u'over-all', u'charge', u'of', u'the', u'election', u',', u'``', u'deserves', u'the', u'praise', u'and', u'thanks', u'of', u'the', u'City', u'of', u'Atlanta', u"''", u'for', u'the', u'manner', u'in', u'which', u'the', u'election', u'was', u'conducted', u'.'], ...]
 
print(len(sentences))  #  98
 
# get the english list of stopwords
stop_words = stopwords.words('english')
 


def build_similarity_matrix(sentences, stopwords=None):
    # Create an empty similarity matrix
    S = np.zeros((len(sentences), len(sentences)))
 
 
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
#             if idx1 == idx2:
#                 continue
            if idx1 != idx2:
                S[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)
 
    # normalize the matrix row-wise
    for idx in range(len(S)):
        S[idx] /= S[idx].sum()
 
    return S
 
# S = build_similarity_matrix(sentences, stop_words)    
# print(S)

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]
98


In [37]:
training_data['build_similarity_matrix_sp'] =  training_data.loc[:, ["search_term","product_title"]].apply(lambda x: sentence_similarity(*x), axis=1)
training_data['build_similarity_matrix_sd'] =  training_data.loc[:, ["search_term","product_description"]].apply(lambda x: sentence_similarity(*x), axis=1)



In [38]:
training_data

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description,len_of_query,shared_words,frequency_words_in_sq,distance,build_similarity_matrix_sp,build_similarity_matrix_sd
0,2,100001,Simpson StrongTie Angle,angl bracket,3.00,Not angles make joints stronger also provide c...,2,3,74,20,0.000000,0.000000
1,3,100001,Simpson StrongTie Angle,l bracket,2.50,Not angles make joints stronger also provide c...,2,3,74,20,0.000000,0.000000
2,9,100002,BEHR Premium Textured DeckOver Tugboat Wood Co...,deck over,3.00,BEHR Premium Textured DECKOVER innovative soli...,2,6,116,53,0.000000,0.056614
3,16,100005,Delta Vero Shower Only Faucet Trim Kit Chrome ...,rain shower head,2.33,Update bathroom Delta Vero SingleHandle Shower...,3,7,64,52,0.174078,0.065372
4,17,100005,Delta Vero Shower Only Faucet Trim Kit Chrome ...,shower onli faucet,2.67,Update bathroom Delta Vero SingleHandle Shower...,3,7,64,50,0.348155,0.130744
5,18,100006,Whirlpool cu ft Over Range Convection Microwav...,convect otr,3.00,Achieving delicious results almost effortless ...,2,6,317,67,0.000000,0.000000
6,20,100006,Whirlpool cu ft Over Range Convection Microwav...,microwav over stove,2.67,Achieving delicious results almost effortless ...,3,7,317,65,0.174078,0.000000
7,21,100006,Whirlpool cu ft Over Range Convection Microwav...,microwav,3.00,Achieving delicious results almost effortless ...,1,6,317,71,0.000000,0.000000
8,23,100007,Lithonia Lighting Quantum Black LED Emergency ...,emerg light,2.67,The Quantum Adjustable LED Black Emergency Lig...,2,8,77,51,0.000000,0.145095
9,27,100009,House Fara ft MDF Fluted Casing,mdf 3/4,3.00,Get House Fara ft MDF Fluted Casing add elegan...,2,6,46,29,0.288675,0.092848


In [43]:
from nltk.metrics import edit_distance
from sklearn.preprocessing import StandardScaler

df_all = training_data.drop(['search_term','product_title','product_description'],axis=1)

df_train = df_all.iloc[:num_train]
print('df_train',df_train)
df_test = df_all.iloc[num_train:]
print('df_test',df_test)
id_test = df_test['id']

y_train = df_train['relevance'].values
X_train = df_train.drop(['id','relevance'],axis=1).values
X_test = df_test.drop(['id','relevance'],axis=1).values

#### Feature to the same scale
scX = StandardScaler()
X_train = scX.fit_transform(X_train)
X_test = scX.fit_transform(X_test)

rf = RandomForestRegressor(n_estimators=4, max_depth=6, random_state=0)
clf = BaggingRegressor(rf, n_estimators=4, max_samples=0.1, random_state=25)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('submission.csv',index=False)


df_train            id  product_uid  relevance  len_of_query  shared_words  \
0           2       100001       3.00             2             3   
1           3       100001       2.50             2             3   
2           9       100002       3.00             2             6   
3          16       100005       2.33             3             7   
4          17       100005       2.67             3             7   
5          18       100006       3.00             2             6   
6          20       100006       2.67             3             7   
7          21       100006       3.00             1             6   
8          23       100007       2.67             2             8   
9          27       100009       3.00             2             6   
10         34       100010       2.67             2             7   
11         35       100011       3.00             5            16   
12         37       100011       3.00             2            14   
13         38       10001

ValueError: Found array with 0 sample(s) (shape=(0, 7)) while a minimum of 1 is required by StandardScaler.

In [40]:
pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('./submission.csv',index=False)



NameError: name 'y_pred' is not defined

In [41]:
y_pred

NameError: name 'y_pred' is not defined

In [42]:
y_pred = clf.predict(X_test)



NameError: name 'clf' is not defined