In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.feature_extraction import stop_words

from nltk.stem.porter import PorterStemmer
from scipy import spatial
import operator
import time

In [2]:
glove_file = 'glove.6B.300d.txt'
def make_dictionary(file):
    '''
    Initiate the glove model as a dictionary.

    Each non-blank line of the file is split on single spaces: the first
    element is the word and the remaining elements are parsed as floats.

    input: A String which is a file in the project directory
    returns: A dictionary with item = word : numpy float array holding the
             numbers that followed the word on its line
    '''
    vecs = dict()
    # Decode explicitly as UTF-8 so non-ASCII words load the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(file, encoding='utf-8') as f:
        # Iterate over the file object so only one line is held in memory at
        # a time (readlines() would materialise the whole embedding file).
        for word_and_vec in f:
            elems = word_and_vec.strip().split(' ')
            word = elems[0]
            if not word:
                # Blank line: nothing to store (avoids a '' key with an
                # empty vector).
                continue
            vecs[word] = np.array(elems[1:], dtype=float)
    return vecs

In [3]:
glove_dic = make_dictionary(glove_file)

In [4]:
stops = list(stop_words.ENGLISH_STOP_WORDS)

In [5]:
# Load the product descriptions and the training rows, then attach each
# product's description to its training rows by joining on product_uid.
products = pd.read_csv('product_descriptions.csv')
train = (
    pd.read_csv('train.csv', encoding='ISO-8859-1')
    .set_index('product_uid')
    .join(products.set_index('product_uid'))
    .reset_index()
)

In [6]:
search_term=train['search_term']

In [7]:
def clean_term_list(search_terms:list):
    '''
    Clean every search term and save the cleaned terms to 'output.txt'.

    Each term is lower-cased and split on single spaces. A word is kept only
    if it is purely alphabetic, longer than 2 characters, not in `stops` and
    present in `glove_dic`. The kept words are joined with single spaces.

    Side effects:
      - search_terms is overwritten in place with the cleaned strings
      - 'output.txt' is (re)written with one cleaned term per line

    input: A list of search strings (the notebook passes a pandas Series
           with a default integer index)
    returns: None
    '''
    # Build the stop-word set once so each membership test is O(1).
    stop_set = set(stops)
    # Open the output file a single time; the with-block closes it even if
    # a row fails, and avoids re-opening the file for every search term.
    with open('output.txt', "w") as out:
        for i in range(len(search_terms)):
            kept = [
                term
                for term in search_terms[i].lower().split(' ')
                if term.isalpha()
                and len(term) > 2
                and term not in stop_set
                and term in glove_dic
            ]
            # join() yields no leading separator, so the stored term and the
            # written line start directly with the first kept word.
            search_terms[i] = ' '.join(kept)
            out.write(search_terms[i])
            out.write('\n')

In [8]:
# The following call takes hours to run, so it is left commented out.
# Load the 'output.txt' file instead to check the result.

# clean_term_list(search_term)

In [9]:
# Load the cleaned search terms from 'output.txt' (one term per line),
# stripping surrounding whitespace and the trailing newline.
# The with-block closes the file even if reading raises.
with open("output.txt", "rt") as f:
    clean_terms = [line.strip() for line in f]

In [10]:
print(train['search_term'][:5])
print(clean_terms[:5])

0         angle bracket
1             l bracket
2             deck over
3      rain shower head
4    shower only faucet
Name: search_term, dtype: object
['angle bracket', 'bracket', 'deck', 'rain shower head', 'shower faucet']


In [11]:
train['clean_terms']=clean_terms

In [12]:
cleaned = list(train['clean_terms'])  # without transfering into list, the changes made on cleaned will be made on train!
title = list(train['product_title'])
desc = list(train['product_description'])

In [13]:
cleaned[:5]

['angle bracket', 'bracket', 'deck', 'rain shower head', 'shower faucet']

In [14]:
# Number of space-separated words in each cleaned search term;
# null entries keep a length of 0.
clean_length = np.array(
    [0 if pd.isnull(entry) else len(entry.split(' ')) for entry in cleaned],
    dtype=float,
)

train['clean_length'] = clean_length

In [15]:
# Number of space-separated words in each product title.
title_length = np.array(
    [len(entry.split(' ')) for entry in title],
    dtype=float,
)

train['title_length'] = title_length

In [16]:
# Number of space-separated words in each product description.
desc_length = np.array(
    [len(entry.split(' ')) for entry in desc],
    dtype=float,
)

train['desc_length'] = desc_length

In [17]:
def clean_term_in_doc(terms:list, title:list):
    '''
    Count, for each row, how many words of the cleaned search term occur as
    whole words in the matching document (a product title or description).

    The document is lower-cased and split on single spaces before matching;
    the search term is split on single spaces and used as-is. Every word of
    the search term that is found in the document adds 1 to the row's count.
    The input lists are not modified.

    input: terms - list of cleaned search strings
           title - list of documents aligned with terms by position
    returns: numpy float array with one count per row; rows whose term or
             document is null stay at 0
    '''
    count = np.zeros(len(terms))
    for i in range(len(terms)):
        if pd.isnull(terms[i]) or pd.isnull(title[i]):
            continue
        # Lower-case into a local value (the caller's list is left intact)
        # and split once per row; a set makes each lookup O(1).
        doc_words = set(title[i].lower().split(' '))
        for term in terms[i].split(' '):
            # Skip empty tokens: an empty search term must not "match" the
            # empty strings produced by consecutive spaces in the document.
            if term and term in doc_words:
                count[i] += 1
    return count

In [18]:
c_title = clean_term_in_doc(cleaned, title)
c_desc = clean_term_in_doc(cleaned, desc)

In [19]:
train['clean_terms_in_title']=c_title
train['clean_terms_in_desc']=c_desc

In [20]:
train[:5]

Unnamed: 0,product_uid,id,product_title,search_term,relevance,product_description,clean_terms,clean_length,title_length,desc_length,clean_terms_in_title,clean_terms_in_desc
0,100001,2,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0,"Not only do angles make joints stronger, they ...",angle bracket,2.0,4.0,129.0,1.0,0.0
1,100001,3,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5,"Not only do angles make joints stronger, they ...",bracket,1.0,4.0,129.0,0.0,0.0
2,100002,9,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0,BEHR Premium Textured DECKOVER is an innovativ...,deck,1.0,11.0,168.0,0.0,0.0
3,100005,16,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33,Update your bathroom with the Delta Vero Singl...,rain shower head,3.0,13.0,104.0,1.0,1.0
4,100005,17,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67,Update your bathroom with the Delta Vero Singl...,shower faucet,2.0,13.0,104.0,2.0,2.0


## Potential Model

We preprocessed both the search terms and the titles by removing numbers and extraneous stop words. We decided to remove the numbers because doing so substantially improved the relevance of the closest words retrieved from the GloVe dictionary.

We are creating a model based on five features: the number of words in the "cleaned" search terms, the number of words in the "cleaned" title, the number of words in the "cleaned" description, the number of cleaned search terms that appear in the title, and the number of cleaned search terms that appear in the description. We think there might be a relationship between how many of the search terms appear in the title or description and the relevance of the search result.

In [None]:
import re
import nltk
import string
from nltk.stem.porter import *
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from math import sqrt
from sklearn.metrics import mean_squared_error

In [23]:
def tokenize(text):
    """
    Tokenize text and return a non-unique list of tokenized words
    found in the text. Normalize to lowercase, replace punctuation,
    digits, carriage returns, tabs and newlines with spaces, split on
    single spaces, then drop stop words and words of length < 3.
    """
    # Use the stop-word collection directly instead of copying it into a
    # list on every call; scikit-learn exposes it as a frozenset, so each
    # membership test below is a hash lookup rather than a linear scan.
    stops = stop_words.ENGLISH_STOP_WORDS
    text = text.lower()
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    nopunct = regex.sub(" ", text)  # delete stuff but leave at least a space to avoid clumping together
    words = nopunct.split(" ")
    # The length filter also discards the empty strings that split(" ")
    # produces for runs of consecutive spaces.
    words = [w for w in words if (len(w) > 2 and (w not in stops))]  # ignore a, an, to, at, be, ...
    return words

In [None]:
def get_length(column):
    """
    This calculates and returns the number of space-separated words
    for each entry in a specified column.

    column: sequence of strings; null entries (None / NaN) are allowed
    returns: numpy float array with one word count per entry;
             null entries are left at 0
    """
    length = np.zeros(len(column))
    for index, value in enumerate(column):
        # No per-entry print here: echoing every value would flood the
        # notebook output (the column may hold full product descriptions).
        if not pd.isnull(value):
            length[index] = len(value.split(' '))
    return length

In [32]:
#X_train = train[['clean_length', 'title_length', 'desc_length', 'clean_terms_in_title', 'clean_terms_in_desc']]
X_train = train[['clean_terms_in_title', 'clean_terms_in_desc']]
y_train = train[['relevance']]
print(X_train[:10])
print(y_train[:10])

   clean_length  title_length  desc_length  clean_terms_in_title  \
0           2.0           4.0        129.0                   1.0   
1           1.0           4.0        129.0                   0.0   
2           1.0          11.0        168.0                   0.0   
3           3.0          13.0        104.0                   1.0   
4           2.0          13.0        104.0                   2.0   
5           2.0          15.0        490.0                   1.0   
6           2.0          15.0        490.0                   1.0   
7           1.0          15.0        490.0                   0.0   
8           2.0           9.0        120.0                   1.0   
9           1.0          14.0         82.0                   1.0   

   clean_terms_in_desc  
0                  0.0  
1                  0.0  
2                  0.0  
3                  1.0  
4                  2.0  
5                  1.0  
6                  1.0  
7                  0.0  
8                  2.0  
9

In [33]:
# since we can't see the relevancy scores of the test set,
# I decided to split the training set 
train_data, test_data, train_target, test_target = train_test_split(X_train,
                                                                        y_train,
                                                                        random_state=42)

In [35]:
lin_reg_model = LinearRegression()
lin_reg_model.fit(train_data, train_target)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [36]:
predicted = lin_reg_model.predict(test_data)
print(predicted[:5])
print(test_target[:5])

array([[2.12839404],
       [2.34648638],
       [2.40951281],
       [2.64887041],
       [2.63710402]])

In [40]:
# since I couldn't find an RMSE function in the sklearn library,
# I just used the MSE function and took the square root of that
# (arguments are passed in the documented (y_true, y_pred) order;
# the value is the same either way because MSE is symmetric)
rmse_lin_reg = sqrt(mean_squared_error(test_target, predicted))
rmse_lin_reg
# this value is equivalent to rank 1756 on the Kaggle leaderboard for this competition
# the benchmark was ~ rank 1680 

0.5162702750709969

In [None]:
test = pd.read_csv('test.csv', encoding='ISO-8859-1')
test[:5]

In [None]:
test = test.set_index('product_uid').join(products.set_index('product_uid'))
test = test.reset_index()
test[:5]

In [None]:
# here I take the search_term column...
search_term_test = test['search_term']
search_term_test[:10]

In [None]:
# this is roughly the test-set counterpart of the clean_term_list function created above:
# tokenize removes stop words, digits and punctuation from the search terms
# (unlike clean_term_list, it does not check the words against the GloVe dictionary)
cleaned_search_term_test = [' '.join(tokenize(search_term)) for search_term in search_term_test]
cleaned_search_term_test[:10]

In [None]:
test['clean_terms'] = cleaned_search_term_test

cleaned_test = list(test['clean_terms'])
title_test = list(test['product_title'])
desc_test = list(test['product_description'])

In [None]:
print(cleaned_test[:5])
print(title_test[:5])
print(desc_test[:2])

In [None]:
# Word-count features for the test set, mirroring the training columns.
test['clean_length'] = get_length(cleaned_test)
test['title_length'] = get_length(title_test)
test['desc_length'] = get_length(desc_test)
# Show the three new values for the first row in one display; the separate
# mid-cell test[...][0] expressions were never shown (only a cell's last
# expression is displayed) and used chained indexing.
test.loc[0, ['clean_length', 'title_length', 'desc_length']]

In [None]:
test['clean_terms_in_title'] = clean_term_in_doc(cleaned_test, title_test)
test['clean_terms_in_desc'] = clean_term_in_doc(cleaned_test, desc_test)
test[:5]