In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.feature_extraction import stop_words

from nltk.stem.porter import PorterStemmer
from autocorrect import spell
from scipy import spatial
import operator
import time

In [3]:
# Path of the GloVe text file that make_dictionary() parses further below
# (relative to the notebook's working directory).
glove_file = '../data/glove.6B/glove.6B.300d.txt'
def make_dictionary(file):
    '''
    Load GloVe word vectors from a text file into a dictionary.

    Every non-blank line is expected to hold a word followed by its vector
    components, all separated by single spaces. Blank lines are skipped.

    input: A String, the path of the GloVe text file
    returns: A dictionary with item = word : numpy float array built from
             the remaining fields of that word's line
    raises: ValueError if a vector component cannot be parsed as a float
    '''
    vecs = dict()
    # Decode explicitly as UTF-8 (the encoding of the published GloVe text
    # files) so the result does not depend on the platform's locale default.
    with open(file, encoding='utf-8') as f:
        # Stream the file line by line; readlines() would keep the complete
        # text in memory next to the dictionary being built.
        for word_and_vec in f:
            elems = word_and_vec.strip().split(' ')
            word = elems[0]
            if not word:
                # Blank line: nothing to store (avoids a bogus '' entry).
                continue
            vecs[word] = np.array(elems[1:], dtype=float)
    return vecs

In [4]:
# Parse the whole GloVe file once; glove_dic maps each word to its vector and
# is used by clean_term_list() as the vocabulary of known words.
glove_dic=make_dictionary(glove_file)

In [5]:
# English stop words shipped with scikit-learn, materialised as a list.
# NOTE(review): `term not in stops` on a list is a linear scan; a set would be
# faster, but confirm no other cell indexes or concatenates `stops` first.
stops = list(stop_words.ENGLISH_STOP_WORDS)

In [6]:
# Product descriptions; product_uid is the join key used below.
products = pd.read_csv('../data/product_descriptions.csv')

# Training rows joined with their product description on product_uid.
# The final reset_index() turns product_uid back into an ordinary column and
# gives the frame a fresh 0..n-1 index.
train = (
    pd.read_csv('../data/train.csv', encoding='ISO-8859-1')
    .set_index('product_uid')
    .join(products.set_index('product_uid'))
    .reset_index()
)

In [7]:
# The raw search queries: this is the 'search_term' column object of `train`,
# not a list copy.
# NOTE(review): in-place writes to it may propagate to `train` — confirm
# before passing it to code that mutates its argument.
search_term=train['search_term']

In [8]:
def clean_term_list(search_terms:list, stop_list=None, vocabulary=None, output_path='output.txt'):
    '''
    Reduce every search query to its informative words and save the result.

    A query is lower-cased and split on single spaces. A word is kept only if
    it is purely alphabetic, longer than two characters, not a stop word and
    present in the vocabulary. The kept words of each query are joined with
    single spaces and written to `output_path`, one line per query (an empty
    line when nothing is kept), overwriting any existing file.

    The input sequence is left unmodified.

    input:
        search_terms: sequence of query strings (list or pandas Series);
                      entries that are not strings (e.g. NaN) yield an
                      empty line so that rows stay aligned
        stop_list:    iterable of words to drop; defaults to the
                      notebook-level `stops`
        vocabulary:   container of known words; defaults to the
                      notebook-level `glove_dic`
        output_path:  file the cleaned queries are written to
    returns: A list with one cleaned string per input query
    '''
    if stop_list is None:
        stop_list = stops
    if vocabulary is None:
        vocabulary = glove_dic
    # Hash-based lookup instead of scanning a list for every single word.
    stop_lookup = frozenset(stop_list)

    cleaned = []
    # Open the file once for the whole run. The encoding is deliberately left
    # at the platform default so that it matches the cell that reads the file
    # back with a plain open().
    with open(output_path, "w") as f:
        for query in search_terms:
            if isinstance(query, str):
                kept = [
                    term for term in query.lower().split(' ')
                    if term.isalpha()
                    and len(term) > 2
                    and term not in stop_lookup
                    and term in vocabulary
                ]
            else:
                kept = []
            line = ' '.join(kept)
            cleaned.append(line)
            f.write(line)
            f.write('\n')
    return cleaned

In [9]:
# The following call takes hours to run.
# Load the previously generated 'output.txt' file instead to get its result.

# clean_term_list(search_term)

In [10]:
# Read the cleaned search terms back from 'output.txt', one entry per line,
# with surrounding whitespace (including the newline) removed.
# The context manager closes the file even if reading raises.
with open("output.txt","rt") as f:
    clean_terms = [line.strip() for line in f]

In [11]:
# Attach the cleaned terms as a new column; the list is matched to the rows by
# position, so it has to contain exactly one entry per row of `train`.
train['clean_terms']=clean_terms

In [12]:
# Copy the three text columns into plain Python lists so that changing a list
# element cannot write through to the `train` DataFrame.
cleaned, title, desc = (
    list(train[column])
    for column in ('clean_terms', 'product_title', 'product_description')
)

In [13]:
# Number of words in each cleaned search term; 0 for a missing or empty term.
# str.split() without an argument returns [] for an empty string, whereas
# ''.split(' ') returns [''] and would count an empty term as one word.
clean_length = np.array(
    [0 if pd.isnull(term) else len(term.split()) for term in cleaned],
    dtype=float,
)

train['clean_length'] = clean_length

In [14]:
# Word count of every product title, splitting on single spaces.
title_length = np.array([len(text.split(' ')) for text in title], dtype=float)

train['title_length'] = title_length

In [15]:
# Word count of every product description, splitting on single spaces.
desc_length = np.array([len(text.split(' ')) for text in desc], dtype=float)

train['desc_length'] = desc_length

In [16]:
def clean_term_in_doc(terms:list, title:list):
    '''
    Count, row by row, how many search terms occur as words of a document.

    For row i the document title[i] is lower-cased and split on single
    spaces; every whitespace-separated word of terms[i] that equals one of
    the document words adds 1 to the count (a word repeated in terms[i] is
    counted each time). Rows whose terms or document are missing (None/NaN)
    get a count of 0. Empty terms never match anything.

    Side effect: title[i] is replaced in place by its lower-cased version
    for every row that is processed, so the caller's list is modified.

    input:
        terms: list of space-separated term strings (or None/NaN)
        title: list of document strings, same length as terms
    returns: A numpy float array with one count per row
    '''
    count=np.zeros(len(terms))
    for i in range(len(terms)):
        # A missing document cannot be lower-cased or searched; skip the row.
        if pd.isnull(terms[i]) or pd.isnull(title[i]):
            continue
        title[i]=title[i].lower()
        # Split the document once per row and use a set for the lookups
        # instead of re-splitting it for every term.
        doc_words = set(title[i].split(' '))
        # split() without argument yields no '' tokens, so an empty term can
        # no longer match the '' produced by double spaces in the document.
        for term in terms[i].split():
            if term in doc_words:
                count[i]+=1
    return count

In [18]:
# Per-row number of cleaned search terms found in the product title and in the
# product description. Note that clean_term_in_doc() lower-cases the entries
# of the list passed as its second argument in place, so `title` and `desc`
# are modified by these calls.
c_title=clean_term_in_doc(cleaned, title)
c_desc=clean_term_in_doc(cleaned, desc)

In [19]:
# Store the match counts computed by clean_term_in_doc() as feature columns.
train['clean_terms_in_title']=c_title
train['clean_terms_in_desc']=c_desc

In [20]:
# Preview the engineered features; show only the first rows rather than
# rendering the whole DataFrame.
train.head(10)

Unnamed: 0,product_uid,id,product_title,search_term,relevance,product_description,clean_terms,clean_length,title_length,desc_length,clean_terms_in_title,clean_terms_in_desc
0,100001,2,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.00,"Not only do angles make joints stronger, they ...",angle bracket,2.0,4.0,129.0,1.0,0.0
1,100001,3,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.50,"Not only do angles make joints stronger, they ...",bracket,1.0,4.0,129.0,0.0,0.0
2,100002,9,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.00,BEHR Premium Textured DECKOVER is an innovativ...,deck,1.0,11.0,168.0,0.0,0.0
3,100005,16,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33,Update your bathroom with the Delta Vero Singl...,rain shower head,3.0,13.0,104.0,1.0,1.0
4,100005,17,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67,Update your bathroom with the Delta Vero Singl...,shower faucet,2.0,13.0,104.0,2.0,2.0
5,100006,18,Whirlpool 1.9 cu. ft. Over the Range Convectio...,convection otr,3.00,Achieving delicious results is almost effortle...,convection otr,2.0,15.0,490.0,1.0,1.0
6,100006,20,Whirlpool 1.9 cu. ft. Over the Range Convectio...,microwave over stove,2.67,Achieving delicious results is almost effortle...,microwave stove,2.0,15.0,490.0,1.0,1.0
7,100006,21,Whirlpool 1.9 cu. ft. Over the Range Convectio...,microwaves,3.00,Achieving delicious results is almost effortle...,microwaves,1.0,15.0,490.0,0.0,0.0
8,100007,23,Lithonia Lighting Quantum 2-Light Black LED Em...,emergency light,2.67,The Quantum Adjustable 2-Light LED Black Emerg...,emergency light,2.0,9.0,120.0,1.0,2.0
9,100009,27,House of Fara 3/4 in. x 3 in. x 8 ft. MDF Flut...,mdf 3/4,3.00,Get the House of Fara 3/4 in. x 3 in. x 8 ft. ...,mdf,1.0,14.0,82.0,1.0,1.0
