In [1]:
cd ..

/home/agrigorev/notebooks/home-depot/homedepot


In [2]:
import Google_spell_check

In [3]:
import numpy as np
import pandas as pd

In [4]:
from time import time

In [5]:
import re

from unidecode import unidecode
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

# Single shared Porter stemmer; str_stem() below calls stemmer.stem() on every token.
stemmer = PorterStemmer()

In [6]:
root_path = '/home/agrigorev/notebooks/home-depot/input'

# train.csv and test.csv are read as Latin-1; the product descriptions use the reader's default encoding.
df_train, df_test = [
    pd.read_csv(root_path + csv_name, encoding="ISO-8859-1")
    for csv_name in ('/train.csv', '/test.csv')
]
df_pro_desc = pd.read_csv(root_path + '/product_descriptions.csv')

# Stack train on top of test with a fresh 0..n-1 index so both are preprocessed in one pass.
df_all = pd.concat([df_train, df_test], axis=0, ignore_index=True)

In [7]:
# One row per product: the first (product_title, product_uid) pair seen for each product_uid.
df_products = df_all.groupby('product_uid')[['product_title', 'product_uid']].head(n=1)
# Reassign instead of inplace=1: `inplace` must be a real bool (pandas validates it), and
# filling a frame derived from a selection in place is what triggers SettingWithCopy issues.
df_products = df_products.fillna('')

# The title now lives in df_products only; it is merged back per product_uid later.
df_all = df_all.drop(['product_title'], axis=1)

# Attach the long description (inner join on product_uid).
df_products = df_products.merge(df_pro_desc, on='product_uid')

In [8]:
df_attr = pd.read_csv(root_path + '/attributes.csv', encoding='utf-8')
# Attribute names are compared case-insensitively everywhere below, so lower-case them once.
df_attr['name'] = df_attr['name'].str.lower()

# Brand table: product_uid plus the "mfg brand name" attribute value, exposed as column `brand`.
is_brand = df_attr['name'] == "mfg brand name"
df_brand = df_attr.loc[is_brand, ["product_uid", "value"]].rename(columns={"value": "brand"})

In [9]:
def combine_colors(group):
    """Collapse the color values of one product into a single string.

    Args:
        group: iterable of strings (the 'color/finish' / 'color family'
            values of one product when used through ``groupby(...).agg``).

    Returns:
        * ``''`` for an empty group;
        * the first value when the group does not have exactly two values;
        * for two values, the longer one if it contains the other,
          otherwise both joined by a space in their original order.
    """
    values = list(group)
    if not values:
        # Nothing to combine; '' matches the fillna('') convention used for colors.
        return ''

    if len(values) != 2:
        return values[0]

    first, second = values
    if first in second:
        return second
    if second in first:
        return first

    # Join in input order. Joining a set() made the result depend on string
    # hash order, so the same data could yield "red blue" or "blue red".
    return first + ' ' + second

# Color attributes only; missing values become '' so combine_colors always sees strings.
df_colors = df_attr[df_attr.name.isin(['color/finish', 'color family'])].fillna('')
# Aggregate first, rename afterwards: passing a {new_name: func} dict to SeriesGroupBy.agg
# is deprecated (and rejected by later pandas releases as a "nested renamer").
df_colors = (df_colors.groupby('product_uid')['value']
             .agg(combine_colors)
             .reset_index()
             .rename(columns={'value': 'color'}))

In [10]:
# All 'material' attribute rows; .copy() so the column edit below does not touch df_attr.
df_material = df_attr[df_attr.name == u'material'].copy()
# Trailing space acts as the separator when the strings of one product are summed.
df_material['value'] = df_material['value'] + ' '
df_material = df_material.groupby('product_uid')['value'].sum()
df_material = df_material.reset_index()
# Reassign instead of inplace=1: `inplace` has to be a real bool for pandas' validation.
df_material = df_material.rename(columns=dict(value='material'))

In [11]:
# Names of the bullet-point attributes: 'bullet01' ... 'bullet22' (zero-padded to two digits).
attr_names = [u'bullet%02d' % idx for idx in range(1, 23)]

# Keep only the attribute rows whose (lower-cased) name is one of the bullet fields in attr_names.
# .copy() yields an independent frame, so assigning to its columns later does not touch df_attr.
df_attr_selected = df_attr[df_attr.name.isin(attr_names)].copy()

In [12]:
# Mapping from a raw search query to its corrected spelling, provided by the local module.
spell_check = Google_spell_check.spell_check_dict


def get_or_identity(query):
    """Return the spell-checked form of ``query``, or ``query`` itself when no correction is known."""
    if query in spell_check:
        return spell_check[query]
    return query


# Whole-query correction happens before any tokenisation.
df_all['search_term'] = df_all['search_term'].apply(get_or_identity)

In [13]:
def parse_correction(line):
    """Parse one ``wrong->right`` line of the corrections file.

    Args:
        line: raw line, with or without the trailing newline.

    Returns:
        ``[wrong, right]`` with surrounding whitespace removed from both
        parts. Only the first ``->`` separates the pair, so a replacement
        that itself contains ``->`` stays intact. A line without ``->``
        yields a one-element list.
    """
    # maxsplit=1 keeps the result a pair; stripping makes "foo -> bar" usable as a token key.
    return [part.strip() for part in line.strip().split('->', 1)]

with open('one-word-corrections.txt', 'r') as f:
    # Skip blank lines: they parse to a single-element list and would break the
    # (k, v) unpacking below.
    corrections = [parse_correction(s) for s in f if s.strip()]
    # Key: the misspelled token (stripped, because tokens are stripped before lookup).
    # Value: list of replacement tokens; split() without an argument never yields '' entries.
    corrections = {k.strip(): v.split() for (k, v) in corrections}

In [14]:
# Tokens dropped by str_stem() before stemming.
stopwords = set([
    'a', 'and', 'be', 'do', 'for', 'from',
    'only', 'per', 'the', 'to', 'up', 'with',
])

In [15]:
def str_stem(s):
    """Normalise free text and return it as a list of stemmed tokens.

    Steps, in order: remove HTML/mojibake residue, lower-case, replace
    punctuation, canonicalise units ("5 inches" -> "5in") and dimension
    separators ("2x4", "2 by 4" -> "2 xby 4"), transliterate to ASCII with
    ``unidecode``, drop ``stopwords``, expand tokens found in ``corrections``
    and stem every remaining token with the module-level ``stemmer``.

    Args:
        s: text to process. A Python 2 byte ``str`` is decoded as UTF-8
            (and raises ``UnicodeDecodeError`` if it is not valid UTF-8);
            ``unicode`` is used as is.

    Returns:
        List of stemmed tokens; ``[]`` when ``s`` is not a string (e.g. NaN).
    """
    if not isinstance(s, (str, unicode)):
        return []

    if isinstance(s, str):
        s = unicode(s.decode('utf-8'))

    # HTML / mojibake residue. The two-character sequences starting with "Û" are
    # listed BEFORE the bare "Û": with "Û" first they could never match and left a
    # stray character behind (e.g. "Ûª" became " ª").
    residue = (
        "&quot;", u"è_", u"å¡", u"åÊ",
        u"ÛÒ", u"Ûª", u"ÛÜ", u"Û÷", u"Û",
        u"ÈÀ", u"ã¢", u"Ã¥Â¡", u"ã¨_",
    )
    for junk in residue:
        s = s.replace(junk, " ")

    # "U.S." has to be handled before the a.A splitter below, which would
    # otherwise turn it into "U S." and make this replacement unreachable.
    s = s.replace("U.S.", " US ")
    s = re.sub(r"(\w)\.([A-Z])", r"\1 \2", s)  # split words glued as "a.A"
    s = s.lower()

    # Plain substring replacements; the order is significant
    # (e.g. ";" is removed before "&amp;" is looked for, "//" before " / ").
    simple_replacements = (
        ("&#39;s", " "),
        ("  ", " "),
        (",", ""),  # could be number / segment later?
        ("$", " "),
        ("+", " plus "),
        (";", " "),
        (":", " "),
        ("&amp;", " "),
        ("&amp", " "),
        ("?", " "),
        ("-", " "),
        ("#", " "),
        ("(", " "),
        (")", " "),
        ("//", "/"),
        ("..", "."),
        (" / ", " ovr "),
        (" \\ ", " "),
        (".", " . "),
    )
    for old, new in simple_replacements:
        s = s.replace(old, new)

    s = re.sub(r"(^\.|/)", r" ", s)
    s = re.sub(r"(\.|/)$", r" ", s)
    # Separate digits from letters, then drop "." and "/" between letters.
    s = re.sub(r"([0-9])([a-z])", r"\1 \2", s)
    s = re.sub(r"([a-z])([0-9])", r"\1 \2", s)
    s = re.sub(r"([a-z])( *)\.( *)([a-z])", r"\1 \4", s)
    s = re.sub(r"([a-z])( *)/( *)([a-z])", r"\1 \4", s)

    # Re-join decimal numbers that the "." padding above pulled apart.
    s = re.sub(r"([0-9])( *)\.( *)([0-9])", r"\1.\4", s)

    # "<number> <unit>" -> "<number><canonical unit> ".
    # NOTE(review): the alternations are not anchored at the end, so a number
    # followed by a word that merely starts with a unit is rewritten as well
    # (e.g. "2 white" -> "2watt hite"). Left as is because inputs such as
    # "100 sqft" depend on that prefix matching. Also, the '' (feet) alternative
    # cannot match: the inches rule consumes the first quote beforehand.
    unit_rules = (
        (r"(inches|inch|in|')", "in"),
        (r"(foot|feet|ft|'')", "ft"),
        (r"(pounds|pound|lbs|lb)", "lb"),
        (r"(square|sq)", "sq"),
        (r"(cubic|cu)", "cu"),
        (r"(gallons|gallon|gal)", "gal"),
        (r"(ounces|ounce|oz)", "oz"),
        (r"(centimeters|cm)", "cm"),
        (r"(milimeters|mm)", "mm"),
        (r"(degrees|degree)", "deg"),
        (r"(volts|volt|v)", "volt"),
        (r"(watts|watt|w)", "watt"),
        (r"(amperes|ampere|amps|amp)", "amp"),
    )
    for unit_pattern, canonical in unit_rules:
        s = re.sub(r"([0-9]+)( *)" + unit_pattern + r"\.?", r"\1" + canonical + " ", s)

    # Dimension separators all become the standalone token "xby".
    s = s.replace(" x ", " xby ")
    s = s.replace("*", " xby ")
    # Trailing space added: " xby" glued the marker to the next token ("2 by 4" -> "2 xby4").
    s = s.replace(" by ", " xby ")
    # "x" directly before / after a digit ("2x4", "x4", "2x").
    s = re.sub(r"x([0-9])", r" xby \1", s)
    s = re.sub(r"([0-9])x", r"\1 xby ", s)

    s = s.replace("&", " ")
    s = s.replace("'", " ")
    s = s.replace("  ", " ")
    s = s.replace(" . ", " ")

    # Transliterate whatever non-ASCII characters are left.
    s = unidecode(s.lower())

    result = []
    for z in s.split(" "):
        z = z.strip()
        if not z or z in stopwords:
            continue
        # A correction may expand one token into several.
        result.extend(corrections.get(z, [z]))

    return [stemmer.stem(z) for z in result]

In [16]:
from multiprocessing import Pool
# Worker pool shared by the process_parallel() calls in the cells below.
# NOTE(review): with a fork-based start method the workers only know the module-level
# names that exist at this point -- confirm every function mapped through this pool
# is defined above this cell.
pool = Pool(processes=8) 

In [17]:
def process_parallel(pool, series, function):
    """Apply ``function`` to every element of ``series`` through ``pool.map``.

    Returns whatever ``pool.map`` returns (a list with one result per element, in input order).
    """
    mapped = pool.map(function, series)
    return mapped

In [18]:
t0 = time()

# Every (frame, column) pair whose text is replaced by its list of stemmed tokens.
stem_targets = [
    (df_all, 'search_term'),
    (df_products, 'product_title'),
    (df_products, 'product_description'),
    (df_brand, 'brand'),
    (df_colors, 'color'),
    (df_material, 'material'),
    (df_attr_selected, 'value'),
]
for frame, column in stem_targets:
    frame[column] = process_parallel(pool, frame[column], str_stem)

print('took %0.5fs.' % (time() - t0))

took 128.64880s.


In [19]:
# Long -> wide: one row per product_uid, one column per bullet attribute.
df_attr_pivot = df_attr_selected.pivot(index='product_uid', columns='name')
# Flatten the (value, <attribute name>) column MultiIndex by taking the label of each
# column. `.levels[1]` is the sorted set of distinct labels, which only lines up with
# the columns by coincidence of ordering.
df_attr_pivot.columns = df_attr_pivot.columns.get_level_values(1)
# Reassign instead of inplace=1: `inplace` has to be a real bool for pandas' validation.
df_attr_pivot = df_attr_pivot.reset_index()

In [20]:
# Left-join every per-product table onto the query rows, in this order.
# NOTE(review): a table with several rows for one product_uid would duplicate query
# rows here -- confirm each table is unique per product_uid.
df_all_merged = df_all
for per_product in (df_products, df_brand, df_colors, df_material, df_attr_pivot):
    df_all_merged = df_all_merged.merge(per_product, how='left', on='product_uid')

In [21]:
# Columns of df_all_merged that hold token lists: the fixed product fields plus every bullet attribute.
text_fields = ['brand', 'product_title', 'product_description', 'color', 'material'] + attr_names

In [22]:
# Single empty list that stands in for every missing cell (shared object, not a copy per cell).
list_ref = []


def nan_to_list(val):
    """Return the shared empty list ``list_ref`` for a float NaN, any other value unchanged."""
    is_missing = isinstance(val, float) and np.isnan(val)
    return list_ref if is_missing else val

# Left joins leave NaN where a product has no value for a field; replace those with the
# (shared) empty token list so every cell of a text column is a list.
for c in text_fields:
    df_all_merged[c] = df_all_merged[c].apply(nan_to_list)

In [23]:
# Row-wise sum of the list-valued columns, i.e. all tokens of all fields concatenated.
# 'all_text' itself is excluded so that re-running this cell neither folds the previous
# result into the sum nor appends the name to text_fields a second time.
base_text_fields = [field for field in text_fields if field != 'all_text']
df_all_merged['all_text'] = df_all_merged[base_text_fields].sum(axis=1)
text_fields = base_text_fields + ['all_text']

In [24]:
from gensim.models import Word2Vec

Couldn't import dot_parser, loading of dot files will not be possible.


In [25]:
from tqdm import tqdm

In [26]:
# Pre-trained Word2Vec model, loaded from the current working directory.
model = Word2Vec.load('home_depot_w2v.bin')
# Vector length, read off the vector of the word 'cat'; the one-element unpacking
# requires that vector to be one-dimensional.
# NOTE(review): presumably fails if 'cat' is not in the model's vocabulary -- confirm.
dim, = model['cat'].shape

def to_w2v(val):
    """Turn a token list into one vector: the sum of its known word vectors, scaled to unit length.

    Tokens that are not in ``model`` are ignored. An empty/falsy ``val``, or one
    without any known token, gives the all-zero vector of length ``dim``.
    """
    total = np.zeros(dim)
    if not val:
        return total

    for token in val:
        if token in model:
            total = total + model[token]

    length = np.linalg.norm(total)
    # Leave the zero vector untouched rather than dividing by zero.
    return total / length if length > 0 else total

In [41]:
t0 = time()

# Use a pool created HERE, not the notebook-level `pool`: that one was started before
# to_w2v and model were defined, and with a fork-based start method its workers cannot
# resolve names added to the notebook afterwards, so a fresh top-to-bottom run fails.
w2v_pool = Pool(processes=8)
try:
    # column name -> list with one unit-length (or all-zero) vector per row
    w2v_arrays = {}
    for c in ['search_term'] + text_fields:
        w2v_arrays[c] = process_parallel(w2v_pool, df_all_merged[c], to_w2v)
finally:
    # Always release the worker processes, even if a column fails.
    w2v_pool.close()
    w2v_pool.join()

print('took %0.5fs.' % (time() - t0))

took 175.59549s.


In [53]:
# (n_rows, dim) matrix of query vectors.
search_term_w2v = np.array(w2v_arrays['search_term'])

# One feature column per text field, keyed by the row id.
df_w2v_sim = pd.DataFrame({'id': df_all_merged.id})

for c in tqdm(text_fields):
    col_w2v = np.array(w2v_arrays[c])
    # Row-wise dot product of the query vector and the field vector.
    row_dots = np.sum(search_term_w2v * col_w2v, axis=1)
    df_w2v_sim['w2v_query_' + c] = row_dots



In [55]:
# Persist the id column plus all w2v_query_* features to the current working directory, without the index.
df_w2v_sim.to_csv('w2v_features_full.csv', index=False)