In [None]:
import pandas as pd
# Write the modified training frame to disk and immediately load it back.
# NOTE(review): `modified_train` is not created by any cell visible in this
# notebook export -- confirm which cell/session defines it.
modified_train.to_csv('modified_train.csv')
# to_csv also wrote the row index as the first column of the file;
# slice that column off when reading the data back.
mt = pd.read_csv('modified_train.csv').iloc[:, 1:]

In [13]:
import re
import nltk
import string
from nltk.stem.porter import *
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer

def tokenize(text):
    """
    Tokenize text and return a non-unique list of the words found in it.

    The text is lowercased; every punctuation character, digit, carriage
    return, tab and newline is replaced by a space; the result is split on
    single spaces; tokens shorter than 3 characters or present in
    scikit-learn's ENGLISH_STOP_WORDS collection are dropped.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        The surviving tokens in their original order, duplicates included.
    """
    # Use a set for the stop words: membership is then O(1) per token
    # instead of a linear scan through a list.
    stops = set(stop_words.ENGLISH_STOP_WORDS)
    text = text.lower()
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    # Substitute a space instead of deleting, so neighbouring words do not clump together.
    nopunct = regex.sub(" ", text)
    # Splitting on " " yields empty strings for consecutive spaces;
    # the length filter below removes them together with a, an, to, at, be, ...
    return [w for w in nopunct.split(" ") if len(w) > 2 and w not in stops]


def stemwords(words):
    """
    Stem every token in ``words`` with a PorterStemmer.

    A new list is returned, with one stem per input token in the same
    order; the list passed in is left unchanged.
    """
    porter = PorterStemmer()
    stemmed = []
    for token in words:
        stemmed.append(porter.stem(token))
    return stemmed

def tokenizer(text):
    """
    Split ``text`` into tokens with ``tokenize`` and return those tokens
    after stemming them with ``stemwords``.
    """
    tokens = tokenize(text)
    return stemwords(tokens)


def attrib_stack(attributes):
    """
    Collapse the attribute table to one row per product.

    All columns are gathered into lists per ``product_uid``; the ``value``
    list of each product is joined with commas, run through ``tokenizer``
    and the resulting tokens are joined with commas again.

    Side effects:
      * ``attributes['value']`` is converted to ``str`` in place on the
        frame that was passed in.
      * The aggregated frame is written to 'attrib_per_product.csv'; the
        frame that is returned is the one read back from that file.
    """
    # In-place on purpose: the caller's frame keeps the string-typed column.
    attributes['value'] = attributes['value'].apply(str)

    per_product = (
        attributes
        .groupby('product_uid')
        .agg(lambda column: column.tolist())
        .reset_index()
    )
    # join -> tokenize/stem -> join, done in a single pass over the column
    per_product['value'] = per_product['value'].apply(
        lambda values: ','.join(tokenizer(','.join(values)))
    )

    per_product.to_csv('attrib_per_product.csv')
    reloaded = pd.read_csv('attrib_per_product.csv')
    # 'Unnamed: 0' is the index column that to_csv wrote.
    return reloaded.drop('Unnamed: 0', axis=1)

def join_attrib(train, attrib_per_product):
    """
    Attach the aggregated attributes to the train dataframe by ``product_uid``.

    Returns a tuple ``(train, attrib_per_product)``:
      * ``train`` joined with the attribute columns, with ``product_uid``
        restored as a regular (leading) column;
      * ``attrib_per_product`` after ``reset_index()``, i.e. with its
        previous index added as an extra leading column.
    """
    attribs_by_uid = attrib_per_product.set_index('product_uid')
    joined = train.set_index('product_uid').join(attribs_by_uid).reset_index()
    return joined, attrib_per_product.reset_index()
    
def search_term_in_attrib(train):
    """
    Count, for every row, how many stemmed search-term tokens also occur
    among the tokens of the product's attribute description.

    ``train`` is modified in place and is also returned:
      * ``value`` (comma-separated token string, possibly missing) is
        replaced by the set of its tokens; a missing value becomes ``{''}``.
      * ``search_term_split`` is added: the set of tokens that ``tokenizer``
        produces for ``search_term``.
      * ``search_term_in_attrib`` is added: the number of tokens the two
        sets have in common.

    Works for a frame of any length and with any index.
    """
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on train['value']: the chained in-place form operates on a temporary
    # object and does not reach the frame under pandas Copy-on-Write.
    train['value'] = train['value'].fillna('').apply(lambda x: set(x.split(',')))
    train['search_term_split'] = train['search_term'].apply(lambda x: set(tokenizer(x)))
    # Pair the two columns positionally; no hard-coded row count and no
    # dependency on the index labels being 0..n-1.
    train['search_term_in_attrib'] = [
        len(terms.intersection(attrs))
        for terms, attrs in zip(train['search_term_split'], train['value'])
    ]
    return train

def color_df(attributes, train):
    """
    Collect the color attributes of every product, join them onto the train
    data and count how many color tokens appear in the search term.

    Rows of ``attributes`` whose ``name`` contains "color" (case-insensitive)
    are gathered per ``product_uid``. Their values are joined with commas,
    '/' is removed and spaces are turned into commas, which gives one token
    per word. The result is joined to ``train`` as a set-valued ``color``
    column; products without color attributes get ``{''}``.

    ``train`` must already contain ``search_term_split`` (an iterable of
    tokens per row). The returned frame is a new frame with these changes:
      * ``color``: set of color tokens,
      * ``search_term``: missing values replaced by '',
      * ``color_in_search_term``: number of tokens shared by ``color`` and
        ``search_term_split``.

    NOTE(review): color tokens are compared verbatim (no lowercasing or
    stemming happens here) -- confirm that ``search_term_split`` uses a
    compatible normalisation.
    """
    is_color = attributes['name'].apply(lambda x: 'color' in str(x).lower())
    attrib_col = attributes[is_color].groupby('product_uid').agg(lambda x: x.tolist())
    attrib_col = (
        attrib_col
        .drop('name', axis=1)
        .reset_index()
        .rename(columns={'value': 'color'})
    )
    # str() keeps the join from failing when a value is not already a string.
    attrib_col['color'] = attrib_col['color'].apply(
        lambda values: ','.join(str(v) for v in values)
        .replace('/', '')
        .replace(' ', ',')
        .replace(',,', ',')
    )

    train = train.set_index('product_uid').join(attrib_col.set_index('product_uid'))
    train = train.reset_index()

    # Assign the filled columns back; fillna(inplace=True) on a selected
    # column does not reach the frame under pandas Copy-on-Write.
    train['color'] = train['color'].fillna('').apply(lambda x: set(x.split(',')))
    train['search_term'] = train['search_term'].fillna('')

    # Positional pairing: works for any number of rows.
    train['color_in_search_term'] = [
        len(colors.intersection(terms))
        for colors, terms in zip(train['color'], train['search_term_split'])
    ]

    return train
    
    

In [14]:
def search_title_lev_dist(train):
    """
    Calculate the minimum Levenshtein distance between the words of the
    search term and the words of the product title.

    For every row, each search-term word is compared with each distinct
    title token (as produced by ``tokenize``):
      * if the search word is a substring of the title token, the pair
        counts as distance 0;
      * otherwise, if both start with the same character, the pair counts
        as their Levenshtein distance;
      * all other pairs are ignored.
    The smallest value over all pairs is stored in
    ``min_levenstein_dist_title``; rows without any candidate pair get 1000.

    Side effect: the incoming frame is written to
    'train_with_search_in_attrib.csv' and the function continues with (and
    returns) the frame read back from that file.

    Added/overwritten columns: ``product_title_clean``, ``search_term``
    (missing values -> ''), ``search_term_split``,
    ``min_levenstein_dist_title``.

    NOTE(review): search words are compared as typed, while title tokens
    come from ``tokenize`` -- confirm whether the search term should be
    normalised the same way.
    """
    from Levenshtein import distance

    no_match_distance = 1000  # sentinel for rows without any candidate pair

    # CSV round-trip kept from the original implementation.
    # NOTE(review): presumably this normalises the frame (fresh index,
    # object columns come back as text) -- confirm before removing it.
    train.to_csv('train_with_search_in_attrib.csv')
    train = pd.read_csv('train_with_search_in_attrib.csv')
    train = train.drop(['Unnamed: 0'], axis=1)

    train['product_title_clean'] = train['product_title'].apply(lambda x: list(set(tokenize(x))))
    # Assign back instead of a chained fillna(inplace=True), which does not
    # reach the frame under pandas Copy-on-Write.
    train['search_term'] = train['search_term'].fillna('')
    # split() (not split(' ')) so leading, trailing or repeated spaces never
    # create empty words: an empty word is a substring of every title token
    # (spurious distance 0), and an empty first word used to skip the row.
    train['search_term_split'] = train['search_term'].apply(lambda x: x.split())

    def closest_match(terms, words):
        """Smallest candidate distance for one row, or the sentinel if there is none."""
        candidates = [
            0 if term in word else distance(term, word)
            for term in terms
            for word in words
            if term in word or term[:1] == word[:1]
        ]
        return min(candidates) if candidates else no_match_distance

    # Iterate the two columns positionally: any number of rows, and no
    # repeated Series look-ups inside the nested loops.
    train['min_levenstein_dist_title'] = [
        closest_match(terms, words)
        for terms, words in zip(train['search_term_split'], train['product_title_clean'])
    ]

    return train

In [19]:
def search_brand_lev_dist(train, attributes):
    """
    Filter the brand out of the attributes, attach it to the train data and
    calculate the minimum Levenshtein distance between the words of the
    search term and the words of the brand.

    Brand rows are the rows of ``attributes`` whose ``name`` contains
    "brand" (case-insensitive) and whose ``value`` is not null. For each
    product the first such row is used, tokenized with ``tokenize``.

    For every row of ``train``, each search-term word is compared with each
    brand token:
      * if the search word is a substring of the brand token, the pair
        counts as distance 0;
      * otherwise, if both start with the same character, the pair counts
        as their Levenshtein distance;
      * all other pairs are ignored.
    The smallest value is stored in ``min_levenstein_dist_brand``; rows
    without any candidate pair (including products without a brand) get 1000.

    ``train`` is modified in place and also returned. Added/overwritten
    columns: ``brand`` (list of tokens, empty when unknown), ``search_term``
    (missing values -> ''), ``search_term_split``,
    ``min_levenstein_dist_brand``.
    """
    from Levenshtein import distance

    no_match_distance = 1000  # sentinel for rows without any candidate pair

    is_brand = attributes['name'].str.lower().str.contains('brand', na=False)
    attr_brand = attributes[is_brand & attributes['value'].notnull()]

    # First brand row per product wins. Keys are cast to int so they compare
    # equal to the integer product ids in train; iterating the filtered
    # frame avoids assigning into a slice of `attributes`.
    brand_tokens = {}
    for uid, brand in zip(attr_brand['product_uid'], attr_brand['value']):
        uid = int(uid)
        if uid not in brand_tokens:
            brand_tokens[uid] = tokenize(str(brand))

    train['brand'] = train['product_uid'].apply(lambda uid: brand_tokens.get(uid, []))
    # Assign back instead of a chained fillna(inplace=True), which does not
    # reach the frame under pandas Copy-on-Write.
    train['search_term'] = train['search_term'].fillna('')
    # split() (not split(' ')) so leading, trailing or repeated spaces never
    # create empty words: an empty word is a substring of every brand token
    # (spurious distance 0), and an empty first word used to skip the row.
    train['search_term_split'] = train['search_term'].apply(lambda x: x.split())

    def closest_match(terms, words):
        """Smallest candidate distance for one row, or the sentinel if there is none."""
        candidates = [
            0 if term in word else distance(term, word)
            for term in terms
            for word in words
            if term in word or term[:1] == word[:1]
        ]
        return min(candidates) if candidates else no_match_distance

    # Iterate the two columns positionally: any number of rows, any index.
    train['min_levenstein_dist_brand'] = [
        closest_match(terms, words)
        for terms, words in zip(train['search_term_split'], train['brand'])
    ]

    return train

### Combine the attributes per product

In [8]:
# Load the raw attribute table and collapse it to one row per product_uid.
# Note: attrib_stack also converts attributes['value'] to str in place and
# writes 'attrib_per_product.csv' as a side effect.
attributes = pd.read_csv('attributes.csv', encoding='ISO-8859-1')
attrib_per_product = attrib_stack(attributes)

### Join the attributes with train data

In [10]:
# Load the training data and attach each product's aggregated attributes
# (joined on product_uid). join_attrib returns both frames.
train = pd.read_csv('train.csv', encoding='ISO-8859-1')
train, attrib_per_product = join_attrib(train, attrib_per_product)

### Look for search term in attributes

In [11]:
# Turns 'value' into a token set, adds 'search_term_split' and the
# 'search_term_in_attrib' overlap count.
train = search_term_in_attrib(train)

### Select the color attributes from the attributes data frame, check for their presence in the search term, and compute the Levenshtein distance between the search term and the product title

In [None]:
# color_df reads train['search_term_split'], so the search_term_in_attrib
# cell has to run before this one.
train = color_df(attributes, train)
# Adds 'min_levenstein_dist_title'; this call also writes
# 'train_with_search_in_attrib.csv' internally.
train = search_title_lev_dist(train)

### Save csv

In [16]:
# Checkpoint the current train frame (same path that search_title_lev_dist
# writes to internally).
train.to_csv('train_with_search_in_attrib.csv')


### Find the Levenshtein distance between the search term and the product brand

In [20]:
# Adds the 'brand' token column and 'min_levenstein_dist_brand', and
# recomputes 'search_term_split'.
train = search_brand_lev_dist(train, attributes)

### Save csv

In [21]:
# Overwrite the checkpoint file with the current train frame.
train.to_csv('train_with_search_in_attrib.csv')


### Drop redundant columns, save only the target variable and features

In [22]:
# Build the modelling frame: `train` without its identifier, raw-attribute
# and intermediate token columns. `train` itself is left untouched.
train_temp = train.drop(
    columns=['id','name','value','search_term_split','color','product_title_clean','brand']
)

### Save csv

In [230]:
# Save train_temp as produced by the drop above; a later cell writes to
# this same path again.
train_temp.to_csv('train_with_distance_metrics.csv')

### Invert the Levenshtein distance: the higher the similarity, the higher the inverted value

In [237]:
# Replace each title distance by its reciprocal; a distance of 0 maps to 10.
# The column is overwritten, so running this cell twice inverts twice.
train_temp['min_levenstein_dist_title'] = train_temp['min_levenstein_dist_title'].map(
    lambda dist: 10 if dist == 0 else 1/dist
)

In [238]:
# Replace each brand distance by its reciprocal; a distance of 0 maps to 10.
# The column is overwritten, so running this cell twice inverts twice.
train_temp['min_levenstein_dist_brand'] = train_temp['min_levenstein_dist_brand'].map(
    lambda dist: 10 if dist == 0 else 1/dist
)

### Save csv

In [240]:
# Write train_temp again to the same path; this replaces the file saved
# by the earlier cell.
train_temp.to_csv('train_with_distance_metrics.csv')