In [1]:
import pandas as pd

In [2]:
# Load the training rows. Later cells read the 'product_uid', 'product_title'
# and 'search_term' columns from this frame and drop its 'Unnamed: 0' column.
train = pd.read_csv('train_with_distance_metrics.csv')

In [9]:
# Load the product descriptions. description_column() joins this frame to
# train on 'product_uid' and reads its 'product_description' column.
product = pd.read_csv('product_descriptions.csv')

In [5]:
import sys
import re
import nltk
from nltk.stem.porter import *
from sklearn.feature_extraction import stop_words
import xml.etree.cElementTree as ET
from collections import Counter
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import zipfile
import os

# NOTE(review): PARTIALS is not referenced by any other visible cell —
# confirm whether it is still needed.
PARTIALS = False

def gettext(xmltext):
    """
    Parse xmltext and return the text from <title> and <text> tags.

    xmltext is an XML document given as a str. Non-ASCII characters are
    dropped before parsing. The result is the text of every <title> child
    of the root element followed by the text of every direct child of any
    <text> element, joined by single spaces. Elements that carry no text
    (for example an empty <p/>) are skipped.

    Raises xml.etree.ElementTree.ParseError if xmltext is not well formed.
    """

    xmltext = xmltext.encode('ascii', 'ignore') # ensure there are no weird char
    root = ET.fromstring(xmltext)
    text = []
    for elem in root.iterfind('title'):
        text.append(elem.text)
    for elem in root.iterfind('.//text/*'):
        text.append(elem.text)
    # elem.text is None for an element without text; joining None would
    # raise TypeError, so keep only real strings.
    text = ' '.join(piece for piece in text if piece is not None)

    return text


def tokenize(text):
    """
    Tokenize text and return a non-unique list of tokenized words
    found in the text. Normalize to lowercase, strip punctuation,
    remove stop words, drop words of length < 3, strip digits.

    text is a str. The returned list keeps the words in the order in
    which they appear in text.
    """
    # ENGLISH_STOP_WORDS is a frozenset; test membership on it directly
    # instead of copying it into a list and scanning that list per word.
    stops = stop_words.ENGLISH_STOP_WORDS
    text = text.lower()
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    nopunct = regex.sub(" ", text)  # delete stuff but leave at least a space to avoid clumping together
    # Splitting on a single space can yield empty strings; the length
    # filter below discards them.
    words = nopunct.split(" ")
    words = [w for w in words if (len(w) > 2 and (w not in stops))]  # ignore a, an, to, at, be, ...

    return words



def stemwords(words):
    """
    Stem every token in words with a PorterStemmer and return the
    stems as a new list, in the same order as the input.
    """
    porter = PorterStemmer()
    stems = []
    for token in words:
        stems.append(porter.stem(token))
    return stems

def tokenizer(text):
    """
    Split text into words with tokenize() and return those words
    after stemming them with stemwords().
    """
    tokens = tokenize(text)
    return stemwords(tokens)


def compute_tfidf(corpus):
    """
    Fit a TfidfVectorizer on the articles held in corpus and return
    the fitted vectorizer. corpus is a dictionary mapping file name to
    xml text; only its values are used for fitting.
    """
    articles = list(corpus.values())
    options = dict(
        input='content',
        analyzer='word',
        preprocessor=gettext,   # extract the text from each xml document
        tokenizer=tokenizer,    # tokenize, then stem
        stop_words='english',
        decode_error='ignore',
    )
    vectorizer = TfidfVectorizer(**options)
    vectorizer.fit(articles)
    return vectorizer


In [4]:
def description_column(train, product):
    """
    Add the product description from product df to train df.
    Concatenate Title and description to form total_description column.

    train must have 'product_uid' and 'product_title' columns; an
    'Unnamed: 0' column is dropped when present. product must have
    'product_uid' and 'product_description' columns.

    Returns a new DataFrame (neither input is modified) holding the
    train rows, the joined 'product_description' and a new
    'total_description' column: title and description separated by one
    space. A missing title or description is treated as an empty string.
    """
    # errors='ignore' keeps the function usable on a frame that has
    # already lost the column (for example when the cell is re-run).
    train = train.drop('Unnamed: 0', axis = 1, errors = 'ignore')
    train = train.set_index('product_uid').join(product.set_index('product_uid'))
    train = train.reset_index()
    title = train['product_title'].fillna('')
    description = train['product_description'].fillna('')
    # Without the separator the last word of the title and the first word
    # of the description would fuse into a single token.
    train['total_description'] = (title + ' ' + description).str.strip()
    return train
    
    

In [6]:
# Rebind train to the frame returned by description_column(), which joins the
# product descriptions onto it and adds the 'total_description' column.
train = description_column(train, product)

# Select product_uid and total_description, then remove duplicate rows

In [7]:
# One row per distinct (product_uid, total_description) pair.
train_temp = train[['product_uid', 'total_description']].drop_duplicates()

# Create tf-idf vectors for each product description and keep the 5 words with the highest tf-idf scores for each product. This could help identify words that are unique to a product. CAUTION: The code below runs for 3-4 hrs

In [48]:
# Selection parameters for the per-product top words.
MIN_TFIDF_SCORE = 0.09      # words scoring below this are never kept
TOP_WORDS_PER_PRODUCT = 5   # maximum number of words kept per product

tfidf = TfidfVectorizer(input='content',
                        analyzer='word',
                        tokenizer=tokenizer,
                        stop_words='english',
                        decode_error='ignore')
total_description = list(train_temp['total_description'])
tfidf.fit(total_description)

# The vocabulary is fixed once fit() has run, so build the column -> word
# lookup a single time instead of once per description.
# get_feature_names_out() replaces get_feature_names() in newer scikit-learn
# releases; use whichever this installation provides.
get_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
feature_names = get_names()

p = []
for description in total_description:
    # transform() yields a 1 x n_features sparse row; its COO form exposes the
    # stored (column, score) pairs directly, avoiding per-element indexing.
    response = tfidf.transform([description]).tocoo()
    t = [(feature_names[col], score)
         for col, score in zip(response.col, response.data)
         if score >= MIN_TFIDF_SCORE]
    # Highest score first; the sort is stable, so ties keep column order.
    t.sort(key=lambda x: x[1], reverse=True)
    p.append(t[0:TOP_WORDS_PER_PRODUCT])

# One list of (word, score) pairs per row of train_temp.
train_temp['tfidf'] = p

TfidfVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function tokenizer at 0x110ede400>, use_idf=True,
        vocabulary=None)

In [106]:
def get_words(x):
    """
    Return the first item of every entry in x, in order, skipping
    entries whose first item equals an empty list. For a list of
    (word, score) pairs this yields just the words.
    """
    return [entry[0] for entry in x if entry[0] != []]

In [None]:
# what are train_temp and train supposed to represent?
# is train_temp just the product_uid and total_description columns?
# why are the dataframes being joined?
# is this just adding tfidf and num_words_in_description to the training set?
def add_cols_to_train(train_temp, train):
    """
    Add the top tf-idf words of each product and the description word
    count to the training rows.

    train_temp must have 'product_uid' and 'tfidf' columns, where every
    'tfidf' entry is a list of (word, score) pairs. train must have
    'product_uid' and 'total_description' columns.

    Returns a new DataFrame (neither input is modified): train joined
    with train_temp on product_uid, with
      'tfidf'                    - list of the product's top words (an
                                   empty list when it has none or has no
                                   row in train_temp)
      'num_words_in_description' - number of tokens tokenize() finds in
                                   'total_description'
    """
    # Build the word lists on a fresh two-column frame. Overwriting
    # train_temp['tfidf'] in place would make a second call operate on
    # already-stripped lists and return first letters instead of words.
    top_words = train_temp[['product_uid', 'tfidf']].assign(
        tfidf=train_temp['tfidf'].apply(get_words))

    train = train.set_index('product_uid').join(top_words.set_index('product_uid'))
    train = train.reset_index()

    # Rows without a match come back from the join as NaN; give every row
    # its own list so that later set operations work on all of them.
    train['tfidf'] = train['tfidf'].apply(
        lambda x: list(x) if isinstance(x, list) else [])
    train['num_words_in_description'] = train['total_description'].apply(lambda x: len(tokenize(x)))

    return train

    

# Add tfidf column to train

In [None]:
# Rebind train to the frame returned by add_cols_to_train(), which carries the
# 'tfidf' and 'num_words_in_description' columns.
train = add_cols_to_train(train_temp, train)

In [None]:
def num_stop_words(x):
    """
    Return how many of the words in x are English stop words.

    x is an iterable of word strings. Words are lower-cased before the
    lookup, matching tokenize(), which lower-cases its text before
    removing stop words.
    """
    # Look words up in the stop-word set directly; copying it into a list
    # first would turn every lookup into a linear scan.
    stops = stop_words.ENGLISH_STOP_WORDS
    return sum(1 for w in x if w.lower() in stops)

In [None]:
# Features derived from the raw search term: the first two split on single
# spaces, the third uses the stemming tokenizer.
train['num_stop_words'] = train['search_term'].map(lambda term: num_stop_words(term.split(' ')))
train['num_search_words'] = train['search_term'].map(lambda term: len(term.split(' ')))
train['search_term_split'] = train['search_term'].map(tokenizer)

In [None]:
def find_tfidf_words_in_search(train):
    """
    Count, for every row, how many distinct words occur both in the
    row's 'search_term_split' entry and in its 'tfidf' entry.

    train must have 'search_term_split' and 'tfidf' columns whose
    entries are iterables of words. The counts are written to a new
    'tfidf_search_common' column on train itself, and the same frame is
    returned.
    """
    # Pair the two columns by position. Indexing them with 0..n-1 is a
    # label lookup and fails whenever the index is not the default range.
    train['tfidf_search_common'] = [
        len(set(search_words).intersection(tfidf_words))
        for search_words, tfidf_words in zip(train['search_term_split'], train['tfidf'])
    ]

    return train
        

In [None]:
# Adds the 'tfidf_search_common' column; the function returns the frame it
# was given, so this rebinding keeps the same train object.
train = find_tfidf_words_in_search(train)

In [122]:
def num_attrib_per_product(attributes):
    """
    Find the number of attributes per product.

    attributes must have 'product_uid', 'name' and 'value' columns; the
    caller's frame is left unmodified.

    Returns a new DataFrame with one row per product_uid and at least
    these columns:
      product_uid - the product id as int
      name        - list of the product's attribute names
      attribs     - the product's attribute values after tokenizer(),
                    joined by spaces
      num_attrib  - number of attribute rows for the product
    """
    def clean_value(x):
        # str() would turn a missing value into the token 'nan'.
        raw = '' if pd.isnull(x) else str(x)
        return ' '.join(tokenizer(raw))

    # assign() returns a new frame, so the caller's 'value' column keeps
    # its raw text and the function can be called again on the same input.
    cleaned = attributes.assign(value=attributes['value'].apply(clean_value))
    attrib_per_product = cleaned.groupby('product_uid').agg(lambda x: x.tolist())
    attrib_per_product = attrib_per_product.reset_index()
    attrib_per_product['value'] = attrib_per_product['value'].apply(' '.join)
    attrib_per_product['num_attrib'] = attrib_per_product['name'].apply(len)
    # rename() returns a new frame; the result has to be kept for the
    # rename to take effect.
    attrib_per_product = attrib_per_product.rename(columns = {'value':'attribs'})
    attrib_per_product['product_uid'] = attrib_per_product['product_uid'].astype(int)

    return attrib_per_product
    
    

In [None]:
# Read the attributes table (decoded as ISO-8859-1) and summarise it into one
# row per product with num_attrib_per_product().
attributes = pd.read_csv('attributes.csv', encoding='ISO-8859-1')
attrib_per_product = num_attrib_per_product(attributes)

In [None]:
# Left-join the per-product attribute summary onto train by product_uid so
# that every training row carries its product's attribute columns, including
# 'num_attrib'.
train = train.set_index('product_uid').join(attrib_per_product.set_index('product_uid'))
train = train.reset_index()
# A product with no row in attrib_per_product has no match in the left join;
# record it as having zero attributes instead of NaN.
train['num_attrib'] = train['num_attrib'].fillna(0).astype(int)

In [10]:
# Replace the in-memory train frame with a saved feature file and load the
# combined feature frame.
# NOTE(review): no visible cell writes 'train_with_tfidf.csv', and the second
# path is an absolute path on one machine — confirm both before re-running.
train = pd.read_csv('train_with_tfidf.csv')
combined = pd.read_csv('/Users/congchen/Downloads/combined_df-v3.csv')

In [15]:
# Drop the 'Unnamed: 0' column (presumably a row index saved by an earlier
# to_csv call — confirm). errors='ignore' lets the cell be re-run after the
# column is already gone instead of raising KeyError.
train = train.drop('Unnamed: 0', axis = 1, errors = 'ignore')
combined = combined.drop('Unnamed: 0', axis = 1, errors = 'ignore')

In [35]:
# Keep the three columns later used as merge keys (product_uid, search_term,
# relevance) together with the engineered feature columns.
train_temp = train[['product_uid','search_term','relevance', 'num_words_in_description','num_stop_words', 'num_search_words', 'tfidf_search_common','num_attrib']]

In [37]:
# Remove exact duplicate rows, then display how many rows remain.
train_temp = train_temp.drop_duplicates()
len(train_temp)

74071

In [40]:
# Right join: keep every row of train_temp and attach the matching columns of
# combined, matching on product_uid, search_term and relevance.
new_df = combined.merge(
    train_temp,
    how='right',
    on=['product_uid', 'search_term', 'relevance'],
)

In [43]:
# Write the merged frame to disk. index=False is not passed, so the row
# index is written as an extra leading column.
new_df.to_csv('final_combined.csv')