# Final Project Check-in 2018-11-16

## Group Name: Lambda

### Student Names
1. Jian Wang
2. Chong Geng
3. Alan Perry
4. Divya Bhargavi
5. Robert Sandor

## Load Data

In [17]:
from collections import defaultdict, Counter
from Levenshtein import distance
import lzma
import math
from math import sqrt
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import nltk
from nltk.stem.porter import *
import operator
import os
import re
from scipy import spatial
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import string
import sys
import time
import xml.etree.cElementTree as ET
import zipfile

In [2]:
def make_dictionary(file):
    '''
    Initiate the glove model as a dictionary
    input: A String which is a file in the project directory
    returns: A dictionary with item = word : 300 d list

    :param file:            the filepath string of the dictionary
    :returns:               a dictionary with words as keys 
                            and 300d vectors as values
    '''
    vecs = defaultdict(lambda: np.zeros(shape=(300, 1)))
    with open(file) as f:
        lines = f.readlines()
        for word_and_vec in lines:
            elems = word_and_vec.strip().split(' ')
            word = elems[0]
            vec = np.array(elems[1:], dtype=float)
            vecs[word] = vec
    return vecs

In [3]:
def split_dictionary():
    """
    firstly, I split the dictionary into a wordlist and a matrix.
    returns a list of words and 
    a 2d matrix of the normalized word vectors

    :returns:               the words and matrix associated with
                            the glove dictionary
    """
    wordlist = []
    matrix = []
    with open(glove_file) as f:
        lines = f.readlines()
        for word_and_vec in lines:
            wordvec = np.array([float(x) for x in word_and_vec.split()[1:]])
            matrix.append(wordvec / np.linalg.norm(wordvec))
            wordlist.append(word_and_vec.split()[0])
        matrix = np.array(matrix)
    return wordlist, matrix


def unique_words(train_df):
    """
    I then obtain the unique words that appear in the search_term.

    :param train_df:        the training set Pandas dataframe
    :returns:               a list of unique words from search terms
                            that have been stripped of numbers, symbols, etc.
    """
    cleaned = list(train_df['cleaned_terms'])
    all_words = []
    for t in cleaned:
        all_words += t.split(' ')

    return list(set(all_words))[1:]


def find_nearest_neighbors(filename, cleaned_set, matrix, wordlist, dictionary):
    """
    here I count the cos_distance of each word that is in the cleaned_set.
    the output file looks like (each line): w0, w1, w2, w3, w4,
    i didn't print the distance, just the neighbour words
    this will take couple of minutes.

    :param filename:        a string representing the filename to write to
    :param clenaed_set:     a list of search terms that have 
                            been stripped of numbers, symbols, etc.
    :param matrix:          a 2d Numpy array of the word vectors in wordlist
    :param wordlist:        a list of words from the glove dictionary
    :param dictionary:      a dictionary with words as keys 
                            and 300d vectors as values
    """
    output_string = ''

    for word in cleaned_set:
        dots = matrix.dot(dictionary[word])
        close_index_vec = np.argsort(dots)
        for i in range(5):
            output_string += wordlist[int(close_index_vec[-1-i])] + ','
        output_string += '\n'

    f = open(filename, "w")
    f.write(output_string)
    f.close()


def get_all_terms_neighbors(dictionary, cleaned):
    """
    terms_neighbour is the list which stores the top 4 neighbours of each searching_terms. 
    for example, if the searching term is: cleaned[0]='w1_w2', 
    then the terms_neighbour[0]='n11_n12_n13_n14_n21_n22_n23_n24'.

    :param dictionary:      a dictionary
    :param cleaned:         a list of search terms that have
                            been stripped of numbers, symbols, etc.
    :returns:               a list of concatenated words that are neighbors
                            of the 'cleaned' terms
    """
    terms_neighbour = []
    for i in range(len(cleaned)):
        neighbours = ''
        if cleaned[i] != '':
            words = cleaned[i].split(' ')
            for w in words:
                neighbours = neighbours + dictionary[w] + ' '
        terms_neighbour.append(neighbours)
    return terms_neighbour


def build_dictionary(file):
    """
    based on the above output file, I then built a dictionary;
    this dictionary stores each word (as key) 
    with its top 4 neighbour words (as value) 

    :param file:            the file containing the list of strings of neighbors
    :returns:               a dictionary with words as keys 
                            and 4 neighbors of that word as values
    """
    k_dic = defaultdict(lambda: '')
    with open(file) as f:
        lines = f.readlines()
        for line in lines:
            words = line.strip().split(',')
            k_dic[words[0]] = words[1] + ' ' + \
                words[2] + ' ' + words[3] + ' ' + words[4]
    return k_dic


def clean_term_in_doc(terms, title):
    """
    This cleans the given terms in the specified document

    :param terms:           a list of unique search terms
    :param title:           a list of titles of products
    :return:                a list of the counts of the 
                            cleaned terms within a product's title
    """
    count = np.zeros(len(terms))
    for i in range(len(terms)):
        if not pd.isnull(terms[i]):
            title[i] = title[i].lower()
            for term in terms[i].split(' '):
                if term in title[i].split(' '):
                    count[i] += 1
    return count


def get_length(column):
    """
    This calculates and returns the number of words
    for each row in a specified column

    :param column:          the feature/attribute which
                            will have its words counted
    :returns:               a column with the count of 
                            words in each string
    """
    length = np.zeros(len(column))
    for index in range(len(column)):
        if not pd.isnull(column[index]):
            length[index] = len(column[index].split(' '))
    return length


def tokenize(text):
    """
    Tokenize text and return a non-unique list of tokenized words
    found in the text. Normalize to lowercase, strip punctuation,
    remove stop words, drop words of length < 3, strip digits.

    :param text:            a string
    :returns:               the same string stripped of numbers,
                            tabs, newline characters, and punctuation
    """
    stops = list(stop_words.ENGLISH_STOP_WORDS)
    text = text.lower()
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    # delete stuff but leave at least a space to avoid clumping together
    nopunct = regex.sub(" ", text)
    words = nopunct.split(" ")
    # ignore a, an, to, at, be, ...
    words = [w for w in words if (len(w) > 2 and (w not in stops))]
    return words


def stemmed(words):
    """
    Stem a tokenized text and return a non-unique list of stemmed words
    found in the text. This is based on the output of function
    tokenize(text).

    :param text:            a list of tokenized words
    :returns:               a list of stemmed words
    """
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in words]


def lemmatized(words):
    """
    lemmatize a tokenized text and return a non-unique list of stemmed words
    found in the text. This is based on the output of function
    tokenize(text).

    :param text:            a list of tokenized words
    :returns:               a list of lemmatized words
    """
    lemmatized_words = [nltk.stem.WordNetLemmatizer().lemmatize(w)
                        for w in words]
    return lemmatized_words

In [4]:
def tokenizer(text):
    return stemmed(tokenize(text))


def attrib_stack(attributes):
    """
    Aggregate all the features of a product into a single description
    and return a dataframe with product id and description that is tokenized.
    """
    attributes['value'] = attributes['value'].apply(lambda x: str(x))
    attrib_per_product = attributes.groupby(
        'product_uid').agg(lambda x: x.tolist())
    attrib_per_product = attrib_per_product.reset_index()
    attrib_per_product['value'] = attrib_per_product['value'].apply(
        lambda x: ','.join(x))
    attrib_per_product['value'] = attrib_per_product['value'].apply(
        lambda x: tokenizer(x))
    attrib_per_product['value'] = attrib_per_product['value'].apply(
        lambda x: ','.join(x))
    attrib_per_product.to_csv('attrib_per_product.csv')
    attrib_per_product = pd.read_csv('attrib_per_product.csv')
    attrib_per_product = attrib_per_product.drop('Unnamed: 0', axis=1)
    return attrib_per_product


def join_attrib(train, attrib_per_product):
    """
    Join the aggregated attributes to the train dataframe
    """
    train = train.set_index('product_uid').join(
        attrib_per_product.set_index('product_uid'))
    train = train.reset_index()
    attrib_per_product = attrib_per_product.reset_index()
    return train, attrib_per_product


def search_term_in_attrib(train):
    """
    Convert the search term (stemmed) and attributes description to a set of words
    and find the number of common terms between both in the column search_term_in_attrib.
    """
    train['value'].fillna('', inplace=True)
    train['value'] = train['value'].apply(lambda x: set(x.split(',')))
    train['search_term_split'] = train['search_term'].apply(
        lambda x: set(tokenizer(x)))
    search_term_in_attrib = []
    for i in range(len(train)):
        p = len(train['search_term_split'][i].intersection(train['value'][i]))
        search_term_in_attrib.append(p)
    train['search_term_in_attrib'] = search_term_in_attrib
    return train


def color_df(attributes, train):
    """
    Find the attributes for color per product, join it with train data and 
    check for match in the search term
    """
    attrib_col = attributes[attributes['name'].apply(
        lambda x: 'color' in str(x).lower())]
    attrib_col = attrib_col.groupby('product_uid').agg(lambda x: x.tolist())
    attrib_col = attrib_col.drop('name', axis=1)
    attrib_col = attrib_col.reset_index()
    attrib_col = attrib_col.rename(columns={'value': 'color'})

    attrib_col['color'] = attrib_col['color'].apply(lambda x: ','.join(x))
    attrib_col['color'] = attrib_col['color'].apply(
        lambda x: ','.join(x.replace('/', '').replace(' ', ',').split(',')).replace(',,', ','))

    train = train.set_index('product_uid').join(
        attrib_col.set_index('product_uid'))
    train = train.reset_index()
    attrib_col = attrib_col.reset_index()
    train['color'].fillna('', inplace=True)
    train['search_term'].fillna('', inplace=True)
    train['color'] = train['color'].apply(lambda x: set(x.split(',')))

    color_in_search_term = []
    for i in range(len(train)):
        p = len(train['color'][i].intersection(train['search_term_split'][i]))
        color_in_search_term.append(p)
    train['color_in_search_term'] = color_in_search_term

    return train


def search_title_lev_dist(train):
    """
    Calculate Levenshtein distance between search term and the product title
    """
    train.to_csv('train_with_search_in_attrib.csv')
    train = pd.read_csv('train_with_search_in_attrib.csv')
    train = train.drop(['Unnamed: 0'], axis=1)
    train['product_title_clean'] = train['product_title'].apply(
        lambda x: list(set(tokenize(x))))
    train['search_term'].fillna('', inplace=True)
    train['search_term_split'] = train['search_term'].apply(
        lambda x: x.split(' '))

    p = []
    for i in range(len(train)):
        q = []
        if len(train['search_term_split'][i][0]) > 0:
            for j in range(len(train['search_term_split'][i])):
                for k in range(len(train['product_title_clean'][i])):
                    if train['search_term_split'][i][j] in train['product_title_clean'][i][k]:
                        q.append((train['product_title_clean'][i]
                                  [k], train['product_title_clean'][i][k]))
                        continue
                    elif train['search_term_split'][i][j][0] == train['product_title_clean'][i][k][0]:
                        q.append((train['search_term_split'][i][j],
                                  train['product_title_clean'][i][k]))
        p.append(q)

    l = []
    for i in range(len(p)):
        q = []
        for j in range(len(p[i])):
            q.append(distance(p[i][j][0], p[i][j][1]))
        l.append(q)

    m = []
    for q in l:
        if q == []:
            m.append(1000)
        else:
            m.append(min(q))

    train['min_levenstein_dist_title'] = m

    return train


def search_brand_lev_dist(train, attributes):
    """
    Filter out the brand from attributes, join it with train data.
    Calculate Levenshtein distance between search term and the brand
    """
    attr_brand = attributes[(attributes['name'].str.lower().str.contains(
        'brand') == True) & attributes['value'].notnull()]
    attr_brand = attr_brand.drop('name', axis=1)
    attr_brand = attr_brand.rename(columns={'value': 'brand'})
    attr_brand['product_uid'] = attr_brand['product_uid'].apply(
        lambda x: int(x))

    d = defaultdict(list)
    p = list(attr_brand['product_uid'])
    b = list(attr_brand['brand'])
    for i in range(len(p)):
        if p[i] not in d:
            d[p[i]] = tokenize(b[i])
        else:
            continue
    train['brand'] = train['product_uid'].apply(lambda x: d[x])
    train['brand'].fillna('', inplace=True)
    train['search_term'].fillna('', inplace=True)
    train['search_term_split'] = train['search_term'].apply(
        lambda x: x.split(' '))

    p = []
    for i in range(len(train)):
        q = []
        if len(train['search_term_split'][i][0]) > 0:
            for j in range(len(train['search_term_split'][i])):
                for k in range(len(train['brand'][i])):
                    if train['search_term_split'][i][j] in train['brand'][i][k]:
                        q.append((train['brand'][i][k], train['brand'][i][k]))
                        continue
                    elif train['search_term_split'][i][j][0] == train['brand'][i][k][0]:
                        q.append((train['search_term_split']
                                  [i][j], train['brand'][i][k]))
        p.append(q)

    l = []
    for i in range(len(p)):
        q = []
        for j in range(len(p[i])):
            q.append(distance(p[i][j][0], p[i][j][1]))
        l.append(q)

    m = []
    for q in l:
        if q == []:
            m.append(1000)
        else:
            m.append(min(q))

    train['min_levenstein_dist_brand'] = m

    return train

In [5]:
def letter_prob(phrases):
    """
    :param phrases:         a list of strings of text
    :returns:               a list of dictionaries of probabilities for characters in the text 
    """
    letter_counters = []
    for phrase in phrases:
        letter_count = defaultdict(lambda: 0)
        for char in phrase:
            if char.isalpha():
                if char in letter_count:
                    letter_count[char] += 1
                else:
                    letter_count[char] = 1
        letter_counters.append(letter_count)

        total_count = float(sum(list(letter_count.values())))

        for key in letter_count.keys():
            letter_count[key] = letter_count[key] / total_count

    return letter_counters


def calculate_entropy(probs_list):
    """
    :param probs_list:      a list of dictionaries in which the values are probabilities
    :returns:               a list of entropies calculated for the given probs_list
    """
    entropies = []
    for distribution in probs_list:
        entropy = 0
        for key in distribution.keys():
            entropy += distribution[key] * math.log2(distribution[key])
        entropy *= -1
        entropies.append(entropy)
    return entropies


def longest_common_subsequence(X, Y):
    """
    :param X:               a list of strings of text
    :param Y:               a list of strings of text
    :returns:               a list of the integer length of the longest common subsequence 
                            between the strings
    """
    lcs = []

    for idx, x in enumerate(X):
        m = len(x)
        n = len(Y[idx])

        L = [[None]*(n+1) for i in range(m+1)]

        for i in range(m+1):
            for j in range(n+1):
                if i == 0 or j == 0:
                    L[i][j] = 0
                elif x[i-1] == Y[idx][j-1]:
                    L[i][j] = L[i-1][j-1]+1
                else:
                    L[i][j] = max(L[i-1][j], L[i][j-1])
        lcs.append(L[m][n])

    return lcs


def calculate_jaccard_index(text_1, text_2):
    """
    :param text_1:         a list of strings of text
    :param text_2:         a second list of strings of text
    :returns:              a list of jaccard indices (intersection of words / union of words)
                           between the strings of text provided
    """
    jaccard_indices = []
    for text in zip(text_1, text_2):
        tokens_1 = set(tokenize(text[0]))
        tokens_2 = set(tokenize(text[1]))
        intersection_ = tokens_1.intersection(tokens_2)
        union_ = tokens_1.union(tokens_2)
        jaccard_indices.append(
            len(list(intersection_)) / float(len(list(union_))))
    return jaccard_indices

In [6]:
def jaro(s, t):
    s_len = len(s)
    t_len = len(t)

    if s_len == 0 and t_len == 0:
        return 1

    match_distance = (max(s_len, t_len) // 2) - 1

    s_matches = [False] * s_len
    t_matches = [False] * t_len

    matches = 0
    transpositions = 0

    for i in range(s_len):
        start = max(0, i-match_distance)
        end = min(i+match_distance+1, t_len)

        for j in range(start, end):
            if t_matches[j]:
                continue
            if s[i] != t[j]:
                continue
            s_matches[i] = True
            t_matches[j] = True
            matches += 1
            break
    if matches == 0:
        return 0

    k = 0
    for i in range(s_len):
        if not s_matches[i]:
            continue
        while not t_matches[k]:
            k += 1
        if s[i] != t[k]:
            transpositions += 1
        k += 1

    return ((matches / s_len) +
            (matches / t_len) +
            ((matches - transpositions/2) / matches)) / 3


def getJaroScoreOnDocs(query, long_text):
    # transform query and long_text to list of words.
    query_ls = query.split()
    long_text_ls = long_text.split()

    total_J_score = 0
    for i in query_ls:
        j_score_in_i = sum([jaro(i, j)
                            for j in long_text_ls if jaro(i, j) > 0.83])
        total_J_score += j_score_in_i

    return total_J_score


def createJaroCol(df, query_col_name, text_col_name, new_col_name):
    # Could combine title and description as a unit to compute Jaro score.
    # It will be higher but as one score, easy to compute.
    # compute all jscore in a list
    j_score_ls = []
    for i in range(len(df)):
        query = df[query_col_name].iloc[i]
        long_text = df[text_col_name].iloc[i]
        j_score = getJaroScoreOnDocs(query, long_text)
        j_score_ls.append(j_score)
    df[new_col_name] = j_score_ls

    return None


def smith_waterman(a: str, b: str, alignment_score: float = 1, gap_cost: float = 1) -> float:
    """
    Compute the Smith-Waterman alignment score for two strings.
    See https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm#Algorithm
    This implementation has a fixed gap cost (i.e. extending a gap is considered
    free). In the terminology of the Wikipedia description, W_k = {c, c, c, ...}.
    This implementation also has a fixed alignment score, awarded if the relevant
    characters are equal.
    Kinda slow, especially for large (50+ char) inputs.
    """
    # H holds the alignment score at each point, computed incrementally
    H = np.zeros((len(a) + 1, len(b) + 1))
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            # The score for substituting the letter a[i-1] for b[j-1]. Generally low
            # for mismatch, high for match.
            match = H[i-1, j-1] + (alignment_score if a[i-1] == b[j-1] else 0)

            # The scores for for introducing extra letters in one of the strings (or
            # by symmetry, deleting them from the other).
            delete = H[1:i, j].max() - gap_cost if i > 1 else 0
            insert = H[i, 1:j].max() - gap_cost if j > 1 else 0
            H[i, j] = max(match, delete, insert, 0)
    # The highest score is the best local alignment.
    # For our purposes, we don't actually care _what_ the alignment was, just how
    # aligned the two strings were.
    return H.max()


def getSWscore(query, long_text):
    """
    param: query is the search query as a string.
    param: text is the long text to compute the similarity.
    return the number of significant alignment strings in both text. ie. the number of similar terms in query and long_text.
    """
    query_ls = query.split()
    long_text_ls = long_text.split()

    sw_score = []
    for i in query_ls:
        score = sum([smith_waterman(i, j)
                     for j in long_text_ls if smith_waterman(i, j) >= 4.0])
        sw_score.append(score)
    return round(sum(sw_score)/5)


def createSWscoreCol(df, query_col_name, long_text_col_name, new_col_name):
    first_col = df[query_col_name]
    second_col = df[long_text_col_name]

    score_ls = []
    for i in range(len(first_col)):
        score_ls.append(getSWscore(first_col.iloc[i], second_col.iloc[i]))
    df[new_col_name] = score_ls
    return df


def computeNCD(string1, string2):
    """
    params: string1 is the query term
    params: string2 is the word in long-text, like title, description.
    """
    # Get concated strings and transform to bytes-like object for lzma.compress.
    concat_str = string1+string2
    string1 = bytes(string1, 'utf-8')
    string2 = bytes(string2, 'utf-8')
    concat_str = bytes(concat_str, 'utf-8')

    # Get the compressed file for each string.
    str1_comp = lzma.compress(string1)  # compress file 1
    str2_comp = lzma.compress(string2)  # compress file 2
    concat_str_comp = lzma.compress(concat_str)  # compress file concatenated

    # magic happens here
    ncd = (len(concat_str_comp) - min(len(str1_comp), len(str2_comp))) / \
        max(len(str1_comp), len(str2_comp))

    return ncd


def createNCDCol(df, search_name, long_text_name, new_col_name):
    NCD_score_ls = []
    for i in range(len(df)):
        str1 = df[search_name].iloc[i]
        str2 = df[long_text_name].iloc[i]
        NCD_score_ls.append(np.mean([computeNCD(a, b)
                                     for a in str1.split() for b in str2.split()]))
    df[new_col_name] = NCD_score_ls
    return df

In [110]:
PARTIALS = False


def gettext(xmltext):
    """
    Parse xmltext and return the text from <title> and <text> tags
    """

    # ensure there are no weird char
    xmltext = xmltext.encode('ascii', 'ignore')
    root = ET.fromstring(xmltext)
    text = []
    for elem in root.iterfind('title'):
        text.append(elem.text)
    for elem in root.iterfind('.//text/*'):
        text.append(elem.text)
    text = ' '.join(text)

    return text


def compute_tfidf(corpus):
    """
    Create and return a TfidfVectorizer object after training it on
    the list of articles pulled from the corpus dictionary. The
    corpus argument is a dictionary mapping file name to xml text.
    """
    tfidf = TfidfVectorizer(input='content',
                            analyzer='word',
                            preprocessor=gettext,
                            tokenizer=tokenizer,
                            stop_words='english',
                            decode_error='ignore')
    tfidf.fit(list(corpus.values()))

    return tfidf


def add_prod_description_column(train):
    """
    Add the product description from product df to train df.
    Concatenate Title and description to form total_description column.
    """
    train['total_description'] = train['product_title'] + \
        train['product_description']
    return train

In [87]:
def get_words(x):
    """
    Remove the tfidf scores and return only the top tfidf words
    """
    q = []
    for i in range(len(x)):
        if x[i][0] != []:
            q.append(x[i][0])
    return q


def add_tfidf_col(train):
    train['tfidf'] = train['tfidf'].apply(lambda x: get_words(x))
    return train


def num_stop_words(x):
    stops = list(stop_words.ENGLISH_STOP_WORDS)
    return len([w for w in x if w in stops])


def find_tfidf_words_in_search(train):
    train['search_term_split'] = train['search_term'].apply(
        lambda x: tokenizer(x))
    p = train['search_term_split']
    q = train['tfidf']
    l = []
    for i in range(len(p)):
        l.append(len(set(p[i]).intersection(set(q[i]))))
    train['tfidf_search_common'] = l

    return train


def num_attrib_per_product(attributes):
    """
    Find the number of attributes per product
    """
    attributes['value'] = attributes['value'].apply(
        lambda x: tokenizer(str(x)))
    attributes['value'] = attributes['value'].apply(lambda x: ','.join(x))
    attrib_per_product = attributes.groupby(
        'product_uid').agg(lambda x: x.tolist())
    attrib_per_product = attrib_per_product.reset_index()
    attrib_per_product['value'] = attrib_per_product['value'].apply(
        lambda x: ','.join(x).replace(',', ' '))
    attrib_per_product['num_attrib'] = attrib_per_product['name'].apply(
        lambda x: len(x))
    attrib_per_product['value'].fillna('', inplace=True)
    attrib_per_product.rename(columns={'value': 'attribs'})
    attrib_per_product['product_uid'] = attrib_per_product['product_uid'].apply(
        lambda x: int(x))

    return attrib_per_product

In [88]:
def find_n_tfidf_highest_scores(train_set, n):
    tfidf = TfidfVectorizer(input='content',
                            analyzer='word',
                            tokenizer=tokenizer,
                            stop_words='english',
                            decode_error='ignore')
    tfidf.fit(train_set['total_description'])

    p = []
    total_description = list(train_set['total_description'])
    for i in range(len(train_set)):
        response = tfidf.transform([total_description[i]])
        feature_names = tfidf.get_feature_names()
        col = response.nonzero()[1]
        t = []
        t = [(feature_names[col], response[0, col])
             for col in response.nonzero()[1] if response[0, col] >= 0.09]
        t.sort(key=lambda x: x[1], reverse=True)
        p.append(t[0:n])

    train_set['tfidf'] = p
    return train_set

In [89]:
def add_word_count_features(train_df):
    train_df['num_words_in_description'] = train_df['total_description'].apply(
        lambda x: len(tokenize(x)))
    train_df['num_stop_words'] = train_df['search_term'].apply(
        lambda x: num_stop_words(x.split(' ')))
    train_df['num_search_words'] = train_df['search_term'].apply(
        lambda x: len(x.split(' ')))
    return train_df

In [165]:
def add_num_attrib_per_prod_column(train_df, attributes_df):
    attrib_per_product = num_attrib_per_product(attributes_df)
    train_df = train_df.set_index('product_uid').join(
        attrib_per_product.set_index('product_uid'),
        lsuffix='', rsuffix='_r')
    train_df = train_df.reset_index()
    attrib_per_product = attrib_per_product.reset_index()

    train_df = train_df.drop('name_r', 1)
    train_df = train_df.drop('value_r', 1)
    train_df['num_attrib'] = train_df['num_attrib'].fillna(0)

    return train_df

In [258]:
def getAllNumericalCols(all_features):
    """
    param: all_features is a data frame containning all features.
    output: column names of all numerical features.
    """
    col_names = all_features.columns.tolist()
    all_num_ind = [15]+list(range(25, len(col_names)))
    all_num_col = [col_names[i] for i in all_num_ind]

    return all_num_col


def getSimilarityCols(all_num_features):
    """
    param: all_features is a data frame containning all numerical features.
    output: column names of all similarity features.
    """
    all_similarity_features = [all_num_features.columns.tolist(
    )[i] for i in [0, 14, 15, 16, 17, 18, 19, 20, 21, 22, 26]]
    return all_similarity_features


def getCountAndOtherCols(all_similarity_features, all_num_features):
    """
    return the column names of all count features and len_Entropy columns.
    """
    all_other_num_cols = set(all_num_features.columns.tolist()).difference(
        set(all_similarity_features.columns.tolist()))
    col_has_in = [i for i in all_other_num_cols if "in" in i]
    len_H_features = list(set(all_other_num_cols).difference(set(col_has_in)))

    return col_has_in, len_H_features

In [18]:
def feature_engineering(train_df, products_df, dictionary):
    """
    Adds the following features to the training set dataframe: 
    * clean_length: the count of words in the 'cleaned' search terms
    * title_length: the count of words in the 'cleaned' title
    * desc_length: the count of words in the 'cleaned' description
    * clean_terms_in_title: the number of time 
    any of the words in clean_terms appears in the title
    * clean_terms_in_desc: the number of time 
    any of the words in clean_terms appears in the description
    * neighbours_in_title: the count of the appearance of the 
    words closest to the search terms in the title
    * neighbours_in_desc: the count of the appearance of the 
    words closest to the search terms in the description

    :param train_df:        the training set Pandas dataframe
    :param products_df:     the product descriptions dataframe
    :param dictionary:      the glove dictionary
    :returns:               the modified dataframe with the additional features
    """
    # join the dataframes together
    train_df = train_df.set_index('product_uid').join(
        products_df.set_index('product_uid'))
    train_df = train_df.reset_index()

    # "clean" the search terms of numbers and stop words
    search_terms = train_df['search_term']
    cleaned_terms = [' '.join(tokenize(search_term))
                     for search_term in search_terms]
    train_df['cleaned_terms'] = cleaned_terms

    cleaned = list(train_df['cleaned_terms'])
    title = list(train_df['product_title'])
    desc = list(train_df['product_description'])

    # stem the search terms, title, and descriptions
    stemmed_terms = [' '.join(stemmed(tokenize(search_term)))
                     for search_term in search_terms]
    stemmed_title = [' '.join(stemmed(tokenize(t)))
                     for t in train_df['product_title']]
    stemmed_desc = [' '.join(stemmed(tokenize(d)))
                    for d in train_df['product_description']]

    train_df['stemmed_terms'] = stemmed_terms
    train_df['stemmed_title'] = stemmed_title
    train_df['stemmed_desc'] = stemmed_desc

    stemmed_terms = list(train_df['stemmed_terms'])
    stemmed_title = list(train_df['stemmed_title'])
    stemmed_desc = list(train_df['stemmed_desc'])

    # lemmatize the search terms, title, and descriptions
    lemmatized_terms = [' '.join(lemmatized(tokenize(search_term)))
                        for search_term in search_terms]
    lemmatized_title = [' '.join(lemmatized(tokenize(t)))
                        for t in train_df['product_title']]
    lemmatized_desc = [' '.join(lemmatized(tokenize(d)))
                       for d in train_df['product_description']]

    train_df['lemmatized_terms'] = lemmatized_terms
    train_df['lemmatized_title'] = lemmatized_title
    train_df['lemmatized_desc'] = lemmatized_desc

    lemmatized_terms = list(train_df['lemmatized_terms'])
    lemmatized_title = list(train_df['lemmatized_title'])
    lemmatized_desc = list(train_df['lemmatized_desc'])

    # set up the calculations for finding the nearest neighbors
    wordlist, matrix = split_dictionary()
    cleaned_set = unique_words(train_df)
    find_nearest_neighbors('glove_neighbour_no_w.txt',
                           cleaned_set, matrix, wordlist, dictionary)
    k_dict = build_dictionary('glove_neighbour_no_w.txt')
    terms_neighbour = get_all_terms_neighbors(k_dict, cleaned)
    train_df['terms_neighbour'] = terms_neighbour

    # create the features to be used in the model
    train_df['clean_length'] = get_length(cleaned)
    train_df['title_length'] = get_length(title)
    train_df['desc_length'] = get_length(desc)
    train_df['clean_terms_in_title'] = clean_term_in_doc(cleaned, title)
    train_df['clean_terms_in_desc'] = clean_term_in_doc(cleaned, desc)
    train_df['stemmed_terms_in_title'] = clean_term_in_doc(
        stemmed_terms, stemmed_title)
    train_df['stemmed_terms_in_desc'] = clean_term_in_doc(
        stemmed_terms, stemmed_desc)
    train_df['lemmatized_terms_in_title'] = clean_term_in_doc(
        lemmatized_terms, lemmatized_title)
    train_df['lemmatized_terms_in_desc'] = clean_term_in_doc(
        lemmatized_terms, lemmatized_desc)
    train_df['neighbours_in_title'] = clean_term_in_doc(terms_neighbour, title)
    train_df['neighbours_in_desc'] = clean_term_in_doc(terms_neighbour, desc)

    train_df['search_terms_entropy'] = calculate_entropy(letter_prob(cleaned))
    train_df['title_entropy'] = calculate_entropy(letter_prob(title))
    train_df['jaccard_index_title'] = calculate_jaccard_index(title, cleaned)
    train_df['jaccard_index_desc'] = calculate_jaccard_index(desc, cleaned)
    train_df['lcs_title'] = longest_common_subsequence(cleaned, title)
    train_df['lcs_desc'] = longest_common_subsequence(cleaned, desc)

    return train_df

In [101]:
products = pd.read_csv('product_descriptions.csv')
train = pd.read_csv('train.csv', encoding='ISO-8859-1')

In [9]:
# BEWARE: this takes ~2.5 min to run
attributes = pd.read_csv('attributes.csv', encoding='ISO-8859-1')
attrib_per_product = attrib_stack(attributes)

In [10]:
train, attrib_per_product = join_attrib(train, attrib_per_product)
train = search_term_in_attrib(train)

In [11]:
# BEWARE: this takes ~4 min to run
train = color_df(attributes, train)
train = search_title_lev_dist(train)

In [12]:
train = search_brand_lev_dist(train, attributes)

In [13]:
train_temp = train.drop(['id', 'name', 'value', 'search_term_split',
                         'color', 'product_title_clean', 'brand'], axis=1)

In [14]:
glove_file = 'glove.6B.300d.txt'
glove_dic = make_dictionary(glove_file)

In [19]:
# BEWARE: this takes ~26 min to run
modified_train = feature_engineering(train, products, glove_dic)

In [20]:
# BEWARE: this takes ~4 min to run
createJaroCol(modified_train, "search_term",
              "product_description", "jscore_query_desc")
createJaroCol(modified_train, "search_term",
              "product_title", "jscore_query_title")

In [21]:
# BEWARE: this takes ~8.5 min to run
modified_train = createSWscoreCol(
    modified_train, "search_term", "product_title", "search_title_SW")

In [23]:
# BEWARE: this takes ~1hr 46m to run
modified_train = createSWscoreCol(
    modified_train, "search_term", "product_description", "search_desc_SW")

In [25]:
# BEWARE: this takes ~14hr 35m to run
modified_train = createNCDCol(
    modified_train, "search_term", "product_title", "NCD_query_title")

In [112]:
modified_train = add_prod_description_column(modified_train)

In [114]:
modified_train = add_word_count_features(modified_train)

In [119]:
# BEWARE: this takes ~3m to run
modified_train = add_num_attrib_per_prod_column(modified_train, attributes)

In [120]:
# BEWARE, this takes ~5h 12m to run
modified_train = find_n_tfidf_highest_scores(modified_train, 5)

In [121]:
modified_train = find_tfidf_words_in_search(modified_train)

In [122]:
modified_train = add_tfidf_col(modified_train)
modified_train['num_attrib'].fillna(0, inplace=True)

In [156]:
print(modified_train.columns.values)
modified_train[:3]

['product_uid' 'id' 'product_title' 'search_term' 'relevance' 'name'
 'value' 'search_term_split' 'search_term_in_attrib' 'color'
 'color_in_search_term' 'product_title_clean' 'min_levenstein_dist_title'
 'brand' 'min_levenstein_dist_brand' 'product_description' 'cleaned_terms'
 'stemmed_terms' 'stemmed_title' 'stemmed_desc' 'lemmatized_terms'
 'lemmatized_title' 'lemmatized_desc' 'terms_neighbour' 'clean_length'
 'title_length' 'desc_length' 'clean_terms_in_title' 'clean_terms_in_desc'
 'stemmed_terms_in_title' 'stemmed_terms_in_desc'
 'lemmatized_terms_in_title' 'lemmatized_terms_in_desc'
 'neighbours_in_title' 'neighbours_in_desc' 'search_terms_entropy'
 'title_entropy' 'jaccard_index_title' 'jaccard_index_desc' 'lcs_title'
 'lcs_desc' 'jscore_query_desc' 'jscore_query_title' 'search_title_SW'
 'search_desc_SW' 'NCD_query_title' 'total_description'
 'num_words_in_description' 'num_stop_words' 'num_search_words'
 'num_attrib' 'tfidf' 'tfidf_search_common']


Unnamed: 0,product_uid,id,product_title,search_term,relevance,name,value,search_term_split,search_term_in_attrib,color,...,search_title_SW,search_desc_SW,NCD_query_title,total_description,num_words_in_description,num_stop_words,num_search_words,num_attrib,tfidf,tfidf_search_common
0,100001,2,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0,"['Bullet01', 'Bullet02', 'Bullet03', 'Bullet04...","{'stronger', 'consist', 'extra', 'instal', 'jo...","[angl, bracket]",1,{''},...,1.0,4.0,0.107077,Simpson Strong-Tie 12-Gauge AngleNot only do a...,79,0,2,15.0,"[angl, simpson, strong, tie, project]",0
1,100001,3,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5,"['Bullet01', 'Bullet02', 'Bullet03', 'Bullet04...","{'stronger', 'consist', 'extra', 'instal', 'jo...",[bracket],0,{''},...,0.0,0.0,0.107077,Simpson Strong-Tie 12-Gauge AngleNot only do a...,79,0,2,15.0,"[angl, simpson, strong, tie, project]",0
2,100002,9,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0,"['Application Method', 'Assembled Depth (in.)'...","{'represent', 'durabl', 'behr', 'slip', 'follo...",[deck],1,"{'Tans', 'Browns', 'Tugboat'}",...,0.0,3.0,0.109091,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,109,1,2,35.0,"[concret, deckov, behr, textur, deck]",0


## Fit scikit-learn model

To quickly get to predicting the models, we saved the data into a csv.

In [184]:
modified_train = pd.read_csv('final_combined.csv')
modified_train['num_attrib'] = modified_train['num_attrib'].fillna(0)

In [215]:
X_train = modified_train[['clean_length', 'title_length',
                          'desc_length', 'clean_terms_in_title',
                          'clean_terms_in_desc', 'stemmed_terms_in_title',
                          'stemmed_terms_in_desc', 'lemmatized_terms_in_title',
                          'lemmatized_terms_in_desc', 'neighbours_in_title',
                          'neighbours_in_desc',
                          'min_levenstein_dist_title', 'min_levenstein_dist_brand',
                          'stemmed_terms_in_title', 'stemmed_terms_in_desc',
                          'lemmatized_terms_in_title', 'lemmatized_terms_in_desc',
                          'neighbours_in_title', 'neighbours_in_desc', 'search_terms_entropy',
                          'title_entropy', 'jaccard_index_title', 'jaccard_index_desc', 'lcs_title',
                          'lcs_desc', 'jscore_query_desc', 'jscore_query_title', 'search_title_SW',
                          'search_desc_SW', 'NCD_query_title', 'num_words_in_description', 'num_stop_words',
                          'num_search_words', 'tfidf_search_common', 'num_attrib']]
y_train = modified_train[['relevance']]

Since we can't see the relevancy scores of the test set, we decided to split the training set further into our own training and test set.

In [216]:
train_data, test_data, train_target, test_target = train_test_split(X_train,
                                                                    y_train)

As our baseline, we decided to use linear regression.

In [217]:
lin_reg_model = LinearRegression()
lin_reg_model.fit(train_data, train_target)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [62]:
# BEWARE: this takes ~41m to run
tree_models = [RandomForestRegressor(),
               AdaBoostRegressor()]

grid_params_tree = [{'n_estimators': range(1, 30, 5),
                     'max_features': ['auto', 'sqrt', 'log2', None]},
                    {'n_estimators': range(1, 30, 5),
                     'loss': ['linear', 'square', 'exponential'],
                     'learning_rate': np.linspace(start=0.5, stop=1.5, num=5)}]
best_models_tree = []
for model in zip(tree_models, grid_params_tree):
    gs = GridSearchCV(estimator=model[0],
                      param_grid=model[1],
                      scoring='neg_mean_squared_error')
    if type(y_train) != np.ndarray:
        y = y_train.values.ravel()
        train_target = np.array(y).astype(float)
    gs.fit(X_train, y_train.values.ravel())
    best_models_tree.append(
        (sqrt(-1 * gs.best_score_), gs.best_params_, model[0]))

print(best_models_tree)



[(0.49136759545324854, {'max_features': 'sqrt', 'n_estimators': 26}, RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)), (0.5086641222487406, {'learning_rate': 0.5, 'loss': 'linear', 'n_estimators': 6}, AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
         n_estimators=50, random_state=None))]


In [63]:
# BEWARE: this takes ~5m to run
alt_linear_models = [Lasso(),
                     Ridge(),
                     ElasticNet()]

alt_linear_grid_params = [{'alpha': np.linspace(start=0.25, stop=1.0, num=4),
                           'normalize': [False, True],
                           'selection': ['cyclic', 'random']},
                          {'alpha': np.linspace(start=0.5, stop=2.0, num=4),
                           'normalize': [False, True],
                           'solver': ['svd', 'lsqr', 'sag', 'saga']},
                          {'alpha': np.linspace(start=0.5, stop=2.0, num=4),
                           'l1_ratio': np.linspace(start=0.25, stop=1.0, num=4),
                           'normalize': [False, True],
                           'selection': ['cyclic', 'random']}]
best_models_alt_linear = []
for model in zip(alt_linear_models, alt_linear_grid_params):
    gs = GridSearchCV(estimator=model[0],
                      param_grid=model[1],
                      scoring='neg_mean_squared_error')
    if type(y_train) != np.ndarray:
        y = y_train.values.ravel()
        train_target = np.array(y).astype(float)
    gs.fit(X_train, y_train.values.ravel())
    best_models_alt_linear.append(
        (sqrt(-1 * gs.best_score_), gs.best_params_, model[0]))

print(best_models_alt_linear)



[(0.536508754756873, {'alpha': 0.25, 'normalize': False, 'selection': 'random'}, Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)), (0.50115032928149, {'alpha': 0.5, 'normalize': False, 'solver': 'svd'}, Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)), (0.5240996355016398, {'alpha': 0.5, 'l1_ratio': 0.25, 'normalize': False, 'selection': 'cyclic'}, ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False))]


In [218]:
predicted = lin_reg_model.predict(test_data)
print(predicted[:5])
print(test_target[:5])

[[2.36685919]
 [2.06539957]
 [2.35021351]
 [2.66144108]
 [2.09629645]]
       relevance
64115       2.33
55409       2.33
57088       2.00
38621       3.00
50452       2.33


In [219]:
# here we chose the best tree model
best_tree_model = sorted(best_models_tree, key=lambda model: model[0])[0]
best_tree_model = best_tree_model[2].__class__(**best_tree_model[1])
best_tree_model.fit(train_data, train_target)
tree_predicted = best_tree_model.predict(test_data)
print(tree_predicted[:5])
print(test_target[:5])

  after removing the cwd from sys.path.


[2.32       2.19076923 2.40961538 2.74346154 1.81384615]
       relevance
64115       2.33
55409       2.33
57088       2.00
38621       3.00
50452       2.33


In [220]:
best_model_alt_linear = sorted(
    best_models_alt_linear, key=lambda model: model[0])[0]
best_model_alt_linear = best_model_alt_linear[2].__class__(
    **best_model_alt_linear[1])
best_model_alt_linear.fit(train_data, train_target)
alt_linear_predicted = best_model_alt_linear.predict(test_data)
print(alt_linear_predicted[:5])
print(test_target[:5])

[[2.36788502]
 [2.06600985]
 [2.34803036]
 [2.66176554]
 [2.09612481]]
       relevance
64115       2.33
55409       2.33
57088       2.00
38621       3.00
50452       2.33


In [196]:
# BEWARE : this takes ~25m to run
rf_models = [RandomForestRegressor()]

grid_params_rf = [{'n_estimators': range(25, 151, 6),
                   'max_features': ['sqrt', 'log2']}]
best_models_rf = []
for model in zip(rf_models, grid_params_rf):
    gs = GridSearchCV(estimator=model[0],
                      param_grid=model[1],
                      scoring='neg_mean_squared_error')
    if type(y_train) != np.ndarray:
        y = y_train.values.ravel()
        train_target = np.array(y).astype(float)
    gs.fit(X_train, y_train.values.ravel())
    best_models_rf.append(
        (sqrt(-1 * gs.best_score_), gs.best_params_, model[0]))

print(best_models_rf)



[(0.47633563539292706, {'max_features': 'log2', 'n_estimators': 139}, RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]


In [221]:
best_rf_model = sorted(best_models_rf, key=lambda model: model[0])[0]
best_rf_model = best_rf_model[2].__class__(**best_rf_model[1])
best_rf_model.fit(train_data, train_target)
rf_predicted = best_rf_model.predict(test_data)
print(rf_predicted[:5])
print(test_target[:5])

  This is separate from the ipykernel package so we can avoid doing imports until


[2.22482014 2.22717026 2.45798561 2.83348921 1.98992806]
       relevance
64115       2.33
55409       2.33
57088       2.00
38621       3.00
50452       2.33


In [209]:
# BEWARE: this takes ~10m to run
transformed_rf_models = [Pipeline([('scale', StandardScaler()),
                                   ('pca', PCA(n_components=int(
                                       math.log2(len(X_train.columns))))),
                                   ('regr', RandomForestRegressor())])]

grid_params_transformed_rf = [{'n_estimators': range(137, 142),
                               'max_features': ['log2']}]
best_models_transformed_rf = []
for model in zip(rf_models, grid_params_transformed_rf):
    gs = GridSearchCV(estimator=model[0],
                      param_grid=model[1],
                      scoring='neg_mean_squared_error',
                      cv=5)
    if type(y_train) != np.ndarray:
        y = y_train.values.ravel()
        train_target = np.array(y).astype(float)
    gs.fit(X_train, y_train.values.ravel())
    best_models_transformed_rf.append(
        (sqrt(-1 * gs.best_score_), gs.best_params_, model[0]))

print(best_models_transformed_rf)

[(0.4695884434881894, {'max_features': 'log2', 'n_estimators': 140}, RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]


In [222]:
best_transformed_rf_model = sorted(
    best_models_transformed_rf, key=lambda model: model[0])[0]
best_transformed_rf_model = best_transformed_rf_model[2].__class__(
    **best_transformed_rf_model[1])
best_transformed_rf_model.fit(train_data, train_target)
transformed_rf_predicted = best_transformed_rf_model.predict(test_data)
print(transformed_rf_predicted[:5])
print(test_target[:5])

  This is separate from the ipykernel package so we can avoid doing imports until


[2.26142857 2.276      2.48378571 2.763625   2.03057143]
       relevance
64115       2.33
55409       2.33
57088       2.00
38621       3.00
50452       2.33


## Testing Smaller Feature Space

In [255]:
all_num_features = modified_train[getAllNumericalCols(modified_train)]
all_num_features.head(3)

Unnamed: 0,min_levenstein_dist_brand,clean_length,title_length,desc_length,clean_terms_in_title,clean_terms_in_desc,stemmed_terms_in_title,stemmed_terms_in_desc,lemmatized_terms_in_title,lemmatized_terms_in_desc,...,jscore_query_desc,jscore_query_title,search_title_SW,search_desc_SW,NCD_query_title,num_words_in_description,num_stop_words,num_search_words,tfidf_search_common,num_attrib
0,1000,2.0,4.0,129.0,1.0,0.0,1.0,1.0,1.0,1.0,...,2.833333,0.866667,1.0,4.0,0.107077,79,0,2,1,15.0
1,1000,1.0,4.0,129.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.107077,79,0,2,0,15.0
2,0,1.0,11.0,168.0,0.0,0.0,0.0,1.0,0.0,1.0,...,2.711111,0.0,0.0,3.0,0.109091,109,1,2,1,35.0


In [259]:
all_similarity_features = modified_train[getSimilarityCols(all_num_features)]
all_similarity_features.head(3)

Unnamed: 0,min_levenstein_dist_brand,jaccard_index_title,jaccard_index_desc,lcs_title,lcs_desc,jscore_query_desc,jscore_query_title,search_title_SW,search_desc_SW,NCD_query_title,tfidf_search_common
0,1000,0.166667,0.0,6,13,2.833333,0.866667,1.0,4.0,0.107077,1
1,1000,0.0,0.0,3,7,0.0,0.0,0.0,0.0,0.107077,0
2,0,0.0,0.012048,4,4,2.711111,0.0,0.0,3.0,0.109091,1


In [260]:
count_cols, len_h_cols = getCountAndOtherCols(
    all_similarity_features, all_num_features)
all_count_features = modified_train[count_cols]
len_entropy_features = modified_train[len_h_cols]

In [261]:
train_data_numerical_subset, test_data_numerical_subset, train_target_numerical_subset, test_target_numerical_subset = train_test_split(all_num_features,
                                                                                                                                        y_train)

In [262]:
best_transformed_rf_model.fit(
    train_data_numerical_subset, train_target_numerical_subset)
transformed_rf_predicted_numerical_subset = best_transformed_rf_model.predict(
    test_data_numerical_subset)
print(transformed_rf_predicted_numerical_subset[:5])
print(test_target[:5])

  """Entry point for launching an IPython kernel.


[2.24535714 2.49071429 2.64038095 2.60203571 2.58490476]
       relevance
64115       2.33
55409       2.33
57088       2.00
38621       3.00
50452       2.33


In [266]:
best_rf_model.fit(train_data_numerical_subset, train_target_numerical_subset)
rf_predicted_numerical_subset = best_rf_model.predict(
    test_data_numerical_subset)
print(rf_predicted_numerical_subset[:5])
print(test_target[:5])

  """Entry point for launching an IPython kernel.


[2.22330935 2.45896043 2.65258993 2.58966427 2.46791367]
       relevance
64115       2.33
55409       2.33
57088       2.00
38621       3.00
50452       2.33


In [263]:
train_data_similarity_subset, test_data_similarity_subset, train_target_similarity_subset, test_target_similarity_subset = train_test_split(all_similarity_features,
                                                                                                                                            y_train)

In [264]:
best_transformed_rf_model.fit(
    train_data_similarity_subset, train_target_similarity_subset)
transformed_rf_predicted_similarity_subset = best_transformed_rf_model.predict(
    test_data_similarity_subset)
print(transformed_rf_predicted_similarity_subset[:5])
print(test_target[:5])

  """Entry point for launching an IPython kernel.


[2.11857143 2.27058673 2.1015     2.44828571 2.39452381]
       relevance
64115       2.33
55409       2.33
57088       2.00
38621       3.00
50452       2.33


In [268]:
best_rf_model.fit(train_data_similarity_subset, train_target_similarity_subset)
rf_predicted_similarity_subset = best_rf_model.predict(
    test_data_similarity_subset)
print(rf_predicted_similarity_subset[:5])
print(test_target[:5])

  """Entry point for launching an IPython kernel.


[2.02188849 2.34651079 2.10170264 2.58411871 2.42079137]
       relevance
64115       2.33
55409       2.33
57088       2.00
38621       3.00
50452       2.33


In [273]:
train_data_count_subset, test_data_count_subset, train_target_count_subset, test_target_count_subset = train_test_split(all_count_features,
                                                                                                                        y_train)

In [274]:
best_transformed_rf_model.fit(
    train_data_count_subset, train_target_count_subset)
transformed_rf_predicted_count_subset = best_transformed_rf_model.predict(
    test_data_count_subset)
print(transformed_rf_predicted_count_subset[:5])
print(test_target[:5])

  """Entry point for launching an IPython kernel.


[2.62299464 2.61285714 2.37335278 2.22497168 2.24638285]
       relevance
64115       2.33
55409       2.33
57088       2.00
38621       3.00
50452       2.33


In [275]:
best_rf_model.fit(train_data_count_subset, train_target_count_subset)
rf_predicted_count_subset = best_rf_model.predict(test_data_count_subset)
print(rf_predicted_count_subset[:5])
print(test_target[:5])

  """Entry point for launching an IPython kernel.


[2.62617557 2.52879496 2.36541988 2.20950156 2.27060029]
       relevance
64115       2.33
55409       2.33
57088       2.00
38621       3.00
50452       2.33


In [278]:
train_data_len_entropy_subset, test_data_len_entropy_subset, train_target_len_entropy_subset, test_target_len_entropy_subset = train_test_split(len_entropy_features,
                                                                                                                                                y_train)

In [279]:
best_transformed_rf_model.fit(
    train_data_len_entropy_subset, train_target_len_entropy_subset)
transformed_rf_predicted_len_entropy_subset = best_transformed_rf_model.predict(
    test_data_len_entropy_subset)
print(transformed_rf_predicted_len_entropy_subset[:5])
print(test_target[:5])

  """Entry point for launching an IPython kernel.


[2.54094286 2.59914286 2.36814286 2.59955051 2.06421429]
       relevance
64115       2.33
55409       2.33
57088       2.00
38621       3.00
50452       2.33


In [280]:
best_rf_model.fit(train_data_len_entropy_subset,
                  train_target_len_entropy_subset)
rf_predicted_len_entropy_subset = best_rf_model.predict(
    test_data_len_entropy_subset)
print(rf_predicted_len_entropy_subset[:5])
print(test_target[:5])

  """Entry point for launching an IPython kernel.


[2.51521223 2.61115108 2.4346283  2.65931655 1.94434053]
       relevance
64115       2.33
55409       2.33
57088       2.00
38621       3.00
50452       2.33


## Evaluation Metric

The benchmark was ~rank 1681 on the Kaggle leaderboard for this competition with an RMSE of .51049

1st place had an RMSE of .43192

https://www.kaggle.com/c/home-depot-product-search-relevance/leaderboard

In [223]:
rmse_lin_reg = sqrt(mean_squared_error(predicted, test_target))

print(f"{rmse_lin_reg:.4f}")

0.4943


In [224]:
rmse_tree = sqrt(mean_squared_error(tree_predicted, test_target))

print(f"{rmse_tree:.4f}")

0.4758


In [225]:
rmse_alt_linear = sqrt(mean_squared_error(alt_linear_predicted, test_target))

print(f"{rmse_alt_linear:.4f}")

0.4943


In [226]:
rmse_rf = sqrt(mean_squared_error(rf_predicted, test_target))

print(f"{rmse_rf:.4f}")

0.4688


In [227]:
rmse_transformed_rf = sqrt(mean_squared_error(
    transformed_rf_predicted, test_target))

print(f"{rmse_transformed_rf:.4f}")

0.4683


### Reduced Subset RMSE - Random Forest after PCA

In [240]:
rmse_transformed_rf_numerical_subset = sqrt(mean_squared_error(
    transformed_rf_predicted_numerical_subset, test_target))

print(f"{rmse_transformed_rf_numerical_subset:.4f}")

0.5992


In [265]:
rmse_transformed_rf_predicted_similarity_subset = sqrt(
    mean_squared_error(transformed_rf_predicted_similarity_subset, test_target))

print(f"{transformed_rf_predicted_similarity_subset:.4f}")

0.5953


In [276]:
rmse_transformed_rf_count_subset = sqrt(mean_squared_error(
    transformed_rf_predicted_count_subset, test_target))

print(f"{rmse_transformed_rf_count_subset:.4f}")

0.6039


In [281]:
rmse_transformed_rf_len_entropy_subset = sqrt(mean_squared_error(
    transformed_rf_predicted_len_entropy_subset, test_target))

print(f"{rmse_transformed_rf_len_entropy_subset:.4f}")

0.5788


### Reduced Subset RMSE - Regular Random Forest

In [267]:
rmse_rf_numerical_subset = sqrt(mean_squared_error(
    rf_predicted_numerical_subset, test_target))

print(f"{rmse_rf_numerical_subset:.4f}")

0.5988


In [269]:
rmse_rf_similarity_subset = sqrt(mean_squared_error(
    rf_predicted_similarity_subset, test_target))

print(f"{rmse_rf_similarity_subset:.4f}")

0.5950


In [277]:
rmse_rf_count_subset = sqrt(mean_squared_error(
    rf_predicted_count_subset, test_target))

print(f"{rmse_rf_count_subset:.4f}")

0.6036


In [282]:
rmse_rf_len_entropy_subset = sqrt(mean_squared_error(
    rf_predicted_len_entropy_subset, test_target))

print(f"{rmse_rf_len_entropy_subset:.4f}")

0.5786
