# Final Project Check-in 2018-11-16

## Group Name: Lambda

### Student Names
1. Jian Wang
2. Chong Geng
3. Alan Perry
4. Divya Bhargavi
5. Robert Sandor

## Load Data

In [1]:
from collections import defaultdict
from math import sqrt
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import nltk
from nltk.stem.porter import PorterStemmer
import operator
import re
from scipy import spatial
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import string
import time

In [2]:
def make_dictionary(file):
    '''
    Initiate the glove model as a dictionary
    input: A String which is a file in the project directory
    returns: A dictionary with item = word : 300 d list
    
    :param file:            the filepath string of the dictionary
    :returns:               a dictionary with words as keys 
                            and 300d vectors as values
    '''
    vecs = defaultdict(lambda: np.zeros(shape=(300,1)))
    with open(file) as f:
        lines = f.readlines()
        for word_and_vec in lines:
            elems = word_and_vec.strip().split(' ')
            word = elems[0]
            vec = np.array(elems[1:], dtype=float)
            vecs[word] = vec
    return vecs

In [3]:
def split_dictionary():
    """
    firstly, I split the dictionary into a wordlist and a matrix.
    returns a list of words and 
    a 2d matrix of the normalized word vectors
    
    :returns:               the words and matrix associated with
                            the glove dictionary
    """
    wordlist = []
    matrix = []
    with open(glove_file) as f:
        lines = f.readlines()
        for word_and_vec in lines:
            wordvec = np.array([float(x) for x in word_and_vec.split()[1:]])    
            matrix.append(wordvec / np.linalg.norm(wordvec))
            wordlist.append(word_and_vec.split()[0])
        matrix = np.array(matrix)
    return wordlist, matrix

def unique_words(train_df):
    """
    I then obtain the unique words that appear in the search_term.
    
    :param train_df:        the training set Pandas dataframe
    :returns:               a list of unique words from search terms
                            that have been stripped of numbers, symbols, etc.
    """
    cleaned = list(train_df['cleaned_terms'])
    all_words = []
    for t in cleaned:
        all_words += t.split(' ')

    return list(set(all_words))[1:]

def find_nearest_neighbors(filename, cleaned_set, matrix, wordlist, dictionary):
    """
    here I count the cos_distance of each word that is in the cleaned_set.
    the output file looks like (each line): w0, w1, w2, w3, w4,
    i didn't print the distance, just the neighbour words
    this will take couple of minutes.
    
    :param filename:        a string representing the filename to write to
    :param clenaed_set:     a list of search terms that have 
                            been stripped of numbers, symbols, etc.
    :param matrix:          a 2d Numpy array of the word vectors in wordlist
    :param wordlist:        a list of words from the glove dictionary
    :param dictionary:      a dictionary with words as keys 
                            and 300d vectors as values
    """ 
    output_string = ''
    
    for word in cleaned_set:
        dots = matrix.dot(dictionary[word])
        close_index_vec = np.argsort(dots)
        for i in range(5):
            output_string += wordlist[int(close_index_vec[-1-i])] + ','
        output_string += '\n'
        
    f = open(filename, "w")
    f.write(output_string)
    f.close()

def get_all_terms_neighbors(dictionary, cleaned):
    """
    terms_neighbour is the list which stores the top 4 neighbours of each searching_terms. 
    for example, if the searching term is: cleaned[0]='w1_w2', 
    then the terms_neighbour[0]='n11_n12_n13_n14_n21_n22_n23_n24'.
    
    :param dictionary:      a dictionary
    :param cleaned:         a list of search terms that have
                            been stripped of numbers, symbols, etc.
    :returns:               a list of concatenated words that are neighbors
                            of the 'cleaned' terms
    """
    terms_neighbour = []
    for i in range(len(cleaned)):
        neighbours = ''
        if cleaned[i] != '':
            words = cleaned[i].split(' ')
            for w in words:
                neighbours = neighbours + dictionary[w] + ' '
        terms_neighbour.append(neighbours)
    return terms_neighbour

def build_dictionary(file):
    """
    based on the above output file, I then built a dictionary;
    this dictionary stores each word (as key) with its top 4 neighbour words (as value) 
    
    :param file:            the file containing the list of strings of neighbors
    :returns:               a dictionary with words as keys 
                            and 4 neighbors of that word as values
    """
    k_dic = defaultdict(lambda: '')
    with open(file) as f:
        lines = f.readlines()
        for line in lines:
            words = line.strip().split(',')
            k_dic[words[0]] = words[1] + ' ' + words[2] + ' ' + words[3] + ' ' + words[4]
    return k_dic

def clean_term_in_doc(terms, title):
    """
    This cleans the given terms in the specified document
    
    :param terms:           a list of unique search terms
    :param title:           a list of titles of products
    :return:                a list of the counts of the 
                            cleaned terms within a product's title
    """
    count = np.zeros(len(terms))
    for i in range(len(terms)):
        if not pd.isnull(terms[i]): 
            title[i] = title[i].lower()
            for term in terms[i].split(' '):
                if term in title[i].split(' '):
                    count[i] += 1
    return count

def get_length(column):
    """
    This calculates and returns the number of words
    for each row in a specified column
    
    :param column:          the feature/attribute which
                            will have its words counted
    :returns:               a column with the count of 
                            words in each string
    """
    length = np.zeros(len(column))
    for index in range(len(column)):
        if not pd.isnull(column[index]):
            length[index] = len(column[index].split(' '))
    return length

def tokenize(text):
    """
    Tokenize text and return a non-unique list of tokenized words
    found in the text. Normalize to lowercase, strip punctuation,
    remove stop words, drop words of length < 3, strip digits.
    
    :param text:            a string
    :returns:               the same string stripped of numbers,
                            tabs, newline characters, and punctuation
    """
    stops = list(stop_words.ENGLISH_STOP_WORDS)
    text = text.lower()
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    nopunct = regex.sub(" ", text)  # delete stuff but leave at least a space to avoid clumping together
    words = nopunct.split(" ")
    words = [w for w in words if (len(w) > 2 and (w not in stops))]  # ignore a, an, to, at, be, ...
    return words

def stemmed(words):
    """
    Stem a tokenized text and return a non-unique list of stemmed words
    found in the text. This is based on the output of function
    tokenize(text).
    
    :param text:            a list of tokenized words
    :returns:               a list of stemmed words
    """
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(w) for w in words]
    return stemmed_words


def lemmatized(words):
    """
    lemmatize a tokenized text and return a non-unique list of stemmed words
    found in the text. This is based on the output of function
    tokenize(text).
    
    :param text:            a list of tokenized words
    :returns:               a list of lemmatized words
    """
    lemmatized_words = [nltk.stem.WordNetLemmatizer().lemmatize(w) for w in words]
    return lemmatized_words


In [10]:
def feature_engineering(train_df, products_df, dictionary):
    """
    Adds the following features to the training set dataframe: 
    * clean_length: the count of words in the 'cleaned' search terms
    * title_length: the count of words in the 'cleaned' title
    * desc_length: the count of words in the 'cleaned' description
    * clean_terms_in_title: the number of time 
    any of the words in clean_terms appears in the title
    * clean_terms_in_desc: the number of time 
    any of the words in clean_terms appears in the description
    * neighbours_in_title: the count of the appearance of the 
    words closest to the search terms in the title
    * neighbours_in_desc: the count of the appearance of the 
    words closest to the search terms in the description
    
    :param train_df:        the training set Pandas dataframe
    :param products_df:     the product descriptions dataframe
    :param dictionary:      the glove dictionary
    :returns:               the modified dataframe with the additional features
    """
    # join the dataframes together
    train_df = train_df.set_index('product_uid').join(products_df.set_index('product_uid'))
    train_df = train_df.reset_index()
    
    # "clean" the search terms of numbers and stop words
    search_terms = train_df['search_term']
    cleaned_terms = [' '.join(tokenize(search_term)) for search_term in search_terms]
    train_df['cleaned_terms'] = cleaned_terms
    
    cleaned = list(train_df['cleaned_terms'])
    title = list(train_df['product_title'])
    desc = list(train_df['product_description'])
    
    # stem the search terms, title, and descriptions
    stemmed_terms = [' '.join(stemmed(tokenize(search_term))) for search_term in search_terms]
    stemmed_title = [' '.join(stemmed(tokenize(t))) for t in train_df['product_title']]
    stemmed_desc = [' '.join(stemmed(tokenize(d))) for d in train_df['product_description']]
    
    train_df['stemmed_terms'] = stemmed_terms
    train_df['stemmed_title'] = stemmed_title
    train_df['stemmed_desc'] = stemmed_desc
    
    stemmed_terms = list(train_df['stemmed_terms'])
    stemmed_title = list(train_df['stemmed_title'])
    stemmed_desc = list(train_df['stemmed_desc'])
    
    # lemmatize the search terms, title, and descriptions
    lemmatized_terms = [' '.join(lemmatized(tokenize(search_term))) for search_term in search_terms]
    lemmatized_title = [' '.join(lemmatized(tokenize(t))) for t in train_df['product_title']]
    lemmatized_desc = [' '.join(lemmatized(tokenize(d))) for d in train_df['product_description']]
    
    train_df['lemmatized_terms'] = lemmatized_terms
    train_df['lemmatized_title'] = lemmatized_title
    train_df['lemmatized_desc'] = lemmatized_desc
    
    lemmatized_terms = list(train_df['lemmatized_terms'])
    lemmatized_title = list(train_df['lemmatized_title'])
    lemmatized_desc = list(train_df['lemmatized_desc'])
    
    # set up the calculations for finding the nearest neighbors
    wordlist, matrix = split_dictionary()
    cleaned_set = unique_words(train_df)
    find_nearest_neighbors('glove_neighbour_no_w.txt', cleaned_set, matrix, wordlist, dictionary)
    k_dict = build_dictionary('glove_neighbour_no_w.txt')
    terms_neighbour = get_all_terms_neighbors(k_dict, cleaned)
    train_df['terms_neighbour'] = terms_neighbour
    
    # create the features to be used in the model
    train_df['clean_length'] = get_length(cleaned)
    train_df['title_length'] = get_length(title)
    train_df['desc_length'] = get_length(desc)
    train_df['clean_terms_in_title'] = clean_term_in_doc(cleaned, title)
    train_df['clean_terms_in_desc'] = clean_term_in_doc(cleaned, desc)
    train_df['stemmed_terms_in_title'] = clean_term_in_doc(stemmed_terms, stemmed_title)
    train_df['stemmed_terms_in_desc'] = clean_term_in_doc(stemmed_terms, stemmed_desc)
    train_df['lemmatized_terms_in_title'] = clean_term_in_doc(lemmatized_terms, lemmatized_title)
    train_df['lemmatized_terms_in_desc'] = clean_term_in_doc(lemmatized_terms, lemmatized_desc)
    train_df['neighbours_in_title'] = clean_term_in_doc(terms_neighbour, title)
    train_df['neighbours_in_desc'] = clean_term_in_doc(terms_neighbour, desc)
    
    return train_df

In [11]:
products = pd.read_csv('product_descriptions.csv')
train = pd.read_csv('train.csv', encoding='ISO-8859-1')

In [7]:
glove_file = 'glove.6B.300d.txt'
glove_dic = make_dictionary(glove_file)

In [12]:
modified_train = feature_engineering(train, products, glove_dic)

## Fit scikit-learn model

In [13]:
# choice 1
X_train = modified_train[['clean_length', 'title_length', 
                          'desc_length', 'clean_terms_in_title', 
                          'clean_terms_in_desc', 'stemmed_terms_in_title',
                          'stemmed_terms_in_desc','lemmatized_terms_in_title',
                          'lemmatized_terms_in_desc','neighbours_in_title',
                         'neighbours_in_desc']]
y_train = modified_train[['relevance']]

In [18]:
# choice 2
X_train = modified_train[['clean_length', 'title_length', 
                          'desc_length', 'clean_terms_in_title', 
                          'clean_terms_in_desc','neighbours_in_title',
                         'neighbours_in_desc']]
y_train = modified_train[['relevance']]

In [23]:
# choice 3
X_train = modified_train[['clean_length', 'title_length', 
                          'desc_length', 'stemmed_terms_in_title',
                          'stemmed_terms_in_desc','neighbours_in_title',
                         'neighbours_in_desc']]
y_train = modified_train[['relevance']]

In [28]:
# choice 4
X_train = modified_train[['clean_length', 'title_length', 
                          'desc_length', 'lemmatized_terms_in_title',
                          'lemmatized_terms_in_desc','neighbours_in_title',
                         'neighbours_in_desc']]
y_train = modified_train[['relevance']]

In [29]:
# since we can't see the relevancy scores of the test set,
# I decided to split the training set 
train_data, test_data, train_target, test_target = train_test_split(X_train,
                                                                        y_train, random_state=42)

In [30]:
lin_reg_model = LinearRegression()
lin_reg_model.fit(train_data, train_target)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [31]:
predicted = lin_reg_model.predict(test_data)
print(predicted[:5])
print(test_target[:5])

[[2.03628912]
 [2.24501342]
 [2.32567038]
 [2.64512307]
 [2.6883504 ]]
       relevance
13534       3.00
29748       2.67
20225       2.67
5169        2.67
49860       2.00


## Evaluation Metric

In [32]:
# since an RMSE function couldn't be found quickly in the sklearn library,
# we just used the MSE function and took the square root of that
rmse_lin_reg = sqrt(mean_squared_error(predicted, test_target))

# this value is equivalent to rank 1680 on the Kaggle leaderboard for this competition
# the benchmark was ~ rank 1681
# https://www.kaggle.com/c/home-depot-product-search-relevance/leaderboard
print(f"{rmse_lin_reg:.4f}")

0.5014
