#### Introduction

HomeDepot Kaggle Competition: the goal is to predict how relevant a returned product is to the search term the user entered. Training data is provided in the form of search terms, products and a manually curated relevance score. These scores are the labels that the model needs to predict.

The dataset is preprocessed using the NLTK library and basic NLP techniques such as stemming. Features are engineered using data transformations such as tf-idf followed by truncated SVD, which reduces the dimensionality of the feature set. A Random Forest Regressor is used as the final predictor because it is non-parametric and comparatively robust to outliers.

In [1]:
import itertools
import pdb
import re
from collections import Counter
from collections import defaultdict

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd

%matplotlib inline

### Data Preprocessing

In [2]:
# Every input file is read with the same options: header on the first row and
# ISO-8859-1 (Latin-1) text encoding.
_read_opts = {'header': 0, 'encoding': 'iso-8859-1'}

df_train = pd.read_csv('train.csv', **_read_opts)
df_test = pd.read_csv('test.csv', **_read_opts)
df_att = pd.read_csv('attributes.csv', **_read_opts)
df_des = pd.read_csv('product_descriptions.csv', **_read_opts)

#### Distribution of the number of words per search term:

In [3]:
def search_term_analysis(df, comment):
    """Report how many words the queries in ``df.search_term`` contain.

    Prints ``comment`` as a heading, then a ``Counter`` mapping "number of
    words" to "number of queries" and the average number of words per query.

    Args:
        df: frame (or any object) exposing an iterable ``search_term`` of strings.
        comment: label printed above the statistics so that the train and test
            reports can be told apart.

    Returns:
        A list with one entry per query, each the list of whitespace-separated
        words of that query.
    """
    search_terms = [search_term.split() for search_term in df.search_term]
    num_words_used = [len(words) for words in search_terms]

    # Guard the division so an empty frame reports 0.0 instead of raising.
    if search_terms:
        avg_terms_used = sum(num_words_used) / float(len(search_terms))
    else:
        avg_terms_used = 0.0

    print(comment)
    print('num_words_used: {}'.format(Counter(num_words_used)))
    print('avg_terms_used: {}'.format(avg_terms_used))

    return search_terms

In [4]:
# Only the last expression of a cell is rendered, so the two previews below are
# evaluated but not displayed; the word-count report is printed by the helper.
df_train.head()
df_train.describe()
search_terms_train = search_term_analysis(df=df_train, comment='Training Data...')

num_words_used: Counter({3: 26575, 2: 18386, 4: 14847, 5: 6601, 1: 4503, 6: 2076, 7: 741, 8: 204, 9: 80, 11: 31, 10: 10, 12: 9, 14: 4})
avg_terms_used: 3.15920720429


In [5]:
# Word-count report for the test queries (the label previously said
# 'Training Data...', a copy-paste slip from the training cell).
df_test.head()
search_terms_test = search_term_analysis(df_test, 'Testing Data...')

num_words_used: Counter({3: 56420, 2: 51329, 4: 28100, 1: 12772, 5: 12004, 6: 4004, 7: 1241, 8: 469, 9: 145, 10: 104, 12: 54, 11: 46, 14: 4, 13: 1})
avg_terms_used: 2.98237478478


#### Num of unique products:

In [6]:
# Row count versus number of distinct products in each split. Single-argument
# print(...) behaves the same on Python 2 and Python 3.
print('training data - num of records: {0}, num unique products: {1}'.format(
    len(df_train), len(set(df_train.product_uid))))
print('testing data - num of records: {0}, num unique products: {1}'.format(
    len(df_test), len(set(df_test.product_uid))))

training data - num of records: 74067, num unique products: 54667
testing data - num of records: 166693, num unique products: 97460


#### Group by num of search terms

In [3]:
# Bucket every tokenised query (train and test) by its number of words.
search_by_num_words = defaultdict(list)
for search_term in search_terms_train + search_terms_test:
    search_by_num_words[len(search_term)].append(search_term)

# Preview a handful of one-word queries instead of dumping the whole bucket
# into the cell output.
search_by_num_words[1][:10]

In [8]:
# Regression target: the curated relevance score of each training row.
y_train = df_train['relevance']

In [9]:
# Stack the train and test rows into a single frame so that every later
# preprocessing / feature step is applied to both; `dataset` records the origin.
df_train_copy = df_train.drop('relevance', axis=1).assign(dataset='train')
df_test_copy = df_test.copy().assign(dataset='test')

df = pd.concat([df_train_copy, df_test_copy], ignore_index=True)

#### Are all products in product description?

In [10]:
unique_uids = set(df_des.product_uid.values)
# Fraction of rows whose product has a description. Counting with a generator
# (rather than len(filter(...))) works on both Python 2 and Python 3.
sum(1 for uid in df.product_uid.values if uid in unique_uids) / float(len(df.product_uid.values))

1.0

### Merge data with descriptions

In [11]:
# Left join keeps every search row and attaches the product description.
# NOTE: this reassigns `df`, so re-running the cell merges the description in again.
df = df.merge(df_des, how='left', on='product_uid')
df.head()

Unnamed: 0,id,product_uid,product_title,search_term,dataset,product_description
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,train,"Not only do angles make joints stronger, they ..."
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,train,"Not only do angles make joints stronger, they ..."
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,train,BEHR Premium Textured DECKOVER is an innovativ...
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,train,Update your bathroom with the Delta Vero Singl...
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,train,Update your bathroom with the Delta Vero Singl...


### Merge data with attributes

In [13]:
def get_attributes_pivoted_df(attributes=None):
    """Pivot the long attribute table into one row per product.

    Every (product_uid, name) pair that occurs more than once with a non-null
    value is discarded entirely, because ``pivot`` needs a unique value per
    cell. Rows with any missing field are discarded as well.

    Args:
        attributes: long-format frame with ``product_uid``, ``name`` and
            ``value`` columns. Defaults to the notebook-level ``df_att``.

    Returns:
        A frame with a ``product_uid`` column followed by one column per
        attribute name; cells without a value are NaN.
    """
    if attributes is None:
        attributes = df_att

    key_columns = ['product_uid', 'name']

    # Only rows with a usable key and value take part in duplicate detection.
    keyed = attributes.dropna(subset=key_columns + ['value'])

    # keep=False flags *every* row of a repeated pair, not just the later ones.
    # This vectorised check replaces a row-by-row Python lookup.
    repeated = keyed.duplicated(subset=key_columns, keep=False)

    complete = keyed[~repeated].dropna(how='any')
    pivoted = complete.pivot(index='product_uid', columns='name', values='value')

    return pivoted.reset_index()

In [14]:
# Build the wide product-by-attribute table once; later cells read attribute
# columns from it.
df_pivoted_attributes = get_attributes_pivoted_df()

## Feature Engineering

In [15]:
# Work on a copy indexed by the row `id` so the merged frame `df` stays untouched.
df1 = df.copy().set_index('id')

In [18]:
from nltk.stem.porter import *
stemmer = PorterStemmer()
import unicodedata
strNum = {'zero':0,'one':1,'two':2,'three':3,'four':4,'five':5,'six':6,'seven':7,'eight':8,'nine':9}

def str_stem(s):
    """Normalise a piece of query/product text and Porter-stem every token.

    Steps, in order: fold to ASCII, lower-case, strip or space out punctuation,
    separate digits from letters, rewrite dimension separators (" x ", "*",
    " by ") as "xbi", canonicalise measurement units ("5 inches" -> "5in."),
    turn number words into digits, stem each token with the notebook-level
    ``stemmer`` and finally patch a few common misspellings.

    Args:
        s: the text to normalise; non-text values are converted to text first.

    Returns:
        The normalised, stemmed text.
    """
    text_type = type(u"")  # `unicode` on Python 2, `str` on Python 3
    text = s if isinstance(s, text_type) else text_type(s)

    # Spell out the degree sign *before* the ASCII fold: afterwards the
    # character is gone and could never be replaced.
    text = text.replace(u"\xb0", u" degrees ")
    s = unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode('ascii')

    s = re.sub(r"(\w)\.([A-Z])", r"\1 \2", s) #Split words with a.A
    s = s.lower()
    s = s.replace("  ", " ")
    s = s.replace(",", "") #could be number / segment later
    s = s.replace("$", " ")
    s = s.replace("?", " ")
    s = s.replace("-", " ")
    s = s.replace("//", "/")
    s = s.replace("..", ".")
    s = s.replace(" / ", " ")
    s = s.replace(" \\ ", " ")
    s = s.replace(".", " . ")
    # Strip a *leading* "." or "/" only. The anchor must sit outside the group:
    # "(^\.|/)" removed every slash in the text, turning "1/2" into "12".
    s = re.sub(r"^(\.|/)", r"", s)
    s = re.sub(r"(\.|/)$", r"", s)
    # Separate digits from letters so that units and sizes become own tokens.
    s = re.sub(r"([0-9])([a-z])", r"\1 \2", s)
    s = re.sub(r"([a-z])([0-9])", r"\1 \2", s)
    s = s.replace(" x ", " xbi ")
    s = re.sub(r"([a-z])( *)\.( *)([a-z])", r"\1 \4", s)
    s = re.sub(r"([a-z])( *)/( *)([a-z])", r"\1 \4", s)
    s = s.replace("*", " xbi ")
    s = s.replace(" by ", " xbi ")
    s = re.sub(r"([0-9])( *)\.( *)([0-9])", r"\1.\4", s)
    # Canonical unit spellings, glued to the preceding number.
    s = re.sub(r"([0-9]+)( *)(inches|inch|in|')\.?", r"\1in. ", s)
    s = re.sub(r"([0-9]+)( *)(foot|feet|ft|'')\.?", r"\1ft. ", s)
    s = re.sub(r"([0-9]+)( *)(pounds|pound|lbs|lb)\.?", r"\1lb. ", s)
    s = re.sub(r"([0-9]+)( *)(square|sq) ?\.?(feet|foot|ft)\.?", r"\1sq.ft. ", s)
    s = re.sub(r"([0-9]+)( *)(cubic|cu) ?\.?(feet|foot|ft)\.?", r"\1cu.ft. ", s)
    s = re.sub(r"([0-9]+)( *)(gallons|gallon|gal)\.?", r"\1gal. ", s)
    s = re.sub(r"([0-9]+)( *)(ounces|ounce|oz)\.?", r"\1oz. ", s)
    s = re.sub(r"([0-9]+)( *)(centimeters|cm)\.?", r"\1cm. ", s)
    # Accept the correct spelling as well as the historical "milimeters".
    s = re.sub(r"([0-9]+)( *)(millimeters|milimeters|mm)\.?", r"\1mm. ", s)
    s = re.sub(r"([0-9]+)( *)(degrees|degree)\.?", r"\1deg. ", s)
    s = s.replace(" v ", " volts ")
    s = re.sub(r"([0-9]+)( *)(volts|volt)\.?", r"\1volt. ", s)
    s = re.sub(r"([0-9]+)( *)(watts|watt)\.?", r"\1watt. ", s)
    s = re.sub(r"([0-9]+)( *)(amperes|ampere|amps|amp)\.?", r"\1amp. ", s)
    s = s.replace("  ", " ")
    s = s.replace(" . ", " ")
    # Number words -> digits, then stem token by token.
    s = (" ").join([str(strNum[z]) if z in strNum else z for z in s.split(" ")])
    s = (" ").join([stemmer.stem(z) for z in s.split(" ")])
    s = s.lower()
    # Frequent misspellings / run-together brand names (applied to the stems).
    s = s.replace("toliet", "toilet")
    s = s.replace("airconditioner", "air conditioner")
    s = s.replace("vinal", "vinyl")
    s = s.replace("vynal", "vinyl")
    s = s.replace("skill", "skil")
    s = s.replace("snowbl", "snow bl")
    s = s.replace("plexigla", "plexi gla")
    s = s.replace("rustoleum", "rust oleum")
    s = s.replace("whirpool", "whirlpool")
    s = s.replace("whirlpoolga", "whirlpool ga")
    s = s.replace("whirlpoolstainless", "whirlpool stainless")
    return s

# Run the shared normaliser over the queries, the titles and the raw attribute
# values so that both sides of every later comparison use the same spelling.
# NOTE(review): df_pivoted_attributes was built in an earlier cell, so it does
# not pick up the stemmed attribute values unless it is rebuilt.
df1['search_term'] = df1['search_term'].map(str_stem)
df1['product_title'] = df1['product_title'].map(str_stem)
df_att['value'] = df_att['value'].map(str_stem)

### Latent Semantic Analysis - TF-IDF + TSVD 

In [19]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion

from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()

def lemma(word):
    """Return the WordNet lemma of ``word``, or ``word`` itself on failure.

    Lemmatisation is best-effort: any ordinary error (for example a value that
    cannot be ASCII-encoded) falls back to the input. ``except Exception`` is
    used instead of a bare ``except`` so that KeyboardInterrupt and SystemExit
    are no longer swallowed.
    """
    try:
        return lmtzr.lemmatize(word.encode('ascii', 'ignore'))
    except Exception:
        return word
    
def text_transformer(_df, _field, n_components=10):
    """Project a text column onto a few latent dimensions (TF-IDF + truncated SVD).

    Each entry is lemmatised word by word, vectorised with unigram TF-IDF
    (English stop words removed) and reduced with ``TruncatedSVD``.

    Args:
        _df: frame holding the text column.
        _field: name of the text column in ``_df``.
        n_components: number of SVD components to keep (default 10).

    Returns:
        The array produced by ``TruncatedSVD.transform``: one row per row of
        ``_df`` and ``n_components`` columns.
    """
    text_array = _df[_field]
    # The lemmatised text is what gets vectorised. It used to be computed and
    # then ignored, with TF-IDF fit on the raw column instead.
    lemmatized = text_array.map(lambda words: ' '.join([lemma(word) for word in words.split(' ')]))

    tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words='english')
    tsvd = TruncatedSVD(n_components=n_components, random_state=0)

    tfidf.fit(lemmatized)
    mx_tfidf = tfidf.transform(lemmatized)

    tsvd.fit(mx_tfidf)
    tsvd_tfidf = tsvd.transform(mx_tfidf)

    return tsvd_tfidf

In [20]:
# Latent TF-IDF/SVD representation of the queries and of the product titles.
search_terms_ns = text_transformer(_df=df1, _field='search_term')
product_titles_ns = text_transformer(_df=df1, _field='product_title')

In [21]:
# One named column per SVD component. `range` replaces the Python-2-only
# `xrange`; the generated names are unchanged.
search_term_columns = ['search_term_ns_' + str(x) for x in range(search_terms_ns.shape[1])]
product_title_columns = ['product_titles_ns_' + str(x) for x in range(product_titles_ns.shape[1])]

df_search_terms = pd.DataFrame(search_terms_ns, index=df1.index, columns=search_term_columns)
df_product_titles = pd.DataFrame(product_titles_ns, index=df1.index, columns=product_title_columns)

# NOTE: this reassigns `df1`; re-running the cell appends the columns again.
df1 = pd.concat([df1, df_search_terms, df_product_titles], axis=1, ignore_index=False)

### Average Score by Term

In [23]:
search_terms = []
search_terms_cum_score = defaultdict(int)

# Only training rows have a label. Select them explicitly instead of relying on
# zip() silently truncating the combined train+test frame to len(y_train).
train_searches = df1.search_term[df1.dataset == 'train']
assert len(train_searches) == len(y_train), 'training rows and labels are misaligned'

for search, score in zip(train_searches, y_train):
    for term in search.split(' '):
        term = term.strip()
        search_terms.append(term)
        search_terms_cum_score[term] += score

# Terms ordered from most to least frequent, split into parallel tuples.
term_counts = Counter(search_terms).most_common()
terms, counts = zip(*term_counts)
 

In [24]:
# Average relevance of the training queries that contain each term.
mean_score_by_term = {
    term: search_terms_cum_score[term] / float(count)
    for term, count in zip(terms, counts)
}

In [25]:
# Fallback for queries made up solely of terms never seen in training.
baseline_score = np.mean(y_train)

def term_avg_score(searches):
    """Average the per-term mean relevance over the known terms of each query.

    Terms missing from ``mean_score_by_term`` are skipped; a query without any
    known term receives ``baseline_score``.

    Returns:
        A list with one score per query, in input order.
    """
    averaged = []
    for search in searches:
        known_scores = [
            mean_score_by_term[term]
            for term in (piece.strip() for piece in search.split(' '))
            if term in mean_score_by_term
        ]
        if known_scores:
            averaged.append(sum(known_scores) / float(len(known_scores)))
        else:
            averaged.append(baseline_score)
    return averaged

df1['term_avg_score'] = term_avg_score(df1.search_term)

### Feature Engineering

In [27]:
from nltk.corpus import stopwords
import math
stopwords_eng = set(stopwords.words("english"))
nan_arrays = []

def ratio_words_matched(searches, match_phrases, ids, remove_stopwords=False, 
                        lemma=False, singularize=False, denominator='search'):
    """Share of words that a query and a phrase have in common, per row.

    Args:
        searches: iterable of query strings.
        match_phrases: iterable of phrases to compare against; a float NaN
            entry (missing value) scores 0 and is recorded in ``nan_arrays``.
        ids: iterable of row identifiers, used only in the error message.
        remove_stopwords: drop English stop words from both sides first.
        lemma: accepted for backward compatibility; currently unused.
        singularize: accepted for backward compatibility; currently unused.
        denominator: ``'search'`` divides the number of query words found in
            the phrase by the number of query words; any other value divides
            the number of phrase words found in the query by the number of
            phrase words.

    Returns:
        A list with one ratio per row; 0 when the relevant side has no words
        or the text could not be normalised.
    """

    def _normalise(text):
        # Drop non-ASCII characters and lower-case. Decoding keeps the result
        # a text string on Python 3 (str(bytes) would yield "b'...'").
        return text.encode('ascii', 'ignore').decode('ascii').lower()

    output_array = []
    for search_terms, match_phrase, _id in zip(searches, match_phrases, ids):

        if isinstance(match_phrase, float) and math.isnan(match_phrase):
            nan_arrays.append((search_terms, match_phrase))
            output_array.append(0)
            continue

        try:
            search_terms = _normalise(search_terms)
            match_phrase = _normalise(match_phrase)
        except (AttributeError, UnicodeError):
            # Report and skip the row instead of dropping into the debugger.
            print('error in encoding: {}, {}, {}'.format(search_terms, match_phrase, _id))
            output_array.append(0)
            continue

        search_words = search_terms.split()
        match_phrase_words = match_phrase.split()
        if remove_stopwords:
            search_words = [word for word in search_words if word not in stopwords_eng]
            match_phrase_words = [word for word in match_phrase_words if word not in stopwords_eng]

        if denominator == 'search':
            counted_words, lookup = search_words, set(match_phrase_words)
        else:
            counted_words, lookup = match_phrase_words, set(search_words)

        # Guard on the side that is actually the denominator; the old check
        # always looked at the query and could divide by zero for an empty phrase.
        if counted_words:
            num_matches = sum(1 for word in counted_words if word in lookup)
            output_array.append(num_matches / float(len(counted_words)))
        else:
            output_array.append(0)

    return output_array

def num_chars_in_search(searches, remove_stopwords=False, remove_numeric_units=False,
                        trivial_terms=None):
    """Length in characters of every query, optionally after dropping words.

    Args:
        searches: iterable of query strings.
        remove_stopwords: drop English stop words before measuring.
        remove_numeric_units: drop every word contained in ``trivial_terms``
            before measuring.
        trivial_terms: collection of words to drop when ``remove_numeric_units``
            is set. If omitted, a notebook-level ``trivial_terms`` is used when
            one exists.

    Returns:
        A list with the character count of each (possibly filtered) query.

    Raises:
        ValueError: if ``remove_numeric_units`` is set but no terms are available.
    """
    if remove_numeric_units and trivial_terms is None:
        # The function used to read an undefined global and fail with a
        # NameError; fall back to that global only if it has been defined.
        trivial_terms = globals().get('trivial_terms')
        if trivial_terms is None:
            raise ValueError('remove_numeric_units=True requires trivial_terms')

    output_array = []
    for search_terms in searches:
        if remove_stopwords:
            search_terms = ' '.join([word for word in search_terms.split() if word not in stopwords_eng])
        if remove_numeric_units:
            search_terms = ' '.join([word for word in search_terms.split() if word not in trivial_terms])
        output_array.append(len(search_terms))
    return output_array

def num_stopwords_in_search(searches):
    """Count, for every query, how many of its words are English stop words."""
    return [
        sum(1 for word in search_terms.split() if word in stopwords_eng)
        for search_terms in searches
    ]

def attribute_match(attribute):
    """Flag rows whose query shares at least one word with a product attribute.

    The attribute column is left-joined onto ``df1`` by ``product_uid``; rows
    without that attribute score 0.

    Returns:
        A list of 1/0 flags, one per row of the merged frame.
    """
    attribute_columns = df_pivoted_attributes[['product_uid', attribute]]
    _df_merged = df1.merge(attribute_columns, how='left', on='product_uid')

    ratios = ratio_words_matched(_df_merged.search_term, _df_merged[attribute], _df_merged.index)

    return [int(ratio > 0) for ratio in ratios]

def nth_word_matched(searches, match_phrases, n):
    """Tell whether the n-th word of each query occurs in the paired phrase.

    Args:
        searches: iterable of query strings.
        match_phrases: iterable of phrases, paired with ``searches``.
        n: 1-based position of the query word to test.

    Returns:
        A list with, per row, ``-1`` if the query has fewer than ``n`` words or
        its n-th word is an English stop word, ``1`` if the word occurs in the
        phrase and ``0`` otherwise.

    Raises:
        ValueError: if ``n`` is smaller than 1.
    """
    # Negative values used to slip through and index the query from the end.
    if n < 1:
        raise ValueError('input n must be greater than 0')

    stopwords_eng = set(stopwords.words("english"))

    output_array = []
    for search_phrase, match_phrase in zip(searches, match_phrases):

        search_terms = search_phrase.split()

        if n > len(search_terms) or search_terms[n-1] in stopwords_eng:
            output_array.append(-1)
        # NOTE(review): `in` on a string is a substring test, so "in" matches
        # "paint"; word_matched compares whole words instead - confirm intent.
        elif search_terms[n-1] in match_phrase:
            output_array.append(1)
        else:
            output_array.append(0)

    print('n: {}, counter: {}'.format(n, Counter(output_array)))
    return output_array

def word_matched(searches, match_phrases):
    """Flag rows where the last query word appears as a word of the phrase.

    Args:
        searches: iterable of query strings.
        match_phrases: iterable of phrases, paired with ``searches``.

    Returns:
        A list of 1/0 flags. A row scores 0 when the query is empty, the phrase
        is a float (as pandas uses for missing text), the last word is an
        English stop word, or the word is not among the phrase's words.
    """
    output_array = []
    stopwords_eng = set(stopwords.words("english"))

    for search, phrase in zip(searches, match_phrases):
        search_words = search.split()

        # Empty queries used to raise IndexError and missing phrases
        # AttributeError; both now simply count as "no match".
        if not search_words or isinstance(phrase, float):
            output_array.append(0)
            continue

        last_word = search_words[-1]
        if last_word not in stopwords_eng and last_word in phrase.split():
            output_array.append(1)
        else:
            output_array.append(0)

    return output_array

In [28]:
# Number of columns in the pivoted attribute table.
len(df_pivoted_attributes.columns)

5410

In [4]:
# For every pivoted column collect (name, non-null count, number of distinct
# values as reported by unique()).
column_counts = []
column_uniques = []
for count, column in enumerate(df_pivoted_attributes.columns, 1):
    # Progress marker every 1000 columns; print(...) works on Python 2 and 3.
    if count % 1000 == 0:
        print(count)

    column_counts.append((column,
                          df_pivoted_attributes[column].count(),
                          len(df_pivoted_attributes[column].unique())))

In [30]:
# Columns ranked 31st to 100th by non-null count; each tuple is
# (column name, non-null count, len(unique())).
sorted(column_counts, key=lambda x: x[1], reverse=True)[30:100]

[(u'Package Quantity', 6904, 282),
 (u'Bullet13', 6348, 2037),
 (u'Flooring Product Type', 6230, 93),
 (u'Color', 6214, 1314),
 (u'Tools Product Type', 6169, 14),
 (u'Included', 6079, 153),
 (u'Voltage (volts)', 6068, 105),
 (u'Assembly Required', 5718, 3),
 (u'Features', 5562, 530),
 (u'Wattage (watts)', 5107, 434),
 (u'Finish', 4996, 667),
 (u'Shape', 4876, 52),
 (u'Color/Finish Family', 4628, 74),
 (u'Electrical Product Type', 4409, 143),
 (u'Finish Family', 4209, 54),
 (u'Fixture Color/Finish', 4117, 764),
 (u'Product Thickness (in.)', 4080, 505),
 (u'Style', 4057, 32),
 (u'Interior/Exterior', 3950, 5),
 (u'Bullet14', 3853, 1447),
 (u'Number of Bulbs Required', 3802, 44),
 (u'Coverage Area (sq. ft.)', 3756, 276),
 (u'Finish Type', 3658, 18),
 (u'Power Tool Product Type', 3442, 20),
 (u'Paint Product Type', 3427, 114),
 (u'Outdoor Living Product Type', 3395, 127),
 (u'Collection Name', 3361, 765),
 (u'Hardware Finish Family', 3355, 23),
 (u'Bulb Type Included', 3331, 18),
 (u'Recond

In [31]:
# Rank the values of an attribute column by how often they occur
# (0 = most frequent value).
def rank_by_count(attribute):
    """Map every value of ``df_pivoted_attributes[attribute]`` to its frequency rank."""
    value_counts = Counter(df_pivoted_attributes[attribute])
    by_frequency = sorted(value_counts.items(), key=lambda item: item[1], reverse=True)

    return {value: rank for rank, (value, _) in enumerate(by_frequency)}

# Commonness of attribute values: frequency rank of each product's brand,
# colour family, material and colour/finish.
rank_lookup_brand = rank_by_count('MFG Brand Name')
rank_lookup_color = rank_by_count('Color Family')
rank_lookup_material = rank_by_count('Material')
rank_lookup_finish = rank_by_count('Color/Finish')

for _target, _source, _lookup in (
        ('brand_c_index', 'MFG Brand Name', rank_lookup_brand),
        ('color_c_index', 'Color Family', rank_lookup_color),
        ('material_c_index', 'Material', rank_lookup_material),
        ('finish_c_index', 'Color/Finish', rank_lookup_finish)):
    # Bind the lookup as a default argument so each lambda keeps its own table.
    df_pivoted_attributes[_target] = df_pivoted_attributes[_source].map(
        lambda x, _lookup=_lookup: _lookup[x])

In [5]:
# Tokenise the raw (un-stemmed) search terms of the merged frame.
tokenised_terms = df.search_term.map(nltk.word_tokenize)
numpy_strings = np.array(tokenised_terms)

In [None]:
# The 100 most frequent tokens across all tokenised search terms.
Counter([word for words in numpy_strings for word in words]).most_common(100)

In [None]:
# size ranking
# Most common values of the four physical-size attributes. Only the last
# expression of a cell is rendered, so only the weight counts are displayed.
Counter(df_pivoted_attributes['Product Width (in.)']).most_common(100)
Counter(df_pivoted_attributes['Product Height (in.)']).most_common(100)
Counter(df_pivoted_attributes['Product Depth (in.)']).most_common(100)
Counter(df_pivoted_attributes['Product Weight (lb.)']).most_common(100)

In [None]:
# one hot encoding
# Value distributions of two attributes considered for one-hot encoding.
print(Counter(df_pivoted_attributes['Indoor/Outdoor']).most_common(20))
print(Counter(df_pivoted_attributes['Commercial / Residential']).most_common(20))
# Entries pasted from the column summary (name, non-null count, distinct
# values); kept as notes rather than evaluated as throw-away tuples:
#   (u'Assembly Required', 5718, 3)
#   (u'Finish', 4996, 667)

In [None]:
df1['num_chars_in_search'] = num_chars_in_search(df1.search_term, remove_stopwords=True)

# Word overlap between query and title, relative to the query length and to
# the title length respectively.
df1['ratio_words_matched_search'] = ratio_words_matched(df1.search_term, df1.product_title, df1.index,
                                                        remove_stopwords=True, lemma=True)

df1['ratio_words_matched_title'] = ratio_words_matched(df1.search_term, df1.product_title, df1.index,
                                                       remove_stopwords=True, lemma=True, denominator='product_title')

df1['num_stopwords_in_search'] = num_stopwords_in_search(df1.search_term)

# (feature column, attribute column) pairs, listed in the order in which the
# feature columns must be appended to df1.
attribute_features = [('brand_matched', 'MFG Brand Name'), ('material_matched', 'Material')]
attribute_features += [('Bullet{:02d}_matched'.format(number), 'Bullet{:02d}'.format(number))
                       for number in range(1, 11)]
attribute_features += [('color_family_matched', 'Color Family'), ('color_finish_matched', 'Color/Finish')]

for feature_name, attribute_name in attribute_features:
    df1[feature_name] = attribute_match(attribute_name)

# Does the 1st ... 8th query word occur in the product title?
ordinals = ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh', 'eighth']
for position, ordinal in enumerate(ordinals, 1):
    df1[ordinal + '_word_matched'] = nth_word_matched(df1.search_term, df1.product_title, position)

df1['query_last_word_in_title'] = word_matched(df1.search_term, df1.product_title)
df1['query_last_word_in_description'] = word_matched(df1.search_term, df1.product_description)

In [None]:
from nltk.tag.perceptron import PerceptronTagger
tagger = PerceptronTagger()

def pos_tag_count(searches):
    """Part-of-speech tag every query with the notebook-level ``tagger``.

    Args:
        searches: iterable of query strings.

    Returns:
        A list with, per query, the tagger's list of ``(word, tag)`` pairs.
    """
    word_pos = []

    for count, search_phrase in enumerate(searches, 1):
        # Progress marker for long runs.
        if count % 50000 == 0:
            print(count)

        # The tagger expects a list of tokens. Handing it the raw string made
        # it tag every single character. Calling tagger.tag directly also
        # avoids the private nltk.tag._pos_tag helper.
        word_pos.append(tagger.tag(search_phrase.split()))

    return word_pos

# Tag the stemmed search terms of every row (train and test).
word_pos = pos_tag_count(df1.search_term)


In [None]:
# Keep only the tag of each (word, tag) pair. A real list is required because
# pos_list is traversed several times below; on Python 3 `map` would be a
# one-shot iterator.
pos_list = [[tag for _, tag in word_list] for word_list in word_pos]

In [None]:
pos_all = []
for pos in pos_list:
    pos_all.extend(pos)

# One feature column per tag seen anywhere: how often the tag occurs in the row.
pos_counts = Counter(pos_all)
for pos in pos_counts:  # iterating the Counter yields its keys on Python 2 and 3
    df1['pos_' + str(pos)] = [x.count(pos) for x in pos_list]
    

In [None]:
# Snapshot of the full feature frame before columns are dropped for training.
df1_backup = df1.copy()

#### Set up values for training

In [None]:
def get_train_test_x(_df, features=None, feature_indices=None):
    """Split a feature frame into train and test matrices.

    Args:
        _df: frame with a ``dataset`` column holding ``'train'`` or ``'test'``.
        features: optional column names to keep.
        feature_indices: optional column positions to keep; used only when
            ``features`` is not given.

    Returns:
        ``(x_train, x_test)`` as numpy arrays. Without a selection every column
        except ``dataset`` is returned.
    """
    is_train = _df.dataset == 'train'
    is_test = _df.dataset == 'test'

    # Explicit None/length checks: truth-testing an array or Index raises.
    if features is not None and len(features) > 0:
        selected = _df[features]
    elif feature_indices is not None and len(feature_indices) > 0:
        # This branch used to read the global `df`, a misspelt
        # `features_indices` and `features=None`, so it could never run.
        selected = _df.iloc[:, feature_indices]
    else:
        # drop() returns a new frame, avoiding an in-place edit of a slice.
        selected = _df.drop(['dataset'], axis=1)

    return selected[is_train].values, selected[is_test].values

In [None]:
# Hand-picked subset of feature columns.
# NOTE(review): no later cell shown here reads `selected_features`; the models
# below are trained on all columns - confirm whether this list is still needed.
selected_features = [u'term_avg_score', u'ratio_words_matched_search',
       u'ratio_words_matched_title', u'query_last_word_in_title',
       u'product_titles_ns_3', u'product_titles_ns_1', u'product_titles_ns_4',
       u'product_titles_ns_6', u'product_titles_ns_0', u'product_titles_ns_7',
       u'product_titles_ns_9', u'product_titles_ns_5', u'product_titles_ns_2',
       u'product_titles_ns_8']

In [None]:
import warnings
warnings.filterwarnings("ignore")

# Identifier and raw-text columns are not model inputs.
non_feature_columns = ['product_uid', 'product_title', 'search_term', 'product_description']
df1 = df1.drop(non_feature_columns, axis=1)

# `dataset` is still needed for the split and is removed from df1 afterwards,
# so that df1.columns lines up with the columns of x_train / x_test.
x_train, x_test = get_train_test_x(df1)
df1 = df1.drop(['dataset'], axis=1)

### Training Regressors

In [None]:
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn import pipeline, grid_search

def RMSE(y, y_pred):
    """Root-mean-squared error between ``y`` and ``y_pred``, rounded to 3 decimals."""
    mse = mean_squared_error(y, y_pred)
    rmse = mse ** 0.5
    return round(rmse, 3)

def fmean_squared_error(ground_truth, predictions):
    """Unrounded root-mean-squared error; the metric wrapped by ``RMSE_scorer``."""
    return mean_squared_error(ground_truth, predictions) ** 0.5
# Scorer passed to the grid searches below; a lower RMSE is better, hence
# greater_is_better=False.
RMSE_scorer = make_scorer(fmean_squared_error, greater_is_better=False)

import sklearn.preprocessing as pp
le = pp.LabelEncoder()

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Small single-point grid: a quick baseline fit of the random forest with
# 2-fold cross-validation, scored by RMSE.
rfr = RandomForestRegressor(random_state=0, verbose=0)
param_grid = dict(n_estimators=[30], max_features=[10], max_depth=[10])
model_rfr = grid_search.GridSearchCV(
    estimator=rfr,
    param_grid=param_grid,
    cv=2,
    verbose=20,
    scoring=RMSE_scorer,
    n_jobs=-1
)

In [None]:
def run_model(_model, _x_train, _y_train):
    """Fit a grid-search model, report its best result and return the winner.

    Args:
        _model: object with ``fit`` that afterwards exposes ``best_params_``,
            ``best_score_`` and ``best_estimator_``.
        _x_train: training features.
        _y_train: training targets.

    Returns:
        ``_model.best_estimator_`` after fitting.
    """
    _model.fit(_x_train, _y_train)

    report = (
        ("Best parameters found by grid search:", 'best_params_'),
        ("Best CV score:", 'best_score_'),
    )
    for heading, attribute in report:
        print(heading)
        print(getattr(_model, attribute))

    return _model.best_estimator_


In [None]:
# Fit the random-forest grid search on the training matrix and keep the best estimator.
model_optimized = run_model(model_rfr, x_train, y_train)

In [None]:
new_x_train = x_train
ensemble_iterations = 5

# Each round refits the grid search and appends the model's predictions as one
# more feature column for the next round.
# NOTE(review): the predictions are made on the same rows the model was just
# fit on, so the added column is an in-sample estimate.
ith_e = 0
for ith_e in range(1, ensemble_iterations + 1):
    curr_x_train = new_x_train

    print('ith Ensemble Iteration: {}'.format(ith_e))

    model_optimized = run_model(model_rfr, curr_x_train, y_train)
    output = model_optimized.predict(curr_x_train)

    # Widen the matrix by one column and store the predictions in it.
    new_x_train = np.zeros((curr_x_train.shape[0], curr_x_train.shape[1] + 1))
    new_x_train[:, :-1] = curr_x_train
    new_x_train[:, -1] = output
    

In [None]:
from xgboost.sklearn import XGBRegressor

xgbr = XGBRegressor(seed=0)
# 'reg:logistic' is left out of the grid: that objective requires labels in
# [0, 1], whereas the relevance target ranges from 1 to 3.
param_grid = {'objective': ['reg:linear'],
              'n_estimators': [50], 'max_depth': [5],
              'learning_rate': [0.01, 0.1]}

model = grid_search.GridSearchCV(estimator=xgbr, param_grid=param_grid, cv=2,
                                 verbose=20, scoring=RMSE_scorer)

In [None]:
# Same fit-and-report sequence as for the random forest, via the shared helper.
model_xgb_optimized = run_model(model, x_train, y_train)


In [None]:
# Real copies: plain assignment would only create a second name for the same
# array / series, so later edits would show up in both.
x_train_copy = x_train.copy()
y_train_copy = y_train.copy()

## Feature Importance

In [None]:
importances = model_optimized.feature_importances_
ranking = np.argsort(importances)[::-1]

# A model fitted inside the ensemble loop has extra prediction columns that do
# not exist in df1; give them placeholder names so the lookup cannot overrun.
feature_names = list(df1.columns)
feature_names += ['ensemble_pred_{}'.format(i)
                  for i in range(len(importances) - len(feature_names))]

print([round(x, 2) for x in sorted(importances)[::-1]])
print([feature_names[i] for i in ranking])

In [None]:
def choose_best_k_features(k):
    """Column positions of the ``k`` most important features of ``model_optimized``."""
    ascending = np.argsort(model_optimized.feature_importances_)
    descending = ascending[::-1]
    return descending[:k]

In [None]:
# Distribution of the training relevance scores in 20 bins.
plt.hist(y_train, bins=20)
plt.show()

In [None]:
# Histogram of the first entry of `predictions_all` in 20 bins.
# NOTE(review): `predictions_all` is not defined in any cell shown here, so
# this cell raises NameError on a fresh kernel - confirm where it should come from.
plt.hist(predictions_all[0], 20)
plt.show()

## PCA

In [None]:
# Column names followed by the feature-to-feature correlation matrix
# (rowvar=0: every column of x_train is one variable), rounded to 2 decimals.
print(df1.columns)
np.around(np.corrcoef(x_train, rowvar=0), 2)

In [None]:
from sklearn.decomposition import PCA
# Share of variance explained by each principal component of the training matrix.
pca = PCA().fit(x_train)
print(pca.explained_variance_ratio_)

## GraphViz

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

from sklearn.externals.six import StringIO  
import pydotplus
dot_data = StringIO() 

import graphviz

# NOTE(review): y_train holds the relevance scores that are treated as a
# regression target elsewhere - confirm that a classifier is intended here.
rfc = RandomForestClassifier(n_estimators=10)
rfc.fit(x_train, y_train)

for i, _tree in enumerate(rfc.estimators_):
    dot_data = StringIO()
    tree.export_graphviz(_tree, out_file=dot_data)
    dot_source = dot_data.getvalue()

    # Actually write the graph description. The file used to be opened but
    # left empty because the export went only to the in-memory buffer.
    with open('figures/tree_' + str(i) + '.dot', 'w') as dotfile:
        dotfile.write(dot_source)

    graph = pydotplus.graph_from_dot_data(dot_source)

## Results

In [None]:
def _append_column(matrix, column):
    """Return a float copy of ``matrix`` with ``column`` added as the last column."""
    widened = np.zeros((matrix.shape[0], matrix.shape[1] + 1))
    widened[:, :-1] = matrix
    widened[:, -1] = column
    return widened


new_x_train, new_x_test = x_train, x_test

# Larger forest and a real grid for the final model.
rfr = RandomForestRegressor(random_state=0, verbose=0)
param_grid = {'n_estimators': [500], 'max_features': [15, 25], 'max_depth': [15, 25]}
model_rfr = grid_search.GridSearchCV(estimator=rfr, param_grid=param_grid,
                                     cv=2, verbose=20, scoring=RMSE_scorer, n_jobs=-1)

ensemble_iterations = 1
ith_e = 0
for ith_e in range(1, ensemble_iterations + 1):
    curr_x_train, curr_x_test = new_x_train, new_x_test

    model_optimized = run_model(model_rfr, curr_x_train, y_train)

    # The model's own predictions become an extra feature for the next round,
    # on the training and on the test side alike.
    output_train = model_optimized.predict(curr_x_train)
    new_x_train = _append_column(curr_x_train, output_train)

    output_test = model_optimized.predict(curr_x_test)
    new_x_test = _append_column(curr_x_test, output_test)

In [None]:
# Test-set predictions of the last ensemble round; these are written to the results file.
preds = output_test

In [None]:
import csv
import os

output_path = 'results/results_rfr_ensemble.csv'

# Make sure the target folder exists before writing.
output_dir = os.path.dirname(output_path)
if output_dir and not os.path.isdir(output_dir):
    os.makedirs(output_dir)

# pandas writes the file identically on Python 2 and 3 (csv.writer on a 'wb'
# handle fails on Python 3) and raises if ids and predictions differ in length
# instead of silently truncating.
submission = pd.DataFrame({'id': df_test.id.values, 'relevance': preds})
submission.to_csv(output_path, index=False, columns=['id', 'relevance'])