## Investigate 2: feature size of each step

In [12]:
# define the transformers
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import MiniBatchKMeans
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.cross_validation import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.base import TransformerMixin
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup 
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
import random
from collections import defaultdict
import re
import time 
%matplotlib inline


In [4]:
# import transformers
from transformers import *

In [6]:
random_state = 123

In [7]:
# build pipelines
variable_unchanged = Pipeline([('variable_extractor', VariableExtractor(['bedrooms', 'bathrooms', 'price']))])
bbratio = Pipeline([
        ('room_ratio', RatioCreator('bedrooms', 'bathrooms')),
        ('bbratio_dimup', DimOneUp())
    ])
bpratio = Pipeline([
        ('priceroom_ratio', RatioCreator('price', 'bedrooms')),
        ('bpratio_dimup', DimOneUp())
    ])
llcluster = Pipeline([
        ('ll_extractor', VariableExtractor(['longitude', 'latitude'])),
        ('ll_cluster', LLCluster(init='k-means++', n_clusters=10, batch_size=200, n_init=10, 
                                 max_no_improvement=10, verbose=0, random_state=random_state)),
        ('llcluster_dimup', DimOneUp())                     
    ])

feature_process = Pipeline([
        ('feature_cleanser', FeatureCleanser()),
        ('feature_union', FeatureUnion([
                    ('feature_counts', Pipeline([
                                ('feature_length', VariableLength()),
                                ('feature_dimup', DimOneUp())
                            ])),
                    ('different_features', DiffFeatCounts())
                ]))
    ])

description_process = FeatureUnion([
        ('description_length', Pipeline([('description_counts', DescriptionWordCounts()),
                                     ('dc_dimup', DimOneUp())
                                    ])
        ), 
        ('description_tf', DescriptionProcessor())
    ])

photo_length = Pipeline([('photo_extractor', VariableExtractor('photos')),
                         ('photo_counts', VariableLength()), 
                         ('photo_dimup', DimOneUp())
    ])

building_process = Pipeline([
        ('building_extractor', VariableExtractor('building_id')),
        ('building_union', FeatureUnion([
                    ('building_counts', Pipeline([
                                ('building_length', CatVariableCounts()),
                                ('building_dimup', DimOneUp())
                            ])),
                    ('building_indicator', CatVariableIndicator())
                ]))
    ])

manager_process = Pipeline([
        ('manager_extractor', VariableExtractor('manager_id')),
        ('manager_union', FeatureUnion([
                    ('manager_counts', Pipeline([
                                ('manager_length', CatVariableCounts()),
                                ('manager_dimup', DimOneUp())
                            ])),
                    ('manager_indicator', CatVariableIndicator())
                ]))
    ])

date_process = Pipeline([
        ('hour_extractor', DateProcessor(wantyear=False, wantmonth=False, wanthour=True)),
        ('hour_dimup', DimOneUp())
    ])

address_process = Pipeline([
        ('address_cleanser', AddressCleanser()),
        ('address_union', FeatureUnion([
                    ('address_counts', Pipeline([
                                ('address_length', CatVariableCounts()),
                                ('address_dimup', DimOneUp())
                            ])),
                    ('address_indicator', CatVariableIndicator())
                ]))
    ])


In [9]:
# build predictors
predictor = AdaBoostClassifier(random_state=random_state)

In [11]:
# read in the data 
# read in the training data set
train_set = pd.read_json(r'..\data\train.json')
test_set = pd.read_json(r'..\data\test.json')
print("Read in training data set with size of {} * {}".format(train_set.shape[0], train_set.shape[1]))
print("Read in testing data set with size of {} * {}".format(test_set.shape[0], test_set.shape[1]))

Read in training data set with size of 49352 * 15
Read in testing data set with size of 74659 * 14


In [15]:
orig_X = train_set.drop('interest_level', axis=1)
orig_y = train_set['interest_level']

In [22]:
train_X, test_X, train_y, test_y = train_test_split(orig_X, orig_y, test_size=0.2, 
                                                    random_state=random_state, stratify=orig_y )

In [27]:
print("Size: train_X: {}, train_y: {}".format(train_X.shape, len(train_y)))
print("Size: test_X: {}, test_y: {}".format(test_X.shape, len(test_y)))


Size: train_X: (39481, 14), train_y: 39481
Size: test_X: (9871, 14), test_y: 9871


In [31]:
# check the distribution of y
print('Train set:')
print(train_y.value_counts(normalize=True))
print('\nTest set:')
print(test_y.value_counts(normalize=True))

Train set:
low       0.694689
medium    0.227527
high      0.077784
Name: interest_level, dtype: float64

Test set:
low       0.694661
medium    0.227535
high      0.077804
Name: interest_level, dtype: float64


The split is balanced.

In [43]:
def fitmodel(pipeline):
    t1 =time.time()
    pipeline.fit(train_X, train_y)
    print('Time to fit the trainset: {:.2f}'.format((time.time()-t1)))
    t1 =time.time()
    pred_prob = pipeline.predict_proba(train_X)
    print('Log score: {}'.format(log_loss(train_y, pred_prob)))
    print('Time to make the prediction to the trainset: {:.2f}'.format((time.time()-t1)))
    return pipeline, pred_prob
    

In [48]:
def predictmodel(pipeline):
    pred_unchanged_test = pipeline.predict_proba(test_X)
    print('Log score: {} for CAT'.format(log_loss(test_y, pred_unchanged_test)))
    return pred_unchanged_test

    

In [51]:
def get_feature_importance(pipeline):
    return pipeline.named_steps['predictor'].feature_importances_ 


## Build pipline step by step

### Step1: Unchanged Features

variable_unchanged = Pipeline([('variable_extractor', VariableExtractor(['bedrooms', 'bathrooms', 'price']))])


In [37]:
data1 = variable_unchanged.fit_transform(train_X, train_y)

In [38]:
data1.shape

(39481, 3)

In [40]:
pipe_unchaged = Pipeline([
        ('variable_unchanged', variable_unchanged),
        ('predictor', predictor)
    ])

In [45]:
pipe_unchaged, pred_unchanged_train = fitmodel(pipe_unchaged)

Time to fit the trainset: 4.59
Log score: 1.0758216934480005
Time to make the prediction to the trainset: 0.43


In [52]:
# importance features
get_feature_importance(pipe_unchaged) 

array([ 0.16,  0.16,  0.68])

price is more important among the 3 variables.

In [49]:
pred_unchanged_test = predictmodel(pipe_unchaged)

Log score: 1.0758768675370136 for CAT


Unchanged variables make sense, no obvious overfitting. 

## Step2: ratios

In [58]:
bbratio = Pipeline([
        ('room_ratio', RatioCreator('bathrooms', 'bedrooms')),
        ('bbratio_dimup', DimOneUp())
    ])
bpratio = Pipeline([
        ('priceroom_ratio', RatioCreator('price', 'bedrooms')),
        ('bpratio_dimup', DimOneUp())
    ])

ratio_unions = FeatureUnion([
        ('bbratio', bbratio),
        ('bpratio', bpratio)
    ])


In [59]:
data2 = ratio_unions.fit_transform(train_X, train_y)

In [60]:
data2[:10]

array([[  1.00000000e+00,   1.99750000e+03],
       [  1.00000000e+00,   2.15000000e+03],
       [  1.00000000e+00,   2.94100000e+03],
       [  1.00000000e+00,   2.05000000e+03],
       [  1.00000000e+00,   2.80000000e+03],
       [  5.00000000e-01,   1.32500000e+03],
       [  2.50000000e-01,   1.17375000e+03],
       [  5.00000000e-01,   1.52500000e+03],
       [  6.66666667e-01,   1.43166667e+03],
       [  1.00000000e+00,   4.20000000e+03]])

In [74]:
pipe_ratio = Pipeline([
        ('feature_union', FeatureUnion([
                    ('variable_unchanged', variable_unchanged),
                    ('ratio_unions', ratio_unions)
                ])),        
        ('predictor', predictor)
    ])

In [75]:
pipe_ratio, pred_ratio_train = fitmodel(pipe_ratio)

Time to fit the trainset: 4.59
Log score: 1.0725010664541172
Time to make the prediction to the trainset: 0.46


In [76]:
get_feature_importance(pipe_ratio) 

array([ 0.1 ,  0.04,  0.34,  0.04,  0.48])

price and bedroom-price ratio more important

In [77]:
pred_ratio_test = predictmodel(pipe_ratio)

Log score: 1.0724667541005297 for CAT


A little improvement adding the ratios. No obvious overfitting.

## Step3: llcluster

In [78]:
llcluster = Pipeline([
        ('ll_extractor', VariableExtractor(['longitude', 'latitude'])),
        ('ll_cluster', LLCluster(init='k-means++', n_clusters=10, batch_size=200, n_init=10, 
                                 max_no_improvement=10, verbose=0, random_state=random_state)),
        ('llcluster_dimup', DimOneUp())                     
    ])


In [79]:
data3 = llcluster.fit_transform(train_X, train_y)

In [80]:
data3.shape

(39481, 1)

In [82]:
pipe_llcluster = Pipeline([
        ('feature_union', FeatureUnion([
                    ('variable_unchanged', variable_unchanged),
                    ('ratio_unions', ratio_unions),
                    ('llcluster', llcluster)
                ])),        
        ('predictor', predictor)
    ])

In [83]:
pipe_llcluster, pred_ll_train = fitmodel(pipe_llcluster)

Time to fit the trainset: 5.80
Log score: 1.0718592073577156
Time to make the prediction to the trainset: 0.56


In [84]:
get_feature_importance(pipe_llcluster) 

array([ 0.12,  0.  ,  0.3 ,  0.02,  0.36,  0.2 ])

In [85]:
pred_ll_test = predictmodel(pipe_llcluster)

Log score: 1.0718482982652673 for CAT


A little improvement adding the llcluster. No obvious overfitting.

## Step4: photo length

In [86]:
photo_length = Pipeline([('photo_extractor', VariableExtractor('photos')),
                         ('photo_counts', VariableLength()), 
                         ('photo_dimup', DimOneUp())
    ])

In [87]:
data4 = photo_length.fit_transform(train_X, train_y)

In [89]:
data4[:10]

array([[8],
       [7],
       [0],
       [4],
       [6],
       [3],
       [0],
       [4],
       [6],
       [7]], dtype=int64)

In [90]:
pipe_photo = Pipeline([
        ('feature_union', FeatureUnion([
                    ('variable_unchanged', variable_unchanged),
                    ('ratio_unions', ratio_unions),
                    ('llcluster', llcluster),
                    ('photo_length', photo_length)
                ])),        
        ('predictor', predictor)
    ])

In [91]:
pipe_photo, pred_photo = fitmodel(pipe_photo)

Time to fit the trainset: 5.99
Log score: 1.0685676500078314
Time to make the prediction to the trainset: 0.58


In [92]:
get_feature_importance(pipe_photo) 

array([ 0.1 ,  0.02,  0.24,  0.02,  0.24,  0.26,  0.12])

bedroom, bathroom, price, bbraio, bpratio, llcluster, photolength

In [93]:
pred_photo_test = predictmodel(pipe_photo)

Log score: 1.0689446900372812 for CAT


A little improvement adding the photo length. No obvious overfitting.

## Step5: Manager_count

In [94]:
manager_counts = Pipeline([
        ('manager_extractor', VariableExtractor('manager_id')),
        ('manager_union', FeatureUnion([
                    ('manager_counts', Pipeline([
                                ('manager_length', CatVariableCounts()),
                                ('manager_dimup', DimOneUp())
                            ]))
                ]))
    ])


In [96]:
data5 = manager_counts.fit_transform(train_X, train_y)

In [98]:
data5[:10]

array([[  5],
       [  2],
       [325],
       [ 11],
       [118],
       [  4],
       [ 15],
       [114],
       [232],
       [ 50]], dtype=int64)

In [99]:
pipe_m_c = Pipeline([
        ('feature_union', FeatureUnion([
                    ('variable_unchanged', variable_unchanged),
                    ('ratio_unions', ratio_unions),
                    ('llcluster', llcluster),
                    ('photo_length', photo_length),
                    ('manager_counts', manager_counts)
                ])),        
        ('predictor', predictor)
    ])

In [100]:
pipe_m_c, pred_mc = fitmodel(pipe_m_c)

Time to fit the trainset: 6.48
Log score: 1.0682254627726508
Time to make the prediction to the trainset: 0.80


In [101]:
get_feature_importance(pipe_m_c) 

array([ 0.08,  0.02,  0.22,  0.02,  0.22,  0.14,  0.1 ,  0.2 ])

 bathroom, bedroom, price, bbratio, bpratio, llcluster, photolength, mc

In [103]:
pred_mc_test = predictmodel(pipe_m_c)

Log score: 1.0686196460413642 for CAT


A little improvement adding the manager count. No obvious overfitting.

## Step6: Managers_indicator

In [104]:
manager_process = Pipeline([
        ('manager_extractor', VariableExtractor('manager_id')),
        ('manager_union', FeatureUnion([
                    ('manager_counts', Pipeline([
                                ('manager_length', CatVariableCounts()),
                                ('manager_dimup', DimOneUp())
                            ])),
                    ('manager_indicator', CatVariableIndicator(min_list=10, 
                                                               threshold={'low': 0.8, 'medium': 0.6, 'low': 0.3}))
                ]))
    ])


In [106]:
data6 = manager_process.fit_transform(train_X, train_y)

In [107]:
data6.shape

(39481, 3)

In [108]:
data6[:10]

array([[  5,   0,   0],
       [  2,   0,   0],
       [325,   1,   0],
       [ 11,   1,   0],
       [118,   1,   0],
       [  4,   0,   0],
       [ 15,   1,   0],
       [114,   1,   0],
       [232,   1,   0],
       [ 50,   1,   0]], dtype=int64)

In [115]:
pipe_m_i = Pipeline([
        ('feature_union', FeatureUnion([
                    ('variable_unchanged', variable_unchanged),
                    ('ratio_unions', ratio_unions),
                    ('llcluster', llcluster),
                    ('photo_length', photo_length),
                    ('manager_process', manager_process)
                ])),        
        ('predictor', predictor)
    ])

In [116]:
pipe_m_i, pred_mi = fitmodel(pipe_m_i)

Time to fit the trainset: 18.65
Log score: 1.0669540400088766
Time to make the prediction to the trainset: 8.19


In [117]:
get_feature_importance(pipe_m_i) 

array([ 0.04,  0.  ,  0.18,  0.02,  0.18,  0.14,  0.1 ,  0.24,  0.08,  0.02])

The feature indicators do not help a lot.

In [118]:
pred_mi_test = predictmodel(pipe_m_i)

Log score: 1.0673653516365695 for CAT


### How about only using manager process itself

In [119]:
manager_process_only = Pipeline([
        ('feature_union', FeatureUnion([
                    ('manager_process', manager_process)
                ])),        
        ('predictor', predictor)
    ])

In [120]:
manager_process_only, pred_m = fitmodel(manager_process_only)

Time to fit the trainset: 18.05
Log score: 1.0826666736772048
Time to make the prediction to the trainset: 8.55


In [121]:
get_feature_importance(manager_process_only) 

array([ 0.8 ,  0.14,  0.06])

In [122]:
pred_m_test = predictmodel(manager_process_only)

Log score: 1.0828867315567432 for CAT


Not so good, seems like the random forest used before overfits

## Step6: Building_count

In [123]:
building_counts = Pipeline([
        ('building_extractor', VariableExtractor('building_id')),
        ('building_union', FeatureUnion([
                    ('building_counts', Pipeline([
                                ('building_length', CatVariableCounts()),
                                ('building_dimup', DimOneUp())
                            ]))
                ]))
    ])

In [125]:
data6 = building_counts.fit_transform(train_X)

In [126]:
data6.shape

(39481, 1)

In [127]:
pipe_b_c = Pipeline([
        ('feature_union', FeatureUnion([
                    ('variable_unchanged', variable_unchanged),
                    ('ratio_unions', ratio_unions),
                    ('llcluster', llcluster),
                    ('photo_length', photo_length),
                    ('manager_process', manager_process),
                    ('building_counts', building_counts)
                ])),        
        ('predictor', predictor)
    ])

In [128]:
pipe_b_c, pred_bc = fitmodel(pipe_b_c)

Time to fit the trainset: 19.47
Log score: 1.0653566419848486
Time to make the prediction to the trainset: 9.83


In [129]:
get_feature_importance(pipe_b_c) 

array([ 0.04,  0.  ,  0.18,  0.02,  0.22,  0.14,  0.1 ,  0.18,  0.06,
        0.02,  0.04])

bathroom, bedroom, price, bbratio, bpratio, llcluster, photolength, mc, mc_low, mc_medium, bc

In [131]:
pred_bc_test = predictmodel(pipe_b_c)

Log score: 1.065833600017015 for CAT


## Step7: Address count

In [132]:
ac = Pipeline([
        ('address_cleanser', AddressCleanser()),
        ('address_union', FeatureUnion([
                    ('address_counts', Pipeline([
                                ('address_length', CatVariableCounts()),
                                ('address_dimup', DimOneUp())
                            ]))
                ]))
    ])


In [133]:
data7 = ac.fit_transform(train_X)

In [134]:
data7.shape

(39481, 1)

In [135]:
pipe_a_c = Pipeline([
        ('feature_union', FeatureUnion([
                    ('variable_unchanged', variable_unchanged),
                    ('ratio_unions', ratio_unions),
                    ('llcluster', llcluster),
                    ('photo_length', photo_length),
                    ('manager_process', manager_process),
                    ('building_counts', building_counts),
                    ('ac', ac)
                ])),        
        ('predictor', predictor)
    ])

In [136]:
pipe_a_c, pred_ac = fitmodel(pipe_a_c)

Time to fit the trainset: 20.30
Log score: 1.065798313274488
Time to make the prediction to the trainset: 9.51


In [137]:
get_feature_importance(pipe_a_c) 

array([ 0.04,  0.  ,  0.16,  0.02,  0.2 ,  0.14,  0.1 ,  0.16,  0.08,
        0.02,  0.04,  0.04])

AC seems not helpful.

In [138]:
pred_bc_test = predictmodel(pipe_a_c)

Log score: 1.065978658380682 for CAT


## Step8: Address indicator & building indicator

In [142]:
building_process = Pipeline([
        ('building_extractor', VariableExtractor('building_id')),
        ('building_union', FeatureUnion([
                    ('building_counts', Pipeline([
                                ('building_length', CatVariableCounts()),
                                ('building_dimup', DimOneUp())
                            ])),
                    ('building_indicator', CatVariableIndicator(min_list=10, 
                                                               threshold={'low': 0.8, 'medium': 0.6, 'low': 0.3}))
                ]))
    ])



In [143]:
data81 = building_process.fit_transform(train_X, train_y)

In [149]:
data81[:10]

array([[  17,    1,    0],
       [6643,    1,    0],
       [ 116,    1,    0],
       [6643,    1,    0],
       [  15,    1,    0],
       [6643,    1,    0],
       [6643,    1,    0],
       [  56,    1,    0],
       [ 102,    1,    0],
       [  22,    1,    0]], dtype=int64)

In [145]:
address_process = Pipeline([
        ('address_cleanser', AddressCleanser()),
        ('address_union', FeatureUnion([
                    ('address_counts', Pipeline([
                                ('address_length', CatVariableCounts()),
                                ('address_dimup', DimOneUp())
                            ])),
                    ('address_indicator', CatVariableIndicator(min_list=10, 
                                                               threshold={'low': 0.8, 'medium': 0.6, 'low': 0.3}))
                ]))
    ])


In [146]:
data82 = address_process.fit_transform(train_X, train_y)

In [148]:
data82[:10]

array([[  8,   0,   0],
       [  2,   0,   0],
       [301,   1,   0],
       [  1,   0,   0],
       [ 41,   1,   0],
       [121,   1,   0],
       [149,   1,   0],
       [150,   1,   0],
       [128,   1,   0],
       [ 53,   1,   0]], dtype=int64)

In [150]:
pipe_ab_p = Pipeline([
        ('feature_union', FeatureUnion([
                    ('variable_unchanged', variable_unchanged),
                    ('ratio_unions', ratio_unions),
                    ('llcluster', llcluster),
                    ('photo_length', photo_length),
                    ('manager_process', manager_process),
                    ('building_process', building_process),
                    ('address_process', address_process)
                ])),        
        ('predictor', predictor)
    ])

In [151]:
pipe_ab_p, pred_ab = fitmodel(pipe_ab_p)

Time to fit the trainset: 41.20
Log score: 1.0655419708554426
Time to make the prediction to the trainset: 24.45


In [152]:
get_feature_importance(pipe_a_c) 

array([ 0.04,  0.  ,  0.14,  0.02,  0.22,  0.14,  0.1 ,  0.16,  0.08,
        0.02,  0.02,  0.  ,  0.02,  0.02,  0.  ,  0.02])

bathroom, bedroom, price, bbratio, bpratio, llcluster, photolength, mc, mc_low, mc_medium, bc, bc_low, bc_m, ac, ac_l, ac_m

In [154]:
pred_bc_test = predictmodel(pipe_ab_p)

Log score: 1.0657994409853349 for CAT


In [None]:
# in mu opinion, only keep manager, no use of address, building. can build small models and add share.