## Investigate 2: feature size of each step

In [12]:
# Third-party imports used throughout the notebook (the custom transformer
# classes are pulled in by a separate `from transformers import *` cell).
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import MiniBatchKMeans
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
# NOTE(review): sklearn.cross_validation is the legacy home of KFold, while
# train_test_split further down is already taken from sklearn.model_selection.
# The legacy module was removed in scikit-learn 0.20, so this line breaks a
# fresh run on current versions. The two KFold classes take different
# constructor arguments, so check every KFold call before switching modules.
from sklearn.cross_validation import KFold
# NOTE(review): the next two imports repeat names already imported above.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.base import TransformerMixin
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup 
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
import random
from collections import defaultdict
import re
import time 
%matplotlib inline


In [4]:
# Star-import the custom pipeline steps. The step classes used in the pipeline
# cells (VariableExtractor, RatioCreator, DimOneUp, LLCluster, ...) are not
# imported by name anywhere in the visible cells, so they presumably come from
# this module.
# NOTE(review): presumably a project-local transformers.py rather than the
# PyPI package of the same name — confirm, and prefer explicit imports so the
# origin of each name is visible.
from transformers import *

In [6]:
# Single seed shared by the location clusterer (LLCluster), the AdaBoost
# predictor and the train/hold-out split, so reruns are reproducible.
random_state = 123

In [7]:
# build pipelines
# All step classes below come from the custom `transformers` module; their
# exact behaviour is defined there, not in this notebook.

# Selects the three raw numeric columns.
variable_unchanged = Pipeline(steps=[
    ('variable_extractor', VariableExtractor(['bedrooms', 'bathrooms', 'price'])),
])

# Ratio feature built by RatioCreator from 'bedrooms' and 'bathrooms';
# DimOneUp presumably reshapes the 1-D result into a column — confirm.
bbratio = Pipeline(steps=[
    ('room_ratio', RatioCreator('bedrooms', 'bathrooms')),
    ('bbratio_dimup', DimOneUp()),
])

# Ratio feature built by RatioCreator from 'price' and 'bedrooms'.
bpratio = Pipeline(steps=[
    ('priceroom_ratio', RatioCreator('price', 'bedrooms')),
    ('bpratio_dimup', DimOneUp()),
])

# Longitude/latitude pair fed to LLCluster, configured for 10 clusters and
# seeded with the notebook-wide random_state.
llcluster = Pipeline(steps=[
    ('ll_extractor', VariableExtractor(['longitude', 'latitude'])),
    ('ll_cluster', LLCluster(
        init='k-means++',
        n_clusters=10,
        batch_size=200,
        n_init=10,
        max_no_improvement=10,
        verbose=0,
        random_state=random_state,
    )),
    ('llcluster_dimup', DimOneUp()),
])

# Listing "features": cleanse first, then combine a length feature with the
# output of DiffFeatCounts.
feature_process = Pipeline(steps=[
    ('feature_cleanser', FeatureCleanser()),
    ('feature_union', FeatureUnion(transformer_list=[
        ('feature_counts', Pipeline(steps=[
            ('feature_length', VariableLength()),
            ('feature_dimup', DimOneUp()),
        ])),
        ('different_features', DiffFeatCounts()),
    ])),
])

# Description: a word-count feature side by side with the output of
# DescriptionProcessor.
description_process = FeatureUnion(transformer_list=[
    ('description_length', Pipeline(steps=[
        ('description_counts', DescriptionWordCounts()),
        ('dc_dimup', DimOneUp()),
    ])),
    ('description_tf', DescriptionProcessor()),
])

# Photos: extract the 'photos' column and apply VariableLength to it.
photo_length = Pipeline(steps=[
    ('photo_extractor', VariableExtractor('photos')),
    ('photo_counts', VariableLength()),
    ('photo_dimup', DimOneUp()),
])

def _counts_and_indicator_union(prefix):
    """Build the FeatureUnion shared by the categorical-column pipelines.

    The union places a CatVariableCounts -> DimOneUp pipeline next to a
    CatVariableIndicator. Step names are derived from ``prefix`` and are
    exactly '<prefix>_counts', '<prefix>_length', '<prefix>_dimup' and
    '<prefix>_indicator', so ``named_steps`` / ``set_params`` keys are the same
    as when each union was written out by hand.

    Parameters
    ----------
    prefix : str
        Name prefix for the steps, e.g. 'building'.

    Returns
    -------
    FeatureUnion
        A new, unfitted union with fresh transformer instances.
    """
    return FeatureUnion([
        (prefix + '_counts', Pipeline([
            (prefix + '_length', CatVariableCounts()),
            (prefix + '_dimup', DimOneUp()),
        ])),
        (prefix + '_indicator', CatVariableIndicator()),
    ])


# building_id: extract the column, then counts + indicator features.
building_process = Pipeline([
    ('building_extractor', VariableExtractor('building_id')),
    ('building_union', _counts_and_indicator_union('building')),
])

# manager_id: same structure as building_id.
manager_process = Pipeline([
    ('manager_extractor', VariableExtractor('manager_id')),
    ('manager_union', _counts_and_indicator_union('manager')),
])

# Date: DateProcessor configured to keep only the hour component.
date_process = Pipeline([
    ('hour_extractor', DateProcessor(wantyear=False, wantmonth=False, wanthour=True)),
    ('hour_dimup', DimOneUp()),
])

# Address: cleansed by AddressCleanser first, then counts + indicator features.
address_process = Pipeline([
    ('address_cleanser', AddressCleanser()),
    ('address_union', _counts_and_indicator_union('address')),
])


In [9]:
# build predictors
# One seeded AdaBoost classifier, used as the final 'predictor' step of the
# pipelines assembled later in the notebook.
# NOTE(review): Pipeline fits the step objects it is handed, so reusing this
# same instance in several pipelines would make them share one fitted model —
# confirm later steps do not rely on an earlier pipeline staying fitted.
predictor = AdaBoostClassifier(random_state=random_state)

In [11]:
# Read in the training and testing data sets.
# Forward slashes keep the relative paths valid on every OS; backslash
# separators only resolve on Windows.
train_set = pd.read_json('../data/train.json')
test_set = pd.read_json('../data/test.json')
print("Read in training data set with size of {} * {}".format(train_set.shape[0], train_set.shape[1]))
print("Read in testing data set with size of {} * {}".format(test_set.shape[0], test_set.shape[1]))

Read in training data set with size of 49352 * 15
Read in testing data set with size of 74659 * 14


In [15]:
# Separate the target column from the predictor columns.
orig_y = train_set['interest_level']
orig_X = train_set.drop(labels='interest_level', axis=1)

In [22]:
# Stratified 80/20 split of the labelled data. The held-out part is named
# test_X / test_y and is distinct from the separately loaded test_set.
train_X, test_X, train_y, test_y = train_test_split(
    orig_X,
    orig_y,
    test_size=0.2,
    random_state=random_state,
    stratify=orig_y,
)

In [27]:
# Report the shapes of both parts of the split, one line per part.
print(
    "Size: train_X: {}, train_y: {}".format(train_X.shape, train_y.shape[0]),
    "Size: test_X: {}, test_y: {}".format(test_X.shape, test_y.shape[0]),
    sep='\n',
)


Size: train_X: (39481, 14), train_y: 39481
Size: test_X: (9871, 14), test_y: 9871


In [31]:
# Compare the class proportions of y in both parts of the split.
print('Train set:', train_y.value_counts(normalize=True), sep='\n')
print('\nTest set:', test_y.value_counts(normalize=True), sep='\n')

Train set:
low       0.694689
medium    0.227527
high      0.077784
Name: interest_level, dtype: float64

Test set:
low       0.694661
medium    0.227535
high      0.077804
Name: interest_level, dtype: float64


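The split is stratified: the training and held-out sets have nearly identical class proportions. (The classes themselves are imbalanced, with about 69% of listings labelled `low`.)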

In [43]:
def fitmodel(pipeline, X=None, y=None):
    """Fit ``pipeline`` and report its log loss on the data it was fitted on.

    Parameters
    ----------
    pipeline : object exposing ``fit(X, y)`` and ``predict_proba(X)``
        Fitted in place.
    X, y : optional
        Features and labels to fit and score on. When omitted, the
        notebook-level ``train_X`` / ``train_y`` are used.

    Returns
    -------
    tuple
        ``(pipeline, pred_prob)``: the same pipeline object, now fitted, and
        the probabilities returned by ``predict_proba`` for ``X``.
    """
    if X is None:
        X = train_X
    if y is None:
        y = train_y

    start = time.time()
    pipeline.fit(X, y)
    print('Time to fit the trainset: {:.2f}'.format(time.time() - start))

    start = time.time()
    pred_prob = pipeline.predict_proba(X)
    # Stop the clock before scoring, so the reported prediction time does not
    # include the log-loss computation.
    predict_seconds = time.time() - start

    print('Log score: {}'.format(log_loss(y, pred_prob)))
    print('Time to make the prediction to the trainset: {:.2f}'.format(predict_seconds))
    return pipeline, pred_prob
    

In [48]:
def predictmodel(pipeline, X=None, y=None):
    """Score an already fitted ``pipeline`` with log loss and return its probabilities.

    Parameters
    ----------
    pipeline : object exposing ``predict_proba(X)``
        Must already be fitted.
    X, y : optional
        Features and true labels to score on. When omitted, the notebook-level
        held-out ``test_X`` / ``test_y`` are used.

    Returns
    -------
    The probabilities returned by ``pipeline.predict_proba(X)``.
    """
    if X is None:
        X = test_X
    if y is None:
        y = test_y

    pred_prob = pipeline.predict_proba(X)
    # NOTE(review): the ' for CAT' suffix is kept verbatim; its meaning is not
    # evident from this notebook.
    print('Log score: {} for CAT'.format(log_loss(y, pred_prob)))
    return pred_prob

    

In [51]:
def get_feature_importance(pipeline):
    """Return ``feature_importances_`` of the pipeline step named 'predictor'."""
    predictor_step = pipeline.named_steps['predictor']
    return predictor_step.feature_importances_


## Build pipeline step by step

### Step1: Unchanged Features

variable_unchanged = Pipeline([('variable_extractor', VariableExtractor(['bedrooms', 'bathrooms', 'price']))])


In [37]:
# Fit the extractor step on its own and transform the training rows, to see
# how many feature columns this step produces.
# Note: this fits the same variable_unchanged object that is placed inside
# pipe_unchaged further down.
data1 = variable_unchanged.fit_transform(train_X, train_y)

In [38]:
# (rows, columns); three columns are expected, one per variable passed to
# VariableExtractor.
data1.shape

(39481, 3)

In [40]:
# Step 1 model: the raw numeric columns fed straight into the shared predictor.
pipe_unchaged = Pipeline(steps=[
    ('variable_unchanged', variable_unchanged),
    ('predictor', predictor),
])

In [45]:
# Fit on the training split. fitmodel returns the pipeline it was given plus
# the predicted class probabilities for the training rows.
pipe_unchaged, pred_unchanged_train = fitmodel(pipe_unchaged)

Time to fit the trainset: 4.59
Log score: 1.0758216934480005
Time to make the prediction to the trainset: 0.43


In [52]:
# Feature importances of the fitted 'predictor' step; presumably in the column
# order given to VariableExtractor (bedrooms, bathrooms, price) — confirm.
get_feature_importance(pipe_unchaged) 

array([ 0.16,  0.16,  0.68])

Price is by far the most important of the three variables (importance 0.68, versus 0.16 for each of the other two).

In [49]:
# Score the fitted pipeline on the held-out split (test_X / test_y) and keep
# the predicted class probabilities.
pred_unchanged_test = predictmodel(pipe_unchaged)

Log score: 1.0758768675370136 for CAT


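The unchanged variables behave sensibly and show no obvious overfitting: the training log loss (1.0758) and the held-out log loss (1.0759) are nearly identical. Both values are close to ln 3 ≈ 1.0986, the log loss of a uniform guess over three classes, so the predicted probabilities are still only weakly informative at this step.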

### Step2: Ratios