## Part II: Predictive Model

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import MiniBatchKMeans
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.cross_validation import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import TransformerMixin
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import make_scorer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import StratifiedKFold
from bs4 import BeautifulSoup 
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
import random
import re
from collections import defaultdict
from transformers import *
%matplotlib inline



In [2]:
# variables
# Single seed handed to the stochastic components below (e.g. the gradient-boosting
# model, the CV splitter and the location clustering) so repeated runs are comparable.
random_state = 2017

In [28]:
# Gradient-boosting classifier and the hyper-parameter grid searched for it.
# Grid keys carry the 'Predictor__' prefix, i.e. they address a pipeline step
# named 'Predictor'.
predictor_gb = GradientBoostingClassifier(
    max_features='sqrt',
    subsample=0.5,
    random_state=random_state,
)
gb_params = {
    'Predictor__learning_rate': [0.01, 0.1, 0.5],
    'Predictor__n_estimators': [30, 50, 100, 500],
    'Predictor__max_depth': [3, 5, 10],
}




In [29]:
# Show the gradient-boosting search grid.
gb_params

{'Predictor__learning_rate': [0.01, 0.1, 0.5],
 'Predictor__max_depth': [3, 5, 10],
 'Predictor__n_estimators': [30, 50, 100, 500]}

In [4]:
# Random-forest candidate.
# - Seeded with the notebook-wide `random_state` so results are reproducible,
#   in line with the gradient-boosting model and the CV splitter.
# - Only deliberate choices are spelled out. The remaining arguments that used to
#   be listed here were copies of the library defaults; some of them
#   (`min_impurity_split`, `max_features='auto'`) no longer exist in later
#   scikit-learn releases, so repeating them only made the cell version-fragile.
# - `n_estimators` stays explicit because its library default differs between releases.
predictor_rf = RandomForestClassifier(n_estimators=10, random_state=random_state)


In [5]:
# AdaBoost candidate.
# - Seeded with the notebook-wide `random_state` for reproducible runs.
# - `base_estimator=None` and `algorithm='SAMME.R'` were copies of the library
#   defaults and are dropped: both arguments are renamed or removed in later
#   scikit-learn releases, so leaving them implicit keeps the cell portable.
predictor_ada = AdaBoostClassifier(n_estimators=50, learning_rate=1.0, random_state=random_state)

In [6]:
# Cross-validation splitter: 5 shuffled, stratified folds with a fixed seed.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=random_state,
)

# Log-loss scorer: computed from predicted probabilities and sign-flipped
# (greater_is_better=False), so a larger score means a better model.
scorer_log_loss = make_scorer(
    log_loss,
    greater_is_better=False,
    needs_proba=True,
)


In [40]:
class RentalPredict():
    """Grid-searched two-step pipeline: feature construction followed by a classifier.

    Parameters
    ----------
    featureunion : transformer
        Installed as the 'Feature_Unions' pipeline step.
    predictor : estimator
        Installed as the 'Predictor' pipeline step, so its hyper-parameters are
        addressed in `param_dict` with the 'Predictor__' prefix.
    param_dict : dict
        Passed to GridSearchCV as `param_grid`.
    cv : cross-validation generator (or anything GridSearchCV accepts as `cv`)
    scorer : scoring callable/string or None
        Passed to GridSearchCV as `scoring`. `None` falls back to 'neg_log_loss',
        which was the value previously hard-coded regardless of this argument.
    """

    def __init__(self, featureunion, predictor, param_dict, cv, scorer):
        self.estimator = Pipeline([
            ('Feature_Unions', featureunion),
            ('Predictor', predictor),
        ])
        self.param_grid = param_dict
        self.cv = cv
        self.scorer = scorer
        # Honour the caller's scorer instead of silently ignoring it.
        scoring = 'neg_log_loss' if self.scorer is None else self.scorer
        self.Grid_model = GridSearchCV(
            estimator=self.estimator,
            param_grid=self.param_grid,
            cv=self.cv,
            scoring=scoring,
        )

    def fit(self, X, y):
        """Run the grid search on (X, y) and return this wrapper for chaining."""
        self.Grid_model.fit(X, y)
        return self

    def get_gridmodel(self):
        """Return the underlying GridSearchCV object (fitted once `fit` has run)."""
        return self.Grid_model
    


In [74]:
def create_predprob(grid_model, test_X, tocsv=False, filename=None):
    """Build a table of per-class probabilities for the rows of `test_X`.

    Parameters
    ----------
    grid_model : fitted search object
        Must expose `predict_proba` and `best_estimator_.classes_`.
    test_X : pandas.DataFrame
        Rows to score; must contain a 'listing_id' column.
    tocsv : bool, default False
        When True, also write the table to `filename` (without the index).
    filename : str or None
        Destination path; required when `tocsv` is True.

    Returns
    -------
    pandas.DataFrame
        Columns 'listing_id', 'high', 'medium', 'low' (in that order), indexed
        like `test_X`. The class labels of the model are therefore expected to
        be exactly 'high', 'medium' and 'low'.

    Raises
    ------
    ValueError
        If `tocsv` is True but no `filename` is given (previously this wrote
        nothing and gave no hint that the export was skipped).
    """
    if tocsv and filename is None:
        raise ValueError("filename is required when tocsv=True")

    proba = grid_model.predict_proba(test_X)
    # Label the probability columns by the model's own class order, and align on
    # the frame that was actually scored rather than on a notebook-level global.
    pred_table = pd.DataFrame(
        proba,
        columns=grid_model.best_estimator_.classes_,
        index=test_X.index,
    )
    pred_table = pred_table.join(test_X['listing_id'])
    pred_table = pred_table[['listing_id', 'high', 'medium', 'low']]

    if tocsv:
        pred_table.to_csv(filename, index=False)
    return pred_table

In [21]:
# Feature pipelines
# Each pipeline below turns part of the listing frame into one group of model inputs.
# NOTE(review): DimOneUp closes most pipelines; presumably it reshapes a 1-D result
# into a single column so it can be stacked -- confirm in the transformers module.

# Numeric columns selected by name.
variable_unchanged = Pipeline([
    ('variable_extractor', VariableExtractor(['bedrooms', 'bathrooms', 'price'])),
])

# Ratio feature built from the bedrooms and bathrooms columns
# (operand order is defined by RatioCreator).
bbratio = Pipeline([
    ('room_ratio', RatioCreator('bedrooms', 'bathrooms')),
    ('bbratio_dimup', DimOneUp()),
])

# Ratio feature built from the price and bedrooms columns.
bpratio = Pipeline([
    ('priceroom_ratio', RatioCreator('price', 'bedrooms')),
    ('bpratio_dimup', DimOneUp()),
])

# Location feature: longitude/latitude fed to LLCluster with 10 clusters, seeded
# with the notebook-wide random_state.
llcluster = Pipeline([
    ('ll_extractor', VariableExtractor(['longitude', 'latitude'])),
    ('ll_cluster', LLCluster(
        init='k-means++',
        n_clusters=10,
        batch_size=200,
        n_init=10,
        max_no_improvement=10,
        verbose=0,
        random_state=random_state,
    )),
    ('llcluster_dimup', DimOneUp()),
])

# Listing "features" field: cleanse first, then combine a length-based count
# with the output of DiffFeatCounts.
feature_process = Pipeline([
    ('feature_cleanser', FeatureCleanser()),
    ('feature_union', FeatureUnion([
        ('feature_counts', Pipeline([
            ('feature_length', VariableLength()),
            ('feature_dimup', DimOneUp()),
        ])),
        ('different_features', DiffFeatCounts()),
    ])),
])

# Description text: a word-count branch alongside the DescriptionProcessor branch.
description_process = FeatureUnion([
    ('description_length', Pipeline([
        ('description_counts', DescriptionWordCounts()),
        ('dc_dimup', DimOneUp()),
    ])),
    ('description_tf', DescriptionProcessor()),
])

# Photos: take the 'photos' column and measure its length per listing.
photo_length = Pipeline([
    ('photo_extractor', VariableExtractor('photos')),
    ('photo_counts', VariableLength()),
    ('photo_dimup', DimOneUp()),
])

# building_id, manager_id and the address share one layout: isolate the column,
# then union a CatVariableCounts branch with a CatVariableIndicator branch.

building_process = Pipeline([
    ('building_extractor', VariableExtractor('building_id')),
    ('building_union', FeatureUnion([
        ('building_counts', Pipeline([
            ('building_length', CatVariableCounts()),
            ('building_dimup', DimOneUp()),
        ])),
        ('building_indicator', CatVariableIndicator()),
    ])),
])

manager_process = Pipeline([
    ('manager_extractor', VariableExtractor('manager_id')),
    ('manager_union', FeatureUnion([
        ('manager_counts', Pipeline([
            ('manager_length', CatVariableCounts()),
            ('manager_dimup', DimOneUp()),
        ])),
        ('manager_indicator', CatVariableIndicator()),
    ])),
])

# Date feature: DateProcessor is asked for the hour only (year and month switched off).
date_process = Pipeline([
    ('hour_extractor', DateProcessor(wantyear=False, wantmonth=False, wanthour=True)),
    ('hour_dimup', DimOneUp()),
])

# Address goes through AddressCleanser instead of a plain column extractor.
address_process = Pipeline([
    ('address_cleanser', AddressCleanser()),
    ('address_union', FeatureUnion([
        ('address_counts', Pipeline([
            ('address_length', CatVariableCounts()),
            ('address_dimup', DimOneUp()),
        ])),
        ('address_indicator', CatVariableIndicator()),
    ])),
])


In [9]:
# Read the training and test sets.
# Paths use forward slashes: they resolve on Windows as well as on POSIX systems,
# whereas backslash-separated paths only work on Windows.
train_set = pd.read_json('../data/train.json')
test_set = pd.read_json('../data/test.json')
print("Read in training data set with size of {} * {}".format(train_set.shape[0], train_set.shape[1]))
print("Read in testing data set with size of {} * {}".format(test_set.shape[0], test_set.shape[1]))

Read in training data set with size of 49352 * 15
Read in testing data set with size of 74659 * 14


In [41]:
# Trial subset: only the first N training rows are used (presumably to keep the
# exploratory grid search quick). N drives both slices so features and labels
# cannot drift apart when the size is changed.
N = 3000
train_X = train_set.iloc[:N].drop('interest_level', axis=1)
train_y = train_set.iloc[:N]['interest_level']

In [42]:
# Trial run: grid-search the gradient-boosting model on top of the
# `variable_unchanged` pipeline only (bedrooms, bathrooms, price columns).
rp_try = RentalPredict(variable_unchanged, predictor_gb, gb_params, cv, scorer_log_loss)

In [43]:
# Runs the full grid search on the training subset; returns the wrapper itself.
rp_try.fit(train_X, train_y)

<__main__.RentalPredict at 0x1a958f09f60>

In [44]:
# Pull out the fitted GridSearchCV object to inspect results and make predictions.
model_try = rp_try.get_gridmodel()

In [45]:
# Best cross-validated score of the search. The search is scored by negated
# log loss, so values are negative and closer to 0 is better.
model_try.best_score_

-0.72633535736205157

In [39]:
# Hyper-parameter combination that achieved the best cross-validated score.
model_try.best_params_

{'Predictor__learning_rate': 0.01,
 'Predictor__max_depth': 3,
 'Predictor__n_estimators': 500}

In [47]:
# Class probabilities for every test listing, as a plain array. Columns follow
# the fitted classifier's `classes_` order, which need not be high/medium/low.
pred_try = model_try.predict_proba(test_set)

In [50]:
# Peek at the first ten rows of predicted probabilities.
pred_try[:10]

array([[ 0.04878566,  0.74566373,  0.20555062],
       [ 0.09721395,  0.5741655 ,  0.32862055],
       [ 0.03090766,  0.86356531,  0.10552703],
       [ 0.0651726 ,  0.63616919,  0.29865821],
       [ 0.04143167,  0.82109574,  0.13747259],
       [ 0.01154865,  0.95722645,  0.0312249 ],
       [ 0.09799949,  0.56887869,  0.33312181],
       [ 0.10022034,  0.57378729,  0.32599237],
       [ 0.08216564,  0.56358297,  0.35425139],
       [ 0.07926138,  0.61857601,  0.30216261]])

In [79]:
# Build the labelled probability table for the test set and export it as a CSV.
# The output path uses forward slashes so it is valid on Windows and POSIX alike.
pred_try = create_predprob(model_try, test_set, tocsv=True, filename='../data/predictions/gb_try_3var.csv')

In [78]:
# Preview the labelled probability table. This reads `pred_try` because no
# DataFrame named `a` exists at this point when the notebook runs top to bottom.
pred_try.head()

Unnamed: 0,listing_id,high,medium,low
0,7142618,0.048786,0.205551,0.745664
1,7210040,0.097214,0.328621,0.574165
100,7103890,0.030908,0.105527,0.863565
1000,7143442,0.065173,0.298658,0.636169
100000,6860601,0.041432,0.137473,0.821096


In [69]:
# Scratch classifier with library-default settings; the following cells fit it
# and read its `classes_`.
a = GradientBoostingClassifier()

In [70]:
# Fit on two numeric columns only -- presumably just enough to populate `classes_`.
a.fit(train_X[['bedrooms', 'price']], train_y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [71]:
# Label order learned by the classifier; predict_proba columns follow this order
# (high, low, medium here), not high/medium/low.
a.classes_

array(['high', 'low', 'medium'], dtype=object)