In [1]:
# Use price, bedrooms, and bathrooms to build a benchmark model.
# Try the sklearn Pipeline API.

In [37]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import MiniBatchKMeans
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import TransformerMixin


In [7]:
# Read in the training and testing data sets.
# Forward slashes keep these relative paths valid on POSIX systems as well as
# on Windows; a backslash is an ordinary filename character outside Windows,
# so the backslash-separated form is not found there.
train_set = pd.read_json('../data/train.json')
test_set = pd.read_json('../data/test.json')
print("Read in training data set with size of {} * {}".format(train_set.shape[0], train_set.shape[1]))
print("Read in testing data set with size of {} * {}".format(test_set.shape[0], test_set.shape[1]))


Read in training data set with size of 49352 * 15
Read in testing data set with size of 74659 * 14


In [18]:
# Separate the target column from the predictors, then preview the labels.
train_y = train_set['interest_level']
train_X = train_set.drop(['interest_level'], axis=1)
train_y.head()

10        medium
10000        low
100004      high
100007       low
100013       low
Name: interest_level, dtype: object

In [8]:
# Show the column labels of the training frame.
train_set.columns

Index(['bathrooms', 'bedrooms', 'building_id', 'created', 'description',
       'display_address', 'features', 'interest_level', 'latitude',
       'listing_id', 'longitude', 'manager_id', 'photos', 'price',
       'street_address'],
      dtype='object')

In [59]:
# Benchmark: a random forest on bedrooms, bathrooms and price.
# Prediction and scoring use the same rows the model was fitted on, so the
# reported score is a training accuracy.
benchmark_X = train_X[['bedrooms', 'bathrooms', 'price']]

predictor = RandomForestClassifier(random_state=123)
predictor.fit(benchmark_X, train_y)
pred = predictor.predict(benchmark_X)
score = predictor.score(benchmark_X, train_y)

# Count how many rows were assigned to each predicted class.
unique, counts = np.unique(pred, return_counts=True)
for u, c in zip(unique, counts):
    print(u, c)
print('Score: {}'.format(score))


high 1259
low 44785
medium 3308
Score: 0.7286026908737234


In [63]:
class FeatureExtractor(TransformerMixin):
    '''Stateless transformer that selects feature(s) from the data by label.

    `feature` is passed unchanged to the input's ``[]`` operator in
    `transform`, so it may be anything that operator accepts (this notebook
    uses both a single column name and a list of column names).
    '''
    def __init__(self, feature):
        self.feature = feature

    def fit(self, *_ignored):
        # Nothing is learned; any positional arguments are accepted and discarded.
        return self

    def transform(self, data):
        selected = data[self.feature]
        return selected


In [64]:
class ArrayUpDimension(TransformerMixin):
    '''Turn a 1-D column into a 2-D array with a single column.

    `transform` returns an array of shape ``(len(data), 1)``, which is the
    layout needed to stack several single-column outputs side by side.
    '''
    def transform(self, data):
        # Convert to an ndarray before reshaping: pandas Series no longer
        # provides .reshape, so calling it on the Series directly raises
        # AttributeError on current pandas. For ndarray input np.asarray
        # returns the same array, so existing behavior is unchanged.
        values = np.asarray(data)
        return values.reshape((values.shape[0], 1))

    def fit(self, *_):
        # Stateless: nothing to learn, accept and ignore any arguments.
        return self

In [48]:
# Sanity check: extract a single column and inspect the shape of the result.
a = FeatureExtractor('bedrooms')
b = a.transform(train_X)
b.shape

(49352,)

In [52]:
def _single_column_featurizer(column, step_prefix):
    '''Build a pipeline that selects `column` and reshapes it to (n_rows, 1).

    The two steps are named '<step_prefix>_extractor' and '<step_prefix>_dim'.
    '''
    return Pipeline([
        (step_prefix + '_extractor', FeatureExtractor(column)),
        (step_prefix + '_dim', ArrayUpDimension()),
    ])


# One single-column featurizer per benchmark feature.
bedrooms_featurizer = _single_column_featurizer('bedrooms', 'bedroom')
bathrooms_featurizer = _single_column_featurizer('bathrooms', 'bathroom')
price_featurizer = _single_column_featurizer('price', 'price')

In [53]:
# Combine the three single-column featurizers into one feature matrix.
# NOTE(review): the labels 'source_featurs' and 'location_featurizer' actually
# hold the bathrooms and price featurizers; they are kept unchanged because
# step labels are part of the pipeline's parameter names.
union_parts = [
    ('bedroom_feature', bedrooms_featurizer),
    ('source_featurs', bathrooms_featurizer),
    ('location_featurizer', price_featurizer),
]
features = FeatureUnion(transformer_list=union_parts)

predictor = RandomForestClassifier(random_state=123)

# Feature extraction followed by the classifier, as a single estimator.
pipeline = Pipeline(steps=[
    ('feature_union', features),
    ('predictor', predictor),
])

In [54]:
# Only the last expression of a cell is displayed, so this shows `pipeline`;
# the bare `features` line produces no output of its own.
features
pipeline

Pipeline(steps=[('feature_union', FeatureUnion(n_jobs=1,
       transformer_list=[('bedroom_feature', Pipeline(steps=[('bedroom_extractor', <__main__.FeatureExtractor object at 0x0000028729BF9898>), ('bedroom_dim', <__main__.ArrayUpDimension object at 0x0000028729BF9828>)])), ('source_featurs', Pipeline(step...timators=10, n_jobs=1, oob_score=False, random_state=123,
            verbose=0, warm_start=False))])

In [60]:
# Fit on the full training frame and report accuracy on those same rows.
# Pipeline.fit returns the pipeline itself, so the two calls can be chained.
pipeline.fit(train_X, train_y).score(train_X, train_y)

0.72860269087372342

In [61]:
# Same benchmark, but with one multi-column selector instead of a FeatureUnion.
# The classifier is a fresh instance on purpose: Pipeline does not copy its
# steps, so reusing `predictor` here would make pipeline2.fit() refit the very
# object held by `pipeline`, silently replacing the model inside `pipeline`.
features2 = FeatureExtractor(['bedrooms', 'bathrooms', 'price'])
pipeline2 = Pipeline([
  ('feature_union',  features2),
  ('predictor',      RandomForestClassifier(random_state=123))
])

In [62]:
# Fit the single-selector pipeline and report accuracy on the training rows.
# Pipeline.fit returns the pipeline itself, so the two calls can be chained.
pipeline2.fit(train_X, train_y).score(train_X, train_y)

0.72860269087372342

In [70]:
# Use the fitted pipeline to predict class probabilities for the test data set.
test_pred_prob = pipeline.predict_proba(test_set)

In [76]:
# Class labels exposed by the pipeline; used below as the column names of the
# probability table.
pipeline.classes_

array(['high', 'low', 'medium'], dtype=object)

In [111]:
# Label the probability matrix with the class names, index it like the test
# frame, and attach each row's listing_id by index.
test_pred_prob_set = pd.DataFrame(
    data=test_pred_prob,
    index=test_set.index,
    columns=pipeline.classes_,
).join(test_set['listing_id'])

In [112]:
# Preview the first rows of the probability table.
test_pred_prob_set.head()

Unnamed: 0,high,low,medium,listing_id
0,0.046202,0.756425,0.197373,7142618
1,0.10821,0.509232,0.382558,7210040
100,0.0,1.0,0.0,7103890
1000,0.063062,0.635563,0.301375,7143442
100000,0.0,0.822888,0.177112,6860601


In [121]:
# Put the identifier first, followed by the class probabilities.
col_orders = ['listing_id', 'high', 'medium', 'low']
test_pred_prob_set = test_pred_prob_set.loc[:, col_orders]
test_pred_prob_set.head()


Unnamed: 0,listing_id,high,medium,low
0,7142618,0.046202,0.197373,0.756425
1,7210040,0.10821,0.382558,0.509232
100,7103890,0.0,0.0,1.0
1000,7143442,0.063062,0.301375,0.635563
100000,6860601,0.0,0.177112,0.822888


In [122]:
# Write the benchmark predictions to CSV without the index column.
# The forward-slash path resolves on both Windows and POSIX systems; the
# backslash-separated form only works on Windows.
test_pred_prob_set.to_csv('../data/benchmark.csv', index=False)
