In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

### Load training data

In [2]:
train = pd.read_json('./data/train.json')

In [3]:
print train.shape

(49352, 15)


In [4]:
train.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street


In [5]:
train.columns

Index([u'bathrooms', u'bedrooms', u'building_id', u'created', u'description',
       u'display_address', u'features', u'interest_level', u'latitude',
       u'listing_id', u'longitude', u'manager_id', u'photos', u'price',
       u'street_address'],
      dtype='object')

In [6]:
train.bathrooms.describe()

count    49352.00000
mean         1.21218
std          0.50142
min          0.00000
25%          1.00000
50%          1.00000
75%          1.00000
max         10.00000
Name: bathrooms, dtype: float64

In [7]:
train.bedrooms.value_counts()

1    15752
2    14623
0     9475
3     7276
4     1929
5      247
6       46
8        2
7        2
Name: bedrooms, dtype: int64

In [8]:
train.created.describe()

count                   49352
unique                  48675
top       2016-04-15 02:24:25
freq                        3
Name: created, dtype: object

In [9]:
train.created.max(), train.created.min()

(u'2016-06-29 21:41:47', u'2016-04-01 22:12:41')

In [10]:
train.created.dtype

dtype('O')

### Date/time feature

In [11]:
# Change Dates columns to Python Date format
train['created'] = pd.to_datetime(train['created'], format='%Y-%m-%d %H:%M:%S')
# Extract Date Features
import datetime as dt
train['day_created'] = train['created'].dt.weekday
train['month_created'] = train['created'].dt.month
train['hour_created'] = train['created'].dt.hour

### Features (of the listing) feature

In [12]:
train['feature_list'] = train['features'].map(lambda x: ','.join(x)).str.lower()

In [13]:
train['number_of_features'] = train['features'].map(len)

In [44]:
train.head(10)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address,day_created,month_created,hour_created,feature_list,number_of_features
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue,4,6,7,,0
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue,6,6,12,"doorman,elevator,fitness center,cats allowed,d...",5
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street,6,4,3,"laundry in building,dishwasher,hardwood floors...",4
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street,0,4,2,"hardwood floors,no fee",2
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street,3,4,1,pre-war,1
100014,2.0,4,38a913e46c94a7f46ddf19b756a9640c,2016-04-19 04:24:47,,West 18th Street,[],medium,40.7429,6894514,-74.0028,b209e2c4384a64cc307c26759ee0c651,[https://photos.renthop.com/2/6894514_9abb8592...,7995,350 West 18th Street,1,4,4,,0
100016,1.0,2,3ba49a93260ca5df92fde024cb4ca61f,2016-04-27 03:19:56,Stunning unit with a great location and lots o...,West 107th Street,"[prewar, elevator, Dogs Allowed, Cats Allowed,...",low,40.8012,6930771,-73.966,01287194f20de51872e81f660def4784,[https://photos.renthop.com/2/6930771_7e3622b6...,3600,210 West 107th Street,2,4,3,"prewar,elevator,dogs allowed,cats allowed,lowr...",8
100020,2.0,1,0372927bcb6a0949613ef5bf893bbac7,2016-04-13 06:01:42,"This huge sunny ,plenty of lights 1 bed/2 bath...",West 21st Street,"[Doorman, Elevator, Pre-War, Terrace, Laundry ...",low,40.7427,6867392,-73.9957,e6472c7237327dd3903b3d6f6a94515a,[https://photos.renthop.com/2/6867392_b18283f6...,5645,155 West 21st Street,2,4,6,"doorman,elevator,pre-war,terrace,laundry in un...",8
100026,1.0,1,a7efbeb58190aa267b4a9121cd0c88c0,2016-04-20 02:36:35,<p><a website_redacted,Hamilton Terrace,"[Cats Allowed, Dogs Allowed, Elevator, Laundry...",medium,40.8234,6898799,-73.9457,c1a6598437b7db560cde66e5a297a53f,[https://photos.renthop.com/2/6898799_3759be4c...,1725,63 Hamilton Terrace,2,4,2,"cats allowed,dogs allowed,elevator,laundry in ...",4
100027,2.0,4,0,2016-04-02 02:58:15,This is a spacious four bedroom with every bed...,522 E 11th,"[Dishwasher, Hardwood Floors]",low,40.7278,6814332,-73.9808,23a01ea7717b38875f5b070282d1b9d2,[https://photos.renthop.com/2/6814332_e19a8552...,5800,522 E 11th,5,4,2,"dishwasher,hardwood floors",2


In [15]:
len(train[train['feature_list']==''])

3218

In [16]:
train.loc[10000,'feature_list']

u'doorman,elevator,fitness center,cats allowed,dogs allowed'

In [21]:
all_features = train['feature_list'].str.cat(sep=',')

In [22]:
all_feature_list = all_features.split(',')

In [39]:
all_feature_list[:20]

[u'',
 u'doorman',
 u'elevator',
 u'fitness center',
 u'cats allowed',
 u'dogs allowed',
 u'laundry in building',
 u'dishwasher',
 u'hardwood floors',
 u'pets allowed case by case',
 u'hardwood floors',
 u'no fee',
 u'pre-war',
 u'',
 u'prewar',
 u'elevator',
 u'dogs allowed',
 u'cats allowed',
 u'lowrise',
 u'simplex']

In [25]:
len(all_feature_list)

271124

In [26]:
unique_feature_list = list(set(all_feature_list))

In [27]:
len(unique_feature_list)

1295

In [28]:
unique_feature_list[:10]

[u'',
 u'photos of actual apartment',
 u'** extravagant east village! * massive 4br mansion * 2 full baths * gourmet kitchen * roomy closets **',
 u'complimentary sunday brunch',
 u'sundeck with bbq grills',
 u'party room',
 u'private shuttle',
 u'spacious layout',
 u'cable ready',
 u'private garden']

In [40]:
df_unique = pd.DataFrame(unique_feature_list)

In [43]:
df_unique.to_csv('./data/unique_feature_list.csv', encoding='utf-8')

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

In [45]:
# vocabulary list from the unique feature list
# custom tokenizer to split by comma
count_vect = CountVectorizer(vocabulary=unique_feature_list, tokenizer=lambda x: x.split(','))

In [46]:
X_train_feature_counts = count_vect.fit_transform(train['feature_list'])

In [47]:
X_train_feature_counts

<49352x1295 sparse matrix of type '<type 'numpy.int64'>'
	with 270680 stored elements in Compressed Sparse Row format>

In [48]:
freqs = [(word, X_train_feature_counts.getcol(idx).sum()) for word, idx in count_vect.vocabulary_.items()]
#sort from largest to smallest
print sorted(freqs, key = lambda x: -x[1])[:20]

[(u'elevator', 26273), (u'hardwood floors', 23558), (u'cats allowed', 23540), (u'dogs allowed', 22035), (u'doorman', 20967), (u'dishwasher', 20806), (u'laundry in building', 18944), (u'no fee', 18079), (u'fitness center', 13257), (u'laundry in unit', 9435), (u'pre-war', 9149), (u'roof deck', 6555), (u'outdoor space', 5270), (u'dining room', 5150), (u'high speed internet', 4299), (u'', 3218), (u'balcony', 3058), (u'swimming pool', 2730), (u'new construction', 2608), (u'terrace', 2313)]


In [49]:
feature_freq = pd.DataFrame(sorted(freqs, key = lambda x: -x[1]))

In [50]:
feature_freq.to_csv('./data/feature_freq.csv', encoding='utf-8')

In [51]:
# TF-IDF
#from sklearn.feature_extraction.text import TfidfTransformer

### Description feature

In [74]:
train['desc_length'] = train['description'].map(len)

In [76]:
train.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,...,manager_id,photos,price,street_address,day_created,month_created,hour_created,feature_list,number_of_features,desc_length
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,...,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue,4,6,7,,0,588
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,...,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue,6,6,12,"doorman,elevator,fitness center,cats allowed,d...",5,8
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,...,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street,6,4,3,"laundry in building,dishwasher,hardwood floors...",4,691
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,...,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street,0,4,2,"hardwood floors,no fee",2,492
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,...,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street,3,4,1,pre-war,1,479


### Spatial features (Location, Address, Latitude/Longitude)

In [91]:
train.columns

Index([         u'bathrooms',           u'bedrooms',        u'building_id',
                  u'created',        u'description',    u'display_address',
                 u'features',     u'interest_level',           u'latitude',
               u'listing_id',          u'longitude',         u'manager_id',
                   u'photos',              u'price',     u'street_address',
              u'day_created',      u'month_created',       u'hour_created',
             u'feature_list', u'number_of_features',        u'desc_length'],
      dtype='object')

In [107]:
out = train[['bathrooms','bedrooms','created','display_address','interest_level','latitude','longitude',
             'listing_id','building_id','manager_id','price','street_address']]

In [108]:
out['display_address'] = out['display_address'].map(lambda x: x.replace('\r',''))
out['street_address'] = out['street_address'].map(lambda x: x.replace('\r',''))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [109]:
out[out['listing_id']==6861112]

Unnamed: 0,bathrooms,bedrooms,created,display_address,interest_level,latitude,longitude,listing_id,building_id,manager_id,price,street_address
83394,1.0,0,2016-04-12 05:41:24,Center Blvd,low,40.7478,-73.9571,6861112,6f42136c0b436713b38b7772afdf23a8,df1c47247fd4fd9d6ab38a83e0f3c164,2354,4540 Center Blvd


In [110]:
out[out['listing_id']==7115466]

Unnamed: 0,bathrooms,bedrooms,created,display_address,interest_level,latitude,longitude,listing_id,building_id,manager_id,price,street_address
610,1.0,2,2016-06-06 02:50:15,Hanover Square,medium,40.7045,-74.0089,7115466,b3b3bb23497fedb4e68dfa182e272bb2,31323cdfe8308b88ec68ea9aad82e6b7,2995,10 Hanover Square


In [111]:
out.to_csv('./data/train_for_map.csv', encoding='utf-8', sep=',')

### Combine all training features

In [77]:
simple_features = ['bathrooms','bedrooms','price',
                   'day_created','month_created','hour_created',
                   'number_of_features', 'desc_length']
target = 'interest_level'

In [78]:
X_train = np.concatenate((train[simple_features].values, X_train_feature_counts.toarray()), axis=1)

In [79]:
X_train.shape

(49352, 1303)

### Prepare feature for test dataset

### Modeling and Cross validation

In [80]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(train[target])

In [81]:
np.bincount(y_train)

array([ 3839, 34284, 11229])

In [82]:
train[target].value_counts()

low       34284
medium    11229
high       3839
Name: interest_level, dtype: int64

In [83]:
# high = 0, low = 1, medium = 2
le.classes_

array([u'high', u'low', u'medium'], dtype=object)

#### XGBoost

In [84]:
# Model 3: xgboost
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
XGB_model = xgb.XGBClassifier(objective='multi:softprob')

In [85]:
XGB_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [86]:
from sklearn.cross_validation import cross_val_score

In [87]:
scores = cross_val_score(XGB_model, X_train, y_train, cv=5, scoring='log_loss')

In [88]:
scores

array([-0.64932748, -0.63404355, -0.63865757, -0.63223544, -0.63889356])