In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

### Load training data

In [2]:
train = pd.read_json('./data/train.json')

In [3]:
print train.shape

(49352, 15)


In [4]:
train.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street


In [5]:
train.columns

Index([u'bathrooms', u'bedrooms', u'building_id', u'created', u'description',
       u'display_address', u'features', u'interest_level', u'latitude',
       u'listing_id', u'longitude', u'manager_id', u'photos', u'price',
       u'street_address'],
      dtype='object')

In [6]:
train.bathrooms.describe()

count    49352.00000
mean         1.21218
std          0.50142
min          0.00000
25%          1.00000
50%          1.00000
75%          1.00000
max         10.00000
Name: bathrooms, dtype: float64

In [7]:
train.bedrooms.value_counts()

1    15752
2    14623
0     9475
3     7276
4     1929
5      247
6       46
8        2
7        2
Name: bedrooms, dtype: int64

In [8]:
train.created.describe()

count                   49352
unique                  48675
top       2016-04-15 02:24:25
freq                        3
Name: created, dtype: object

In [9]:
train.created.max(), train.created.min()

(u'2016-06-29 21:41:47', u'2016-04-01 22:12:41')

In [10]:
train.created.dtype

dtype('O')

### Date/time feature

In [11]:
# Change Dates columns to Python Date format
train['created'] = pd.to_datetime(train['created'], format='%Y-%m-%d %H:%M:%S')
# Extract Date Features
import datetime as dt
train['day_created'] = train['created'].dt.weekday
train['month_created'] = train['created'].dt.month
train['hour_created'] = train['created'].dt.hour

### Features (of the listing) feature

In [12]:
train['feature_list'] = train['features'].map(lambda x: ','.join(x)).str.lower()

In [13]:
train['number_of_features'] = train['features'].map(len)

In [18]:
train.head(10)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address,day_created,month_created,hour_created,feature_list,number_of_features
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue,4,6,7,,0
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue,6,6,12,"doorman,elevator,fitness center,cats allowed,d...",5
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street,6,4,3,"laundry in building,dishwasher,hardwood floors...",4
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street,0,4,2,"hardwood floors,no fee",2
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street,3,4,1,pre-war,1
100014,2.0,4,38a913e46c94a7f46ddf19b756a9640c,2016-04-19 04:24:47,,West 18th Street,[],medium,40.7429,6894514,-74.0028,b209e2c4384a64cc307c26759ee0c651,[https://photos.renthop.com/2/6894514_9abb8592...,7995,350 West 18th Street,1,4,4,,0
100016,1.0,2,3ba49a93260ca5df92fde024cb4ca61f,2016-04-27 03:19:56,Stunning unit with a great location and lots o...,West 107th Street,"[prewar, elevator, Dogs Allowed, Cats Allowed,...",low,40.8012,6930771,-73.966,01287194f20de51872e81f660def4784,[https://photos.renthop.com/2/6930771_7e3622b6...,3600,210 West 107th Street,2,4,3,"prewar,elevator,dogs allowed,cats allowed,lowr...",8
100020,2.0,1,0372927bcb6a0949613ef5bf893bbac7,2016-04-13 06:01:42,"This huge sunny ,plenty of lights 1 bed/2 bath...",West 21st Street,"[Doorman, Elevator, Pre-War, Terrace, Laundry ...",low,40.7427,6867392,-73.9957,e6472c7237327dd3903b3d6f6a94515a,[https://photos.renthop.com/2/6867392_b18283f6...,5645,155 West 21st Street,2,4,6,"doorman,elevator,pre-war,terrace,laundry in un...",8
100026,1.0,1,a7efbeb58190aa267b4a9121cd0c88c0,2016-04-20 02:36:35,<p><a website_redacted,Hamilton Terrace,"[Cats Allowed, Dogs Allowed, Elevator, Laundry...",medium,40.8234,6898799,-73.9457,c1a6598437b7db560cde66e5a297a53f,[https://photos.renthop.com/2/6898799_3759be4c...,1725,63 Hamilton Terrace,2,4,2,"cats allowed,dogs allowed,elevator,laundry in ...",4
100027,2.0,4,0,2016-04-02 02:58:15,This is a spacious four bedroom with every bed...,522 E 11th,"[Dishwasher, Hardwood Floors]",low,40.7278,6814332,-73.9808,23a01ea7717b38875f5b070282d1b9d2,[https://photos.renthop.com/2/6814332_e19a8552...,5800,522 E 11th,5,4,2,"dishwasher,hardwood floors",2


In [19]:
len(train[train['feature_list']==''])

3218

In [20]:
train.loc[10000,'feature_list']

u'doorman,elevator,fitness center,cats allowed,dogs allowed'

In [21]:
all_features = train['feature_list'].str.cat(sep=',')

In [22]:
all_feature_list = all_features.split(',')

In [23]:
all_feature_list[:20]

[u'',
 u'doorman',
 u'elevator',
 u'fitness center',
 u'cats allowed',
 u'dogs allowed',
 u'laundry in building',
 u'dishwasher',
 u'hardwood floors',
 u'pets allowed case by case',
 u'hardwood floors',
 u'no fee',
 u'pre-war',
 u'',
 u'prewar',
 u'elevator',
 u'dogs allowed',
 u'cats allowed',
 u'lowrise',
 u'simplex']

In [24]:
len(all_feature_list)

271124

In [25]:
unique_feature_list = list(set(all_feature_list))

In [26]:
len(unique_feature_list)

1295

In [27]:
unique_feature_list[:10]

[u'',
 u'photos of actual apartment',
 u'** extravagant east village! * massive 4br mansion * 2 full baths * gourmet kitchen * roomy closets **',
 u'complimentary sunday brunch',
 u'sundeck with bbq grills',
 u'party room',
 u'private shuttle',
 u'spacious layout',
 u'cable ready',
 u'private garden']

In [28]:
df_unique = pd.DataFrame(unique_feature_list)

In [29]:
df_unique.to_csv('./data/unique_feature_list.csv', encoding='utf-8')

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

In [31]:
# vocabulary list from the unique feature list
# custom tokenizer to split by comma
count_vect = CountVectorizer(vocabulary=unique_feature_list, tokenizer=lambda x: x.split(','))

In [32]:
X_train_feature_counts = count_vect.fit_transform(train['feature_list'])

In [33]:
X_train_feature_counts

<49352x1295 sparse matrix of type '<type 'numpy.int64'>'
	with 270680 stored elements in Compressed Sparse Row format>

In [34]:
freqs = [(word, X_train_feature_counts.getcol(idx).sum()) for word, idx in count_vect.vocabulary_.items()]
#sort from largest to smallest
print sorted(freqs, key = lambda x: -x[1])[:20]

[(u'elevator', 26273), (u'hardwood floors', 23558), (u'cats allowed', 23540), (u'dogs allowed', 22035), (u'doorman', 20967), (u'dishwasher', 20806), (u'laundry in building', 18944), (u'no fee', 18079), (u'fitness center', 13257), (u'laundry in unit', 9435), (u'pre-war', 9149), (u'roof deck', 6555), (u'outdoor space', 5270), (u'dining room', 5150), (u'high speed internet', 4299), (u'', 3218), (u'balcony', 3058), (u'swimming pool', 2730), (u'new construction', 2608), (u'terrace', 2313)]


In [35]:
feature_freq = pd.DataFrame(sorted(freqs, key = lambda x: -x[1]))

In [36]:
feature_freq.to_csv('./data/feature_freq.csv', encoding='utf-8')

In [37]:
# TF-IDF
#from sklearn.feature_extraction.text import TfidfTransformer

### Description and photo features

In [38]:
train['desc_length'] = train['description'].map(len)
train['num_photo'] = train['photos'].map(len)

In [39]:
train.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,...,photos,price,street_address,day_created,month_created,hour_created,feature_list,number_of_features,desc_length,num_photo
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,...,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue,4,6,7,,0,588,5
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,...,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue,6,6,12,"doorman,elevator,fitness center,cats allowed,d...",5,8,11
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,...,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street,6,4,3,"laundry in building,dishwasher,hardwood floors...",4,691,8
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,...,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street,0,4,2,"hardwood floors,no fee",2,492,3
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,...,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street,3,4,1,pre-war,1,479,3


### Spatial features (Location, Address, Latitude/Longitude)

In [None]:
train.columns

In [None]:
out = train[['bathrooms','bedrooms','created','display_address','interest_level','latitude','longitude',
             'listing_id','building_id','manager_id','price','street_address']]

In [None]:
out['display_address'] = out['display_address'].map(lambda x: x.replace('\r',''))
out['street_address'] = out['street_address'].map(lambda x: x.replace('\r',''))

In [None]:
out[out['listing_id']==6861112]

In [None]:
out[out['listing_id']==7115466]

In [None]:
out.to_csv('./data/train_for_map.csv', encoding='utf-8', sep=',')

#### Processing Geocoding ....

In [41]:
train_geo = pd.read_csv('./data/geo_train.csv')

In [42]:
train_geo.shape

(49352, 19)

In [43]:
train.shape

(49352, 22)

In [44]:
train_geo.head()

Unnamed: 0,row_index,bathrooms,bedrooms,created,display_address,interest_level,latitude,longitude,listing_id,building_id,manager_id,price,street_address,geometry,BoroCode,BoroName,CountyFIPS,NTACode,NTAName
0,10,1.5,3,2016-06-24 07:54:24,Metropolitan Avenue,medium,40.7145,-73.9425,7211212,53a5b119ba8f7b61d4e010512e0dfc85,5ba989232d0489da1b5f2c45f6688adc,3000,792 Metropolitan Avenue,POINT (1000190.17876591 199593.3567914022),3.0,Brooklyn,47.0,BK90,East Williamsburg
1,10000,1.0,2,2016-06-12 12:19:27,Columbus Avenue,low,40.7947,-73.9667,7150865,c5c8a357cba207596b04d1afd1e4f130,7533621a882f71e25173b27e3139d83d,5465,808 Columbus Avenue,POINT (993470.335349353 228809.3430366625),1.0,Manhattan,61.0,MN12,Upper West Side
2,100004,1.0,1,2016-04-17 03:26:41,W 13 Street,high,40.7388,-74.0018,6887163,c3ba40552e2120b0acfc3cb5730bb2aa,d9039c43983f6e564b1482b273bd7b01,2850,241 W 13 Street,POINT (983751.1849854254 208441.3741951153),1.0,Manhattan,61.0,MN23,West Village
3,100007,1.0,1,2016-04-18 02:22:02,East 49th Street,low,40.7539,-73.9677,6888711,28d9ad350afeaab8027513a3e52ac8d5,1067e078446a7897d2da493d2f741316,3275,333 East 49th Street,POINT (993198.9295615184 213944.4325395406),1.0,Manhattan,61.0,MN19,Turtle Bay-East Midtown
4,100013,1.0,4,2016-04-28 01:32:41,West 143rd Street,low,40.8241,-73.9493,6934781,0,98e13ad4b495b9613cef886d79a6291f,3350,500 West 143rd Street,POINT (998281.9681499053 239523.1154973624),1.0,Manhattan,61.0,MN04,Hamilton Heights


In [45]:
train.columns

Index([         u'bathrooms',           u'bedrooms',        u'building_id',
                  u'created',        u'description',    u'display_address',
                 u'features',     u'interest_level',           u'latitude',
               u'listing_id',          u'longitude',         u'manager_id',
                   u'photos',              u'price',     u'street_address',
              u'day_created',      u'month_created',       u'hour_created',
             u'feature_list', u'number_of_features',        u'desc_length',
                u'num_photo'],
      dtype='object')

In [46]:
train_geo.columns

Index([u'row_index', u'bathrooms', u'bedrooms', u'created', u'display_address',
       u'interest_level', u'latitude', u'longitude', u'listing_id',
       u'building_id', u'manager_id', u'price', u'street_address', u'geometry',
       u'BoroCode', u'BoroName', u'CountyFIPS', u'NTACode', u'NTAName'],
      dtype='object')

In [47]:
train = pd.merge(train, train_geo[['row_index','NTACode']], how='inner', left_index=True, right_on='row_index')

In [48]:
train.shape

(49352, 24)

In [49]:
train.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,...,street_address,day_created,month_created,hour_created,feature_list,number_of_features,desc_length,num_photo,row_index,NTACode
0,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,...,792 Metropolitan Avenue,4,6,7,,0,588,5,10,BK90
1,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,...,808 Columbus Avenue,6,6,12,"doorman,elevator,fitness center,cats allowed,d...",5,8,11,10000,MN12
2,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,...,241 W 13 Street,6,4,3,"laundry in building,dishwasher,hardwood floors...",4,691,8,100004,MN23
3,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,...,333 East 49th Street,0,4,2,"hardwood floors,no fee",2,492,3,100007,MN19
4,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,...,500 West 143rd Street,3,4,1,pre-war,1,479,3,100013,MN04


In [50]:
train.isnull().sum()

bathrooms               0
bedrooms                0
building_id             0
created                 0
description             0
display_address         0
features                0
interest_level          0
latitude                0
listing_id              0
longitude               0
manager_id              0
photos                  0
price                   0
street_address          0
day_created             0
month_created           0
hour_created            0
feature_list            0
number_of_features      0
desc_length             0
num_photo               0
row_index               0
NTACode               125
dtype: int64

In [51]:
train['NTACode'] = train.NTACode.fillna('Non-NYC')

In [52]:
#Label Encoder Category feature Category features
cat_features = ['NTACode', 'manager_id']
from sklearn import preprocessing
for cat in cat_features:
    cat_le = preprocessing.LabelEncoder()
    train[cat] = cat_le.fit_transform(train[cat])

In [55]:
train.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,...,street_address,day_created,month_created,hour_created,feature_list,number_of_features,desc_length,num_photo,row_index,NTACode
0,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,...,792 Metropolitan Avenue,4,6,7,,0,588,5,10,45
1,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,...,808 Columbus Avenue,6,6,12,"doorman,elevator,fitness center,cats allowed,d...",5,8,11,10000,88
2,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,...,241 W 13 Street,6,4,3,"laundry in building,dishwasher,hardwood floors...",4,691,8,100004,97
3,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,...,333 East 49th Street,0,4,2,"hardwood floors,no fee",2,492,3,100007,93
4,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,...,500 West 143rd Street,3,4,1,pre-war,1,479,3,100013,84


### Combine all training features

In [56]:
simple_features = ['bathrooms','bedrooms','price',
                   'day_created','month_created','hour_created',
                   'number_of_features', 'desc_length','NTACode', 'num_photo', 'manager_id']
target = 'interest_level'

In [57]:
X_train = np.concatenate((train[simple_features].values, X_train_feature_counts.toarray()), axis=1)

In [58]:
X_train.shape

(49352, 1306)

### Prepare feature for test dataset

### Modeling and Cross validation

In [59]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(train[target])

In [60]:
np.bincount(y_train)

array([ 3839, 34284, 11229])

In [61]:
train[target].value_counts()

low       34284
medium    11229
high       3839
Name: interest_level, dtype: int64

In [62]:
# high = 0, low = 1, medium = 2
le.classes_

array([u'high', u'low', u'medium'], dtype=object)

#### XGBoost

In [63]:
# Model 3: xgboost
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
XGB_model = xgb.XGBClassifier(objective='multi:softprob')

In [64]:
XGB_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [65]:
from sklearn.cross_validation import cross_val_score

In [66]:
scores = cross_val_score(XGB_model, X_train, y_train, cv=5, scoring='log_loss')

In [67]:
scores

array([-0.6340476 , -0.62269881, -0.62776284, -0.61974702, -0.62678154])