In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
from scipy import sparse

### Load training/test data

In [3]:
train = pd.read_json('./data/train.json')

In [4]:
test = pd.read_json('./data/test.json')

In [5]:
print train.shape, test.shape

(49352, 15) (74659, 14)


In [6]:
train.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street


In [7]:
train.columns

Index([u'bathrooms', u'bedrooms', u'building_id', u'created', u'description',
       u'display_address', u'features', u'interest_level', u'latitude',
       u'listing_id', u'longitude', u'manager_id', u'photos', u'price',
       u'street_address'],
      dtype='object')

In [8]:
train.bathrooms.describe()

count    49352.00000
mean         1.21218
std          0.50142
min          0.00000
25%          1.00000
50%          1.00000
75%          1.00000
max         10.00000
Name: bathrooms, dtype: float64

In [9]:
train.bedrooms.value_counts()

1    15752
2    14623
0     9475
3     7276
4     1929
5      247
6       46
8        2
7        2
Name: bedrooms, dtype: int64

In [10]:
train.created.describe()

count                   49352
unique                  48675
top       2016-04-15 02:24:25
freq                        3
Name: created, dtype: object

In [11]:
train.created.max(), train.created.min()

(u'2016-06-29 21:41:47', u'2016-04-01 22:12:41')

In [12]:
train.created.dtype

dtype('O')

In [13]:
#### Remove outlier
train = train[train.price < 100000]

In [14]:
train.shape

(49345, 15)

### Date/time feature

In [15]:
# Change Dates columns to Python Date format
train['created'] = pd.to_datetime(train['created'], format='%Y-%m-%d %H:%M:%S')
test['created'] = pd.to_datetime(test['created'], format='%Y-%m-%d %H:%M:%S')
# Extract Date Features
import datetime as dt
train['weekday_created'] = train['created'].dt.weekday
train['month_created'] = train['created'].dt.month
train['day_created'] = train['created'].dt.day
train['hour_created'] = train['created'].dt.hour

test['weekday_created'] = test['created'].dt.weekday
test['month_created'] = test['created'].dt.month
test['day_created'] = test['created'].dt.day
test['hour_created'] = test['created'].dt.hour

### Features (of the listing) feature

In [16]:
train['feature_list'] = train['features'].map(lambda x: ','.join(x)).str.lower()
test['feature_list'] = test['features'].map(lambda x: ','.join(x)).str.lower()

In [17]:
train['number_of_features'] = train['features'].map(len)
test['number_of_features'] = test['features'].map(len)

In [18]:
train.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,...,manager_id,photos,price,street_address,weekday_created,month_created,day_created,hour_created,feature_list,number_of_features
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,...,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue,4,6,24,7,,0
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,...,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue,6,6,12,12,"doorman,elevator,fitness center,cats allowed,d...",5
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,...,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street,6,4,17,3,"laundry in building,dishwasher,hardwood floors...",4
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,...,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street,0,4,18,2,"hardwood floors,no fee",2
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,...,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street,3,4,28,1,pre-war,1


In [19]:
len(train[train['feature_list']==''])

3218

In [20]:
train.loc[10000,'feature_list']

u'doorman,elevator,fitness center,cats allowed,dogs allowed'

In [21]:
all_features = train['feature_list'].str.cat(sep=',')

In [22]:
all_feature_list = all_features.split(',')

In [23]:
all_feature_list[:20]

[u'',
 u'doorman',
 u'elevator',
 u'fitness center',
 u'cats allowed',
 u'dogs allowed',
 u'laundry in building',
 u'dishwasher',
 u'hardwood floors',
 u'pets allowed case by case',
 u'hardwood floors',
 u'no fee',
 u'pre-war',
 u'',
 u'prewar',
 u'elevator',
 u'dogs allowed',
 u'cats allowed',
 u'lowrise',
 u'simplex']

In [24]:
len(all_feature_list)

271091

In [25]:
unique_feature_list = list(set(all_feature_list))

In [26]:
len(unique_feature_list)

1295

In [27]:
unique_feature_list[:10]

[u'',
 u'photos of actual apartment',
 u'** extravagant east village! * massive 4br mansion * 2 full baths * gourmet kitchen * roomy closets **',
 u'complimentary sunday brunch',
 u'sundeck with bbq grills',
 u'party room',
 u'private shuttle',
 u'spacious layout',
 u'cable ready',
 u'private garden']

In [28]:
df_unique = pd.DataFrame(unique_feature_list)

In [29]:
df_unique.to_csv('./data/unique_feature_list.csv', encoding='utf-8')

#### Load significant feature list

In [22]:
df_sig_features = pd.read_csv('./data/significant_features.csv')

In [23]:
significant_features = df_sig_features.xfeature.tolist()

In [24]:
len(significant_features)

89

In [25]:
significant_features[:10]

['no fee',
 'laundry in building',
 'furnished',
 'hardwood floors',
 'reduced fee',
 'laundry in unit',
 'doorman',
 'cats allowed',
 'dishwasher',
 'private outdoor space']

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
# vocabulary list from the unique feature list
# custom tokenizer to split by comma

count_vect = CountVectorizer(vocabulary=significant_features, tokenizer=lambda x: x.split(','))
#count_vect = CountVectorizer(vocabulary=unique_feature_list, tokenizer=lambda x: x.split(','))
#count_vect = CountVectorizer(max_features=300, tokenizer=lambda x: x.split(','))

In [28]:
X_train_feature_counts = count_vect.fit_transform(train['feature_list'])

In [29]:
X_train_feature_counts

<49345x89 sparse matrix of type '<type 'numpy.int64'>'
	with 261492 stored elements in Compressed Sparse Row format>

In [30]:
test_count_vect = CountVectorizer(vocabulary=significant_features, tokenizer=lambda x: x.split(','))
#test_count_vect = CountVectorizer(vocabulary=unique_feature_list, tokenizer=lambda x: x.split(','))
#test_count_vect = CountVectorizer(vocabulary=count_vect.vocabulary_, tokenizer=lambda x: x.split(','))
X_test_feature_counts = count_vect.fit_transform(test['feature_list'])
X_test_feature_counts

<74659x89 sparse matrix of type '<type 'numpy.int64'>'
	with 395060 stored elements in Compressed Sparse Row format>

In [31]:
#freqs = [(word, X_train_feature_counts.getcol(idx).sum()) for word, idx in count_vect.vocabulary_.items()]
#sort from largest to smallest
#print sorted(freqs, key = lambda x: -x[1])[:20]

In [32]:
#feature_freq = pd.DataFrame(sorted(freqs, key = lambda x: -x[1]))

In [33]:
#feature_freq.to_csv('./data/feature_freq.csv', encoding='utf-8')

In [34]:
# TF-IDF
#from sklearn.feature_extraction.text import TfidfTransformer

### Description feature

In [35]:
train['desc_length'] = train['description'].map(len)
test['desc_length'] = test['description'].map(len)
train['num_photos'] = train['photos'].map(len)
test['num_photos'] = test['photos'].map(len)

In [36]:
desc_count = CountVectorizer(max_features=200, stop_words='english')

In [37]:
X_train_desc_vect = desc_count.fit_transform(train['description'])
X_test_desc_vect = desc_count.transform(test['description'])

In [38]:
X_train_desc_vect

<49345x200 sparse matrix of type '<type 'numpy.int64'>'
	with 1462860 stored elements in Compressed Sparse Row format>

In [39]:
X_test_desc_vect

<74659x200 sparse matrix of type '<type 'numpy.int64'>'
	with 2206340 stored elements in Compressed Sparse Row format>

### Spatial features (Location, Address, Latitude/Longitude)

In [None]:
train.columns

In [None]:
feature_for_geo = ['bathrooms','bedrooms','created','display_address','latitude','longitude',
             'listing_id','building_id','manager_id','price','street_address']

In [None]:
out_train = train[feature_for_geo]
out_test = test[feature_for_geo]

In [None]:
out_train['display_address'] = out_train['display_address'].map(lambda x: x.replace('\r',''))
out_train['street_address'] = out_train['street_address'].map(lambda x: x.replace('\r',''))

In [None]:
out_test['display_address'] = out_test['display_address'].map(lambda x: x.replace('\r',''))
out_test['street_address'] = out_test['street_address'].map(lambda x: x.replace('\r',''))

In [None]:
out_train.shape, out_test.shape

In [None]:
out = pd.concat((out_train,out_test))

In [None]:
len(out.index.unique()), out.shape

In [None]:
out.to_csv('./data/train_for_map.csv', encoding='utf-8', sep=',')

#### Processing Geocoding ....

In [None]:
train_geo = pd.read_csv('./data/geo_train.csv')

In [None]:
train_geo.shape

In [None]:
train.shape, test.shape

In [None]:
train_geo.columns

In [None]:
train = pd.merge(train, train_geo[['row_index','NTACode']], how='inner', left_index=True, right_on='row_index')
test = pd.merge(test, train_geo[['row_index','NTACode']], how='inner', left_index=True, right_on='row_index')

In [None]:
train.shape, test.shape

In [None]:
train.columns

In [None]:
train.isnull().sum()

In [None]:
test.isnull().sum()

In [None]:
train['NTACode'] = train.NTACode.fillna('Non-NYC')
test['NTACode'] = test.NTACode.fillna('Non-NYC')

### Derived features

In [40]:
train['price_per_bedroom'] = train['price'] / train['bedrooms']
train['price_per_bathroom'] = train['price'] / train['bathrooms']
test['price_per_bedroom'] = test['price'] / test['bedrooms']
test['price_per_bathroom'] = test['price'] / test['bathrooms']

### Category features

In [41]:
#Label Encoder Category feature NTACode
cat_features = ['manager_id','building_id','street_address','display_address']
from sklearn import preprocessing
for cat in cat_features:
    cat_le = preprocessing.LabelEncoder()
    cat_le.fit(list(train[cat].values) + list(test[cat].values))
    train[cat] = cat_le.transform(train[cat])
    test[cat] = cat_le.transform(test[cat])

### Combine all training features

In [42]:
simple_features = ['bathrooms','bedrooms','price',
                   'day_created','month_created','hour_created','weekday_created',
                   'number_of_features', 'desc_length','num_photos','manager_id','building_id',
                   'street_address','display_address','latitude','longitude',
                   'price_per_bedroom','price_per_bathroom']
target = 'interest_level'

#### Transform features into sparse matrix

In [43]:
X_train = sparse.hstack([train[simple_features], X_train_feature_counts, X_train_desc_vect]).tocsr()
X_test = sparse.hstack([test[simple_features], X_test_feature_counts, X_test_desc_vect]).tocsr()

In [44]:
#X_train = np.concatenate((train[simple_features].values, X_train_feature_counts.toarray()), axis=1)

In [45]:
X_train.shape, X_test.shape

((49345, 307), (74659, 307))

### Modeling and Cross validation

In [46]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(train[target])

In [47]:
np.bincount(y_train)

array([ 3838, 34278, 11229])

In [48]:
train[target].value_counts()

low       34278
medium    11229
high       3838
Name: interest_level, dtype: int64

In [49]:
# high = 0, low = 1, medium = 2
le.classes_

array([u'high', u'low', u'medium'], dtype=object)

#### XGBoost

In [50]:
# Model 3: xgboost
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
XGB_model = xgb.XGBClassifier(
                objective='multi:softprob',
                learning_rate =0.1,
                max_depth=6,
                subsample=0.7, 
                colsample_bytree=0.7,
                seed=0)

In [51]:
XGB_model.fit(X_train, y_train, eval_metric='mlogloss')

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.7)

In [52]:
from sklearn.cross_validation import cross_val_score

In [53]:
scores = cross_val_score(XGB_model, X_train, y_train, cv=5, scoring='log_loss')

In [54]:
scores

array([-0.57517832, -0.56128948, -0.56459166, -0.56488113, -0.57257408])

#### Prediction

In [55]:
y_id = test['listing_id'].astype('O')

In [56]:
y_test_log_pred = XGB_model.predict_proba(X_test)

In [57]:
sub = pd.DataFrame(np.column_stack((y_id, y_test_log_pred)), columns=['listing_id'] + le.classes_.tolist())

In [58]:
sub.to_csv('./submission/submission_2017-03-21_r4.csv', index=False)

#### Feature importances

In [None]:
feat_imp = pd.Series(XGB_model.booster().get_fscore()).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')

In [None]:
train[simple_features].columns[2], train[simple_features].columns[14], train[simple_features].columns[10]

In [None]:
len(simple_features)

In [None]:
train[simple_features].head(1)

In [None]:
for v in X_train.toarray()[0][:20]:
    print v

In [None]:
d = len(simple_features)

In [None]:
print unique_feature_list[1048-d]
print unique_feature_list[99-d]
print unique_feature_list[343-d]
print unique_feature_list[138-d]
print unique_feature_list[168-d]
print unique_feature_list[850-d]
#print unique_feature_list[1248-d]
#print unique_feature_list[521-d]
print unique_feature_list[919-d]
print unique_feature_list[1310-d]
print unique_feature_list[232-d]
print unique_feature_list[665-d]
print unique_feature_list[338-d]
print unique_feature_list[1038-d]
#print unique_feature_list[354-d]

In [None]:
print [(i,v) for i, v in enumerate(simple_features)]

In [None]:
len(feat_imp), X_train.shape

In [None]:
feat_imp[:66]

In [None]:
feat_imp_index = [f[1:] for f in feat_imp.index]

In [None]:
significant_features = [unique_feature_list[int(x)-len(simple_features)] for x in feat_imp_index if int(x) > 15]

In [None]:
len(significant_features)

In [None]:
significant_features.remove('')

In [None]:
len(significant_features)

In [None]:
significant_features

In [None]:
df_significant_feature = pd.DataFrame(significant_features, index=range(len(significant_features)), columns=['xfeature'])

In [None]:
df_significant_feature.to_csv('./data/significant_features.csv', index=False)