In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
from scipy import sparse

### Load training/test data

In [3]:
train = pd.read_json('./data/train.json')

In [4]:
test = pd.read_json('./data/test.json')

In [5]:
print train.shape, test.shape

(49352, 15) (74659, 14)


In [6]:
train.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street


In [7]:
train.columns

Index([u'bathrooms', u'bedrooms', u'building_id', u'created', u'description',
       u'display_address', u'features', u'interest_level', u'latitude',
       u'listing_id', u'longitude', u'manager_id', u'photos', u'price',
       u'street_address'],
      dtype='object')

In [8]:
train.bathrooms.describe()

count    49352.00000
mean         1.21218
std          0.50142
min          0.00000
25%          1.00000
50%          1.00000
75%          1.00000
max         10.00000
Name: bathrooms, dtype: float64

In [9]:
train.bedrooms.value_counts()

1    15752
2    14623
0     9475
3     7276
4     1929
5      247
6       46
8        2
7        2
Name: bedrooms, dtype: int64

In [10]:
train.created.describe()

count                   49352
unique                  48675
top       2016-04-15 02:24:25
freq                        3
Name: created, dtype: object

In [11]:
train.created.max(), train.created.min()

(u'2016-06-29 21:41:47', u'2016-04-01 22:12:41')

In [12]:
train.created.dtype

dtype('O')

In [13]:
# Remove price outliers.
# Take an explicit copy: later cells add columns to `train`, and assigning
# into a boolean-mask slice triggers pandas' SettingWithCopyWarning.
train = train[train.price < 100000].copy()

In [14]:
train.shape

(49345, 15)

### Date/time feature

In [15]:
# Parse the `created` strings into timestamps, then derive calendar features.
# The same steps are applied to both frames so train and test stay in sync.
import datetime as dt  # kept from the original cell; `.dt` below is the pandas accessor
for frame in (train, test):
    frame['created'] = pd.to_datetime(frame['created'], format='%Y-%m-%d %H:%M:%S')
    created = frame['created'].dt
    frame['weekday_created'] = created.weekday
    frame['month_created'] = created.month
    frame['day_created'] = created.day
    frame['hour_created'] = created.hour

### Listing amenities (the `features` column)

In [16]:
train['feature_list'] = train['features'].map(lambda x: ','.join(x)).str.lower()
test['feature_list'] = test['features'].map(lambda x: ','.join(x)).str.lower()

In [17]:
train['number_of_features'] = train['features'].map(len)
test['number_of_features'] = test['features'].map(len)

In [18]:
train.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,...,manager_id,photos,price,street_address,weekday_created,month_created,day_created,hour_created,feature_list,number_of_features
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,...,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue,4,6,24,7,,0
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,...,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue,6,6,12,12,"doorman,elevator,fitness center,cats allowed,d...",5
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,...,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street,6,4,17,3,"laundry in building,dishwasher,hardwood floors...",4
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,...,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street,0,4,18,2,"hardwood floors,no fee",2
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,...,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street,3,4,28,1,pre-war,1


In [19]:
len(train[train['feature_list']==''])

3218

In [20]:
train.loc[10000,'feature_list']

u'doorman,elevator,fitness center,cats allowed,dogs allowed'

In [None]:
all_features = train['feature_list'].str.cat(sep=',')

In [None]:
all_feature_list = all_features.split(',')

In [None]:
all_feature_list[:20]

In [None]:
len(all_feature_list)

In [None]:
unique_feature_list = list(set(all_feature_list))

In [None]:
len(unique_feature_list)

In [None]:
unique_feature_list[:10]

In [None]:
df_unique = pd.DataFrame(unique_feature_list)

In [None]:
df_unique.to_csv('./data/unique_feature_list.csv', encoding='utf-8')

#### Load significant feature list

In [21]:
df_sig_features = pd.read_csv('./data/significant_features.csv')

In [22]:
significant_features = df_sig_features.xfeature.tolist()

In [23]:
len(significant_features)

89

In [24]:
significant_features[:10]

['no fee',
 'laundry in building',
 'furnished',
 'hardwood floors',
 'reduced fee',
 'laundry in unit',
 'doorman',
 'cats allowed',
 'dishwasher',
 'private outdoor space']

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
# Bag-of-features vectorizer over the comma-joined `feature_list` strings.
# The vocabulary is fixed to `significant_features` (loaded from
# ./data/significant_features.csv above), so the output has one column per
# entry of that list; tokens outside the vocabulary are ignored.
# A custom tokenizer splits on commas so multi-word features stay one token.

count_vect = CountVectorizer(vocabulary=significant_features, tokenizer=lambda x: x.split(','))
# Alternative vocabularies tried earlier (full unique list / top-300 by frequency):
#count_vect = CountVectorizer(vocabulary=unique_feature_list, tokenizer=lambda x: x.split(','))
#count_vect = CountVectorizer(max_features=300, tokenizer=lambda x: x.split(','))

In [27]:
X_train_feature_counts = count_vect.fit_transform(train['feature_list'])

In [28]:
X_train_feature_counts

<49345x89 sparse matrix of type '<type 'numpy.int64'>'
	with 261492 stored elements in Compressed Sparse Row format>

In [29]:
# Encode the test listings with the SAME vectorizer used for train, and only
# `transform` them: the test set must never be used to (re)fit a transformer.
# The original cell built a separate `test_count_vect` but never used it and
# called `count_vect.fit_transform` on the test data instead.
X_test_feature_counts = count_vect.transform(test['feature_list'])
X_test_feature_counts

<74659x89 sparse matrix of type '<type 'numpy.int64'>'
	with 395060 stored elements in Compressed Sparse Row format>

In [30]:
#freqs = [(word, X_train_feature_counts.getcol(idx).sum()) for word, idx in count_vect.vocabulary_.items()]
#sort from largest to smallest
#print sorted(freqs, key = lambda x: -x[1])[:20]

In [31]:
#feature_freq = pd.DataFrame(sorted(freqs, key = lambda x: -x[1]))

In [32]:
#feature_freq.to_csv('./data/feature_freq.csv', encoding='utf-8')

In [33]:
# TF-IDF
#from sklearn.feature_extraction.text import TfidfTransformer

### Description feature

In [34]:
# Number of words in the free-text description.
# `str.split()` with no argument splits on any run of whitespace, so an empty
# description counts as 0 words (`"".split(" ")` would give 1) and repeated
# spaces or newlines do not inflate the count.
train["num_description_words"] = train["description"].apply(lambda x: len(x.split()))
test["num_description_words"] = test["description"].apply(lambda x: len(x.split()))

# Number of photos attached to the listing.
train['num_photos'] = train['photos'].map(len)
test['num_photos'] = test['photos'].map(len)

In [None]:
desc_count = CountVectorizer(max_features=200, stop_words='english')

In [None]:
X_train_desc_vect = desc_count.fit_transform(train['description'])
X_test_desc_vect = desc_count.transform(test['description'])

In [None]:
X_train_desc_vect

In [None]:
X_test_desc_vect

### Spatial features (Location, Address, Latitude/Longitude)

In [None]:
train.columns

In [None]:
feature_for_geo = ['bathrooms','bedrooms','created','display_address','latitude','longitude',
             'listing_id','building_id','manager_id','price','street_address']

In [None]:
# Column subsets exported for the external geocoding step.
# Copy them: the next cells overwrite the address columns, and writing into a
# slice of `train`/`test` raises SettingWithCopyWarning.
out_train = train[feature_for_geo].copy()
out_test = test[feature_for_geo].copy()

In [None]:
out_train['display_address'] = out_train['display_address'].map(lambda x: x.replace('\r',''))
out_train['street_address'] = out_train['street_address'].map(lambda x: x.replace('\r',''))

In [None]:
out_test['display_address'] = out_test['display_address'].map(lambda x: x.replace('\r',''))
out_test['street_address'] = out_test['street_address'].map(lambda x: x.replace('\r',''))

In [None]:
out_train.shape, out_test.shape

In [None]:
out = pd.concat((out_train,out_test))

In [None]:
len(out.index.unique()), out.shape

In [None]:
out.to_csv('./data/train_for_map.csv', encoding='utf-8', sep=',')

#### Processing Geocoding ....

In [35]:
train_geo = pd.read_csv('./data/geo_train.csv')

In [36]:
train_geo.shape

(124004, 18)

In [37]:
train.shape, test.shape

((49345, 23), (74659, 22))

In [38]:
train_geo.columns

Index([u'row_index', u'bathrooms', u'bedrooms', u'created', u'display_address',
       u'latitude', u'longitude', u'listing_id', u'building_id', u'manager_id',
       u'price', u'street_address', u'geometry', u'BoroCode', u'BoroName',
       u'CountyFIPS', u'NTACode', u'NTAName'],
      dtype='object')

In [39]:
# Attach the geocoded neighbourhood code (NTACode) to each listing.
# The join key is the frame's own index on the left and the `row_index` column
# of the geocoding output on the right; `how='inner'` keeps only listings that
# have a matching row in `train_geo`. Both `row_index` and `NTACode` are added
# as columns.
# NOTE(review): X_train_feature_counts / X_test_feature_counts were built
# before this merge, so the later hstack relies on the merge keeping every row
# in its original order — the shape check in the next cell covers the row
# count only; confirm the ordering.
train = pd.merge(train, train_geo[['row_index','NTACode']], how='inner', left_index=True, right_on='row_index')
test = pd.merge(test, train_geo[['row_index','NTACode']], how='inner', left_index=True, right_on='row_index')

In [40]:
train.shape, test.shape

((49345, 25), (74659, 24))

In [41]:
train.columns

Index([            u'bathrooms',              u'bedrooms',
                 u'building_id',               u'created',
                 u'description',       u'display_address',
                    u'features',        u'interest_level',
                    u'latitude',            u'listing_id',
                   u'longitude',            u'manager_id',
                      u'photos',                 u'price',
              u'street_address',       u'weekday_created',
               u'month_created',           u'day_created',
                u'hour_created',          u'feature_list',
          u'number_of_features', u'num_description_words',
                  u'num_photos',             u'row_index',
                     u'NTACode'],
      dtype='object')

In [42]:
train.isnull().sum()

bathrooms                  0
bedrooms                   0
building_id                0
created                    0
description                0
display_address            0
features                   0
interest_level             0
latitude                   0
listing_id                 0
longitude                  0
manager_id                 0
photos                     0
price                      0
street_address             0
weekday_created            0
month_created              0
day_created                0
hour_created               0
feature_list               0
number_of_features         0
num_description_words      0
num_photos                 0
row_index                  0
NTACode                  125
dtype: int64

In [43]:
test.isnull().sum()

bathrooms                  0
bedrooms                   0
building_id                0
created                    0
description                0
display_address            0
features                   0
latitude                   0
listing_id                 0
longitude                  0
manager_id                 0
photos                     0
price                      0
street_address             0
weekday_created            0
month_created              0
day_created                0
hour_created               0
feature_list               0
number_of_features         0
num_description_words      0
num_photos                 0
row_index                  0
NTACode                  162
dtype: int64

In [44]:
train['NTACode'] = train.NTACode.fillna('Non-NYC')
test['NTACode'] = test.NTACode.fillna('Non-NYC')

### Derived features

In [45]:
# Price normalised by room counts, plus the total room count.
# Studios have 0 bedrooms and some listings report 0 bathrooms; dividing by
# those would produce +inf. Zero denominators are replaced with NaN so the
# ratio is "missing" for such listings instead of infinite.
for frame in (train, test):
    frame['price_per_bedroom'] = frame['price'] / frame['bedrooms'].replace(0, np.nan)
    frame['price_per_bathroom'] = frame['price'] / frame['bathrooms'].replace(0, np.nan)
    frame['room_sum'] = frame['bedrooms'] + frame['bathrooms']

### Manager feature

In [46]:
import random

# Shuffled row positions used to cut `train` into 5 folds for the out-of-fold
# manager encoding in the next cell. The shuffle is seeded so the folds, and
# therefore the encoded features, are identical on every run.
random.seed(0)
index=list(range(train.shape[0]))
random.shuffle(index)

# Per-row shares of low / medium / high interest for the row's manager.
# They stay NaN for rows whose manager has no listings outside the row's fold.
a=[np.nan]*len(train)
b=[np.nan]*len(train)
c=[np.nan]*len(train)

In [47]:
# Out-of-fold target encoding of manager_id.
# For each of the 5 folds, count the interest levels of every manager using
# only rows OUTSIDE the fold, then write the resulting low/medium/high shares
# onto the rows INSIDE the fold. A row never sees its own label.
#
# Columns are read once as arrays: calling `train.iloc[j]` for every row
# builds a full Series each time and dominates the runtime.
manager_ids = train['manager_id'].values
interest_levels = train['interest_level'].values
level_slot = {'low': 0, 'medium': 1, 'high': 2}
n_rows = train.shape[0]
n_folds = 5

for fold in range(n_folds):
    held_out = index[(fold * n_rows) // n_folds:((fold + 1) * n_rows) // n_folds]
    held_out_set = set(held_out)

    # manager_id -> [n_low, n_medium, n_high] over the rows outside the fold.
    manager_level = {}
    for j in index:
        if j in held_out_set:
            continue
        slot = level_slot.get(interest_levels[j])
        if slot is not None:
            manager_level.setdefault(manager_ids[j], [0, 0, 0])[slot] += 1

    for j in held_out:
        counts = manager_level.get(manager_ids[j])
        if counts is None:
            # Manager has no listings outside this fold: leave the NaN defaults.
            continue
        total = float(sum(counts))
        a[j] = counts[0] / total
        b[j] = counts[1] / total
        c[j] = counts[2] / total


train['manager_level_low'] = a
train['manager_level_medium'] = b
train['manager_level_high'] = c

In [None]:
# Manager encoding for the test set: interest-level shares computed from ALL
# training rows of the same manager. Managers that never appear in train (or
# that have no counted listings) get NaN.
level_slot = {'low': 0, 'medium': 1, 'high': 2}

# manager_id -> [n_low, n_medium, n_high] over the whole training set.
manager_level = {}
for manager, level in zip(train['manager_id'].values, train['interest_level'].values):
    counts = manager_level.setdefault(manager, [0, 0, 0])
    slot = level_slot.get(level)
    if slot is not None:
        counts[slot] += 1

a=[]
b=[]
c=[]
for manager in test['manager_id'].values:
    # Direct dict lookup; `in d.keys()` scans a list on Python 2.
    counts = manager_level.get(manager)
    total = float(sum(counts)) if counts is not None else 0.0
    if total == 0:
        a.append(np.nan)
        b.append(np.nan)
        c.append(np.nan)
    else:
        a.append(counts[0] / total)
        b.append(counts[1] / total)
        c.append(counts[2] / total)
test['manager_level_low']=a
test['manager_level_medium']=b
test['manager_level_high']=c

### Category features

In [None]:
# Integer-encode the high-cardinality categorical columns.
# Each encoder is fitted on the union of train and test values so that a
# category seen only in test still maps to a valid code.
from sklearn import preprocessing
cat_features = ['manager_id','building_id','street_address','display_address','NTACode']
for cat in cat_features:
    combined_values = list(train[cat].values) + list(test[cat].values)
    cat_le = preprocessing.LabelEncoder().fit(combined_values)
    train[cat] = cat_le.transform(train[cat])
    test[cat] = cat_le.transform(test[cat])

### Combine all training features

In [None]:
simple_features = ['bathrooms','bedrooms','price',
                   'day_created','month_created','hour_created','weekday_created',
                   'number_of_features', 'num_description_words','num_photos','manager_id','building_id',
                   'street_address','display_address','latitude','longitude',
                   'NTACode','listing_id',
                   'price_per_bedroom','price_per_bathroom','room_sum',
                   'manager_level_low','manager_level_medium','manager_level_high']
target = 'interest_level'

#### Transform features into sparse matrix

In [None]:
X_train = sparse.hstack([train[simple_features], X_train_feature_counts]).tocsr()
X_test = sparse.hstack([test[simple_features], X_test_feature_counts]).tocsr()
#X_train = sparse.hstack([train[simple_features], X_train_feature_counts, X_train_desc_vect]).tocsr()
#X_test = sparse.hstack([test[simple_features], X_test_feature_counts, X_test_desc_vect]).tocsr()

In [None]:
#X_train = np.concatenate((train[simple_features].values, X_train_feature_counts.toarray()), axis=1)

In [None]:
X_train.shape, X_test.shape

### Modeling and Cross validation

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(train[target])

In [None]:
np.bincount(y_train)

In [None]:
train[target].value_counts()

In [None]:
# high = 0, low = 1, medium = 2
le.classes_

#### XGBoost

In [None]:
# Model 3: xgboost
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
XGB_model = xgb.XGBClassifier(
                objective='multi:softprob',
                learning_rate =0.1,
                max_depth=6,
                min_child_weight=1,
                subsample=0.7, 
                colsample_bytree=0.7,
                seed=0)

In [None]:
XGB_model.fit(X_train, y_train, eval_metric='mlogloss')

In [None]:
# `sklearn.cross_validation` is deprecated and was later removed; the
# re-train section below already uses `sklearn.model_selection`, so import
# from there for consistency.
from sklearn.model_selection import cross_val_score

In [None]:
# 5-fold cross-validated multiclass log loss. scikit-learn scorers follow a
# "greater is better" convention, so the values are NEGATED log losses; the
# scorer name is 'neg_log_loss' ('log_loss' is the deprecated spelling).
scores = cross_val_score(XGB_model, X_train, y_train, cv=5, scoring='neg_log_loss')

In [None]:
scores

In [None]:
y_id = test['listing_id'].astype('O')

In [None]:
y_test_log_pred = XGB_model.predict_proba(X_test)

In [None]:
sub = pd.DataFrame(np.column_stack((y_id, y_test_log_pred)), columns=['listing_id'] + le.classes_.tolist())

In [None]:
sub.to_csv('./submission/submission_2017-04-21_r1.csv', index=False)

In [None]:
le.classes_

### XGBoost re-train

In [None]:
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss

In [None]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):
    """Train a 3-class softprob XGBoost booster and predict on `test_X`.

    Parameters
    ----------
    train_X, train_y : training features and integer class labels.
    test_X : features to predict on.
    test_y : optional labels for `test_X`. When given, `test_X` is used as a
        watch set and training stops early after 20 rounds without
        improvement of its mlogloss.
    feature_names : accepted but not used by this function.
    seed_val : random seed passed to XGBoost.
    num_rounds : maximum number of boosting rounds.

    Returns
    -------
    (pred_test_y, model) : the booster's predictions for `test_X` and the
        trained booster.
    """
    param = {
        'objective': 'multi:softprob',
        'eta': 0.1,
        'max_depth': 6,
        'silent': 1,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
    }
    plst = list(param.items())

    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is None:
        # No labels: plain fit for the full number of rounds.
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)
    else:
        # Labels available: monitor both sets and stop early on the last one.
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)

    # NOTE(review): after early stopping, whether predict() uses the best
    # iteration or all trained rounds depends on the xgboost version — confirm.
    return model.predict(xgtest), model

In [None]:
# Validation log loss on a single held-out fold.
# Only the FIRST of the 5 shuffled folds is evaluated (the original loop
# broke out after one iteration), so `cv_scores` ends up with one entry.
cv_scores = []
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)
dev_index, val_index = next(kf.split(range(X_train.shape[0])))
dev_X, val_X = X_train[dev_index,:], X_train[val_index,:]
dev_y, val_y = y_train[dev_index], y_train[val_index]
preds, model = runXGB(dev_X, dev_y, val_X, val_y)
cv_scores.append(log_loss(val_y, preds))
print(cv_scores)

#### Prediction

In [None]:
# Refit on the full training set for a fixed 211 rounds and write the
# submission. Column order follows the label encoding noted above
# (high = 0, low = 1, medium = 2).
preds, model = runXGB(X_train, y_train, X_test, num_rounds=211)
out_df = pd.DataFrame(preds, columns=["high", "low", "medium"])
out_df["listing_id"] = test.listing_id.values
out_df.to_csv("./submission/submission_2017-04-21_r3.csv", index=False)

#### Feature importances

In [None]:
feat_imp = pd.Series(XGB_model.booster().get_fscore()).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')

In [None]:
train[simple_features].columns[2], train[simple_features].columns[14], train[simple_features].columns[10]

In [None]:
len(simple_features)

In [None]:
train[simple_features].head(1)

In [None]:
# Show the first 20 feature values of the first training row.
# Slice the row BEFORE densifying: `X_train.toarray()` would materialise the
# entire sparse matrix just to read 20 numbers.
for v in X_train[0].toarray()[0][:20]:
    print(v)

In [None]:
d = len(simple_features)

In [None]:
print unique_feature_list[1048-d]
print unique_feature_list[99-d]
print unique_feature_list[343-d]
print unique_feature_list[138-d]
print unique_feature_list[168-d]
print unique_feature_list[850-d]
#print unique_feature_list[1248-d]
#print unique_feature_list[521-d]
print unique_feature_list[919-d]
print unique_feature_list[1310-d]
print unique_feature_list[232-d]
print unique_feature_list[665-d]
print unique_feature_list[338-d]
print unique_feature_list[1038-d]
#print unique_feature_list[354-d]

In [None]:
print [(i,v) for i, v in enumerate(simple_features)]

In [None]:
len(feat_imp), X_train.shape

In [None]:
feat_imp[:66]

In [None]:
feat_imp_index = [f[1:] for f in feat_imp.index]

In [None]:
# Map important booster columns back to listing-feature names.
# Columns [0, len(simple_features)) of X_train are the dense simple features;
# the vectorizer columns start right after them. The old hard-coded `> 15`
# threshold let simple-feature columns through, which produced negative
# indices and silently picked names from the END of unique_feature_list.
n_simple = len(simple_features)
significant_features = [unique_feature_list[int(x) - n_simple] for x in feat_imp_index if int(x) >= n_simple]

In [None]:
len(significant_features)

In [None]:
# Drop the empty-string token (produced by listings with no features).
# Filtering instead of `.remove('')` avoids a ValueError when the empty token
# did not make it into the list.
significant_features = [name for name in significant_features if name != '']

In [None]:
len(significant_features)

In [None]:
significant_features

In [None]:
df_significant_feature = pd.DataFrame(significant_features, index=range(len(significant_features)), columns=['xfeature'])

In [None]:
df_significant_feature.to_csv('./data/significant_features.csv', index=False)