In [1]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn import model_selection, preprocessing, ensemble
#from sklearn import cross_validation, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
train_df = pd.read_json('data/train.json')
test_df = pd.read_json('data/test.json')

In [3]:
#basic features
train_df["price_t"] =train_df["price"]/train_df["bedrooms"]
test_df["price_t"] = test_df["price"]/test_df["bedrooms"] 
train_df["room_sum"] = train_df["bedrooms"]+train_df["bathrooms"] 
test_df["room_sum"] = test_df["bedrooms"]+test_df["bathrooms"] 

# count of photos #
train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

# count of "features" #
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

# count of words present in description column #
train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))


features_to_use=["bathrooms", "bedrooms", "latitude", "longitude", "price","price_t","num_photos", "num_features", "num_description_words","listing_id"]

In [4]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [5]:
index=list(range(train_df.shape[0]))
random.shuffle(index)
a=[np.nan]*len(train_df)
b=[np.nan]*len(train_df)
c=[np.nan]*len(train_df)

In [6]:
for i in range(5):
    building_level={}
    for j in train_df['manager_id'].values:
        building_level[j]=[0,0,0]
    test_index=index[int((i*train_df.shape[0])/5):int(((i+1)*train_df.shape[0])/5)]
    train_index=list(set(index).difference(test_index))
    for j in train_index:
        temp=train_df.iloc[j]
        if temp['interest_level']=='low':
            building_level[temp['manager_id']][0]+=1
        if temp['interest_level']=='medium':
            building_level[temp['manager_id']][1]+=1
        if temp['interest_level']=='high':
            building_level[temp['manager_id']][2]+=1
    for j in test_index:
        temp=train_df.iloc[j]
        if sum(building_level[temp['manager_id']])!=0:
            a[j]=building_level[temp['manager_id']][0]*1.0/sum(building_level[temp['manager_id']])
            b[j]=building_level[temp['manager_id']][1]*1.0/sum(building_level[temp['manager_id']])
            c[j]=building_level[temp['manager_id']][2]*1.0/sum(building_level[temp['manager_id']])

In [7]:
train_df['manager_level_low'] = a
train_df['manager_level_medium'] = b
train_df['manager_level_high'] = c

In [8]:
train_df[train_df['manager_id']=='725c43266983be0350de7fb42c9e84e9']

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,...,price,street_address,price_t,room_sum,num_photos,num_features,num_description_words,manager_level_low,manager_level_medium,manager_level_high
102587,1.0,0,0,2016-04-24 01:11:26,"Great apartment, great building... in a great ...",Park Terrace East,"[Pre-War, Laundry in Building]",low,40.8699,6917173,...,1500,30 Park Terrace East,inf,1.0,0,2,20,1.0,0.0,0.0
21273,1.0,2,0,2016-06-06 01:23:27,great apartment... in a great neighborhood,East 117th Street,"[Pre-War, Dogs Allowed, Cats Allowed]",low,40.7993,7114701,...,2225,124 East 117th Street,1112.5,3.0,0,3,7,1.0,0.0,0.0
78879,1.0,1,0,2016-05-15 01:12:44,great apartment in a great building ..in a gre...,East 81st Street,"[Pre-War, Dogs Allowed, Cats Allowed]",low,40.773,7014548,...,2100,422 East 81st Street,2100.0,2.0,0,3,11,1.0,0.0,0.0


In [9]:
building_level['725c43266983be0350de7fb42c9e84e9']

[2, 0, 0]

In [10]:
building_level['04443051e359588472981e81e61b9d3b']

[8, 2, 0]

In [11]:
print train_df[train_df['manager_id']=='04443051e359588472981e81e61b9d3b'].manager_level_low.sum()
print train_df[train_df['manager_id']=='04443051e359588472981e81e61b9d3b'].manager_level_medium.sum()

9.05634920635
1.94365079365


In [12]:
train_df[train_df['manager_id']=='04443051e359588472981e81e61b9d3b']

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,...,price,street_address,price_t,room_sum,num_photos,num_features,num_description_words,manager_level_low,manager_level_medium,manager_level_high
110912,1.0,1,ea084f01cc956506019c9543636f98ad,2016-04-19 06:03:34,<p><a website_redacted,West 54th Street,"[No Fee, Dogs Allowed, Cats Allowed]",medium,40.7645,6896183,...,2500,243 West 54th Street,2500.0,2.0,0,3,4,0.857143,0.142857,0.0
17599,2.0,2,be27507344f959deeb75f2838204dd38,2016-06-14 15:51:02,"Two Bdrm Duplex w/ 2 baths, plus private terra...",West 73rd Street,[],low,40.7805,7160084,...,3750,273 West 73rd Street,1875.0,4.0,1,0,32,0.75,0.25,0.0
21336,1.0,4,7ec3fe5a77f8265aae990ef621e80158,2016-06-14 15:21:46,"1.5 blocks from Riverside Park, 1 block from 1...",West 109th Street,"[Pre-War, Dogs Allowed, Cats Allowed]",low,40.8032,7158790,...,5200,235 West 109th Street,1300.0,5.0,1,3,45,0.857143,0.142857,0.0
21565,1.0,1,58addc519ec1925d839cf6dc25ec3069,2016-06-16 07:18:34,"Large One Bdrm Penthouse w/ Two terraces, in e...",West 75th Street,"[Elevator, Terrace, Laundry in Unit, Dogs Allo...",low,40.7817,7171595,...,2850,255 West 75th Street,2850.0,2.0,1,5,51,0.8,0.2,0.0
2167,1.0,0,2412fb493a2b1a9f44406b29298e3cd2,2016-06-16 06:28:36,DECORATIVE FIREPLACE & EXPOSED BRICK; CLOSE T...,West 84th Street,"[Fireplace, Loft, Laundry in Building, Dogs Al...",low,40.7859,7170820,...,2250,158 West 84th Street,inf,1.0,1,5,44,0.857143,0.142857,0.0
26115,1.0,4,b9f488ccbda179edc69f36f80f242e23,2016-06-16 08:19:51,Safe & quite block with quick access to C & 1 ...,West 162nd Street,"[Pre-War, Dishwasher, Hardwood Floors, No Fee]",low,40.8369,7173182,...,3100,539 West 162nd Street,775.0,5.0,3,4,48,0.75,0.25,0.0
28780,1.0,2,6a7872e86552c538614d66c8198e7e1e,2016-06-11 05:32:21,GREAT SHARE STEPS FROM CITY COLLEGECLOSE TO TR...,Convent Avenue,"[Elevator, Pre-War, Laundry in Building]",medium,40.8217,7142665,...,2500,260 Convent Avenue,1250.0,3.0,1,3,29,0.888889,0.111111,0.0
34434,1.0,2,ac90c3e8e9497467158f5235db875810,2016-06-11 06:22:27,5 MINUTE WALK TO RIVERSIDE PARK AND BIKE PATHS...,West 151st Street,[Pre-War],low,40.83,7143530,...,1875,531 West 151st Street,937.5,3.0,1,1,58,0.75,0.25,0.0
35797,1.0,2,6a7872e86552c538614d66c8198e7e1e,2016-06-14 15:44:59,GREAT SHARE STEPS FROM CITY COLLEGECLOSE TO TR...,Convent Avenue,"[Elevator, Pre-War, Laundry in Building]",low,40.8217,7159869,...,2500,260 Convent Avenue,1250.0,3.0,1,3,29,0.888889,0.111111,0.0
36614,1.0,1,0c239739ea5fb30514f5968285259b58,2016-06-14 15:33:09,Charming building located on the Upper West Si...,West 86th Street,"[Balcony, Doorman, Elevator, Garden/Patio, Pre...",low,40.7863,7159238,...,3820,41 West 86th Street,3820.0,2.0,6,9,98,0.857143,0.142857,0.0


In [13]:
a=[]
b=[]
c=[]
building_level={}
for j in train_df['manager_id'].values:
    building_level[j]=[0,0,0]
for j in range(train_df.shape[0]):
    temp=train_df.iloc[j]
    if temp['interest_level']=='low':
        building_level[temp['manager_id']][0]+=1
    if temp['interest_level']=='medium':
        building_level[temp['manager_id']][1]+=1
    if temp['interest_level']=='high':
        building_level[temp['manager_id']][2]+=1

for i in test_df['manager_id'].values:
    if i not in building_level.keys():
        a.append(np.nan)
        b.append(np.nan)
        c.append(np.nan)
    else:
        a.append(building_level[i][0]*1.0/sum(building_level[i]))
        b.append(building_level[i][1]*1.0/sum(building_level[i]))
        c.append(building_level[i][2]*1.0/sum(building_level[i]))
test_df['manager_level_low']=a
test_df['manager_level_medium']=b
test_df['manager_level_high']=c

features_to_use.append('manager_level_low') 
features_to_use.append('manager_level_medium') 
features_to_use.append('manager_level_high')

In [14]:
categorical = ["display_address", "manager_id", "building_id", "street_address"]
for f in categorical:
        if train_df[f].dtype=='object':
            #print(f)
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(train_df[f].values) + list(test_df[f].values))
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))
            features_to_use.append(f)

In [15]:
train_df['features'] = train_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
test_df['features'] = test_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
print(train_df["features"].head())
tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])

10                                                         
10000     Doorman Elevator Fitness_Center Cats_Allowed D...
100004    Laundry_In_Building Dishwasher Hardwood_Floors...
100007                               Hardwood_Floors No_Fee
100013                                              Pre-War
Name: features, dtype: object


In [16]:
train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()
test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr()

target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
print(train_X.shape, test_X.shape)

((49352, 217), (74659, 217))


In [18]:
cv_scores = []
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)
for dev_index, val_index in kf.split(range(train_X.shape[0])):
        dev_X, val_X = train_X[dev_index,:], train_X[val_index,:]
        dev_y, val_y = train_y[dev_index], train_y[val_index]
        preds, model = runXGB(dev_X, dev_y, val_X, val_y)
        cv_scores.append(log_loss(val_y, preds))
        print(cv_scores)
        break

Will train until test error hasn't decreased in 20 rounds.
[0]	train-mlogloss:1.035064	test-mlogloss:1.036923
[1]	train-mlogloss:0.978842	test-mlogloss:0.982464
[2]	train-mlogloss:0.931228	test-mlogloss:0.935937
[3]	train-mlogloss:0.891441	test-mlogloss:0.897482
[4]	train-mlogloss:0.855164	test-mlogloss:0.862700
[5]	train-mlogloss:0.823283	test-mlogloss:0.832104
[6]	train-mlogloss:0.795456	test-mlogloss:0.805587
[7]	train-mlogloss:0.770122	test-mlogloss:0.781502
[8]	train-mlogloss:0.748552	test-mlogloss:0.760970
[9]	train-mlogloss:0.729731	test-mlogloss:0.742910
[10]	train-mlogloss:0.712865	test-mlogloss:0.727196
[11]	train-mlogloss:0.697337	test-mlogloss:0.712855
[12]	train-mlogloss:0.683975	test-mlogloss:0.700244
[13]	train-mlogloss:0.671414	test-mlogloss:0.688782
[14]	train-mlogloss:0.659961	test-mlogloss:0.678362
[15]	train-mlogloss:0.649222	test-mlogloss:0.668653
[16]	train-mlogloss:0.639911	test-mlogloss:0.660183
[17]	train-mlogloss:0.631324	test-mlogloss:0.652482
[18]	train-mlog

[0.53859988084287791]


[277]	train-mlogloss:0.362545	test-mlogloss:0.538600
Stopping. Best iteration:
[257]	train-mlogloss:0.372386	test-mlogloss:0.538368



In [20]:
preds, model = runXGB(train_X, train_y, test_X, num_rounds=400)
out_df = pd.DataFrame(preds)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv("./submission/submission_2017-04-06_r1.csv", index=False)