In [41]:
import pandas as pd

# Locations of the raw JSON files (absolute paths on the author's machine).
TRAIN_JSON = "F:/graduate study/semester2/660/final project/train.json"
TEST_JSON = "F:/graduate study/semester2/660/final project/test.json"

# Read both splits into DataFrames; later cells add feature columns to them.
train_data, test_data = (pd.read_json(json_path) for json_path in (TRAIN_JSON, TEST_JSON))

In [42]:
import xgboost as xgb

def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=321, num_rounds=2000):
    """Train a 3-class XGBoost model and predict class probabilities for ``test_X``.

    Parameters
    ----------
    train_X, train_y
        Training features and labels; wrapped in an ``xgb.DMatrix``.
    test_X
        Features to score after training.
    test_y
        Optional labels for ``test_X``. When provided, the test set joins the
        watchlist and training uses early stopping with a patience of 20 rounds.
    feature_names
        Accepted for interface compatibility; not used by this function.
    seed_val
        Value passed to XGBoost as ``seed``.
    num_rounds
        Number of boosting rounds requested from ``xgb.train``.

    Returns
    -------
    tuple
        ``(pred_test_y, model)``: the result of ``model.predict`` on the test
        DMatrix and the model object returned by ``xgb.train``.
    """
    booster_params = {
        'objective': 'multi:softprob',
        'eta': 0.02,
        'max_depth': 6,
        'silent': 1,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
    }
    # xgb.train receives the parameters as a list of (name, value) pairs.
    plst = list(booster_params.items())
    dtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is None:
        # No labels to validate against: train for the full number of rounds.
        dtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, dtrain, num_rounds)
    else:
        # Labels available: monitor both sets and stop early on the last one.
        dtest = xgb.DMatrix(test_X, label=test_y)
        evals = [(dtrain, 'train'), (dtest, 'test')]
        model = xgb.train(plst, dtrain, num_rounds, evals, early_stopping_rounds=20)

    return model.predict(dtest), model

In [43]:
# Deal with the created date
import pandas as pd
from datetime import datetime


# Listing age in whole days, stored as the float column 'created_new'.
# NOTE: the reference point is the moment this cell runs, so the absolute values
# differ between runs; train and test share the same reference within one run.
now = datetime.now()

train_date = train_data['created']
test_date = test_data['created']

train_datetime = pd.to_datetime(train_date)
test_datetime = pd.to_datetime(test_date)

train_time = now - train_datetime
test_time = now - test_datetime

# pandas >= 2.0 rejects astype('timedelta64[D]') (only s/ms/us/ns resolutions are
# supported). `.dt.days` gives the whole-day component on every version; the cast
# keeps the float dtype the old conversion produced.
train_days = train_time.dt.days.astype(float)
test_days = test_time.dt.days.astype(float)

# Assign the Series directly (aligned on the index) rather than wrapping it in a
# one-column DataFrame first.
train_data['created_new'] = train_days
test_data['created_new'] = test_days

In [44]:
import numpy as np

# Manual corrections of the bathroom count for three test listings (by index label).
# A single .loc[row, column] call writes into the frame itself; the chained form
# test_data["bathrooms"].loc[row] = value may assign to a temporary copy instead.
for row_label, bathrooms in ((19671, 1.5), (22977, 2.0), (63719, 2.0)):
    # Guard: .loc assignment with an unknown label would append a new row.
    if row_label in test_data.index:
        test_data.loc[row_label, "bathrooms"] = bathrooms

# Cap training prices at 13000 (the test prices are left as they are).
train_data["price"] = train_data["price"].clip(upper=13000)

# Price-derived features, built identically for both splits.
for listings in (train_data, test_data):
    listings["logprice"] = np.log(listings["price"])

    # Price per bedroom. Listings with 0 bedrooms would divide by zero and produce
    # inf, which newer XGBoost versions refuse as input; turning the zero divisor
    # into NaN makes the result NaN, which XGBoost handles as a missing value.
    listings["price_t"] = listings["price"] / listings["bedrooms"].replace(0, np.nan)

    listings["room_sum"] = listings["bedrooms"] + listings["bathrooms"]

    # Price per room (bedrooms + bathrooms), with the same zero-divisor handling.
    listings["price_per_room"] = listings["price"] / listings["room_sum"].replace(0, np.nan)

def _space_separated_tokens(text):
    """Return the number of pieces obtained by splitting ``text`` on single spaces."""
    return len(text.split(" "))


# Size features: photo count, listed-feature count and description length.
for listings in (train_data, test_data):
    listings["num_photos"] = listings["photos"].map(len)
    listings["num_features"] = listings["features"].map(len)
    listings["num_description_words"] = listings["description"].map(_space_separated_tokens)

# Parse 'created' into datetimes and split it into calendar components.
for listings in (train_data, test_data):
    created = pd.to_datetime(listings["created"])
    listings["created"] = created
    listings["created_year"] = created.dt.year
    listings["created_month"] = created.dt.month
    listings["created_day"] = created.dt.day
    listings["created_hour"] = created.dt.hour

In [45]:
# Location key: longitude and latitude rounded to 3 decimals, joined with '_'.
for listings in (train_data, test_data):
    listings["pos"] = listings.longitude.round(3).astype(str) + '_' + listings.latitude.round(3).astype(str)

# Density = number of TRAINING listings sharing the same location key.
vals = train_data['pos'].value_counts()
dvals = vals.to_dict()

# Fallback for keys never seen in training: the smallest observed count.
# Computed once here; inside the lambda it was re-evaluated for every row.
min_density = vals.min()

train_data["density"] = train_data['pos'].apply(lambda x: dvals.get(x, min_density))
test_data["density"] = test_data['pos'].apply(lambda x: dvals.get(x, min_density))

In [46]:
# Columns fed to the model; later cells append further names to this list.
features_to_use = [
    # listing attributes
    "bathrooms",
    "bedrooms",
    "latitude",
    "longitude",
    "price",
    # price-derived
    "price_t",
    "price_per_room",
    "logprice",
    # location density and size counts
    "density",
    "num_photos",
    "num_features",
    "num_description_words",
    "listing_id",
    # creation time
    "created_year",
    "created_month",
    "created_day",
    "created_hour",
    "created_new",
]

In [47]:
# Same bathroom corrections and price cap as in cell In [44]; re-applying them is
# harmless because both operations are idempotent.
# A single .loc[row, column] call writes into the frame itself; the chained form
# test_data["bathrooms"].loc[row] = value may assign to a temporary copy instead.
for row_label, bathrooms in ((19671, 1.5), (22977, 2.0), (63719, 2.0)):
    # Guard: .loc assignment with an unknown label would append a new row.
    if row_label in test_data.index:
        test_data.loc[row_label, "bathrooms"] = bathrooms

train_data["price"] = train_data["price"].clip(upper=13000)

In [48]:
import random
from sklearn.preprocessing import LabelEncoder

# Out-of-fold encoding of manager_id for the training set.
# The rows are split into 5 folds. For each fold, every manager's low / medium /
# high listing counts are taken from the OTHER four folds and the resulting class
# shares are written to the held-out rows. Held-out rows whose manager has no
# listings in the other folds keep NaN.
n_rows = train_data.shape[0]
index = list(range(n_rows))
# Dedicated seeded generator: the fold assignment, and therefore the encoded
# features, are identical on every run.
random.Random(321).shuffle(index)
a = [np.nan] * n_rows
b = [np.nan] * n_rows
c = [np.nan] * n_rows

# Extract the two columns once; train_data.iloc[j] would build a full row Series
# on every access, which dominates the runtime of the loops below.
manager_ids = train_data['manager_id'].values
interest_levels = train_data['interest_level'].values
# Position of each interest level inside a manager's [low, medium, high] counter.
level_slot = {'low': 0, 'medium': 1, 'high': 2}

for i in range(5):
    # Fresh counters per fold so no information leaks from the held-out rows.
    building_level = {manager: [0, 0, 0] for manager in manager_ids}

    test_index = index[int((i * n_rows) / 5):int(((i + 1) * n_rows) / 5)]
    train_index = list(set(index).difference(test_index))

    for j in train_index:
        slot = level_slot.get(interest_levels[j])
        if slot is not None:  # labels other than low/medium/high are not counted
            building_level[manager_ids[j]][slot] += 1

    for j in test_index:
        counts = building_level[manager_ids[j]]
        total = sum(counts)
        if total != 0:
            a[j] = counts[0] * 1.0 / total
            b[j] = counts[1] * 1.0 / total
            c[j] = counts[2] * 1.0 / total

train_data['manager_level_low'] = a
train_data['manager_level_medium'] = b
train_data['manager_level_high'] = c

In [49]:
# Manager-level class shares for the test set, computed from ALL training rows.
a = []
b = []
c = []

# Extract the columns once instead of building a row Series per listing via iloc.
manager_ids = train_data['manager_id'].values
interest_levels = train_data['interest_level'].values
# Position of each interest level inside a manager's [low, medium, high] counter.
level_slot = {'low': 0, 'medium': 1, 'high': 2}

building_level = {manager: [0, 0, 0] for manager in manager_ids}
for manager, level in zip(manager_ids, interest_levels):
    slot = level_slot.get(level)
    if slot is not None:  # labels other than low/medium/high are not counted
        building_level[manager][slot] += 1

for i in test_data['manager_id'].values:
    counts = building_level.get(i)
    total = sum(counts) if counts is not None else 0
    if total == 0:
        # Manager unseen in training, or seen without any counted label:
        # leave the shares missing instead of dividing by zero.
        a.append(np.nan)
        b.append(np.nan)
        c.append(np.nan)
    else:
        a.append(counts[0] * 1.0 / total)
        b.append(counts[1] * 1.0 / total)
        c.append(counts[2] * 1.0 / total)

test_data['manager_level_low'] = a
test_data['manager_level_medium'] = b
test_data['manager_level_high'] = c

In [50]:
# Register the manager-level columns as model features.
# The membership check keeps the cell idempotent: running it twice no longer
# leaves duplicate names (and thus duplicate columns) in the feature list.
for manager_feature in ('manager_level_low', 'manager_level_medium', 'manager_level_high'):
    if manager_feature not in features_to_use:
        features_to_use.append(manager_feature)

In [51]:
# Display the current feature list as the cell output.
features_to_use

['bathrooms',
 'bedrooms',
 'latitude',
 'longitude',
 'price',
 'price_t',
 'price_per_room',
 'logprice',
 'density',
 'num_photos',
 'num_features',
 'num_description_words',
 'listing_id',
 'created_year',
 'created_month',
 'created_day',
 'created_hour',
 'created_new',
 'manager_level_low',
 'manager_level_medium',
 'manager_level_high']

In [52]:
# Integer-encode the identifier/address columns and add them to the feature list.
categorical = ["display_address", "manager_id", "building_id"]
for f in categorical:
    if train_data[f].dtype != 'object':
        # Only object-dtype columns are encoded and registered.
        continue

    train_values = list(train_data[f].values)
    test_values = list(test_data[f].values)

    # Fit on train and test together so every value in either split gets a code.
    lbl = LabelEncoder()
    lbl.fit(train_values + test_values)

    train_data[f] = lbl.transform(train_values)
    test_data[f] = lbl.transform(test_values)
    features_to_use.append(f)

In [54]:
# Preview the first rows of the selected feature columns.
train_data[features_to_use].head()

Unnamed: 0,bathrooms,bedrooms,latitude,longitude,price,price_t,price_per_room,logprice,density,num_photos,...,created_month,created_day,created_hour,created_new,manager_level_low,manager_level_medium,manager_level_high,display_address,manager_id,building_id
10,1.5,3,40.7145,-73.9425,3000,1000.0,666.666667,8.006368,5,5,...,6,24,7,303.0,0.767123,0.232877,0.0,12282,1568,3797
10000,1.0,2,40.7947,-73.9667,5465,2732.5,1821.666667,8.606119,62,11,...,6,12,12,315.0,0.984375,0.015625,0.0,9080,1988,8986
100004,1.0,1,40.7388,-74.0018,2850,2850.0,1425.0,7.955074,92,8,...,4,17,3,371.0,0.539216,0.401961,0.058824,13719,3733,8889
100007,1.0,1,40.7539,-73.9677,3275,3275.0,1637.5,8.094073,144,3,...,4,18,2,370.0,0.79085,0.143791,0.065359,10866,282,1848
100013,1.0,4,40.8241,-73.9493,3350,837.5,670.0,8.116716,5,3,...,4,28,1,360.0,1.0,0.0,0.0,15072,2618,0


In [55]:
# Encode the target as integer class ids: low -> 0, medium -> 1, high -> 2.
interest_codes = {'low': 0, 'medium': 1, 'high': 2}

# Re-running this cell on already-encoded labels would map every value to NaN
# (the ints are not keys of the mapping), so only map when something is still
# un-encoded.
if not train_data['interest_level'].isin(list(interest_codes.values())).all():
    train_data['interest_level'] = train_data['interest_level'].map(interest_codes)

train_data['interest_level'].head()

10        1
10000     0
100004    2
100007    0
100013    0
Name: interest_level, dtype: int64

In [61]:
# Model inputs: the selected feature columns of each split plus the encoded target.
train_x, test_x = (listings[features_to_use] for listings in (train_data, test_data))
train_y = train_data['interest_level']

In [63]:
# Train on the full training set (no validation labels, so no early stopping) and
# score the test set.
preds, model = runXGB(train_x, train_y, test_x, num_rounds=2000)

# The probability columns follow the class ids used for the target
# (low=0, medium=1, high=2), so they must be labelled in that order. Labelling
# them ["high", "medium", "low"] swapped the "high" and "low" probabilities.
out_df = pd.DataFrame(preds, columns=["low", "medium", "high"])
out_df["listing_id"] = test_data.listing_id.values

# Keep the column layout of the submission file: high, medium, low, listing_id.
out_df = out_df[["high", "medium", "low", "listing_id"]]
out_df.to_csv("sub.csv", index=False)
