# Importing stuff

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib

pd.options.display.max_columns = 999

from sklearn.grid_search import GridSearchCV

from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import log_loss

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold, train_test_split

import xgboost as xgb

import lightgbm as lgb



Using matplotlib backend: Qt4Agg


# Loading data

In [2]:
X_train = pd.read_json('data/train.json')
X_test = pd.read_json('data/test.json') 

In [3]:
Y_train = X_train['interest_level']

## See data

In [4]:
X_train.head(3)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street


# looking at the data

In [None]:
X_train.pivot_table(index='bedrooms', columns='interest_level')

# Feature engeneering

### Number of photos

In [5]:
X_train['num_photos'] = X_train['photos'].apply(len)
X_test['num_photos'] = X_test['photos'].apply(len)

### Date of creation

In [6]:
X_train['created'] = pd.to_datetime(X_train["created"])
X_test['created'] = pd.to_datetime(X_test["created"])

In [7]:
#X_train["created_year"] = X_train["created"].dt.year
#X_test["created_year"] = X_test["created"].dt.year
X_train["created_month"] = X_train["created"].dt.month
X_test["created_month"] = X_test["created"].dt.month
X_train["created_day"] = X_train["created"].dt.day
X_test["created_day"] = X_test["created"].dt.day
X_train["created_hour"] = X_train["created"].dt.hour
X_test["created_hour"] = X_test["created"].dt.hour

In [8]:
X_train["created_dow"] = X_train["created"].dt.dayofweek
X_test["created_dow"] = X_test["created"].dt.dayofweek

In [9]:
X_train = X_train.join(pd.get_dummies(X_train['created_dow'], prefix='created_dow'))
X_test = X_test.join(pd.get_dummies(X_test['created_dow'], prefix='created_dow'))

### Dummies from bedrooms

In [10]:
# X_train['bedrooms'] = X_train['bedrooms'].replace(8,7)

In [11]:
X_train = X_train.join(pd.get_dummies(X_train['bedrooms'], prefix='bedrooms'))
#del X_train['bedrooms']

X_test = X_test.join(pd.get_dummies(X_test['bedrooms'], prefix='bedrooms'))
#del X_test['bedrooms']

### Dummies from bathrooms

In [12]:
X_train = X_train.join(pd.get_dummies(X_train['bathrooms'], prefix='bathrooms'))
#del X_train['bedrooms']

X_test = X_test.join(pd.get_dummies(X_test['bathrooms'], prefix='bathrooms'))
#del X_test['bedrooms']

### Len of Features, Descriptions

In [13]:
X_train['features_len'] = X_train['features'].apply(len)
X_train['description_len'] = X_train['description'].apply(len)


In [14]:
X_test['features_len'] = X_test['features'].apply(len)
X_test['description_len'] = X_test['description'].apply(len)


### Dealing with managers

In [22]:
# Here we go

X_train['manager_freq'] = X_train['manager_id'].map(X_train['manager_id'].value_counts())
X_test['manager_freq'] = X_test['manager_id'].map(X_test['manager_id'].value_counts())

## Getting ready to GO

In [16]:
X_train = X_train.reset_index()
X_test = X_test.reset_index()

In [None]:
features_to_use = ['bathrooms',
       'latitude','listing_id', 'longitude', 'price',
       'num_photos', 'created_year', 'created_month',
       'created_day', 'created_hour', 'bedrooms_0', 'bedrooms_1', 'bedrooms_2',
       'bedrooms_3', 'bedrooms_4', 'bedrooms_5', 'bedrooms_6', 'bedrooms_7', 'KM']

In [31]:
X_train.columns

Index(['index', 'bathrooms', 'bedrooms', 'building_id', 'created',
       'description', 'display_address', 'features', 'interest_level',
       'latitude', 'listing_id', 'longitude', 'manager_id', 'photos', 'price',
       'street_address', 'num_photos', 'created_month', 'created_day',
       'created_hour', 'created_dow', 'created_dow_0', 'created_dow_1',
       'created_dow_2', 'created_dow_3', 'created_dow_4', 'created_dow_5',
       'created_dow_6', 'bedrooms_0', 'bedrooms_1', 'bedrooms_2', 'bedrooms_3',
       'bedrooms_4', 'bedrooms_5', 'bedrooms_6', 'bedrooms_7', 'bedrooms_8',
       'bathrooms_0.0', 'bathrooms_1.0', 'bathrooms_1.5', 'bathrooms_2.0',
       'bathrooms_2.5', 'bathrooms_3.0', 'bathrooms_3.5', 'bathrooms_4.0',
       'bathrooms_4.5', 'bathrooms_5.0', 'bathrooms_5.5', 'bathrooms_6.0',
       'bathrooms_6.5', 'bathrooms_7.0', 'bathrooms_10.0', 'features_len',
       'description_len', 'manager_freq', 'KM'],
      dtype='object')

In [29]:
features_to_use = ['latitude', 'longitude', 'price',
       'num_photos', 'created_month',
       'created_day', 'created_hour', 
       
                   'created_dow_0', 'created_dow_1',
       'created_dow_2', 'created_dow_3', 'created_dow_4', 'created_dow_5',
       'created_dow_6', 
                   
                   'bedrooms_0', 'bedrooms_1', 'bedrooms_2',
       'bedrooms_3', 'bedrooms_4', 'bedrooms_5', 'bedrooms_6', 'bedrooms_7', 
                   'bedrooms_8',
       'bathrooms_0.0', 'bathrooms_1.0', 'bathrooms_1.5',
       'bathrooms_2.0', 'bathrooms_2.5', 'bathrooms_3.0', 'bathrooms_3.5',
       'bathrooms_4.0', 'bathrooms_4.5', 'bathrooms_5.0', 'bathrooms_5.5',
       'bathrooms_6.0', 'bathrooms_6.5', 
                   'bathrooms_7.0', 'bathrooms_10.0',
       'features_len', 'description_len', 'manager_freq']

In [30]:
X_test['bedrooms_8'] = 0
X_test['bathrooms_7.0'] = 0
X_test['bathrooms_10.0'] = 0

# Kmeans clustering

In [20]:
from sklearn.cluster import KMeans

In [32]:
kmeans = KMeans(n_clusters=30, random_state=42).fit(X_train[features_to_use])
X_train = X_train.join(pd.DataFrame(kmeans.labels_))
X_train['KM'] = X_train[0]
del X_train[0]

kmeans2 = KMeans(n_clusters=30, random_state=42).fit(X_test[features_to_use])
X_test = X_test.join(pd.DataFrame(kmeans2.labels_))
X_test['KM'] = X_test[0]
del X_test[0]

# Implementing XGBoost

In [34]:
num_Y_map = {'low':0, 'medium':1, 'high':2}
Y_train = X_train['interest_level'].map(num_Y_map)

In [35]:
from time import time

In [None]:
start = time()

param_grid =       {'base_score':      [0.33],
                    'max_depth':       [3,6],
                    'learning_rate':   [0.02,0.05],
                    'reg_alpha':       [0.5,1],
                    'reg_lambda':      [0.5,1],
                    'objective':['multi:softprob'],
                    'n_estimators':[100]}

xgb_model_gs = xgb.XGBClassifier()

clf = GridSearchCV(xgb_model_gs,
                   param_grid,
                   verbose=1)

clf.fit(X_train[features_to_use].values,Y_train.values)

print(clf.best_estimator_)
print(clf.best_params_)

print (time()-start) 

In [44]:
# rng = np.random.RandomState(42)
kf = KFold(len(Y_train), n_folds=5, shuffle=True, random_state=42)

In [79]:
for train_index, test_index in kf:
    xgb_model = xgb.XGBClassifier(base_score=0.33, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=700, nthread=-1,
       objective='multi:softprob', reg_alpha=1, reg_lambda=0.5,
       scale_pos_weight=1, seed=0, silent=True, subsample=1).fit(
        X_train.loc[train_index,features_to_use].values,Y_train[train_index].values)
    
    predictions = xgb_model.predict_proba(X_train.loc[test_index,features_to_use].values)
    actuals = Y_train[test_index].values
    print(log_loss(actuals, predictions))

KeyboardInterrupt: 

In [48]:
lgb.LGBMClassifier(objective='lambdarank')

lightgbm.sklearn.LGBMClassifier

In [54]:
for train_index, test_index in kf:
    lgb_model = lgb.LGBMClassifier(objective='lambdarank',  n_estimators=700).fit(
        X_train.loc[train_index,features_to_use].values,Y_train[train_index].values)
    
    predictions = lgb_model.predict_proba(X_train.loc[test_index,features_to_use].values)
    actuals = Y_train[test_index].values
    print(log_loss(actuals, predictions))

0.603716759601
0.599564380075
0.610364108178
0.597762911003
0.605776638373


learning_rate=0.05
n_estimators=700

0.599808973253
0.612607784257
0.606060899577
0.619159023335
0.611381953856




In [51]:
xgb.plot_importance(xgb_model)

<matplotlib.axes._subplots.AxesSubplot at 0x7f494cb639e8>

In [52]:
lgb.plot_importance(lgb_model)

<matplotlib.axes._subplots.AxesSubplot at 0x7f494545d6d8>

# Generating result

In [None]:
xgb_model = xgb.XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=700, nthread=-1,
       objective='multi:softprob', reg_alpha=1, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1).fit(X_train[features_to_use],Y_train)

In [82]:
xgb_model = xgb.XGBClassifier(base_score=0.33, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=700, nthread=-1,
       objective='multi:softprob', reg_alpha=1, reg_lambda=0.5,
       scale_pos_weight=1, seed=0, silent=True, subsample=1).fit(X_train[features_to_use].values,Y_train.values)

In [83]:
predictions = xgb_model.predict_proba(X_test[features_to_use].values)

In [None]:
# num_Y_map = {'low':0, 'medium':1, 'high':2}

In [84]:
predictions = pd.DataFrame(predictions).join(X_test['listing_id'])

In [85]:
predictions.head(2)

Unnamed: 0,0,1,2,listing_id
0,0.422695,0.477518,0.099787,7142618
1,0.211814,0.374498,0.413688,7210040


In [86]:
predictions.columns = ['low', 'medium', 'high', 'listing_id']
predictions = predictions[['listing_id', 'high', 'medium', 'low']]

In [87]:
predictions['listing_id'] = predictions['listing_id'].astype(int) 

In [88]:
predictions.to_csv('result_2c_ivan.csv', index = False)