# Importing stuff

In [1]:
import numpy as np
import sklearn
import pandas as pd
import csv
import math
import matplotlib.pyplot as plt
import datetime
%matplotlib
from sklearn import datasets, linear_model
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics 
from scipy.stats.stats import pearsonr 
pd.options.display.max_columns = 999
from sklearn.metrics import confusion_matrix as cm
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import pickle
from sklearn.cross_validation import KFold, train_test_split
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import load_iris, load_digits, load_boston

Using matplotlib backend: MacOSX


# Loading data

In [2]:
X_train = pd.read_json('data/train.json')
X_test = pd.read_json('data/test.json') 

In [3]:
Y_train = X_train['interest_level']

## mapping interest level str to int

In [5]:
num_Y_map = {'high':0, 'medium':1, 'low':2}
Y_train = np.array(X_train['interest_level'].apply(lambda x: num_Y_map[x]))

# looking at the data

In [6]:
X_train.pivot_table(index='bedrooms', columns='interest_level', values='price')

interest_level,high,low,medium
bedrooms,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2069.439197,2649.370206,2251.518957
1,2152.919406,3418.025437,2636.015191
2,2714.034456,4745.92562,3194.77901
3,3777.537162,5650.445526,4196.131323
4,5305.711409,7776.179238,5639.886654
5,6012.5,10502.658436,5700.0
6,,11002.704545,7625.0
7,,14500.0,6923.0
8,,8247.5,


# Feature engeneering

### Number of photos

In [7]:
X_train['num_photos'] = X_train['photos'].apply(len)
X_test['num_photos'] = X_test['photos'].apply(len)

### Date of creation

In [8]:
X_train['created'] = pd.to_datetime(X_train["created"])
X_test['created'] = pd.to_datetime(X_test["created"])

In [9]:
X_train["created_year"] = X_train["created"].dt.year
X_test["created_year"] = X_test["created"].dt.year
X_train["created_month"] = X_train["created"].dt.month
X_test["created_month"] = X_test["created"].dt.month
X_train["created_day"] = X_train["created"].dt.day
X_test["created_day"] = X_test["created"].dt.day
X_train["created_hour"] = X_train["created"].dt.hour
X_test["created_hour"] = X_test["created"].dt.hour

### Dummies from bedrooms

In [10]:
X_train['bedrooms'] = X_train['bedrooms'].replace(8,7)

In [11]:
X_train = X_train.join(pd.get_dummies(X_train['bedrooms'], prefix='bedrooms'))
del X_train['bedrooms']

X_test = X_test.join(pd.get_dummies(X_test['bedrooms'], prefix='bedrooms'))
del X_test['bedrooms']

### Dealing with managers

In [41]:
len(X_train['manager_id'].unique())

3481

In [44]:
X_train['manager_id'].value_counts()[0]

2533

In [None]:
X_train['manager_freq'] = 0

for x in X_train['manager_id']:
    X_train['manager_freq'] = X_train['manager_id'].value_counts()[x]
    
X_test['manager_freq'] = 0

for x in X_test['manager_id']:
    X_test['manager_freq'] = X_test['manager_id'].value_counts()[x]

In [None]:
X_train.head()

## Getting ready to GO

In [12]:
X_train = X_train.reset_index()
X_test = X_test.reset_index()

In [13]:
FEATURES_LIST = []
for x in range(len(X_train['features'])):
    for y in range(len(X_train['features'][x])):
        FEATURES_LIST.append(X_train['features'][x][y])

In [14]:
FEATURES_LIST = list(set(FEATURES_LIST))

In [15]:
X_train.columns

Index(['index', 'bathrooms', 'building_id', 'created', 'description',
       'display_address', 'features', 'interest_level', 'latitude',
       'listing_id', 'longitude', 'manager_id', 'photos', 'price',
       'street_address', 'num_photos', 'created_year', 'created_month',
       'created_day', 'created_hour', 'bedrooms_0', 'bedrooms_1', 'bedrooms_2',
       'bedrooms_3', 'bedrooms_4', 'bedrooms_5', 'bedrooms_6', 'bedrooms_7'],
      dtype='object')

In [19]:
features_to_use = ['bathrooms',
       'latitude','listing_id', 'longitude', 'price',
       'num_photos', 'created_year', 'created_month',
       'created_day', 'created_hour', 'bedrooms_0', 'bedrooms_1', 'bedrooms_2',
       'bedrooms_3', 'bedrooms_4', 'bedrooms_5', 'bedrooms_6', 'bedrooms_7', 'KM']

# Kmeans clustering

In [17]:
from sklearn.cluster import KMeans

In [18]:
kmeans = KMeans(n_clusters=30, random_state=42).fit(X_train[features_to_use])
X_train = X_train.join(pd.DataFrame(kmeans.labels_))
X_train['KM'] = X_train[0]
del X_train[0]

kmeans2 = KMeans(n_clusters=30, random_state=42).fit(X_test[features_to_use])
X_test = X_test.join(pd.DataFrame(kmeans2.labels_))
X_test['KM'] = X_test[0]
del X_test[0]

In [20]:
X_train[features_to_use].head()

Unnamed: 0,bathrooms,latitude,listing_id,longitude,price,num_photos,created_year,created_month,created_day,created_hour,bedrooms_0,bedrooms_1,bedrooms_2,bedrooms_3,bedrooms_4,bedrooms_5,bedrooms_6,bedrooms_7,KM
0,1.5,40.7145,7211212,-73.9425,3000,5,2016,6,24,7,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,8
1,1.0,40.7947,7150865,-73.9667,5465,11,2016,6,12,12,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,16
2,1.0,40.7388,6887163,-74.0018,2850,8,2016,4,17,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,23
3,1.0,40.7539,6888711,-73.9677,3275,3,2016,4,18,2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,23
4,1.0,40.8241,6934781,-73.9493,3350,3,2016,4,28,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,14


# Implementing XGBoost

In [23]:
Y_train = pd.Series(Y_train)

In [21]:
rng = np.random.RandomState(3)
kf = KFold(len(Y_train), n_folds=5, shuffle=True, random_state=rng)

In [30]:
for train_index, test_index in kf:
    xgb_model = xgb.XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=700, nthread=-1,
       objective='multi:softprob', reg_alpha=1, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1).fit(X_train[features_to_use].iloc[train_index],Y_train.iloc[train_index])
    predictions = xgb_model.predict_proba(X_train[features_to_use].iloc[test_index])
    actuals = Y_train.iloc[test_index]
    print(sklearn.metrics.log_loss(actuals, predictions))

0.599808973253
0.612607784257


KeyboardInterrupt: 

learning_rate=0.05
n_estimators=700

0.599808973253
0.612607784257
0.606060899577
0.619159023335
0.611381953856




In [37]:
xgb.plot_importance(xgb_model)

<matplotlib.axes._subplots.AxesSubplot at 0x13a54acc0>

In [27]:
xgb_model_gs = xgb.XGBClassifier()
clf = GridSearchCV(xgb_model_gs,
                   {'max_depth': [6],
                    'learning_rate': [0.05],
                    'reg_alpha':[1],
                    'reg_lambda':[1],
                    'objective':['multi:softprob'],
                    'n_estimators':[700]},
                   verbose=1)
clf.fit(X_train[features_to_use],Y_train)
print(clf.best_estimator_)
print(clf.best_params_)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  7.3min finished


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.02, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
       objective='multi:softprob', reg_alpha=1, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)
{'reg_lambda': 1, 'reg_alpha': 1, 'n_estimators': 1000, 'objective': 'multi:softprob', 'max_depth': 6, 'learning_rate': 0.02}


# Generating result

In [31]:
xgb_model = xgb.XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=700, nthread=-1,
       objective='multi:softprob', reg_alpha=1, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1).fit(X_train[features_to_use],Y_train)

In [33]:
predictions = xgb_model.predict_proba(X_test[features_to_use])

In [34]:
predictions = pd.DataFrame(predictions).join(X_test['listing_id'])
predictions.columns = ['high', 'medium', 'low', 'listing_id']
predictions = predictions[['listing_id', 'high', 'medium', 'low']]

In [35]:
predictions.head()

Unnamed: 0,listing_id,high,medium,low
0,7142618,0.061105,0.365431,0.573464
1,7210040,0.595864,0.237663,0.166472
2,7103890,0.018516,0.127029,0.854455
3,7143442,0.02146,0.248896,0.729643
4,6860601,0.014135,0.196159,0.789706


In [36]:
predictions.to_csv('results/result2.csv', index = False)