In [2]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import log_loss
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV




In [3]:
# Load the training listings into a DataFrame.
# The path is handed to pandas directly so that pandas opens *and closes* the
# file; the earlier open(...) call left the file handle unclosed.
df_train = pd.read_json("../data/train.json")

In [5]:
# Preview the first ten training listings to see the raw columns
# (df_train.shape gives the overall dimensions if needed).
df_train.iloc[:10]

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street
100014,2.0,4,38a913e46c94a7f46ddf19b756a9640c,2016-04-19 04:24:47,,West 18th Street,[],medium,40.7429,6894514,-74.0028,b209e2c4384a64cc307c26759ee0c651,[https://photos.renthop.com/2/6894514_9abb8592...,7995,350 West 18th Street
100016,1.0,2,3ba49a93260ca5df92fde024cb4ca61f,2016-04-27 03:19:56,Stunning unit with a great location and lots o...,West 107th Street,"[prewar, elevator, Dogs Allowed, Cats Allowed,...",low,40.8012,6930771,-73.966,01287194f20de51872e81f660def4784,[https://photos.renthop.com/2/6930771_7e3622b6...,3600,210 West 107th Street
100020,2.0,1,0372927bcb6a0949613ef5bf893bbac7,2016-04-13 06:01:42,"This huge sunny ,plenty of lights 1 bed/2 bath...",West 21st Street,"[Doorman, Elevator, Pre-War, Terrace, Laundry ...",low,40.7427,6867392,-73.9957,e6472c7237327dd3903b3d6f6a94515a,[https://photos.renthop.com/2/6867392_b18283f6...,5645,155 West 21st Street
100026,1.0,1,a7efbeb58190aa267b4a9121cd0c88c0,2016-04-20 02:36:35,<p><a website_redacted,Hamilton Terrace,"[Cats Allowed, Dogs Allowed, Elevator, Laundry...",medium,40.8234,6898799,-73.9457,c1a6598437b7db560cde66e5a297a53f,[https://photos.renthop.com/2/6898799_3759be4c...,1725,63 Hamilton Terrace
100027,2.0,4,0,2016-04-02 02:58:15,This is a spacious four bedroom with every bed...,522 E 11th,"[Dishwasher, Hardwood Floors]",low,40.7278,6814332,-73.9808,23a01ea7717b38875f5b070282d1b9d2,[https://photos.renthop.com/2/6814332_e19a8552...,5800,522 E 11th


In [6]:
# Derive simple numeric features from the raw listing columns, then select the
# model inputs (X) and the target (y).

# List-valued columns are reduced to their lengths.
for source_col in ("photos", "features"):
    df_train["num_" + source_col] = df_train[source_col].map(len)

# Splitting on a single space yields (number of spaces + 1) tokens.
df_train["num_description_words"] = df_train["description"].map(
    lambda text: text.count(" ") + 1
)

# Parse the creation timestamp once and expose its calendar parts.
created_ts = pd.to_datetime(df_train["created"])
df_train["created"] = created_ts
for part in ("year", "month", "day"):
    df_train["created_" + part] = getattr(created_ts.dt, part)

# Columns fed to the models; reused later when scoring the test listings.
cols_kept = [
    "bathrooms", "bedrooms", "latitude", "longitude", "price",
    "num_photos", "num_features", "num_description_words",
    "created_year", "created_month", "created_day",
]
X, y = df_train[cols_kept], df_train["interest_level"]
X.head()

Unnamed: 0,bathrooms,bedrooms,latitude,longitude,price,num_photos,num_features,num_description_words,created_year,created_month,created_day
10,1.5,3,40.7145,-73.9425,3000,5,0,95,2016,6,24
10000,1.0,2,40.7947,-73.9667,5465,11,5,9,2016,6,12
100004,1.0,1,40.7388,-74.0018,2850,8,4,94,2016,4,17
100007,1.0,1,40.7539,-73.9677,3275,3,2,80,2016,4,18
100013,1.0,4,40.8241,-73.9493,3350,3,1,68,2016,4,28


In [7]:
# Hold out 30% of the listings for evaluation.
# A fixed seed keeps the split, and every score computed from it, reproducible
# across kernel restarts; without it each run compares models on different rows.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=SPLIT_SEED
)

In [40]:
# Disabled experiment: random-forest baseline with 1000 trees, evaluated by
# log loss on the held-out split. Every line is commented out, so the value
# displayed under this cell is presumably saved output from an earlier run.
# rfc = RandomForestClassifier(n_estimators=1000)
# rfc.fit(X_train, y_train)
# y_pred = rfc.predict_proba(X_test)
# log_loss(y_test, y_pred)

0.62453892275267309

In [30]:
# Disabled experiment: AdaBoost classifier with 1000 estimators, evaluated by
# accuracy (score) and log loss on the held-out split. Every line is commented
# out, so the value displayed under this cell is presumably saved output from
# an earlier run.
# abc = AdaBoostClassifier(n_estimators=1000)
# abc.fit(X_train, y_train)
# y_pred = abc.predict_proba(X_test)
# abc.score(X_test, y_test)
# log_loss(y_test, y_pred)

1.0971749607438652

In [62]:
# Disabled experiment: gradient-boosting classifier with 1000 estimators,
# evaluated by log loss on the held-out split.
# NOTE: the (also commented-out) grid-search cell passes `gbc` to GridSearchCV,
# and this is the only place `gbc` is defined; re-enable both together.
# gbc = GradientBoostingClassifier(n_estimators = 1000)
# gbc.fit(X_train, y_train)
# y_pred = gbc.predict_proba(X_test)
# log_loss(y_test, y_pred)

0.61862133279704035

In [47]:
# Disabled experiment: coarse grid search over gradient-boosting
# hyper-parameters (4 learning rates log-spaced from 1e-2 to 1e2, 4 estimator
# counts log-spaced from 10 to 100, 4 depths evenly spaced from 1 to 75).
# It relies on `gbc`, which is only created in a commented-out cell.
# params = {'learning_rate':np.logspace(-2,2,num=4),
#               'n_estimators':np.logspace(1,2,num=4).astype(int),
#               'max_depth':np.linspace(1,75, num=4).astype(int)
#     }
# #These params are a coarse search.
# coarse_search = GridSearchCV(gbc, params, n_jobs = -1)
# coarse_search.fit(X_train, y_train)

# coarse_params = coarse_search.best_params_
# #coarse_params = {'learning_rate': 0.61584821106602605, 'max_depth': 1, 'n_estimators': 100}
    
# print "Coarse search best params: %s " % coarse_params
# NOTE: the fitted attribute is `best_score_` (trailing underscore); the
# AttributeError recorded under this cell came from using `best_score`.
# #print "Coarse search best score: %s " % coarse_search.best_score_

Coarse search best params: {'n_estimators': 100, 'learning_rate': 0.21544346900318834, 'max_depth': 1} 


AttributeError: 'GridSearchCV' object has no attribute 'best_score'

In [10]:
# Fit an xgboost classifier with 1000 boosting rounds and report its log loss
# on the held-out split.
# No visible cell imports xgboost, so on a fresh kernel the constructor call
# below would raise NameError; import it here so the cell is self-sufficient.
import xgboost

xgb = xgboost.XGBClassifier(n_estimators=1000)
xgb.fit(X_train, y_train)
y_pred = xgb.predict_proba(X_test)
log_loss(y_test, y_pred)

0.61161430814597106

In [None]:
# Disabled leftover: AdaBoostRegressor workflow keyed on a `SalesID` column.
# `SalesID` is not among the columns selected in cols_kept, and
# `cross_val_score` / `yhat` are not imported or defined in any visible cell.
# NOTE(review): this looks like it was carried over from a different dataset;
# confirm before reviving. The `rf_preds` line is also missing a closing paren.
# #Boosting Model
#
# aba = AdaBoostRegressor()
# aba.fit(X_train.drop('SalesID', axis=1),y_train)
# aba_hat = aba.predict(X_test.drop('SalesID', axis=1))
#
# aba_cv_mse_scores = cross_val_score(aba, X_test.drop('SalesID', axis=1), y_test, scoring='mean_squared_error', cv=10)
#
# aba_cv_mse = np.mean(np.abs(aba_cv_mse_scores))
#
#
# sales_ids = np.savetxt(np.array(X_test['SalesID']))
# rf_preds = np.savetxt('rf_preds.csv', (X_test['SalesID'],yhat)
# ada_preds = np.savetxt('ada_preds.csv', aba_hat)

In [12]:
# Gradient boosting with 1000 stages: fit on the training split (fit returns
# the estimator itself), then report log loss on the held-out split.
gbc_1000 = GradientBoostingClassifier(n_estimators=1000).fit(X_train, y_train)
y_pred = gbc_1000.predict_proba(X_test)
log_loss(y_test, y_pred)

0.61230040609080483

In [27]:
# Second, independent fit of the same 1000-stage gradient-boosting
# configuration, scored by log loss on the held-out split.
gbc_1000_test = GradientBoostingClassifier(n_estimators=1000)
y_pred = gbc_1000_test.fit(X_train, y_train).predict_proba(X_test)
log_loss(y_test, y_pred)

0.61235544185749435

In [32]:
# Load the test listings, apply the same feature engineering as the training
# frame, and predict class probabilities with the fitted xgboost model.
# The path is handed to pandas directly so that pandas opens and closes the
# file; the earlier open(...) call left the file handle unclosed.
df = pd.read_json("../data/test.json")
df["num_photos"] = df["photos"].apply(len)
df["num_features"] = df["features"].apply(len)
df["num_description_words"] = df["description"].apply(lambda x: len(x.split(" ")))
df["created"] = pd.to_datetime(df["created"])
df["created_year"] = df["created"].dt.year
df["created_month"] = df["created"].dt.month
df["created_day"] = df["created"].dt.day

# NOTE: from here on X holds the *test* features and y the predicted
# probabilities, replacing the training features/labels of the same names.
# Re-run the training feature cell before re-running the train/test split.
X = df[cols_kept]

y = xgb.predict_proba(X)

In [33]:
# Write the submission file: one row per test listing, with one probability
# column per interest level.
# predict_proba columns follow the fitted model's classes_ order, so the
# label -> column mapping is derived from the model that produced `y` instead
# of being hard-coded; a wrong hard-coded index would silently swap columns.
index_label = {label: position for position, label in enumerate(xgb.classes_)}
fin = pd.DataFrame()
fin["listing_id"] = df["listing_id"]
for label in ["high", "medium", "low"]:
    fin[label] = y[:, index_label[label]]
# NOTE(review): the file name says "rf" although `y` comes from the xgboost
# model; name kept unchanged so downstream paths do not break.
fin.to_csv("submission_rf.csv", index=False)

In [31]:
# Label -> predict_proba column index, as used when building the submission.
index_label = {u'high': 0, u'medium': 2, u'low': 1}
# The parenthesised call prints the same text on Python 2 and, unlike the
# bare print statement, is also valid syntax on Python 3.
print(index_label)

{u'high': 0, u'medium': 2, u'low': 1}
