In [61]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
train_data = pd.read_json('milestone1_train.json')
train_data.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,manager_id,photos,price,street_address,interest_level
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue,medium
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue,low
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street,high
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street,low
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street,low


In [3]:
test_data = pd.read_json('test.json')
test_data.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,manager_id,photos,price,street_address
0,1.0,1,79780be1514f645d7e6be99a3de696c5,2016-06-11 05:29:41,Large with awesome terrace--accessible via bed...,Suffolk Street,"[Elevator, Laundry in Building, Laundry in Uni...",40.7185,7142618,-73.9865,b1b1852c416d78d7765d746cb1b8921f,[https://photos.renthop.com/2/7142618_1c45a2c8...,2950,99 Suffolk Street
1,1.0,2,0,2016-06-24 06:36:34,Prime Soho - between Bleecker and Houston - Ne...,Thompson Street,"[Pre-War, Dogs Allowed, Cats Allowed]",40.7278,7210040,-74.0,d0b5648017832b2427eeb9956d966a14,[https://photos.renthop.com/2/7210040_d824cc71...,2850,176 Thompson Street
100,1.0,1,3dbbb69fd52e0d25131aa1cd459c87eb,2016-06-03 04:29:40,New York chic has reached a new level ...,101 East 10th Street,"[Doorman, Elevator, No Fee]",40.7306,7103890,-73.989,9ca6f3baa475c37a3b3521a394d65467,[https://photos.renthop.com/2/7103890_85b33077...,3758,101 East 10th Street
1000,1.0,2,783d21d013a7e655bddc4ed0d461cc5e,2016-06-11 06:17:35,Step into this fantastic new Construction in t...,South Third Street\r,"[Roof Deck, Balcony, Elevator, Laundry in Buil...",40.7109,7143442,-73.9571,0b9d5db96db8472d7aeb67c67338c4d2,[https://photos.renthop.com/2/7143442_0879e9e0...,3300,251 South Third Street\r
100000,2.0,2,6134e7c4dd1a98d9aee36623c9872b49,2016-04-12 05:24:17,"~Take a stroll in Central Park, enjoy the ente...","Midtown West, 8th Ave","[Common Outdoor Space, Cats Allowed, Dogs Allo...",40.765,6860601,-73.9845,b5eda0eb31b042ce2124fd9e9fcfce2f,[https://photos.renthop.com/2/6860601_c96164d8...,4900,260 West 54th Street


In [4]:
def create_date(date):
    date = pd.to_datetime(date)
    return date

In [5]:
date = np.vectorize(create_date)

In [6]:
train_data['date_time'] = date(train_data['created'])
test_data['date_time'] = date(test_data['created'])

In [7]:
train_data['year'] = train_data['date_time'].dt.year
train_data['month'] = train_data['date_time'].dt.month
train_data['day'] = train_data['date_time'].dt.day
train_data['hour'] = train_data['date_time'].dt.hour
train_data['minute'] = train_data['date_time'].dt.minute

In [8]:
test_data['year'] = test_data['date_time'].dt.year
test_data['month'] = test_data['date_time'].dt.month
test_data['day'] = test_data['date_time'].dt.day
test_data['hour'] = test_data['date_time'].dt.hour
test_data['minute'] = test_data['date_time'].dt.minute

In [9]:
feature_selection = ['bathrooms','bedrooms','latitude','longitude','price','year','month','day','hour','minute']

In [10]:
X = train_data[feature_selection]

In [11]:
y = train_data['interest_level']

In [12]:
X_pred_test = test_data[feature_selection]

In [19]:
#Random Forest

In [40]:
random_forest_model = RandomForestClassifier(n_estimators = 100, max_depth = 5, min_samples_leaf = 10)

In [41]:
scores = cross_val_score(random_forest_model, X, y, cv=10, scoring = 'neg_log_loss')

In [42]:
scores

array([-0.70638312, -0.70917273, -0.70188925, -0.70516611, -0.70522706,
       -0.70241623, -0.70233646, -0.69958134, -0.70325647, -0.70297816])

In [43]:
scores.mean()

-0.7038406917081323

In [44]:
random_forest_model.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [45]:
lable_pred_test = random_forest_model.predict_proba(X_pred_test)

In [46]:
def output_file_CSV_rf(lable_pred_test):
    lable_index = {'high': 0, 'low': 1, 'medium': 2}
    submission_rf = pd.DataFrame()
    submission_rf["listing_id"] = test_data["listing_id"]
    for label in ["high", "medium", "low"]:
        submission_rf[label] = lable_pred_test[:, lable_index[label]]
    submission_rf.to_csv("submission_rf.csv", index=False)

In [47]:
output_file_CSV_rf(lable_pred_test)

In [None]:
# Score of the first version of random forest classifier is 0.70913

In [None]:
#Improvement of Random Forest

In [48]:
for n in [100,200,300,400,500,600,700]:
    random_forest_model = RandomForestClassifier(n_estimators = n, max_depth = 5, min_samples_leaf = 10)   
    scores = cross_val_score(random_forest_model, X, y, cv=10, scoring = 'neg_log_loss')
    print('n:', n, 'scores:', scores.mean())

n: 100 scores: -0.704579135978101
n: 200 scores: -0.7039123286943537
n: 300 scores: -0.7041469212858144
n: 400 scores: -0.7044531234490596
n: 500 scores: -0.7037237230226576
n: 600 scores: -0.7037599309953971
n: 700 scores: -0.7040979983157054


In [49]:
for n in [1,2,3,4,5,6,7,8,9,10]:
    random_forest_model = RandomForestClassifier(n_estimators = 500, max_depth = n, min_samples_leaf = 10)   
    scores = cross_val_score(random_forest_model, X, y, cv=10, scoring = 'neg_log_loss')
    print('n:', n, 'scores:', scores.mean())

n: 1 scores: -0.7650911512021958
n: 2 scores: -0.7471589354425238
n: 3 scores: -0.7315450062796531
n: 4 scores: -0.7170911516054019
n: 5 scores: -0.7045637878522191
n: 6 scores: -0.6920484240181649
n: 7 scores: -0.6816916618318095
n: 8 scores: -0.6722302753965178
n: 9 scores: -0.6644383389083954
n: 10 scores: -0.6575612125550792


In [50]:
random_forest_model = RandomForestClassifier(n_estimators = 500, max_depth = 10, min_samples_leaf = 10)

In [51]:
scores = cross_val_score(random_forest_model, X, y, cv=10, scoring = 'neg_log_loss')

In [52]:
scores.mean()

-0.657307351773025

In [53]:
random_forest_model.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [54]:
lable_pred_test = random_forest_model.predict_proba(X_pred_test)

In [55]:
def output_file_CSV_rf(lable_pred_test):
    lable_index = {'high': 0, 'low': 1, 'medium': 2}
    submission_rf = pd.DataFrame()
    submission_rf["listing_id"] = test_data["listing_id"]
    for label in ["high", "medium", "low"]:
        submission_rf[label] = lable_pred_test[:, lable_index[label]]
    submission_rf.to_csv("submission_rf.csv", index=False)

In [56]:
output_file_CSV_rf(lable_pred_test)

In [None]:
# Score of the second version of random forest classifier is 0.66077

In [57]:
#Gradient Boosting Classifier

In [18]:
gradient_boosting_model = make_pipeline(
    StandardScaler(),
    GradientBoostingClassifier(n_estimators = 100)
)

In [19]:
scores = cross_val_score(gradient_boosting_model,X, y, cv=10,scoring = 'neg_log_loss')

In [20]:
scores.mean()

-0.648383017870594

In [21]:
gradient_boosting_model.fit(X,y)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('gradientboostingclassifier',
                 GradientBoostingClassifier(criterion='friedman_mse', init=None,
                                            learning_rate=0.1, loss='deviance',
                                            max_depth=3, max_features=None,
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
                                            n_estimators=100,
                                            n_iter_no_change=None,
                                            pr

In [22]:
lable_pred_test = gradient_boosting_model.predict_proba(X_pred_test)

In [23]:
def output_file_CSV_gb(lable_pred_test):
    lable_index = {'high': 0, 'low': 1, 'medium': 2}
    submission_gb = pd.DataFrame()
    submission_gb["listing_id"] = test_data["listing_id"]
    for label in ["high", "medium", "low"]:
        submission_gb[label] = lable_pred_test[:, lable_index[label]]
    submission_gb.to_csv("submission_gb.csv", index=False)

In [24]:
output_file_CSV_gb(lable_pred_test)

In [None]:
# Score of the first version of random forest classifier is 0.65291

In [None]:
#Improvement of gradient_boosting

In [15]:
gradient_boosting_model = make_pipeline(
    StandardScaler(),
    GradientBoostingClassifier(n_estimators = 300)
)

In [16]:
scores = cross_val_score(gradient_boosting_model,X, y, cv=10,scoring = 'neg_log_loss')
scores.mean()

-0.632714258273049

In [17]:
gradient_boosting_model = make_pipeline(
    StandardScaler(),
    GradientBoostingClassifier(n_estimators = 500)
)
scores = cross_val_score(gradient_boosting_model,X, y, cv=10,scoring = 'neg_log_loss')
scores.mean()

-0.6290743418164083

In [25]:
gradient_boosting_model = make_pipeline(
    StandardScaler(),
    GradientBoostingClassifier(n_estimators = 700)
)
scores = cross_val_score(gradient_boosting_model,X, y, cv=10,scoring = 'neg_log_loss')
scores.mean()

-0.62754211867065

In [26]:
gradient_boosting_model.fit(X,y)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('gradientboostingclassifier',
                 GradientBoostingClassifier(criterion='friedman_mse', init=None,
                                            learning_rate=0.1, loss='deviance',
                                            max_depth=3, max_features=None,
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
                                            n_estimators=700,
                                            n_iter_no_change=None,
                                            pr

In [27]:
lable_pred_test = gradient_boosting_model.predict_proba(X_pred_test)

In [28]:
def output_file_CSV_gb(lable_pred_test):
    lable_index = {'high': 0, 'low': 1, 'medium': 2}
    submission_gb = pd.DataFrame()
    submission_gb["listing_id"] = test_data["listing_id"]
    for label in ["high", "medium", "low"]:
        submission_gb[label] = lable_pred_test[:, lable_index[label]]
    submission_gb.to_csv("submission_gb.csv", index=False)

In [29]:
output_file_CSV_gb(lable_pred_test)

In [None]:
# Score of the second version of random forest classifier is 0.63106

In [None]:
# one additional evaluation metrics "Accuracy"

In [39]:
for n in (1,2,3,4,5,6,7,8,9,10):
    decision_tree_model =  DecisionTreeClassifier(max_depth = n)   
    scores = cross_val_score(decision_tree_model, X, y, cv=10, scoring = 'accuracy')  
    print('n:', n, 'scores:', scores.mean())

n: 1 scores: 0.6945610193948848
n: 2 scores: 0.6945610193948848
n: 3 scores: 0.6990005990995368
n: 4 scores: 0.7003182474925013
n: 5 scores: 0.7038455827854453
n: 6 scores: 0.7055487088754449
n: 7 scores: 0.7058325118584324
n: 8 scores: 0.706217652470164
n: 9 scores: 0.70445377310032
n: 10 scores: 0.7039677921207093


In [53]:
decision_tree_model =  DecisionTreeClassifier(max_depth = 8)

In [54]:
scores = cross_val_score(decision_tree_model, X, y, cv=10, scoring = 'accuracy')

In [55]:
scores.mean()

0.7061568334383427

In [56]:
decision_tree_model.fit(X,y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [57]:
decision_tree_model.score(X,y)

0.7161912870725131

In [58]:
lable_pred_test_accuracy = decision_tree_model.predict_proba(X_pred_test)

In [59]:
def output_file_CSV_ds(lable_pred_test):
    lable_index = {'high': 0, 'low': 1, 'medium': 2}
    submission_ds = pd.DataFrame()
    submission_ds["listing_id"] = test_data["listing_id"]
    for label in ["high", "medium", "low"]:
        submission_ds[label] = lable_pred_test[:, lable_index[label]]
    submission_ds.to_csv("submission_ds_fr.csv", index=False)

In [60]:
output_file_CSV_ds(lable_pred_test_accuracy)

In [None]:
0.75994

In [44]:
for n in [100,200,300,400,500,600,700]:
    random_forest_model = RandomForestClassifier(n_estimators = n, max_depth = 5, min_samples_leaf = 10)   
    scores = cross_val_score(random_forest_model, X, y, cv=10, scoring = 'accuracy')
    print('n:', n, 'scores:', scores.mean())

n: 100 scores: 0.6997303904693384
n: 200 scores: 0.6993046449225779
n: 300 scores: 0.6991830685056815
n: 400 scores: 0.6992640852033845
n: 500 scores: 0.6994465176324842
n: 600 scores: 0.6993046202645476
n: 700 scores: 0.6992437971274986


In [45]:
for n in [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]:
    random_forest_model = RandomForestClassifier(n_estimators = 100, max_depth = n, min_samples_leaf = 10)   
    scores = cross_val_score(random_forest_model, X, y, cv=10, scoring = 'accuracy')
    print('n:', n, 'scores:', scores.mean())

n: 1 scores: 0.6945610193948848
n: 2 scores: 0.6945610193948848
n: 3 scores: 0.6945610193948848
n: 4 scores: 0.6958787171239049
n: 5 scores: 0.6991019860776742
n: 6 scores: 0.7020413492577677
n: 7 scores: 0.7038658380045076
n: 8 scores: 0.7057512197571658
n: 9 scores: 0.7076364617787549
n: 10 scores: 0.7096029711885885
n: 11 scores: 0.7123802722433519
n: 12 scores: 0.7147726135213849
n: 13 scores: 0.7152388859055261
n: 14 scores: 0.7179753641932077
n: 15 scores: 0.7175902564582864


In [46]:
random_forest_model = RandomForestClassifier(n_estimators = 100, max_depth = 14, min_samples_leaf = 10)

In [47]:
scores = cross_val_score(random_forest_model, X, y, cv=10, scoring = 'accuracy')

In [48]:
scores.mean()

0.716941584486883

In [49]:
random_forest_model.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=14, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [50]:
lable_pred_test = random_forest_model.predict_proba(X_pred_test)

In [51]:
def output_file_CSV_rf(lable_pred_test):
    lable_index = {'high': 0, 'low': 1, 'medium': 2}
    submission_rf = pd.DataFrame()
    submission_rf["listing_id"] = test_data["listing_id"]
    for label in ["high", "medium", "low"]:
        submission_rf[label] = lable_pred_test[:, lable_index[label]]
    submission_rf.to_csv("submission_rf.csv", index=False)

In [52]:
output_file_CSV_rf(lable_pred_test)

In [None]:
0.64445