In [57]:
%matplotlib inline
import quadkey
import os
import sys
import operator
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import sparse
from sklearn import model_selection, preprocessing, ensemble
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder

In [109]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

In [122]:
# Accumulator filled by add_submit_result: str(mean CV score) -> recorded score.
submit_result = dict()

In [454]:
# Locations of the raw JSON files (relative to the notebook's working directory).
data_path = "input/"
train_file = os.path.join(data_path, "train.json")
test_file = os.path.join(data_path, "test.json")

# Load both splits into DataFrames.
train_df, test_df = pd.read_json(train_file), pd.read_json(test_file)

# Show (rows, columns) of each split as a quick load check.
print(train_df.shape)
print(test_df.shape)

(49352, 15)
(74659, 14)


In [455]:
# Preview the first five training rows.
train_df.head(n=5)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street


In [464]:
def feature_preprocessing(df, if_train=True, vectorizer=None):
    """Turn a raw listings frame into a sparse feature matrix (and labels).

    The matrix has 16 dense columns (raw numeric fields, per-building mean
    price and differences from the per-building means, creation date parts,
    and length counts) followed by bag-of-words counts of the ``features``
    list.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``created``, ``features``, ``photos``, ``description``,
        ``building_id``, ``bathrooms``, ``bedrooms``, ``price``, ``latitude``
        and ``longitude``; ``interest_level`` is also required when
        ``if_train`` is true.  The frame is not modified.
    if_train : bool, default True
        When true the token vectorizer is fitted on ``df`` and the encoded
        labels are returned as well.  When false the vectorizer is only
        applied, so the token columns line up with the training matrix.
    vectorizer : CountVectorizer or None, default None
        Vectorizer to fit (training) or apply (non-training).  When omitted,
        a new one is created for training, and the vectorizer fitted by the
        most recent training call is reused for non-training calls.

    Returns
    -------
    train_x : scipy.sparse.csr_matrix
        Feature matrix, one row per row of ``df`` in the same order.
    train_y : numpy.ndarray
        Only when ``if_train`` is true: 0 for 'high', 1 for 'medium',
        2 for 'low'.

    Raises
    ------
    KeyError
        If ``interest_level`` holds a value other than high/medium/low.
    """
    import warnings

    target_num_map = {'high': 0, 'medium': 1, 'low': 2}
    # Column order matters: it defines the layout of the dense part of the matrix.
    features_to_use = [
        "bathrooms", "bedrooms", "latitude", "longitude", "price",
        "diff_price", 'diff_bathrooms', 'diff_bedrooms',
        'building_price',
        'created_year', 'created_month', 'created_day', 'created_hour',
        'features_cnt', 'photos_cnt', 'desc_cnt',
    ]

    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()

    # datetime parts
    df['ts'] = pd.to_datetime(df['created'])
    df['created_year'] = df['ts'].dt.year
    df['created_month'] = df['ts'].dt.month
    df['created_day'] = df['ts'].dt.day
    df['created_hour'] = df['ts'].dt.hour

    # lengths of the list / text fields
    df['features_cnt'] = df['features'].apply(len)
    df['photos_cnt'] = df['photos'].apply(len)
    df['desc_cnt'] = df['description'].apply(len)

    # Per-building means. Columns are selected with a list: tuple-style
    # selection on a groupby is not supported by current pandas.
    mean_of_building = df.groupby('building_id')[['bathrooms', 'bedrooms', 'price']].mean()
    mean_of_building.columns = ['building_bathrooms', 'building_bedrooms', 'building_price']
    df = pd.merge(df, mean_of_building, left_on='building_id', right_index=True, how='left')

    # Difference between each listing and its building's mean; -9999 marks
    # rows where the difference could not be computed.
    for col in ('bathrooms', 'bedrooms', 'price'):
        df['diff_' + col] = (df[col] - df['building_' + col]).fillna(-9999)

    # Join each multi-word feature with underscores so it stays one token;
    # the constant prefix guarantees every document has at least one token.
    df['features'] = df["features"].apply(
        lambda x: 'Ringcentral ' + " ".join(["_".join(i.split(" ")) for i in x]))

    if if_train:
        if vectorizer is None:
            vectorizer = CountVectorizer(stop_words='english', max_features=200, min_df=1)
        tr_sparse = vectorizer.fit_transform(df["features"])
        # Remember the fitted vocabulary so later non-training calls produce
        # token columns aligned with this training matrix.
        feature_preprocessing._fitted_vectorizer = vectorizer
    else:
        if vectorizer is None:
            vectorizer = getattr(feature_preprocessing, '_fitted_vectorizer', None)
        if vectorizer is None:
            # No training call has happened yet: fall back to fitting on this
            # frame, but say so, because the columns will not match any model
            # trained on a different frame.
            warnings.warn("feature_preprocessing: no fitted vectorizer available; "
                          "fitting on non-training data")
            vectorizer = CountVectorizer(stop_words='english', max_features=200, min_df=1)
            tr_sparse = vectorizer.fit_transform(df["features"])
        else:
            tr_sparse = vectorizer.transform(df["features"])
    print('tr_sparse' + str(tr_sparse.shape))

    train_x = df[features_to_use]
    print(train_x.head())

    # Convert the dense block explicitly instead of handing a DataFrame to hstack.
    train_x = sparse.hstack([sparse.csr_matrix(train_x.values), tr_sparse]).tocsr()

    print('train_x' + str(train_x.shape))

    if if_train:
        # Dict lookup (not .map) so an unexpected label raises KeyError.
        train_y = np.array(df['interest_level'].apply(lambda x: target_num_map[x]))
        return train_x, train_y
    else:
        return train_x

In [465]:
# Build the training feature matrix and the encoded interest-level labels.
train_x, train_y = feature_preprocessing(train_df, if_train=True)

tr_sparse(49352, 200)
        bathrooms  bedrooms  latitude  longitude  price  diff_price  \
10            1.5         3   40.7145   -73.9425   3000  333.333333   
10000         1.0         2   40.7947   -73.9667   5465 -994.791667   
100004        1.0         1   40.7388   -74.0018   2850 -480.172414   
100007        1.0         1   40.7539   -73.9677   3275 -302.989899   
100013        1.0         4   40.8241   -73.9493   3350  -30.332609   

        diff_bathrooms  diff_bedrooms  building_price  created_year  \
10            0.333333       1.333333     2666.666667          2016   
10000        -0.708333       0.083333     6459.791667          2016   
100004        0.000000      -0.327586     3330.172414          2016   
100007       -0.020202       0.101010     3577.989899          2016   
100013       -0.154296       2.575187     3380.332609          2016   

        created_month  created_day  created_hour  features_cnt  photos_cnt  \
10                  6           24            

In [466]:
# Display the shapes of the feature matrix and the label vector.
(train_x.shape, train_y.shape)

((49352, 216), (49352,))

In [446]:
# K-fold cross-validation

In [468]:
# 5-fold cross-validation: fit a random forest on each training split and
# collect the multi-class log loss on the held-out split.
cv_scores = []
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)
for dev_index, val_index in kf.split(range(train_x.shape[0])):
    # One formatted string, so Python 2 does not print a tuple repr.
    print("TRAIN: %s TEST: %s" % (dev_index, val_index))
    dev_x, val_x = train_x[dev_index, :], train_x[val_index, :]
    dev_y, val_y = train_y[dev_index], train_y[val_index]

    # RandomForest parameters
    clf = RandomForestClassifier(n_estimators=1000, n_jobs=2, random_state=2016)
    clf.fit(dev_x, dev_y)

    val_y_pred = clf.predict_proba(val_x)
    # Pass the classes the model was fitted on so the probability columns are
    # interpreted correctly even if a class is absent from this validation fold.
    cv_scores.append(log_loss(val_y, val_y_pred, labels=clf.classes_))

    # Running list of fold scores.
    print(cv_scores)

# NOTE(review): after the loop `clf` is the model from the last fold only
# (fitted on 4/5 of the rows); refit on all of train_x if it is meant to be
# the final model.
        


('TRAIN:', array([    0,     1,     2, ..., 49347, 49350, 49351]), 'TEST:', array([   24,    32,    38, ..., 49340, 49348, 49349]))
[0.59402499584440782]
('TRAIN:', array([    2,     3,     4, ..., 49349, 49350, 49351]), 'TEST:', array([    0,     1,     5, ..., 49336, 49338, 49346]))
[0.59402499584440782, 0.6087949055305123]
('TRAIN:', array([    0,     1,     2, ..., 49348, 49349, 49350]), 'TEST:', array([    7,    10,    12, ..., 49343, 49344, 49351]))
[0.59402499584440782, 0.6087949055305123, 0.60693673301433659]
('TRAIN:', array([    0,     1,     2, ..., 49349, 49350, 49351]), 'TEST:', array([    3,     4,     6, ..., 49332, 49341, 49347]))
[0.59402499584440782, 0.6087949055305123, 0.60693673301433659, 0.59918489932953511]
('TRAIN:', array([    0,     1,     3, ..., 49348, 49349, 49351]), 'TEST:', array([    2,     8,    15, ..., 49337, 49345, 49350]))
[0.59402499584440782, 0.6087949055305123, 0.60693673301433659, 0.59918489932953511, 0.59112530578188927]


In [372]:
# Make predictions on the test set

In [473]:
def prediction(test_df, model):
    """Score ``test_df`` with ``model`` and return a submission frame.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Raw listings; must contain ``listing_id`` plus every column that
        ``feature_preprocessing`` reads.
    model : object
        Fitted classifier exposing ``predict_proba``.

    Returns
    -------
    pandas.DataFrame
        Columns ``listing_id``, ``high``, ``medium``, ``low``; one row per
        row of ``test_df``.
    """
    target_num_map = {'high': 0, 'medium': 1, 'low': 2}
    # Label the probability columns by ascending class code instead of a
    # second hard-coded list. This assumes the model was fitted on all three
    # codes and returns its probability columns in ascending class order
    # (scikit-learn classifiers do).
    class_labels = sorted(target_num_map, key=target_num_map.get)

    test_x = feature_preprocessing(test_df, False)
    test_y = model.predict_proba(test_x)

    # Relies on feature_preprocessing keeping the row order of test_df.
    sub = pd.DataFrame(test_y, columns=class_labels, index=test_df['listing_id'])
    # Move listing_id from the index into a regular column.
    return sub.reset_index(level=0)

In [474]:
# Score the test set with the current classifier and write the result,
# without the row index, to a CSV file.
sub = prediction(test_df, model=clf)
sub.to_csv("submission_rf.csv", index=False)

tr_sparse(74659, 200)
        bathrooms  bedrooms  latitude  longitude  price  diff_price  \
0             1.0         1   40.7185   -73.9865   2950   20.000000   
1             1.0         2   40.7278   -74.0000   2850 -501.978995   
100           1.0         1   40.7306   -73.9890   3758 -843.333333   
1000          1.0         2   40.7109   -73.9571   3300    0.000000   
100000        2.0         2   40.7650   -73.9845   4900  892.273292   

        diff_bathrooms  diff_bedrooms  building_price  created_year  \
0             0.000000       0.000000     2930.000000          2016   
1            -0.159396       0.563177     3351.978995          2016   
100          -0.333333      -0.333333     4601.333333          2016   
1000          0.000000       0.000000     3300.000000          2016   
100000        0.757764       0.391304     4007.726708          2016   

        created_month  created_day  created_hour  features_cnt  photos_cnt  \
0                   6           11            

In [127]:
def add_submit_result(submit_result, cv_scores, score):
    """Store ``score`` in ``submit_result`` under the mean of ``cv_scores``.

    Parameters
    ----------
    submit_result : dict
        Mapping that is updated in place and also returned.
    cv_scores : sequence of float
        Per-fold validation scores; must not be empty.
    score : float
        Value to record (presumably the score of the matching submission --
        confirm with the author).

    Returns
    -------
    dict
        The same ``submit_result`` object, with ``str(mean(cv_scores))``
        mapped to ``score``.

    Raises
    ------
    ValueError
        If ``cv_scores`` is empty.
    """
    if len(cv_scores) == 0:
        raise ValueError("cv_scores must contain at least one fold score")
    # sum() replaces the bare `reduce`, which is not a builtin on Python 3;
    # float() keeps true division on Python 2 even for integer scores.
    mean_cv = sum(cv_scores) / float(len(cv_scores))
    submit_result[str(mean_cv)] = score
    return submit_result

In [485]:
# Record a score against the mean of five fold losses, then display
# everything recorded so far.
submit_result = add_submit_result(
    submit_result,
    [0.55741334560380607, 0.5588175259935857, 0.56071541330880259,
     0.54865627497218594, 0.54911830170626785],
    score=0.62380,
)
submit_result

{'0.553507933531': 0.63627,
 '0.554944172317': 0.6238,
 '0.555730063743': 0.59856,
 '0.559361638057': 0.57909,
 '0.560714145301': 0.57416,
 '0.576255174477': 0.59235,
 '0.6000133679': 0.64538,
 '0.610919215449': 0.6412,
 '0.61678022171': 0.66728,
 '0.812603233401': 0.81832}