In [22]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import pandas as pd
import random

from sklearn import model_selection, preprocessing, ensemble
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from collections import defaultdict, Counter

from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [57]:
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def predict_proba(self, x):
        return self.clf.predict_proba(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)
        
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [58]:
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}


gb_params = {
    'n_estimators': 500,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

lgb_params = {
    'objective': 'multi:softprob',
     'eta': 0.03,
     'max_depth': 6,
     'silent': 0,
     'num_class': 3,
     'eval_metric': 'mlogloss',
     'min_child_weight': 1,
     'subsample': 0.7,
     'colsample_bytree': 0.7,
     'nthread': 12
}

xgb_params = {
    'objective': 'multi:softprob',
     'eta': 0.03,
     'max_depth': 6,
     'silent': 0,
     'num_class': 3,
     'eval_metric': 'mlogloss',
     'min_child_weight': 1,
     'subsample': 0.7,
     'colsample_bytree': 0.7,
     'nthread': 12
}

In [59]:
SEED = 300
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
lgb = SklearnHelper(clf=LGBMClassifier, seed=SEED, params=lgb_params)
xgb = SklearnHelper(clf=XGBClassifier, seed=SEED, params=xgb_params)

In [7]:
train_df = pd.read_json('../input/train.json.zip', compression='zip')
test_df = pd.read_json('../input/test.json.zip', compression='zip')

features_to_use  = ["bathrooms", "bedrooms", "latitude", "longitude", "price"]

mean_price = int(train_df['price'].mean())
test_df.loc[test_df['price']<200,'price'] = mean_price
train_df.loc[train_df['price']<200,'price'] = mean_price

In [8]:
train_test = pd.concat([train_df, test_df], 0,sort=False)

features = train_test[["features"]].apply(
    lambda _: [list(map(str.strip, map(str.lower, x))) for x in _])


n = 5

feature_counts = Counter()
for feature in features.features:
    feature_counts.update(feature)
feature = sorted([k for (k,v) in feature_counts.items() if v > n])
feature[:10]


def clean(s):
    x = s.replace("-", "")
    x = x.replace(" ", "")
    x = x.replace("24/7", "24")
    x = x.replace("24hr", "24")
    x = x.replace("24-hour", "24")
    x = x.replace("24hour", "24")
    x = x.replace("24 hour", "24")
    x = x.replace("common", "cm")
    x = x.replace("concierge", "doorman")
    x = x.replace("bicycle", "bike")
    x = x.replace("pets:cats", "cats")
    x = x.replace("allpetsok", "pets")
    x = x.replace("dogs", "pets")
    x = x.replace("private", "pv")
    x = x.replace("deco", "dc")
    x = x.replace("decorative", "dc")
    x = x.replace("onsite", "os")
    x = x.replace("outdoor", "od")
    x = x.replace("ss appliances", "stainless")
    return x

def feature_hash(x):
    cleaned = clean(x, uniq)
    key = cleaned[:4].strip()
    return key


key2original = defaultdict(list)
k = 4
for f in feature:
    cleaned = clean(f)
    key = cleaned[:k].strip()

    key2original[key].append(f)

    
def to_tuples():
    for f in feature:
        key = clean(f)[:k].strip()
        yield (f, key2original[key][0])
        
deduped = list(to_tuples())
df = pd.DataFrame(deduped, columns=["original_feature", "unique_feature"])

dict_rep_features = pd.Series(df['unique_feature'].values, df['original_feature'].values)

In [9]:
test_df['features'] = test_df['features'].apply(lambda x: list(map(str.strip, map(str.lower, x))))\
                    .apply(lambda x: [dict_rep_features[i] for i in x if i in dict_rep_features.index])\
                    .apply(lambda x: list(set(x)))

train_df['features'] = train_df['features'].apply(lambda x: list(map(str.strip, map(str.lower, x))))\
                    .apply(lambda x: [dict_rep_features[i] for i in x if i in dict_rep_features.index])\
                    .apply(lambda x: list(set(x)))


In [10]:
import math
def cart2rho(x, y):
    rho = np.sqrt(x**2 + y**2)
    return rho


def cart2phi(x, y):
    phi = np.arctan2(y, x)
    return phi


def rotation_x(row, alpha):
    x = row['latitude']
    y = row['longitude']
    return x*math.cos(alpha) + y*math.sin(alpha)


def rotation_y(row, alpha):
    x = row['latitude']
    y = row['longitude']
    return y*math.cos(alpha) - x*math.sin(alpha)


def add_rotation(degrees, df):
    namex = "rot" + str(degrees) + "_X"
    namey = "rot" + str(degrees) + "_Y"

    df['num_' + namex] = df.apply(lambda row: rotation_x(row, math.pi/(180/degrees)), axis=1)
    df['num_' + namey] = df.apply(lambda row: rotation_y(row, math.pi/(180/degrees)), axis=1)

    return df

def operate_on_coordinates(tr_df, te_df):
    for df in [tr_df, te_df]:
        #polar coordinates system
        df["num_rho"] = df.apply(lambda x: cart2rho(x["latitude"] - 40.78222222, x["longitude"]+73.96527777), axis=1)
        df["num_phi"] = df.apply(lambda x: cart2phi(x["latitude"] - 40.78222222, x["longitude"]+73.96527777), axis=1)
        #rotations
        for angle in [15,30,45,60]:
            df = add_rotation(angle, df)

    return tr_df, te_df

train_df, test_df = operate_on_coordinates(train_df, test_df)

features_to_use.extend(['num_rho', 'num_phi', 'num_rot15_X', 'num_rot15_Y', 'num_rot30_X',
       'num_rot30_Y', 'num_rot45_X', 'num_rot45_Y', 'num_rot60_X',
       'num_rot60_Y'])

In [11]:
import re

def cap_share(x):
    return sum(1 for c in x if c.isupper())/float(len(x)+1)

for df in [train_df, test_df]:
    # do you think that users might feel annoyed BY A DESCRIPTION THAT IS SHOUTING AT THEM?
    df['num_cap_share'] = df['description'].apply(cap_share)
    
    # how long in lines the desc is?
    df['num_nr_of_lines'] = df['description'].apply(lambda x: x.count('<br /><br />'))
   
    # is the description redacted by the website?        
    df['num_redacted'] = 0
    df['num_redacted'].ix[df['description'].str.contains('website_redacted')] = 1

    
    # can we contact someone via e-mail to ask for the details?
    df['num_email'] = 0
    df['num_email'].ix[df['description'].str.contains('@')] = 1
    
    #and... can we call them?
    
    reg = re.compile(".*?(\(?\d{3}\D{0,3}\d{3}\D{0,3}\d{4}).*?", re.S)
    def try_and_find_nr(description):
        if reg.match(description) is None:
            return 0
        return 1

    df['num_phone_nr'] = df['description'].apply(try_and_find_nr)

    


features_to_use.extend(['num_cap_share', 'num_nr_of_lines', 'num_redacted',
       'num_email', 'num_phone_nr'])

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [12]:
# count of photos #
train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

# count of "features" #
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

# count of words present in description column #
train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

# convert the created column to datetime object so as to extract more features 
train_df["created"] = pd.to_datetime(train_df["created"])
test_df["created"] = pd.to_datetime(test_df["created"])

# Let us extract some features like year, month, day, hour from date columns #
train_df["created_year"] = train_df["created"].dt.year
test_df["created_year"] = test_df["created"].dt.year
train_df["created_month"] = train_df["created"].dt.month
test_df["created_month"] = test_df["created"].dt.month
train_df["created_day"] = train_df["created"].dt.day
test_df["created_day"] = test_df["created"].dt.day
train_df["created_hour"] = train_df["created"].dt.hour
test_df["created_hour"] = test_df["created"].dt.hour

# adding all these new features to use list #
features_to_use.extend(["num_photos", "num_features", "num_description_words","created_year", "created_month", "created_day", "listing_id", "created_hour"])

train_df["price_t"] =train_df["price"]/train_df["bedrooms"]
test_df["price_t"] = test_df["price"]/test_df["bedrooms"] 

train_df["room_sum"] = train_df["bedrooms"]+train_df["bathrooms"] 
test_df["room_sum"] = test_df["bedrooms"]+test_df["bathrooms"] 

features_to_use.extend(["price_t", "room_sum", "num_description_words"])

In [13]:
start_values = [0,0,0]

index=list(range(train_df.shape[0]))
random.shuffle(index)
a=[np.nan]*len(train_df)
b=[np.nan]*len(train_df)
c=[np.nan]*len(train_df)

for i in range(5):
    building_level={}
    for j in train_df['manager_id'].values:
        building_level[j]= start_values.copy()
    test_index=index[int((i*train_df.shape[0])/5):int(((i+1)*train_df.shape[0])/5)]
    train_index=list(set(index).difference(test_index))
    for j in train_index:
        temp=train_df.iloc[j]
        if temp['interest_level']=='low':
            building_level[temp['manager_id']][0]+=1
        if temp['interest_level']=='medium':
            building_level[temp['manager_id']][1]+=1
        if temp['interest_level']=='high':
            building_level[temp['manager_id']][2]+=1
    for j in test_index:
        temp=train_df.iloc[j]
        if sum(building_level[temp['manager_id']])!=0:
            a[j]=building_level[temp['manager_id']][0]*1.0/sum(building_level[temp['manager_id']])
            b[j]=building_level[temp['manager_id']][1]*1.0/sum(building_level[temp['manager_id']])
            c[j]=building_level[temp['manager_id']][2]*1.0/sum(building_level[temp['manager_id']])
train_df['manager_level_low']=a
train_df['manager_level_medium']=b
train_df['manager_level_high']=c


a=[]
b=[]
c=[]
building_level={}
for j in train_df['manager_id'].values:
    building_level[j]= start_values.copy()
for j in range(train_df.shape[0]):
    temp=train_df.iloc[j]
    if temp['interest_level']=='low':
        building_level[temp['manager_id']][0]+=1
    if temp['interest_level']=='medium':
        building_level[temp['manager_id']][1]+=1
    if temp['interest_level']=='high':
        building_level[temp['manager_id']][2]+=1

for i in test_df['manager_id'].values:
    if i not in building_level.keys():
        a.append(np.nan)
        b.append(np.nan)
        c.append(np.nan)
    else:
        a.append(building_level[i][0]*1.0/sum(building_level[i]))
        b.append(building_level[i][1]*1.0/sum(building_level[i]))
        c.append(building_level[i][2]*1.0/sum(building_level[i]))
test_df['manager_level_low']=a
test_df['manager_level_medium']=b
test_df['manager_level_high']=c

features_to_use.append('manager_level_low') 
features_to_use.append('manager_level_medium') 
features_to_use.append('manager_level_high')

In [14]:
train_df["listing_id1"] = train_df["listing_id"] - 68119576.0
test_df["listing_id1"] =  test_df["listing_id"] - 68119576.0

train_df["num_price_by_furniture"] = (train_df["price"])/ (train_df["bathrooms"] + train_df["bedrooms"] + 1.0)
test_df["num_price_by_furniture"] =  (test_df["price"])/ (test_df["bathrooms"] + test_df["bedrooms"] +  1.0)

train_df["price_latitue"] = (train_df["price"])/ (train_df["latitude"]+1.0)
test_df["price_latitue"] =  (test_df["price"])/ (test_df["latitude"]+1.0)

train_df["price_longtitude"] = (train_df["price"])/ (train_df["longitude"]-1.0)
test_df["price_longtitude"] =  (test_df["price"])/ (test_df["longitude"]-1.0)  

train_df["num_furniture"] =  train_df["bathrooms"] + train_df["bedrooms"] 
test_df["num_furniture"] =   test_df["bathrooms"] + test_df["bedrooms"] 

train_df["total_days"] =   (train_df["created_month"] -4.0)*30 + train_df["created_day"] +  train_df["created_hour"] /25.0
test_df["total_days"] =(test_df["created_month"] -4.0)*30 + test_df["created_day"] +  test_df["created_hour"] /25.0        
train_df["diff_rank"]= train_df["total_days"]/train_df["listing_id1"]
test_df["diff_rank"]= test_df["total_days"]/test_df["listing_id1"]


features_to_use.extend([ "total_days","diff_rank",
"num_price_by_furniture","price_latitue","price_longtitude",'num_furniture'])

In [15]:
categorical = ["display_address", "manager_id", "building_id", "street_address"]
for f in categorical:
        if train_df[f].dtype=='object':
            #print(f)
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(train_df[f].values) + list(test_df[f].values))
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))
            features_to_use.append(f)

In [16]:
train_df["price0"] = (train_df["price"]%10==0).astype(int)
test_df["price0"] = (test_df["price"]%10==0).astype(int)

train_df["manager_count"] = train_df["manager_id"].replace(train_df["manager_id"].value_counts())
test_df["manager_count"] = test_df["manager_id"].replace(train_df["manager_id"].value_counts())

features_to_use.extend(["price0",'manager_count'])

train_df['features'] = train_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
test_df['features'] = test_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
print(train_df["features"].head())
tfidf = CountVectorizer(stop_words='english', max_features=70)
te_sparse = tfidf.fit_transform(test_df["features"])
tr_sparse = tfidf.transform(train_df["features"])

tfidfdesc=TfidfVectorizer(min_df=20, max_features=50, strip_accents='unicode',lowercase =True,
                    analyzer='word', token_pattern=r'\w{16,}', ngram_range=(1, 2), use_idf=False,smooth_idf=False, 
                    sublinear_tf=True, stop_words = 'english')  

train_df['description'] =  train_df['description'].apply(lambda x: str(x).encode('utf-8') if len(x)>2 else "nulldesc") 
test_df['description'] =   test_df['description'].apply(lambda x: str(x).encode('utf-8') if len(x)>2 else "nulldesc") 
te_sparsed = tfidfdesc. fit_transform (test_df["description"])  
tr_sparsed = tfidfdesc.transform(train_df["description"])

train_X = sparse.hstack([train_df[features_to_use], tr_sparse,tr_sparsed]).tocsr()#
test_X = sparse.hstack([test_df[features_to_use], te_sparse,te_sparsed]).tocsr()#

target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

print(train_X.shape, test_X.shape)

10                                                         
10000     cats_allowed concierge all_pets_ok fitness_cen...
100004                         hardwood laundry dish_washer
100007                                      hardwood no_fee
100013                                              pre_war
Name: features, dtype: object
(49352, 166) (74659, 166)


In [122]:
train_X = train_X.toarray()
test_X = test_X.toarray()

train_X = train_X.astype(np.float32)
test_X = test_X.astype(np.float32)

train_X = np.nan_to_num(train_X)
test_X = np.nan_to_num(test_X)

[0]	train-mlogloss:1.07775	val-mlogloss:1.07827
Multiple eval metrics have been passed: 'val-mlogloss' will be used for early stopping.

Will train until val-mlogloss hasn't improved in 50 rounds.
[1]	train-mlogloss:1.05804	val-mlogloss:1.05915
[2]	train-mlogloss:1.0391	val-mlogloss:1.04076
[3]	train-mlogloss:1.02106	val-mlogloss:1.02331
[4]	train-mlogloss:1.00379	val-mlogloss:1.00659
[5]	train-mlogloss:0.987297	val-mlogloss:0.990692
[6]	train-mlogloss:0.971684	val-mlogloss:0.975621
[7]	train-mlogloss:0.956643	val-mlogloss:0.961212
[8]	train-mlogloss:0.942392	val-mlogloss:0.947541
[9]	train-mlogloss:0.928634	val-mlogloss:0.934332
[10]	train-mlogloss:0.915499	val-mlogloss:0.921662
[11]	train-mlogloss:0.902885	val-mlogloss:0.909531
[12]	train-mlogloss:0.890587	val-mlogloss:0.897704
[13]	train-mlogloss:0.879018	val-mlogloss:0.886706
[14]	train-mlogloss:0.867827	val-mlogloss:0.876054
[15]	train-mlogloss:0.857247	val-mlogloss:0.865963
[16]	train-mlogloss:0.847002	val-mlogloss:0.856158
[17]	

In [65]:
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((x_train.shape[0], 3))
    oof_test = np.zeros((x_test.shape[0], 3))

    for train_index, test_index in kf.split(x_train, y_train):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict_proba(x_te)
        oof_test += clf.predict_proba(x_test)
        
    return oof_train, oof_test / kf.n_splits

kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)

et_oof_train, et_oof_test = get_oof(et, train_X, train_y, test_X) 
rf_oof_train, rf_oof_test = get_oof(rf,train_X, train_y, test_X) 
ada_oof_train, ada_oof_test = get_oof(ada, train_X, train_y, test_X) 
gb_oof_train, gb_oof_test = get_oof(gb,train_X, train_y, test_X) 
xgb_oof_train, xgb_oof_test = get_oof(xgb,train_X, train_y, test_X)
lgb_oof_train, lgb_oof_test = get_oof(lgb,train_X, train_y, test_X) 

  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "


In [66]:
x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, 
                          gb_oof_train, xgb_oof_train, lgb_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, 
                         gb_oof_test, xgb_oof_test, lgb_oof_test), axis=1)

In [69]:
gbm = XGBClassifier(
    #learning_rate = 0.02,
 n_estimators= 2000,
 max_depth= 4,
 min_child_weight= 2,
 #gamma=1,
 gamma=0.9,                        
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread= -1,
 scale_pos_weight=1).fit(x_train, train_y)
predictions = gbm.predict_proba(x_test)

In [71]:
out_df = pd.DataFrame(predictions)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv("xgb_baseline3.csv", index=False)