In [9]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb
from sklearn.externals import joblib
import re
from nltk.stem.snowball import SnowballStemmer

In [10]:
def xgb_data_prep():
    """Load the pre-computed feature tables and build sparse train/test matrices.

    Reads three CSV files from ``feat_input/``, ordinal-encodes the string-typed
    categorical columns, joins the tables on ``listing_id`` and appends a
    bag-of-words matrix (100 most frequent tokens) built from the ``features``
    text column. Rows whose ``interest_level`` is ``-1`` form the test set;
    every other row is treated as training data.

    Returns:
        x_train: CSR matrix for the training rows (table columns followed by
            the token-count columns).
        y_train: ``interest_level`` Series of the training rows.
        x_test: CSR matrix for the test rows, same column layout as ``x_train``.
        listing_id: ``listing_id`` Series of the test rows.
    """
    basic_feat = pd.read_csv('feat_input/basic_feat.csv')
    longtime_feat = pd.read_csv('feat_input/longtime_feat.csv')
    encoded_feat = pd.read_csv('feat_input/feat_stats_encoding.csv')
    print('Loading features finished')

    # Apply ordinal encoding to the categorical columns. Each encoder is fit on
    # the whole table (training and test rows together), so both share one mapping.
    categorical = ["display_address", "manager_id", "building_id", "street_address"]
    for f in categorical:
        if basic_feat[f].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(basic_feat[f].values))
            basic_feat[f] = lbl.transform(list(basic_feat[f].values))
    print('Ordinal encoding finished')

    all_feat = basic_feat.merge(longtime_feat, on='listing_id')
    all_feat = all_feat.merge(encoded_feat, on='listing_id')

    # One boolean mask drives every train/test split below, so the dense rows
    # and the sparse text rows can never get out of step with each other.
    is_train = (all_feat.interest_level != -1).values
    train = all_feat[is_train].copy()
    test = all_feat[~is_train].copy()
    y_train = train["interest_level"]

    stemmer = SnowballStemmer('english')

    def _to_document(raw):
        # Split the raw text into phrases, stem each phrase, then collapse every
        # phrase into a single "feature_<word>_<word>" token so the vectorizer
        # counts whole phrases instead of individual words.
        phrases = re.findall(r"[\w'|\w\-|\ ]+", raw)
        stemmed = [stemmer.stem(phrase) for phrase in phrases]
        return ' '.join('_'.join(['feature'] + s.split()) for s in stemmed)

    documents = all_feat['features'].apply(_to_document)
    dtm = CountVectorizer(stop_words='english', max_features=100, token_pattern=r"[\w\-|\w']+")

    all_sparse = dtm.fit_transform(documents.values.astype('U'))
    print(dtm.get_feature_names())

    # Select by mask rather than by position ([:n] / [n:]): positional slicing is
    # only correct when every training row precedes every test row in the merged
    # table, and silently misaligns text features with listings otherwise.
    tr_sparse = all_sparse[is_train]
    te_sparse = all_sparse[~is_train]
    print("Document-term matrix finished")

    x_train = train.drop(["interest_level", "features"], axis=1)
    x_test = test.drop(["interest_level", "features"], axis=1)

    x_train = sparse.hstack([x_train.astype(float), tr_sparse.astype(float)]).tocsr()
    x_test = sparse.hstack([x_test.astype(float), te_sparse.astype(float)]).tocsr()

    return x_train, y_train, x_test, test.listing_id

In [11]:
def xgb_cv(dtrain, num_rounds = 50000, early_stop_rounds=250):
    """Cross-validate a 3-class gradient-boosted tree model on ``dtrain``.

    Runs ``xgb.cv`` with 5 stratified folds, the ``multi:softprob`` objective
    and ``mlogloss`` as the evaluation metric, printing progress every 50 rounds.

    Args:
        dtrain: training data, passed to ``xgb.cv`` unchanged.
        num_rounds: upper bound on the number of boosting rounds.
        early_stop_rounds: forwarded as ``early_stopping_rounds``.

    Returns:
        Whatever ``xgb.cv`` returns for this configuration (its evaluation history).
    """
    print('Start xgboost cross-validation')

    # Booster configuration; keys and values are fixed for this experiment.
    booster_settings = {
        'booster': 'gbtree',
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'gamma': 1,
        'min_child_weight': 1.5,
        'max_depth': 5,
        'lambda': 10,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'colsample_bylevel': 0.7,
        'eta': 0.03,
        'tree_method': 'exact',
        'seed': 36683,
        'nthread': 4,
        'num_class': 3,
        'silent': 1,
    }

    return xgb.cv(
        params=booster_settings,
        dtrain=dtrain,
        num_boost_round=num_rounds,
        nfold=5,
        stratified=True,
        verbose_eval=50,
        early_stopping_rounds=early_stop_rounds,
    )

In [12]:
# Build the train/test feature matrices, the training labels and the test listing ids.
X_train, y_train, X_test, listing_id = xgb_data_prep()
# Wrap both matrices as XGBoost DMatrix objects; only the training one carries labels.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

# Cross-validate on the training DMatrix using xgb_cv's default round and
# early-stopping settings, keeping the returned history. This cell only runs
# cross-validation; no final model is fitted here and dtest is not used yet.
xgb_cv_hist = xgb_cv(dtrain)

Loading features finished
Ordinal encoding finished
['feature', 'feature_actual_apt', 'feature_balconi', 'feature_bike_room', 'feature_building-common-outdoor-spac', 'feature_c', 'feature_cats_allow', 'feature_central_a', "feature_children's_playroom", 'feature_childrens_playroom', 'feature_common_outdoor_spac', 'feature_common_park', 'feature_common_roof_deck', 'feature_concierg', 'feature_courtyard', 'feature_dining_room', 'feature_dishwash', 'feature_dogs_allow', 'feature_doorman', 'feature_dryer', 'feature_dryer_in_unit', 'feature_duplex', 'feature_eat_in_kitchen', 'feature_elev', 'feature_exclus', 'feature_exposed_brick', 'feature_fireplac', 'feature_fit', 'feature_fitness_cent', 'feature_full-time_doorman', 'feature_furnish', 'feature_garag', 'feature_garden', 'feature_granite_kitchen', 'feature_green_build', 'feature_gym', 'feature_hardwood', 'feature_hardwood_floor', 'feature_high_ceil', 'feature_high_speed_internet', 'feature_highris', 'feature_indoor_pool', 'feature_laundri',