## 5-Fold XGB CV

### Problem of worse cv score with `json` file

`basic_feat` is imported from a pre-generated `json` file. The problem of a worse CV score when using the `json` file has been resolved. The cause was the following slicing in the underlying code:

```python
tr_sparse = vect_sparse[:train_num]
te_sparse = vect_sparse[train_num:]
```

This slicing assumes that the first `train_num` rows of `all_feat` contain only training data, which is unfortunately not the case for the `json` file. So the following line was added to the code; sorting by `interest_level` in descending order puts the test rows (`interest_level == -1`) last:

```python
all_feat = all_feat.sort_values('interest_level', ascending=False).reset_index()
```

### Added custom function `most_freq_vects`

Selects the most frequent terms in the `features` column and returns them together with the corresponding document-term matrix restricted to those terms.

### Pre-clean the features column data

Remove digits (`[0-9]`) and all punctuation except the apostrophe (`'`) and the hyphen (`-`).

In [6]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb
from sklearn.externals import joblib
import re
from nltk.stem.snowball import SnowballStemmer
from scipy.sparse import csr_matrix

In [7]:
def most_freq_vects(docs, max_feature=None, percent=None, token_pattern=r'(?u)\b\w\w+\b'):
    """Return the most frequent terms of ``docs`` and their document-term matrix.

    Parameters
    ----------
    docs : pandas.Series
        One text document per row; values are cast to unicode before
        vectorizing.
    max_feature : int, optional
        Number of top terms to keep. When falsy (``None`` or ``0``) it is
        derived from ``percent`` or, failing that, the full vocabulary is kept.
    percent : float, optional
        Fraction of the vocabulary to keep; only used when ``max_feature`` is
        falsy.
    token_pattern : str
        Regular expression handed to ``CountVectorizer``. The default is a raw
        string so that ``\\b`` is a word boundary and not a backspace character.

    Returns
    -------
    names : list of str
        The selected terms, ordered by descending total count.
    matrix : scipy.sparse.csr_matrix
        Document-term counts with one column per entry of ``names``, in the
        same order.
    """
    vect = CountVectorizer(token_pattern=token_pattern)
    feat_sparse = vect.fit_transform(docs.values.astype('U'))

    # Newer scikit-learn releases expose get_feature_names_out(); fall back to
    # the older get_feature_names() when it is not available.
    get_names = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
    vocab = [str(name) for name in get_names()]

    # Total occurrences of every term over all documents. The row index of
    # freq_table equals the term's column position in feat_sparse.
    counts = np.asarray(feat_sparse.sum(axis=0)).ravel()
    freq_table = pd.DataFrame(list(zip(vocab, counts)), columns=['feature', 'count'])
    freq_table = freq_table.sort_values('count', ascending=False)

    if not max_feature:
        if percent:
            max_feature = int(percent * len(vocab))
        else:
            max_feature = len(vocab)

    top = freq_table.iloc[:max_feature]
    names = list(top['feature'])
    # Select the columns directly on the sparse matrix instead of densifying
    # the whole document-term matrix first.
    return names, csr_matrix(feat_sparse[:, top.index.values])

In [8]:
def xgb_data_prep():
    """Load the pre-generated feature files and build sparse train/test matrices.

    Reads ``feat_input/basic_feat.json``, ``feat_input/longtime_feat.csv`` and
    ``feat_input/feat_stats_encoding.csv``, label-encodes the object-typed
    categorical columns, merges everything on ``listing_id`` and appends a
    document-term matrix of the 100 most frequent cleaned ``features`` tokens.

    Rows with ``interest_level == -1`` are treated as the test set; all other
    rows form the training set.

    Returns
    -------
    x_train : scipy.sparse.csr_matrix
        Training features (tabular columns followed by the term counts).
    y_train : pandas.Series
        ``interest_level`` of the training rows.
    x_test : scipy.sparse.csr_matrix
        Test features, same column layout as ``x_train``.
    listing_id : pandas.Series
        ``listing_id`` of the test rows.
    """
    import string

    basic_feat = pd.read_json('feat_input/basic_feat.json')
    longtime_feat = pd.read_csv('feat_input/longtime_feat.csv')
    encoded_feat = pd.read_csv('feat_input/feat_stats_encoding.csv')
    print('Loading features finished')

    # apply ordinal encoding to categorical feature
    # (a trailing carriage return is stripped from the address columns first)
    basic_feat.display_address = basic_feat.display_address.replace(r'\r$', '', regex=True)
    basic_feat.street_address = basic_feat.street_address.replace(r'\r$', '', regex=True)
    categorical = ["display_address", "manager_id", "building_id", "street_address"]
    for f in categorical:
        if basic_feat[f].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(basic_feat[f].values))
            basic_feat[f] = lbl.transform(list(basic_feat[f].values))
    print('Ordinal encoding finished')

    all_feat = basic_feat.merge(longtime_feat, on='listing_id')
    all_feat = all_feat.merge(encoded_feat, on='listing_id')

    # Descending sort puts the test rows (interest_level == -1) last, so the
    # positional split of the document-term matrix below lines up with
    # train/test.
    # NOTE(review): reset_index() without drop=True keeps the previous index as
    # an 'index' column, which is not dropped below and therefore ends up in
    # x_train/x_test as a feature -- confirm this is intended.
    all_feat = all_feat.sort_values('interest_level', ascending=False).reset_index()
    train = all_feat[all_feat.interest_level != -1].copy()
    test = all_feat[all_feat.interest_level == -1].copy()
    y_train = train["interest_level"]

    train_num = train.shape[0]
    stemmer = SnowballStemmer('english')

    # Strip digits and every punctuation character except "'" and "-".
    # re.escape keeps characters such as "\", "]" and "^" literal inside the
    # character class, so the backslash is stripped as well.
    punct = ''.join(ch for ch in string.punctuation if ch not in "'-")
    pattern = re.compile(r"[0-9{}]".format(re.escape(punct)))

    def _to_document(raw_features):
        # Clean, stem and prefix each entry, then join the entries of one
        # listing into a single space-separated document.
        # Assumes each cell is a list of strings -- TODO confirm.
        tokens = []
        for item in raw_features:
            stemmed = stemmer.stem(pattern.sub("", item))
            tokens.append('_'.join(['feature'] + stemmed.split()))
        return ' '.join(tokens)

    all_feat['features'] = all_feat['features'].apply(_to_document)

    vect_names, vect_sparse = most_freq_vects(all_feat['features'], max_feature=100, token_pattern=r"[^ ]+")
    print(vect_names)

    # Relies on the sort above: the first train_num rows are the training rows.
    tr_sparse = vect_sparse[:train_num]
    te_sparse = vect_sparse[train_num:]
    print("Document-term matrix finished")

    x_train = train.drop(["interest_level", "features"], axis=1)
    x_test = test.drop(["interest_level", "features"], axis=1)

    # astype(float) requires every remaining column to be numeric.
    x_train = sparse.hstack([x_train.astype(float), tr_sparse.astype(float)]).tocsr()
    x_test = sparse.hstack([x_test.astype(float), te_sparse.astype(float)]).tocsr()

    return x_train, y_train, x_test, test.listing_id

In [9]:
def xgb_cv(dtrain, num_rounds = 50000, early_stop_rounds=250):
    """Run stratified 5-fold xgboost cross-validation on ``dtrain``.

    Parameters
    ----------
    dtrain : xgboost.DMatrix
        Labelled training data.
    num_rounds : int
        Upper bound on the number of boosting rounds.
    early_stop_rounds : int
        Value passed to ``xgb.cv`` as ``early_stopping_rounds``.

    Returns
    -------
    The object returned by ``xgb.cv``.
    """
    print('Start xgboost cross-validation')

    # Three-class soft-probability objective scored with multi-class log loss.
    task_settings = {
        'booster': 'gbtree',
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
    }
    # Tree complexity and regularisation.
    tree_settings = {
        'gamma': 1,
        'min_child_weight': 1.5,
        'max_depth': 5,
        'lambda': 10,
    }
    # Row and column subsampling.
    sampling_settings = {
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'colsample_bylevel': 0.7,
    }
    # Learning rate, tree builder, seed, threading, class count, verbosity.
    run_settings = {
        'eta': 0.03,
        'tree_method': 'exact',
        'seed': 36683,
        'nthread': 4,
        'num_class': 3,
        'silent': 1,
    }

    booster_params = {}
    for group in (task_settings, tree_settings, sampling_settings, run_settings):
        booster_params.update(group)

    cv_history = xgb.cv(
        params=booster_params,
        dtrain=dtrain,
        num_boost_round=num_rounds,
        nfold=5,
        stratified=True,
        verbose_eval=50,
        early_stopping_rounds=early_stop_rounds,
    )
    return cv_history

In [10]:
# Build the sparse train/test feature matrices, the training labels and the
# listing ids of the test rows.
X_train, y_train, X_test, listing_id = xgb_data_prep()
# Wrap the matrices in xgboost DMatrix objects; only the training set carries labels.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

# Run the cross-validation on the training set and keep the result returned by xgb_cv.
xgb_cv_hist = xgb_cv(dtrain)

Loading features finished
Ordinal encoding finished
['feature_elev', 'feature_cats_allow', 'feature_hardwood_floor', 'feature_dogs_allow', 'feature_doorman', 'feature_dishwash', 'feature_laundry_in_build', 'feature_no_fe', 'feature_fitness_cent', 'feature_laundry_in_unit', 'feature_pre-war', 'feature_roof_deck', 'feature_outdoor_spac', 'feature_dining_room', 'feature_high_speed_internet', 'feature_balconi', 'feature_swimming_pool', 'feature_new_construct', 'feature_terrac', 'feature_exclus', 'feature_loft', 'feature_gardenpatio', 'feature_prewar', 'feature_wheelchair_access', 'feature_common_outdoor_spac', 'feature_hardwood', 'feature_simplex', 'feature_fireplac', 'feature_high_ceil', 'feature_lowris', 'feature_garag', 'feature_reduced_fe', 'feature_laundry_room', 'feature_furnish', 'feature_multi-level', 'feature_private_outdoor_spac', 'feature_parking_spac', 'feature_publicoutdoor', 'feature_roof-deck', 'feature_live_in_sup', 'feature_renov', 'feature_pool', 'feature_on-site_laundri'