### LightGBM from Microsoft for classification

In [4]:
import numpy as np
import pandas as pd
from scipy import sparse
import matplotlib.pyplot as plt
from collections import OrderedDict, defaultdict
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
import re, string, time
from nltk.stem.snowball import SnowballStemmer
import lightgbm as lgb

Custom functions for loading and preprocessing data.

In [2]:
def most_freq_vects(docs, max_feature=None, percent=None, token_pattern=r'(?u)\b\w\w+\b'):
    """Return dense token counts for the most frequent tokens in ``docs``.

    Parameters
    ----------
    docs : pandas.Series
        One document per element; the values are cast to unicode strings
        before being vectorized.
    max_feature : int, optional
        How many of the most frequent tokens to keep. A falsy value
        (``None`` or ``0``) defers to ``percent``.
    percent : float, optional
        Fraction of the vocabulary to keep when ``max_feature`` is falsy.
        If both are falsy the whole vocabulary is kept.
    token_pattern : str
        Regular expression passed to ``CountVectorizer``. The default is a
        raw string on purpose: in a normal string literal ``\\b`` is the
        backspace character, not a regex word boundary.

    Returns
    -------
    pandas.DataFrame
        One row per document (default 0..n-1 index) and one column per kept
        token, ordered by descending total count.
    """
    vect = CountVectorizer(token_pattern=token_pattern)
    feat_sparse = vect.fit_transform(docs.values.astype('U'))

    # Newer scikit-learn releases expose get_feature_names_out(); fall back to
    # the older get_feature_names() when it is not available. Fetch the
    # vocabulary once instead of re-deriving it for every use.
    if hasattr(vect, 'get_feature_names_out'):
        feature_names = list(vect.get_feature_names_out())
    else:
        feature_names = list(vect.get_feature_names())

    counts = np.asarray(feat_sparse.sum(axis=0)).ravel()
    # The frame's index is the column position inside feat_sparse, which is
    # what lets us slice the sparse matrix after sorting.
    freq_table = pd.DataFrame({'feature': feature_names, 'count': counts})
    freq_table = freq_table.sort_values('count', ascending=False)

    if not max_feature:
        if percent:
            max_feature = int(percent * len(feature_names))
        else:
            max_feature = len(feature_names)

    top = freq_table.iloc[:max_feature]
    # Select the wanted columns while still sparse so only the kept columns
    # are ever materialised as a dense array.
    dense_counts = feat_sparse[:, top.index.to_numpy()].toarray()
    return pd.DataFrame(dense_counts, columns=list(top['feature']))


def load_data(feat_dir='../feat_input', max_feature=100):
    """Load the pre-computed feature files and assemble train/test matrices.

    Parameters
    ----------
    feat_dir : str
        Directory holding ``basic_feat.json``, ``longtime_feat.csv`` and
        ``feat_stats_encoding.csv``.
    max_feature : int
        Number of most frequent listing-feature tokens to add as count
        columns.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, column_names, test_listing_ids)`` where
        rows with ``interest_level == -1`` form the test set and every other
        row the training set.
    """
    print('Loading features files')
    basic_feat = pd.read_json('{}/basic_feat.json'.format(feat_dir))
    longtime_feat = pd.read_csv('{}/longtime_feat.csv'.format(feat_dir))
    encoded_feat = pd.read_csv('{}/feat_stats_encoding.csv'.format(feat_dir))

    # apply ordinal encoding to categorical feature
    print('Ordinal encoding')
    # Drop a trailing carriage return so otherwise identical addresses
    # receive the same label.
    for col in ('display_address', 'street_address'):
        basic_feat[col] = basic_feat[col].replace(r'\r$', '', regex=True)
    categorical = ["display_address", "manager_id", "building_id", "street_address"]
    for f in categorical:
        if basic_feat[f].dtype == 'object':
            encoder = preprocessing.LabelEncoder()
            basic_feat[f] = encoder.fit_transform(list(basic_feat[f].values))

    all_feat = basic_feat.merge(longtime_feat, on='listing_id')
    all_feat = all_feat.merge(encoded_feat, on='listing_id')

    print("Features document-term matrix")
    stemmer = SnowballStemmer('english')
    # Strip digits and all punctuation except apostrophe and hyphen. The
    # characters are escaped before being placed in the character class;
    # unescaped, the "\]" pair inside string.punctuation is read as an escaped
    # bracket and the backslash itself is never removed.
    punct = ''.join(ch for ch in string.punctuation if ch not in "'-")
    strip_re = re.compile(r"[0-9]|[{}]".format(re.escape(punct)))

    def _to_document(raw_features):
        # Each listing feature becomes a single "feature_<word>_<word>" token
        # so that multi-word features survive the whitespace tokenizer below.
        tokens = []
        for item in raw_features:
            stemmed = stemmer.stem(strip_re.sub("", item))
            tokens.append('_'.join(['feature'] + stemmed.split()))
        return ' '.join(tokens)

    all_feat['features'] = all_feat['features'].apply(_to_document)
    vect_df = most_freq_vects(all_feat['features'], max_feature=max_feature, token_pattern=r"[^ ]+")

    # concat(axis=1) aligns on the index; make the row correspondence explicit
    # (vect_df has one row per document, in the same order as all_feat).
    vect_df.index = all_feat.index
    all_feat = pd.concat([all_feat, vect_df], axis=1)

    is_test = all_feat.interest_level == -1
    train = all_feat[~is_test].copy()
    test = all_feat[is_test].copy()
    y_train = train["interest_level"]

    dropped = ["interest_level", "features"]
    x_train = train.drop(dropped, axis=1)
    x_test = test.drop(dropped, axis=1)

    return x_train, y_train, x_test, x_test.columns.values, x_test.listing_id


def _preprocess(dtrain, dtest):
    # replace np.inf to np.nan
    dtrain = dtrain.replace([np.inf, -np.inf], np.nan)
    dtest = dtest.replace([np.inf, -np.inf], np.nan)

    # impute np.nan
    dtrain_col_mean = dtrain.mean(axis=0)
    dtrain, dtest = dtrain.fillna(dtrain_col_mean), dtest.fillna(dtrain_col_mean)

    # perform standardization
    dtrain_col_mean, dtrain_col_std = dtrain.mean(axis=0), dtrain.std(axis=0)
    dtrain, dtest = map(lambda x: (x - dtrain_col_mean) / dtrain_col_std, (dtrain, dtest))

    return dtrain, dtest


def _preprocess_log(dtrain, dtest):
    # replace np.inf to np.nan
    dtrain = dtrain.replace([np.inf, -np.inf], np.nan)
    dtest = dtest.replace([np.inf, -np.inf], np.nan)

    # impute np.nan
    dtrain_col_mean = dtrain.mean(axis=0)
    dtrain, dtest = dtrain.fillna(dtrain_col_mean), dtest.fillna(dtrain_col_mean)

    # log transform of min-zero columns
    dtrain_col_min = dtrain.min(axis=0)
    zero_min_index = dtrain_col_min[dtrain_col_min >= 0].index

    dtrain[zero_min_index] = np.log10(dtrain[zero_min_index] + 1.0)
    dtest[zero_min_index] = np.log10(dtest[zero_min_index] + 1.0)

    # perform standardization
    dtrain_col_mean, dtrain_col_std = dtrain.mean(axis=0), dtrain.std(axis=0)
    dtrain, dtest = map(lambda x: (x - dtrain_col_mean) / dtrain_col_std, (dtrain, dtest))

    return dtrain, dtest

Calculate performance using 5-fold cross-validation with the built-in function `lgb.cv`.

In [24]:
def lgb_cv(dtrain, n_iters=10000, early_stop_rounds=250, params=None, nfold=5, seed=816):
    """Run stratified k-fold cross-validation with ``lgb.cv``.

    Parameters
    ----------
    dtrain : lightgbm.Dataset
        Training data including labels.
    n_iters : int
        Maximum number of boosting rounds.
    early_stop_rounds : int
        Passed to ``lgb.cv`` as ``early_stopping_rounds``.
    params : dict, optional
        Overrides merged on top of the default 3-class configuration below.
    nfold : int
        Number of cross-validation folds.
    seed : int
        Seed handed to ``lgb.cv``.

    Returns
    -------
    The object returned by ``lgb.cv``.
    """
    print('Start lightGBM cross-validation')
    cv_params = {'boosting': 'gbdt',
                 'application': 'multiclass',
                 'learning_rate': 0.03,
                 # Superseded by the explicit `metrics=` argument below.
                 'metric': 'multi_logloss',
                 'max_depth': 5,
                 'lambda_l2': 10,
                 'feature_fraction': 0.7,
                 # NOTE(review): per the LightGBM parameter docs bagging is
                 # disabled while bagging_freq is 0, so bagging_fraction has
                 # no effect with these defaults. Kept as-is so results stay
                 # comparable; pass params={'bagging_freq': k} to enable it.
                 'bagging_fraction': 0.7,
                 'bagging_freq': 0,
                 'num_threads': 4,
                 'num_class': 3,
                 }
    if params:
        cv_params.update(params)

    # NOTE(review): `verbose_eval` and `early_stopping_rounds` are keyword
    # arguments of older LightGBM releases; newer releases expect callbacks
    # instead. Confirm against the installed version.
    return lgb.cv(params=cv_params,
                  train_set=dtrain,
                  num_boost_round=n_iters,
                  nfold=nfold,
                  stratified=True,
                  metrics=['multi_error', 'multi_logloss'],
                  verbose_eval=50,
                  early_stopping_rounds=early_stop_rounds,
                  seed=seed)

In [25]:
# Experiment 1: cross-validate on the features exactly as load_data() returns
# them (no imputation, no scaling). The last two return values of load_data()
# (test column names and test listing ids) are not needed here.
X_train, y_train, X_test, _, _ = load_data()
dtrain = lgb.Dataset(X_train, label=y_train)
# The test features are wrapped as well, but nothing in this cell reads dtest.
dtest = lgb.Dataset(X_test)

# Run lgb_cv() on the training Dataset and keep what lgb.cv returns.
lgb_cv_hist = lgb_cv(dtrain)

Loading features files
Ordinal encoding
Features document-term matrix
Start lightGBM cross-validation
[50]	cv_agg's multi_logloss: 0.670743 + 0.00369493	cv_agg's multi_error: 0.257841 + 0.0029109
[100]	cv_agg's multi_logloss: 0.585834 + 0.00440668	cv_agg's multi_error: 0.250283 + 0.00271402
[150]	cv_agg's multi_logloss: 0.558139 + 0.00528378	cv_agg's multi_error: 0.245724 + 0.00340572
[200]	cv_agg's multi_logloss: 0.545208 + 0.00569631	cv_agg's multi_error: 0.241084 + 0.00353598
[250]	cv_agg's multi_logloss: 0.536845 + 0.00613826	cv_agg's multi_error: 0.237761 + 0.00371065
[300]	cv_agg's multi_logloss: 0.531038 + 0.00642768	cv_agg's multi_error: 0.235897 + 0.00362262
[350]	cv_agg's multi_logloss: 0.526804 + 0.00661905	cv_agg's multi_error: 0.23456 + 0.00374246
[400]	cv_agg's multi_logloss: 0.523813 + 0.00671971	cv_agg's multi_error: 0.233526 + 0.00399136
[450]	cv_agg's multi_logloss: 0.521442 + 0.00676792	cv_agg's multi_error: 0.232351 + 0.0037019
[500]	cv_agg's multi_logloss: 0.519334

In [26]:
# Experiment 2: same cross-validation, but on features passed through
# _preprocess() (inf -> NaN, mean imputation and standardization, all based on
# training-set statistics). The data is reloaded from disk so this cell does
# not depend on the frames left behind by another cell.
X_train, y_train, X_test, _, _ = load_data()
X_train, X_test = _preprocess(X_train, X_test)
dtrain = lgb.Dataset(X_train, label=y_train)
# The test features are wrapped as well, but nothing in this cell reads dtest.
dtest = lgb.Dataset(X_test)

# Run lgb_cv() on the training Dataset; this rebinds lgb_cv_hist.
lgb_cv_hist = lgb_cv(dtrain)

Loading features files
Ordinal encoding
Features document-term matrix
Start lightGBM cross-validation
[50]	cv_agg's multi_logloss: 0.670758 + 0.00379916	cv_agg's multi_error: 0.257558 + 0.00327367
[100]	cv_agg's multi_logloss: 0.585866 + 0.0044336	cv_agg's multi_error: 0.250365 + 0.00238552
[150]	cv_agg's multi_logloss: 0.55819 + 0.00509038	cv_agg's multi_error: 0.245015 + 0.00371422
[200]	cv_agg's multi_logloss: 0.545114 + 0.00556057	cv_agg's multi_error: 0.240983 + 0.00397213
[250]	cv_agg's multi_logloss: 0.536811 + 0.00600708	cv_agg's multi_error: 0.23764 + 0.00371976
[300]	cv_agg's multi_logloss: 0.530873 + 0.00631041	cv_agg's multi_error: 0.23616 + 0.0040185
[350]	cv_agg's multi_logloss: 0.526661 + 0.00654293	cv_agg's multi_error: 0.233567 + 0.00394095
[400]	cv_agg's multi_logloss: 0.523654 + 0.006598	cv_agg's multi_error: 0.232695 + 0.00421412
[450]	cv_agg's multi_logloss: 0.521304 + 0.00657527	cv_agg's multi_error: 0.231865 + 0.00371565
[500]	cv_agg's multi_logloss: 0.519348 + 0

In [27]:
# Experiment 3: same cross-validation, but on features passed through
# _preprocess_log() (as _preprocess(), plus log10(x + 1) on the columns whose
# training minimum is >= 0). The data is reloaded from disk so this cell does
# not depend on the frames left behind by another cell.
X_train, y_train, X_test, _, _ = load_data()
X_train, X_test = _preprocess_log(X_train, X_test)
dtrain = lgb.Dataset(X_train, label=y_train)
# The test features are wrapped as well, but nothing in this cell reads dtest.
dtest = lgb.Dataset(X_test)

# Run lgb_cv() on the training Dataset; this rebinds lgb_cv_hist.
lgb_cv_hist = lgb_cv(dtrain)

Loading features files
Ordinal encoding
Features document-term matrix
Start lightGBM cross-validation
[50]	cv_agg's multi_logloss: 0.670822 + 0.00382203	cv_agg's multi_error: 0.257355 + 0.00337708
[100]	cv_agg's multi_logloss: 0.585802 + 0.00450364	cv_agg's multi_error: 0.250121 + 0.00274774
[150]	cv_agg's multi_logloss: 0.558062 + 0.00512368	cv_agg's multi_error: 0.245056 + 0.00324044
[200]	cv_agg's multi_logloss: 0.544917 + 0.00560101	cv_agg's multi_error: 0.240942 + 0.00378651
[250]	cv_agg's multi_logloss: 0.536637 + 0.00604004	cv_agg's multi_error: 0.237822 + 0.00394783
[300]	cv_agg's multi_logloss: 0.530764 + 0.00644503	cv_agg's multi_error: 0.235877 + 0.00396739
[350]	cv_agg's multi_logloss: 0.526624 + 0.00664894	cv_agg's multi_error: 0.234316 + 0.00429234
[400]	cv_agg's multi_logloss: 0.523617 + 0.0067692	cv_agg's multi_error: 0.23306 + 0.00399839
[450]	cv_agg's multi_logloss: 0.521325 + 0.00674561	cv_agg's multi_error: 0.232027 + 0.00432905
[500]	cv_agg's multi_logloss: 0.51932