In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
import matplotlib.pyplot as plt
from collections import OrderedDict, defaultdict
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
import re, string, time
from nltk.stem.snowball import SnowballStemmer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import log_loss

Custom functions for loading and preprocessing data.

In [2]:
def most_freq_vects(docs, max_feature=None, percent=None, token_pattern=r'(?u)\b\w\w+\b'):
    """Build a document-term count table restricted to the most frequent terms.

    Parameters
    ----------
    docs : pandas.Series
        One text document per entry; values are cast to unicode strings
        before being vectorized.
    max_feature : int, optional
        Number of most frequent terms to keep. When falsy, `percent` is used.
    percent : float, optional
        Fraction of the vocabulary size to keep; only consulted when
        `max_feature` is falsy. When both are falsy every term is kept.
    token_pattern : str
        Regular expression handed to `CountVectorizer` to define a token.

    Returns
    -------
    pandas.DataFrame
        One row per document and one count column per kept term, with the
        columns ordered by descending total count.
    """
    # The default pattern must be a raw string: without the r-prefix, "\b" is
    # a backspace character rather than a regex word boundary.
    vect = CountVectorizer(token_pattern=token_pattern)
    feat_sparse = vect.fit_transform(docs.values.astype('U'))

    # get_feature_names() is absent from newer scikit-learn releases; use its
    # replacement when available and fall back otherwise.
    if hasattr(vect, 'get_feature_names_out'):
        vocab = list(vect.get_feature_names_out())
    else:
        vocab = list(vect.get_feature_names())

    totals = np.asarray(feat_sparse.sum(axis=0)).ravel()
    freq_table = pd.DataFrame(list(zip(vocab, totals)), columns=['feature', 'count'])
    freq_table = freq_table.sort_values('count', ascending=False)

    if not max_feature:
        max_feature = int(percent * len(vocab)) if percent else len(vocab)

    top = freq_table.iloc[:max_feature]
    names = list(top['feature'])
    # freq_table was built in vocabulary order, so its index labels are the
    # column positions in the sparse matrix. Selecting those columns before
    # densifying avoids materializing the full vocabulary.
    col_idx = top.index.values
    return pd.DataFrame(feat_sparse[:, col_idx].toarray(), columns=names)


def load_data():
    """Load the feature files, encode them and split into train/test parts.

    Reads three files under ``../feat_input/`` and merges them on
    ``listing_id``. The four categorical columns are label-encoded when they
    have object dtype. The ``features`` column (a list of strings per row) is
    cleaned, stemmed and turned into count columns for the 100 most frequent
    tokens. Rows whose ``interest_level`` equals -1 form the test part; all
    other rows form the training part.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, column_names, test_listing_ids)`` where
        ``column_names`` is the array of ``x_test`` column names and
        ``test_listing_ids`` is the ``listing_id`` column of ``x_test``.
    """
    print('Loading features files')
    basic_feat = pd.read_json('../feat_input/basic_feat.json')
    longtime_feat = pd.read_csv('../feat_input/longtime_feat.csv')
    encoded_feat = pd.read_csv('../feat_input/feat_stats_encoding.csv')

    # apply ordinal encoding to categorical feature
    print('Ordinal encoding')
    # Strip a trailing carriage return from the address strings
    # (presumably so otherwise-identical addresses get the same code).
    basic_feat.display_address = basic_feat.display_address.replace(r'\r$', '', regex=True)
    basic_feat.street_address = basic_feat.street_address.replace(r'\r$', '', regex=True)
    categorical = ["display_address", "manager_id", "building_id", "street_address"]
    for f in categorical:
        if basic_feat[f].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(basic_feat[f].values))
            basic_feat[f] = lbl.transform(list(basic_feat[f].values))

    all_feat = basic_feat.merge(longtime_feat, on='listing_id')
    all_feat = all_feat.merge(encoded_feat, on='listing_id')

    print("Features document-term matrix")
    stemmer = SnowballStemmer('english')
    # Remove digits and all punctuation except the apostrophe and the hyphen.
    punct = string.punctuation
    punct = re.sub("'|-", "", punct)
    # Escape the punctuation before placing it in a character class: unescaped,
    # the "\]" pair is read as an escaped bracket, so backslashes were never
    # stripped, and the bare "[" triggers a nested-set warning.
    pattern = r"[0-9]|[{}]".format(re.escape(punct))
    all_feat['features'] = all_feat['features'].apply(lambda x: [re.sub(pattern, "", y) for y in x])
    all_feat['features'] = all_feat['features'].apply(lambda x: [stemmer.stem(y) for y in x])
    # Turn each phrase into a single token: "feature" plus its words, joined by "_".
    all_feat['features'] = all_feat['features'].apply(lambda x: ['_'.join(['feature'] + y.split()) for y in x])
    all_feat['features'] = all_feat['features'].apply(lambda x: ' '.join(x))
    # Tokens are split on spaces only, so the underscore-joined phrases stay intact.
    vect_df = most_freq_vects(all_feat['features'], max_feature=100, token_pattern=r"[^ ]+")

    all_feat = pd.concat([all_feat, vect_df], axis=1)
    train = all_feat[all_feat.interest_level != -1].copy()
    test = all_feat[all_feat.interest_level == -1].copy()
    y_train = train["interest_level"]

    x_train = train.drop(["interest_level", "features"], axis=1)
    x_test = test.drop(["interest_level", "features"], axis=1)

    return x_train, y_train, x_test, x_test.columns.values, x_test.listing_id


def _preprocess(dtrain, dtest):
    # replace np.inf to np.nan
    dtrain = dtrain.replace([np.inf, -np.inf], np.nan)
    dtest = dtest.replace([np.inf, -np.inf], np.nan)

    # impute np.nan
    dtrain_col_mean = dtrain.mean(axis=0)
    dtrain, dtest = dtrain.fillna(dtrain_col_mean), dtest.fillna(dtrain_col_mean)

    # perform standardization
    dtrain_col_mean, dtrain_col_std = dtrain.mean(axis=0), dtrain.std(axis=0)
    dtrain, dtest = map(lambda x: (x - dtrain_col_mean) / dtrain_col_std, (dtrain, dtest))

    return dtrain, dtest


def _preprocess_log(dtrain, dtest):
    # replace np.inf to np.nan
    dtrain = dtrain.replace([np.inf, -np.inf], np.nan)
    dtest = dtest.replace([np.inf, -np.inf], np.nan)

    # impute np.nan
    dtrain_col_mean = dtrain.mean(axis=0)
    dtrain, dtest = dtrain.fillna(dtrain_col_mean), dtest.fillna(dtrain_col_mean)

    # log transform of min-zero columns
    dtrain_col_min = dtrain.min(axis=0)
    zero_min_index = dtrain_col_min[dtrain_col_min >= 0].index

    dtrain[zero_min_index] = np.log10(dtrain[zero_min_index] + 1.0)
    dtest[zero_min_index] = np.log10(dtest[zero_min_index] + 1.0)

    # perform standardization
    dtrain_col_mean, dtrain_col_std = dtrain.mean(axis=0), dtrain.std(axis=0)
    dtrain, dtest = map(lambda x: (x - dtrain_col_mean) / dtrain_col_std, (dtrain, dtest))

    return dtrain, dtest

Use sklearn's built-in `LogisticRegressionCV` estimator to search for the value of the regularization parameter `C` that gives the best cross-validated prediction performance.

In [3]:
def lr2cv(preprocess='linear'):
    """Fit a cross-validated logistic regression with the liblinear solver.

    Loads the data, preprocesses it with `_preprocess_log` when `preprocess`
    is ``'log'`` and with `_preprocess` otherwise, then searches over three
    values of C using 5-fold cross-validation.

    Returns the fitted `LogisticRegressionCV` estimator.
    """
    features_train, labels_train, features_test, _, _ = load_data()
    transform = _preprocess_log if preprocess == 'log' else _preprocess
    features_train, features_test = transform(features_train, features_test)

    search = LogisticRegressionCV(
        Cs=[0.1, 1, 10],
        cv=5,
        solver='liblinear',
        n_jobs=-1,
        verbose=1,
        max_iter=500,
        random_state=816,
    )
    search.fit(features_train, labels_train)
    return search

In [4]:
# Run the liblinear C search on the log-transformed, standardized features.
clf = lr2cv(preprocess='log')

Loading features files
Ordinal encoding
Features document-term matrix


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 11.1min finished


In [5]:
# dict with classes as the keys; each value holds one row per CV fold and
# one column per candidate C
clf.scores_

{0: array([[ 0.80731436,  0.80630129,  0.80549083],
        [ 0.8123797 ,  0.81258231,  0.8138993 ],
        [ 0.80488299,  0.80437646,  0.80397123],
        [ 0.80042549,  0.80083072,  0.8005268 ],
        [ 0.79134576,  0.79154844,  0.79185245]]),
 1: array([[ 0.77175565,  0.77236349,  0.77155303],
        [ 0.77236349,  0.77266741,  0.77226218],
        [ 0.77135042,  0.77165434,  0.77226218],
        [ 0.76932428,  0.76831121,  0.76780468],
        [ 0.76520065,  0.76661938,  0.76793677]]),
 2: array([[ 0.92827474,  0.92807213,  0.92807213],
        [ 0.9306048 ,  0.93070611,  0.93070611],
        [ 0.92989565,  0.93009827,  0.92999696],
        [ 0.93080742,  0.93101003,  0.93070611],
        [ 0.92764491,  0.92784759,  0.92754358]])}

In [6]:
# Average each class's scores over the folds, leaving one value per candidate C.
dic_scores = {label: np.mean(fold_scores, axis=0) for label, fold_scores in clf.scores_.items()}
dic_scores

{0: array([ 0.80326966,  0.80312784,  0.80314812]),
 1: array([ 0.7699989 ,  0.77032317,  0.77036377]),
 2: array([ 0.92944551,  0.92954682,  0.92940498])}

In [7]:
# Candidate C values that were searched.
clf.Cs_

array([  0.1,   1. ,  10. ])

In [8]:
# C selected for each class (one entry per class).
clf.C_

array([  0.1,  10. ,   1. ])

In [9]:
# Solver iteration counts, indexed by class, then fold, then candidate C.
clf.n_iter_

array([[[ 7,  8, 12],
        [ 7, 10, 25],
        [ 6,  9, 11],
        [ 9,  8, 11],
        [ 6,  7, 12]],

       [[ 7, 10, 24],
        [ 7,  9, 13],
        [ 7,  8, 11],
        [ 9, 30, 41],
        [ 8, 20, 14]],

       [[ 9, 12, 30],
        [ 8, 16, 32],
        [ 8, 10, 19],
        [ 8, 12, 30],
        [ 8, 10, 82]]], dtype=int32)

Cross-validated training with another set of parameters: `solver='lbfgs'` and `multi_class='multinomial'`.

In [10]:
def lr2cv2(preprocess='linear'):
    """Fit a cross-validated multinomial logistic regression with lbfgs.

    Loads the data, preprocesses it with `_preprocess_log` when `preprocess`
    is ``'log'`` and with `_preprocess` otherwise, then searches over three
    values of C using 5-fold cross-validation.

    Returns the fitted `LogisticRegressionCV` estimator.
    """
    features_train, labels_train, features_test, _, _ = load_data()
    transform = _preprocess_log if preprocess == 'log' else _preprocess
    features_train, features_test = transform(features_train, features_test)

    search = LogisticRegressionCV(
        Cs=[0.1, 1, 10],
        cv=5,
        solver='lbfgs',
        n_jobs=-1,
        verbose=1,
        max_iter=10000,
        multi_class='multinomial',
        random_state=816,
    )
    search.fit(features_train, labels_train)
    return search

In [11]:
# Run the multinomial lbfgs C search on the log-transformed, standardized features.
clf2 = lr2cv2(preprocess='log')

Loading features files
Ordinal encoding
Features document-term matrix


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.7min finished


In [12]:
# Fold-averaged score per candidate C, read from the entry stored under class 0.
np.mean(clf2.scores_[0], axis=0)

array([ 0.7497158 ,  0.74902691,  0.74894587])

In [13]:
# Candidate C values that were searched.
clf2.Cs_

array([  0.1,   1. ,  10. ])

In [14]:
# Solver iteration counts per fold (rows) and candidate C (columns).
clf2.n_iter_

array([[[ 298,  655,  906],
        [ 287,  647, 1189],
        [ 294,  546, 1343],
        [ 289,  616, 1195],
        [ 304,  634, 1001]]], dtype=int32)

Cross-validation by hand, to get a log-loss estimate.

In [15]:
def run_model(dtrain, dtest=None, solver='liblinear', multi_cls='ovr'):
    """Fit a logistic regression with C=0.1 and report its log loss.

    Parameters
    ----------
    dtrain : sequence
        ``(features, labels)`` pair used for fitting.
    dtest : sequence, optional
        ``(features, labels)`` pair to evaluate on. When omitted, only the
        training loss is computed.
    solver : str
        Passed to `LogisticRegression` as ``solver``.
    multi_cls : str
        Passed to `LogisticRegression` as ``multi_class``.

    Returns
    -------
    tuple
        ``(clf, train_loss)`` when `dtest` is None, otherwise
        ``(clf, train_loss, test_loss)``.
    """
    clf = LogisticRegression()
    params = {'C': 0.1,
              'solver': solver,
              'multi_class': multi_cls,
              'n_jobs': -1,
              'verbose': 1,
              'max_iter': 10000,
              'random_state': 36883
              }
    clf.set_params(**params)

    # Fitting is the same in both modes, so do it once up front.
    clf.fit(dtrain[0], dtrain[1])

    # Pass the fitted classes explicitly so the loss is still defined when an
    # evaluated split happens to lack one of the classes.
    y_train_pred = clf.predict_proba(dtrain[0])
    y_train_loss = log_loss(dtrain[1], y_train_pred, labels=clf.classes_)

    # Explicit None check: truthiness of a container argument is ambiguous
    # (and raises for array-like inputs).
    if dtest is None:
        return clf, y_train_loss

    y_test_pred = clf.predict_proba(dtest[0])
    y_test_loss = log_loss(dtest[1], y_test_pred, labels=clf.classes_)
    return clf, y_train_loss, y_test_loss

In [16]:
def train_cv(preprocess='linear', solver='liblinear', multi_cls='ovr'):
    """Run 5-fold stratified cross-validation by hand and print the log losses.

    Loads the data, preprocesses it with `_preprocess_log` when `preprocess`
    is ``'log'`` and with `_preprocess` otherwise, then fits one model per
    fold via `run_model`. Per-fold and mean train/validation losses are
    printed; nothing is returned.

    NOTE(review): preprocessing is applied to the whole training set before
    the folds are split, so the validation rows contribute to the imputation
    and scaling statistics.
    """
    features, labels, features_test, _, _ = load_data()
    transform = _preprocess_log if preprocess == 'log' else _preprocess
    features, features_test = transform(features, features_test)

    n_folds = 5
    fold_losses = []
    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=816)
    for fold_no, (fit_idx, held_idx) in enumerate(splitter.split(features, labels), start=1):
        print("Running Fold", fold_no, "/", n_folds)
        started = time.time()

        fit_part = (features.iloc[fit_idx, :], labels.iloc[fit_idx])
        held_part = (features.iloc[held_idx, :], labels.iloc[held_idx])
        _, train_loss, val_loss = run_model(fit_part, held_part, solver, multi_cls)
        fold_losses.append([train_loss, val_loss])

        print("train_loss: {0:.6f}, val_loss: {1:.6f}".format(train_loss, val_loss), end="\t")

        # Break the elapsed seconds down into hours, minutes and seconds.
        minutes, seconds = divmod(time.time() - started, 60)
        hours, minutes = divmod(minutes, 60)
        print("time elapsed: %d:%02d:%02d" % (hours, minutes, seconds))

    mean_train_loss = np.mean([pair[0] for pair in fold_losses])
    mean_val_loss = np.mean([pair[1] for pair in fold_losses])
    print("train_loss mean: {0:.6f}, val_loss mean: {1:.6f}".format(mean_train_loss, mean_val_loss))

First try multi-class classification using one-vs-rest method.

In [17]:
# Defaults: liblinear solver with one-vs-rest, on log-transformed features.
train_cv(preprocess='log')

Loading features files
Ordinal encoding
Features document-term matrix
Running Fold 1 / 5
[LibLinear]train_loss: 0.560027, val_loss: 0.577107	time elapsed: 0:00:17
Running Fold 2 / 5
[LibLinear]train_loss: 0.563201, val_loss: 0.562485	time elapsed: 0:00:18
Running Fold 3 / 5
[LibLinear]train_loss: 0.560879, val_loss: 0.571717	time elapsed: 0:00:17
Running Fold 4 / 5
[LibLinear]train_loss: 0.559111, val_loss: 0.573838	time elapsed: 0:00:17
Running Fold 5 / 5
[LibLinear]train_loss: 0.561189, val_loss: 0.567258	time elapsed: 0:00:17
train_loss mean: 0.560882, val_loss mean: 0.570481


Using one-versus-rest for multi-class classification, with the `sag` solver.

In [18]:
# Same one-vs-rest setup, but fitted with the sag solver.
train_cv(preprocess='log', solver='sag')

Loading features files
Ordinal encoding
Features document-term matrix
Running Fold 1 / 5
convergence after 1414 epochs took 112 seconds
convergence after 1426 epochs took 113 seconds
convergence after 3515 epochs took 266 seconds
train_loss: 0.560049, val_loss: 0.577191	time elapsed: 0:04:26
Running Fold 2 / 5


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  4.4min finished


convergence after 1323 epochs took 106 seconds
convergence after 1751 epochs took 139 seconds
convergence after 3194 epochs took 246 seconds
train_loss: 0.563106, val_loss: 0.562527	time elapsed: 0:04:05
Running Fold 3 / 5


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  4.1min finished


convergence after 1407 epochs took 124 seconds
convergence after 1561 epochs took 125 seconds
convergence after 3135 epochs took 258 seconds
train_loss: 0.560876, val_loss: 0.572066	time elapsed: 0:04:18
Running Fold 4 / 5


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  4.3min finished


convergence after 1386 epochs took 113 seconds
convergence after 1420 epochs took 115 seconds
convergence after 3632 epochs took 279 seconds
train_loss: 0.559132, val_loss: 0.573781	time elapsed: 0:04:39
Running Fold 5 / 5


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  4.7min finished


convergence after 1419 epochs took 113 seconds
convergence after 1572 epochs took 125 seconds
convergence after 3623 epochs took 276 seconds
train_loss: 0.561150, val_loss: 0.567327	time elapsed: 0:04:36
train_loss mean: 0.560863, val_loss mean: 0.570578


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  4.6min finished


In [19]:
# Multinomial formulation fitted with the sag solver.
train_cv(preprocess='log', solver='sag', multi_cls='multinomial')

Loading features files
Ordinal encoding
Features document-term matrix
Running Fold 1 / 5
convergence after 1401 epochs took 208 seconds
train_loss: 0.553578, val_loss: 0.572902	time elapsed: 0:03:28
Running Fold 2 / 5


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  3.5min finished


convergence after 2051 epochs took 307 seconds
train_loss: 0.557061, val_loss: 0.556253	time elapsed: 0:05:06
Running Fold 3 / 5


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  5.1min finished


convergence after 1473 epochs took 220 seconds
train_loss: 0.554637, val_loss: 0.567690	time elapsed: 0:03:40
Running Fold 4 / 5


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  3.7min finished


convergence after 1472 epochs took 221 seconds
train_loss: 0.553010, val_loss: 0.569458	time elapsed: 0:03:41
Running Fold 5 / 5


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  3.7min finished


convergence after 1518 epochs took 231 seconds
train_loss: 0.555273, val_loss: 0.561092	time elapsed: 0:03:50
train_loss mean: 0.554712, val_loss mean: 0.565479


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  3.8min finished


In [20]:
# Multinomial formulation fitted with the lbfgs solver.
train_cv(preprocess='log', solver='lbfgs', multi_cls='multinomial')

Loading features files
Ordinal encoding
Features document-term matrix
Running Fold 1 / 5


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   11.6s finished


train_loss: 0.553280, val_loss: 0.572040	time elapsed: 0:00:11
Running Fold 2 / 5


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   11.9s finished


train_loss: 0.556947, val_loss: 0.556453	time elapsed: 0:00:12
Running Fold 3 / 5
train_loss: 0.554349, val_loss: 0.567760	time elapsed: 0:00:12
Running Fold 4 / 5


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   12.4s finished
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   12.4s finished


train_loss: 0.552729, val_loss: 0.569465	time elapsed: 0:00:12
Running Fold 5 / 5
train_loss: 0.555045, val_loss: 0.561168	time elapsed: 0:00:16
train_loss mean: 0.554470, val_loss mean: 0.565377


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   16.3s finished


It looks like using different solvers and `multi_class` methods leads to nearly the same classification log loss. Even though the underlying principles of `ovr` and `multinomial` are quite different, the `multinomial` models improve on `ovr` by only about 0.005 in mean validation log loss (roughly 0.565 vs. 0.570). Different solvers do not affect the final predictions much.