### SGD classifier for linear models

This estimator implements regularized linear models (SVM, logistic regression, among others) trained with stochastic gradient descent (SGD).

In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
import matplotlib.pyplot as plt
from collections import OrderedDict, defaultdict
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
import re, string, time
from nltk.stem.snowball import SnowballStemmer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss

Custom functions for loading and preprocessing data.

In [2]:
def most_freq_vects(docs, max_feature=None, percent=None, token_pattern=r'(?u)\b\w\w+\b'):
    """Build a document-term count matrix restricted to the most frequent terms.

    Parameters
    ----------
    docs : pandas.Series
        One document per entry; the values are cast to unicode strings
        before being tokenized.
    max_feature : int, optional
        Number of most frequent terms to keep. A falsy value (``None`` or
        ``0``) defers to ``percent``.
    percent : float, optional
        Fraction of the vocabulary to keep when ``max_feature`` is not given.
        When both are falsy the whole vocabulary is kept.
    token_pattern : str
        Regular expression handed to ``CountVectorizer`` to define a token.
        The default must be a raw string: in a plain literal the word
        boundary escape would be read as a backspace character.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per retained term, with the
        columns ordered by decreasing corpus-wide count.
    """
    vect = CountVectorizer(token_pattern=token_pattern)
    feat_sparse = vect.fit_transform(docs.values.astype('U'))

    # ``get_feature_names`` was removed in scikit-learn 1.2; prefer its
    # replacement and fall back for older releases.
    if hasattr(vect, 'get_feature_names_out'):
        feature_names = list(vect.get_feature_names_out())
    else:
        feature_names = list(vect.get_feature_names())

    term_counts = np.asarray(feat_sparse.sum(axis=0)).ravel()
    freq_table = pd.DataFrame({'feature': feature_names, 'count': term_counts})
    freq_table = freq_table.sort_values('count', ascending=False)

    if not max_feature:
        if percent:
            max_feature = int(percent * len(feature_names))
        else:
            max_feature = len(feature_names)

    top_terms = freq_table.iloc[:max_feature]
    # The sorted table keeps its original row labels, which are the column
    # positions in the sparse matrix. Slicing before densifying avoids
    # materialising the full vocabulary as a dense array.
    dense_counts = feat_sparse[:, top_terms.index.to_numpy()].toarray()
    return pd.DataFrame(dense_counts, columns=list(top_terms['feature']))


def load_data():
    """Load the feature files and assemble the train/test design matrices.

    Reads ``basic_feat.json``, ``longtime_feat.csv`` and
    ``feat_stats_encoding.csv`` from ``../feat_input``, joins them on
    ``listing_id``, label-encodes the string-valued address/manager/building
    columns, and appends count columns for the 100 most frequent tokens
    derived from the ``features`` column.

    Rows whose ``interest_level`` equals -1 are returned as the test set;
    every other row is returned as training data.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, column_names, test_listing_ids)`` where
        ``column_names`` is the array of ``x_test`` column labels and
        ``test_listing_ids`` is the ``listing_id`` column of ``x_test``.
    """
    print('Loading features files')
    basic_feat = pd.read_json('../feat_input/basic_feat.json')
    longtime_feat = pd.read_csv('../feat_input/longtime_feat.csv')
    encoded_feat = pd.read_csv('../feat_input/feat_stats_encoding.csv')

    # apply ordinal encoding to categorical feature
    print('Ordinal encoding')
    # Strip a trailing carriage return so otherwise-identical addresses
    # receive the same code.
    for col in ('display_address', 'street_address'):
        basic_feat[col] = basic_feat[col].replace(r'\r$', '', regex=True)
    for col in ("display_address", "manager_id", "building_id", "street_address"):
        if basic_feat[col].dtype == 'object':
            encoder = preprocessing.LabelEncoder()
            basic_feat[col] = encoder.fit_transform(list(basic_feat[col].values))

    all_feat = basic_feat.merge(longtime_feat, on='listing_id')
    all_feat = all_feat.merge(encoded_feat, on='listing_id')

    print("Features document-term matrix")
    stemmer = SnowballStemmer('english')
    # Remove digits and all punctuation except apostrophes and hyphens.
    # The punctuation is escaped because it contains characters that are
    # special inside a character class; unescaped, the backslash would
    # merely escape the following bracket and never be stripped itself.
    punct = re.sub("'|-", "", string.punctuation)
    strip_re = re.compile(r"[0-9]|[{}]".format(re.escape(punct)))

    def _tags_to_document(tags):
        # Each tag is cleaned, stemmed and collapsed into one
        # underscore-joined token prefixed with "feature"; the tokens of a
        # row are then space-joined into a single document string.
        # presumably every entry of the column is a list of strings -- verify
        return ' '.join(
            '_'.join(['feature'] + stemmer.stem(strip_re.sub("", tag)).split())
            for tag in tags
        )

    all_feat['features'] = all_feat['features'].apply(_tags_to_document)
    # Tokens are whitespace-delimited, so any run of non-space characters
    # counts as one term.
    vect_df = most_freq_vects(all_feat['features'], max_feature=100, token_pattern=r"[^ ]+")

    all_feat = pd.concat([all_feat, vect_df], axis=1)
    train = all_feat[all_feat.interest_level != -1].copy()
    test = all_feat[all_feat.interest_level == -1].copy()
    y_train = train["interest_level"]

    x_train = train.drop(["interest_level", "features"], axis=1)
    x_test = test.drop(["interest_level", "features"], axis=1)

    return x_train, y_train, x_test, x_test.columns.values, x_test.listing_id


def _preprocess(dtrain, dtest):
    # replace np.inf to np.nan
    dtrain = dtrain.replace([np.inf, -np.inf], np.nan)
    dtest = dtest.replace([np.inf, -np.inf], np.nan)

    # impute np.nan
    dtrain_col_mean = dtrain.mean(axis=0)
    dtrain, dtest = dtrain.fillna(dtrain_col_mean), dtest.fillna(dtrain_col_mean)

    # perform standardization
    dtrain_col_mean, dtrain_col_std = dtrain.mean(axis=0), dtrain.std(axis=0)
    dtrain, dtest = map(lambda x: (x - dtrain_col_mean) / dtrain_col_std, (dtrain, dtest))

    return dtrain, dtest


def _preprocess_log(dtrain, dtest):
    # replace np.inf to np.nan
    dtrain = dtrain.replace([np.inf, -np.inf], np.nan)
    dtest = dtest.replace([np.inf, -np.inf], np.nan)

    # impute np.nan
    dtrain_col_mean = dtrain.mean(axis=0)
    dtrain, dtest = dtrain.fillna(dtrain_col_mean), dtest.fillna(dtrain_col_mean)

    # log transform of min-zero columns
    dtrain_col_min = dtrain.min(axis=0)
    zero_min_index = dtrain_col_min[dtrain_col_min >= 0].index

    dtrain[zero_min_index] = np.log10(dtrain[zero_min_index] + 1.0)
    dtest[zero_min_index] = np.log10(dtest[zero_min_index] + 1.0)

    # perform standardization
    dtrain_col_mean, dtrain_col_std = dtrain.mean(axis=0), dtrain.std(axis=0)
    dtrain, dtest = map(lambda x: (x - dtrain_col_mean) / dtrain_col_std, (dtrain, dtest))

    return dtrain, dtest

Use 5-fold cross-validation to train the model, **with default parameter settings**.

In [6]:
def run_model(dtrain, dtest=None):
    """Fit an SGD classifier with logistic loss and report its log loss.

    Parameters
    ----------
    dtrain : tuple
        ``(X, y)`` pair used to fit the classifier.
    dtest : tuple, optional
        ``(X, y)`` pair to evaluate on after fitting.

    Returns
    -------
    tuple
        ``(clf, train_loss)`` when ``dtest`` is omitted, otherwise
        ``(clf, train_loss, test_loss)``.
    """
    # NOTE(review): scikit-learn 1.1 renamed this loss to 'log_loss' and
    # later releases reject 'log'; update the value when upgrading.
    params = {'loss': 'log',
              'alpha': 1e-4,
              'n_jobs': -1,
              'verbose': 1,
              'random_state': 36883
              }
    clf = SGDClassifier()
    clf.set_params(**params)

    train_x, train_y = dtrain[0], dtrain[1]
    clf.fit(train_x, train_y)

    # predict_proba columns follow clf.classes_; passing the same labels to
    # log_loss keeps the columns aligned even if the scored split happens to
    # miss one of the classes.
    y_train_loss = log_loss(train_y, clf.predict_proba(train_x), labels=clf.classes_)
    if dtest is None:
        return clf, y_train_loss

    test_x, test_y = dtest[0], dtest[1]
    y_test_loss = log_loss(test_y, clf.predict_proba(test_x), labels=clf.classes_)
    return clf, y_train_loss, y_test_loss

In [7]:
def train_cv(preprocess='linear'):
    """Cross-validate the SGD model over 5 stratified folds and print the losses.

    Loads the data, applies the selected preprocessing to the train and
    test frames, then fits and scores one model per fold, printing the
    per-fold train/validation log loss, the elapsed time, and finally the
    mean of both losses across folds.

    Parameters
    ----------
    preprocess : str
        ``'log'`` selects the log-transform preprocessing; any other value
        selects plain standardization.
    """
    features, labels, holdout, _, _ = load_data()
    prepare = _preprocess_log if preprocess == 'log' else _preprocess
    features, holdout = prepare(features, holdout)

    n_folds = 5
    fold_losses = []
    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=816)
    for fold_no, (fit_idx, held_idx) in enumerate(splitter.split(features, labels), start=1):
        print("Running Fold", fold_no, "/", n_folds)
        tic = time.time()

        fit_x, held_x = features.iloc[fit_idx, :], features.iloc[held_idx, :]
        fit_y, held_y = labels.iloc[fit_idx], labels.iloc[held_idx]
        _, train_loss, val_loss = run_model((fit_x, fit_y), (held_x, held_y))
        fold_losses.append([train_loss, val_loss])

        print("train_loss: {0:.6f}, val_loss: {1:.6f}".format(train_loss, val_loss), end="\t")

        # Break the elapsed seconds down into hours:minutes:seconds.
        toc = time.time()
        minutes, seconds = divmod(toc - tic, 60)
        hours, minutes = divmod(minutes, 60)
        print("time elapsed: %d:%02d:%02d" % (hours, minutes, seconds))

    mean_train_loss = np.mean([tr_loss for tr_loss, _ in fold_losses])
    mean_val_loss = np.mean([va_loss for _, va_loss in fold_losses])
    print("train_loss mean: {0:.6f}, val_loss mean: {1:.6f}".format(mean_train_loss, mean_val_loss))

Train with the log loss, which makes the model equivalent to logistic regression.

In [8]:
# Run the 5-fold cross-validation with the log-transform preprocessing path.
train_cv(preprocess='log')

Loading features files
Ordinal encoding
Features document-term matrix
Running Fold 1 / 5
-- Epoch 1
-- Epoch 1-- Epoch 1

Norm: 51.51, NNZs: 192, Bias: -23.384622, T: 39481, Avg. loss: 23.803967Norm: 59.25, NNZs: 192, Bias: 19.892870, T: 39481, Avg. loss: 21.247254
Total training time: 0.03 seconds.

Total training time: 0.03 seconds.-- Epoch 2

Norm: 102.54, NNZs: 192, Bias: -149.975169, T: 39481, Avg. loss: 13.192500-- Epoch 2
Total training time: 0.03 seconds.

-- Epoch 2
Norm: 34.40, NNZs: 192, Bias: 11.860152, T: 78962, Avg. loss: 12.803241Norm: 27.99, NNZs: 192, Bias: -12.219804, T: 78962, Avg. loss: 14.091982
Total training time: 0.06 seconds.

-- Epoch 3Total training time: 0.06 seconds.

-- Epoch 3
Norm: 56.24, NNZs: 192, Bias: -53.047952, T: 78962, Avg. loss: 7.984804
Total training time: 0.06 seconds.
-- Epoch 3
Norm: 20.40, NNZs: 192, Bias: 7.635966, T: 118443, Avg. loss: 9.397871
Total training time: 0.09 seconds.Norm: 20.90, NNZs: 192, Bias: -8.263069, T: 118443, Avg. los

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s finished
  np.exp(prob, prob)


-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 73.87, NNZs: 192, Bias: -24.839262, T: 39481, Avg. loss: 24.180336
Total training time: 0.05 seconds.
-- Epoch 2
Norm: 55.45, NNZs: 192, Bias: 19.383894, T: 39481, Avg. loss: 23.429065
Total training time: 0.06 seconds.
-- Epoch 2
Norm: 106.01, NNZs: 192, Bias: -156.082837, T: 39481, Avg. loss: 13.599829
Total training time: 0.05 seconds.
-- Epoch 2
Norm: 35.94, NNZs: 192, Bias: -12.459367, T: 78962, Avg. loss: 14.295706
Total training time: 0.09 seconds.
-- Epoch 3
Norm: 38.89, NNZs: 192, Bias: 8.483545, T: 78962, Avg. loss: 13.899961Norm: 58.32, NNZs: 192, Bias: -56.796536, T: 78962, Avg. loss: 8.288049

Total training time: 0.09 seconds.
-- Epoch 3Total training time: 0.08 seconds.

-- Epoch 3
Norm: 21.53, NNZs: 192, Bias: -8.606911, T: 118443, Avg. loss: 10.408919
Total training time: 0.12 seconds.
-- Epoch 4
Norm: 23.66, NNZs: 192, Bias: 7.265872, T: 118443, Avg. loss: 10.114465Norm: 36.70, NNZs: 192, Bias: -27.941497, T: 118443, Avg. loss: 5

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s finished
  np.exp(prob, prob)


train_loss: 1.598521, val_loss: 1.580136	time elapsed: 0:00:00
Running Fold 3 / 5
-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 58.45, NNZs: 192, Bias: 21.099649, T: 39481, Avg. loss: 21.892711
Total training time: 0.08 seconds.
-- Epoch 2
Norm: 64.95, NNZs: 192, Bias: -24.283633, T: 39481, Avg. loss: 23.976645
Total training time: 0.08 seconds.
-- Epoch 2
Norm: 87.88, NNZs: 192, Bias: -122.282128, T: 39481, Avg. loss: 11.716073
Total training time: 0.08 seconds.
-- Epoch 2
Norm: 36.72, NNZs: 192, Bias: 10.100100, T: 78962, Avg. loss: 13.157032
Total training time: 0.11 seconds.
-- Epoch 3
Norm: 48.39, NNZs: 192, Bias: -45.826512, T: 78962, Avg. loss: 7.089791
Total training time: 0.13 seconds.
-- Epoch 3
Norm: 32.52, NNZs: 192, Bias: -12.099215, T: 78962, Avg. loss: 14.160449
Total training time: 0.14 seconds.
-- Epoch 3
Norm: 22.63, NNZs: 192, Bias: 7.087966, T: 118443, Avg. loss: 9.629450
Total training time: 0.14 seconds.
-- Epoch 4
Norm: 30.19, NNZs: 192, Bias: -23.320150, T: 118443, Avg

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s finished
  np.exp(prob, prob)


-- Epoch 1
-- Epoch 1-- Epoch 1

Norm: 73.30, NNZs: 192, Bias: -25.260119, T: 39481, Avg. loss: 23.672788
Total training time: 0.08 seconds.
-- Epoch 2
Norm: 75.59, NNZs: 192, Bias: 21.369607, T: 39481, Avg. loss: 23.352540
Total training time: 0.10 seconds.
-- Epoch 2
Norm: 100.52, NNZs: 192, Bias: -148.886450, T: 39481, Avg. loss: 12.906742
Total training time: 0.09 seconds.
-- Epoch 2
Norm: 54.97, NNZs: 192, Bias: -53.719352, T: 78962, Avg. loss: 7.824650
Total training time: 0.13 seconds.
-- Epoch 3
Norm: 38.17, NNZs: 192, Bias: 9.848600, T: 78962, Avg. loss: 13.854520
Total training time: 0.14 seconds.
-- Epoch 3
Norm: 34.97, NNZs: 192, Bias: -12.505304, T: 78962, Avg. loss: 14.099451
Total training time: 0.14 seconds.
-- Epoch 3
Norm: 35.75, NNZs: 192, Bias: -25.892027, T: 118443, Avg. loss: 5.627073
Total training time: 0.16 seconds.
-- Epoch 4
Norm: 24.64, NNZs: 192, Bias: 8.738233, T: 118443, Avg. loss: 10.060851
Total training time: 0.17 seconds.
-- Epoch 4
Norm: 22.05, NNZs:

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s finished
  np.exp(prob, prob)


-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 79.87, NNZs: 192, Bias: -24.593413, T: 39484, Avg. loss: 23.343858
Total training time: 0.05 seconds.
-- Epoch 2
Norm: 79.54, NNZs: 192, Bias: 18.486360, T: 39484, Avg. loss: 22.053744
Total training time: 0.06 seconds.
-- Epoch 2
Norm: 94.80, NNZs: 192, Bias: -144.507312, T: 39484, Avg. loss: 12.476474
Total training time: 0.06 seconds.
-- Epoch 2
Norm: 33.41, NNZs: 192, Bias: -12.673495, T: 78968, Avg. loss: 13.875275
Total training time: 0.08 seconds.
-- Epoch 3
Norm: 37.84, NNZs: 192, Bias: 9.997301, T: 78968, Avg. loss: 13.208971
Total training time: 0.09 seconds.
-- Epoch 3
Norm: 53.40, NNZs: 192, Bias: -49.088796, T: 78968, Avg. loss: 7.572454
Total training time: 0.09 seconds.
-- Epoch 3
Norm: 23.09, NNZs: 192, Bias: 8.839106, T: 118452, Avg. loss: 9.654947Norm: 21.87, NNZs: 192, Bias: -8.464138, T: 118452, Avg. loss: 10.114812

Total training time: 0.11 seconds.Total training time: 0.12 seconds.

-- Epoch 4-- Epoch 4

Norm: 35.06, NNZs: 1

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s finished
  np.exp(prob, prob)


I haven't yet understood this method in detail; the performance here is simply poor.