### Linear Support Vector Classifier

In [9]:
import numpy as np
import pandas as pd
from scipy import sparse
import matplotlib.pyplot as plt
from collections import OrderedDict
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
import re
from nltk.stem.snowball import SnowballStemmer
import string
import time
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss

Custom functions for loading and preprocessing data.

In [10]:
def most_freq_vects(docs, max_feature=None, percent=None, token_pattern=r'(?u)\b\w\w+\b'):
    """Build a dense document-term count table limited to the most frequent tokens.

    Parameters
    ----------
    docs : pandas.Series
        One text document per entry; values are cast to unicode strings
        before vectorization.
    max_feature : int, optional
        Number of tokens to keep, ranked by total count over all documents.
        A falsy value (``None`` or ``0``) defers to ``percent``.
    percent : float, optional
        Fraction of the vocabulary to keep when ``max_feature`` is falsy.
        If both are falsy, every token is kept.
    token_pattern : str
        Regular expression passed to ``CountVectorizer``. The default is a
        raw string so that ``\\b`` is a regex word boundary rather than a
        backspace character.

    Returns
    -------
    pandas.DataFrame
        One row per document (fresh default integer index, not ``docs.index``)
        and one column per kept token, ordered by decreasing total count.
    """
    vect = CountVectorizer(token_pattern=token_pattern)
    feat_sparse = vect.fit_transform(docs.values.astype('U'))

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when it exists and fall back for older releases.
    get_names = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
    vocab = list(get_names())

    freq_table = list(zip(vocab, np.asarray(feat_sparse.sum(axis=0)).ravel()))
    freq_table = pd.DataFrame(freq_table, columns=['feature', 'count']).sort_values('count', ascending=False)
    if not max_feature:
        max_feature = int(percent * len(vocab)) if percent else len(vocab)

    # freq_table keeps its original integer index through the sort, so the
    # index of the top rows is exactly the column position in feat_sparse.
    top = freq_table.iloc[:max_feature]
    names = list(top.feature)

    # Slice while still sparse so only the kept columns are densified.
    dense = feat_sparse[:, top.index.to_numpy()].toarray()
    return pd.DataFrame(dense, columns=names)


def load_data(feat_dir='../feat_input', max_feature=100):
    """Load the feature files, encode categoricals and split into train/test.

    Parameters
    ----------
    feat_dir : str
        Directory holding ``basic_feat.json``, ``longtime_feat.csv`` and
        ``feat_stats_encoding.csv``.
    max_feature : int
        Number of most frequent listing-feature tokens to add as count
        columns (forwarded to ``most_freq_vects``).

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, column_names, test_listing_ids)``.
        Rows whose ``interest_level`` equals ``-1`` form the test set; all
        other rows form the training set. ``interest_level`` and the raw
        ``features`` text column are dropped from both feature frames.
    """
    import os  # local import keeps this block self-contained

    print('Loading features files')
    basic_feat = pd.read_json(os.path.join(feat_dir, 'basic_feat.json'))
    longtime_feat = pd.read_csv(os.path.join(feat_dir, 'longtime_feat.csv'))
    encoded_feat = pd.read_csv(os.path.join(feat_dir, 'feat_stats_encoding.csv'))

    # apply ordinal encoding to categorical feature
    print('Ordinal encoding')
    # strip a trailing carriage return so otherwise-equal addresses share a code
    for col in ('display_address', 'street_address'):
        basic_feat[col] = basic_feat[col].replace(r'\r$', '', regex=True)
    categorical = ["display_address", "manager_id", "building_id", "street_address"]
    for f in categorical:
        if basic_feat[f].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            basic_feat[f] = lbl.fit_transform(list(basic_feat[f].values))

    all_feat = basic_feat.merge(longtime_feat, on='listing_id')
    all_feat = all_feat.merge(encoded_feat, on='listing_id')

    print("Features document-term matrix")
    stemmer = SnowballStemmer('english')
    # keep apostrophes and hyphens; drop digits and all other punctuation
    punct = string.punctuation.replace("'", "").replace("-", "")
    # re.escape is required: unescaped, the "\]" inside string.punctuation is
    # parsed as an escaped bracket and backslashes are never removed.
    strip_re = re.compile(r"[0-9{}]".format(re.escape(punct)))

    def _to_document(phrases):
        # One token per phrase: cleaned, stemmed, words joined by "_" behind a
        # "feature" prefix; tokens are then space-joined into one document.
        tokens = []
        for phrase in phrases:
            stemmed = stemmer.stem(strip_re.sub("", phrase))
            tokens.append('_'.join(['feature'] + stemmed.split()))
        return ' '.join(tokens)

    all_feat['features'] = all_feat['features'].apply(_to_document)
    vect_df = most_freq_vects(all_feat['features'], max_feature=max_feature, token_pattern=r"[^ ]+")

    # vect_df carries a default integer index; the merges above leave all_feat
    # with one as well, so the column-wise concat lines rows up by position.
    all_feat = pd.concat([all_feat, vect_df], axis=1)
    is_test = all_feat.interest_level == -1
    train = all_feat[~is_test].copy()
    test = all_feat[is_test].copy()
    y_train = train["interest_level"]

    dropped = ["interest_level", "features"]
    x_train = train.drop(dropped, axis=1)
    x_test = test.drop(dropped, axis=1)

    return x_train, y_train, x_test, x_test.columns.values, x_test.listing_id


def _preprocess(dtrain, dtest):
    # replace np.inf to np.nan
    dtrain = dtrain.replace([np.inf, -np.inf], np.nan)
    dtest = dtest.replace([np.inf, -np.inf], np.nan)

    # impute np.nan
    dtrain_col_mean = dtrain.mean(axis=0)
    dtrain, dtest = dtrain.fillna(dtrain_col_mean), dtest.fillna(dtrain_col_mean)

    # perform standardization
    dtrain_col_mean, dtrain_col_std = dtrain.mean(axis=0), dtrain.std(axis=0)
    dtrain, dtest = map(lambda x: (x - dtrain_col_mean) / dtrain_col_std, (dtrain, dtest))

    return dtrain, dtest


def _preprocess_log(dtrain, dtest):
    # replace np.inf to np.nan
    dtrain = dtrain.replace([np.inf, -np.inf], np.nan)
    dtest = dtest.replace([np.inf, -np.inf], np.nan)

    # impute np.nan
    dtrain_col_mean = dtrain.mean(axis=0)
    dtrain, dtest = dtrain.fillna(dtrain_col_mean), dtest.fillna(dtrain_col_mean)

    # log transform of min-zero columns
    dtrain_col_min = dtrain.min(axis=0)
    zero_min_index = dtrain_col_min[dtrain_col_min >= 0].index

    dtrain[zero_min_index] = np.log10(dtrain[zero_min_index] + 1.0)
    dtest[zero_min_index] = np.log10(dtest[zero_min_index] + 1.0)

    # perform standardization
    dtrain_col_mean, dtrain_col_std = dtrain.mean(axis=0), dtrain.std(axis=0)
    dtrain, dtest = map(lambda x: (x - dtrain_col_mean) / dtrain_col_std, (dtrain, dtest))

    return dtrain, dtest

Train the model with 5-fold cross-validation. Report the train and validation multi-class log loss as a measure of performance.

In [11]:
def run_model(dtrain, dtest=None):
    """Fit a probability-calibrated linear SVM and score it with log loss.

    Parameters
    ----------
    dtrain : tuple
        ``(X, y)`` pair used to fit the classifier.
    dtest : tuple, optional
        ``(X, y)`` pair to evaluate on in addition to the training data.

    Returns
    -------
    tuple
        ``(clf, train_loss)`` when ``dtest`` is ``None``, otherwise
        ``(clf, train_loss, test_loss)``.
    """
    params = {'C': 0.1,
              'loss': 'squared_hinge',
              'penalty': 'l2',
              'multi_class': 'ovr',
              'fit_intercept': True,
              'random_state': 36683,
              'verbose': 0
             }
    lsvc = LinearSVC(**params)
    # LinearSVC has no predict_proba; the calibration wrapper provides it.
    clf = CalibratedClassifierCV(lsvc)
    clf.fit(dtrain[0], dtrain[1])

    def _loss(data):
        # Pass the fitted class order explicitly: predict_proba columns follow
        # clf.classes_, whereas log_loss would otherwise infer the labels from
        # y_true and fail or misalign if a class is absent from it.
        return log_loss(data[1], clf.predict_proba(data[0]), labels=clf.classes_)

    y_train_loss = _loss(dtrain)
    if dtest is None:
        return clf, y_train_loss
    return clf, y_train_loss, _loss(dtest)

In [14]:
def train_cv(preprocess='linear'):
    """Run 5-fold stratified cross-validation and print per-fold and mean log loss.

    Parameters
    ----------
    preprocess : str
        ``'log'`` selects ``_preprocess_log``; any other value selects
        ``_preprocess``.

    Returns
    -------
    None
        Results are only printed.

    Notes
    -----
    NOTE(review): preprocessing is applied to the full training frame before
    the folds are split, so each validation fold contributes to the imputation
    and scaling statistics.
    """
    X_train, y_train_cls, X_test, _, _ = load_data()
    prepare = _preprocess_log if preprocess == 'log' else _preprocess
    X_train, X_test = prepare(X_train, X_test)

    n_folds = 5
    train_losses, val_losses = [], []
    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=816)
    for fold_no, (train_ind, val_ind) in enumerate(splitter.split(X_train, y_train_cls), start=1):
        print("Running Fold", fold_no, "/", n_folds, "\t")
        started = time.time()

        fold_train = (X_train.iloc[train_ind, :], y_train_cls.iloc[train_ind])
        fold_val = (X_train.iloc[val_ind, :], y_train_cls.iloc[val_ind])
        _, train_loss, val_loss = run_model(fold_train, fold_val)
        train_losses.append(train_loss)
        val_losses.append(val_loss)

        print("train_loss: {0:.6f}, val_loss: {1:.6f}".format(train_loss, val_loss), end="\t")

        # break the elapsed seconds down into h:mm:ss
        minutes, seconds = divmod(time.time() - started, 60)
        hours, minutes = divmod(minutes, 60)
        print("time elapsed: %d:%02d:%02d" % (hours, minutes, seconds))

    mean_train_loss = np.mean(train_losses)
    mean_val_loss = np.mean(val_losses)
    print("train_loss mean: {0:.6f}, val_loss mean: {1:.6f}".format(mean_train_loss, mean_val_loss))

In [13]:
# Run the 5-fold cross-validation with the log-transform preprocessing.
train_cv(preprocess='log')

Loading features files
Ordinal encoding
Features document-term matrix
Running Fold 1 / 5 	


  return 1. / (1. + np.exp(self.a_ * T + self.b_))


train_loss: 0.570265, val_loss: 0.586759	time elapsed: 0:02:54
Running Fold 2 / 5 	


  return 1. / (1. + np.exp(self.a_ * T + self.b_))


train_loss: 0.578198, val_loss: 0.580624	time elapsed: 0:02:57
Running Fold 3 / 5 	
train_loss: 0.572091, val_loss: 0.580169	time elapsed: 0:02:51
Running Fold 4 / 5 	


  return 1. / (1. + np.exp(self.a_ * T + self.b_))


train_loss: 0.568077, val_loss: 0.583359	time elapsed: 0:02:55
Running Fold 5 / 5 	


  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)


train_loss: 0.572528, val_loss: 0.577426	time elapsed: 0:02:46
train_loss mean: 0.572232, val_loss mean: 0.581667


  return 1. / (1. + np.exp(self.a_ * T + self.b_))


In [15]:
# Run the 5-fold cross-validation with the default preprocessing (standardization only, no log transform).
train_cv()

Loading features files
Ordinal encoding
Features document-term matrix
Running Fold 1 / 5 	


  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])
  return 1. / (1. + np.exp(self.a_ * T + self.b_))


train_loss: 0.592461, val_loss: 0.602951	time elapsed: 0:02:59
Running Fold 2 / 5 	


  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])
  return 1. / (1. + np.exp(self.a_ * T + self.b_))


train_loss: 0.582140, val_loss: 0.584536	time elapsed: 0:02:59
Running Fold 3 / 5 	


  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])
  return 1. / (1. + np.exp(self.a_ * T + self.b_))


train_loss: 0.596385, val_loss: 0.600892	time elapsed: 0:03:37
Running Fold 4 / 5 	


  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])
  return 1. / (1. + np.exp(self.a_ * T + self.b_))


train_loss: 0.593650, val_loss: 0.608047	time elapsed: 0:03:03
Running Fold 5 / 5 	


  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)
  E = np.exp(AB[0] * F + AB[1])


train_loss: 0.593393, val_loss: 0.603093	time elapsed: 0:03:12
train_loss mean: 0.591606, val_loss mean: 0.599904


  return 1. / (1. + np.exp(self.a_ * T + self.b_))
