In [1]:
import re

import numpy as np
import pandas as pd

from tqdm import tqdm
tqdm.pandas(desc="pd")

In [2]:
df_train = pd.read_json('data/train.json')
df_test = pd.read_json('data/test.json')

In [3]:
df_train.interest_level.value_counts()

low       34284
medium    11229
high       3839
Name: interest_level, dtype: int64

In [4]:
np.random.seed(1)
folds = np.random.choice([0, 1, 2, 3, 4, 5], size=len(df_train))
df_train['fold'] = folds
df_test['fold'] = -1

In [5]:
df_train.reset_index(inplace=1)
df_test.reset_index(inplace=1)

df_all = pd.concat([df_train, df_test])

In [6]:
df_all.interest_level.fillna('', inplace=1)

In [7]:
re_unsplit = re.compile('([a-z])([A-Z])')
re_html = re.compile(r'<.{1,5}>')
re_tokens = re.compile(r'\w+')

stopwords = {'a', 'an', 'these', 'as', 'you', 'w', 'in', 'the', 'do', 'don', 't', 
             'we', 'have', 'in', 'for', 'to', 'lot', 'lots', 'of', 'it', 's', 'and',
             'by', 'is', 'are', 'been', 'or', 'any', 'me', 'at', 'that', 'via', 
             'this', 'also', 'has', 'very', 'many', 'your', 'i', 'forward', 'into', 
             'up', 'then', 'p'}

def unsplit(s):
    return re_unsplit.sub(r'\1 \2', s)

def remove_html(s):
    return re_html.sub(' ', s)

def prepare_text(s):
    s = unsplit(s)
    s = remove_html(s)
    s = s.lower()
    tokens = re_tokens.findall(s)
    tokens = [t for t in tokens if t not in stopwords]
    return ' '.join(tokens)

In [8]:
df_all['description_cleaned'] = df_all.description.progress_apply(prepare_text)

pd: 100%|██████████| 124011/124011 [00:09<00:00, 13730.56it/s]


In [9]:
df_all.display_address = df_all.display_address.str.lower()
df_all.street_address = df_all.street_address.str.lower()

df_all['num_features'] = df_all.features.apply(len)

In [10]:
def str_features(f):
    return ' '.join(s.replace(' ', '_').lower() for s in f)

df_all.features = df_all.features.progress_apply(str_features)

pd: 100%|██████████| 124011/124011 [00:00<00:00, 160507.96it/s]


In [11]:
interest_level_map = {'low': 0, 'medium': 1, 'high': 2}
df_all.interest_level = df_all.interest_level.apply(lambda x: interest_level_map.get(x, -1))

In [12]:
df_all['num_photos'] = df_all.photos.apply(len)

In [13]:
b_cnt = df_all.building_id.value_counts()

df_all['building_cnt'] = b_cnt[df_all.building_id].reset_index(drop=1)
df_all.loc[df_all.building_cnt == 20664, 'building_cnt'] = -1

In [14]:
m_cnt = df_all.manager_id.value_counts()
df_all['manager_cnt'] = m_cnt[df_all.manager_id].reset_index(drop=1)

In [15]:
re_spaces = re.compile(' +')

def normalize_address(s):
    s = s.replace('.', '')
    s = s.replace(',', '')
    s = s.replace('\r', '')
    s = s.replace('\t', '')
    s = s.replace('avenue', 'av')
    s = s.replace('ave', 'av')
    s = s.replace('street', 'st')
    s = s.replace('east', 'e')
    s = s.replace('west', 'w')
    s = s.replace('st', '') # replaces both 1st and street
    s = s.replace('nd', '')
    s = s.replace('rd', '')
    s = s.replace('th', '')
    s = re_spaces.sub(' ', s)
    s = s.replace('av', '')
    s = s.strip()
    return s

In [16]:
df_all['address_normalized'] = df_all.street_address.progress_apply(normalize_address)

pd: 100%|██████████| 124011/124011 [00:00<00:00, 130360.01it/s]


In [17]:
def fit_mtv(df, target_col, cat_col, C):
    m0 = (df[target_col] == 1).mean()

    cnt = df[df[target_col] == 1][cat_col].value_counts()
    cnt_all = df[cat_col].value_counts()

    probs = (cnt + C * m0) / (cnt_all + C)
    return probs, m0

def transform_mtv(df_target, cat_col, probs, m0):
    probs_targ = probs[df_target[cat_col]].reset_index(drop=1)
    probs_targ.fillna(m0, inplace=1)
    return probs_targ.values

In [18]:
df_train = df_all[df_all.fold != -1].reset_index(drop=1)
df_test = df_all[df_all.fold == -1].reset_index(drop=1)

In [19]:
for i in [0, 1, 2]:
    df_train['interest_%s' % i] = (df_train.interest_level == i).astype('uint8')

In [28]:
categorical = ['building_id', 'manager_id', 'address_normalized', 'num_photos', 'num_features',
               'bathrooms', 'bedrooms']

C = 12

for c in tqdm(categorical):
    for i in [0, 1, 2, 3, 4, 5]:
        df_train_fold = df_train[df_train.fold != i].reset_index(drop=1)
        df_test_fold = df_train[df_train.fold == i].reset_index(drop=1)

        for target_col in ['interest_0', 'interest_1', 'interest_2']:
            res_name = '%s_%s_mtv' % (c, target_col)

            probs, m0 = fit_mtv(df_train_fold, target_col, c, C=C)
            df_train.loc[df_train.fold == i, res_name] = transform_mtv(df_test_fold, c, probs, m0)

    for target_col in ['interest_0', 'interest_1', 'interest_2']:
        res_name = '%s_%s_mtv' % (c, target_col)
        probs, m0 = fit_mtv(df_train, target_col, c, C=C)
        df_test[res_name] = transform_mtv(df_test, c, probs, m0)

100%|██████████| 7/7 [00:03<00:00,  1.93it/s]


In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import log_loss

In [34]:
desc_vec = TfidfVectorizer(token_pattern='\S+', ngram_range=(1, 3), min_df=10)
X_desc_train = desc_vec.fit_transform(df_train.description_cleaned)
X_desc_test = desc_vec.transform(df_test.description_cleaned)

In [42]:
scores = []

for i in [0, 1, 2, 3, 4, 5]:
    train_idx = df_train.fold != i
    val_idx = df_train.fold == i

    X = X_desc_train[train_idx.values]
    y = df_train[train_idx].interest_level
    X_val = X_desc_train[val_idx.values]
    y_val = df_train[val_idx].interest_level

    clf = LogisticRegression(C=1, penalty='l1', random_state=1)
    clf.fit(X, y)

    y_pred = clf.predict_proba(X_val)
    score = log_loss(y_val, y_pred)
    scores.append(score)
    print(score)
    
    for i in [0, 1, 2]:
        res_name = 'description_lr_%d' % i
        df_train.loc[val_idx, res_name] = y_pred[:, i]

np.mean(scores), np.std(scores)

0.715655998193
0.723938513301
0.725838268891
0.712791916892
0.708006400636
0.70097148356


(0.71453376357895282, 0.0086320768492308114)

In [62]:
clf = LogisticRegression(C=1, penalty='l1', random_state=1)
clf.fit(X_desc_train, df_train.interest_level.values)

y_pred = clf.predict_proba(X_desc_test)
for i in [0, 1, 2]:
    res_name = 'description_lr_%d' % i
    df_test[res_name] = y_pred[:, i]

In [46]:
fvec = CountVectorizer(token_pattern='\S+', ngram_range=(1, 3), min_df=10)
Xf_train = fvec.fit_transform(df_train.features)
Xf_test = fvec.transform(df_test.features)

In [82]:
scores = []

for i in [0, 1, 2, 3, 4, 5]:
    train_idx = df_train.fold != i
    val_idx = df_train.fold == i

    X = Xf_train[train_idx.values]
    y = df_train[train_idx].interest_level
    X_val = Xf_train[val_idx.values]
    y_val = df_train[val_idx].interest_level

    clf = LogisticRegression(C=0.5, penalty='l1', random_state=1)
    clf.fit(X, y)

    y_pred = clf.predict_proba(X_val)
    score = log_loss(y_val, y_pred)
    scores.append(score)
    print(score)
    
    for i in [0, 1, 2]:
        res_name = 'features_lr_%d' % i
        df_train.loc[val_idx, res_name] = y_pred[:, i]

np.mean(scores), np.std(scores)

0.722846365401
0.731271872954
0.733929009458
0.72174829897
0.722139420345
0.718488895693


(0.72507064380363484, 0.0055501164223626361)

In [83]:
clf = LogisticRegression(C=0.5, penalty='l1', random_state=1)
clf.fit(Xf_train, df_train.interest_level.values)

y_pred = clf.predict_proba(Xf_test)
for i in [0, 1, 2]:
    res_name = 'features_lr_%d' % i
    df_test[res_name] = y_pred[:, i]

In [84]:
avec = TfidfVectorizer(token_pattern='\S+', ngram_range=(1, 3), min_df=10)
X_ad_train = avec.fit_transform(df_train.address_normalized)
X_ad_test = avec.transform(df_test.address_normalized)

In [91]:
scores = []

for i in [0, 1, 2, 3, 4, 5]:
    train_idx = df_train.fold != i
    val_idx = df_train.fold == i

    X = X_ad_train[train_idx.values]
    y = df_train[train_idx].interest_level
    X_val = X_ad_train[val_idx.values]
    y_val = df_train[val_idx].interest_level

    clf = LogisticRegression(C=1, penalty='l1', random_state=1)
    clf.fit(X, y)

    y_pred = clf.predict_proba(X_val)
    score = log_loss(y_val, y_pred)
    scores.append(score)
    print(score)
    
    for i in [0, 1, 2]:
        res_name = 'address_lr_%d' % i
        df_train.loc[val_idx, res_name] = y_pred[:, i]

np.mean(scores), np.std(scores)

0.751564992284
0.754053809436
0.764623296695
0.75382927063
0.744765232424
0.737144524952


(0.75099685440355002, 0.0085057863602203965)

In [92]:
clf = LogisticRegression(C=1, penalty='l1', random_state=1)
clf.fit(X_ad_train, df_train.interest_level.values)

y_pred = clf.predict_proba(X_ad_test)
for i in [0, 1, 2]:
    res_name = 'address_lr_%d' % i
    df_test[res_name] = y_pred[:, i]

In [94]:
import feather

In [95]:
df_images_train = feather.read_dataframe('dfs/df_train_group.feather')
df_images_test = feather.read_dataframe('dfs/df_test_group.feather')

In [112]:
df_train_m = df_train.merge(df_images_train, on='listing_id', how='left')
df_test_m = df_test.merge(df_images_test, on='listing_id', how='left')

In [109]:
to_exclude = {'building_id', 'created', 'description', 'display_address', 'features',
              'fold', 'index', 'interest_level', 'listing_id', 'manager_id', 'photos',
              'street_address', 'description_cleaned', 'address_normalized', 
              'interest_0', 'interest_1', 'interest_2', 'geometries'}
features = sorted(set(df_train_m.columns) - to_exclude)

In [145]:
i = 0 
X_train = df_train_m[df_train.fold != i][features].values
y_train = df_train_m[df_train.fold != i].interest_level.values

X_val = df_train_m[df_train.fold == i][features].values
y_val = df_train_m[df_train.fold == i].interest_level.values

In [146]:
import xgboost as xgb

In [147]:
xgb_pars = {
    'eta': 0.15,
    'gamma': 0,
    'max_depth': 6,
    'min_child_weight': 1,
    'max_delta_step': 0,
    'subsample': 0.6,
    'colsample_bytree': 0.6,
    'colsample_bylevel': 1,
    'lambda': 1,
    'alpha': 0,
    'tree_method': 'approx',
# not deafauts
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'num_class': 3,
    'nthread': 8,
    'seed': 42,
    'silent': 1
}

n_estimators = 95

In [148]:
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

watchlist = [(dtrain, 'train'), (dval, 'val')]

model = xgb.train(xgb_pars, dtrain, num_boost_round=n_estimators, verbose_eval=5,
             evals=watchlist)

[0]	train-mlogloss:1.00144	val-mlogloss:1.00355
[5]	train-mlogloss:0.736731	val-mlogloss:0.749817
[10]	train-mlogloss:0.632958	val-mlogloss:0.655255
[15]	train-mlogloss:0.580952	val-mlogloss:0.611823
[20]	train-mlogloss:0.550654	val-mlogloss:0.590184
[25]	train-mlogloss:0.530236	val-mlogloss:0.577469
[30]	train-mlogloss:0.514732	val-mlogloss:0.570599
[35]	train-mlogloss:0.501694	val-mlogloss:0.56591
[40]	train-mlogloss:0.490268	val-mlogloss:0.561882
[45]	train-mlogloss:0.479862	val-mlogloss:0.559776
[50]	train-mlogloss:0.469656	val-mlogloss:0.55743
[55]	train-mlogloss:0.460824	val-mlogloss:0.556028
[60]	train-mlogloss:0.451789	val-mlogloss:0.554588
[65]	train-mlogloss:0.442779	val-mlogloss:0.553264
[70]	train-mlogloss:0.435168	val-mlogloss:0.552729
[75]	train-mlogloss:0.427881	val-mlogloss:0.552666
[80]	train-mlogloss:0.420621	val-mlogloss:0.55219
[85]	train-mlogloss:0.413019	val-mlogloss:0.551662
[90]	train-mlogloss:0.405725	val-mlogloss:0.551242


In [149]:
y_pred = model.predict(dval)
log_loss(y_val, y_pred)

0.55111310279975945

X_train = df_train_m[features].values
y_train = df_train_m.interest_level.values

watchlist = [(dtrain, 'train')]

model = xgb.train(xgb_pars, dtrain, num_boost_round=n_estimators, verbose_eval=5,
             evals=watchlist)

In [126]:
print('feature importance:')

scores = model.get_score(importance_type='gain')

for n, s in sorted(scores.items(), key=lambda x: -x[1]):
    print(' - %s: %.4f' % (n, s))

not_used = set(features) - scores.keys()
print('not used features: %s' % not_used)

feature importance:
 - manager_id_interest_0_mtv: 29.8774
 - manager_id_interest_1_mtv: 12.2894
 - building_id_interest_0_mtv: 11.6440
 - interest_level_xgb_0_mean: 11.4313
 - manager_id_interest_2_mtv: 10.0934
 - price: 9.9939
 - address_normalized_interest_0_mtv: 7.6513
 - interest_level_xgb_0_min: 7.6261
 - building_id_interest_1_mtv: 7.6039
 - description_lr_0: 6.9513
 - bedrooms: 6.2555
 - description_lr_2: 6.2418
 - bathrooms_interest_0_mtv: 6.1566
 - interest_level_xgb_2_mean: 6.0577
 - features_lr_0: 5.9845
 - building_id_interest_2_mtv: 5.7086
 - interest_level_xgb_1_mean: 5.4699
 - bedrooms_interest_0_mtv: 5.2054
 - interest_level_xgb_1_max: 5.0503
 - features_lr_1: 4.9938
 - features_lr_2: 4.7784
 - interest_level_xgb_1_min: 4.6456
 - bedrooms_interest_1_mtv: 4.5214
 - description_lr_1: 4.2799
 - longitude: 4.2438
 - num_features: 4.1553
 - interest_level_xgb_0_max: 4.0930
 - latitude: 4.0846
 - building_cnt: 4.0547
 - interest_level_xgb_2_max: 3.8852
 - seconds_from_min_min

In [128]:
dtest = xgb.DMatrix(df_test_m[features], feature_names=features)

In [130]:
y_pred = model.predict(dtest)

In [136]:
df_res = df_test_m[['listing_id']].copy()

df_res['low'] = y_pred[:, 0]
df_res['medium'] = y_pred[:, 1]
df_res['high'] = y_pred[:, 2]

In [137]:
df_res.to_csv('eda.csv', index=False)

In [138]:
!gzip eda.csv