In [1]:
import pandas as pd 
import numpy as np

import pickle
import gc 

from avito_functions import * 
from avito_classes import TargetEncoder

from scipy.sparse import hstack, csr_matrix, vstack
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from datetime import datetime
from itertools import compress

In [8]:
# data
# Reads every input this notebook needs: the extra-feature CSV, the raw
# train/test tables and the pickled inputs under ../input.

# Dataset keys in the order train / valid / holdout / fulltrain / test.
data_keys = ['train', 'valid', 'holdout', 'fulltrain', 'test']
# The first CSV column is dropped (presumably a saved index -- TODO confirm).
new_features = pd.read_csv('../input/new_feautures.csv').iloc[:, 1:]

print('Load df')
df_train = pd.read_csv("../input/train.csv")
df_test = pd.read_csv("../input/test.csv")

print('Load agg input')
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('../input/map_dict.pkl', 'rb') as file:
    map_dict = pickle.load(file)
#with open('../input/text_features.pkl', 'rb') as f: X_text = pickle.load(f)
with open('../input/text_num_features.pkl', 'rb') as f:
    X_text_num = pickle.load(f)
# Indexed by data_keys entries further down when the 'sgd' column is added.
sgd = load_fe('sgd2')

Load df
Load agg input


In [9]:
## pipeline
# Remember how many rows belong to train before the raw frames are deleted;
# the combined matrix X is cut at this position later on.
n_train = len(df_train)
add_features = X_text_num

X, y, category_features = preprocessing(df_train, df_test, map_dict, add_features)
X, category_features = feature_engineering(X, category_features)

# The raw tables are no longer needed once X has been built.
del df_train
del df_test
gc.collect()
# Shown as the cell output: timestamp of when the pipeline finished.
str(datetime.now())

run preprocessing..
run feature engineering..
-- count fraction price_x_region__category_name_frac
-- count fraction price_x_region__param_1_frac
-- count fraction price_x_region__param_2_frac
-- count fraction price_x_region__image_top_1_frac
-- count fraction price_x_city__category_name_frac
-- count fraction price_x_city__param_1_frac
-- count fraction price_x_city__param_2_frac
-- count fraction price_x_city__image_top_1_frac
-- count fraction price_x_image_top_1__category_name_frac
-- count fraction price_x_image_top_1__param_1_frac
-- count fraction price_x_image_top_1__param_2_frac
-- count fraction price_x_population_groups__param_1_frac
-- combine factors: price_log_cut_x_parent_category_name
-- combine factors: price_log_cut_x_category_name
-- combine factors: price_log_cut_x_region


'2018-06-26 01:12:05.283169'

In [10]:
# OHE

# f = category_features[0]
# dense = pd.get_dummies(X[f], prefix=f, dummy_na=True)
# ohe_data = csr_matrix(dense)
# ohe_features = dense.columns.tolist()

# for f in category_features[1:]:
#     print(f)
#     dense = pd.get_dummies(X[f], prefix=f, dummy_na=True)
#     ohe_data = hstack([ohe_data, dense])
#     ohe_features += dense.columns.tolist()
    
# ohe_data = ohe_data.tocsr()

##################
# OHE: small RAM #
##################

# dict_levels = {}
# ohe_features = []
# for f in category_features:
#     X, levels = factorize(X, f)
#     dict_levels[f] = levels
#     ohe_features += levels 

# ohe_data = None
# for f in category_features:
#     if ohe_data is None:
#         ohe_data = onehot_vec(X, f, dict_levels)
#     else:
#         ohe_data = hstack([ohe_data, onehot_vec(X, f, dict_levels)]).tocsr()

# factorize for lgbm
# Replace each categorical column by the integer codes from pd.factorize
# (pandas assigns the sentinel code -1 to missing values by default).
for col in category_features:
    codes, _uniques = pd.factorize(X[col])
    X[col] = codes


In [11]:
#ohe_train, ohe_test = ohe_data[:n_train], ohe_data[n_train:]
# Cut the combined frame by position: the first n_train rows are the
# training part, everything after them is the test part.
X_train = X.iloc[:n_train]
X_test = X.iloc[n_train:]

# Drop the name of the combined frame and trigger a collection pass.
del X
# del X, ohe_data
gc.collect()

186

In [12]:
# split
# validation_split yields nine values: three feature frames, three targets,
# and three trailing values that this notebook does not use.
(
    x_train, x_valid, x_holdout,
    y_train, y_valid, y_holdout,
    _, _, _,
) = validation_split(X_train, y)

run validation splitting..


In [15]:
# Add an 'sgd' column to every frame. Frames and data_keys are paired by
# position, so the list below must stay in data_keys order.
frames = [x_train, x_valid, x_holdout, X_train, X_test]
for k, x in zip(data_keys, frames):
    x['sgd'] = sgd[k]

In [16]:
# target encoding

# Every categorical feature forms a single-column group of its own ...
te_groups = [[f] for f in category_features]
# ... followed by a few hand-picked column pairs.
te_groups.extend([
    ['price_log_cut', 'category_name'],
    ['price_log_cut', 'region'],
    ['price_log_cut', 'param_1'],
    ['region', 'parent_category_name'],
])

for group in te_groups:
    # Split level: train targets, applied to train / valid / holdout.
    x_train, x_valid, x_holdout = target_encoding(x_train, y_train, x_valid, group, x_holdout)
    # Submission level: full-train targets, applied to full train / test.
    X_train, X_test = target_encoding(X_train, y, X_test, group)

-- target encoding: ['region']
-- target encoding: ['region']
-- target encoding: ['city']
-- target encoding: ['city']
-- target encoding: ['parent_category_name']
-- target encoding: ['parent_category_name']
-- target encoding: ['category_name']
-- target encoding: ['category_name']
-- target encoding: ['param_1']
-- target encoding: ['param_1']
-- target encoding: ['param_2']
-- target encoding: ['param_2']
-- target encoding: ['param_3']
-- target encoding: ['param_3']
-- target encoding: ['user_type']
-- target encoding: ['user_type']
-- target encoding: ['image_top_1']
-- target encoding: ['image_top_1']
-- target encoding: ['price_log_cut_x_parent_category_name']
-- target encoding: ['price_log_cut_x_parent_category_name']
-- target encoding: ['price_log_cut_x_category_name']
-- target encoding: ['price_log_cut_x_category_name']
-- target encoding: ['price_log_cut_x_region']
-- target encoding: ['price_log_cut_x_region']
-- target encoding: ['price_exists']
-- target encoding: [

In [17]:
# for x in [x_train, x_valid, x_holdout]:
#     x.drop(category_features, 1, inplace=True)
#     print(x.shape, all(x.columns == x_train.columns))

# save category features
# For every frame: keep a copy of its categorical columns in cat_data, drop
# those columns from the frame in place, then print the remaining shape and
# whether the remaining columns match x_train's.
cat_data = []
for x in [x_train, x_valid, x_holdout, X_train, X_test]:
    cat_data.append(x[category_features])
    # `axis` is passed by keyword: the positional form of DataFrame.drop was
    # deprecated and is rejected by pandas >= 2.0.
    x.drop(category_features, axis=1, inplace=True)
    # Index.equals reports False when the column counts differ, where the
    # element-wise `==` comparison would raise instead.
    print(x.shape, x.columns.equals(x_train.columns))

(1103424, 56) True
(300000, 56) True
(100000, 56) True
(1503424, 56) True
(508438, 56) True


In [18]:
## impute 
# Fill missing numeric values, once for the train/valid/holdout split and once
# for the full-train/test pair. The trailing return value of each helper is
# discarded (presumably the fitted transformer -- TODO confirm in
# avito_functions).
# NOTE(review): presumably the statistics come from the first argument only;
# verify against num_fillna / num_scaling.
print('impute numeric')
x_train, x_valid, x_holdout, _ = num_fillna(x_train, x_valid, x_holdout)
X_train, X_test, _ = num_fillna(X_train, X_test)

## scale
# Same two-level pattern as the imputation above; must run after it because
# it consumes the imputed frames.
print('scale numeric')
x_train, x_valid, x_holdout, _ = num_scaling(x_train, x_valid, x_holdout)
X_train, X_test, _ = num_scaling(X_train, X_test)

impute numeric
scale numeric


In [19]:
## feature union 

# Column labels of the prepared training frame. `features` and `num_features`
# are two names for the same list object, since no one-hot columns are
# appended in this version.
features = num_features = x_train.columns.tolist()
# features = num_features + ohe_features

# x_train = hstack([x_train, ohe_train]).tocsr()
# x_valid = hstack([x_valid, ohe_valid]).tocsr()
# x_holdout = hstack([x_holdout, ohe_holdout]).tocsr()
# X_train = hstack([X_train, ohe_data[:n_train]]).tocsr()
# X_test = hstack([X_test, ohe_test]).tocsr()

In [20]:
# numeric noise
# noise_train = np.random.randn(x_train.shape[0], 20)
# noise_valid = np.random.randn(x_valid.shape[0], 20)
# x_train_noise = np.hstack([x_train, noise_train])
# x_valid_noise = np.hstack([x_valid, noise_valid])

# rf = RandomForestRegressor(max_depth=8,
#                            n_estimators=250,
#                            verbose=1, 
#                            min_samples_leaf=10,
#                            max_features=10,
#                            n_jobs=4
#                           )
# rf.fit(x_train_noise, y_train)
# print(rmse(y_valid, rf.predict(x_valid_noise)))

# random_importance = rf.feature_importances_[-20:].max()
# mask = rf.feature_importances_ > random_importance

# x_train_masked = x_train_noise[:, mask]
# x_valid_masked = x_valid_noise[:, mask]
# x_holdout_masked = x_holdout.loc[:, mask[:-20]]
# X_train_masked = X_train.loc[:, mask[:-20]]
# X_test_masked = X_test.loc[:, mask[:-20]]

# useful_features = list(compress(features, mask.tolist()))

In [21]:
# train_tsvd(50, tfidf_dict)
# train_tsvd(100, tfidf_dict)

In [22]:
# from sklearn.decomposition import TruncatedSVD

# def train_tsvd(n, tfidf_dict):
#     print('-- tSVD:', n)
#     ret = {}
#     tsvd = TruncatedSVD(n_components=n, random_state=2018)
#     ret['train'] = tsvd.fit_transform(tfidf_dict['train'])
#     ret['valid'] = tsvd.transform(tfidf_dict['valid'])
#     ret['holdout'] = tsvd.transform(tfidf_dict['holdout'])    
#     ret['fulltrain'] = tsvd.fit_transform(tfidf_dict['fulltrain'])
#     ret['test'] = tsvd.transform(tfidf_dict['test'])
#     with open('../fe/tfidf_svd' + str(n) + '.pkl', 'wb') as file: pickle.dump(file=file, obj=ret)
#     return ret

# with open('../input/tfidf_1.pkl', 'rb') as f: 
#     tfidf_dict = pickle.load(f)

# n = 20
# fe_tfidf_svd = train_tsvd(n, tfidf_dict)

In [30]:
# data_keys = ['train', 'valid', 'holdout', 'fulltrain', 'test']
# data = []
# for x, s in zip([x_train, x_valid, x_holdout, X_train, X_test], data_keys):
#     data.append(np.hstack([x, fe_tfidf_svd[s]]))

# Targets of the three split-level sets, and all five prepared frames in
# train / valid / holdout / fulltrain / test order.
labels = [y_train, y_valid, y_holdout]
data = [x_train, x_valid, x_holdout, X_train, X_test]

In [None]:
# rf = RandomForestRegressor(**params)
# rf.fit(data[3], y)
# pred_test = rf.predict(data[4])
# save_data = [pred_val, pred_hol, pred_test]
# save_pred_holdout(save_data, 'rf1')

In [46]:
from sklearn.ensemble import ExtraTreesRegressor

# Extra-trees hyper-parameters.
params = {'max_depth': 30,
          'n_estimators': 100,
          'verbose': 1,
          'min_samples_leaf': 1,
          'max_features': 20,
          'n_jobs': 4,
          # Fixed seed: the predictions are pickled and reused as features,
          # so they must be reproducible. Same seed as the SGD models.
          'random_state': 2018,
         }

# pred_val, pred_hol, extra = train_sklearn(ExtraTreesRegressor, params, data, labels)

model = ExtraTreesRegressor(**params)

# valid: predictions for the train / valid / holdout split
data = [x_train.values, x_valid.values, x_holdout.values]
preds = oof_prediction(model, data, y_train)
# test: predictions for full train / test
data = [X_train.values, X_test.values]
preds += oof_prediction(model, data, y)

# Pair the predictions with their dataset keys by position.
# NOTE(review): zip stops at the shorter input, so this assumes
# oof_prediction returned three and then two prediction arrays -- confirm.
d_preds = dict(zip(['train', 'valid', 'holdout', 'fulltrain', 'test'], preds))

with open('../fe/extra.pkl', 'wb') as file:
    pickle.dump(file=file, obj=d_preds)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  3.9min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    5.1s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.5s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    6.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    1.8s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.3s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    5.2s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.6min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  3.8min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.3s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    5.1s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.5s
[Parallel(n_jobs=4)]: 

0.22352+-0.00028


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  2.5min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  5.6min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.4s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    7.6s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.9s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   11.2s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    7.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  2.5min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  5.6min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    7.6s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.6s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   10.9s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.1s
[Parallel(n_jobs=4)]: 

0.22264+-0.00022


In [None]:
# extra = ExtraTreesRegressor(**params)
# extra.fit(data[3], y)
# pred_test = extra.predict(data[4])
# save_data = [pred_val, pred_hol, pred_test]
# save_pred_holdout(save_data, 'extra1')

LINEAR

In [17]:
from sklearn.linear_model import SGDRegressor

# The only loader for tfidf_dict in this notebook is commented out, so on a
# fresh kernel this cell failed with NameError. Load it on demand; the path is
# taken from that commented-out loader -- confirm it is still the right file.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
if 'tfidf_dict' not in globals():
    with open('../input/tfidf_1.pkl', 'rb') as f:
        tfidf_dict = pickle.load(f)

# NOTE(review): the recorded output of this cell shows penalty='elasticnet',
# but the dict below says 'l2'. scikit-learn only uses l1_ratio with the
# elasticnet penalty, so with 'l2' the l1_ratio entry has no effect.
params = {'max_iter': 100,
          'loss': 'squared_loss',
          'random_state': 2018,
          'alpha': 0.0001,
          'penalty': 'l2',
          'l1_ratio': 0.01
         }

# Train on the tf-idf train matrix and evaluate on the valid matrix; the
# helper's return value is displayed as the cell output.
train_sklearn_valid(SGDRegressor, params, tfidf_dict['train'], tfidf_dict['valid'], y_train, y_valid)

0.230253516518


(array([ 0.07392767,  0.42289072,  0.18770561, ...,  0.01847857,
         0.06639792,  0.32432398]),
 SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
        fit_intercept=True, l1_ratio=0.01, learning_rate='invscaling',
        loss='squared_loss', max_iter=100, n_iter=None,
        penalty='elasticnet', power_t=0.25, random_state=2018, shuffle=True,
        tol=None, verbose=0, warm_start=False))

In [53]:
# Linear model on the tf-idf matrices. tfidf_dict has to be in the kernel
# already; nothing in this cell loads it.
model = SGDRegressor(alpha=1e-05, random_state=2018)

# valid: predictions for the train / valid / holdout split
data = [tfidf_dict['train'], tfidf_dict['valid'], tfidf_dict['holdout']]
preds = oof_prediction(model, data, y_train)

# test: predictions for full train / test
data = [tfidf_dict['fulltrain'], tfidf_dict['test']]
preds += oof_prediction(model, data, y)

# Pair the predictions with their dataset keys by position.
sgd_preds = dict(zip(['train', 'valid', 'holdout', 'fulltrain', 'test'], preds))

# Stored as 'sgd2' -- presumably the file load_fe('sgd2') reads in the data
# cell at the top of the notebook (TODO confirm).
with open('../fe/sgd2.pkl', 'wb') as file:
    pickle.dump(obj=sgd_preds, file=file)

0.23258+-0.00036
0.23210+-0.00025
