In [3]:
import pandas as pd 
import numpy as np

import pickle
import gc 

from avito_functions import * 
from avito_classes import TargetEncoder

from scipy.sparse import hstack, csr_matrix, vstack
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from datetime import datetime
from itertools import compress


In [4]:
# data
# Names of the five data partitions, in the order the later cells iterate over them.
data_keys = ['train', 'valid', 'holdout', 'fulltrain', 'test']

print('Load df')
df_train = pd.read_csv("../input/train.csv")
df_test = pd.read_csv("../input/test.csv")

print('Load agg input')
# NOTE: pickle.load executes arbitrary code embedded in the file; only load
# artifacts produced by this project.
with open('../input/map_dict.pkl', 'rb') as file:
    map_dict = pickle.load(file)
# Disabled input, kept for reference:
#with open('../input/text_features.pkl', 'rb') as f: X_text = pickle.load(f)
with open('../input/text_num_features.pkl', 'rb') as f:
    X_text_num = pickle.load(f)

# Precomputed feature sets; the cell that attaches them indexes each one by the
# names in data_keys. load_fe is presumably provided by the avito_functions
# star import -- TODO confirm.
sgd = load_fe('sgd2')
extra = load_fe('extra')

Load df
Load agg input


In [5]:
## pipeline
# Number of training rows; a later cell uses it to cut the combined matrix
# back into its train and test parts.
n_train = len(df_train)
add_features = X_text_num

# preprocessing / feature_engineering are not defined in this notebook
# (presumably they come from the avito_functions star import -- TODO confirm).
X, y, category_features = preprocessing(df_train, df_test, map_dict, add_features)
X, category_features = feature_engineering(X, category_features)

# Replace each categorical column with the integer codes from pd.factorize.
for col in category_features:
    X[col] = pd.factorize(X[col])[0]

# The raw frames are not used after this point; release them.
del df_train, df_test
gc.collect()
# Timestamp displayed as the cell result.
str(datetime.now())

run preprocessing..
run feature engineering..
-- count fraction price_x_region__category_name_frac
-- count fraction price_x_region__param_1_frac
-- count fraction price_x_region__param_2_frac
-- count fraction price_x_region__image_top_1_frac
-- count fraction price_x_city__category_name_frac
-- count fraction price_x_city__param_1_frac
-- count fraction price_x_city__param_2_frac
-- count fraction price_x_city__image_top_1_frac
-- count fraction price_x_image_top_1__category_name_frac
-- count fraction price_x_image_top_1__param_1_frac
-- count fraction price_x_image_top_1__param_2_frac
-- count fraction price_x_population_groups__param_1_frac
-- combine factors: price_log_cut_x_parent_category_name
-- combine factors: price_log_cut_x_category_name
-- combine factors: price_log_cut_x_region


'2018-06-26 18:13:30.559660'

In [6]:
# Cut the combined matrix at n_train: rows before it become the train part,
# the remaining rows become the test part.
X_train = X[:n_train]
X_test = X[n_train:]

# Remove the combined name now that both parts have their own.
del X
gc.collect()

67

In [7]:
# validation_split returns nine values; the first three are kept as feature
# splits, the next three as the matching targets, and the last three are discarded.
(x_train, x_valid, x_holdout,
 y_train, y_valid, y_holdout,
 _, _, _) = validation_split(X_train, y)

run validation splitting..


In [8]:
# Attach the preloaded 'sgd' and 'extra' features as columns. Frames are paired
# positionally with data_keys, so the order of this tuple must match it.
for k, x in zip(data_keys, (x_train, x_valid, x_holdout, X_train, X_test)):
    x['sgd'] = sgd[k]
    x['ext'] = extra[k]

In [9]:
# target encoding

# One single-column group per categorical feature ...
te_groups = [[feature] for feature in category_features]

# ... followed by hand-picked column combinations.
te_groups.extend([
    ['price_log_cut', 'category_name'],
    ['price_log_cut', 'region'],
    ['price_log_cut', 'param_1'],
    ['region', 'parent_category_name'],
])

# Each group is encoded twice: once for the train/valid/holdout split using
# y_train, and once for full-train/test using y.
for group in te_groups:
    x_train, x_valid, x_holdout = target_encoding(
        x_train, y_train, x_valid, group, x_holdout
    )
    X_train, X_test = target_encoding(X_train, y, X_test, group)

-- target encoding: ['region']
-- target encoding: ['region']
-- target encoding: ['city']
-- target encoding: ['city']
-- target encoding: ['parent_category_name']
-- target encoding: ['parent_category_name']
-- target encoding: ['category_name']
-- target encoding: ['category_name']
-- target encoding: ['param_1']
-- target encoding: ['param_1']
-- target encoding: ['param_2']
-- target encoding: ['param_2']
-- target encoding: ['param_3']
-- target encoding: ['param_3']
-- target encoding: ['user_type']
-- target encoding: ['user_type']
-- target encoding: ['image_top_1']
-- target encoding: ['image_top_1']
-- target encoding: ['price_log_cut_x_parent_category_name']
-- target encoding: ['price_log_cut_x_parent_category_name']
-- target encoding: ['price_log_cut_x_category_name']
-- target encoding: ['price_log_cut_x_category_name']
-- target encoding: ['price_log_cut_x_region']
-- target encoding: ['price_log_cut_x_region']
-- target encoding: ['price_exists']
-- target encoding: [

In [10]:
# Set the raw categorical columns of every frame aside in cat_data (same order
# as the list below), then remove them from the frames in place.
cat_data = []
for x in [x_train, x_valid, x_holdout, X_train, X_test]:
    cat_data.append(x[category_features])
    # axis is passed by keyword: the positional form drop(labels, 1) is
    # deprecated and raises TypeError on pandas >= 2.0.
    x.drop(category_features, axis=1, inplace=True)
    # Sanity check: report the shape and whether the column layout still
    # matches x_train.
    print(x.shape, all(x.columns == x_train.columns))

(1103424, 57) True
(300000, 57) True
(100000, 57) True
(1503424, 57) True
(508438, 57) True


In [11]:
## impute, then scale
# Both helpers are applied the same way: first to the train/valid/holdout
# split, then to the full-train/test pair. Each returns one extra trailing
# value, which is discarded here.
for message, transform in (('impute numeric', num_fillna),
                           ('scale numeric', num_scaling)):
    print(message)
    x_train, x_valid, x_holdout, _ = transform(x_train, x_valid, x_holdout)
    X_train, X_test, _ = transform(X_train, X_test)

impute numeric
scale numeric


In [12]:
# Targets for the three validation splits, followed by the five feature frames
# in the same order as data_keys.
labels = [y_train, y_valid, y_holdout]
data = [x_train, x_valid, x_holdout, X_train, X_test]

In [13]:
import hdbscan


In [20]:
# HDBSCAN clusterer; min_cluster_size is the only setting changed from the
# library defaults.
hdbscan_params = {'min_cluster_size': 100}
clusterer = hdbscan.HDBSCAN(**hdbscan_params)
# Display the estimator with its full parameter set.
clusterer

HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
    approx_min_span_tree=True, cluster_selection_method='eom',
    core_dist_n_jobs=4, gen_min_span_tree=False, leaf_size=40,
    match_reference_implementation=False, memory=Memory(cachedir=None),
    metric='euclidean', min_cluster_size=100, min_samples=None, p=None,
    prediction_data=False)

In [None]:
# Fit on the full-train rows followed by the test rows.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in 2.0; with default arguments both stack the frames row-wise and
# keep the original index labels.
clusterer.fit(pd.concat([X_train, X_test]))

In [None]:
# from sklearn.neighbors import KNeighborsRegressor
# params = {'n_neighbors': 10, 'n_jobs': 3}
# pred_val, pred_hol, extra = train_sklearn(KNeighborsRegressor, params, data, labels)
# str(datetime.now())

In [None]:
# # kmeans
# from sklearn.cluster import KMeans
# for k in [30, 50]:
#     kmeans = KMeans(k, n_jobs=3)
#     kmeans_labels = kmeans.fit_predict(X_train.append(X_test))
#     d_preds = {}
#     d_preds['fulltrain'] = kmeans_labels[:n_train]
#     d_preds['test'] = kmeans_labels[n_train:]
#     _,_,_, d_preds['train'],d_preds['valid'],d_preds['holdout'] ,_,_,_ = validation_split(X_train, kmeans_labels)
#     with open('../fe/kmeans{}.pkl'.format(str(k)), 'wb') as file: pickle.dump(file=file, obj=d_preds)


In [54]:
# from sklearn.cluster import MeanShift, estimate_bandwidth
# bandwidth = estimate_bandwidth(X_train.append(X_test), quantile=0.2, n_samples=5000)

In [57]:
# ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False, n_jobs=2)
# ms.fit(X_train.append(X_test))
# labels = ms.labels_
# cluster_centers = ms.cluster_centers_
# labels_unique = np.unique(labels)
# n_clusters_ = len(labels_unique)

In [None]:

# model = ExtraTreesRegressor(**params)

# # valid 
# data = [x_train.values, x_valid.values, x_holdout.values]
# preds = oof_prediction(model, data, y_train)
# # test
# data = [X_train.values, X_test.values]
# preds += oof_prediction(model, data, y)

# d_preds = {}
# for pred, k in zip(preds, ['train', 'valid', 'holdout', 'fulltrain', 'test']):
#     d_preds[k] = pred
    
# with open('../fe/knn.pkl', 'wb') as file: pickle.dump(file=file, obj=d_preds)