In [1]:
import warnings
# Install a blanket "ignore" filter: no Python warnings are shown by the cells below.
warnings.filterwarnings("ignore")

### Spark

In [2]:
from pyspark import init_spark
import pyspark.sql.functions as sf
from pyspark.sql.window import Window as sw
from pyspark.sql.types import StringType, IntegerType, ArrayType
# NOTE(review): `init_spark` looks like an environment-specific helper rather than a
# standard PySpark entry point — confirm where it comes from. The returned object is
# used below through `.table(...)` and `.stop()`.
spark = init_spark({"appName": 'smooth_features'})

### Imports

In [3]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score

from tqdm import tqdm
import seaborn as sns
# Global matplotlib appearance: "bmh" style sheet and DejaVu Sans as the font family.
plt.style.use("bmh")
plt.rcParams['font.family'] = 'DejaVu Sans'

# Datasets

In [4]:
# Source tables for the train and test splits, read through the Spark session.
train = spark.table('alfa.andrey_auto_train')
test = spark.table('alfa.andrey_auto_test')

In [5]:
# Row counts of the two source tables (each `.count()` is a separate Spark action).
# N_TEST is not referenced by any of the cells visible in this notebook.
N_TRAIN = train.count()
N_TEST = test.count()

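# Smooth statistics

Smoothed target encoding: each category combination is encoded as `(1 - ALPHA_PERC) * mean(target | category) + ALPHA_PERC * global_mean`. Train rows are encoded out-of-fold (statistics come from the other `MAX_K - 1` folds of `app_id`); test rows are encoded with statistics from the whole train set.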

In [6]:
# Weight of the global target mean in the smoothed encoding; the per-category
# mean receives the remaining (1 - ALPHA_PERC).
ALPHA_PERC = 0.1
# Number of folds the app_ids are split into for out-of-fold encoding of the train set.
MAX_K = 25

In [7]:
def smooth_statistic(stat_set, apply_set, encoding_list, global_y):
    """Encode ``apply_set`` rows with a smoothed target mean learned on ``stat_set``.

    For every distinct combination of the ``encoding_list`` columns found in
    ``stat_set`` the mean of ``target`` is computed and blended with ``global_y``::

        (1 - ALPHA_PERC) * group_mean + ALPHA_PERC * global_y

    Rows of ``apply_set`` whose combination has no match in ``stat_set``
    (the left join yields null) receive ``global_y`` itself.

    Parameters
    ----------
    stat_set : Spark DataFrame
        Rows used to compute the statistics; must contain ``app_id``,
        ``target`` and every column of ``encoding_list``.
    apply_set : Spark DataFrame
        Rows to encode; must contain ``app_id`` and every column of
        ``encoding_list``. All other columns are dropped.
    encoding_list : list of str
        Column names that define a category combination.
    global_y : float
        Prior (global target mean) the group means are blended with.

    Returns
    -------
    Spark DataFrame
        ``apply_set`` reduced to ``app_id`` and the encoding columns, plus a
        new column named ``'encod_via_' + '_'.join(encoding_list)``.
    """
    new_feature_name = 'encod_via_' + '_'.join(encoding_list)
    stat_set = stat_set.select(['app_id', 'target'] + encoding_list)
    apply_set = apply_set.select(['app_id'] + encoding_list)

    # Per-combination target mean on the statistics set.
    means = stat_set.groupBy(encoding_list).agg(sf.mean('target').alias(new_feature_name))
    apply_set = apply_set.join(means, on=encoding_list, how='left')

    # Blend the group mean with the prior.
    apply_set = apply_set.withColumn(new_feature_name,
                        sf.lit(1 - ALPHA_PERC) * sf.col(new_feature_name) + sf.lit(global_y * ALPHA_PERC))

    # Only the new feature falls back to the prior. Without `subset`, fillna would
    # also overwrite nulls in app_id and in every numeric encoding column.
    apply_set = apply_set.fillna(global_y, subset=[new_feature_name])
    return apply_set

In [8]:
# Train labels: the `flag` column is exposed as `target` and attached to every train row.
target_for_smooth_train = (
    spark.table('alfa.andrey_auto_target_train')
         .withColumnRenamed('flag', 'target')
)
train_for_smooth = train.join(target_for_smooth_train, how='inner', on=['app_id'])

# The test label table is joined as-is (no rename).
target_for_smooth_test = spark.table('alfa.andrey_auto_target_test')
test_for_smooth = test.join(target_for_smooth_test, how='inner', on=['app_id'])

# Fold assignment per app_id: number the distinct (app_id, target) pairs in
# (target, app_id) order inside one constant partition, then take the rank
# modulo MAX_K so that consecutive ranks land in different folds.
fold_window = sw.partitionBy('new_column').orderBy(['target', 'app_id'])
app_id_groups = (
    train_for_smooth.select(['app_id', 'target'])
                    .dropDuplicates()
                    .withColumn('new_column', sf.lit("ABC"))
                    .withColumn('k', sf.row_number().over(fold_window))
                    .drop('new_column')
                    .withColumn('k', sf.expr("mod(k, {})".format(MAX_K)))
                    .select(['app_id', 'k'])
)

In [9]:
# Prior used when encoding the test set: target mean over all labelled train rows.
TRAIN_GLOBAL_MEAN = train_for_smooth.select(sf.mean('target')).take(1)[0][0]
table_for_batches = train_for_smooth.select(['app_id', 'target'])
table_for_batches = table_for_batches.join(app_id_groups, how='left', on=['app_id'])

# One aggregation pass yields the per-fold target sum and non-null count. The
# leave-one-fold-out means are then derived locally, instead of running MAX_K
# separate Spark actions over the same join. Rows without a fold are left out,
# exactly as the `k != current_k` filter would drop them.
fold_stats = {
    row['k']: (row['target_sum'] or 0, row['target_cnt'])
    for row in table_for_batches.filter(sf.col('k').isNotNull())
                                .groupBy('k')
                                .agg(sf.sum('target').alias('target_sum'),
                                     sf.count('target').alias('target_cnt'))
                                .collect()
}
total_target_sum = sum(fold_sum for fold_sum, _ in fold_stats.values())
total_target_cnt = sum(fold_cnt for _, fold_cnt in fold_stats.values())

# batches_mean_target[k] = target mean over all rows that are NOT in fold k.
batches_mean_target = {}
for current_k in range(MAX_K):
    fold_sum, fold_cnt = fold_stats.get(current_k, (0, 0))
    rest_cnt = total_target_cnt - fold_cnt
    # An empty complement has no mean; mirror Spark's avg(), which yields null.
    batches_mean_target[current_k] = \
        float(total_target_sum - fold_sum) / rest_cnt if rest_cnt else None

In [10]:
from itertools import combinations

# Candidate columns for target encoding.
possible_columns = [
    'product', 'operation_kind', 'card_type', 'operation_type',
    'ecommerce_flag', 'income_flag', 'mcc', 'city',
    'mcc_category', 'day_of_week', 'hour', 'weekofyear',
]

# Every single column first, followed by every unordered pair of columns.
single_column_sets = [[column] for column in possible_columns]
column_pair_sets = [list(pair) for pair in combinations(possible_columns, 2)]
all_possible_for_smooth = single_column_sets + column_pair_sets

In [11]:
# Overrides the full candidate list built in the previous cell: only these three
# combinations are encoded by the loop below.
# NOTE(review): 'hour_diff' is not among `possible_columns`; presumably it exists in
# the train/test tables — confirm.
all_possible_for_smooth = [['product', 'card_type'], ['card_type', 'hour_diff'], ['card_type', 'hour']]

In [12]:
# One row per app_id; encoded features are left-joined onto these frames.
train_smooth_table = train.select(['app_id']).dropDuplicates()
test_smooth_table = test.select(['app_id']).dropDuplicates()

# How each row-level encoding is collapsed to a single value per app_id:
# feature name -> (aggregate function, suffix appended to the column name).
# A feature that is missing here is joined without aggregation, as before.
app_level_aggregations = {
    'encod_via_card_type_hour': (sf.mean, '_mean'),
    'encod_via_product_card_type': (sf.max, '_max'),
    'encod_via_card_type_hour_diff': (sf.max, '_max'),
}

for encoding_list in tqdm(all_possible_for_smooth):
    new_feature_name = 'encod_via_' + '_'.join(encoding_list)

    # smooth_statistic() keeps only app_id, target and the encoding columns, so the
    # former n_k / n_m_k helper columns never reached the result and are not built.
    current_smooth = train_for_smooth.select(['app_id', 'target'] + encoding_list)
    current_smooth = current_smooth.join(app_id_groups, how='left', on=['app_id'])
    current_smooth_test = test_for_smooth.select(['app_id'] + encoding_list)

    # Out-of-fold encoding of the train set: rows of fold c_k are encoded with
    # statistics (and the prior) computed on every other fold.
    averages = [
        smooth_statistic(current_smooth.filter(sf.col('k') != c_k),
                         current_smooth.filter(sf.col('k') == c_k),
                         encoding_list, batches_mean_target[c_k])
        for c_k in range(MAX_K)
    ]
    smooth_cv_table = averages[0]
    for fold_table in averages[1:]:
        smooth_cv_table = smooth_cv_table.union(fold_table)

    # The test set is encoded with statistics from the whole train set.
    smooth_test_table = smooth_statistic(current_smooth, current_smooth_test, encoding_list, TRAIN_GLOBAL_MEAN)

    # Collapse the row-level values to one value per app_id.
    aggregation = app_level_aggregations.get(new_feature_name)
    if aggregation is not None:
        agg_func, suffix = aggregation
        agg_column = agg_func(new_feature_name).alias(new_feature_name + suffix)
        smooth_cv_table = smooth_cv_table.groupBy(['app_id']).agg(agg_column)
        smooth_test_table = smooth_test_table.groupBy(['app_id']).agg(agg_column)

    train_smooth_table = train_smooth_table.join(smooth_cv_table, on=['app_id'], how='left')
    test_smooth_table = test_smooth_table.join(smooth_test_table, on=['app_id'], how='left')

100%|██████████| 3/3 [00:12<00:00,  4.13s/it]


_____

# Merge features and Add target

In [13]:
# Train labels (`flag` exposed as `target`) with the encoded train features attached.
target_train = (
    spark.table('alfa.andrey_auto_target_train')
         .withColumnRenamed('flag', 'target')
         .select(['app_id', 'target'])
         .join(train_smooth_table, on=['app_id'], how='left')
)

In [14]:
# Test app_ids with a constant target of -1 and the encoded test features attached.
target_test = (
    spark.table('alfa.andrey_auto_target_test')
         .withColumn('target', sf.lit(-1))
         .select(['app_id', 'target'])
         .join(test_smooth_table, on=['app_id'], how='left')
)

# Features after merge

In [15]:
# Number of columns in the merged train and test frames, displayed as a tuple.
len(target_train.columns), len(target_test.columns)

(5, 5)

# Save tables

In [16]:
%%time
# Write the train features to an ORC table (overwriting any existing one), then read
# the saved table back and dump it to a local pickle through pandas.
target_train.write.format('orc').mode('overwrite').saveAsTable('alfa.andrey_auto_payments_train_main__')
spark.table('alfa.andrey_auto_payments_train_main__').toPandas().to_pickle('./train_smooth_.pkl')

CPU times: user 5.75 s, sys: 743 ms, total: 6.5 s
Wall time: 4min 50s


In [17]:
%%time
# Write the test features to an ORC table (overwriting any existing one), then read
# the saved table back and dump it to a local pickle through pandas.
target_test.write.format('orc').mode('overwrite').saveAsTable('alfa.andrey_auto_payments_test_main__')
spark.table('alfa.andrey_auto_payments_test_main__').toPandas().to_pickle('./test_smooth_.pkl')

CPU times: user 3.06 s, sys: 322 ms, total: 3.38 s
Wall time: 1min 18s


In [18]:
# Shut down the Spark session; no cell after this one uses it.
spark.stop()

______

______

______

______

______

______

______

______

______

______

______

______

______

______

______

______

______

______

______

______

______

______

______