In [1]:
! pip3.5 install xgboost



Чтение данных

In [2]:
%matplotlib inline
%config InlineBackend.figure_format ='retina'

import os
import shutil
import sys
import glob
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import pyspark
import pyspark.sql.functions as F
from pyspark.conf import SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql import Row

# Root directory with the shared helper code (utils/) and the XGBoost-on-Spark
# artifacts (jars + sparkxgb.zip) referenced below.
COMMON_PATH = '/workspace/common'

# Make <COMMON_PATH>/utils importable; presumably this is where the `metrics`
# and `processing` modules imported later in this cell live -- confirm.
sys.path.append(os.path.join(COMMON_PATH, 'utils'))

# Extra spark-submit arguments: the two xgboost4j 0.72 jars and the sparkxgb
# Python wrapper archive. Newlines of the template are replaced with spaces so
# the value is a single-line argument string.
# NOTE(review): this is assigned before the session is created below;
# presumably it has to be, because it is read when the JVM is launched -- confirm.
os.environ['PYSPARK_SUBMIT_ARGS'] = """
--jars {common}/xgboost4j-spark-0.72.jar,{common}/xgboost4j-0.72.jar
--py-files {common}/sparkxgb.zip pyspark-shell
""".format(common=COMMON_PATH).replace('\n', ' ')

# Local-mode session (`local[*]`). `spark.task.cpus` and `spark.executor.cores`
# are both 6, the same value as the `nthread: 6` passed to XGBoost further down.
spark = SparkSession \
    .builder \
    .master('local[*]') \
    .appName("spark_sql_examples") \
    .config("spark.executor.memory", "100g") \
    .config("spark.driver.memory", "100g") \
    .config("spark.task.cpus", "6") \
    .config("spark.executor.cores", "6") \
    .getOrCreate()

# Handles derived from the session; `sqlContext` is what pandas_to_spark()
# uses to build Spark DataFrames.
sc = spark.sparkContext
sqlContext = SQLContext(sc)

from metrics import rocauc, logloss, ne, get_ate
from processing import split_by_col

from sparkxgb.xgboost import *

In [3]:
DATA_PATH = '/workspace/data/criteo'

TRAIN_PATH = os.path.join(DATA_PATH, 'train.csv')

In [4]:
import pandas as pd 

In [5]:
train = pd.read_csv(TRAIN_PATH)

In [6]:
train.head()

Unnamed: 0,_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,...,_c31,_c32,_c33,_c34,_c35,_c36,_c37,_c38,_c39,id
0,1,0.0,-1,,,1465.0,0.0,17.0,0.0,4.0,...,e5f8f18f,,,f3ddd519,,32c7478e,b34f3128,,,12
1,1,0.0,1,20.0,16.0,1548.0,93.0,42.0,32.0,912.0,...,1f868fdd,21ddcdc9,a458ea53,7eee76d1,,32c7478e,9af06ad9,9d93af03,cdfe5ab7,26
2,0,8.0,0,15.0,20.0,115.0,24.0,8.0,23.0,24.0,...,1304f63b,21ddcdc9,b1252a9d,07b2853e,,32c7478e,94bde4f2,010f6491,09b76f8d,39
3,1,88.0,319,,4.0,5.0,4.0,89.0,40.0,88.0,...,bbf70d82,,,16e2e3b3,,32c7478e,d859b4dd,,,41
4,0,0.0,53,,10.0,6550.0,98.0,34.0,11.0,349.0,...,fa0643ee,21ddcdc9,b1252a9d,0094bc78,,32c7478e,29ece3ed,001f3601,402185f3,85


In [7]:
# Feature column names: `_c1`..`_c13` are handled as numeric features,
# `_c14`..`_c39` as categorical ones (`_c0` is the label column).
num_columns = list(map('_c{}'.format, range(1, 14)))
cat_columns = list(map('_c{}'.format, range(14, 40)))
len(num_columns), len(cat_columns)

(13, 26)

In [8]:
! pip3.5 install category_encoders



In [9]:
import category_encoders as ce

In [10]:
encoder = ce.CatBoostEncoder(cols=cat_columns)

In [11]:
X, y = train.drop(columns=['_c0', 'id']), train['_c0']

In [12]:
%%time
X_encoded = encoder.fit_transform(X, y)

CPU times: user 5min 15s, sys: 27.5 s, total: 5min 43s
Wall time: 5min 43s


In [13]:
X_encoded[['_c0', 'id']] = train[['_c0', 'id']]

In [14]:
X_encoded.head()

Unnamed: 0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,_c10,...,_c32,_c33,_c34,_c35,_c36,_c37,_c38,_c39,_c0,id
0,0.0,-1,,,1465.0,0.0,17.0,0.0,4.0,0.0,...,0.256457,0.256457,0.256457,0.256457,0.256457,0.256457,0.256457,0.256457,1,12
1,0.0,1,20.0,16.0,1548.0,93.0,42.0,32.0,912.0,0.0,...,0.256457,0.256457,0.256457,0.628229,0.628229,0.256457,0.256457,0.256457,1,26
2,8.0,0,15.0,20.0,115.0,24.0,8.0,23.0,24.0,2.0,...,0.628229,0.256457,0.256457,0.752152,0.752152,0.256457,0.256457,0.256457,0,39
3,88.0,319,,4.0,5.0,4.0,89.0,40.0,88.0,3.0,...,0.628229,0.628229,0.256457,0.564114,0.564114,0.256457,0.628229,0.628229,1,41
4,0.0,53,,10.0,6550.0,98.0,34.0,11.0,349.0,0.0,...,0.418819,0.128229,0.256457,0.651291,0.651291,0.256457,0.256457,0.256457,0,85


In [15]:
# from: https://stackoverflow.com/a/56895546/6696410

from pyspark.sql.types import *

# Auxiliary functions
def equivalent_type(f):
    """Map a pandas/numpy column dtype to the matching Spark SQL data type.

    Parameters
    ----------
    f : numpy.dtype or str
        Dtype of a pandas column (an element of ``DataFrame.dtypes``). It is
        compared by equality against the dtype names handled below.

    Returns
    -------
    pyspark.sql.types.DataType
        ``TimestampType`` for ``datetime64[ns]``, ``LongType`` for ``int64``,
        ``IntegerType`` for ``int32``, ``DoubleType`` for ``float64`` and
        ``StringType`` for everything else.
    """
    if f == 'datetime64[ns]':
        # datetime64[ns] carries a time of day; a date-only type would drop it.
        return TimestampType()
    elif f == 'int64':
        return LongType()
    elif f == 'int32':
        return IntegerType()
    elif f == 'float64':
        # float64 is a 64-bit double; Spark's FloatType is 32-bit and would
        # silently lose precision, so use DoubleType.
        return DoubleType()
    else:
        return StringType()

def define_structure(string, format_type):
    """Build the Spark ``StructField`` describing one pandas column.

    Parameters
    ----------
    string : str
        Column name.
    format_type : numpy.dtype or str
        Column dtype, translated with ``equivalent_type``.

    Returns
    -------
    pyspark.sql.types.StructField
        Field named ``string`` with the mapped type. If the dtype cannot be
        mapped because ``equivalent_type`` raises, the field falls back to
        ``StringType``.
    """
    try:
        typo = equivalent_type(format_type)
    except Exception:
        # Deliberate best-effort fallback, but do not swallow
        # KeyboardInterrupt / SystemExit the way a bare `except:` would.
        typo = StringType()
    return StructField(string, typo)


def pandas_to_spark(pandas_df):
    """Convert a pandas DataFrame into a Spark DataFrame with an explicit schema.

    One ``StructField`` is built per column (name + dtype) via
    ``define_structure``; the resulting ``StructType`` is passed, together with
    the pandas frame, to the module-level ``sqlContext``.
    """
    fields = [
        define_structure(name, dtype)
        for name, dtype in zip(pandas_df.columns, pandas_df.dtypes)
    ]
    return sqlContext.createDataFrame(pandas_df, StructType(fields))

In [16]:
df = pandas_to_spark(X_encoded)

In [17]:
df = df.fillna(0, subset=num_columns)

In [18]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler

In [19]:
assembler = VectorAssembler(
    inputCols=num_columns + cat_columns,
    outputCol="features")

In [20]:
pipeline = Pipeline(stages=[assembler])

In [21]:
fitted_pipeline = pipeline.fit(df)

In [22]:
df = fitted_pipeline \
        .transform(df) \
        .select(F.col('_c0').alias('label'), 'features', 'id') \
        .cache()

In [23]:
%%time
df.count()

CPU times: user 5.94 ms, sys: 245 µs, total: 6.19 ms
Wall time: 26.4 s


3664931

In [24]:
df = df.repartition(6).cache()

In [25]:
%%time
df.count()

CPU times: user 0 ns, sys: 3.4 ms, total: 3.4 ms
Wall time: 14.7 s


3664931

In [26]:
train_df, val_df, test_df = split_by_col(df, 'id', [0.8, 0.1, 0.1])

In [27]:
train_df, val_df = train_df.cache(), val_df.cache()

In [28]:
# Estimator settings that stay fixed for every XGBoost model in this notebook
# (column names, objective/metric, threading); tuned values are merged on top.
static_params = dict(
    featuresCol="features",
    labelCol="label",
    predictionCol="prediction",
    eval_metric='logloss',
    objective='binary:logistic',
    nthread=6,
    silent=0,
    nworkers=1,
)

In [29]:
%%time
# Key under which the untuned reference model is stored in `all_metrics`.
CONTROL_NAME = 'xgb baseline'

# Hand-picked booster settings for the reference (untuned) model.
baseline_params = dict(
    colsample_bytree=0.9,
    eta=0.15,
    gamma=0.9,
    max_depth=6,
    min_child_weight=50.0,
    subsample=0.9,
    num_round=20,
)

# Fixed settings first, baseline values on top, then fit on the training split.
baseline_estimator = XGBoostEstimator(**dict(static_params, **baseline_params))
baseline_model = baseline_estimator.fit(train_df)

CPU times: user 73.1 ms, sys: 8.56 ms, total: 81.7 ms
Wall time: 1min 15s


In [30]:
all_metrics = {}

In [31]:
baseline_test_metrics = {
    'logloss': logloss(baseline_model, test_df, probabilities_col='probabilities'),
    'rocauc': rocauc(baseline_model, test_df, probabilities_col='probabilities')
}

all_metrics[CONTROL_NAME] = baseline_test_metrics

### Hyperopt

In [32]:
!pip3.5 install hyperopt

Collecting hyperopt
  Downloading hyperopt-0.2.3-py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 14.6 MB/s eta 0:00:01
[?25hCollecting networkx==2.2
  Downloading networkx-2.2.zip (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 54.9 MB/s eta 0:00:01
Collecting tqdm
  Downloading tqdm-4.44.1-py2.py3-none-any.whl (60 kB)
[K     |████████████████████████████████| 60 kB 3.8 MB/s  eta 0:00:01
Collecting cloudpickle
  Downloading cloudpickle-1.3.0-py2.py3-none-any.whl (26 kB)
Collecting future
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |████████████████████████████████| 829 kB 56.6 MB/s eta 0:00:01
Installing collected packages: networkx, tqdm, cloudpickle, future, hyperopt
    Running setup.py install for networkx ... [?25ldone
[?25h    Running setup.py install for future ... [?25ldone
[?25hSuccessfully installed cloudpickle-1.3.0 future-0.18.2 hyperopt-0.2.3 networkx-2.2 tqdm-4.44.1


In [79]:
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
import scipy.stats as st

In [81]:
from time import perf_counter

In [86]:
cast_to_int_params = ['num_round', 'max_depth']

def objective(space):
    """Hyperopt objective: fit XGBoost with the sampled parameters and score it on ``val_df``.

    Parameters
    ----------
    space : dict
        Keyword arguments for ``XGBoostEstimator`` as sampled by hyperopt.
        Entries named in ``cast_to_int_params`` are cast to ``int`` on a copy;
        the dict passed in is left untouched.

    Returns
    -------
    dict
        On success ``{'loss': <validation log-loss>, 'rocauc': <validation
        ROC-AUC>, 'status': STATUS_OK}``.
        If every fit attempt raises, ``{'status': STATUS_FAIL, 'error': <last
        error message>}``, so the trial is recorded as failed instead of
        crashing later on a ``None`` model.
    """
    # Local import: only STATUS_OK is imported at notebook level.
    from hyperopt import STATUS_FAIL

    max_fit_attempts = 2
    start_time = perf_counter()

    # Copy before casting so the caller's dict is not mutated.
    params = dict(space)
    for param in cast_to_int_params:
        params[param] = int(params[param])
    print(params)

    estimator = XGBoostEstimator(**params)
    model = None
    last_error = None
    for _ in range(max_fit_attempts):
        try:
            model = estimator.fit(train_df)
            break
        except Exception as e:
            last_error = e
            print(e)
            print('Try again')

    if model is None:
        # Every attempt failed: report the real cause and let hyperopt skip this trial.
        print('All {} fit attempts failed, trial marked as failed\n'.format(max_fit_attempts))
        return {'status': STATUS_FAIL, 'error': str(last_error)}

    log_loss = logloss(model, val_df, probabilities_col='probabilities')
    roc_auc = rocauc(model, val_df, probabilities_col='probabilities')

    print('LOG-LOSS: {}, ROC-AUC: {}, elapsed_time: {}\n'.format(log_loss, roc_auc, perf_counter() - start_time))

    return {'loss': log_loss, 'rocauc': roc_auc, 'status': STATUS_OK}

In [87]:
# Hyper-parameters explored by hyperopt. Each label string equals its key, so
# the dict returned by fmin can be passed back to the estimator as-is.
tuned_space = dict(
    # Boosting schedule
    num_round=hp.quniform('num_round', 10, 100, 10),
    eta=hp.uniform('eta', 0.025, 0.8),

    # Tree complexity
    max_depth=hp.quniform('max_depth', 3, 9, 1),
    min_child_weight=hp.quniform('min_child_weight', 0, 100, 15),

    # Split regularisation
    gamma=hp.loguniform('gamma', -3, 0),

    # Row / column subsampling
    subsample=hp.uniform('subsample', 0.3, 1.),
    colsample_bytree=hp.uniform('colsample_bytree', 0.3, 1.),
)

# The fixed estimator settings are merged in last, as in a `{..., **static_params}` literal.
space = dict(tuned_space, **static_params)

In [88]:
# Run the hyper-parameter search: 30 evaluations of `objective` over `space`,
# proposed by TPE; every result dict returned by `objective` is stored in `trials`.
# `best` holds the raw sampled values (the quantised ones come back as floats,
# e.g. max_depth 7.0), which is why they are cast to int in a later cell.
# NOTE(review): no `rstate` seed is passed, so the search is presumably not
# reproducible between runs -- consider seeding it; the accepted seed type
# depends on the installed hyperopt version, confirm before adding.
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=30,
            trials=trials)

{'subsample': 0.8241716986818315, 'eta': 0.5416518294422941, 'max_depth': 5, 'nthread': 6, 'min_child_weight': 90.0, 'objective': 'binary:logistic', 'num_round': 70, 'featuresCol': 'features', 'nworkers': 1, 'silent': 0, 'labelCol': 'label', 'predictionCol': 'prediction', 'eval_metric': 'logloss', 'colsample_bytree': 0.6842649958524079, 'gamma': 0.24813774864179464}
LOG-LOSS: 0.4689897531185601, ROC-AUC: 0.7834624736571406, elapsed_time: 164.7245990309748

{'subsample': 0.39842035677857357, 'eta': 0.43410000408036803, 'max_depth': 4, 'nthread': 6, 'min_child_weight': 45.0, 'objective': 'binary:logistic', 'num_round': 10, 'featuresCol': 'features', 'nworkers': 1, 'silent': 0, 'labelCol': 'label', 'predictionCol': 'prediction', 'eval_metric': 'logloss', 'colsample_bytree': 0.4571084153944315, 'gamma': 0.32235261370526436}
LOG-LOSS: 0.48237613951997416, ROC-AUC: 0.7683602800055078, elapsed_time: 66.64418106200173

{'subsample': 0.8242792700942874, 'eta': 0.5194181554204429, 'max_depth': 5

In [89]:
best

{'colsample_bytree': 0.6310204096347904,
 'eta': 0.3360421120626716,
 'gamma': 0.1807684385259789,
 'max_depth': 7.0,
 'min_child_weight': 90.0,
 'num_round': 80.0,
 'subsample': 0.5671199406830716}

In [91]:
for param in cast_to_int_params:
    best[param] = int(best[param])

In [92]:
tuned_model = XGBoostEstimator(**{**static_params, **best}).fit(train_df)

In [93]:
tuned_xgb_test_metrics = {
    'logloss': logloss(tuned_model, test_df, probabilities_col='probabilities'),
    'rocauc': rocauc(tuned_model, test_df, probabilities_col='probabilities')
}

all_metrics['tuned_xgb'] = tuned_xgb_test_metrics

In [94]:
get_ate(all_metrics, CONTROL_NAME)

Unnamed: 0,metric,tuned_xgb ate %
0,logloss,-2.649236
1,rocauc,1.435124


In [95]:
all_metrics

{'tuned_xgb': {'logloss': 0.4717134469662689, 'rocauc': 0.7839528973677383},
 'xgb baseline': {'logloss': 0.4845503277394011, 'rocauc': 0.7728613798531613}}

## Kaggle

In [111]:
from pyspark.sql import types as T

In [125]:
TEST_PATH = os.path.join(DATA_PATH, 'test.csv')

In [126]:
kaggle = pd.read_csv(TEST_PATH)

In [127]:
kaggle.head()

Unnamed: 0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,_c10,...,_c31,_c32,_c33,_c34,_c35,_c36,_c37,_c38,_c39,id
0,,19,2.0,4.0,4576.0,6.0,6.0,5.0,15.0,,...,43de85d3,,,b64021bd,,32c7478e,f1a27f66,,,566935904713
1,,1,1.0,,5688.0,,0.0,2.0,10.0,,...,e7e991cb,efa3470f,a458ea53,6ef75f1d,78e2e389,32c7478e,f53ea242,cb079c2d,e9b68fcc,566935904715
2,,445,2.0,2.0,8579.0,26.0,1.0,2.0,26.0,,...,e5f8f18f,,,f3ddd519,ad3062eb,32c7478e,b34f3128,,,566935904727
3,0.0,172,7.0,1.0,2008.0,143.0,24.0,28.0,430.0,0.0,...,eef7297e,,,8ae05402,ad3062eb,423fab69,8d4a9014,,,566935904737
4,,11,4.0,4.0,14.0,,0.0,4.0,6.0,,...,7181ccc8,,,2265e99d,,32c7478e,5dc43b96,,,566935904741


In [128]:
kaggle, id_kaggle = kaggle.drop(columns='id'), kaggle['id']

In [129]:
%%time
kaggle_encoded = encoder.transform(kaggle)

CPU times: user 14.5 s, sys: 2.67 s, total: 17.2 s
Wall time: 17.2 s


In [130]:
kaggle_encoded['id'] = id_kaggle

In [122]:
kaggle_encoded.head()

Unnamed: 0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,_c10,...,_c30,_c31,_c32,_c33,_c34,_c35,_c36,_c37,_c38,_c39
0,,19,2.0,4.0,4576.0,6.0,6.0,5.0,15.0,,...,0.30911,0.287443,0.269864,0.269864,0.256457,0.248842,0.27127,0.246929,0.269864,0.269864
1,,1,1.0,,5688.0,,0.0,2.0,10.0,,...,0.30911,0.307631,0.262105,0.236122,0.256457,0.282588,0.27127,0.242785,0.269719,0.242785
2,,445,2.0,2.0,8579.0,26.0,1.0,2.0,26.0,,...,0.186589,0.423678,0.269864,0.269864,0.365926,0.293339,0.27127,0.265294,0.269864,0.269864
3,0.0,172,7.0,1.0,2008.0,143.0,24.0,28.0,430.0,0.0,...,0.30911,0.486853,0.269864,0.269864,0.426548,0.293339,0.356999,0.486043,0.269864,0.269864
4,,11,4.0,4.0,14.0,,0.0,4.0,6.0,,...,0.30911,0.250631,0.269864,0.269864,0.114223,0.248842,0.27127,0.304967,0.269864,0.269864


In [131]:
kaggle_df = pandas_to_spark(kaggle_encoded)

In [132]:
kaggle_df = kaggle_df.fillna(0, subset=num_columns)

In [133]:
kaggle_df = fitted_pipeline \
        .transform(kaggle_df) \
        .select('features', 'id') \
        .cache()

In [134]:
kaggle_df = kaggle_df.repartition(6).cache()

In [135]:
preds = tuned_model.transform(kaggle_df)

In [136]:
# Copy element 1 of the `probabilities` vector into a plain float column
# (presumably the positive-class probability -- confirm against the model output).
second_component = F.udf(lambda v: float(v[1]), T.FloatType())
preds = preds.withColumn('proba', second_component(F.col('probabilities')))

In [137]:
SUBMISSIONS_FOLDER = os.path.join(DATA_PATH, 'submissions')
os.makedirs(SUBMISSIONS_FOLDER, exist_ok=True)

In [138]:
SUBMISSION_PATH = os.path.join(SUBMISSIONS_FOLDER, 'full_dataset_mte_xgb.csv')

In [139]:
preds.select('id', 'proba').toPandas().to_csv(SUBMISSION_PATH, index=False)

In [140]:
preds.limit(5).select('id', 'proba').toPandas()

Unnamed: 0,id,proba
0,566936179499,0.157406
1,584116096827,0.044797
2,575525985092,0.102863
3,575525778555,0.575226
4,584115725080,0.382264


In [141]:
! kaggle competitions submit mlbd-20-ctr-prediction-1 -f $SUBMISSION_PATH -m "full train, mte, xgb, tuned"

100%|██████████████████████████████████████| 20.9M/20.9M [00:03<00:00, 6.12MB/s]
Successfully submitted to CTR prediction

In [142]:
! kaggle competitions submissions mlbd-20-ctr-prediction-1 -v

fileName,date,description,status,publicScore,privateScore
full_dataset_mte_xgb.csv,2020-03-31 21:58:04,"full train, mte, xgb, tuned",complete,0.77880,None
hyperopt_xgb.csv,2020-03-12 06:31:23,"still 0.5 of train, xgb, but tuned with hyperopt",complete,0.73301,None
pure_xgb.csv,2020-03-12 06:30:03,"still 0.5 of train, xgb, without tuning",complete,0.71998,None
hyperopt_logreg.csv,2020-03-12 06:25:26,"still 0.5 of train, logreg, but tuned with hyperopt",complete,0.69511,None
hyperopt_logreg.csv,2020-03-12 06:20:56,error_sub,error,None,None
submission.csv,2020-02-27 01:26:19,"Random half of train, moreover 0.8 of it, log reg with 50 iterations. Local AUC: ~70",complete,0.69420,None


Если бы в соревновании была другая метрика (например, `logloss`), то могло бы помочь вычитание из предсказаний разности между средним значением предсказаний и средним значением целевой переменной на тренировочных данных, однако на ROC-AUC это никак не повлияет.

In [143]:
def get_mean(df, col):
    """Return the mean of column ``col`` over the whole Spark DataFrame ``df``.

    Runs one global aggregation and returns the single value of its only row.
    """
    mean_row = df.agg(F.mean(col)).first()
    return mean_row[0]

In [144]:
get_mean(preds, 'proba')

0.2720650592369951

In [145]:
get_mean(val_df, 'label')

0.2613037629640948