In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
import optuna
import gc

from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.metrics import mean_squared_error, mean_absolute_error


# Loading Data

In [2]:
train = pd.read_csv("../input/tabular-playground-series-may-2022/train.csv")
test = pd.read_csv("../input/tabular-playground-series-may-2022/test.csv")

train.shape, test.shape

((900000, 33), (700000, 32))

# Understanding Data

- Describe

In [3]:
train.describe()

Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,...,f_21,f_22,f_23,f_24,f_25,f_26,f_28,f_29,f_30,target
count,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,...,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0,900000.0
mean,449999.5,-0.000286,0.001165,0.001174,-0.001368,-0.000571,0.000284,-0.000709,2.03146,2.057998,...,-0.156307,-0.009273,-0.369459,-0.342738,0.176549,0.357591,-0.380876,0.345661,1.002654,0.486488
std,259807.765473,0.998888,0.999193,1.000514,1.000175,1.000167,0.999875,0.999942,1.656172,1.590955,...,2.484706,2.450797,2.453405,2.386941,2.416959,2.47602,238.773054,0.475584,0.818989,0.499818
min,0.0,-4.599856,-4.682199,-4.642676,-4.658816,-4.748501,-4.750214,-4.842919,0.0,0.0,...,-13.310146,-11.85353,-12.301097,-11.416189,-11.918306,-14.300577,-1229.753052,0.0,0.0,0.0
25%,224999.75,-0.67549,-0.675162,-0.674369,-0.676114,-0.675909,-0.673437,-0.674876,1.0,1.0,...,-1.820063,-1.645585,-2.019739,-1.955956,-1.440424,-1.261598,-159.427418,0.0,0.0,0.0
50%,449999.5,0.001144,0.002014,0.002218,-0.002227,-0.001662,-0.000438,-0.001492,2.0,2.0,...,-0.152668,0.03085,-0.390966,-0.340746,0.160912,0.404212,-0.519808,0.0,1.0,0.0
75%,674999.25,0.674337,0.675021,0.677505,0.672544,0.673789,0.675028,0.674749,3.0,3.0,...,1.507071,1.661676,1.255408,1.266673,1.795928,2.028219,158.987357,1.0,2.0,1.0
max,899999.0,4.749301,4.815699,4.961982,4.45492,4.948983,4.971881,4.822668,15.0,16.0,...,14.455426,11.34408,12.2471,12.389844,12.529179,12.913041,1229.562577,1.0,2.0,1.0


In [4]:
test.describe()

Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,...,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_28,f_29,f_30
count,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,...,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0
mean,1250000.0,-8.7e-05,-0.000732,-0.000385,0.000705,0.0008,-0.000468,0.00255,2.030819,2.056923,...,-0.173972,-0.153938,-0.005623,-0.371983,-0.340683,0.174245,0.356288,0.604213,0.34609,1.003081
std,202072.7,1.000264,0.997131,1.000317,1.001758,0.999088,1.000249,0.999536,1.655909,1.591726,...,2.397938,2.484198,2.450907,2.452619,2.388938,2.417555,2.477713,238.888993,0.475723,0.819673
min,900000.0,-4.658018,-4.922718,-4.457561,-4.567419,-4.675301,-5.141356,-4.782164,0.0,0.0,...,-11.08991,-12.186778,-11.700814,-12.104478,-11.838417,-13.312784,-13.462486,-1204.243716,0.0,0.0
25%,1075000.0,-0.675708,-0.67404,-0.675961,-0.674561,-0.674696,-0.674444,-0.671481,1.0,1.0,...,-1.794757,-1.816739,-1.641035,-2.021342,-1.948923,-1.443815,-1.263991,-158.660917,0.0,0.0
50%,1250000.0,-0.000395,-0.000241,-0.000723,-0.000294,0.000276,-0.001674,0.002623,2.0,2.0,...,-0.184945,-0.1491,0.033726,-0.396281,-0.339431,0.165229,0.399981,0.671386,0.0,1.0
75%,1424999.0,0.675788,0.671415,0.673105,0.676376,0.674542,0.674146,0.67659,3.0,3.0,...,1.447103,1.508052,1.664146,1.251361,1.269879,1.794429,2.025163,159.789006,1.0,2.0
max,1599999.0,5.76195,4.768073,4.599902,4.899904,4.494312,4.701004,4.673145,16.0,13.0,...,10.691366,11.999494,10.991597,11.366652,10.236546,11.29434,12.811976,1141.957328,1.0,2.0


- Infos

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900000 entries, 0 to 899999
Data columns (total 33 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   id      900000 non-null  int64  
 1   f_00    900000 non-null  float64
 2   f_01    900000 non-null  float64
 3   f_02    900000 non-null  float64
 4   f_03    900000 non-null  float64
 5   f_04    900000 non-null  float64
 6   f_05    900000 non-null  float64
 7   f_06    900000 non-null  float64
 8   f_07    900000 non-null  int64  
 9   f_08    900000 non-null  int64  
 10  f_09    900000 non-null  int64  
 11  f_10    900000 non-null  int64  
 12  f_11    900000 non-null  int64  
 13  f_12    900000 non-null  int64  
 14  f_13    900000 non-null  int64  
 15  f_14    900000 non-null  int64  
 16  f_15    900000 non-null  int64  
 17  f_16    900000 non-null  int64  
 18  f_17    900000 non-null  int64  
 19  f_18    900000 non-null  int64  
 20  f_19    900000 non-null  float64
 21  f_20    90

We have :
- 16 float64 columns
- 16 int64 columns
- 1 object columns

In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700000 entries, 0 to 699999
Data columns (total 32 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   id      700000 non-null  int64  
 1   f_00    700000 non-null  float64
 2   f_01    700000 non-null  float64
 3   f_02    700000 non-null  float64
 4   f_03    700000 non-null  float64
 5   f_04    700000 non-null  float64
 6   f_05    700000 non-null  float64
 7   f_06    700000 non-null  float64
 8   f_07    700000 non-null  int64  
 9   f_08    700000 non-null  int64  
 10  f_09    700000 non-null  int64  
 11  f_10    700000 non-null  int64  
 12  f_11    700000 non-null  int64  
 13  f_12    700000 non-null  int64  
 14  f_13    700000 non-null  int64  
 15  f_14    700000 non-null  int64  
 16  f_15    700000 non-null  int64  
 17  f_16    700000 non-null  int64  
 18  f_17    700000 non-null  int64  
 19  f_18    700000 non-null  int64  
 20  f_19    700000 non-null  float64
 21  f_20    70

- Check missing columns

In [7]:
train.isna().sum().any()

False

In [8]:
test.isna().sum().any()

False

# Feature Engineering (from @ambrosm)

Nous remercions @ambrosm pour le travail fait et partagé sur cette section dans son [carnet](https://www.kaggle.com/code/ambrosm/tpsmay22-keras-quickstart#Feature-engineering).

"*We read the data and apply minimal feature engineering: We only split the f_27 string into ten separate features as described in the [EDA](https://www.kaggle.com/code/ambrosm/tpsmay22-eda-which-makes-sense), and we count the unique characters in the string.*"

In [9]:
for df in [train, test]:
    for i in range(10):
        df[f'ch{i}'] = df.f_27.str.get(i).apply(ord) - ord('A')
    # Next feature is from https://www.kaggle.com/code/cabaxiom/tps-may-22-eda-lgbm-model
    df["unique_characters"] = df.f_27.apply(lambda s: len(set(s)))
features = [f for f in test.columns if f != 'id' and f != 'f_27']
test[features].head(2)

Unnamed: 0,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,...,ch1,ch2,ch3,ch4,ch5,ch6,ch7,ch8,ch9,unique_characters
0,0.442517,0.17438,-0.999816,0.762741,0.186778,-1.074775,0.501888,6,6,0,...,0,0,0,1,0,3,11,0,2,5
1,-0.605598,-0.305715,0.627667,-0.578898,-1.750931,1.35555,-0.190911,1,3,4,...,5,0,1,1,0,4,6,2,1,6


# Modeling

### Installing EvalML

In [10]:
!python3 -m pip install -q evalml==0.28.0

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
beatrix-jupyterlab 3.1.7 requires google-cloud-bigquery-storage, which is not installed.
tfx-bsl 1.7.0 requires pyarrow<6,>=1, but you have pyarrow 7.0.0 which is incompatible.
tfx-bsl 1.7.0 requires tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3,>=1.15.5, but you have tensorflow 2.6.3 which is incompatible.
tensorflow-transform 1.7.0 requires pyarrow<6,>=1, but you have pyarrow 7.0.0 which is incompatible.
tensorflow-transform 1.7.0 requires tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<2.9,>=1.15.5, but you have tensorflow 2.6.3 which is incompatible.
preprocessing 0.1.13 requires nltk==3.2.4, but you have nltk 3.7 which is incompatible.
pdpbox 0.2.1 requires matplotlib==3.1.1, but you have matplotlib 3.5.1 which is incompatible.
mxnet 1.9.0

In [11]:
from evalml.automl import AutoMLSearch

### Split Data

In [12]:
y = train["target"]
X = train.drop(columns=["id","f_27", "target"])

test = test.drop(columns=["id","f_27"])

X.shape, test.shape

((900000, 41), (700000, 41))

In [13]:
from sklearn.model_selection import train_test_split
_, X_test, _, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Training Model

In [14]:
## run model
model_evalml = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_time=3600)
model_evalml.search()

Generating pipelines to search over...


8 pipelines ready for search.

*****************************
* Beginning pipeline search *
*****************************

Optimizing for Log Loss Binary. 
Lower score is better.

Using SequentialEngine to train and score pipelines.
Will stop searching for new pipelines after 3600 seconds.

Allowed model families: random_forest, decision_tree, xgboost, linear_model, catboost, extra_trees, lightgbm



FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Best Score',
              'type'…

Evaluating Baseline Pipeline: Mode Baseline Binary Classification Pipeline
Mode Baseline Binary Classification Pipeline:
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 16.802

*****************************
* Evaluating Batch Number 1 *
*****************************

Elastic Net Classifier w/ Imputer + Standard Scaler:
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.587
Decision Tree Classifier w/ Imputer:
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.640
Random Forest Classifier w/ Imputer:
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.606
LightGBM Classifier w/ Imputer:
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.318
Logistic Regression Classifier w/ Imputer + Standard Scaler:
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.587
XGBoost Classifier w/ Imputer:
	Starting cross validation
	Fin

In [15]:
## check leaderboard
model_evalml.rankings

Unnamed: 0,id,pipeline_name,search_order,mean_cv_score,standard_deviation_cv_score,validation_score,percent_better_than_baseline,high_variance_cv,parameters
0,54,XGBoost Classifier w/ Imputer,54,0.175443,,0.175443,98.955846,False,{'Imputer': {'categorical_impute_strategy': 'm...
2,44,CatBoost Classifier w/ Imputer,44,0.22895,,0.22895,98.637397,False,{'Imputer': {'categorical_impute_strategy': 'm...
7,13,LightGBM Classifier w/ Imputer,13,0.272268,,0.272268,98.379587,False,{'Imputer': {'categorical_impute_strategy': 'm...
18,21,Elastic Net Classifier w/ Imputer + Standard S...,21,0.58718,,0.58718,96.505376,False,{'Imputer': {'categorical_impute_strategy': 'm...
22,26,Logistic Regression Classifier w/ Imputer + St...,26,0.58718,,0.58718,96.505375,False,{'Imputer': {'categorical_impute_strategy': 'm...
32,3,Random Forest Classifier w/ Imputer,3,0.605539,,0.605539,96.396113,False,{'Imputer': {'categorical_impute_strategy': 'm...
36,37,Extra Trees Classifier w/ Imputer,37,0.624855,,0.624855,96.281153,False,{'Imputer': {'categorical_impute_strategy': 'm...
40,2,Decision Tree Classifier w/ Imputer,2,0.640156,,0.640156,96.190086,False,{'Imputer': {'categorical_impute_strategy': 'm...
55,0,Mode Baseline Binary Classification Pipeline,0,16.802373,,16.802373,0.0,False,{'Baseline Classifier': {'strategy': 'mode'}}


In [16]:
pred = model_evalml.best_pipeline.predict_proba(X_test)[True]

mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred)

print("MAE : %.7f" % mae)
print("MSE : %.7f" % mse)
print("Roc : %.7f" % roc_auc_score(y_test, pred))

MAE : 0.0950312
MSE : 0.0333516
Roc : 0.9933357


In [17]:
pred

0         0.109090
1         0.963090
2         0.075195
3         0.029337
4         0.972238
            ...   
179995    0.020792
179996    0.990287
179997    0.982249
179998    0.290683
179999    0.993511
Name: 1, Length: 180000, dtype: float64

### Free Memory

In [18]:
del train
del X
del y
del X_test
del y_test

gc.collect()

81

# Test

In [19]:
## generate predictions
y_predict = model_evalml.best_pipeline.predict_proba(test)[True]

#pred = automl.predict(test)
len(y_predict)

700000

In [20]:
y_predict

0         0.957320
1         0.989653
2         0.000062
3         0.022352
4         0.976022
            ...   
699995    0.523001
699996    0.992446
699997    0.306104
699998    0.010068
699999    0.000011
Name: 1, Length: 700000, dtype: float64

# Submission

In [21]:
submission = pd.read_csv("../input/tabular-playground-series-may-2022/sample_submission.csv")
submission.shape

(700000, 2)

In [22]:
!pip install pandas -U
import pandas as pd

Collecting pandas
  Downloading pandas-1.3.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 1.2.4
    Uninstalling pandas-1.2.4:
      Successfully uninstalled pandas-1.2.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
beatrix-jupyterlab 3.1.7 requires google-cloud-bigquery-storage, which is not installed.
woodwork 0.4.2 requires pandas<1.2.5,>=1.2.0, but you have pandas 1.3.5 which is incompatible.
tfx-bsl 1.7.0 requires pyarrow<6,>=1, but you have pyarrow 7.0.0 which is incompatible.
tfx-bsl 1.7.0 requires tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3,>=1.15.5, bu

In [23]:
submission['target'] = y_predict # np.array(y_predict).mean(axis=0)
submission.to_csv('submission.csv', index=False)
submission

Unnamed: 0,id,target
0,900000,0.957320
1,900001,0.989653
2,900002,0.000062
3,900003,0.022352
4,900004,0.976022
...,...,...
699995,1599995,0.523001
699996,1599996,0.992446
699997,1599997,0.306104
699998,1599998,0.010068
