In [18]:
%%capture
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings("ignore")
import pyspark.sql.functions as F
import pyspark

### Read dataframe

In [2]:
df_train = pd.read_csv("data/train.csv.zip")
df_test = pd.read_csv("data/test.csv.zip")

id_columns = ['ID']
target_column = ['TARGET']


x_train = df_train.drop(columns=id_columns + target_column).fillna(0)
x_test = df_test[x_train.columns]

### Shuffling some distributions to understand if it will be detected by the data drift detector

In [14]:
x_test['var15']  = x_test['var15'] * 2
x_test['var3']  = x_test['var3'] * 2
x_train['var38']  = x_train['var38']  * 2
x_train['var36']  = x_train['var36']* 2

### Define target adversarial

In [15]:
### Definindo master table
x_train['target'] = 1
x_test['target'] = 0

master = pd.concat([x_train,x_test])

x = master.drop(columns = 'target')
y = master[['target']]

### Import lib

In [16]:
import AdversarialFeatureSelection as Ad

### Apply framework

In [17]:
# Set the specifications
adversarial_model = Ad.AdversarialModel(auc_threshold=0.5,
                                     num_feature_delete_step=1)

# Train the model that will save the variables to be deleted
adversarial_model.fit(x, y)

# Apply to the dataframe
transformed_data = adversarial_model.transform(x_train)

print('-----------------')
print('Number of features before', len(x_train.columns))
print('Number of features after', len(transformed_data.columns))


Initial number of features:  369
First AUC on test: 0.9999
Drift detected
Deleting features:
var3
---------------------
AUC on test: 0.9950
Drift detected
Deleting features:
var3
var15
---------------------
AUC on test: 0.9680
Drift detected
Deleting features:
var3
var15
var38
---------------------
AUC on test: 0.9380
Drift detected
Deleting features:
var3
var15
var38
var36
---------------------
AUC on test: 0.4969
Features to delete:
var3
var15
var38
var36
Optimal number of features: 365
['var3', 'var15', 'var38', 'var36']
-----------------
Number of features before 370
Number of features after 366


### Done
The approach was able to identify the four variables whose distributions were shuffled. This type of approach is very useful for avoiding taking features with high volatility into the training part, reducing the chance of having a model with high variance/overfitting.