In [1]:
# Switch the kernel's working directory to src/; the relative '../input/...' path used when loading the data is resolved from there.
# NOTE(review): the path is relative, so re-running this cell in the same kernel tries to enter src/src.
%cd src

/Users/kristjan.roosild/OneDrive/kool/AutoML/projekt/automl/src


In [2]:
# Read the labelled training split and separate the labels from the features
import pandas as pd

train = pd.read_csv('../input/santander-customer-transaction-prediction/train.csv')
# Take the labels out before the column is removed from the frame
y = train['target'].values

# Keep only the model inputs: discard the row identifier and the label column
train = train.drop(columns=['ID_code', 'target'])
# Plain float matrix of the remaining columns
X = train.to_numpy().astype(float)

In [3]:
# Display the shapes of the feature matrix and the label vector to check that the row counts match.
X.shape, y.shape

((200000, 200), (200000,))

In [None]:
# Fit a regressor on the target rather than a classifier: its predict() output is
# used further down as the score for the ROC curve and the AUC value
from autofeat import AutoFeatRegressor

afreg = AutoFeatRegressor(feateng_steps=1, verbose=1)
df = afreg.fit_transform(X, y)


In [None]:
# Shape of the feature table returned by afreg.fit_transform(X, y).
df.shape

In [None]:
# Peek at the first 10 rows of the fit_transform output.
df[:10]

In [None]:
# Persist the fitted regressor so the same engineered features can be recreated later
import pickle

with open('autofeat_regressor.pickle', 'wb') as model_file:
    pickle.dump(afreg, model_file)

In [None]:
# Round-trip check: restore the pickled regressor and recompute the features with it
import pickle

with open('autofeat_regressor.pickle', 'rb') as model_file:
    afreg_loaded = pickle.load(model_file)

df_loaded = afreg_loaded.transform(X)

In [None]:
# Sanity check: shape of the features produced by the unpickled model (to be compared with df.shape).
df_loaded.shape

In [None]:
# First 10 rows produced by the unpickled model, for eyeballing against df[:10].
df_loaded[:10]

In [None]:
# Element-wise comparison of the first 10 rows of the original features and the
# features recomputed by the unpickled model. Both sides are sliced before the
# comparison so only 10 rows are compared, instead of building a boolean result
# for the whole table and then discarding all but its first 10 rows.
df[:10] == df_loaded[:10]

In [None]:
# Score X with the unpickled regressor; these predictions are the scores for the ROC/AUC evaluation below.
# NOTE(review): X is the same matrix the model was fitted on, so this measures training fit only.
y_pred = afreg_loaded.predict(X)


In [None]:
from matplotlib import pyplot as plt
def draw_roc_curve(fpr, tpr, roc_score, algo_name):
    """Plot a ROC curve together with the diagonal reference line and show it.

    Args:
        fpr: False positive rates (x values of the curve).
        tpr: True positive rates (y values of the curve).
        roc_score: Area under the curve; printed in the legend with 4 decimals.
        algo_name: Text used as the figure title.
    """
    line_width = 2
    _, ax = plt.subplots()

    ax.plot(
        fpr,
        tpr,
        color="darkorange",
        lw=line_width,
        label=f"ROC curve for training data (area = {roc_score:0.4f})",
    )
    # Dashed diagonal from (0, 0) to (1, 1) as a visual baseline
    ax.plot([0, 1], [0, 1], color="navy", lw=line_width, linestyle="--")

    # Clamp both axes to the unit interval
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.0])

    ax.set_title(algo_name)
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")

    ax.legend(loc="lower right")
    plt.show()

In [None]:
# ROC points and AUC of the regressor's scores against the labels, then the plot
from sklearn import metrics

fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred)
roc_score = metrics.roc_auc_score(y_true=y, y_score=y_pred)
draw_roc_curve(fpr=fpr, tpr=tpr, roc_score=roc_score, algo_name='AutoFeatRegressor')

In [None]:
# Also fit the classifier variant: behind the scenes autofeat sets the class weights
# to balanced, so this should be able to give a better result.
# LogisticRegressionCV reference:
#   https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html?highlight=logisticregressioncv#sklearn.linear_model.LogisticRegressionCV
# Where autofeat sets it:
#   https://github.com/cod3licious/autofeat/blob/d08dd474919236c3a78087700051da31645d1e7a/autofeat/autofeat.py#L346
from autofeat import AutoFeatClassifier

afclass = AutoFeatClassifier(feateng_steps=1, verbose=1)
df_class = afclass.fit_transform(X, y)

[AutoFeat] The 1 step feature engineering process could generate up to 1400 features.
[AutoFeat] With 200000 data points this new feature matrix would use about 1.12 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 839 transformed features from 200 original features - done.
[feateng] Generated altogether 839 new features in 1 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 409 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5


In [None]:
# Persist the fitted classifier so the same engineered features can be recreated later
import pickle

with open('autofeat_class.pickle', 'wb') as model_file:
    pickle.dump(afclass, model_file)

In [None]:
# Use a continuous score instead of hard class labels for the ROC evaluation:
# roc_curve / roc_auc_score rank samples by score, and hard labels collapse the
# curve to a single operating point, which understates the AUC.
if hasattr(afclass, "predict_proba"):
    # Column 1 is the probability of the second entry of the model's classes
    # (the positive class when the labels are 0/1 — TODO confirm for this target).
    y_pred = afclass.predict_proba(X)[:, 1]
else:
    # autofeat versions without predict_proba: keep the previous behaviour,
    # but make the limitation visible instead of silently plotting it.
    import warnings

    warnings.warn(
        "AutoFeatClassifier has no predict_proba; falling back to hard class "
        "labels, so the ROC curve/AUC below will be degenerate."
    )
    y_pred = afclass.predict(X)


In [None]:
# ROC points and AUC of the classifier's predictions against the labels, then the plot
from sklearn import metrics

fpr, tpr, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred)
roc_score = metrics.roc_auc_score(y_true=y, y_score=y_pred)
draw_roc_curve(fpr=fpr, tpr=tpr, roc_score=roc_score, algo_name='AutoFeatClassifier')