# Import

In [22]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set()

from sklearn import metrics, preprocessing
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer

# from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import (
    MinMaxScaler,
    RobustScaler,
    StandardScaler,
    FunctionTransformer,
)
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV, train_test_split

In [24]:
# load cleaned data
app_train = pd.read_pickle("../data/processed/app_train_cleaned.pkl")
app_test = pd.read_pickle(
    "../data/processed/app_test_cleaned.pkl"
)  # used for datadrift only

In [3]:
app_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 249 entries, SK_ID_CURR to FE_PAYMENT_RATE
dtypes: bool(135), float64(71), int64(43)
memory usage: 307.0 MB


# Split train/test
To keep some test data not seen by gridsearch

In [25]:
X_train, X_test, y_train, y_test = train_test_split(
    app_train.drop("TARGET", axis=1), app_train.TARGET, test_size=0.15
)

## Naive undersampler to fix target imbalance

In [26]:
def NaiveUnderSampler(df):
    # find the minority class
    min_class = df.TARGET.value_counts().idxmin()
    # keep same number of majority class than minority
    target_minority_count = df[df.TARGET == min_class].shape[0]
    df_balanced = pd.concat(
        [
            df[df.TARGET == min_class].sample(target_minority_count),
            df[df.TARGET != min_class].sample(target_minority_count),
        ]
    )
    # shuffle rows
    return df_balanced.sample(frac=1)

In [27]:
# create transformer with fit methods from my function
NaiveUnderSamplerTf = FunctionTransformer(NaiveUnderSampler)

In [28]:
# test naive imbalance fixer
app_train.pipe(NaiveUnderSampler).TARGET.value_counts()

1    24825
0    24825
Name: TARGET, dtype: int64

# Pipeline

In [29]:
# separate target
# X, y = app_train.drop("TARGET", axis=1), app_train.TARGET
# print(X.shape)
# print(y.shape)

In [30]:
samplerList = [NaiveUnderSamplerTf, RandomUnderSampler()]
pipe = Pipeline(
    [
        ("sampler", RandomUnderSampler()),
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", MinMaxScaler()),
        ("classifier", LogisticRegression(max_iter=500)),
    ]
)
pipe

In [31]:
grid = GridSearchCV(pipe, param_grid={}, cv=3, n_jobs=-1, verbose=3, scoring="roc_auc")
grid.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
pd.DataFrame(grid.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,23.84216,1.011094,7.030889,0.697363,{},0.744817,0.740656,0.745193,0.743555,0.002056,1


In [33]:
y_pred = grid.predict(X_test)

In [34]:
y_pred.shape

(46127,)

In [20]:
X_train.shape

(307511, 243)

# Modeling

## Logistic Regression

In [14]:
# # Make the model with the specified regularization parameter
# log_reg = LogisticRegression(max_iter=500)
# dummy = DummyClassifier()

# # Train on the training data
# log_reg.fit(X_train, y_train)
# # dummy.fit(X_train, y_train)

In [None]:
# y_pred = log_reg.predict(X_test)

In [None]:
# y_pred

# Scoring metric

In [35]:
y_pred.mean()

0.3432046306935201

In [36]:
metrics.accuracy_score(y_test, y_pred)

0.6870379604136406

In [None]:
[fpr, tpr, thr] = metrics.roc_curve(y_test, y_pred)
plt.plot(fpr, tpr, color="coral", lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("1 - specificite", fontsize=14)
plt.ylabel("Sensibilite", fontsize=14)

# indice du premier seuil pour lequel
# la sensibilité est supérieure à 0.95
idx = np.min(np.where(tpr > 0.95))

print("Sensibilité : {:.2f}".format(tpr[idx]))
print("Spécificité : {:.2f}".format(1 - fpr[idx]))
print("Seuil : {:.2f}".format(thr[idx]))

# Profits and Loss function

In [100]:
INTEREST = 0.015


def pnl(y_pred, y_true):
    r = pd.concat([pd.Series(y_pred, index=X_test.index), y_test], axis=1).rename(
        columns={0: "pred", "TARGET": "true"}
    )
    credit = app_train.loc[X_test.index, "AMT_CREDIT"]
    print(r.shape)
    print(credit.shape)
    r["pnl"] = np.nan
    # TP : pret refusé correctement = pas de perte
    r["pnl"] = np.where((r.pred == 1) & (r.true == 1), 0, r.pnl)
    # FP : pret refusé par erreur : perte des interets
    r["pnl"] = np.where(
        (r.pred == 1) & (r.true == 0),
        credit * -1 * INTEREST,
        r.pnl,
    )
    # TN : pret accordé correctement : gain des interets
    r["pnl"] = np.where(
        (r.pred == 0) & (r.true == 0),
        credit * INTEREST,
        r.pnl,
    )
    # FN : pret accordé par erreur : perte du montant du pret et des interets
    r["pnl"] = np.where(
        (r.pred == 0) & (r.true == 1),
        credit * -1 * (1 + INTEREST),
        r.pnl,
    )
    return r.pnl  # .sum()

(46127, 2)
(46127,)


273660         0.00
256492      6750.00
4110        3375.00
156093      2362.50
98261       5970.24
            ...    
178857      3597.75
116442     11812.50
106726   -739386.90
188140     15103.80
303409      4691.52
Name: pnl, Length: 46127, dtype: float64

In [97]:
pnl(y_pred, y_test)

ValueError: operands could not be broadcast together with shapes (46127,) (261384,) (46127,) 

Essai d'estimer intérêts et durée du prêt en fonction du montant total et de l'annuity : non concluant

In [33]:
app_train[["AMT_CREDIT", "AMT_ANNUITY", "AMT_GOODS_PRICE"]].sample(10)

Unnamed: 0,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE
118835,810000.0,28827.0,810000.0
304723,270000.0,14647.5,270000.0
236024,749349.0,24304.5,625500.0
156584,597024.0,38286.0,540000.0
254212,540000.0,29295.0,540000.0
46099,1288350.0,37800.0,1125000.0
121198,904500.0,29178.0,904500.0
190959,1080000.0,38394.0,1080000.0
177385,202500.0,7762.5,202500.0
115880,657702.0,19359.0,549000.0


In [44]:
(app_train.AMT_ANNUITY / app_train.AMT_CREDIT).describe()

# app_train.AMT_CREDIT * 1.0665 / app_train.AMT_ANNUITY

count    307499.000000
mean          0.053695
std           0.022481
min           0.022073
25%           0.036900
50%           0.050000
75%           0.064043
max           0.124430
dtype: float64

# Shap values

# Data Drift
Pour simuler évolution des commportements utilisateurs dans le temps, on fait l'hypothèse que `app_train` représente les données d'entrainement et `app_test` les données de test.  
On applique le même nettoyage à train et test

In [None]:
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset

data_drift_report = Report(
    metrics=[
        DataDriftPreset(),
    ]
)

data_drift_report.run(
    current_data=app_test,
    reference_data=app_train.drop("TARGET", axis=1),
    column_mapping=None,
)
data_drift_report

In [None]:
data_drift_report.save_html("../reports/data_drift_report.html")