# Application My Credit

>> Descriptions détaillées des colonnes données clients bancaires :

1 - âge (numérique)

2 - emploi : type d'emploi (catégorique : "admin.", "inconnu", "chômeur", "gestion", "femme de ménage", "entrepreneur", "étudiant", "col bleu", "indépendant", "retraité", "technicien", "services")

3 - matrimonial : état civil (catégorique : "marié", "divorcé", "célibataire" ; attention : "divorcé" signifie divorcé ou veuf)

4 - éducation (catégorique : "inconnu", "secondaire", "primaire", "tertiaire")

5 - défaut : le crédit est-il en défaut ? (binaire : "oui", "non")

6 - solde : solde annuel moyen, en euros (numérique) 

7 - logement : a-t-il un prêt logement ? (binaire : "oui", "non")

8 - prêt : a-t-il un prêt personnel ? (binaire : "oui", "non")

>> lié au dernier contact de la campagne en cours :

9 - contact : type de communication du contact (catégorique : "inconnu", "téléphone", "cellulaire")

10 - jour : dernier jour de contact du mois (numérique)

11 - mois : dernier mois de contact de l'année (catégorique : "jan" , "feb", "mar", …, "nov", "dec")

12 - durée : durée du dernier contact, en secondes (numérique)

>> autres attributs :

13 - campagne : nombre de contacts effectués pendant cette campagne et pour ce client (numérique, inclut le dernier contact)

14 - pdays : nombre de jours écoulés après que le client a été contacté pour la dernière fois lors d'une campagne précédente (numérique, -1 signifie que le client n'a pas été contacté auparavant)

15 - précédent : nombre de contacts effectués avant cette campagne et pour ce client (numérique)

16 - poutcome : résultat de la campagne marketing précédente (catégorique : "inconnu", "autre", "échec", "succès")

>> Variable de sortie (cible souhaitée) :

17 - y - le client a-t-il obtenu un crédit ? (binaire : "oui", "non")

In [1]:
import pandas as pd

# Both files are semicolon-separated; load them with identical settings.
# NOTE(review): the class counts displayed further down (45211 train rows,
# 4521 test rows) match the UCI "bank-full.csv" / "bank.csv" pair, where the
# small file is a random sample of the large one. Confirm that test.csv is not
# a subset of train.csv, otherwise the test metrics are optimistic.
train, test = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in ('./train.csv', './test.csv')
)
train

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [2]:
# Number of missing values per column of the training frame.
train.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [3]:
# Class balance of the target in the training set.
train.loc[:, 'y'].value_counts()

y
no     39922
yes     5289
Name: count, dtype: int64

In [4]:
# Class balance of the target in the test set.
test.loc[:, 'y'].value_counts()

y
no     4000
yes     521
Name: count, dtype: int64

In [5]:
# Names of every column whose dtype is not numeric (this includes the target).
non_numeric_frame = train.select_dtypes(exclude='number')
non_numerical = non_numeric_frame.columns.tolist()
non_numerical

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome',
 'y']

In [6]:
# Every column except the target 'y' is used as a model input.
features = train.drop(columns=['y']).columns
features

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome'],
      dtype='object')

In [7]:
from sklearn.preprocessing import LabelEncoder

# One LabelEncoder per non-numeric column, keyed by column name.
encoders = {name: LabelEncoder() for name in non_numerical}

# Fit each encoder on the training column and replace it by its integer codes.
for name, encoder in encoders.items():
    train[name] = encoder.fit_transform(train[name])

# Re-use the fitted encoders on the test set (no refitting).
for name in encoders:
    test[name] = encoders[name].transform(test[name])

In [8]:
# Absolute correlation of every column with the encoded target 'y'.
correlation_matrix = train.corr()
correlations = correlation_matrix['y'].abs()
correlations

age          0.025155
job          0.040438
marital      0.045588
education    0.066241
default      0.022419
balance      0.052838
housing      0.139173
loan         0.068185
contact      0.148395
day          0.028348
month        0.024471
duration     0.394521
campaign     0.073172
pdays        0.103621
previous     0.093236
poutcome     0.077840
y            1.000000
Name: y, dtype: float64

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# `correlations` is a 1-D Series, but sns.heatmap needs rectangular (2-D) data
# and fails on a Series. Turn it into a one-row DataFrame (row label 'y', one
# column per variable) so it renders as a wide strip matching the figure size.
correlations_row = correlations.to_frame().T

plt.figure(figsize=(15,5))
sns.heatmap(correlations_row, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Heatmap de Corrélation')
plt.show()

In [9]:
from sklearn.preprocessing import StandardScaler

# One StandardScaler per feature column, keyed by column name.
scalers = {name: StandardScaler() for name in features}

for name, column_scaler in scalers.items():
    # The scaler expects a 2-D array, hence the reshape to a single column.
    train_column = train[name].to_numpy().reshape(-1, 1)
    test_column = test[name].to_numpy().reshape(-1, 1)
    # Fit on the training data only, then apply the same transform to test.
    train[name] = column_scaler.fit_transform(train_column)
    test[name] = column_scaler.transform(test_column)

In [10]:
# Display the training frame after label encoding and standard scaling.
train

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,1.606965,-0.103820,-0.275762,1.036362,-0.13549,0.256419,0.893915,-0.436803,1.514306,-1.298476,0.823773,0.011016,-0.569351,-0.411453,-0.251940,0.444898,0
1,0.288529,1.424008,1.368372,-0.300556,-0.13549,-0.437895,0.893915,-0.436803,1.514306,-1.298476,0.823773,-0.416127,-0.569351,-0.411453,-0.251940,0.444898,0
2,-0.747384,-0.714951,-0.275762,-0.300556,-0.13549,-0.446762,0.893915,2.289359,1.514306,-1.298476,0.823773,-0.707361,-0.569351,-0.411453,-0.251940,0.444898,0
3,0.571051,-1.020516,-0.275762,2.373280,-0.13549,0.047205,0.893915,-0.436803,1.514306,-1.298476,0.823773,-0.645231,-0.569351,-0.411453,-0.251940,0.444898,0
4,-0.747384,2.035139,1.368372,2.373280,-0.13549,-0.447091,-1.118674,-0.436803,1.514306,-1.298476,0.823773,-0.233620,-0.569351,-0.411453,-0.251940,0.444898,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,0.947747,1.424008,-0.275762,1.036362,-0.13549,-0.176460,-1.118674,-0.436803,-0.713012,0.143418,1.156344,2.791329,0.076230,-0.411453,-0.251940,0.444898,1
45207,2.831227,0.201746,-1.919895,-1.637474,-0.13549,0.120447,-1.118674,-0.436803,-0.713012,0.143418,1.156344,0.768224,-0.246560,-0.411453,-0.251940,0.444898,1
45208,2.925401,0.201746,-0.275762,-0.300556,-0.13549,1.429593,-1.118674,-0.436803,-0.713012,0.143418,1.156344,3.373797,0.721811,1.436189,1.050473,-0.566175,1
45209,1.512791,-1.020516,-0.275762,-0.300556,-0.13549,-0.228024,-1.118674,-0.436803,0.400647,0.143418,1.156344,0.970146,0.399020,-0.411453,-0.251940,0.444898,0


In [11]:
# Display the test frame after applying the encoders and scalers fitted on train.
test

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,-1.029906,1.729573,-0.275762,-1.637474,-0.135490,0.139496,-1.118674,-0.436803,-0.713012,0.383734,1.488915,-0.695712,-0.569351,-0.411453,-0.251940,0.444898,0
1,-0.747384,0.812877,-0.275762,-0.300556,-0.135490,1.125461,0.893915,2.289359,-0.713012,-0.577529,0.823773,-0.148192,-0.569351,2.984213,1.484611,-2.588321,0
2,-0.559037,-0.103820,1.368372,1.036362,-0.135490,-0.004031,0.893915,-0.436803,-0.713012,0.023260,-1.836794,-0.284101,-0.569351,2.894327,0.182198,-2.588321,0
3,-1.029906,-0.103820,-0.275762,1.036362,-0.135490,0.037352,0.893915,2.289359,1.514306,-1.538792,0.158632,-0.229737,0.399020,-0.411453,-0.251940,0.444898,0
4,1.701139,-1.020516,-0.275762,-0.300556,-0.135490,-0.447419,0.893915,-0.436803,1.514306,-1.298476,0.823773,-0.124893,-0.569351,-0.411453,-0.251940,0.444898,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,-0.747384,0.812877,-0.275762,-0.300556,-0.135490,-0.556789,0.893915,-0.436803,-0.713012,1.705471,-0.173939,0.275068,0.721811,-0.411453,-0.251940,0.444898,0
4517,1.512791,0.507311,-0.275762,1.036362,7.380625,-1.535528,0.893915,2.289359,1.514306,-0.817845,0.823773,-0.408361,-0.569351,-0.411453,-0.251940,0.444898,0
4518,1.512791,1.424008,-0.275762,-0.300556,-0.135490,-0.350531,-1.118674,-0.436803,-0.713012,0.383734,-1.504223,-0.416127,2.658552,-0.411453,-0.251940,0.444898,0
4519,-1.218254,-1.020516,-0.275762,-0.300556,-0.135490,-0.073987,-1.118674,-0.436803,-0.713012,-1.178318,-0.839081,-0.501556,0.399020,1.705844,1.050473,-1.577248,0


In [12]:
# Separate the model inputs from the target column 'y' for both sets.
X_train, y_train = train.drop(columns='y'), train['y']
X_test, y_test = test.drop(columns='y'), test['y']

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Hyper-parameter grid: 3 forest sizes x 4 depth limits (None = unlimited).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
)

# Fixed random_state so repeated runs build the same forests.
rf_classifier = RandomForestClassifier(random_state=42)

# 5-fold cross-validated search, candidates ranked by accuracy.
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)

In [14]:
# Run the cross-validated search over the whole grid on the training data.
grid_search.fit(X=X_train, y=y_train)

In [15]:
# Keep the estimator that the grid search exposes as its best candidate.
rf_model = grid_search.best_estimator_

In [16]:
def show_metrics(y_pred, y_test):
    """Print accuracy, confusion matrix and classification report.

    Args:
        y_pred: Predicted labels.
        y_test: Ground-truth labels; passed as the first argument to each
            metric function, with ``y_pred`` second.

    Returns:
        None. The three metrics are written to standard output.
    """
    # Compute everything first so nothing is printed if a metric call fails.
    acc = accuracy_score(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(
        f"Accuracy : {acc}",
        f"Confusion Matrix :\n{matrix}",
        f"Classification Report :\n{report}",
        sep="\n",
    )

In [17]:
# Predict on the held-out frame and print the evaluation summary.
y_pred_rf = rf_model.predict(X_test)
show_metrics(y_pred=y_pred_rf, y_test=y_test)

Accuracy : 0.9316522893165229
Confusion Matrix :
[[3966   34]
 [ 275  246]]
Classification Report :
              precision    recall  f1-score   support

           0       0.94      0.99      0.96      4000
           1       0.88      0.47      0.61       521

    accuracy                           0.93      4521
   macro avg       0.91      0.73      0.79      4521
weighted avg       0.93      0.93      0.92      4521



In [19]:
import joblib

# Persist the selected model together with the fitted per-column encoders and
# scalers (presumably so the same preprocessing can be replayed at inference
# time; confirm against the consuming application).
joblib.dump(rf_model, './model')
joblib.dump(encoders, './encoders')
# Last expression of the cell: its return value is what the notebook displays.
joblib.dump(scalers, './scalers')

['./scalers']