##  Data Preparation 

In [72]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
df = pd.read_csv('P1M2_Yonathan_Anggraiwan.csv', sep=';')
df = df.drop(columns=['ApplicationDate'])
df.head()

Unnamed: 0,Age,AnnualIncome,CreditScore,EmploymentStatus,EducationLevel,Experience,LoanAmount,LoanDuration,MaritalStatus,NumberOfDependents,...,MonthlyIncome,UtilityBillsPaymentHistory,JobTenure,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,LoanApproved,RiskScore
0,45,39948,617,Employed,Master,22,13152,48,Married,2,...,3329.0,0.724972,11,126928,0.199652,0.22759,419.805992,0.181077,0,49.0
1,38,39709,628,Employed,Associate,15,26045,48,Single,1,...,3309.083333,0.935132,3,43609,0.207045,0.201077,794.054238,0.389852,0,52.0
2,47,40724,570,Employed,Bachelor,26,17627,36,Married,2,...,3393.666667,0.872241,6,5205,0.217627,0.212548,666.406688,0.462157,0,52.0
3,58,69084,545,Employed,High School,34,37898,96,Single,1,...,5757.0,0.896155,5,99452,0.300398,0.300911,1047.50698,0.313098,0,54.0
4,37,103264,594,Employed,Associate,17,9184,36,Married,1,...,8605.333333,0.941369,5,227019,0.197184,0.17599,330.17914,0.07021,1,36.0


In [73]:
# Nettoyage des chaînes (strip)
df['EducationLevel'] = df['EducationLevel'].astype(str).str.strip()

# Mapping ordonné
edu_map = {
    'High School': 1,
    'Associate': 2,
    'Bachelor': 3,
    'Master': 4,
    'Doctorate': 5
}
df['EducationLevel'] = df['EducationLevel'].map(edu_map)

# Vérification
print(df['EducationLevel'].isnull().sum())
df['EducationLevel'].unique()

0


array([4, 2, 3, 1, 5], dtype=int64)

##  Encodage des variables catégorielles

In [74]:
df_encoded = pd.get_dummies(df, columns=[
    'EmploymentStatus',
    'MaritalStatus',
    'HomeOwnershipStatus',
    'LoanPurpose'
], drop_first=True)

# Conversion booléens → entiers
df_encoded = df_encoded.astype(int)
df_encoded.head()

Unnamed: 0,Age,AnnualIncome,CreditScore,EducationLevel,Experience,LoanAmount,LoanDuration,NumberOfDependents,MonthlyDebtPayments,CreditCardUtilizationRate,...,MaritalStatus_Married,MaritalStatus_Single,MaritalStatus_Widowed,HomeOwnershipStatus_Other,HomeOwnershipStatus_Own,HomeOwnershipStatus_Rent,LoanPurpose_Debt Consolidation,LoanPurpose_Education,LoanPurpose_Home,LoanPurpose_Other
0,45,39948,617,4,22,13152,48,2,183,0,...,1,0,0,0,1,0,0,0,1,0
1,38,39709,628,2,15,26045,48,1,496,0,...,0,1,0,0,0,0,1,0,0,0
2,47,40724,570,3,26,17627,36,2,902,0,...,1,0,0,0,0,1,0,1,0,0
3,58,69084,545,1,34,37898,96,1,755,0,...,0,1,0,0,0,0,0,0,1,0
4,37,103264,594,2,17,9184,36,1,274,0,...,1,0,0,0,0,0,1,0,0,0


##  Traitement des outliers par Winsorizing (Capping)

In [75]:


# 1. Séparer les colonnes non numériques AVANT le traitement
colonnes_non_numeriques = df_encoded.select_dtypes(exclude='number')

# 2. Sélectionner les colonnes numériques pour le capping
colonnes_numeriques = df_encoded.select_dtypes(include='number')

# 3. Calcul des bornes pour le capping (IQR)
Q1 = colonnes_numeriques.quantile(0.25)
Q3 = colonnes_numeriques.quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# 4. Appliquer le capping sur les colonnes numériques
colonnes_numeriques_capped = colonnes_numeriques.clip(lower=lower_bound, upper=upper_bound, axis=1)

# 5. Réintégrer les colonnes non numériques dans le DataFrame final
df_final = pd.concat([colonnes_non_numeriques.reset_index(drop=True), colonnes_numeriques_capped.reset_index(drop=True)], axis=1)

# 6. Afficher le résultat
df_final.head()

Unnamed: 0,Age,AnnualIncome,CreditScore,EducationLevel,Experience,LoanAmount,LoanDuration,NumberOfDependents,MonthlyDebtPayments,CreditCardUtilizationRate,...,MaritalStatus_Married,MaritalStatus_Single,MaritalStatus_Widowed,HomeOwnershipStatus_Other,HomeOwnershipStatus_Own,HomeOwnershipStatus_Rent,LoanPurpose_Debt Consolidation,LoanPurpose_Education,LoanPurpose_Home,LoanPurpose_Other
0,45,39948,617.0,4,22,13152,48,2,183,0,...,1,0,0,0,0,0,0,0,1,0
1,38,39709,628.0,2,15,26045,48,1,496,0,...,0,1,0,0,0,0,1,0,0,0
2,47,40724,570.0,3,26,17627,36,2,902,0,...,1,0,0,0,0,1,0,0,0,0
3,58,69084,545.0,1,34,37898,96,1,755,0,...,0,1,0,0,0,0,0,0,1,0
4,37,103264,594.0,2,17,9184,36,1,274,0,...,1,0,0,0,0,0,1,0,0,0


##  Standardisation des variables numériques

In [76]:
features = df_final.drop('LoanApproved', axis=1)
target = df_final['LoanApproved']

numeric_cols = features.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler()
features[numeric_cols] = scaler.fit_transform(features[numeric_cols])
features.head()

Unnamed: 0,Age,AnnualIncome,CreditScore,EducationLevel,Experience,LoanAmount,LoanDuration,NumberOfDependents,MonthlyDebtPayments,CreditCardUtilizationRate,...,MaritalStatus_Married,MaritalStatus_Single,MaritalStatus_Widowed,HomeOwnershipStatus_Other,HomeOwnershipStatus_Own,HomeOwnershipStatus_Rent,LoanPurpose_Debt Consolidation,LoanPurpose_Education,LoanPurpose_Home,LoanPurpose_Other
0,45,39948,0.897666,4,22,13152,48,2,183,0,...,1,0,0,0,0,0,0,0,1,0
1,38,39709,1.116339,2,15,26045,48,1,496,0,...,0,1,0,0,0,0,1,0,0,0
2,47,40724,-0.036666,3,26,17627,36,2,902,0,...,1,0,0,0,0,1,0,0,0,0
3,58,69084,-0.53365,1,34,37898,96,1,755,0,...,0,1,0,0,0,0,0,0,1,0
4,37,103264,0.44044,2,17,9184,36,1,274,0,...,1,0,0,0,0,0,1,0,0,0


##  Séparation Train/Test

In [77]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42, stratify=target)
print('Train shape:', X_train.shape)
print('Test shape:', X_test.shape)

Train shape: (16000, 42)
Test shape: (4000, 42)


In [78]:
print(df_encoded["LoanApproved"].value_counts())


LoanApproved
0    15220
1     4780
Name: count, dtype: int64


In [79]:
from sklearn.model_selection import train_test_split

# Vérifie d’abord que la variable cible existe bien avec les deux classes
print(df_encoded["LoanApproved"].value_counts())

# Séparation X/y
X = df_encoded.drop("LoanApproved", axis=1)
y = df_encoded["LoanApproved"]

# Split stratifié
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Vérification
print("Répartition dans y_train :")
print(y_train.value_counts())


LoanApproved
0    15220
1     4780
Name: count, dtype: int64
Répartition dans y_train :
LoanApproved
0    12176
1     3824
Name: count, dtype: int64


##  Rééquilibrage des classes avec SMOTE

In [80]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Répartition après SMOTE :")
print(y_train_res.value_counts())


Répartition après SMOTE :
LoanApproved
0    12176
1    12176
Name: count, dtype: int64
