In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [2]:
import pandas as pd

# Location of the source workbook on the local machine
file_path = r"C:\Users\welcome\Desktop\Customer Credit Risk Prediction project\default of credit card clients.xls"

# Read the sheet, taking the column names from the second spreadsheet row (header=1)
data = pd.read_excel(io=file_path, header=1)

# Preview the first five rows as plain text
print(data.head(n=5))

   ID  LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2  PAY_3  PAY_4  \
0   1      20000    2          2         1   24      2      2     -1     -1   
1   2     120000    2          2         2   26     -1      2      0      0   
2   3      90000    2          2         2   34      0      0      0      0   
3   4      50000    2          2         1   37      0      0      0      0   
4   5      50000    1          2         1   57     -1      0     -1      0   

   ...  Credit_UTL4  Credit_UTL5  Credit_UTL6  PAY_AMT1  PAY_AMT2  PAY_AMT3  \
0  ...     0.000000     0.000000     0.000000         0       689         0   
1  ...     0.027267     0.028792     0.027175         0      1000      1000   
2  ...     0.159233     0.166089     0.172767      1518      1500      1000   
3  ...     0.566280     0.579180     0.590940      2000      2019      1200   
4  ...     0.418800     0.382920     0.382620      2000     36681     10000   

   PAY_AMT4  PAY_AMT5  PAY_AMT6  default payment n

In [3]:

# Report the dataset dimensions as (rows, columns)
print("\nShape of the dataset (rows, columns):")
print(data.shape)

# Summarise column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
print("\nDataset information:")
data.info()


Shape of the dataset (rows, columns):
(30000, 31)

Dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 31 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          30000 non-null  int64  
 1   LIMIT_BAL                   30000 non-null  int64  
 2   SEX                         30000 non-null  int64  
 3   EDUCATION                   30000 non-null  int64  
 4   MARRIAGE                    30000 non-null  int64  
 5   AGE                         30000 non-null  int64  
 6   PAY_0                       30000 non-null  int64  
 7   PAY_2                       30000 non-null  int64  
 8   PAY_3                       30000 non-null  int64  
 9   PAY_4                       30000 non-null  int64  
 10  PAY_5                       30000 non-null  int64  
 11  PAY_6                       30000 non-null  int64  
 12  BILL_AMT1      

In [4]:
# List every column label beneath its heading
print("Column Names:", data.columns, sep="\n")

# Per-column summary statistics beneath their heading
print("\nStatistical Summary:", data.describe(), sep="\n")

Column Names:
Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'Credit_UTL1',
       'Credit_UTL2', 'Credit_UTL3', 'Credit_UTL4', 'Credit_UTL5',
       'Credit_UTL6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4',
       'PAY_AMT5', 'PAY_AMT6', 'default payment next month'],
      dtype='object')

Statistical Summary:
                 ID       LIMIT_BAL           SEX     EDUCATION      MARRIAGE  \
count  30000.000000    30000.000000  30000.000000  30000.000000  30000.000000   
mean   15000.500000   167484.322667      1.603733      1.853133      1.551867   
std     8660.398374   129747.661567      0.489129      0.790349      0.521970   
min        1.000000    10000.000000      1.000000      0.000000      0.000000   
25%     7500.750000    50000.000000      1.000000      1.000000      1.000000   
50%    15000.500000   140000.00

In [5]:
from sklearn.preprocessing import OneHotEncoder

# Columns holding category codes that are expanded into one-hot indicators
categorical_cols = ['SEX', 'EDUCATION', 'MARRIAGE']

# drop='first' omits the first category of each column from the indicators
encoder = OneHotEncoder(drop='first')
# fit_transform returns a sparse matrix here, so densify it for pandas
encoded_data = encoder.fit_transform(data[categorical_cols]).toarray()

# Build the indicator frame on data's own index: pd.concat(axis=1) aligns rows by
# index label, so a mismatched index would pad rows with NaN instead of lining up.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=data.index,
)

# Swap the raw categorical columns for their one-hot indicators.
# NOTE: this rebinds `data`; re-running the cell without reloading the dataset
# fails because the categorical columns have already been dropped.
data = pd.concat([data.drop(columns=categorical_cols), encoded_df], axis=1)

In [6]:
# Preview the first five rows of the current frame as plain text
print(data.head(n=5))

   ID  LIMIT_BAL  AGE  PAY_0  PAY_2  PAY_3  PAY_4  PAY_5  PAY_6  BILL_AMT1  \
0   1      20000   24      2      2     -1     -1     -2     -2       3913   
1   2     120000   26     -1      2      0      0      0      2       2682   
2   3      90000   34      0      0      0      0      0      0      29239   
3   4      50000   37      0      0      0      0      0      0      46990   
4   5      50000   57     -1      0     -1      0      0      0       8617   

   ...  SEX_2  EDUCATION_1  EDUCATION_2  EDUCATION_3  EDUCATION_4  \
0  ...    1.0          0.0          1.0          0.0          0.0   
1  ...    1.0          0.0          1.0          0.0          0.0   
2  ...    1.0          0.0          1.0          0.0          0.0   
3  ...    1.0          0.0          1.0          0.0          0.0   
4  ...    0.0          0.0          1.0          0.0          0.0   

   EDUCATION_5  EDUCATION_6  MARRIAGE_1  MARRIAGE_2  MARRIAGE_3  
0          0.0          0.0         1.0         0.

In [7]:

# Name of the label column
target_column = 'default payment next month'

# Separate the label series from the predictor columns
target = data[target_column]
features = data.drop(target_column, axis=1)
# NOTE(review): `ID` is still part of `features`; it looks like a row identifier —
# confirm whether it should be used as a predictor.

print("Features shape:", features.shape)
print("Target shape:", target.shape)

Features shape: (30000, 37)
Target shape: (30000,)


In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    stratify=target,
    test_size=0.2,
    random_state=42,
)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (24000, 37)
Testing set shape: (6000, 37)


In [9]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only
scaler = StandardScaler().fit(X_train)

# Apply the training-set scaling parameters to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# File path to the dataset
file_path = r"C:\Users\welcome\Desktop\Customer Credit Risk Prediction project\default of credit card clients.xls"

# Load the dataset; skiprows=1 skips the first spreadsheet row so the next row
# supplies the column names
data = pd.read_excel(file_path, skiprows=1)

# Display basic information about the dataset.
# DataFrame.info() prints its own report and returns None, so it is not wrapped
# in print() (that would emit an extra "None" line).
data.info()
print(data.head())

# Define target and features.
# ID is excluded together with the target: it is a running row number
# (1, 2, 3, ... in the preview), not a client attribute, and should not be
# offered to the models as a predictor.
target_column = 'default payment next month'
id_column = 'ID'
features = data.drop(columns=[target_column, id_column])
target = data[target_column]

# Split the data into training and testing sets (80/20, stratified on the target)
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42, stratify=target
)

# Scale the features: fit on the training split only, then reuse those
# parameters for the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define a function to evaluate and print model performance
def evaluate_model(model, X_test, y_test):
    """
    Evaluate a fitted classifier and print its performance on held-out data.

    Prints, in order: the classification report, the ROC-AUC score (four
    decimals), the confusion matrix and a 50-character dashed separator.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict`` and either ``predict_proba`` or
        ``decision_function``.
    X_test : array-like
        Feature rows passed unchanged to the model.
    y_test : array-like
        True labels for ``X_test``.

    Returns
    -------
    None
    """
    # Hard class predictions drive the report and the confusion matrix
    y_pred = model.predict(X_test)

    # ROC-AUC needs a continuous score. Prefer the second predict_proba column
    # (assumes a binary problem with the positive class in column 1); fall back
    # to decision_function for classifiers that expose no probabilities.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        y_score = model.decision_function(X_test)

    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    print(f"ROC-AUC Score: {roc_auc_score(y_test, y_score):.4f}")

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("-" * 50)

# Ratio of negative to positive training labels, passed to XGBoost as scale_pos_weight
neg_to_pos_ratio = (y_train == 0).sum() / (y_train == 1).sum()

# Candidate classifiers keyed by display name, in evaluation order
models = dict([
    ("Logistic Regression", LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)),
    ("Random Forest", RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42)),
    ("XGBoost", XGBClassifier(scale_pos_weight=neg_to_pos_ratio, random_state=42, eval_metric='logloss')),
    ("Support Vector Machine", SVC(probability=True, class_weight='balanced', random_state=42)),
    ("K-Nearest Neighbors", KNeighborsClassifier()),
    ("Naive Bayes", GaussianNB()),
])

# Fit each candidate on the scaled training split, then report its metrics on the scaled test split
for model_name, model in models.items():
    print(f"Training and Evaluating: {model_name}")
    model.fit(X_train_scaled, y_train)
    evaluate_model(model, X_test_scaled, y_test)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 31 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          30000 non-null  int64  
 1   LIMIT_BAL                   30000 non-null  int64  
 2   SEX                         30000 non-null  int64  
 3   EDUCATION                   30000 non-null  int64  
 4   MARRIAGE                    30000 non-null  int64  
 5   AGE                         30000 non-null  int64  
 6   PAY_0                       30000 non-null  int64  
 7   PAY_2                       30000 non-null  int64  
 8   PAY_3                       30000 non-null  int64  
 9   PAY_4                       30000 non-null  int64  
 10  PAY_5                       30000 non-null  int64  
 11  PAY_6                       30000 non-null  int64  
 12  BILL_AMT1                   30000 non-null  int64  
 13  BILL_AMT2                   300