In [6]:
from ucimlrepo import fetch_ucirepo 
# Download the dataset registered under id 350 in the UCI ML repository.
default_of_credit_card_clients = fetch_ucirepo(id=350)

# Pull the feature frame and the target frame out of the fetched payload
# (both are exposed as pandas dataframes).
dataset_frames = default_of_credit_card_clients.data
X, y = dataset_frames.features, dataset_frames.targets

# Print the dataset-level metadata first, then the per-variable information.
for info_section in (
    default_of_credit_card_clients.metadata,
    default_of_credit_card_clients.variables,
):
    print(info_section)



{'uci_id': 350, 'name': 'Default of Credit Card Clients', 'repository_url': 'https://archive.ics.uci.edu/dataset/350/default+of+credit+card+clients', 'data_url': 'https://archive.ics.uci.edu/static/public/350/data.csv', 'abstract': "This research aimed at the case of customers' default payments in Taiwan and compares the predictive accuracy of probability of default among six data mining methods.", 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 30000, 'num_features': 23, 'feature_types': ['Integer', 'Real'], 'demographics': ['Sex', 'Education Level', 'Marital Status', 'Age'], 'target_col': ['Y'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2009, 'last_updated': 'Fri Mar 29 2024', 'dataset_doi': '10.24432/C55S3H', 'creators': ['I-Cheng Yeh'], 'intro_paper': {'ID': 365, 'type': 'NATIVE', 'title': 'The comparisons of data mining techniques for the predictive accuracy of 

In [8]:
import pandas as pd
# Features and target side by side in a single frame.
df = pd.concat([X, y], axis=1)

# Number of distinct values per feature column, in the original column order.
distinct_counts = {col: X[col].nunique() for col in X.columns}

# Columns with at most 10 distinct values are treated as categorical,
# everything else as numerical.
# NOTE(review): this cut-off is a heuristic; columns sharing the same value
# range can land on different sides of it (the recorded output shows X10/X11
# as categorical while X6-X9, also spanning -2..8, end up numerical) -- confirm
# this is intended.
categorical = [col for col, n_distinct in distinct_counts.items() if n_distinct <= 10]
numerical = [col for col, n_distinct in distinct_counts.items() if n_distinct > 10]

print('Categorical:', categorical)
print('Numerical:', numerical)

Categorical: ['X2', 'X3', 'X4', 'X10', 'X11']
Numerical: ['X1', 'X5', 'X6', 'X7', 'X8', 'X9', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21', 'X22', 'X23']


In [10]:
# Frequency table for every column classified as categorical.
for cat_col in categorical:
    level_counts = X[cat_col].value_counts()
    print(level_counts)

# Summary statistics of the numerical columns; left as the last expression so
# the notebook renders the resulting table.
X.loc[:, numerical].describe()

X2
2    18112
1    11888
Name: count, dtype: int64
X3
2    14030
1    10585
3     4917
5      280
4      123
6       51
0       14
Name: count, dtype: int64
X4
2    15964
1    13659
3      323
0       54
Name: count, dtype: int64
X10
 0    16947
-1     5539
-2     4546
 2     2626
 3      178
 4       84
 7       58
 5       17
 6        4
 8        1
Name: count, dtype: int64
X11
 0    16286
-1     5740
-2     4895
 2     2766
 3      184
 4       49
 7       46
 6       19
 5       13
 8        2
Name: count, dtype: int64


Unnamed: 0,X1,X5,X6,X7,X8,X9,X12,X13,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,167484.322667,35.4855,-0.0167,-0.133767,-0.1662,-0.220667,51223.3309,49179.075167,47013.15,43262.948967,40311.400967,38871.7604,5663.5805,5921.163,5225.6815,4826.076867,4799.387633,5215.502567
std,129747.661567,9.217904,1.123802,1.197186,1.196868,1.169139,73635.860576,71173.768783,69349.39,64332.856134,60797.15577,59554.107537,16563.280354,23040.87,17606.96147,15666.159744,15278.305679,17777.465775
min,10000.0,21.0,-2.0,-2.0,-2.0,-2.0,-165580.0,-69777.0,-157264.0,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,50000.0,28.0,-1.0,-1.0,-1.0,-1.0,3558.75,2984.75,2666.25,2326.75,1763.0,1256.0,1000.0,833.0,390.0,296.0,252.5,117.75
50%,140000.0,34.0,0.0,0.0,0.0,0.0,22381.5,21200.0,20088.5,19052.0,18104.5,17071.0,2100.0,2009.0,1800.0,1500.0,1500.0,1500.0
75%,240000.0,41.0,0.0,0.0,0.0,0.0,67091.0,64006.25,60164.75,54506.0,50190.5,49198.25,5006.0,5000.0,4505.0,4013.25,4031.5,4000.0
max,1000000.0,79.0,8.0,8.0,8.0,8.0,964511.0,983931.0,1664089.0,891586.0,927171.0,961664.0,873552.0,1684259.0,896040.0,621000.0,426529.0,528666.0


In [13]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns, dropping the first level of each
# feature, and return a dense array instead of a sparse matrix.
# NOTE(review): the encoder is fitted on all rows of X, i.e. before the
# train/test split done later in the notebook -- consider fitting on the
# training rows only.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_values = encoder.fit_transform(X[categorical])

# Wrap the array in a DataFrame labelled with the generated dummy-column names.
dummy_names = encoder.get_feature_names_out(categorical)
X_cat_encoded = pd.DataFrame(encoded_values, columns=dummy_names)


In [14]:
from sklearn.preprocessing import StandardScaler
# Standardise the numerical columns with a StandardScaler.
# NOTE(review): the scaler is fitted on all rows of X, i.e. before the
# train/test split done later in the notebook -- consider fitting on the
# training rows only.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X[numerical])

# Put the scaled array back into a DataFrame under the original column names.
X_num_scaled = pd.DataFrame(scaled_values, columns=numerical)

In [15]:
# Assemble the model input: scaled numerical columns first, then the one-hot
# encoded categorical columns, joined column-wise.
feature_blocks = [X_num_scaled, X_cat_encoded]
X_final = pd.concat(feature_blocks, axis="columns")

In [18]:
from sklearn.model_selection import train_test_split 
# Hold out 20% of the rows for evaluation; random_state=42 fixes the shuffle so
# the split is reproducible across runs.
# NOTE(review): the split is not stratified, while the recorded classification
# report shows imbalanced classes (4687 vs 1313 test rows) -- consider whether
# stratify=y is wanted.
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)

In [19]:
from sklearn.linear_model import LogisticRegression 
# Logistic regression with the iteration cap raised to 1000.
model = LogisticRegression(max_iter=1000)

# Flatten the training targets into a 1-D array before passing them to fit().
train_labels = y_train.to_numpy().ravel()
model.fit(X_train, train_labels)

In [20]:
# Predict with the fitted model on the held-out test features; the result is
# compared against y_test in the evaluation cell.
y_pred = model.predict(X_test)

In [21]:
from sklearn.metrics import classification_report, confusion_matrix

# Compare the held-out targets against the model's predictions.
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

# Confusion matrix first, then the per-class precision/recall/F1 report.
print(test_confusion)
print(test_report)

[[4525  162]
 [ 996  317]]
              precision    recall  f1-score   support

           0       0.82      0.97      0.89      4687
           1       0.66      0.24      0.35      1313

    accuracy                           0.81      6000
   macro avg       0.74      0.60      0.62      6000
weighted avg       0.79      0.81      0.77      6000

