In [1]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
credit_card_data = pd.read_csv(r"./UCI_Credit_Card.csv")

credit_card_data

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,220000.0,1,3,1,39,0,0,0,0,...,88004.0,31237.0,15980.0,8500.0,20000.0,5003.0,3047.0,5000.0,1000.0,0
29996,29997,150000.0,1,3,2,43,-1,-1,-1,-1,...,8979.0,5190.0,0.0,1837.0,3526.0,8998.0,129.0,0.0,0.0,0
29997,29998,30000.0,1,2,2,37,4,3,2,-1,...,20878.0,20582.0,19357.0,0.0,0.0,22000.0,4200.0,2000.0,3100.0,1
29998,29999,80000.0,1,3,1,41,1,-1,0,0,...,52774.0,11855.0,48944.0,85900.0,3409.0,1178.0,1926.0,52964.0,1804.0,1


# Data Preparation 

In [3]:
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          30000 non-null  int64  
 1   LIMIT_BAL                   30000 non-null  float64
 2   SEX                         30000 non-null  int64  
 3   EDUCATION                   30000 non-null  int64  
 4   MARRIAGE                    30000 non-null  int64  
 5   AGE                         30000 non-null  int64  
 6   PAY_0                       30000 non-null  int64  
 7   PAY_2                       30000 non-null  int64  
 8   PAY_3                       30000 non-null  int64  
 9   PAY_4                       30000 non-null  int64  
 10  PAY_5                       30000 non-null  int64  
 11  PAY_6                       30000 non-null  int64  
 12  BILL_AMT1                   30000 non-null  float64
 13  BILL_AMT2                   300

In [4]:
credit_card_data.isnull().sum()


ID                            0
LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default.payment.next.month    0
dtype: int64

In [5]:
credit_card_data['default.payment.next.month'].value_counts()

default.payment.next.month
0    23364
1     6636
Name: count, dtype: int64

In [6]:
legit = credit_card_data[credit_card_data['default.payment.next.month'] == 0]
fraud = credit_card_data[credit_card_data['default.payment.next.month'] == 1]


In [7]:
print(legit.shape)
print(fraud.shape)

(23364, 25)
(6636, 25)


In [8]:
legit.PAY_AMT6.describe()

count     23364.000000
mean       5719.371769
std       18792.950473
min           0.000000
25%         300.000000
50%        1706.000000
75%        4545.000000
max      528666.000000
Name: PAY_AMT6, dtype: float64

In [9]:
fraud.PAY_AMT6.describe()

count      6636.000000
mean       3441.482068
std       13464.005894
min           0.000000
25%           0.000000
50%        1000.000000
75%        2974.500000
max      345293.000000
Name: PAY_AMT6, dtype: float64

In [10]:
credit_card_data.groupby('default.payment.next.month').mean()

Unnamed: 0_level_0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
default.payment.next.month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,15064.893982,178099.726074,1.61415,1.841337,1.558637,35.417266,-0.211222,-0.301917,-0.316256,-0.355633,...,47533.365605,43611.165254,40530.445343,39042.268704,6307.337357,6640.465074,5753.496833,5300.529319,5248.220296,5719.371769
1,14773.781947,130109.65642,1.567058,1.894665,1.528029,35.725738,0.668174,0.458258,0.362116,0.254521,...,45181.598855,42036.950573,39540.190476,38271.435503,3397.044153,3388.649638,3367.351567,3155.626733,3219.139542,3441.482068


In [11]:
print(credit_card_data.columns)

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default.payment.next.month'],
      dtype='object')


In [12]:
credit_card_data.drop('MARRIAGE', axis=1, inplace=True)

In [13]:
print(credit_card_data.columns)


Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3',
       'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3',
       'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2',
       'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default.payment.next.month'],
      dtype='object')


In [14]:
credit_card_data.groupby('default.payment.next.month').mean()

Unnamed: 0_level_0,ID,LIMIT_BAL,SEX,EDUCATION,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
default.payment.next.month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,15064.893982,178099.726074,1.61415,1.841337,35.417266,-0.211222,-0.301917,-0.316256,-0.355633,-0.389488,...,47533.365605,43611.165254,40530.445343,39042.268704,6307.337357,6640.465074,5753.496833,5300.529319,5248.220296,5719.371769
1,14773.781947,130109.65642,1.567058,1.894665,35.725738,0.668174,0.458258,0.362116,0.254521,0.167872,...,45181.598855,42036.950573,39540.190476,38271.435503,3397.044153,3388.649638,3367.351567,3155.626733,3219.139542,3441.482068


In [15]:
legit_sample = legit.sample(n=6636)

In [16]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [17]:
new_dataset.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
15948,15949,120000.0,2,2,2,30,0,0,0,0,...,118053.0,119047.0,119641.0,5555.0,5847.0,5800.0,4445.0,4700.0,4500.0,0
9741,9742,60000.0,1,2,1,42,0,0,0,0,...,25488.0,26199.0,28200.0,2000.0,2000.0,1000.0,1501.0,3000.0,1500.0,0
29480,29481,20000.0,1,2,1,32,0,0,0,2,...,20511.0,20316.0,20474.0,1700.0,4000.0,0.0,800.0,1000.0,800.0,0
13009,13010,110000.0,2,2,1,49,-2,-2,-2,-2,...,50994.0,40652.0,5785.0,5085.0,2044.0,2026.0,2097.0,1011.0,48179.0,0
27197,27198,150000.0,1,3,2,45,-1,-1,-1,-1,...,671.0,37082.0,34725.0,6051.0,5901.0,671.0,37082.0,5000.0,5000.0,0


In [18]:
new_dataset.drop('MARRIAGE', axis=1, inplace=True)
new_dataset.drop('ID', axis=1, inplace=True)
new_dataset.drop('EDUCATION', axis=1, inplace=True)
new_dataset.drop('SEX', axis=1, inplace=True)

In [19]:
new_dataset

Unnamed: 0,LIMIT_BAL,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
15948,120000.0,30,0,0,0,0,0,0,110612.0,114323.0,...,118053.0,119047.0,119641.0,5555.0,5847.0,5800.0,4445.0,4700.0,4500.0,0
9741,60000.0,42,0,0,0,0,0,0,49244.0,48310.0,...,25488.0,26199.0,28200.0,2000.0,2000.0,1000.0,1501.0,3000.0,1500.0,0
29480,20000.0,32,0,0,0,2,0,0,16354.0,17776.0,...,20511.0,20316.0,20474.0,1700.0,4000.0,0.0,800.0,1000.0,800.0,0
13009,110000.0,49,-2,-2,-2,-2,-2,-2,115672.0,109892.0,...,50994.0,40652.0,5785.0,5085.0,2044.0,2026.0,2097.0,1011.0,48179.0,0
27197,150000.0,45,-1,-1,-1,-1,-1,0,6129.0,6051.0,...,671.0,37082.0,34725.0,6051.0,5901.0,671.0,37082.0,5000.0,5000.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29991,210000.0,34,3,2,2,2,2,2,2500.0,2500.0,...,2500.0,2500.0,2500.0,0.0,0.0,0.0,0.0,0.0,0.0,1
29994,80000.0,34,2,2,2,2,2,2,72557.0,77708.0,...,77519.0,82607.0,81158.0,7000.0,3500.0,0.0,7000.0,0.0,4000.0,1
29997,30000.0,37,4,3,2,-1,0,0,3565.0,3356.0,...,20878.0,20582.0,19357.0,0.0,0.0,22000.0,4200.0,2000.0,3100.0,1
29998,80000.0,41,1,-1,0,0,0,-1,-1645.0,78379.0,...,52774.0,11855.0,48944.0,85900.0,3409.0,1178.0,1926.0,52964.0,1804.0,1


In [20]:
new_dataset['default.payment.next.month'].value_counts()

default.payment.next.month
0    6636
1    6636
Name: count, dtype: int64

In [21]:
new_dataset.groupby('default.payment.next.month').mean()

Unnamed: 0_level_0,LIMIT_BAL,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
default.payment.next.month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,176262.808921,35.479204,-0.218656,-0.30425,-0.31736,-0.361664,-0.391652,-0.400693,52144.540989,49556.249548,47229.41305,43465.28255,40206.846745,38633.481615,6100.022453,6266.587402,5835.364376,4990.723629,5233.928571,5908.833484
1,130109.65642,35.725738,0.668174,0.458258,0.362116,0.254521,0.167872,0.112116,48509.162297,47283.617842,45181.598855,42036.950573,39540.190476,38271.435503,3397.044153,3388.649638,3367.351567,3155.626733,3219.139542,3441.482068


In [22]:
x = new_dataset.drop(columns = 'default.payment.next.month', axis = 1)
y = new_dataset['default.payment.next.month']

In [23]:
print(x)

       LIMIT_BAL  AGE  PAY_0  PAY_2  PAY_3  PAY_4  PAY_5  PAY_6  BILL_AMT1  \
15948   120000.0   30      0      0      0      0      0      0   110612.0   
9741     60000.0   42      0      0      0      0      0      0    49244.0   
29480    20000.0   32      0      0      0      2      0      0    16354.0   
13009   110000.0   49     -2     -2     -2     -2     -2     -2   115672.0   
27197   150000.0   45     -1     -1     -1     -1     -1      0     6129.0   
...          ...  ...    ...    ...    ...    ...    ...    ...        ...   
29991   210000.0   34      3      2      2      2      2      2     2500.0   
29994    80000.0   34      2      2      2      2      2      2    72557.0   
29997    30000.0   37      4      3      2     -1      0      0     3565.0   
29998    80000.0   41      1     -1      0      0      0     -1    -1645.0   
29999    50000.0   46      0      0      0      0      0      0    47929.0   

       BILL_AMT2  BILL_AMT3  BILL_AMT4  BILL_AMT5  BILL_AMT6  P

In [24]:
print(y)

15948    0
9741     0
29480    0
13009    0
27197    0
        ..
29991    1
29994    1
29997    1
29998    1
29999    1
Name: default.payment.next.month, Length: 13272, dtype: int64


# test_train_split

In [25]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y,random_state = 2)

In [26]:
print(x.shape, x_train.shape, x_test.shape)

(13272, 20) (10617, 20) (2655, 20)


# Model Training

In [27]:

model = LogisticRegression()

In [28]:
model.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Model evaluation

In [29]:
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(x_train_prediction, y_train)

In [30]:

print(training_data_accuracy)

0.6136385042855798


In [31]:
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(x_test_prediction, y_test)

In [32]:
print(test_data_accuracy) 

0.6135593220338983
