In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

# Load the dataset
cc_apps = pd.read_csv("cc_approvals.csv", header=None) 
cc_apps.head(700)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0,1,2.000,3,4,5,6,7.00,8,9,10,11,12,13
1,b,30.83,0.000,u,g,w,v,1.25,t,t,1,g,0,+
2,a,58.67,4.460,u,g,q,h,3.04,t,t,6,g,560,+
3,a,24.50,0.500,u,g,q,h,1.50,t,f,0,g,824,+
4,b,27.83,1.540,u,g,w,v,3.75,t,t,5,g,3,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
686,b,21.08,10.085,y,p,e,h,1.25,f,f,0,g,0,-
687,a,22.67,0.750,u,g,c,v,2.00,f,t,2,g,394,-
688,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,g,1,-
689,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,g,750,-


In [3]:
# Inspect data
cc_apps_description = cc_apps.describe()
print(cc_apps_description)


               2           7           10             12
count  691.000000  691.000000  691.000000     691.000000
mean     4.754732    2.230318    2.410999    1015.930535
std      4.975661    3.349021    4.868008    5206.466275
min      0.000000    0.000000    0.000000       0.000000
25%      1.000000    0.165000    0.000000       0.000000
50%      2.750000    1.000000    0.000000       5.000000
75%      7.165000    2.667500    3.000000     395.000000
max     28.000000   28.500000   67.000000  100000.000000


In [4]:
#check null values
cc_apps.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 691 entries, 0 to 690
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       691 non-null    object 
 1   1       691 non-null    object 
 2   2       691 non-null    float64
 3   3       691 non-null    object 
 4   4       691 non-null    object 
 5   5       691 non-null    object 
 6   6       691 non-null    object 
 7   7       691 non-null    float64
 8   8       691 non-null    object 
 9   9       691 non-null    object 
 10  10      691 non-null    int64  
 11  11      691 non-null    object 
 12  12      691 non-null    int64  
 13  13      691 non-null    object 
dtypes: float64(2), int64(2), object(10)
memory usage: 75.7+ KB


In [5]:
#one hot encoding
cc_apps = pd.get_dummies(cc_apps)
cc_apps.head(20)


Unnamed: 0,2,7,10,12,0_0,0_?,0_a,0_b,1_1,1_13.75,...,9_9,9_f,9_t,11_11,11_g,11_p,11_s,13_+,13_-,13_13
0,2.0,7.0,10,12,True,False,False,False,True,False,...,True,False,False,True,False,False,False,False,False,True
1,0.0,1.25,1,0,False,False,False,True,False,False,...,False,False,True,False,True,False,False,True,False,False
2,4.46,3.04,6,560,False,False,True,False,False,False,...,False,False,True,False,True,False,False,True,False,False
3,0.5,1.5,0,824,False,False,True,False,False,False,...,False,True,False,False,True,False,False,True,False,False
4,1.54,3.75,5,3,False,False,False,True,False,False,...,False,False,True,False,True,False,False,True,False,False
5,5.625,1.71,0,0,False,False,False,True,False,False,...,False,True,False,False,False,False,True,True,False,False
6,4.0,2.5,0,0,False,False,False,True,False,False,...,False,True,False,False,True,False,False,True,False,False
7,1.04,6.5,0,31285,False,False,False,True,False,False,...,False,True,False,False,True,False,False,True,False,False
8,11.585,0.04,0,1349,False,False,True,False,False,False,...,False,True,False,False,True,False,False,True,False,False
9,0.5,3.96,0,314,False,False,False,True,False,False,...,False,True,False,False,True,False,False,True,False,False


In [6]:
#define target and features
cc_apps = cc_apps.values
X,y = cc_apps[:,0:13] , cc_apps[:,13]


In [11]:
#split data
X_train, X_test, y_train, y_test = train_test_split(X,
                                y,
                                test_size=0.33,
                                random_state=42)
#scaling data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [12]:
#logistic regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)


In [15]:
evaluate = confusion_matrix(y_test, y_pred, labels=[1,0])
print(evaluate)
score = logreg.score(X_test, y_test)
print(score)
accuracy = (evaluate[0,0] + evaluate[1,1]) / (evaluate[0,0] + evaluate[0,1] + evaluate[1,0] + evaluate[1,1])
print(accuracy)

[[  0   0]
 [  0 229]]
1.0
1.0
