# Importing Packages

In [145]:
import pandas as pd
import numpy as np

# Viewing the Data

In [133]:
df = pd.read_csv('~/Desktop/CC_approvals/cc_approvals.csv', header = None)
df.columns= ["Gender", 'Age','Debt','Married','BankCustomer', 'EducationLevel','Ethnicity','YearsEmployed', 'PriorDefault', 'Employed', 'CreditScore', 'DriversLicense', 'Citizen', 'ZipCode', 'Income',  'ApprovalStatus']
print(df.head())
print(df.describe())
print(df.info())
for x in df.columns:
    if df[x].dtype != 'float64'and df[x].dtype != 'int64':
        print(df[x].value_counts())


  Gender    Age   Debt Married BankCustomer EducationLevel Ethnicity  \
0      b  30.83  0.000       u            g              w         v   
1      a  58.67  4.460       u            g              q         h   
2      a  24.50  0.500       u            g              q         h   
3      b  27.83  1.540       u            g              w         v   
4      b  20.17  5.625       u            g              w         v   

   YearsEmployed PriorDefault Employed  CreditScore DriversLicense Citizen  \
0           1.25            t        t            1              f       g   
1           3.04            t        t            6              f       g   
2           1.50            t        f            0              f       g   
3           3.75            t        t            5              t       g   
4           1.71            t        f            0              f       s   

  ZipCode  Income ApprovalStatus  
0   00202       0              +  
1   00043     560           

# Cleanning the Data

In [134]:
df = df.replace({"?": None})
df = df.dropna()
df = df.drop(['Ethnicity','DriversLicense'], axis =1)
df['ApprovalStatus'] = df['ApprovalStatus'].replace({"+":1,"-":0})
df['ApprovalStatus'] = df['ApprovalStatus'].astype('category')
df['ZipCode'] = df['ZipCode'].astype('category')
df['Age'] = df['Age'].astype('float64')


# Preprocessing the Data

In [135]:
## we will be converting all the non-numeric values into numeric ones. 
##We do this because not only it results in a faster computation but also many machine learning models,
##and especially the ones developed using scikit-learn, require the data to be in a strictly numeric format. 
##We will do this by using a technique called label encoding.

## Encoding 

In [136]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Iterate over all the values of each column and extract their dtypes
for col in df.columns:
    if df[col].dtype=='object':
        df_dict = {k:i for i, k in enumerate(df[col].unique(), 0)} 
    # Use LabelEncoder to do the numeric transformation
        df.loc[:,col]=df.loc[:,col].map(df_dict)
        

## Scaling

In [137]:
from sklearn.preprocessing import MinMaxScaler
X,y = df.iloc[:,0:13] , df.iloc[:,13]
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)

In [138]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(rescaledX,
                                y,
                                test_size=0.20,
                                random_state=123)

## Training with Logistic Regression

In [139]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train,y_train)

LogisticRegression()

## Predicting

In [140]:
from sklearn.metrics import confusion_matrix
y_pred = logreg.predict(X_test)

# Evaluating

In [141]:
# Get the accuracy score of logreg model and print it
print("Accuracy of logistic regression classifier: ", logreg.score(X_test,y_test))
print('Our model was pretty good! It was able to yield an accuracy score of almost 88%.')

Accuracy of logistic regression classifier:  0.8854961832061069
Our model was pretty good! It was able to yield an accuracy score of almost 88%.


In [144]:
##For the confusion matrix, 
##the first element of the of the first row of the confusion matrix denotes the true negatives
##meaning the number of negative instances (denied applications) predicted by the model correctly. 
##the last element of the second row of the confusion matrix denotes the true positives
##meaning the number of positive instances (approved applications) predicted by the model correctly.
confusion_matrix(y_test,y_pred)

array([[61, 10],
       [ 5, 55]])

class sklearn.linear_model.LogisticRegression(penalty='l2', *, dual=False, 
tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, 
solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)


https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
    

## Improving the regression parameters

In [143]:
from sklearn.model_selection import GridSearchCV

# Define the grid of values for tol and max_iter
tol = [0.01,0.001,0.0001]
max_iter = [100,150,200]

## tol : Tolerance for stopping criteria.
## mar_iter : Maximum number of iterations taken for the solvers to converge.

# Create a dictionary where tol and max_iter are keys and the lists of their values are corresponding values
param_grid = dict(tol=tol, max_iter=max_iter)

print(param_grid)

# Instantiate GridSearchCV with the required parameters
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

##cv : int, cross-validation generator or an iterable, default=None
##Determines the cross-validation splitting strategy. Possible inputs for cv are:


# Fit data to grid_model
grid_model_result = grid_model.fit(rescaledX, y)

# Summarize results
best_score, best_params = grid_model_result.best_score_ , grid_model_result.best_params_
print("Best: %f using %s" % (best_score, best_params))

{'tol': [0.01, 0.001, 0.0001], 'max_iter': [100, 150, 200]}
Best: 0.859107 using {'max_iter': 100, 'tol': 0.01}


In [None]:
## Traning with 