# Project: Creditworthiness

In [None]:
# Load Packages
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import metrics

import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Notebook-wide default figure size; individual plots below override it with figsize=.
# (Optional theme, currently disabled: plt.style.use('seaborn-whitegrid'))
plt.rcParams.update({'figure.figsize': [13, 13]})

## Step 1: Business and Data Understanding

In [None]:
# Read both workbooks: past applications (used for training) and the new
# customers that will be scored at the end of the notebook.
past_applications, new_customers = (
    pd.read_excel(workbook)
    for workbook in ('credit-data-training.xlsx', 'customers-to-score.xlsx')
)


In [None]:
# Preview the first five rows of the past applications
past_applications.head(n=5)

### Key Decisions:

* What decisions need to be made?
  * I need to evaluate the creditworthiness of the new 500 loan applicants.

* What data is needed to inform those decisions?
  * I need past loan applicants' information on credit application results, together with the data used to rate those results, such as duration of credit, credit amount, instalment, age of the applicant, etc.

* What kind of model (Continuous, Binary, Non-Binary, Time-Series) do we need to use to help make these decisions?
  * The model type will be Binary as I will be predicting an applicant to be either creditworthy or non-creditworthy.


## Step 2: Building the Training Set

### Guidelines:
* For numerical data fields, are there any fields that highly-correlate with each other? The correlation should be at least .70 to be considered “high”.
* Are there any missing data for each of the data fields? Fields with a lot of missing data should be removed
* Are there only a few values in a subset of your data field? Does the data field look very uniform (there is only one value for the entire field?). This is called “low variability” and you should remove fields that have low variability. Refer to the "Tips" section to find examples of data fields with low-variability.
* Your clean data set should have 13 columns where the Average of Age Years should be 36 (rounded up)


In [None]:
# Names of the columns to leave out of the cleaned data set (filled in further below)
columns_to_drop = []

# Per-column dtype and non-null count
past_applications.info()

In [None]:
# Number of missing values in each column
print('Counting Missing Values')
past_applications.isna().sum()

In [None]:
# Data visualization: one panel per column on a 4x5 grid.
# Object (text) columns get a bar chart of value counts, all others a histogram.
fig, axes = plt.subplots(nrows=4, ncols=5, figsize=(23, 23))
x = list(past_applications.columns)

for position, name in enumerate(past_applications.columns):
    grid_row, grid_col = divmod(position, 5)
    panel = axes[grid_row][grid_col]
    series = past_applications[name]
    if series.dtype == np.dtype('O'):
        drawn = series.value_counts().plot(kind='bar', rot=0, ax=panel)
    else:
        drawn = series.hist(ax=panel)
    drawn.set_title(name)


In [None]:
# Columns excluded from the training set, with the reason for each one
columns_to_drop.extend([
    'Duration-in-Current-address',  # many missing values
    'Concurrent-Credits',           # low variability
    'Occupation',                   # low variability
    'Guarantors',                   # low variability
    'Telephone',                    # low variability
    'No-of-dependents',             # low variability
    'Foreign-Worker',               # low variability
])

clean_data = past_applications.drop(columns=columns_to_drop)

# Summary (dtype / non-null count) of the columns that were removed
past_applications[columns_to_drop].info()

In [None]:
# Visualization of the removed columns on a 2x4 grid.
# Object (text) columns get a bar chart of value counts, all others a histogram.
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(15, 9))

for position, name in enumerate(columns_to_drop):
    grid_row, grid_col = divmod(position, 4)
    panel = axes[grid_row][grid_col]
    series = past_applications[name]
    if series.dtype == np.dtype('O'):
        drawn = series.value_counts().plot(kind='bar', rot=0, ax=panel)
    else:
        drawn = series.hist(ax=panel)
    drawn.set_title(name)

# To export the figure: fig.savefig('droped_variables_graph.png')

In [None]:
# Impute missing values with each numeric column's median.
# numeric_only=True is spelled out because the frame still contains text
# columns at this point: pandas >= 2.0 raises a TypeError when asked for the
# median of those, while older versions silently skipped them.
clean_data = clean_data.fillna(clean_data.median(numeric_only=True))
clean_data.describe().round(0)

In [None]:
# Pairwise correlation between the numeric fields.
# The text columns are filtered out explicitly: pandas >= 2.0 no longer drops
# them silently inside corr() and raises instead. select_dtypes is used rather
# than corr(numeric_only=True) so the cell also runs on older pandas versions.
clean_data.select_dtypes(include=['number', 'bool']).corr().round(2)

### Answer this question:

* In your cleanup process, which fields did you remove or impute? Please justify why you removed or imputed these fields. Visualizations are encouraged.
  * The imputed field is Age-years: there are 12 applicants with missing age data. I cannot remove these applicants, as I would lose 2.4% of the data, so I fill the missing values with the median age of 33.
  * I will remove all fields with low variability to remove bias in my model. The removed fields are:
    - Duration in a current address
    - Concurrent credits
    - Occupation
    - Guarantors
    - Telephone
    - No of dependents
    - Foreign worker

## Step 3: Train your Classification Models

First, create your Estimation and Validation samples where 70% of your dataset should go to Estimation and 30% of your entire dataset should be reserved for Validation. Set the Random Seed to 1.

Create all of the following models: Logistic Regression, Decision Tree, Forest Model, Boosted Model

In [None]:
# Encode the target as binary: 1 = Creditworthy, 0 = Non-Creditworthy
target_column = 'Credit-Application-Result'
target_label = ['Creditworthy', 'Non-Creditworthy'] # list(clean_data[target_column].unique())
target_encoding = {'Creditworthy': 1, 'Non-Creditworthy': 0}
# Assign the encoded column back instead of calling replace(..., inplace=True)
# on clean_data[target_column]: that chained in-place call operates on a
# temporary under pandas Copy-on-Write and leaves clean_data unchanged.
# Values that are not in the mapping (e.g. already-encoded 0/1 when the cell
# is re-run) are passed through untouched, then the column is cast to int.
clean_data[target_column] = (
    clean_data[target_column]
    .map(lambda label: target_encoding.get(label, label))
    .astype(int)
)

# Categorical columns (one-hot encoded by the transformer)
# alternative: clean_data.select_dtypes(include='O')
categorical_columns = [
    'Account-Balance',
    'Payment-Status-of-Previous-Credit',
    'Purpose',
    'Value-Savings-Stocks',
    'Length-of-current-employment',
    'No-of-Credits-at-this-Bank',
]

# Numerical columns (standard-scaled by the transformer)
# alternative: clean_data.select_dtypes(exclude='O')
numerical_columns = [
    'Duration-of-Credit-Month',
    'Credit-Amount',
    'Instalment-per-cent',
    'Most-valuable-available-asset',
    'Age-years',
    'Type-of-apartment',
]

# Split a cleaned dataframe into predictors and (optional) target
def get_predictors_target(df):
    """Return ``(predictors, target)`` taken from *df*.

    ``predictors`` holds the numerical columns followed by the categorical
    columns; ``target`` is the target column, or ``None`` when *df* does not
    contain it.
    """
    predictors = df[[*numerical_columns, *categorical_columns]]
    target = df[target_column] if target_column in df.columns else None
    return predictors, target

# Preprocessing shared by all models: standard-scale the numerical columns,
# one-hot encode the categorical ones (first level of each dropped), and
# discard every column that is not listed.
transformer = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), numerical_columns),
        ('encoder', OneHotEncoder(drop='first'), categorical_columns),
    ],
    remainder='drop',
)

# Training data: predictors X and binary target y
X, y = get_predictors_target(clean_data)

# Fit the scaler/encoder so that the output feature columns are defined.
# NOTE(review): the fit uses the full data set, before the split, so the
# scaler statistics include the validation rows. Left unchanged because the
# encoder then knows every category present in either split; confirm whether
# fitting on X_train only is preferred.
transformer.fit(X)

# 70% estimation / 30% validation split with random seed 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=1)

# Feature names after scaling and encoding, in transformer output order:
# the numerical columns first, then the one-hot encoded categorical levels.
fitted_encoder = transformer.named_transformers_['encoder']
# OneHotEncoder.get_feature_names was deprecated in scikit-learn 1.0 and
# removed in 1.2; use its replacement when available and fall back otherwise.
if hasattr(fitted_encoder, 'get_feature_names_out'):
    encoded_names = fitted_encoder.get_feature_names_out(categorical_columns)
else:
    encoded_names = fitted_encoder.get_feature_names(categorical_columns)
feature_names = numerical_columns + list(encoded_names)

print('Features Name:', feature_names)

In [None]:
# Logistic Regression Model - statsmodels
# Transform each split once and reuse the result.
X_train_features = transformer.transform(X_train)
X_test_features = transformer.transform(X_test)

# NOTE(review): no intercept column is added to the design matrix
# (no sm.add_constant) - confirm this is intended.
sm_logReg = sm.Logit(y_train, X_train_features).fit(maxiter=1000)
# Round the predicted probabilities to hard 0/1 labels
y_hat = list(map(round, sm_logReg.predict(X_test_features)))

# labels=[0, 1] pins the row/column order to the display labels used below
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_hat, labels=[0, 1])
print(sm_logReg.summary2(xname=feature_names))

print('Confusion Matrix')
_, ax = plt.subplots(figsize=(6,5))
# target_label is reversed so that the labels line up with classes 0 and 1
fig_ = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_label[::-1])
fig_.plot(ax=ax)
plt.show()

# These scores are computed on the held-out validation split (X_test / y_test),
# so the label says "Validation" rather than "Training".
print('\nAccuracy on Validation Data: ', metrics.accuracy_score(y_test, y_hat), '\n')
print('Report\n', metrics.classification_report(y_true=y_test, y_pred=y_hat, target_names=target_label[::-1]))

### Decision Tree Model - GridSearch

### Find best parameters
`parameters = {'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random'], 'max_depth': range(1,20)}`

`decision_tree_grid = GridSearchCV(DecisionTreeClassifier(), param_grid=parameters, scoring='precision', cv=10, n_jobs=-1)`

`decision_tree_grid.fit(transformer.transform(X_train), y_train)`

`decision_tree_grid.best_params_`

- Best params: {'criterion': 'gini', 'max_depth': 13, 'splitter': 'random'}

In [None]:
# Decision Tree Model (hyper-parameters taken from the grid search above)
# Transform each split once and reuse the result.
X_train_features = transformer.transform(X_train)
X_test_features = transformer.transform(X_test)

# random_state=1 makes the run reproducible; with splitter='random' an
# unseeded tree differs on every execution.
decision_tree_model = DecisionTreeClassifier(criterion='gini', max_depth=13, splitter='random', random_state=1)
decision_tree_model.fit(X_train_features, y_train)

# Feature importances
feature_importances_dtm = pd.Series(decision_tree_model.feature_importances_, index=feature_names).sort_values()
print('\nFeatures Impotances Plot')
feature_importances_dtm.plot(kind='barh', figsize=(7,6))
plt.show()

# Validation-set predictions, reused for the matrix, accuracy and report
y_pred_dtm = decision_tree_model.predict(X_test_features)

print('\nConfusion Matrix')
_, ax = plt.subplots(figsize=(6,5))
# ConfusionMatrixDisplay replaces metrics.plot_confusion_matrix, which was
# deprecated in scikit-learn 1.0 and removed in 1.2. labels=[0, 1] pins the
# row/column order to the (reversed) display labels.
cm_dtm = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_dtm, labels=[0, 1])
metrics.ConfusionMatrixDisplay(confusion_matrix=cm_dtm, display_labels=target_label[::-1]).plot(ax=ax)
plt.show()

# These scores are computed on the held-out validation split (X_test / y_test),
# so the label says "Validation" rather than "Training".
print('\nAccuracy on Validation Data: ', metrics.accuracy_score(y_test, y_pred_dtm), '\n')
print('Report\n', metrics.classification_report(y_test, y_pred_dtm, target_names=target_label[::-1]))


### Forest Model - GridSearch

### Find best parameters
`parameters = {'n_estimators': range(50, 1100, 25),'criterion': ['gini', 'entropy'], 'max_depth': range(1,20)}`

`random_forest_grid = GridSearchCV(RandomForestClassifier(), param_grid=parameters, scoring='precision', cv=10, n_jobs=-1)`

`random_forest_grid.fit(transformer.transform(X_train), y_train)`

`random_forest_grid.best_params_`

- Best params: {'criterion': 'gini', 'max_depth': 16, 'n_estimators': 100}


In [None]:
# Forest Model (hyper-parameters taken from the grid search above)
# Transform each split once and reuse the result.
X_train_features = transformer.transform(X_train)
X_test_features = transformer.transform(X_test)

# random_state=1 makes the run reproducible; an unseeded forest differs on
# every execution.
random_forest_model = RandomForestClassifier(criterion='gini', max_depth=16, n_estimators=100, random_state=1)
random_forest_model.fit(X_train_features, y_train)

# Feature importances
feature_importances_rfm = pd.Series(random_forest_model.feature_importances_, index=feature_names).sort_values()
print('\nFeatures Impotances Plot')
feature_importances_rfm.plot(kind='barh', figsize=(7,6))
plt.show()

# Validation-set predictions, reused for the matrix, accuracy and report
y_pred_rfm = random_forest_model.predict(X_test_features)

print('\nConfusion Matrix')
_, ax = plt.subplots(figsize=(6,5))
# ConfusionMatrixDisplay replaces metrics.plot_confusion_matrix, which was
# deprecated in scikit-learn 1.0 and removed in 1.2. labels=[0, 1] pins the
# row/column order to the (reversed) display labels.
cm_rfm = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_rfm, labels=[0, 1])
metrics.ConfusionMatrixDisplay(confusion_matrix=cm_rfm, display_labels=target_label[::-1]).plot(ax=ax)
plt.show()

# These scores are computed on the held-out validation split (X_test / y_test),
# so the label says "Validation" rather than "Training".
print('\nAccuracy on Validation Data: ', metrics.accuracy_score(y_test, y_pred_rfm), '\n')
print('Report\n', metrics.classification_report(y_test, y_pred_rfm, target_names=target_label[::-1]))

### AdaBoost Model - GridSearch

### Find best parameters
`parameters = {'n_estimators': range(50, 1100, 100), 'algorithm': ['SAMME', 'SAMME.R'], 'learning_rate': np.arange(.1, 1.1, .1)}`

`ada_boost_grid = GridSearchCV(AdaBoostClassifier(), param_grid=parameters, scoring='precision', cv=10, n_jobs=-1)`

`ada_boost_grid.fit(transformer.transform(X_train), y_train)`

`ada_boost_grid.best_params_`

- Best params: {'algorithm': 'SAMME.R', 'learning_rate': 0.8, 'n_estimators': 150}

In [None]:
# Boosted Tree Model (hyper-parameters taken from the grid search above)
# Transform each split once and reuse the result.
X_train_features = transformer.transform(X_train)
X_test_features = transformer.transform(X_test)

# random_state=1 makes the run reproducible.
# NOTE(review): algorithm='SAMME.R' is deprecated in recent scikit-learn
# releases (presumably from 1.4 on) - confirm against the installed version
# before upgrading.
ada_boost_model = AdaBoostClassifier(n_estimators=150, algorithm='SAMME.R', learning_rate=0.8, random_state=1)
ada_boost_model.fit(X_train_features, y_train)

# Feature importances
feature_importances_btm = pd.Series(ada_boost_model.feature_importances_, index=feature_names).sort_values()
print('\nFeatures Impotances Plot')
feature_importances_btm.plot(kind='barh', figsize=(7,6))
plt.show()

# Validation-set predictions, reused for the matrix, accuracy and report
y_pred_btm = ada_boost_model.predict(X_test_features)

print('\nConfusion Matrix')
_, ax = plt.subplots(figsize=(6,5))
# ConfusionMatrixDisplay replaces metrics.plot_confusion_matrix, which was
# deprecated in scikit-learn 1.0 and removed in 1.2. labels=[0, 1] pins the
# row/column order to the (reversed) display labels.
cm_btm = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_btm, labels=[0, 1])
metrics.ConfusionMatrixDisplay(confusion_matrix=cm_btm, display_labels=target_label[::-1]).plot(ax=ax)
plt.show()

# These scores are computed on the held-out validation split (X_test / y_test),
# so the label says "Validation" rather than "Training".
print('\nAccuracy on Validation Data: ', metrics.accuracy_score(y_test, y_pred_btm), '\n')
print('Report\n', metrics.classification_report(y_test, y_pred_btm, target_names=target_label[::-1]))

### Answer these questions for each model you created:

* Which predictor variables are significant or the most important? Please show the p-values or variable importance charts for all of your predictor variables.

* Validate your model against the Validation set. What was the overall percent accuracy? Show the confusion matrix. Is there any bias seen in the model’s predictions?


## Step 4: Writeup

Decide on the best model and score your new customers. For reviewing consistency, if Score_Creditworthy is greater than Score_NonCreditworthy, the person should be labeled as “Creditworthy”.

### Model Selected
ada_boost_model

In [None]:
# Prepare the new customers' predictors for scoring; the returned target is discarded
data_to_predict = get_predictors_target(new_customers)[0]

In [None]:
# ROC curves of the four models on the validation set, drawn on one axis
_, ax = plt.subplots(figsize=(8,7))
X_test_features = transformer.transform(X_test)

# Positive-class (Creditworthy = 1) scores for every model.
# The logistic regression now contributes its raw predicted probabilities:
# rounding them to 0/1 first collapses the ROC curve to a single operating
# point and understates the AUC.
roc_scores = {
    'Logistic Regression Model': sm_logReg.predict(X_test_features),
    'Decison Tree Model': decision_tree_model.predict_proba(X_test_features)[:, 1],
    'Forest Model': random_forest_model.predict_proba(X_test_features)[:, 1],
    'Boosted Model': ada_boost_model.predict_proba(X_test_features)[:, 1],
}

# roc_curve + RocCurveDisplay replace metrics.plot_roc_curve, which was
# deprecated in scikit-learn 1.0 and removed in 1.2. The display object is
# not bound to the name `display`, which would shadow IPython's display().
for model_name, scores in roc_scores.items():
    fpr, tpr, thresholds = metrics.roc_curve(y_test, scores)
    roc_auc = metrics.auc(fpr, tpr)
    metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot(ax=ax, name=model_name)

In [None]:
# Score the new customers with the logistic regression model:
# predicted probabilities are rounded to hard 0/1 labels.
new_customer_scores = sm_logReg.predict(transformer.transform(data_to_predict))
predicted = np.array([round(score) for score in new_customer_scores])

# Tally the predictions per class (1 = creditworthy, 0 = non-creditworthy)
creditworthy_count, non_creditworthy_count = (
    (predicted == outcome).sum() for outcome in (1, 0)
)
print('Creditworthy applicants: ', creditworthy_count)
print('Non Creditworthy applicants: ', non_creditworthy_count)

In [None]:
# Score the new customers with the decision tree model
new_customer_features = transformer.transform(data_to_predict)
predicted = decision_tree_model.predict(new_customer_features)

# Tally the predictions per class (1 = creditworthy, 0 = non-creditworthy)
creditworthy_count, non_creditworthy_count = (
    (predicted == outcome).sum() for outcome in (1, 0)
)
print('Creditworthy applicants: ', creditworthy_count)
print('Non Creditworthy applicants: ', non_creditworthy_count)

In [None]:
# Score the new customers with the forest model
new_customer_features = transformer.transform(data_to_predict)
predicted = random_forest_model.predict(new_customer_features)

# Tally the predictions per class (1 = creditworthy, 0 = non-creditworthy)
creditworthy_count, non_creditworthy_count = (
    (predicted == outcome).sum() for outcome in (1, 0)
)
print('Creditworthy applicants: ', creditworthy_count)
print('Non Creditworthy applicants: ', non_creditworthy_count)

In [None]:
# Score the new customers with the boosted model
new_customer_features = transformer.transform(data_to_predict)
predicted = ada_boost_model.predict(new_customer_features)

# Tally the predictions per class (1 = creditworthy, 0 = non-creditworthy)
creditworthy_count, non_creditworthy_count = (
    (predicted == outcome).sum() for outcome in (1, 0)
)
print('Creditworthy applicants: ', creditworthy_count)
print('Non Creditworthy applicants: ', non_creditworthy_count)

### Answer these questions
* Which model did you choose to use? 
* How many individuals are creditworthy?