# IN-STK-5000 Project: Credit risk for mortgages - part 1
## Deadline 1: September 18
## Luca Attanasio, ...



### You can execute the code by running the cell below. 
Some changes were made to `TestLending.py` in order to collect all the utilities from each of the bankers and plot the utility values as boxplots. Boxplots are not the ideal way to present the utility values, since they can be misleading (one banker may outperform another on some tests and vice versa on others), but they still give a feel for how well each banker is doing.
# WARNING: The code runs very slowly with n_tests=100 (so we set the value to n_tests=2 here). For the results with n_tests=100, see the report.

In [None]:
%matplotlib inline
%run TestLending.py 2
None

In this project, we use five bankers to collect data. In particular:
1. deterministic banker: always grant
2. deterministic banker: never grant
3. random banker
4. random forest banker
5. neural network banker (not optimized)

### 1. and 2. deterministic banker
The `deterministic_banker` can be initialized with an action that is either 0 or 1, giving the never-grant and always-grant bankers respectively. Consequently, the action it always chooses is 0 or 1 respectively.

In [None]:
# %load deterministic_banker
import numpy as np

class DeterministicBanker:
    """Banker that answers every loan application with one fixed action.

    The action is chosen at construction time: 0 means the loan is not
    granted, 1 means the loan is granted.
    """

    def __init__(self, action):
        """Store the fixed action (0 or 1) and the banker's display name."""
        self.action = action
        self.name = 'deterministic'

    def fit(self, X, y):
        """Accept training data and ignore it; nothing is learned."""
        return None

    def set_interest_rate(self, rate):
        """Remember the interest rate on the instance."""
        self.rate = rate

    def predict_proba(self, x):
        """Return the constant 0; no model backs this banker."""
        return 0

    def expected_utility(self, x, action):
        """Print a notice that the expected utility is not implemented.

        Nothing is computed, so the method implicitly returns None.
        """
        print("Expected utility: Not implemented")

    def get_best_action(self, x):
        """Return the action given at construction, whatever `x` is."""
        return self.action


### 3. random banker
The random banker returns a random action, either 0 or 1. It may do well by chance, but it is the worst strategy to choose if we want to maximize the total utility.

In [None]:
# %load random_banker.py
import numpy as np

class RandomBanker:
    """Banker that picks its action at random, ignoring the applicant."""

    def __init__(self):
        """Set the banker's display name."""
        self.name = 'random'

    def fit(self, X, y):
        """Keep the training data on the instance; it is never used to decide."""
        self.data = [X, y]

    def set_interest_rate(self, rate):
        """Remember the interest rate on the instance."""
        self.rate = rate

    def predict_proba(self, x):
        """Return the constant 0; no model backs this banker."""
        return 0

    def expected_utility(self, x, action):
        """Print a notice that the expected utility is not implemented.

        Nothing is computed, so the method implicitly returns None.
        """
        print("Expected utility: Not implemented")

    def get_best_action(self, x):
        """Return 1 (grant) or 0 (do not grant), drawn from numpy's global RNG."""
        # Draw a one-element sample from {0, 1} and unwrap it.
        draw = np.random.choice(2, 1)
        return draw[0]


### Calculating the expected utility and best action
This part is documented in the report.

### 4. Random forest banker
This banker uses a random forest classifier to predict the probability that the given input is a bad loan.
To get the best performance out of the classifier, cross-validation is run on each training set to select the best value of the maximum tree depth. In addition, the random forest was optimized by setting $best\_max\_depth$ and $best\_max\_features$ using cross-validation.

In [None]:
# %load project_banker.py
import numpy as np
import pandas as pd
# model for fitting dataset
from sklearn.ensemble import RandomForestClassifier
# select best model
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler

class ProjectBanker:
    """Banker that grants loans based on a random forest's risk estimate.

    A RandomForestClassifier is trained on min-max scaled features. For each
    applicant the predicted probability of the second class is used as the
    probability of a bad loan, and the loan is granted only when the expected
    utility of granting is positive.
    """

    def __init__(self):
        self.name = 'forest'
        # Hyper-parameters hard-coded from an earlier cross-validation run.
        # Set one of them to None (one at a time) to make
        # set_best_max_depth / set_best_max_features search for it again.
        self.best_max_depth = 15
        self.best_max_features = 35

    def preprocessing(self, X, fit=False):
        """Return a min-max scaled copy of `X`.

        Parameters:
            X: feature matrix (DataFrame or 2-D array).
            fit: when True, create a new MinMaxScaler and fit it on `X`
                before scaling; when False, reuse the stored scaler.

        Returns:
            The scaled data as returned by the scaler. `X` itself is not
            modified.

        Raises:
            AttributeError: if `fit` is False and no scaler has been fitted.
        """
        if fit:
            self.scaler = MinMaxScaler()
            return self.scaler.fit_transform(X)
        # The scaler output must be returned: the previous code discarded it
        # and handed the unscaled data back to the caller.
        return self.scaler.transform(X)

    def fit(self, X, y):
        """Fit the scaler and a random forest classifier on the training data.

        The training score is printed once the classifier is fitted.
        """
        # Always refit the scaler here so it matches the data the classifier
        # is trained on, whether or not a hyper-parameter search ran before.
        X_scaled = self.preprocessing(X, fit=True)

        self.clf = RandomForestClassifier(
            n_estimators=100,
            random_state=0,
            max_depth=self.best_max_depth,
            max_features=self.best_max_features,
            class_weight="balanced"
        )
        self.clf.fit(X_scaled, y)
        print("training score ", self.clf.score(X_scaled, y))

    def set_best_max_depth(self, X, y):
        """Pick `best_max_depth` by 5-fold cross-validation over depths 5..19.

        Does nothing when `best_max_depth` is already set.
        """
        if self.best_max_depth is not None:
            return

        X_scaled = self.preprocessing(X, fit=True)
        depths = range(5, 20)
        untrained_models = [
            RandomForestClassifier(
                n_estimators=100,
                random_state=0,
                max_depth=d,
                max_features=self.best_max_features,
                class_weight="balanced",
            )
            for d in depths
        ]
        fold_scores = [
            cross_val_score(estimator=m, X=X_scaled, y=y, cv=5)
            for m in untrained_models
        ]
        mean_xv_scores = [s.mean() for s in fold_scores]
        best_index = np.asarray(mean_xv_scores).argmax()
        print("fold: ", fold_scores, len(fold_scores), best_index)
        self.best_max_depth = depths[best_index]
        print("best_max_depth: ", self.best_max_depth)

    def set_best_max_features(self, X, y):
        """Pick `best_max_features` by 5-fold cross-validation over 20, 25, 30, 35.

        Does nothing when `best_max_features` is already set.
        """
        if self.best_max_features is not None:
            return

        X_scaled = self.preprocessing(X, fit=True)
        features = range(20, 40, 5)
        untrained_models = [
            RandomForestClassifier(
                n_estimators=100,
                random_state=0,
                max_depth=self.best_max_depth,
                max_features=f,
                class_weight="balanced",
            )
            for f in features
        ]
        fold_scores = [
            cross_val_score(estimator=m, X=X_scaled, y=y, cv=5)
            for m in untrained_models
        ]
        mean_xv_scores = [s.mean() for s in fold_scores]
        self.best_max_features = features[np.asarray(mean_xv_scores).argmax()]
        print("best_max_features: ", self.best_max_features)

    def test_accuracy(self, X, y):
        """Return the classifier's score on `X`, `y` using the fitted scaler."""
        X_scaled = self.preprocessing(X)
        return self.clf.score(X_scaled, y)

    def set_interest_rate(self, rate):
        """Store the interest rate used by `expected_utility`."""
        self.rate = rate
        return

    def predict_proba(self, x):
        """Return the predicted probability that applicant `x` is a bad loan.

        `x` is a single sample given as a pandas Series; it is converted to
        numpy and reshaped to one row before scaling and prediction.

        The value returned is the probability of the second class known to
        the classifier. NOTE(review): this assumes the bad-loan label sorts
        after the good-loan label (e.g. 1 = good, 2 = bad) -- confirm against
        the label encoding of `y`.
        """
        x_reshaped = np.reshape(x.to_numpy(), (1, -1))
        x_scaled = self.preprocessing(x_reshaped)
        prediction = self.clf.predict_proba(x_scaled)
        return prediction[0][1]

    def expected_utility(self, x, action):
        """Return the expected utility of taking `action` for applicant `x`.

        With p = predict_proba(x), granting the loan (action 1) yields
            amount * (1 + rate)^duration * (1 - p) - amount * p
        where `amount` and `duration` are read from `x`. Any other action
        yields 0, since nothing is gained or lost.
        """
        amount_of_loan = x['amount']
        length_of_loan = x['duration']
        if action == 1:
            proba = self.predict_proba(x)
            gain = amount_of_loan * (pow(1 + self.rate, length_of_loan)) * (1 - proba)
            loss = amount_of_loan * proba
            return gain - loss

        return 0

    def get_best_action(self, x):
        """Return 1 (grant) if granting has the higher expected utility, else 0."""
        actions = [0, 1]
        utility_0 = self.expected_utility(x, actions[0])
        utility_1 = self.expected_utility(x, actions[1])
        # Ties go to not granting the loan.
        if utility_1 > utility_0:
            return actions[1]
        return actions[0]


### 5. Neural network banker
This banker uses a neural network classifier to predict the probability that the given input is a bad loan.

In [None]:
# %load nn_banker.py
import numpy as np
import pandas as pd
import numpy.random as npr
# model for fitting dataset
# implement a nn here with keras
from keras.models import Sequential
from keras.layers import Dense, Activation
from sklearn.preprocessing import MinMaxScaler
# from sklearn.utils import class_weight # assign a class weight

# suppress warnings
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

class ProjectBanker:
    """Banker that grants loans based on a neural network's risk estimate.

    A small Keras feed-forward network with a sigmoid output is trained on
    min-max scaled features. Its output is used as the probability of a bad
    loan, and the loan is granted only when the expected utility of granting
    is positive.
    """

    def __init__(self):
        self.name = 'nn'
        # Seeds numpy's global RNG (a process-wide side effect).
        npr.seed(100)

    def preprocessing(self, X, fit=False):
        """Return a min-max scaled copy of `X`.

        Parameters:
            X: feature matrix (DataFrame or 2-D array).
            fit: when True, create a new MinMaxScaler and fit it on `X`
                before scaling; when False, reuse the stored scaler.

        Returns:
            The scaled data as returned by the scaler. `X` itself is not
            modified.

        Raises:
            AttributeError: if `fit` is False and no scaler has been fitted.
        """
        if fit:
            self.scaler = MinMaxScaler()
            return self.scaler.fit_transform(X)
        # The scaler output must be returned: the previous code discarded it
        # and handed the unscaled data back to the caller.
        return self.scaler.transform(X)

    def fit(self, X, y):
        """Fit the scaler and train the neural network on the training data."""
        # Shift the labels down by one so they suit binary cross-entropy:
        # 1 (good loan) -> 0, 2 (bad loan) -> 1.
        y = y - 1
        X_scaled = self.preprocessing(X, fit=True)

        # Three tanh hidden layers and a single sigmoid output unit.
        self.model = Sequential([
            Dense(64, input_shape=(X_scaled.shape[1],)),
            Activation('tanh'),
            Dense(32),
            Activation('tanh'),
            Dense(16),
            Activation('tanh'),
            Dense(1),
            Activation('sigmoid'),
        ])

        # NOTE: no class weighting is applied during training. The previous
        # code built a weight dict but never passed it to `fit`.
        self.model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            metrics=['accuracy']
        )

        self.model.fit(X_scaled, y, epochs=20)

    def test_accuracy(self, X, y):
        """Return the model's accuracy on `X`, `y` using the fitted scaler."""
        y = y - 1  # same label shift as in fit
        X_scaled = self.preprocessing(X)
        test_loss, test_acc = self.model.evaluate(X_scaled, y)
        return test_acc

    def set_interest_rate(self, rate):
        """Store the interest rate used by `expected_utility`."""
        self.rate = rate
        return

    def predict_proba(self, x):
        """Return the predicted probability that applicant `x` is a bad loan.

        `x` is a single sample given as a pandas Series; it is converted to
        numpy and reshaped to one row before scaling and prediction. The
        value returned is the network's single sigmoid output, which after
        the label shift in `fit` corresponds to label 1 (bad loan).
        """
        x_reshaped = np.reshape(x.to_numpy(), (1, -1))
        x_scaled = self.preprocessing(x_reshaped)
        prediction = self.model.predict(x_scaled)
        return prediction[0][0]

    def expected_utility(self, x, action):
        """Return the expected utility of taking `action` for applicant `x`.

        With p = predict_proba(x), granting the loan (action 1) yields
            amount * (1 + rate)^duration * (1 - p) - amount * p
        where `amount` and `duration` are read from `x`. Any other action
        yields 0, since nothing is gained or lost.
        """
        amount_of_loan = x['amount']
        length_of_loan = x['duration']
        if action == 1:
            proba = self.predict_proba(x)
            gain = amount_of_loan * (pow(1 + self.rate, length_of_loan)) * (1 - proba)
            loss = amount_of_loan * proba
            return gain - loss

        return 0

    def get_best_action(self, x):
        """Return 1 (grant) if granting has the higher expected utility, else 0."""
        actions = [0, 1]
        utility_0 = self.expected_utility(x, actions[0])
        utility_1 = self.expected_utility(x, actions[1])
        # Ties go to not granting the loan.
        if utility_1 > utility_0:
            return actions[1]

        return actions[0]
