<a href="https://colab.research.google.com/github/YI-CHENG-SHIH645/ML-in-Business_practice/blob/main/ch3_linear_regression_exercises.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Imports
!gdown --id "15p2cUU3ocPUDDS9tGmhNos05dSyJFd2A"
!pip install scikit-learn==0.24.1

import pandas as pd
import numpy as np
import itertools
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn import set_config
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.impute import SimpleImputer
from utils import DataLoader, display_side_by_side, classifier_measurement_under_threshs

set_config(display='diagram')
rs = 3  # random state
np.random.seed(3)

Downloading...
From: https://drive.google.com/uc?id=15p2cUU3ocPUDDS9tGmhNos05dSyJFd2A
To: /content/utils.py
100% 6.59k/6.59k [00:00<00:00, 10.1MB/s]
Collecting scikit-learn==0.24.1
[?25l  Downloading https://files.pythonhosted.org/packages/f3/74/eb899f41d55f957e2591cde5528e75871f817d9fb46d4732423ecaca736d/scikit_learn-0.24.1-cp37-cp37m-manylinux2010_x86_64.whl (22.3MB)
[K     |████████████████████████████████| 22.3MB 45.3MB/s 
Collecting threadpoolctl>=2.0.0
  Downloading https://files.pythonhosted.org/packages/f7/12/ec3f2e203afa394a149911729357aa48affc59c20e2c1c8297a60f33f133/threadpoolctl-2.1.0-py3-none-any.whl
Installing collected packages: threadpoolctl, scikit-learn
  Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-0.24.1 threadpoolctl-2.1.0


Exercise 3.13

In [None]:
table1_2 = DataLoader.load_data('age_salary_url', sheet_name='val')
X = table1_2['Age'].values.reshape(-1, 1)
y = table1_2['Salary'] / 1e3
table1_2.head()

Unnamed: 0,Age,Salary
0,30,166000
1,26,78000
2,58,310000
3,29,100000
4,40,260000


In [None]:
class NoInteractionPolyFeatures():
    def __init__(self, degree=2):
        self.degree = degree

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.hstack([X**i for i in range(1, self.degree+1)])

def print_model_info(pipe, X, y):
    model = pipe.steps[-1][1]
    print("bias: ", round(model.intercept_, 2))
    print("weights: ", model.coef_.round(2))
    print("mse: ", round(((y-pipe.predict(X))**2).sum(), 2), '\n')

In [None]:
#@title simple regression
pipe = make_pipeline(NoInteractionPolyFeatures(degree=5),
                     StandardScaler(),
                     LinearRegression())
pipe.fit(X, y)
print_model_info(pipe, X, y)

bias:  178.6
weights:  [ -19161.97   84643.67 -143504.71  110804.49  -32724.59]
mse:  6048.82 



In [None]:
#@title ridge & lasso X different alpha (lambda hyperparameter)
for model, alpha in itertools.product([Ridge(), Lasso(max_iter=3000)],
                                      [.02, .05, .1]):
    print(model.__class__.__name__, "alpha =", alpha)
    model.alpha = alpha
    pipe = make_pipeline(NoInteractionPolyFeatures(degree=5),
                         StandardScaler(), model)
    pipe.fit(X, y)
    print_model_info(pipe, X, y)

Ridge alpha = 0.02
bias:  178.6
weights:  [ 101.19   79.59   33.    -34.99 -119.77]
mse:  7459.4 

Ridge alpha = 0.05
bias:  178.6
weights:  [110.62  68.64  17.64 -39.03 -98.56]
mse:  7611.46 

Ridge alpha = 0.1
bias:  178.6
weights:  [108.66  61.91  12.13 -37.69 -85.66]
mse:  7901.53 

Lasso alpha = 0.02
bias:  178.6
weights:  [  -1.53  196.75   70.13    0.   -208.31]
mse:  7198.53 

Lasso alpha = 0.05
bias:  178.6
weights:  [  29.79  179.62   25.14    0.   -176.77]
mse:  7268.38 

Lasso alpha = 0.1
bias:  178.6
weights:  [  77.46  126.96    0.      0.   -145.77]
mse:  7382.74 



  positive)
  positive)
  positive)


Exercise 3.14

In [None]:
train_set = DataLoader.load_data('lending_club_reduced_train_url')
test_set = DataLoader.load_data('lending_club_reduced_test_url')
train_set.head()

Unnamed: 0,home_ownership,income,dti,fico_low,loan_status
0,1,44.304,18.47,690,0
1,0,38.5,33.73,660,0
2,1,54.0,19.0,660,0
3,1,60.0,33.98,695,0
4,0,39.354,10.85,685,0


In [None]:
x_train = train_set.iloc[:, :4]
y_train = train_set.iloc[:, -1]
x_test = test_set.iloc[:, :4]
y_test = test_set.iloc[:, -1]

def print_model_info(pipe):
    model = pipe.steps[-1][1]
    print("bias: ", model.intercept_)
    print("weights: ", model.coef_)

In [None]:
pipe = make_pipeline(LogisticRegression(penalty='none', solver='newton-cg', random_state=rs))
pipe.fit(x_train, y_train)
print_model_info(pipe)

bias:  [-6.56522903]
weights:  [[ 0.139496    0.00410667 -0.00112302  0.0112521 ]]


In [None]:
#@markdown if label are exchanged, probabilities will simply exchanged
pipe.fit(x_train, y_train)
res1 = pipe.predict_proba(x_train).round(4)
pipe.fit(x_train, abs(y_train-1))
res2 = pipe.predict_proba(x_train).round(4)
res1 == np.flip(res2, 1)

array([[ True,  True],
       [ True,  True],
       [ True,  True],
       ...,
       [ True,  True],
       [ True,  True],
       [ True,  True]])

In [None]:
classifier_measurement_under_threshs(abs(y_test-1),
                                     pipe.predict_proba(x_test),
                                     [.25, .20, .15])

Unnamed: 0,0.25,0.20,0.15
Accuracy,79.21,63.47,42.8
TPR,9.07,45.46,79.11
TNR,94.48,67.39,34.89
FPR,5.52,32.61,65.11
Precision,26.37,23.29,20.92
F-Score,13.5,30.8,33.1
F_0.5-Score,19.09,25.81,24.53
F_2-Score,10.44,38.19,50.84
tp,1.62,8.13,14.15
fn,16.26,9.75,3.74


In [None]:
roc_auc_score(abs(y_test-1), pipe.predict_proba(x_test)[:, 1]).round(4)

0.602

Exercise 3.15

In [None]:
iowa_scaled_df = DataLoader.load_data('iowa_reduced_scaled')
iowa_original_df = DataLoader.load_data('iowa_original')

numerical_features = list(iowa_scaled_df.columns[:21])
additional_features = ['LotFrontage', 'LotShape']
categorical_features = ['Neighborhood', 'BsmtQual']
iowa_df = iowa_original_df[numerical_features +\
                           additional_features +\
                           categorical_features].copy()
target_label = iowa_original_df['SalePrice']

In [None]:
iowa_df[additional_features + categorical_features].head()

Unnamed: 0,LotFrontage,LotShape,Neighborhood,BsmtQual
0,65,Reg,CollgCr,Gd
1,80,Reg,Veenker,Gd
2,68,IR1,CollgCr,Gd
3,60,IR1,Crawfor,TA
4,84,IR1,NoRidge,Gd


In [None]:
ordering_mappings = {
    "LotShape": {
        "Reg": 1, "IR1": 2, "IR2": 3, 
        "IR3": 4,
    },
    "BsmtQual": {
        "Ex": 5, "Gd": 4, "TA": 3,
        "Fa": 2, "NA": 0
    }
}

for feature in ordering_mappings:
    iowa_df = iowa_df.replace(ordering_mappings[feature])

iowa_df['LotFrontage'] = iowa_df['LotFrontage'].apply(pd.to_numeric, errors='coerce')
numerical_features = numerical_features + ['LotFrontage']

ordering_features = list(ordering_mappings.keys())
onehot_features = ['Neighborhood']
all_features = numerical_features +\
               ordering_features +\
               list(iowa_df[onehot_features[0]].unique())

iowa_df[additional_features + categorical_features].head()

Unnamed: 0,LotFrontage,LotShape,Neighborhood,BsmtQual
0,65,1,CollgCr,4
1,80,1,Veenker,4
2,68,2,CollgCr,4
3,60,2,Crawfor,3
4,84,2,NoRidge,4


In [None]:
numerical_transformer = Pipeline([('imputer', SimpleImputer(strategy='median')),
                                  ('scaler', StandardScaler())])
ordering_transformer = StandardScaler()
onehot_transformer = Pipeline([('imputer', OneHotEncoder(handle_unknown='ignore', sparse=False)),
                               ('scaler', StandardScaler())])
preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, numerical_features),
                                               ('ord', ordering_transformer, ordering_features),
                                               ('hot', onehot_transformer, onehot_features)])

In [None]:
x, y = iowa_df, target_label
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=rs)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=rs)
y_train_scaler = StandardScaler()
y_train_scaled = y_train_scaler.fit_transform(y_train.values.reshape(-1, 1)).ravel()

In [None]:
pipe = Pipeline([('preprocessor', preprocessor),
                 ('regressor', LinearRegression())])
pipe

In [None]:
#@title save best model for each regressor
models_grid_params = [{"regressor": [LinearRegression()]}] +\
                     [{"regressor": [reg()], 'regressor__alpha': [.01, .05, .1]}\
                      for reg in [Ridge, Lasso, ElasticNet]]

res = {param_grid['regressor'][0].__class__.__name__:
       GridSearchCV(pipe, param_grid).fit(x_train, y_train_scaled) for param_grid in models_grid_params}

In [None]:
def r_squared(y_true, y_predicted):
    return round((y_predicted - y_true.mean()).var() / y_true.var(), 3)

def standardize_tgt(transform_func, data):
    return transform_func(data.values.reshape(-1, 1)).ravel()

dfs, captions = [], []
for model_name in res:
    best_model = res[model_name]
    train_mse_info = "train : " + str(r_squared(y_train_scaled, best_model.predict(x_train))) + '<br>'
    val_mse_info = "vali  : " + str(r_squared(standardize_tgt(y_train_scaler.transform, y_val), best_model.predict(x_val)))
    captions.append(train_mse_info + val_mse_info)
    weights = best_model.best_estimator_.steps[-1][1].coef_
    nonzero_weights = weights[weights!=0]
    df = pd.DataFrame([[len(nonzero_weights)],
                       ['{:.2E}'.format(nonzero_weights.mean())]], 
                      columns=[model_name],
                      index=['num of features', 'magnitude of avg weights'])
    dfs.append(df)
display_side_by_side(dfs, captions)

Unnamed: 0_level_0,LinearRegression,Unnamed: 2_level_0,Unnamed: 3_level_0
Unnamed: 0_level_1,Ridge,Unnamed: 2_level_1,Unnamed: 3_level_1
Unnamed: 0_level_2,Lasso,Unnamed: 2_level_2,Unnamed: 3_level_2
Unnamed: 0_level_3,ElasticNet,Unnamed: 2_level_3,Unnamed: 3_level_3
num of features,49,,
magnitude of avg weights,2.80E+10,,
num of features,49,,
magnitude of avg weights,3.00E-02,,
num of features,33,,
magnitude of avg weights,4.83E-02,,
num of features,40,,
magnitude of avg weights,3.97E-02,,
train : 0.883 vali : 0.907,train : 0.883 vali : 0.907,train : 0.843 vali : 0.859,train : 0.859 vali : 0.877
LinearRegression  num of features  49  magnitude of avg weights  2.80E+10,Ridge  num of features  49  magnitude of avg weights  3.00E-02,Lasso  num of features  33  magnitude of avg weights  4.83E-02,ElasticNet  num of features  40  magnitude of avg weights  3.97E-02

Unnamed: 0,LinearRegression
num of features,49.0
magnitude of avg weights,28000000000.0

Unnamed: 0,Ridge
num of features,49.0
magnitude of avg weights,0.03

Unnamed: 0,Lasso
num of features,33.0
magnitude of avg weights,0.0483

Unnamed: 0,ElasticNet
num of features,40.0
magnitude of avg weights,0.0397


In [None]:
print("test ridge r-squared:", r_squared(standardize_tgt(y_train_scaler.transform, y_test), res['Ridge'].predict(x_test)))
print("test lasso r-squared:", r_squared(standardize_tgt(y_train_scaler.transform, y_test), res['Lasso'].predict(x_test)))

ridge_predictions = standardize_tgt(y_train_scaler.inverse_transform, pd.DataFrame(res['Ridge'].predict(x_test)))
lasso_predictions = standardize_tgt(y_train_scaler.inverse_transform, pd.DataFrame(res['Lasso'].predict(x_test)))
display_side_by_side([pd.DataFrame(ridge_predictions, columns=['Ridge']).astype(int).head(),
                      pd.DataFrame(lasso_predictions, columns=['Lasso']).astype(int).head(),
                      pd.DataFrame(y_test).reset_index(drop=True).head()])

test ridge r-squared: 0.853
test lasso r-squared: 0.811


Unnamed: 0_level_0,Ridge,Unnamed: 2_level_0
Unnamed: 0_level_1,Lasso,Unnamed: 2_level_1
Unnamed: 0_level_2,SalePrice,Unnamed: 2_level_2
0,90037,
1,283676,
2,204334,
3,126787,
4,181873,
0,89964,
1,283370,
2,203860,
3,140846,
4,184828,

Unnamed: 0,Ridge
0,90037
1,283676
2,204334
3,126787
4,181873

Unnamed: 0,Lasso
0,89964
1,283370
2,203860
3,140846
4,184828

Unnamed: 0,SalePrice
0,103000
1,252000
2,196500
3,138500
4,176485


# if you don't scale the target value (SalePrice), then you won't have sparsity on Lasso <br>
# It turns out that ridge / lasso results are almost identical

In [None]:
#@title fit "y_train" instead of "y_train_scaled"
x, y = iowa_df, target_label
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=rs)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=rs)

models_grid_params = [{"regressor": [LinearRegression()]}] +\
                     [{"regressor": [reg(tol=0.05)], 'regressor__alpha': [.01, .05, .1]}\
                      for reg in [Ridge, Lasso, ElasticNet]]

res = {param_grid['regressor'][0].__class__.__name__:
       GridSearchCV(pipe, param_grid).fit(x_train, y_train) for param_grid in models_grid_params}

In [None]:
#@title result
dfs, captions = [], []
for model_name in res:
    best_model = res[model_name]
    train_mse_info = "train : " + str(r_squared(y_train.values, best_model.predict(x_train))) + '<br>'
    val_mse_info = "vali  : " + str(r_squared(y_val.values, best_model.predict(x_val)))
    captions.append(train_mse_info + val_mse_info)
    weights = best_model.best_estimator_.steps[-1][1].coef_
    nonzero_weights = weights[weights!=0]
    df = pd.DataFrame([[len(nonzero_weights)],
                       ['{:.2E}'.format(nonzero_weights.mean())]], 
                      columns=[model_name],
                      index=['num of features', 'magnitude of avg weights'])
    dfs.append(df)
display_side_by_side(dfs, captions)

Unnamed: 0_level_0,LinearRegression,Unnamed: 2_level_0,Unnamed: 3_level_0
Unnamed: 0_level_1,Ridge,Unnamed: 2_level_1,Unnamed: 3_level_1
Unnamed: 0_level_2,Lasso,Unnamed: 2_level_2,Unnamed: 3_level_2
Unnamed: 0_level_3,ElasticNet,Unnamed: 2_level_3,Unnamed: 3_level_3
num of features,49,,
magnitude of avg weights,2.17E+15,,
num of features,49,,
magnitude of avg weights,2.32E+03,,
num of features,49,,
magnitude of avg weights,2.18E+03,,
num of features,49,,
magnitude of avg weights,2.25E+03,,
train : 0.883 vali : 0.907,train : 0.883 vali : 0.907,train : 0.883 vali : 0.907,train : 0.862 vali : 0.881
LinearRegression  num of features  49  magnitude of avg weights  2.17E+15,Ridge  num of features  49  magnitude of avg weights  2.32E+03,Lasso  num of features  49  magnitude of avg weights  2.18E+03,ElasticNet  num of features  49  magnitude of avg weights  2.25E+03

Unnamed: 0,LinearRegression
num of features,49.0
magnitude of avg weights,2170000000000000.0

Unnamed: 0,Ridge
num of features,49.0
magnitude of avg weights,2320.0

Unnamed: 0,Lasso
num of features,49.0
magnitude of avg weights,2180.0

Unnamed: 0,ElasticNet
num of features,49.0
magnitude of avg weights,2250.0


In [None]:
#@title prediction is already the SalePrice, no need to do inverse transform
print("test ridge r-squared:", r_squared(y_test.values, res['Ridge'].predict(x_test)))
print("test lasso r-squared:", r_squared(y_test.values, res['Lasso'].predict(x_test)))

ridge_predictions = res['Ridge'].predict(x_test)
lasso_predictions = res['Lasso'].predict(x_test)
display_side_by_side([pd.DataFrame(ridge_predictions, columns=['Ridge']).astype(int).head(),
                      pd.DataFrame(lasso_predictions, columns=['Lasso']).astype(int).head(),
                      pd.DataFrame(y_test).reset_index(drop=True).head()])

test ridge r-squared: 0.853
test lasso r-squared: 0.853


Unnamed: 0_level_0,Ridge,Unnamed: 2_level_0
Unnamed: 0_level_1,Lasso,Unnamed: 2_level_1
Unnamed: 0_level_2,SalePrice,Unnamed: 2_level_2
0,90037,
1,283676,
2,204334,
3,126787,
4,181873,
0,90041,
1,283673,
2,204335,
3,126786,
4,181874,

Unnamed: 0,Ridge
0,90037
1,283676
2,204334
3,126787
4,181873

Unnamed: 0,Lasso
0,90041
1,283673
2,204335
3,126786
4,181874

Unnamed: 0,SalePrice
0,103000
1,252000
2,196500
3,138500
4,176485


Exercise 3.16

In [None]:
# it takes ~ 1 min
full_df = DataLoader.load_data('lending_club_full_url')
full_df = full_df[full_df.columns[1:]]

In [None]:
#@title processed data
def select_columns(dataframe, cols: list):
    return dataframe[cols]

def select_rows_by_col_values(dataframe, col: str, values: list):
    return dataframe[dataframe[col].isin(values).values]

def replace_col_values(dataframe, col: str, mapping: dict):
    dataframe[col] = dataframe[col].replace(mapping)
    return dataframe

def math_op(dataframe, col: str, op_func, v):
    dataframe[col] = op_func(dataframe[col], v)
    return dataframe

new_train_df = full_df.pipe(select_columns, cols=['home_ownership', 'annual_inc', 'dti', 'fico_range_low', 'loan_status'])\
                      .pipe(select_rows_by_col_values, col='loan_status', values=['Fully Paid', 'Charged Off'])\
                      .pipe(replace_col_values, col='loan_status', mapping={"Charged Off": 0, "Fully Paid": 1})\
                      .pipe(select_rows_by_col_values, col='home_ownership', values=['OWN', 'RENT'])\
                      .pipe(replace_col_values, col='home_ownership', mapping={"RENT": 0, "OWN": 1})\
                      .pipe(math_op, col='annual_inc', op_func=np.divide, v=1e3)
new_train_df.head(6)

Unnamed: 0,home_ownership,annual_inc,dti,fico_range_low,loan_status
2,0,50.0,29.62,735.0,1
3,0,64.4,16.68,675.0,1
5,0,38.5,33.73,660.0,0
11,0,76.0,17.31,685.0,1
12,0,75.0,22.34,700.0,1
17,0,55.0,22.0,665.0,1


In [None]:
#@title original data with selected rows
full_df.pipe(select_columns, cols=['home_ownership', 'annual_inc', 'dti', 'fico_range_low', 'loan_status'])\
       .loc[new_train_df.index, :].head(6)

Unnamed: 0,home_ownership,annual_inc,dti,fico_range_low,loan_status
2,RENT,50000.0,29.62,735.0,Fully Paid
3,RENT,64400.0,16.68,675.0,Fully Paid
5,RENT,38500.0,33.73,660.0,Charged Off
11,RENT,76000.0,17.31,685.0,Fully Paid
12,RENT,75000.0,22.34,700.0,Fully Paid
17,RENT,55000.0,22.0,665.0,Fully Paid


In [None]:
pipe = Pipeline([('imputer', SimpleImputer(strategy='median')),
                 ('classifier', LogisticRegression(penalty='none', solver='newton-cg', random_state=3))])

X = new_train_df.drop('loan_status', axis=1)
y = new_train_df['loan_status']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rs, stratify=y)

pipe.fit(X_train, y_train)

In [None]:
classifier_measurement_under_threshs(y_test, pipe.predict_proba(X_test), [0.3, 0.4, 0.5, 0.6, 0.75, 0.80, 0.85])

Unnamed: 0,0.30,0.40,0.50,0.60,0.75,0.80,0.85
Accuracy,76.49,76.49,76.49,76.36,60.74,47.62,34.88
TPR,99.91,99.91,99.83,97.27,62.68,39.54,17.85
TNR,0.28,0.28,0.56,8.33,54.44,73.89,90.28
FPR,99.72,99.72,99.44,91.67,45.56,26.11,9.72
Precision,76.52,76.52,76.56,77.54,81.74,83.12,85.66
F-Score,86.67,86.67,86.66,86.29,70.95,53.59,29.54
F_0.5-Score,80.28,80.28,80.3,80.81,77.05,68.11,48.67
F_2-Score,94.16,94.16,94.11,92.56,65.75,44.17,21.21
tp,76.42,76.42,76.36,74.4,47.94,30.24,13.65
fn,0.07,0.07,0.13,2.09,28.54,46.24,62.83


# F-score

$F_{\beta} = \frac{1+\beta^2}{\frac{1}{precision} + \frac{\beta^2}{recall}}$ ,  consider $F_{2}$, $F_{0.5}$ <br><br>
$F_2 = \frac{5}{\frac{1}{precision} + \frac{4}{recall}}$,
$F_{0.5} = \frac{1.25}{\frac{1}{precision} + \frac{0.25}{recall}} = \frac{5}{\frac{4}{precision} + \frac{1}{recall}}$ <br><br>

---

let $precision = 1, recall = 0.5$ <br><br>
$F_2 = \frac{5}{9}, F_{0.5} = \frac{5}{6}$ <br><br>

---

let $precision = 0.5, recall = 1$ <br><br>
$F_2 = \frac{5}{6}, F_{0.5} = \frac{5}{9}$ <br><br>

---

conclusion: <br>&nbsp;&nbsp;&nbsp;&nbsp;
if $\beta > 1$, it's encouraged to improve recall

給定 model, 設定不同的 threshold, 取能夠最大化 $F_{\beta}$ 的 threshold

In [None]:
# V * TP - 4V * FP
print(round(47.94 - 4 * 10.71, 3))
print(round(30.24 - 4 * 6.14, 3))
print(round(13.65 - 4 * 2.29, 3))

5.1
5.68
4.49


In [None]:
round(roc_auc_score(y_test, pipe.predict_proba(X_test)[:, 1]), 3)

0.614

In [None]:
#@title add some other features to see if there's any improvement
full_df_with_float_col = full_df[[col for col in full_df.columns 
                                  if (full_df[col].dtype == float) and (col not in new_train_df.columns)]]
full_df_with_float_col = full_df_with_float_col.loc[new_train_df.index, :]
full_df_with_float_col = full_df_with_float_col.dropna(axis=1, thresh=int(len(full_df_with_float_col))*0.95)

orig_addi_df = pd.concat([new_train_df, full_df_with_float_col], axis=1)
orig_features = list(new_train_df.drop('loan_status', axis=1).columns)
addi_features = list(full_df_with_float_col.columns)
print("features used above: ", orig_features)
print("some additional features to choose from:", addi_features)

features used above:  ['home_ownership', 'annual_inc', 'dti', 'fico_range_low']
some additional features to choose from: ['loan_amnt', 'int_rate', 'installment', 'delinq_2yrs', 'fico_range_high', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'last_fico_range_high', 'last_fico_range_low', 'collections_12_mths_ex_med', 'policy_code', 'acc_now_delinq', 'chargeoff_within_12_mths', 'delinq_amnt', 'pub_rec_bankruptcies', 'tax_liens']


In [None]:
orig_features_transformer = SimpleImputer(strategy='median')
addi_features_transformer = Pipeline([('imputer', SimpleImputer(strategy='median')),
                                      ('feature_selection', SelectKBest(chi2, k=2))])
preprocessor = ColumnTransformer(transformers=[('orig', orig_features_transformer, orig_features),
                                               ('addi', addi_features_transformer, addi_features)])

pipe = Pipeline([('preprocessor', preprocessor),
                 ('classifier', LogisticRegression(penalty='none', solver='newton-cg', random_state=3, tol=20))])

In [None]:
x = orig_addi_df.drop('loan_status', axis=1)
y = orig_addi_df['loan_status']
print(x.shape[1])
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=rs, stratify=y)

pipe.fit(x_train, y_train)

24


In [None]:
#@title addi features selected by "SelectKBest" Algorithm
feature_selected_idx = pipe.named_steps['preprocessor']\
                           .named_transformers_['addi']\
                           .named_steps['feature_selection']\
                           .get_support(True)
selected_features = list(np.array(addi_features)[feature_selected_idx])
selected_features

['loan_amnt', 'last_fico_range_low']

In [None]:
classifier_measurement_under_threshs(y_test, pipe.predict_proba(x_test), [0.75, 0.80, 0.85])

0.928


Unnamed: 0,0.75,0.80,0.85
Accuracy,87.66,86.22,83.41
TPR,88.13,84.88,80.61
TNR,86.11,90.56,92.5
FPR,13.89,9.44,7.5
Precision,95.38,96.69,97.22
F-Score,91.61,90.4,88.14
F_0.5-Score,93.84,94.08,93.37
F_2-Score,89.49,87.01,83.47
tp,67.41,64.92,61.66
fn,9.08,11.56,14.83


In [None]:
# V * TP - 4V * FP
print(round(67.41 - 4 * 3.27, 3))
print(round(64.92 - 4 * 2.22, 3))
print(round(61.66 - 4 * 1.76, 3))

54.33
56.04
54.62


In [None]:
print(round(roc_auc_score(y_test, pipe.predict_proba(x_test)[:, 1]), 3))

0.928
