# Project 4 - Hackathon Project<br>

Brandie Hatch

- Preprocessing
- Modeling
- Evaluation
- Conclusions & Next Steps

## Preprocessing

In [125]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm

%matplotlib inline

from sklearn.linear_model import LinearRegression, LogisticRegression, LassoCV, RidgeCV, Ridge, Lasso, ElasticNet, ElasticNetCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, confusion_matrix, plot_confusion_matrix, ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer, ColumnTransformer, TransformedTargetRegressor
from sklearn.decomposition import PCA

pd.options.display.max_columns =999

In [102]:
df = pd.read_csv('../data/bank_clean.csv', index_col=False)
print(df.shape)
df.head()

(45211, 9)


Unnamed: 0,age,job,marital,education,default,balance,housing,personal,termdep
0,58,management,married,tertiary,0,2143,1,0,0
1,44,technician,single,secondary,0,29,1,0,0
2,33,entrepreneur,married,secondary,0,2,1,1,0
3,47,blue-collar,married,unknown,0,1506,1,0,0
4,33,unknown,single,unknown,0,1,0,0,0


In [103]:
# Define X, y, and train/test/split (stratify because of categorical 0/1 data for y)
X = df.drop(columns='default')
y = df['default']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.03,
    random_state=42,
    stratify=y
)

In [104]:
# Print shapes and head to confirm train/test/split
print(f"Shape of original data frame: {df.shape}")
print(" ")
print(f"Shape of X train: {X_train.shape}")
print(f"Shape of y train: {y_train.shape}")
print(" ")
print(f"Shape of X test: {X_test.shape}")
print(f"Shape of y test: {y_test.shape}")
X_train.head()

Shape of original data frame: (45211, 9)
 
Shape of X train: (43854, 8)
Shape of y train: (43854,)
 
Shape of X test: (1357, 8)
Shape of y test: (1357,)


Unnamed: 0,age,job,marital,education,balance,housing,personal,termdep
44050,74,retired,divorced,primary,29080,0,0,1
42112,32,student,married,secondary,833,1,0,0
20135,30,entrepreneur,married,tertiary,5,0,0,0
42230,42,technician,married,secondary,994,1,0,1
25134,32,services,divorced,secondary,453,1,0,0


### Prepare job, marital, and education

In [105]:
X_train['job'].value_counts()

blue-collar      9423
management       9173
technician       7371
admin.           5034
services         4022
retired          2199
self-employed    1541
entrepreneur     1456
unemployed       1254
housemaid        1201
student           903
unknown           277
Name: job, dtype: int64

In [106]:
X_train['marital'].value_counts()

married     26385
single      12419
divorced     5050
Name: marital, dtype: int64

In [107]:
X_train['education'].value_counts()

secondary    22506
tertiary     12895
primary       6653
unknown       1800
Name: education, dtype: int64

In [108]:
numeric = [col for col in X_train._get_numeric_data().columns if col not in 'default']
categorical = [col for col in X_train.columns if col not in numeric and col not in 'default']

In [109]:
# Make column transformer for job, marital, and education; 
col_trans = ColumnTransformer(
    [('ss', StandardScaler(), numeric),
    ('ohe', OneHotEncoder(sparse=False, drop='if_binary', handle_unknown='ignore'), categorical)],
    sparse_threshold=0
)

In [110]:
col_trans.fit_transform(X_train)

array([[ 3.11313413,  9.16891124, -1.11961857, ...,  0.        ,
         0.        ,  0.        ],
       [-0.84228288, -0.17432059,  0.89316132, ...,  1.        ,
         0.        ,  0.        ],
       [-1.03063607, -0.44819731, -1.11961857, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-1.40734245, -0.3651743 ,  0.89316132, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.13542563,  1.29032453, -1.11961857, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.19365967,  0.45347897, -1.11961857, ...,  0.        ,
         1.        ,  0.        ]])

In [126]:
# Create Pipeline for transformers, scaling, and estimating + instantiate Regression
pipe = Pipeline([
    ('col_trans', col_trans),
    ('pca', PCA()),
    ('lasso', Lasso())
])

lasso_params= {
    'col_trans__remainder': ['passthrough'],
    'pca__n_components': range(1,X.shape[1]+1,1),
    'lasso__alpha': [-0.1, 0.1, 100],
    'lasso__normalize': [True, False],
    'lasso__selection': ['cyclic', 'random']
}

In [127]:
# Gridsearch for instantiation and running model
gs = GridSearchCV(
    pipe,
    lasso_params,
    cv=5
)

## Modeling


In [136]:
# Baseline model
y_train.mean()

0.01803712318146577

In [139]:
# these are the predicted values
y_pred = [y_train.mean()]*len(y_test)

In [140]:
r2_score(y_test, y_pred)

-7.093468467278541e-06

Few features are important in this model, so I am going to start with Lasso regression 

In [128]:
gs.fit(X_train, y_train)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lasso())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lasso())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set para

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('col_trans',
                                        ColumnTransformer(sparse_threshold=0,
                                                          transformers=[('ss',
                                                                         StandardScaler(),
                                                                         ['age',
                                                                          'balance',
                                                                          'housing',
                                                                          'personal',
                                                                          'termdep']),
                                                                        ('ohe',
                                                                         OneHotEncoder(drop='if_binary',
                                                                       

In [131]:
    print("Best Number Of Components:", gs.best_estimator_.get_params()["pca__n_components"])
    print(gs.best_estimator_.get_params()["lasso"])

Best Number Of Components: 1
Lasso(alpha=0.1, normalize=True)


In [129]:
gs.best_score_

-0.00021436579644049659

In [132]:
gs.best_estimator_

Pipeline(steps=[('col_trans',
                 ColumnTransformer(remainder='passthrough', sparse_threshold=0,
                                   transformers=[('ss', StandardScaler(),
                                                  ['age', 'balance', 'housing',
                                                   'personal', 'termdep']),
                                                 ('ohe',
                                                  OneHotEncoder(drop='if_binary',
                                                                handle_unknown='ignore',
                                                                sparse=False),
                                                  ['job', 'marital',
                                                   'education'])])),
                ('pca', PCA(n_components=1)),
                ('lasso', Lasso(alpha=0.1, normalize=True))])

In [116]:
gs.best_params_

{'col_trans__remainder': 'passthrough', 'lasso__alpha': 0.1}

In [134]:
gs.score(X_test, y_test)

-7.093468467278541e-06

## Evaluation


## Conclusions & Next Steps