# Explore here

In [93]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_percentage_error, accuracy_score
import xgboost as xgb
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import GridSearchCV
from pickle import dump



In [94]:
df = pd.read_csv('https://raw.githubusercontent.com/4GeeksAcademy/logistic-regression-project-tutorial/main/bank-marketing-campaign-data.csv', sep=';')

In [95]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [96]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [97]:
df.default.unique()

array(['no', 'unknown', 'yes'], dtype=object)

In [98]:
df.duplicated().sum()

12

In [99]:
df = df.drop_duplicates()
df.duplicated().sum()

0

In [100]:
df.isna().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [101]:
df.nunique()

age                 78
job                 12
marital              4
education            8
default              3
housing              3
loan                 3
contact              2
month               10
day_of_week          5
duration          1544
campaign            42
pdays               27
previous             8
poutcome             3
emp.var.rate        10
cons.price.idx      26
cons.conf.idx       26
euribor3m          316
nr.employed         11
y                    2
dtype: int64

In [102]:
print(f'''
      default: {df.default.value_counts()},
      housing: {df.housing.value_counts()},
      loan: {df.loan.value_counts()},
      contact: {df.contact.value_counts()},
      previous: {df.previous.value_counts()},
      poutcome: {df.poutcome.value_counts()},
      y: {df.y.value_counts()}
      ''')



      default: default
no         32577
unknown     8596
yes            3
Name: count, dtype: int64,
      housing: housing
yes        21571
no         18615
unknown      990
Name: count, dtype: int64,
      loan: loan
no         33938
yes         6248
unknown      990
Name: count, dtype: int64,
      contact: contact
cellular     26135
telephone    15041
Name: count, dtype: int64,
      previous: previous
0    35551
1     4561
2      754
3      216
4       70
5       18
6        5
7        1
Name: count, dtype: int64,
      poutcome: poutcome
nonexistent    35551
failure         4252
success         1373
Name: count, dtype: int64,
      y: y
no     36537
yes     4639
Name: count, dtype: int64
      


In [103]:
df.default.value_counts()

default
no         32577
unknown     8596
yes            3
Name: count, dtype: int64

In [104]:
pd.pivot_table(
    df,
    index=['education', 'job'],
    columns=['y'],
    aggfunc='size',
    fill_value=0  # to avoid the 'nan' legend when some value doesn't exists
)

Unnamed: 0_level_0,y,no,yes
education,job,Unnamed: 2_level_1,Unnamed: 3_level_1
basic.4y,admin.,67,10
basic.4y,blue-collar,2195,123
basic.4y,entrepreneur,130,7
basic.4y,housemaid,423,51
basic.4y,management,95,5
...,...,...,...
unknown,services,131,19
unknown,student,108,59
unknown,technician,187,25
unknown,unemployed,14,5


In [105]:
pivot = pd.pivot_table(
    df,
    index=['education', 'job'],
    columns=['y'],
    aggfunc='size',
    fill_value=0  # to avoid the 'nan' legend when some value doesn't exists
)
# to force the pivot table format with percentajes
pivot_percentage = pivot.div(pivot.sum(axis=1), axis=0) * 100
pivot_percentage.style.format("{:.2f}%")

Unnamed: 0_level_0,y,no,yes
education,job,Unnamed: 2_level_1,Unnamed: 3_level_1
basic.4y,admin.,87.01%,12.99%
basic.4y,blue-collar,94.69%,5.31%
basic.4y,entrepreneur,94.89%,5.11%
basic.4y,housemaid,89.24%,10.76%
basic.4y,management,95.00%,5.00%
basic.4y,retired,69.01%,30.99%
basic.4y,self-employed,96.77%,3.23%
basic.4y,services,94.70%,5.30%
basic.4y,student,69.23%,30.77%
basic.4y,technician,84.48%,15.52%


EDA Completed

In [None]:
# starting to work with train and test data
df_train, df_test = tts(df.drop(columns=['y']), test_size=0.25, random_state=100)

In [107]:
# define columns by numeric or categoric to factorize and then scale
data_types = df.dtypes
numeric_columns = [c for c in list(data_types[data_types != 'object'].index) if c != 'y']
categoric_columns = [c for c in list(data_types[data_types == 'object'].index) if c!= 'y']

In [108]:
df_train_categ_factorized = df_train[categoric_columns].apply(lambda col: pd.factorize(col)[0])
df_test_categ_factorized = df_test[categoric_columns].apply(lambda col: pd.factorize(col)[0])
df_train_categ_factorized

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome
6673,0,0,0,0,0,0,0,0,0,0
31941,1,0,0,0,0,0,0,0,1,0
40893,2,1,1,1,1,0,1,1,2,1
35237,1,1,2,1,1,0,1,0,2,0
37623,3,0,3,1,0,0,1,2,3,0
...,...,...,...,...,...,...,...,...,...,...
16307,0,1,0,1,1,0,1,3,4,0
79,6,0,0,0,0,0,0,0,3,0
12120,5,2,3,1,1,0,0,4,3,0
14149,0,0,2,0,0,1,1,3,3,0


In [109]:
# an inner_join to present all numeric data in 'one' pd to take to scale further
total_data_train = df_train_categ_factorized.join(df_train[numeric_columns]).join(df['y'])
total_data_test = df_test_categ_factorized.join(df_test[numeric_columns]).join(df['y'])
total_data_test

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome,...,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
22409,0,0,0,0,0,0,0,0,0,0,...,78,1,999,0,1.4,93.444,-36.1,4.964,5228.1,no
7844,0,0,1,0,0,1,1,1,1,0,...,464,1,999,0,1.4,94.465,-41.8,4.865,5228.1,no
37808,1,0,2,0,0,0,0,0,2,1,...,291,2,3,2,-2.9,92.201,-31.4,0.838,5076.2,yes
29235,1,0,3,0,0,0,0,2,0,2,...,224,3,999,2,-1.8,93.075,-47.1,1.405,5099.1,no
39374,2,0,2,0,0,0,1,3,1,0,...,113,4,999,0,-1.8,93.369,-34.8,0.635,5008.7,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17050,1,0,0,1,1,0,0,7,0,0,...,823,3,999,0,1.4,93.918,-42.7,4.962,5228.1,no
37697,1,0,2,0,1,0,0,0,4,0,...,321,2,999,0,-2.9,92.201,-31.4,0.854,5076.2,no
5800,9,0,6,0,0,0,1,5,1,0,...,48,2,999,0,1.1,93.994,-36.4,4.857,5191.0,no
38719,5,1,2,0,1,0,1,8,2,0,...,345,2,999,0,-3.4,92.649,-30.1,0.715,5017.5,no


In [110]:
# Let's Scale the data
scaler = StandardScaler()

In [111]:
# We scale numeric columns
total_data_train_scaled = scaler.fit_transform(total_data_train.drop('y',axis=1))
total_data_train_scaled = pd.DataFrame(total_data_train_scaled, columns=total_data_train.columns.drop('y'))
total_data_test_scaled = scaler.transform(total_data_test.drop('y',axis=1)) # to use the same scale transformed 
total_data_test_scaled = pd.DataFrame(total_data_test_scaled, columns=total_data_test.columns.drop('y'))
total_data_test_scaled

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,-1.196533,-0.734029,-1.660080,-1.951610,-1.062216,-0.439241,-1.317599,-1.085281,-1.410514,-0.374514,-0.289499,-0.697610,-0.566038,0.195600,-0.351135,0.839871,-0.229346,0.954871,0.774643,0.845544
1,-1.196533,-0.734029,-1.125424,-1.951610,-1.062216,1.757817,0.758956,-0.675183,-0.703311,-0.374514,-0.864100,0.786043,-0.566038,0.195600,-0.351135,0.839871,1.535586,-0.278285,0.717572,0.845544
2,-0.886786,-0.734029,-0.590767,-1.951610,-1.062216,-0.439241,-1.317599,-1.085281,0.003893,1.818043,0.285102,0.121090,-0.203915,-5.128554,3.667304,-1.900552,-2.378034,1.971683,-1.603886,-1.254242
3,-0.886786,-0.734029,-0.056111,-1.951610,-1.062216,-0.439241,-1.317599,-0.265085,-1.410514,4.010601,0.763936,-0.136435,0.158207,0.195600,3.667304,-1.199514,-0.867211,-1.424903,-1.277026,-0.937684
4,-0.577038,-0.734029,-0.590767,-1.951610,-1.062216,-0.439241,0.758956,0.145012,-0.703311,-0.374514,1.721605,-0.563082,0.520330,0.195600,-0.351135,-1.199514,-0.358993,1.236117,-1.720910,-2.187326
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10289,-0.886786,-0.734029,-1.660080,0.511793,0.789094,-0.439241,-1.317599,1.785403,-1.410514,-0.374514,0.093568,2.165916,0.158207,0.195600,-0.351135,0.839871,0.590025,-0.472993,0.773490,0.845544
10290,-0.886786,-0.734029,-0.590767,-1.951610,0.789094,-0.439241,-1.317599,-1.085281,1.418301,-0.374514,-0.672567,0.236399,-0.203915,0.195600,-0.351135,-1.900552,-2.378034,1.971683,-1.594663,-1.254242
10291,1.591195,-0.734029,1.547858,-1.951610,-1.062216,-0.439241,0.758956,0.965207,-0.703311,-0.374514,1.721605,-0.812919,-0.203915,0.195600,-0.351135,0.648679,0.721401,0.889968,0.712960,0.332693
10292,0.352205,0.702036,-0.590767,-1.951610,0.789094,-0.439241,0.758956,2.195500,0.003893,-0.374514,-1.438702,0.328647,-0.203915,0.195600,-0.351135,-2.219206,-1.603607,2.252929,-1.674792,-2.065679


In [112]:
total_data_test.y = total_data_test.y.replace(('yes','no'),(1,0))
total_data_train.y = total_data_train.y.replace(('yes','no'),(1,0))
total_data_train.y

6673     0
31941    0
40893    0
35237    0
37623    0
        ..
16307    0
79       0
12120    0
14149    0
38420    1
Name: y, Length: 30882, dtype: int64

In [113]:
# Indep and dependent variables
X_train = total_data_train_scaled
y_train = total_data_train.y # this was the original line till I checked it in chatgpt, which mentioned wasn't necessary cause I haven't changed the indexes: total_data_train.y.loc[X_train.index]

X_test = total_data_test_scaled
y_test = total_data_test.y

In [114]:
# xgb to evaluate the importance/impact of every feature
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=100).fit(X_train, y_train)
df_importance = pd.DataFrame({'Feature':X_train.columns, 'Importance':xgb_model.feature_importances_*100})
df_importance = df_importance.sort_values(by='Importance', ascending=False)

# Let's focus of 30 most important
aux = df_importance.sort_values('Importance', ascending=False).head(30)
df_importance = df_importance.loc[aux.index]
df_importance


Unnamed: 0,Feature,Importance
19,nr.employed,73.680946
11,duration,4.122779
13,pdays,2.453902
17,cons.conf.idx,2.391132
18,euribor3m,1.667777
9,poutcome,1.494949
16,cons.price.idx,1.414721
7,month,1.290402
3,default,1.251059
6,contact,1.167755


In [None]:
# we preserve only the important features 
X_train = X_train[df_importance.Feature.values]
X_test = X_test[df_importance.Feature.values]

In [129]:
# we evaluate them in VIF
vif = pd.Series([variance_inflation_factor(X_train.corr().values, i) for i in range(X_train.corr().shape[1])], index=X_train.columns)
vif

duration       1.022731
default        1.195276
age            1.508851
day_of_week    1.007448
campaign       1.024420
education      1.031338
loan           1.381674
job            1.458412
housing        1.382209
marital        1.135892
dtype: float64

In [None]:
# filtering and keeping the variables with acceptable VIF 
X_train = X_train.loc[:, vif < 5]
X_train

Unnamed: 0,duration,default,age,day_of_week,campaign,education,loan,job,housing,marital
0,-0.059562,-1.951610,0.093568,-1.410514,1.244576,-1.660080,-0.439241,-1.196533,-1.062216,-0.734029
1,-0.955135,-1.951610,-1.151401,-0.703311,-0.203915,-1.660080,-0.439241,-0.886786,-1.062216,-0.734029
2,0.970538,0.511793,-1.247168,0.003893,-0.566038,-1.125424,-0.439241,-0.577038,0.789094,0.702036
3,-0.278651,0.511793,-0.959867,0.003893,3.055190,-0.590767,-0.439241,-0.886786,0.789094,0.702036
4,-0.524645,0.511793,-0.576800,0.711097,-0.566038,-0.056111,-0.439241,-0.267290,-1.062216,-0.734029
...,...,...,...,...,...,...,...,...,...,...
30877,-0.828294,0.511793,-1.247168,1.418301,0.520330,-1.660080,-0.439241,-1.196533,0.789094,0.702036
30878,-0.197934,-1.951610,0.189335,0.711097,-0.566038,-1.660080,-0.439241,0.661952,-1.062216,-0.734029
30879,-0.543863,0.511793,-0.768334,0.711097,-0.566038,-0.056111,-0.439241,0.352205,0.789094,2.138101
30880,-0.674548,-1.951610,-0.481033,0.711097,-0.203915,-0.590767,1.757817,-1.196533,-1.062216,-0.734029


In [118]:
X_test = X_test.loc[:, vif < 5]
X_test

Unnamed: 0,duration,default,age,day_of_week,campaign,education,loan,job,housing,marital
0,-0.697610,-1.951610,-0.289499,-1.410514,-0.566038,-1.660080,-0.439241,-1.196533,-1.062216,-0.734029
1,0.786043,-1.951610,-0.864100,-0.703311,-0.566038,-1.125424,1.757817,-1.196533,-1.062216,-0.734029
2,0.121090,-1.951610,0.285102,0.003893,-0.203915,-0.590767,-0.439241,-0.886786,-1.062216,-0.734029
3,-0.136435,-1.951610,0.763936,-1.410514,0.158207,-0.056111,-0.439241,-0.886786,-1.062216,-0.734029
4,-0.563082,-1.951610,1.721605,-0.703311,0.520330,-0.590767,-0.439241,-0.577038,-1.062216,-0.734029
...,...,...,...,...,...,...,...,...,...,...
10289,2.165916,0.511793,0.093568,-1.410514,0.158207,-1.660080,-0.439241,-0.886786,0.789094,-0.734029
10290,0.236399,-1.951610,-0.672567,1.418301,-0.203915,-0.590767,-0.439241,-0.886786,0.789094,-0.734029
10291,-0.812919,-1.951610,1.721605,-0.703311,-0.203915,1.547858,-0.439241,1.591195,-1.062216,-0.734029
10292,0.328647,-1.951610,-1.438702,0.003893,-0.203915,-0.590767,-0.439241,0.352205,0.789094,0.702036


In [None]:
# to verify the correlation between variables
X_corr = X_train.copy()
X_corr['y'] = y_train
corr = X_corr.corr()
corr.style.background_gradient(cmap='coolwarm').format(precision=3)

Unnamed: 0,duration,default,age,day_of_week,campaign,education,loan,job,housing,marital,y
duration,1.0,0.017,-0.003,-0.018,-0.068,-0.009,-0.007,0.011,-0.014,0.002,-0.003
default,0.017,1.0,-0.165,-0.01,-0.035,0.045,-0.001,-0.123,0.009,0.106,-0.005
age,-0.003,-0.165,1.0,0.035,0.006,0.04,-0.006,0.273,-0.005,-0.116,0.004
day_of_week,-0.018,-0.01,0.035,1.0,-0.0,-0.001,0.004,-0.001,-0.004,0.004,-0.001
campaign,-0.068,-0.035,0.006,-0.0,1.0,0.001,0.005,-0.008,-0.006,-0.0,0.004
education,-0.009,0.045,0.04,-0.001,0.001,1.0,0.009,-0.045,0.004,0.016,-0.011
loan,-0.007,-0.001,-0.006,0.004,0.005,0.009,1.0,-0.009,0.284,0.001,0.003
job,0.011,-0.123,0.273,-0.001,-0.008,-0.045,-0.009,1.0,-0.002,-0.133,0.003
housing,-0.014,0.009,-0.005,-0.004,-0.006,0.004,0.284,-0.002,1.0,0.006,0.0
marital,0.002,0.106,-0.116,0.004,-0.0,0.016,0.001,-0.133,0.006,1.0,-0.005


### Nice!

In [None]:
# working with the model
model = LogisticRegression()
m_log_reg = model.fit(X_train, y_train)


In [None]:
# predicting
y_pred_l_train = m_log_reg.predict(X_train) 
y_pred_l_test = m_log_reg.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [None]:
# to see how it went
accuracy_score(y_test, y_pred_l_test)


0.8923644841655333

In [None]:
# optimizing hyperparams
hyperparams = {
    "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "penalty": ["l1", "l2", "elasticnet", None],
    "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
}

grid = GridSearchCV(model, hyperparams, scoring = "accuracy", cv = 10)
grid

In [124]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

grid.fit(X_train, y_train)

print(f"Best hyperparameters: {grid.best_params_}")

Best hyperparameters: {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}


In [None]:
# working with best hp
model_optimized = LogisticRegression(C=0.01, penalty='l2', solver='liblinear')
m_log_reg_optimized = model_optimized.fit(X_train, y_train)

In [None]:
y_train_pred_opt = m_log_reg_optimized.predict(X_train) 
y_test_pred_opt = m_log_reg_optimized.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [127]:
print(f'''
      First Model Accuracy:
      {accuracy_score(y_test, y_pred_l_test)}
      
      Optimized Model Accuracy:
      {accuracy_score(y_test, y_test_pred_opt)}
      
      Optimization Gain:
      {(accuracy_score(y_test, y_test_pred_opt) - accuracy_score(y_test, y_pred_l_test))}
      ''')



      First Model Accuracy:
      0.8923644841655333
      
      Optimized Model Accuracy:
      0.8925587721002526
      
      Optimization Gain:
      0.00019428793471931716
      


In [128]:
dump(model, open("../models/log-regression-C-0.01-penalty-l2-solver-liblinear.sav", "wb"))
