### Necessary libraries

In [63]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

# options
pd.set_option('display.max_columns', 200)

### Reading Data

In [64]:
raw_data = pd.read_csv('https://raw.githubusercontent.com/4GeeksAcademy/logistic-regression-project-tutorial/main/bank-marketing-campaign-data.csv', sep = ';')

In [65]:
raw_data.to_csv('../data/interim/total_data.csv', index = False)

### Exploring Data

In [66]:
total_data = pd.read_csv('../data/interim/total_data.csv')
total_data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [67]:
total_data.shape

(41188, 21)

In [68]:
total_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

### Check for Null Values

In [69]:
100 * total_data.isnull().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

### Check Duplicates

In [70]:
total_data[total_data.duplicated()]

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
1266,39,blue-collar,married,basic.6y,no,no,no,telephone,may,thu,124,1,999,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0,no
12261,36,retired,married,unknown,no,no,no,telephone,jul,thu,88,1,999,0,nonexistent,1.4,93.918,-42.7,4.966,5228.1,no
14234,27,technician,single,professional.course,no,no,no,cellular,jul,mon,331,2,999,0,nonexistent,1.4,93.918,-42.7,4.962,5228.1,no
16956,47,technician,divorced,high.school,no,yes,no,cellular,jul,thu,43,3,999,0,nonexistent,1.4,93.918,-42.7,4.962,5228.1,no
18465,32,technician,single,professional.course,no,yes,no,cellular,jul,thu,128,1,999,0,nonexistent,1.4,93.918,-42.7,4.968,5228.1,no
20216,55,services,married,high.school,unknown,no,no,cellular,aug,mon,33,1,999,0,nonexistent,1.4,93.444,-36.1,4.965,5228.1,no
20534,41,technician,married,professional.course,no,yes,no,cellular,aug,tue,127,1,999,0,nonexistent,1.4,93.444,-36.1,4.966,5228.1,no
25217,39,admin.,married,university.degree,no,no,no,cellular,nov,tue,123,2,999,0,nonexistent,-0.1,93.2,-42.0,4.153,5195.8,no
28477,24,services,single,high.school,no,yes,no,cellular,apr,tue,114,1,999,0,nonexistent,-1.8,93.075,-47.1,1.423,5099.1,no
32516,35,admin.,married,university.degree,no,yes,no,cellular,may,fri,348,4,999,0,nonexistent,-1.8,92.893,-46.2,1.313,5099.1,no


In [71]:
# Amount of duplicates
total_data.duplicated().sum()

12

In [72]:
# Eliminate duplicates
total_data.drop_duplicates(inplace = True)

In [73]:
total_data['job'] = pd.factorize(total_data['job'])[0]
total_data['marital'] = pd.factorize(total_data['marital'])[0]
total_data['education'] = pd.factorize(total_data['education'])[0]
total_data['default'] = pd.factorize(total_data['default'])[0]
total_data['housing'] = pd.factorize(total_data['housing'])[0]
total_data['loan'] = pd.factorize(total_data['loan'])[0]
total_data['contact'] = pd.factorize(total_data['contact'])[0]
total_data['month'] = pd.factorize(total_data['month'])[0]
total_data['day_of_week'] = pd.factorize(total_data['day_of_week'])[0]
total_data['poutcome'] = pd.factorize(total_data['poutcome'])[0]
total_data['y'] = pd.factorize(total_data['y'])[0]

In [74]:
columnas = ['age', 'job', 'marital', 'education', 'default', 'housing', 'loan','contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y']

# Min-Max Scaler
scaler = MinMaxScaler()
scal_features = scaler.fit_transform(total_data[columnas])
total_data_scaler = pd.DataFrame(data = scal_features, index = total_data.index, columns = columnas)
total_data_scaler.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,0.481481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05307,0.0,1.0,0.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
1,0.493827,0.090909,0.0,0.142857,0.5,0.0,0.0,0.0,0.0,0.0,0.030297,0.0,1.0,0.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
2,0.246914,0.090909,0.0,0.142857,0.0,0.5,0.0,0.0,0.0,0.0,0.045954,0.0,1.0,0.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
3,0.283951,0.181818,0.0,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.030704,0.0,1.0,0.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
4,0.481481,0.090909,0.0,0.142857,0.0,0.0,0.5,0.0,0.0,0.0,0.062424,0.0,1.0,0.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0


In [75]:
# Feature Selection

X = total_data_scaler.drop(['y'], axis = 1)
y = total_data_scaler['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [76]:
train_data = pd.concat([X_train, y_train], axis = 1)
test_data = pd.concat([X_test, y_test], axis = 1)

In [77]:
train_data.to_csv('../data/processed/clean_bank_train.csv', index = False)
test_data.to_csv('../data/processed/clean_bank_test.csv', index = False)

## Logistic Regression Model

In [78]:
train_data = pd.read_csv('../data/processed/clean_bank_train.csv')
test_data = pd.read_csv('../data/processed/clean_bank_test.csv')

In [79]:
# Data Preparation
X_train = train_data.drop(['y'], axis = 1)
y_train = train_data['y']
X_test = test_data.drop(['y'], axis = 1)
y_test = test_data['y']

In [81]:
# Initialization and training the model
model = LogisticRegression()
model.fit(X_train, y_train)

In [82]:
# Model prediction
y_pred = model.predict(X_test)
y_pred

array([1., 0., 0., ..., 0., 0., 0.])

In [83]:
base_accuracy = accuracy_score(y_test, y_pred)
print('Base Accuracy {:.2f}%'.format(base_accuracy * 100))

Base Accuracy 90.24%


## Model Optimizzation

In [87]:
# Hyperparameter tuning
hyperparams = {
  'C': np.linspace(0.001, 1000, 50),
  'penalty': ['l1', 'l2', 'elasticnet', None],
  'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

In [88]:
random_search = RandomizedSearchCV(
  estimator = model,
  param_distributions = hyperparams,
  n_iter = 100,
  scoring = 'accuracy',
  cv = 5,
  random_state = 42
)

In [89]:
# Suppress warnings
def warn(*args, **kwargs):
  pass

warnings.warn = warn

In [90]:
# Perform random search
random_search.fit(X_train, y_train)
print(f'Best Hyperparameters: {random_search.best_params_}')

Best Hyperparameters: {'solver': 'sag', 'penalty': None, 'C': 306.1231428571428}


In [91]:
# Train the model with best hyperparameters
best_params = random_search.best_params_

model_random_search = LogisticRegression(
  solver = best_params['solver'],
  penalty = best_params['penalty'],
  C = best_params['C']
)

model_random_search.fit(X_train, y_train)
random_y_pred = model_random_search.predict(X_test)

In [92]:
# Evaluate the model
random_accuracy_score = accuracy_score(y_test, random_y_pred)
print('Random Accuracy Score: {:.2f}%'.format(random_accuracy_score * 100))

Random Accuracy Score: 90.38%


In [93]:
from pickle import dump

dump(model_random_search, open(f'C:/Users/angel/Banking-Marketing-Campaign/models/logistic_regression_C-{best_params['C']}_penalty-{best_params['penalty']}_solver-{best_params['solver']}.sav', 'wb'))