In [7]:
import os
import time

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


In [8]:
# Location of bank-additional-full.csv. Set the BANK_DATA_PATH environment variable to run
# the notebook on another machine; when it is unset the original author's path is used.
DATA_PATH = os.environ.get(
    'BANK_DATA_PATH',
    'C:/Users/ameyr/Documents/Alex Documents/Machine Learning Class/Month 1/Module 17/module17_starter/data/bank-additional-full.csv',
)

# The file is semicolon-delimited, hence the explicit separator.
df = pd.read_csv(DATA_PATH, sep=';')

In [59]:
# Preview the first rows of the frame.
# NOTE(review): the saved output below was produced by execution In [59], i.e. after the
# cleaning cell had already run (row labels 1 and 5 are missing and the yes/no columns
# show 0/1), so it does not show the raw CSV. Re-run top to bottom to refresh it.
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,0,0,0,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
2,37,services,married,high.school,0,1,0,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
3,40,admin.,married,basic.6y,0,0,0,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
4,56,services,married,high.school,0,0,1,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
6,59,admin.,married,professional.course,0,0,0,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0


In [10]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [35]:
#Problem 1: How many marketing campaigns does this data represent?
# This adds up the per-row values of the `campaign` column (76874 in the saved output).
# NOTE(review): in the dataset's data dictionary `campaign` appears to be the number of
# contacts made to that client during the current campaign, so this total would be a
# contact count rather than a count of distinct campaigns. Confirm against the dataset docs.
df['campaign'].sum()

76874

In [11]:
# Drop every row that contains the placeholder string 'unknown' in any column.
# .copy() makes the filtered result an independent frame, so the column assignment below
# does not write into a slice of the loaded data (this is what triggered the pandas
# warning shown in this cell's saved output).
df = df[~df.isin(['unknown']).any(axis=1)].copy()

# Columns whose values are exclusively 'yes' / 'no'.
yes_no_cols = [col for col in df.columns if set(df[col].unique()) <= {'yes', 'no'}]

# Convert only those columns to integer 0/1 ('yes' -> 1, 'no' -> 0).
# The dtype is set explicitly instead of relying on `replace` to downcast object columns,
# so these columns are reliably numeric when cat_cols / num_cols are derived below.
# The guard makes re-running the cell a no-op once the columns are already encoded.
if yes_no_cols:
    df[yes_no_cols] = (df[yes_no_cols] == 'yes').astype('int64')

# Features are every column except the target `y`.
X = df.drop(columns=['y'])
y = df['y']

# Object-dtype columns are treated as categorical, everything else as numeric.
cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = X.select_dtypes(exclude='object').columns.tolist()

# Preprocessing shared by every pipeline in this notebook:
#   'cat' -> one-hot encode the categorical columns (categories unseen at fit time are ignored)
#   'num' -> standardise the numeric columns
# Any column not listed in cat_cols / num_cols is dropped.
cat_encoder = OneHotEncoder(handle_unknown='ignore')
num_scaler = StandardScaler()
preprocessor = ColumnTransformer(
    [('cat', cat_encoder, cat_cols), ('num', num_scaler, num_cols)],
    remainder='drop',
)

  df[yes_no_cols] = df[yes_no_cols].replace({'yes': 1, 'no': 0})


In [12]:
# Create Train / Test Split
# Hold out 25% of the rows for testing. Stratifying on y keeps the class ratio the same
# in both parts, and the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

In [13]:
# 7: Baseline Model: What is the subscription rate
# `y` holds 0/1 after the cleaning cell, so its mean is the share of rows with y == 1.
# A classifier that always predicts 0 would therefore reach an accuracy of 1 minus this
# rate on the cleaned frame; that is the accuracy the models below need to beat.
df['y'].mean()

0.1265743899239045

In [14]:
### Problem 8: A Simple Model
# Preprocessing followed by a logistic regression (max_iter raised to 1000), fitted on the
# training split. `fit` returns the pipeline itself, so `clf` is the fitted pipeline.
baseline_steps = [
    ('prep', preprocessor),
    ('model', LogisticRegression(max_iter=1000)),
]
clf = Pipeline(steps=baseline_steps).fit(X_train, y_train)
clf


In [17]:
## Problem 8b: Accuracy and Test Score of Simple Model
# Pipeline.score delegates to the final classifier's score, i.e. mean accuracy.
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
print("Train score:", train_score)
print("Test score:", test_score)

Train score: 0.9026064899851307
Test score: 0.8983206507478352


In [23]:
### Problem 10: Model Comparisons
# Fit each classifier with default settings behind the shared preprocessor and record
# fit time plus train / test accuracy.
# Models (default settings)
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'SVM': SVC()  # rbf kernel by default
}

rows = []
# The loop variable is deliberately not called `clf`: that name holds the fitted
# Problem 8 pipeline, and overwriting it with a bare estimator would break the
# Problem 8b scoring cell if it were re-run after this one.
for name, estimator in models.items():
    pipe = Pipeline(steps=[('prep', preprocessor), ('model', estimator)])

    # Time only the fit call (preprocessing + model training).
    t0 = time.perf_counter()
    pipe.fit(X_train, y_train)
    train_time = time.perf_counter() - t0

    train_acc = pipe.score(X_train, y_train)
    test_acc  = pipe.score(X_test, y_test)

    rows.append({
        'Model': name,
        'Train Time (s)': round(train_time, 4),
        'Train Accuracy': round(train_acc, 4),
        'Test Accuracy': round(test_acc, 4)
    })

# One row per model, best test accuracy first.
results_df = pd.DataFrame(rows).sort_values('Test Accuracy', ascending=False).reset_index(drop=True)
results_df

Unnamed: 0,Model,Train Time (s),Train Accuracy,Test Accuracy
0,SVM,14.9053,0.9165,0.8987
1,Logistic Regression,0.2889,0.9026,0.8983
2,KNN,0.0716,0.92,0.8885
3,Decision Tree,0.5173,1.0,0.8756


In [27]:
### Problem 11: Improving the Model
# Hyper-parameter grids for GridSearchCV. Every key carries the `model__` prefix so that
# it addresses the 'model' step of the pipelines built below.

# Logistic Regression
logit_grid = {
    'model__C': [0.01, 0.1, 1, 10],
    'model__penalty': ['l2'],
    'model__class_weight': [None, 'balanced']
}

# KNN
knn_grid = {
    'model__n_neighbors': [3,5,7,11,15],
    'model__weights': ['uniform', 'distance'],
    'model__p': [1,2]  # Manhattan vs Euclidean
}

# Decision Tree
dt_grid = {
    'model__max_depth': [3,5,8,12,None],
    'model__min_samples_split': [2,5,10],
    'model__min_samples_leaf': [1,2,5],
    'model__class_weight': [None, 'balanced'],
    'model__criterion': ['gini','entropy']
}

# SVM
# `gamma` has no effect on the linear kernel, so the two kernels get separate sub-grids.
# A single crossed dict would fit every linear setting once per gamma value and get the
# same model each time. GridSearchCV accepts a list of dicts and searches each in turn.
svm_grid = [
    {
        'model__kernel': ['linear'],
        'model__C': [0.1,1,10]
    },
    {
        'model__kernel': ['rbf'],
        'model__C': [0.1,1,10],
        'model__gamma': ['scale','auto']
    },
]

def tune(pipeline, param_grid, X, y, scoring='roc_auc'):
    """Grid-search ``pipeline`` over ``param_grid`` and return the best fitted estimator.

    Runs a 5-fold cross-validated GridSearchCV on all available cores, prints the
    winning parameters and their mean CV score, and returns ``best_estimator_``.

    Parameters
    ----------
    pipeline : estimator
        The estimator (here a preprocessing + model Pipeline) to tune.
    param_grid : dict or list of dict
        Parameter grid passed straight to GridSearchCV.
    X, y : array-like
        Training features and target the search is fitted on.
    scoring : str, default 'roc_auc'
        Metric used to rank parameter combinations.

    Returns
    -------
    The estimator stored in ``best_estimator_`` after the search.
    """
    gs = GridSearchCV(pipeline, param_grid, cv=5, scoring=scoring, n_jobs=-1)
    # Fit on the data passed by the caller. The previous version ignored X and y and
    # always fitted on the notebook-level X_train / y_train.
    gs.fit(X, y)
    print("Best params:", gs.best_params_)
    print("Best CV", scoring, ":", round(gs.best_score_,4))
    return gs.best_estimator_

# Examples
# One pipeline per model family, each pairing the shared preprocessor with its classifier.
logit_pipe = Pipeline(steps=[('prep', preprocessor), ('model', LogisticRegression(max_iter=1000))])
knn_pipe = Pipeline(steps=[('prep', preprocessor), ('model', KNeighborsClassifier())])
dt_pipe = Pipeline(steps=[('prep', preprocessor), ('model', DecisionTreeClassifier(random_state=42))])
# probability=True so the tuned SVM exposes predict_proba.
svm_pipe = Pipeline(steps=[('prep', preprocessor), ('model', SVC(probability=True, random_state=42))])

# Tune each pipeline on the training split, in the same order as before.
best_logit = tune(logit_pipe, logit_grid, X=X_train, y=y_train)
best_knn = tune(knn_pipe, knn_grid, X=X_train, y=y_train)
best_dt = tune(dt_pipe, dt_grid, X=X_train, y=y_train)
best_svm = tune(svm_pipe, svm_grid, X=X_train, y=y_train)

Best params: {'model__C': 1, 'model__class_weight': 'balanced', 'model__penalty': 'l2'}
Best CV roc_auc : 0.9341
Best params: {'model__n_neighbors': 15, 'model__p': 2, 'model__weights': 'distance'}
Best CV roc_auc : 0.9061
Best params: {'model__class_weight': 'balanced', 'model__criterion': 'entropy', 'model__max_depth': 5, 'model__min_samples_leaf': 5, 'model__min_samples_split': 2}
Best CV roc_auc : 0.9294
Best params: {'model__C': 1, 'model__gamma': 'auto', 'model__kernel': 'rbf'}
Best CV roc_auc : 0.9325


In [37]:
##Problem 11: Adjusting Performance Metrics
def eval_model(name, fitted, X_te, y_te):
    """Print accuracy, ROC AUC, average precision and a classification report.

    ``fitted`` is a fitted pipeline with a 'model' step. Ranking metrics use the second
    column of ``predict_proba`` when the model step provides it, and otherwise the
    output of ``decision_function``. Nothing is returned.
    """
    predictions = fitted.predict(X_te)
    final_step = fitted.named_steps['model']
    scores = (
        fitted.predict_proba(X_te)[:,1]
        if hasattr(final_step, 'predict_proba')
        else fitted.decision_function(X_te)
    )

    print(f"\n[{name}]")
    # (label, metric function, estimate fed to it), printed in this order.
    summary_metrics = (
        ("Accuracy:", accuracy_score, predictions),
        ("ROC AUC:", roc_auc_score, scores),
        ("PR AUC (AP):", average_precision_score, scores),
    )
    for label, metric, estimate in summary_metrics:
        print(label, round(metric(y_te, estimate),4))
    print(classification_report(y_te, predictions, digits=4))

# Report every tuned model on the held-out test split, in a fixed order.
tuned_models = (
    ("LogReg (tuned)", best_logit),
    ("KNN (tuned)", best_knn),
    ("DecisionTree (tuned)", best_dt),
    ("SVM (tuned)", best_svm),
)
for tuned_label, tuned_pipe in tuned_models:
    eval_model(tuned_label, tuned_pipe, X_test, y_test)


[LogReg (tuned)]
Accuracy: 0.8461
ROC AUC: 0.9251
PR AUC (AP): 0.5749
              precision    recall  f1-score   support

           0     0.9770    0.8436    0.9054      6657
           1     0.4445    0.8632    0.5868       965

    accuracy                         0.8461      7622
   macro avg     0.7108    0.8534    0.7461      7622
weighted avg     0.9096    0.8461    0.8651      7622


[KNN (tuned)]
Accuracy: 0.8897
ROC AUC: 0.8977
PR AUC (AP): 0.5578
              precision    recall  f1-score   support

           0     0.9150    0.9632    0.9385      6657
           1     0.6010    0.3824    0.4674       965

    accuracy                         0.8897      7622
   macro avg     0.7580    0.6728    0.7029      7622
weighted avg     0.8752    0.8897    0.8788      7622


[DecisionTree (tuned)]
Accuracy: 0.8418
ROC AUC: 0.923
PR AUC (AP): 0.5852
              precision    recall  f1-score   support

           0     0.9801    0.8358    0.9022      6657
           1     0.438

In [57]:
# Inspect the tuned logistic regression: intercept plus the 25 largest coefficients.
# numpy (np) comes from the import cell at the top of the notebook.
# NOTE: this cell rebinds `pipe` and `clf`, names that earlier cells also use.
pipe = best_logit  # this is what tune(...) returned (already fitted)

# Sanity check: see step names
print("Pipeline steps:", list(pipe.named_steps.keys()))
# Expect: ['prep', 'model']

# Final estimator (logistic regression) and guard that it's fitted
clf = pipe.named_steps['model']
if not hasattr(clf, "coef_"):
    raise RuntimeError("LogisticRegression is not fitted. Make sure you're using best_logit (from GridSearchCV).")

# Intercept and coefficients
intercept = float(np.ravel(clf.intercept_)[0]) if hasattr(clf, "intercept_") else None
coefs = np.ravel(clf.coef_)  # flattened to 1-D, one value per transformed feature

# Get transformed feature names from the ColumnTransformer ('prep')
prep = pipe.named_steps['prep']

# Try modern API first
try:
    feature_names = prep.get_feature_names_out()
except Exception:
    # Fallback: build names manually from the two transformers ('cat' and 'num').
    # Local names are used here so the notebook-level cat_cols / num_cols lists
    # are not overwritten as a side effect of this inspection cell.
    ohe = prep.named_transformers_['cat']
    fitted_cat_cols = prep.transformers_[0][2]
    fitted_num_cols = prep.transformers_[1][2]
    ohe_names = ohe.get_feature_names_out(fitted_cat_cols)
    feature_names = np.concatenate([ohe_names, np.array(fitted_num_cols)])

# Build and display coefficients table, largest magnitude first
coef_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefs,
    'Abs_Coefficient': np.abs(coefs)
}).sort_values('Abs_Coefficient', ascending=False)

print("Intercept:", intercept)
print(coef_df[['Feature', 'Coefficient']].head(25))

Pipeline steps: ['prep', 'model']
Intercept: -0.4751396165163223
                      Feature  Coefficient
49          num__emp.var.rate    -3.458324
28             cat__month_mar     1.915283
45              num__duration     1.791355
50        num__cons.price.idx     1.469870
27             cat__month_jun    -1.068432
29             cat__month_may    -1.046968
30             cat__month_nov    -0.842928
24             cat__month_aug     0.736893
52             num__euribor3m     0.693228
38      cat__poutcome_failure    -0.513324
53           num__nr.employed     0.507043
22     cat__contact_telephone    -0.463368
5            cat__job_retired     0.400224
8            cat__job_student     0.333625
26             cat__month_jul    -0.316187
31             cat__month_oct     0.315535
1        cat__job_blue-collar    -0.314050
47                 num__pdays    -0.269075
6      cat__job_self-employed    -0.254565
32             cat__month_sep     0.230945
16    cat__education_basic.9y   