In [1]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
)
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    PolynomialFeatures,
    StandardScaler,
)

# Load the dataset from the working directory; fields are separated by ';'.
df = pd.read_csv("./bank-full.csv", sep=";")
# Disabled down-sampling for quick experiments. NOTE(review): it refers to
# `full_df`, which is not defined anywhere in the visible notebook.
#df = full_df.sample(500, random_state=42)

In [2]:
def print_error_metrics(y_true, y_pred, model_name="Model", pos_label="yes"):
    """Print F1, precision, recall and accuracy for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_true``.
    model_name : str, default "Model"
        Label used in the header line of the report.
    pos_label : default "yes"
        Class treated as the positive one for F1, precision and recall.
        Accuracy does not depend on it.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    # Compute every metric before printing so a failure in any of them does
    # not leave a half-written report behind.
    f1 = f1_score(y_true, y_pred, pos_label=pos_label)
    precision = precision_score(y_true, y_pred, pos_label=pos_label)
    recall = recall_score(y_true, y_pred, pos_label=pos_label)
    accuracy = accuracy_score(y_true, y_pred)

    print(f"{model_name} Performance:")
    print(f"F1 Score: {f1:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"Accuracy: {accuracy:.2f}")

In [9]:
# Columns fed to the model, grouped by how they are preprocessed.
numeric_features = [
    "age",
    "balance",
    "campaign",
    "pdays",
    "previous",
]
categorical_features = [
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "month",
    "day",
    "poutcome",
]
all_features = [*numeric_features, *categorical_features]
y_target = df["y"]

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[all_features],
    y_target,
    test_size=0.2,
    random_state=42,
)

In [4]:
# 1. Baseline gradient-boosting classifier wrapped in a preprocessing pipeline
nonlinear_classifier = GradientBoostingClassifier(random_state=42)


def build_model_pipeline(classifier=None):
    """Build an unfitted preprocessing + classification pipeline.

    Columns listed in ``numeric_features`` are median-imputed, standardised
    and expanded with degree-2 polynomial terms. Columns listed in
    ``categorical_features`` are imputed with their most frequent value and
    one-hot encoded (first level dropped).

    Parameters
    ----------
    classifier : estimator, optional
        Template for the final ``"model"`` step. When omitted, the
        module-level ``nonlinear_classifier`` is used.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with the steps ``"preprocessor"`` and ``"model"``.
    """
    if classifier is None:
        classifier = nonlinear_classifier

    # Preprocessing: separate sub-pipelines for numeric and categorical columns
    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("poly_features", PolynomialFeatures(degree=2, include_bias=False)),
    ])

    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore")),
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("numeric", numeric_pipeline, numeric_features),
            ("categorical", categorical_pipeline, categorical_features),
        ]
    )

    # Pipeline does not copy its steps, so the template is cloned here.
    # Otherwise every pipeline built from the same template would share one
    # estimator object, and fitting one pipeline would overwrite the fitted
    # model inside the others.
    return Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("model", clone(classifier)),
        ]
    )

In [5]:
# Build a fresh pipeline and train it on the training split.
model_pipeline = build_model_pipeline()
model_pipeline.fit(X_train, y_train)

# Score the held-out split and report the classification metrics.
prediction = model_pipeline.predict(X_test)
print_error_metrics(y_true=y_test, y_pred=prediction, model_name="Model")


KeyboardInterrupt: 

In [10]:
# 1. Tuned gradient-boosting classifier wrapped in a preprocessing pipeline
nonlinear_classifier = GradientBoostingClassifier(
    random_state=42,
    n_estimators=300,
    max_depth=7,
    max_features=0.5,
    min_samples_leaf=5,
)


def build_model_pipeline(classifier=None):
    """Build an unfitted preprocessing + classification pipeline.

    Columns listed in ``numeric_features`` are median-imputed and expanded
    with degree-2 polynomial terms (no scaling step in this variant). Columns
    listed in ``categorical_features`` are imputed with their most frequent
    value and one-hot encoded (first level dropped).

    Parameters
    ----------
    classifier : estimator, optional
        Template for the final ``"model"`` step. When omitted, the
        module-level ``nonlinear_classifier`` is used.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with the steps ``"preprocessor"`` and ``"model"``.
    """
    if classifier is None:
        classifier = nonlinear_classifier

    # Preprocessing: separate sub-pipelines for numeric and categorical columns
    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("poly_features", PolynomialFeatures(degree=2, include_bias=False)),
    ])

    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore")),
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("numeric", numeric_pipeline, numeric_features),
            ("categorical", categorical_pipeline, categorical_features),
        ]
    )

    # Pipeline does not copy its steps, so the template is cloned here.
    # Otherwise every pipeline built from the same template would share one
    # estimator object, and fitting one pipeline would overwrite the fitted
    # model inside the others.
    return Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("model", clone(classifier)),
        ]
    )

In [11]:
# Build a fresh pipeline and train it on the training split.
model_pipeline = build_model_pipeline()
model_pipeline.fit(X_train, y_train)

# Score the held-out split and report the classification metrics.
prediction = model_pipeline.predict(X_test)
print_error_metrics(y_true=y_test, y_pred=prediction, model_name="Model")


Model Performance:
F1 Score: 0.36
Precision: 0.61
Recall: 0.25
Accuracy: 0.89


In [None]:
# Metric summaries kept as bare string literals for side-by-side comparison;
# they appear to be pasted from earlier runs and are not computed by this cell.
""" Model Performance:
F1 Score: 0.36
Precision: 0.61
Recall: 0.25
Accuracy: 0.89"""

"""Duration Only Model Performance:
F1 Score: 0.26
Precision: 0.54
Recall: 0.17
Accuracy: 0.88"""

In [None]:
# Hyper-parameter grid for the "model" step of the pipeline.
param_grid = {
    "model__n_estimators": [100, 200, 300],
    "model__max_depth": [3, 5, 7],
    # NOTE(review): an integer max_features is an absolute feature count, so
    # 1 means "one feature per split"; use 1.0 if "all features" was intended.
    "model__max_features": ["sqrt", 0.5, 1],
    "model__min_samples_leaf": [5, 10, 20],
}

# This is a classification task with string labels, so the regression metric
# R^2 does not apply. Optimise F1 on the positive class "yes" instead, which
# matches what print_error_metrics reports.
grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    scoring=make_scorer(f1_score, pos_label="yes"),
    cv=5,
    verbose=2,  # Increase verbosity to see progress
    n_jobs=-1,  # Use all available CPU cores for speed
)
grid_search.fit(X_train, y_train)

# GridSearchCV.predict delegates to the best estimator found by the search.
y_pred_v3 = grid_search.predict(X_test)
print_error_metrics(y_test, y_pred_v3)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


KeyboardInterrupt: 

In [25]:
# Single-feature experiment: predict the target from "duration" alone.
x = "duration"
y = "y"

# Same test fraction and seed as the all-features split.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    df[[x]], df[y], test_size=0.2, random_state=42
)


def build_model_pipeline2(classifier=None):
    """Build an unfitted pipeline for the single-feature experiment.

    The input is median-imputed and standardised before it reaches the
    classifier.

    Parameters
    ----------
    classifier : estimator, optional
        Template for the final ``"model"`` step. When omitted, the
        module-level ``nonlinear_classifier`` is used, so this experiment
        runs with the same hyper-parameters as the all-features model.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with the steps ``"imputer"``, ``"scaler"`` and ``"model"``.
    """
    if classifier is None:
        classifier = nonlinear_classifier

    # Clone the template: Pipeline does not copy its steps, so embedding the
    # shared estimator directly would make fitting this pipeline overwrite
    # the fitted model inside any other pipeline built from it.
    return Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
            ("model", clone(classifier)),
        ]
    )



In [None]:
# Train the single-feature pipeline and report its hold-out metrics.
model2 = build_model_pipeline2()
model2.fit(X_train2, y_train2)

prediction2 = model2.predict(X_test2)
print_error_metrics(
    y_true=y_test2,
    y_pred=prediction2,
    model_name="Duration Only Model",
)

In [None]:
# Metric summaries kept as bare string literals for side-by-side comparison;
# they appear to be pasted from earlier runs and are not computed by this cell.
""" Model Performance:
F1 Score: 0.36
Precision: 0.61
Recall: 0.25
Accuracy: 0.89"""

"""Duration Only Model Performance:
F1 Score: 0.26
Precision: 0.54
Recall: 0.17
Accuracy: 0.88"""