In [None]:
!pip install category_encoders scikit-learn plotly

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.6.0-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.2/81.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.0


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, fbeta_score
import plotly.graph_objects as go
from sklearn.metrics import roc_curve, auc
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import PowerTransformer

In [None]:
# Original bank-marketing data and its feature-engineered counterpart; both frames are later
# passed to model_evaluation with 'term_deposit' as the target column.
# NOTE(review): absolute /content paths -- presumably a Colab session; adjust when running elsewhere.
df_original = pd.read_csv('/content/European_bank_marketing.csv')
df_featured = pd.read_csv('/content/ml_final.csv')

In [None]:
# Preview of the feature-engineered frame (the rendering below is truncated by pandas).
df_featured

Unnamed: 0,duration_minutes,age emp.var.rate,previous euribor3m,edu_dur,age pdays,month,pdays,pdays cons.conf.idx,pdays euribor3m,cons.price.idx euribor3m,...,age cons.conf.idx,emp.var.rate euribor3m,age euribor3m,campaign nr.employed,emp.var.rate cons.price.idx,pdays cons.price.idx,nr.employed,euribor3m,campaign cons.conf.idx,term_deposit
0,4.350000,61.6,0.000,basic.4y0-5,55944.0,may,999,-36363.6,4852.143,456.528858,...,-2038.4,5.3427,271.992,5191.0,103.3934,93900.006,5191.0,4.857,-36.4,0
1,2.483333,62.7,0.000,high.school0-5,56943.0,may,999,-36363.6,4852.143,456.528858,...,-2074.8,5.3427,276.849,5191.0,103.3934,93900.006,5191.0,4.857,-36.4,0
2,3.766667,40.7,0.000,high.school0-5,36963.0,may,999,-36363.6,4852.143,456.528858,...,-1346.8,5.3427,179.709,5191.0,103.3934,93900.006,5191.0,4.857,-36.4,0
3,2.516667,44.0,0.000,basic.6y0-5,39960.0,may,999,-36363.6,4852.143,456.528858,...,-1456.0,5.3427,194.280,5191.0,103.3934,93900.006,5191.0,4.857,-36.4,0
4,5.116667,61.6,0.000,high.school5-10,55944.0,may,999,-36363.6,4852.143,456.528858,...,-2038.4,5.3427,271.992,5191.0,103.3934,93900.006,5191.0,4.857,-36.4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,5.566667,-80.3,0.000,professional.course5-10,72927.0,nov,999,-50749.2,1026.972,97.420476,...,-3708.4,-1.1308,75.044,4963.6,-104.2437,94672.233,4963.6,1.028,-50.8,1
41184,6.383333,-50.6,0.000,professional.course5-10,45954.0,nov,999,-50749.2,1026.972,97.420476,...,-2336.8,-1.1308,47.288,4963.6,-104.2437,94672.233,4963.6,1.028,-50.8,0
41185,3.150000,-61.6,0.000,university.degree0-5,55944.0,nov,999,-50749.2,1026.972,97.420476,...,-2844.8,-1.1308,57.568,9927.2,-104.2437,94672.233,4963.6,1.028,-101.6,0
41186,7.366667,-48.4,0.000,professional.course5-10,43956.0,nov,999,-50749.2,1026.972,97.420476,...,-2235.2,-1.1308,45.232,4963.6,-104.2437,94672.233,4963.6,1.028,-50.8,1


Function to Segregate Features

In [None]:
def segregate_features(df):
    """
    Split a DataFrame into categorical, numeric and binary (0/1) feature frames.

    Columns whose set of unique values is exactly {0, 1} are cast to ``np.int8`` so that they
    can be separated from the other numeric columns purely by dtype. The cast is performed on
    a copy, so the frame passed in by the caller is never modified.

    Args:
        df: A pandas DataFrame. If it contains a 0/1 target column, that column ends up in
            the binary frame like any other 0/1 column.

    Returns:
        A tuple ``(cat_features, num_features, binary_features)`` of new DataFrames holding,
        respectively, the object/category columns, the columns selected by the 'int'/'float'
        dtype selectors, and the int8 columns.
    """
    # Work on a copy: the dtype cast below must not leak into the caller's frame.
    df = df.copy()

    for col in df.columns:
        if set(df[col].unique()) == {0, 1}:
            # Mark 0/1 indicator columns with a dedicated dtype (int8) so select_dtypes
            # can pick them out below.
            df[col] = df[col].astype(np.int8)

    binary_features = df.select_dtypes(np.int8).copy()
    cat_features = df.select_dtypes(include=['object', 'category']).copy()
    num_features = df.select_dtypes(include=['int', 'float']).copy()

    return cat_features, num_features, binary_features

Function to encode categorical values

In [None]:
def apply_categorical_encodings(df, target_col, threshold):
    """
    Encode every object/category column of ``df`` and return the result as a new DataFrame.

    Columns with at most ``threshold`` distinct values are one-hot encoded: the source column
    is removed and one ``<col>_<value>`` column per category is appended. Columns with more
    distinct values are target encoded and keep their original name. Columns of any other
    dtype are passed through unchanged. The input frame is not modified.

    Args:
        df: A pandas DataFrame containing the columns to encode.
        target_col: The target values aligned with the rows of ``df``; they are handed to
            ``TargetEncoder.fit_transform`` as ``y`` (despite the name, this is not a column label).
        threshold: Largest number of distinct values for which one-hot encoding is used.

    Returns:
        A new pandas DataFrame with the same index as ``df`` and all categorical columns encoded.

    Note:
        Both encoders are fitted on every row passed in. To avoid target leakage, callers
        should pass training rows only.
    """
    encoded = df.copy()
    one_hot_frames = []

    for col in df.columns:
        dtype = df[col].dtype
        if not (isinstance(dtype, pd.CategoricalDtype) or pd.api.types.is_object_dtype(dtype)):
            continue

        if df[col].nunique() <= threshold:
            one_hot_encoder = OneHotEncoder(sparse_output=False)
            one_hot_values = one_hot_encoder.fit_transform(df[[col]])
            one_hot_frames.append(
                pd.DataFrame(
                    one_hot_values,
                    columns=[f'{col}_{val}' for val in one_hot_encoder.categories_[0]],
                    # Keep the caller's index so the column-wise concat below aligns row by row.
                    index=df.index,
                )
            )
            encoded = encoded.drop(columns=col)
        else:
            target_encoder = TargetEncoder()
            target_encoded = target_encoder.fit_transform(df[col], target_col)
            # Keep the encoded values under the original column name; row order is unchanged,
            # so a positional assignment is sufficient.
            encoded[col] = np.asarray(target_encoded).ravel()

    return pd.concat([encoded, *one_hot_frames], axis=1)

Function to handle outliers

In [None]:
def handle_outliers(df: pd.DataFrame, lower_percentile: float = 5, upper_percentile: float = 95) -> pd.DataFrame:
    """
    Winsorize every column of a DataFrame to handle outliers.

    For each column, values below the ``lower_percentile`` are replaced with the value at that
    percentile and values above the ``upper_percentile`` are replaced with the value at that
    percentile. All columns of ``df`` are processed; the input frame is not modified.

    Args:
        df: A pandas DataFrame of numeric columns to be Winsorized.
        lower_percentile: Lower cut-off on the 0-100 scale. Defaults to 5.
        upper_percentile: Upper cut-off on the 0-100 scale. Defaults to 95.

    Returns:
        A new pandas DataFrame with the same columns and index, holding the Winsorized data.

    Raises:
        ValueError: If the percentiles do not satisfy 0 <= lower <= upper <= 100.
    """
    if not 0 <= lower_percentile <= upper_percentile <= 100:
        raise ValueError(
            "percentiles must satisfy 0 <= lower_percentile <= upper_percentile <= 100"
        )

    # Nothing to clip (and no percentile to compute) when there are no rows or no columns.
    if df.shape[0] == 0 or df.shape[1] == 0:
        return df.copy()

    # Per-column values at the chosen lower and upper percentiles
    lower_limits = df.apply(lambda column: np.percentile(column, lower_percentile))
    upper_limits = df.apply(lambda column: np.percentile(column, upper_percentile))

    # Clamp each column to its own [lower, upper] range
    return df.apply(
        lambda column: column.clip(lower_limits[column.name], upper_limits[column.name])
    )


Function to handle Skewness

In [None]:
def handle_skew(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply a Yeo-Johnson power transform to every column of a DataFrame to reduce skewness.

    Args:
        df: A pandas DataFrame containing the numeric data to be transformed.

    Returns:
        A new pandas DataFrame with the transformed data, carrying the same column labels
        and the same index as ``df`` so it can be concatenated column-wise with other
        frames derived from the same rows.
    """
    # NOTE(review): the recorded runs emit "overflow encountered in multiply/reduce" warnings;
    # they may originate from fitting this transform on large-magnitude columns -- TODO confirm.
    pt = PowerTransformer(method='yeo-johnson')

    # fit_transform returns a bare array, so the labels have to be re-attached explicitly
    data_transformed = pt.fit_transform(df)

    return pd.DataFrame(data_transformed, columns=df.columns, index=df.index)

Function to standardize the data, since SVMs are sensitive to feature scale

In [None]:
def standardize_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardize every column of a DataFrame with scikit-learn's StandardScaler.

    The scaler is fitted on, and applied to, all columns of ``df``.

    Args:
        df: A pandas DataFrame containing the data to be standardized.

    Returns:
        A new pandas DataFrame with the standardized values, keeping the column labels
        and the index of ``df``.
    """
    # fit_transform yields a plain array; wrap it with the original labels
    standardized_values = StandardScaler().fit_transform(df)
    return pd.DataFrame(standardized_values, index=df.index, columns=df.columns)

Function to build the model

In [None]:
def svm_model(df, target_col, kernel='rbf', winsorizing_bounds=(0.05, 0.95)):
    """
    Preprocess a DataFrame and train an SVM classifier on a stratified 70/30 split.

    Preprocessing steps, in order: split columns by type, encode the categorical columns,
    Winsorize and power-transform the numeric columns, re-assemble the frame, drop the target
    column and standardize all remaining features.

    Args:
        df (pd.DataFrame): Input DataFrame with features and the target column.
        target_col (str): Name of the target column in the DataFrame.
        kernel (str, optional): SVM kernel to use. Default is 'rbf'.
        winsorizing_bounds (tuple, optional): Accepted for backward compatibility but
            currently not used; ``handle_outliers`` is called with its defaults.

    Returns:
        svm: The fitted SVC (with probability estimates enabled).
        X_test: Preprocessed test features.
        y_test: Test target values.

    Note:
        All preprocessing is fitted on the full data set before the train/test split, so
        statistics of the test rows influence the training features. Keep this in mind when
        interpreting the scores.
    """
    # Categorical columns with at most this many distinct values are one-hot encoded
    one_hot_max_levels = 3

    # defining target variable y
    y = df[target_col]

    # segregating features; pass a copy so dtype casts never reach the caller's frame
    cat_features, num_features, binary_features = segregate_features(df.copy())

    # encoding categorical features
    cat_features = apply_categorical_encodings(cat_features, y, one_hot_max_levels)

    # handling outliers, then skew, on the numeric features
    no_outliers = handle_outliers(num_features)
    no_skew = handle_skew(no_outliers)

    # merging the transformed numeric features with the encoded categorical and binary ones
    new_df = pd.concat([no_skew, cat_features, binary_features], axis=1)

    # the target must not be part of X; use the caller-supplied name rather than a literal
    X = new_df.drop(columns=target_col)

    # standardizing data
    X_scale = standardize_data(X)

    # random_state makes the internally cross-validated probability estimates reproducible
    svm = SVC(kernel=kernel, C=1, class_weight='balanced', gamma='scale', shrinking=True,
              probability=True, random_state=42)

    # Perform train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_scale, y, test_size=0.3, stratify=y, random_state=42
    )

    # Fit the model
    svm.fit(X_train, y_train)

    return svm, X_test, y_test

Function to evaluate the model at several probability thresholds, report the confusion matrix and metrics for each, and plot the ROC curve

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, fbeta_score, confusion_matrix, roc_curve

def evaluate_svm_model(model, X_test, y_test, thresholds):
    """
    Evaluate a probabilistic binary classifier at several decision thresholds and plot its ROC curve.

    For every threshold, the positive-class probability is turned into a 0/1 prediction and
    one line with the confusion matrix, TPR, FPR, accuracy, F1 and F2 is printed. Afterwards
    the ROC curve (with its AUC in the legend) is shown as a plotly figure.

    Args:
        model: Fitted model exposing ``predict_proba`` (e.g. an SVC trained with probability=True).
        X_test: Test input data.
        y_test: Test target labels, encoded as 0 (negative) and 1 (positive).
        thresholds: Iterable of probability thresholds for the positive class.

    Returns:
        None
    """
    # Probability of the positive class (second column of predict_proba)
    y_pred_probs_positive = model.predict_proba(X_test)[:, 1]

    for threshold in thresholds:
        # Create binary predictions based on threshold for positive class
        y_pred_positive = np.where(y_pred_probs_positive >= threshold, 1, 0)

        # labels=[0, 1] guarantees a 2x2 matrix even if only one class shows up at this threshold
        tn_t, fp_t, fn_t, tp_t = confusion_matrix(y_test, y_pred_positive, labels=[0, 1]).ravel()
        acc_t = accuracy_score(y_test, y_pred_positive)
        f1_t = f1_score(y_test, y_pred_positive)
        f2_t = fbeta_score(y_test, y_pred_positive, beta=2)

        # TPR and FPR at this threshold; undefined (nan) when the respective class is absent
        positives = tp_t + fn_t
        negatives = fp_t + tn_t
        tpr_t = tp_t / positives if positives else float('nan')
        fpr_t = fp_t / negatives if negatives else float('nan')
        print(f"Positive Class | Threshold: {threshold:.2f} | Confusion Matrix: TN={tn_t}, FP={fp_t}, FN={fn_t}, TP={tp_t} | TPR={tpr_t:.2f} | FPR={fpr_t:.2f} | Accuracy={acc_t:.2f} | F1={f1_t:.2f} | F2={f2_t:.2f}")

    # ROC curve over all thresholds; the returned threshold array is not needed here
    fpr, tpr, _ = roc_curve(y_test, y_pred_probs_positive, pos_label=1)

    # Area under the ROC curve
    roc_auc = auc(fpr, tpr)

    # Plot the ROC curve together with the diagonal of a random classifier
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC Curve(AUC = %f)' % roc_auc))
    fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', line=dict(dash='dash'), name='Random Guessing'))
    fig.update_layout(title='Receiver Operating Characteristic (ROC) Curve',
                      xaxis_title='False Positive Rate (FPR)',
                      yaxis_title='True Positive Rate (TPR)',
                      width=800,
                      height=600,
                      margin=dict(l=50, r=50, t=50, b=50))
    fig.show()



Function to build the model on both the original and the feature-engineered dataset and to output the confusion matrices and the ROC curve for each case

In [None]:
def model_evaluation(df1, df2, target_col, kernel, thresholds=None):
    """
    Train and evaluate an SVM on the original and on the feature-engineered dataset.

    Both models are trained first; afterwards each one is evaluated on its own test split,
    which prints the per-threshold metrics and shows the ROC curve.

    Args:
        df1: The original dataset (reported under "Original Dataset").
        df2: The feature-engineered dataset (reported under "Featured Dataset").
        target_col: Name of the target column present in both DataFrames.
        kernel: SVM kernel passed on to ``svm_model``.
        thresholds: Optional iterable of probability thresholds to evaluate.
            Defaults to [0.1, 0.2, 0.35, 0.5].

    Returns:
        None
    """
    if thresholds is None:
        thresholds = [0.1, 0.2, 0.35, 0.5]

    model, X_test, y_test = svm_model(df1, target_col, kernel)
    model_f, X_test_f, y_test_f = svm_model(df2, target_col, kernel)

    print("******Original Dataset******")
    evaluate_svm_model(model, X_test, y_test, thresholds)
    print("******Featured Dataset******")
    evaluate_svm_model(model_f, X_test_f, y_test_f, thresholds)


Running model evaluation with Linear Kernel

In [None]:
# Fits one SVC per dataset (original and feature-engineered) with the linear kernel, then
# prints the per-threshold metrics and shows the ROC curve for each.
model_evaluation(df_original,df_featured,'term_deposit','linear')


overflow encountered in multiply


overflow encountered in reduce


overflow encountered in multiply


overflow encountered in reduce



******Original Dataset******
Positive Class | Threshold: 0.10 | Confusion Matrix: TN=8805, FP=2160, FN=138, TP=1254 | TPR=0.90 | FPR=0.20 | Accuracy=0.81 | F1=0.52 | F2=0.70
Positive Class | Threshold: 0.20 | Confusion Matrix: TN=9787, FP=1178, FN=292, TP=1100 | TPR=0.79 | FPR=0.11 | Accuracy=0.88 | F1=0.60 | F2=0.70
Positive Class | Threshold: 0.35 | Confusion Matrix: TN=10438, FP=527, FN=613, TP=779 | TPR=0.56 | FPR=0.05 | Accuracy=0.91 | F1=0.58 | F2=0.57
Positive Class | Threshold: 0.50 | Confusion Matrix: TN=10669, FP=296, FN=808, TP=584 | TPR=0.42 | FPR=0.03 | Accuracy=0.91 | F1=0.51 | F2=0.45


******Featured Dataset******
Positive Class | Threshold: 0.10 | Confusion Matrix: TN=8766, FP=2199, FN=110, TP=1282 | TPR=0.92 | FPR=0.20 | Accuracy=0.81 | F1=0.53 | F2=0.71
Positive Class | Threshold: 0.20 | Confusion Matrix: TN=9781, FP=1184, FN=270, TP=1122 | TPR=0.81 | FPR=0.11 | Accuracy=0.88 | F1=0.61 | F2=0.71
Positive Class | Threshold: 0.35 | Confusion Matrix: TN=10465, FP=500, FN=670, TP=722 | TPR=0.52 | FPR=0.05 | Accuracy=0.91 | F1=0.55 | F2=0.53
Positive Class | Threshold: 0.50 | Confusion Matrix: TN=10698, FP=267, FN=864, TP=528 | TPR=0.38 | FPR=0.02 | Accuracy=0.91 | F1=0.48 | F2=0.41


Running model evaluation with Polynomial Kernel

In [None]:
# Same comparison of the original and feature-engineered datasets, using the polynomial kernel.
model_evaluation(df_original,df_featured,'term_deposit','poly')


overflow encountered in multiply


overflow encountered in reduce


overflow encountered in multiply


overflow encountered in reduce



******Original Dataset******
Positive Class | Threshold: 0.10 | Confusion Matrix: TN=8906, FP=2059, FN=95, TP=1297 | TPR=0.93 | FPR=0.19 | Accuracy=0.83 | F1=0.55 | F2=0.73
Positive Class | Threshold: 0.20 | Confusion Matrix: TN=9591, FP=1374, FN=191, TP=1201 | TPR=0.86 | FPR=0.13 | Accuracy=0.87 | F1=0.61 | F2=0.74
Positive Class | Threshold: 0.35 | Confusion Matrix: TN=10102, FP=863, FN=363, TP=1029 | TPR=0.74 | FPR=0.08 | Accuracy=0.90 | F1=0.63 | F2=0.69
Positive Class | Threshold: 0.50 | Confusion Matrix: TN=10596, FP=369, FN=783, TP=609 | TPR=0.44 | FPR=0.03 | Accuracy=0.91 | F1=0.51 | F2=0.47


******Featured Dataset******
Positive Class | Threshold: 0.10 | Confusion Matrix: TN=9049, FP=1916, FN=99, TP=1293 | TPR=0.93 | FPR=0.17 | Accuracy=0.84 | F1=0.56 | F2=0.74
Positive Class | Threshold: 0.20 | Confusion Matrix: TN=9633, FP=1332, FN=179, TP=1213 | TPR=0.87 | FPR=0.12 | Accuracy=0.88 | F1=0.62 | F2=0.75
Positive Class | Threshold: 0.35 | Confusion Matrix: TN=10054, FP=911, FN=321, TP=1071 | TPR=0.77 | FPR=0.08 | Accuracy=0.90 | F1=0.63 | F2=0.71
Positive Class | Threshold: 0.50 | Confusion Matrix: TN=10527, FP=438, FN=727, TP=665 | TPR=0.48 | FPR=0.04 | Accuracy=0.91 | F1=0.53 | F2=0.50


Running model evaluation with Radial Basis Function Kernel

In [None]:
# Same comparison of the original and feature-engineered datasets, using the RBF kernel.
model_evaluation(df_original,df_featured,'term_deposit','rbf')


overflow encountered in multiply


overflow encountered in reduce


overflow encountered in multiply


overflow encountered in reduce



******Original Dataset******
Positive Class | Threshold: 0.10 | Confusion Matrix: TN=8858, FP=2107, FN=85, TP=1307 | TPR=0.94 | FPR=0.19 | Accuracy=0.82 | F1=0.54 | F2=0.73
Positive Class | Threshold: 0.20 | Confusion Matrix: TN=9470, FP=1495, FN=170, TP=1222 | TPR=0.88 | FPR=0.14 | Accuracy=0.87 | F1=0.59 | F2=0.74
Positive Class | Threshold: 0.35 | Confusion Matrix: TN=9991, FP=974, FN=313, TP=1079 | TPR=0.78 | FPR=0.09 | Accuracy=0.90 | F1=0.63 | F2=0.71
Positive Class | Threshold: 0.50 | Confusion Matrix: TN=10586, FP=379, FN=853, TP=539 | TPR=0.39 | FPR=0.03 | Accuracy=0.90 | F1=0.47 | F2=0.42


******Featured Dataset******
Positive Class | Threshold: 0.10 | Confusion Matrix: TN=8891, FP=2074, FN=82, TP=1310 | TPR=0.94 | FPR=0.19 | Accuracy=0.83 | F1=0.55 | F2=0.73
Positive Class | Threshold: 0.20 | Confusion Matrix: TN=9467, FP=1498, FN=148, TP=1244 | TPR=0.89 | FPR=0.14 | Accuracy=0.87 | F1=0.60 | F2=0.75
Positive Class | Threshold: 0.35 | Confusion Matrix: TN=9965, FP=1000, FN=286, TP=1106 | TPR=0.79 | FPR=0.09 | Accuracy=0.90 | F1=0.63 | F2=0.72
Positive Class | Threshold: 0.50 | Confusion Matrix: TN=10583, FP=382, FN=884, TP=508 | TPR=0.36 | FPR=0.03 | Accuracy=0.90 | F1=0.45 | F2=0.39


Running model evaluation with Sigmoid Kernel

In [None]:
# Same comparison of the original and feature-engineered datasets, using the sigmoid kernel.
model_evaluation(df_original,df_featured,'term_deposit','sigmoid')


overflow encountered in multiply


overflow encountered in reduce


overflow encountered in multiply


overflow encountered in reduce



******Original Dataset******
Positive Class | Threshold: 0.10 | Confusion Matrix: TN=7870, FP=3095, FN=261, TP=1131 | TPR=0.81 | FPR=0.28 | Accuracy=0.73 | F1=0.40 | F2=0.58
Positive Class | Threshold: 0.20 | Confusion Matrix: TN=9924, FP=1041, FN=855, TP=537 | TPR=0.39 | FPR=0.09 | Accuracy=0.85 | F1=0.36 | F2=0.38
Positive Class | Threshold: 0.35 | Confusion Matrix: TN=10601, FP=364, FN=1175, TP=217 | TPR=0.16 | FPR=0.03 | Accuracy=0.88 | F1=0.22 | F2=0.18
Positive Class | Threshold: 0.50 | Confusion Matrix: TN=10775, FP=190, FN=1296, TP=96 | TPR=0.07 | FPR=0.02 | Accuracy=0.88 | F1=0.11 | F2=0.08


******Featured Dataset******
Positive Class | Threshold: 0.10 | Confusion Matrix: TN=7061, FP=3904, FN=389, TP=1003 | TPR=0.72 | FPR=0.36 | Accuracy=0.65 | F1=0.32 | F2=0.48
Positive Class | Threshold: 0.20 | Confusion Matrix: TN=8982, FP=1983, FN=726, TP=666 | TPR=0.48 | FPR=0.18 | Accuracy=0.78 | F1=0.33 | F2=0.41
Positive Class | Threshold: 0.35 | Confusion Matrix: TN=10131, FP=834, FN=1008, TP=384 | TPR=0.28 | FPR=0.08 | Accuracy=0.85 | F1=0.29 | F2=0.28
Positive Class | Threshold: 0.50 | Confusion Matrix: TN=10600, FP=365, FN=1186, TP=206 | TPR=0.15 | FPR=0.03 | Accuracy=0.87 | F1=0.21 | F2=0.17
