In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
input_df = pd.read_csv("dataset/bank-full.csv", sep=';')

# Print the first few rows to understand the data
print("Dataset preview:")
print(input_df.head())
print("\nDataset shape:", input_df.shape)

# Separate features and target
target_column = 'y'
y = (input_df[target_column] == 'yes').astype(int)
X = input_df.drop(target_column, axis=1)

# Identify numerical and categorical columns
categorical_columns = ["job", "marital", "education", "default", "housing", "loan", "contact", "month", "poutcome"]
numerical_columns = [col for col in X.columns if col not in categorical_columns]

print("\nCategorical columns:", categorical_columns)
print("\nNumerical columns:", numerical_columns)

# Create a preprocessing pipeline with one-hot encoding and normalization
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(drop='first'), categorical_columns)
    ])

# Fit and transform the data
X_transformed = preprocessor.fit_transform(X)

# Get the feature names after one-hot encoding
ohe_feature_names = []
for i, encoder in enumerate(preprocessor.transformers_):
    if encoder[0] == 'cat':
        encoder_obj = encoder[1]
        feature_names = encoder_obj.get_feature_names_out(categorical_columns)
        ohe_feature_names.extend(feature_names)
    else:
        ohe_feature_names.extend(numerical_columns)

print("\nFeature names after one-hot encoding:")
for i, name in enumerate(ohe_feature_names):
    print(f"{i}: {name}")

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, y, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train.toarray() if hasattr(X_train, 'toarray') else X_train)
X_test_tensor = torch.FloatTensor(X_test.toarray() if hasattr(X_test, 'toarray') else X_test)
y_train_tensor = torch.FloatTensor(y_train.values).reshape(-1, 1)
y_test_tensor = torch.FloatTensor(y_test.values).reshape(-1, 1)


Dataset preview:
   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  

Dataset shape: (45211, 17)

Categorical columns: ['job', 'marital', 'education', 'default', '

In [2]:

# Define the logistic regression model with L1 regularization
class LogisticRegressionModel(nn.Module):
    def __init__(self, input_dim):
        super(LogisticRegressionModel, self).__init__()
        self.linear = nn.Linear(input_dim, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        return self.sigmoid(self.linear(x))

# Get input dimension from transformed data
input_dim = X_train_tensor.shape[1]
model = LogisticRegressionModel(input_dim)

# Define loss function and optimizer with L1 regularization
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training parameters
num_epochs = 1000
l1_lambda = 0.01  # L1 regularization strength

# Training loop
for epoch in range(num_epochs):
    # Forward pass
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    
    # Add L1 regularization
    l1_norm = sum(p.abs().sum() for p in model.parameters())
    loss = loss + l1_lambda * l1_norm
    
    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if (epoch+1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluate the model
with torch.no_grad():
    model.eval()
    y_pred_prob = model(X_test_tensor)
    y_pred = (y_pred_prob > 0.5).float()
    accuracy = accuracy_score(y_test_tensor, y_pred)
    print(f'\nTest Accuracy: {accuracy:.4f}')
    print('\nClassification Report:')
    print(classification_report(y_test_tensor, y_pred))

# Extract and print feature weights
weights = model.linear.weight.data.numpy().flatten()
bias = model.linear.bias.data.numpy()[0]

print("\nModel coefficients (feature weights):")
print(f"Bias term: {bias:.4f}")

# Sort weights by absolute value for interpretability
sorted_indices = np.argsort(np.abs(weights))[::-1]
for i, idx in enumerate(sorted_indices):
    if i < len(ohe_feature_names):
        print(f"{ohe_feature_names[idx]}: {weights[idx]:.4f}")
    else:
        print(f"Feature {idx}: {weights[idx]:.4f}")

# Examine which features have non-zero weights (L1 regularization should induce sparsity)
non_zero_weights = weights[np.abs(weights) > 1e-4]
print(f"\nNumber of non-zero weights: {len(non_zero_weights)} out of {len(weights)}")
print(f"Percentage of weights zeroed out: {(1 - len(non_zero_weights)/len(weights)) * 100:.2f}%")

Epoch [100/1000], Loss: 0.3292
Epoch [200/1000], Loss: 0.3169
Epoch [300/1000], Loss: 0.3141
Epoch [400/1000], Loss: 0.3132
Epoch [500/1000], Loss: 0.3130
Epoch [600/1000], Loss: 0.3128
Epoch [700/1000], Loss: 0.3130
Epoch [800/1000], Loss: 0.3129
Epoch [900/1000], Loss: 0.3129
Epoch [1000/1000], Loss: 0.3129

Test Accuracy: 0.8861

Classification Report:
              precision    recall  f1-score   support

         0.0       0.90      0.98      0.94      7952
         1.0       0.60      0.17      0.27      1091

    accuracy                           0.89      9043
   macro avg       0.75      0.58      0.60      9043
weighted avg       0.86      0.89      0.86      9043


Model coefficients (feature weights):
Bias term: -0.8664
poutcome_unknown: -1.0030
duration: 0.8393
housing_yes: -0.7632
contact_unknown: -0.3531
poutcome_success: 0.2853
marital_married: -0.1318
campaign: -0.0442
month_may: -0.0099
job_unknown: -0.0040
month_aug: -0.0035
month_oct: 0.0035
job_services: 0.0032
mo