In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

from get_dataset import X, y

# Define the categorical columns
X = pd.DataFrame(X)

# Define the categorical columns
categorical = ['job', 'marital', 'education', 'housing', 'loan', 'contact', 'campaign', 'previous', 'poutcome']


In [2]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,58,4,1,2,0,2143,1,0,2,5,8,261,1,-1,0,3
1,44,9,2,1,0,29,1,0,2,5,8,151,1,-1,0,3
2,33,2,1,1,0,2,1,1,2,5,8,76,1,-1,0,3
3,47,1,1,3,0,1506,1,0,2,5,8,92,1,-1,0,3
4,33,11,2,3,0,1,0,0,2,5,8,198,1,-1,0,3


In [3]:

# One-hot encode the categorical features
X_encoded = pd.get_dummies(X, columns=categorical)

# Initialize the scaler and apply to the entire dataset (after encoding)
scaler = StandardScaler()

# Fit and transform the entire dataset after one-hot encoding
X_scaled = scaler.fit_transform(X_encoded)

# Split the dataset into training and testing sets
X_scaled_train, X_scaled_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Convert the scaled data to torch tensors
X_scaled_train = torch.tensor(X_scaled_train, dtype=torch.float32)
X_scaled_test = torch.tensor(X_scaled_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)  # Reshape to match the output of sigmoid
y_test = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

# Define the Logistic Regression model with L1 regularization
class LogisticRegressionL1(nn.Module):
    def __init__(self, input_dim):
        super(LogisticRegressionL1, self).__init__()
        self.linear = nn.Linear(input_dim, 1)
        
    def forward(self, x):
        return torch.sigmoid(self.linear(x))

# L1 regularization function
def l1_regularization(model):
    l1_norm = 0
    for param in model.parameters():
        l1_norm += torch.sum(torch.abs(param))
    return l1_norm

# Initialize model, loss function, and optimizer
input_dim = X_scaled_train.shape[1]
model = LogisticRegressionL1(input_dim)
criterion = nn.BCEWithLogitsLoss()  # Binary Cross-Entropy Loss
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Hyperparameters
num_epochs = 100
lambda_l1 = 0.01  # L1 regularization strength

# Lists to store loss and accuracy
performance = {'train_loss': [], 'test_loss': [], 'train_accuracy': [], 'test_accuracy': []}

# Training loop
for epoch in range(num_epochs):
    model.train()
    
    # Forward pass (train set)
    outputs_train = model(X_scaled_train)
    loss_train = criterion(outputs_train, y_train)
    
    # Apply L1 regularization
    l1_loss = l1_regularization(model)
    loss_train += lambda_l1 * l1_loss
    
    # Backward pass and optimization
    optimizer.zero_grad()
    loss_train.backward()
    optimizer.step()
    
    # Calculate train accuracy
    with torch.no_grad():
        predictions_train = (outputs_train > 0.5).float()
        train_accuracy = (predictions_train == y_train).float().mean()
        
        # Calculate test loss and accuracy
        outputs_test = model(X_scaled_test)
        loss_test = criterion(outputs_test, y_test)
        predictions_test = (outputs_test > 0.5).float()
        test_accuracy = (predictions_test == y_test).float().mean()
    
    # Store loss and accuracy
    performance['train_loss'].append(loss_train.item())
    performance['test_loss'].append(loss_test.item())
    performance['train_accuracy'].append(train_accuracy.item())
    performance['test_accuracy'].append(test_accuracy.item())

# Function to plot performance (same as before)
def plot_performance(performance):
    '''
    Function for plotting training and test losses and accuracies
    '''
    plt.style.use('seaborn-v0_8-dark')
    fig, ax = plt.subplots(1, 2, figsize=(16, 4.5))
    
    # Plot loss (train vs test)
    ax[0].plot(performance['train_loss'], label='Train Loss')
    ax[0].plot(performance['test_loss'], label='Test Loss')
    ax[0].set(title="Loss over epochs", xlabel='Epoch', ylabel='Loss')
    
    # Plot accuracy (train vs test)
    ax[1].plot(performance['train_accuracy'], label='Train Accuracy')
    ax[1].plot(performance['test_accuracy'], label='Test Accuracy')
    ax[1].set(title="Accuracy over epochs", xlabel='Epoch', ylabel='Accuracy')

    ax[0].legend()
    ax[1].legend()

    plt.show()
    plt.style.use('default')

# Plot the performance results
plot_performance(performance)


# Assuming 'model' is the trained logistic regression model
# Print model parameters (weights and bias)
# for name, param in model.named_parameters():
#     if 'linear' in name:
#         print(f"Parameter: {name} | Shape: {param.shape}")
#         print(param.data)

# Get the weights and biases
weights = model.linear.weight.data  # Shape: (1, input_dim)
bias = model.linear.bias.data  # Shape: (1,)

# Print the weights for each feature
# print(f"Weights: {weights}")
# print(f"Bias: {bias}")

# Convert weights to a numpy array for easier inspection
weights = weights.squeeze().cpu().numpy()  # Convert to 1D array if it's a row vector

# Visualize the weights (using matplotlib)
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 2))
plt.bar(range(len(weights)), weights)
plt.title("Feature Weights after L1 Regularization")
plt.xlabel("Feature Index")
plt.ylabel("Weight Value")
plt.show()

# Check which weights are near zero (this indicates non-relevant features)
threshold = .01  # A small threshold to consider weights near zero
relevant_features = np.abs(weights) > threshold

print(f"Relevant Features: {np.where(relevant_features)[0]}")
print(f"Irrelevant Features: {np.where(~relevant_features)[0]}")

KeyError: "None of [Index(['job', 'marital', 'education', 'housing', 'loan', 'contact', 'campaign',\n       'previous', 'poutcome'],\n      dtype='object')] are in the [columns]"