In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt


In [2]:
from get_dataset import X
from get_dataset import y
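# X and y are provided by the project-local get_dataset module (imported above).
# NOTE(review): the code below treats X as a 2-D feature matrix and y as 0/1 labels - confirm.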


In [3]:

# Custom dataset class
class LoanDataset(Dataset):
    """In-memory dataset pairing each feature row with its label.

    Both inputs are converted to ``float32`` tensors once, at construction
    time, so ``__getitem__`` is a plain tensor index.

    Args:
        features: Array-like of numeric feature rows (NumPy array, nested
            list, pandas DataFrame, ...). The first axis indexes samples.
        labels: Array-like holding one numeric label per sample.

    Raises:
        ValueError: If ``features`` and ``labels`` contain a different
            number of samples.
    """

    def __init__(self, features, labels):
        # Go through np.asarray so pandas objects are read positionally
        # instead of via label-based ``obj[0]`` lookups, then copy into
        # float32 tensors (torch.tensor always copies, so the caller's
        # arrays are never aliased).
        self.features = torch.tensor(np.asarray(features), dtype=torch.float32)
        self.labels = torch.tensor(np.asarray(labels), dtype=torch.float32)

        if len(self.features) != len(self.labels):
            raise ValueError(
                f"features and labels must have the same number of samples, "
                f"got {len(self.features)} and {len(self.labels)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.features[idx], self.labels[idx]

# SVM model with PyTorch
class SVM(nn.Module):
    """Linear scorer: one affine layer producing a single raw score per sample."""

    def __init__(self, input_size):
        """Build the affine layer mapping ``input_size`` features to one score."""
        super().__init__()
        self.linear = nn.Linear(in_features=input_size, out_features=1)

    def forward(self, x):
        """Return the unthresholded score(s) of the affine layer for ``x``."""
        scores = self.linear(x)
        return scores

# Hinge loss for SVM
def hinge_loss(outputs, targets, model, margin=1.0, reg_strength=0.01):
    """Mean hinge loss plus an L2 penalty on the model's first parameter.

    Args:
        outputs: Raw model scores, one per sample. Any shape holding one
            value per sample (``(N,)``, ``(N, 1)`` or a 0-d scalar) works.
        targets: Labels encoded as 0/1, one per sample; they are mapped to
            -1/+1 internally.
        model: Module whose first parameter (the weight matrix for a
            single ``nn.Linear`` model) is L2-regularised.
        margin: Hinge margin.
        reg_strength: Multiplier applied to the sum of squared weights.
            Defaults to 0.01.

    Returns:
        A scalar tensor: ``mean(max(0, margin - score * target)) +
        reg_strength * sum(w ** 2)``.
    """
    # Flatten both operands so a (N, 1) score column can never broadcast
    # against (N,) labels into an (N, N) matrix and silently skew the mean.
    scores = outputs.reshape(-1)
    signed_targets = 2 * targets.reshape(-1) - 1

    data_term = torch.clamp(margin - scores * signed_targets, min=0).mean()

    # Only the first parameter is penalised (the bias, if any, is left free).
    weights = next(model.parameters())
    penalty = reg_strength * weights.pow(2).sum()

    return data_term + penalty

# Calculate accuracy
def calculate_accuracy(model, data_loader):
    """Fraction of samples whose thresholded score matches its label.

    A sample is predicted as class 1 when its raw score is strictly
    positive, otherwise as class 0.

    Args:
        model: Module returning one raw score per sample.
        data_loader: Iterable of ``(features, labels)`` batches with 0/1
            labels.

    Returns:
        Accuracy in ``[0, 1]`` as a float; ``0.0`` when the loader yields
        no samples.

    The model is switched to eval mode for the duration of the call and
    put back into whichever mode it was in beforehand.
    """
    was_training = model.training
    model.eval()
    correct = 0
    total = 0

    try:
        with torch.no_grad():
            for features, labels in data_loader:
                # reshape(-1) keeps exactly one axis, so neither a batch of
                # one nor column-shaped labels can trigger broadcasting in
                # the comparison below.
                scores = model(features).reshape(-1)
                flat_labels = labels.reshape(-1)
                predicted = (scores > 0).float()
                total += flat_labels.numel()
                correct += (predicted == flat_labels).sum().item()
    finally:
        # Hand the model back in its original mode so a surrounding
        # training loop keeps training in train mode.
        model.train(was_training)

    return correct / total if total else 0.0

# Function to train the model and track metrics
def train_model(model, train_loader, test_loader, optimizer, num_epochs, log_every=10):
    """Train ``model`` with the hinge loss and record per-epoch metrics.

    Args:
        model: Module returning one raw score per sample.
        train_loader: Iterable of ``(features, labels)`` training batches.
        test_loader: Iterable of ``(features, labels)`` held-out batches,
            used only for the per-epoch accuracy.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.
        log_every: Print a progress line every ``log_every`` epochs; a
            falsy value disables printing. Defaults to 10.

    Returns:
        ``(train_losses, train_accuracies, test_accuracies)``: three lists
        with one entry per epoch. The loss entry is the mean batch loss of
        that epoch.
    """
    train_losses = []
    train_accuracies = []
    test_accuracies = []

    for epoch in range(num_epochs):
        # The accuracy pass at the end of an epoch may leave the model in
        # eval mode, so training mode is re-entered at the start of every
        # epoch rather than once before the loop.
        model.train()

        total_loss = 0.0
        num_batches = 0
        for features, labels in train_loader:
            # Forward pass; squeeze only the trailing score axis so a batch
            # of one keeps its batch dimension.
            outputs = model(features).squeeze(-1)

            # Compute loss - the model is passed for the regularisation term
            loss = hinge_loss(outputs, labels, model)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Calculate training and test accuracy for this epoch
        train_accuracy = calculate_accuracy(model, train_loader)
        test_accuracy = calculate_accuracy(model, test_loader)

        # Batches are counted while iterating because len() is undefined
        # for loaders over iterable-style datasets; an epoch with no
        # batches records NaN instead of dividing by zero.
        avg_loss = total_loss / num_batches if num_batches else float('nan')
        train_losses.append(avg_loss)
        train_accuracies.append(train_accuracy)
        test_accuracies.append(test_accuracy)

        # Print progress
        if log_every and (epoch + 1) % log_every == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], '
                  f'Train Loss: {avg_loss:.4f}, '
                  f'Train Accuracy: {train_accuracy:.4f}, '
                  f'Test Accuracy: {test_accuracy:.4f}')

    # Return collected metrics
    return train_losses, train_accuracies, test_accuracies

# Function to evaluate the model
def evaluate_model(model, test_loader):
    """Score ``model`` on ``test_loader`` and summarise the predictions.

    A sample is predicted as class 1 when its raw score is strictly
    positive, otherwise as class 0.

    Args:
        model: Module returning one raw score per sample.
        test_loader: Iterable of ``(features, labels)`` batches.

    Returns:
        ``(accuracy, report, conf_matrix, predictions, actual)`` where
        ``accuracy``, ``report`` and ``conf_matrix`` come from
        ``accuracy_score``, ``classification_report`` and
        ``confusion_matrix`` respectively, and ``predictions`` / ``actual``
        are flat lists with one entry per sample.
    """
    model.eval()
    predictions = []
    actual = []

    with torch.no_grad():
        for features, labels in test_loader:
            # reshape(-1) always leaves a 1-D batch axis. A bare .squeeze()
            # turns a batch of one into a 0-d tensor, and list.extend()
            # cannot iterate the resulting 0-d NumPy array.
            scores = model(features).reshape(-1)
            predicted = (scores > 0).float()

            predictions.extend(predicted.cpu().numpy())
            actual.extend(labels.reshape(-1).cpu().numpy())

    # Calculate accuracy and the derived summaries
    accuracy = accuracy_score(actual, predictions)
    report = classification_report(actual, predictions)
    conf_matrix = confusion_matrix(actual, predictions)

    return accuracy, report, conf_matrix, predictions, actual

# Main function to run the SVM classifier
def run_svm_classifier(X, y, test_size=0.2, batch_size=32, num_epochs=100,
                       learning_rate=0.001, random_state=42):
    """Train, evaluate and plot a linear SVM on ``(X, y)``.

    Steps: split the data, standardise the features, train with SGD on the
    hinge loss, then print the evaluation and save four figures
    (``accuracy_comparison.png``, ``training_loss.png``,
    ``confusion_matrix.png``, ``predictions_distribution.png``) in the
    current working directory.

    Args:
        X: 2-D array-like of features (``X.shape[1]`` is the feature count).
        y: Array-like of 0/1 labels, one per row of ``X``.
        test_size: Fraction of the data held out for testing.
        batch_size: Mini-batch size for both loaders.
        num_epochs: Number of training epochs.
        learning_rate: SGD learning rate.
        random_state: Seed for the train/test split and, when not ``None``,
            for torch's global RNG (weight init and batch shuffling).

    Returns:
        ``(model, scaler)``: the trained ``SVM`` and the ``StandardScaler``
        fitted on the training split.
    """
    # Seed torch so weight initialisation and shuffling are repeatable.
    if random_state is not None:
        torch.manual_seed(random_state)

    # Split first, then fit the scaler on the training rows only; fitting
    # on the full matrix would leak test-set statistics into training.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Create datasets (labels as plain arrays so pandas indices are ignored)
    train_dataset = LoanDataset(X_train, np.asarray(y_train))
    test_dataset = LoanDataset(X_test, np.asarray(y_test))

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Initialize the model and optimizer
    model = SVM(X_train.shape[1])
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    # Train the model and track metrics
    print("Training the model...")
    train_losses, train_accuracies, test_accuracies = train_model(
        model, train_loader, test_loader, optimizer, num_epochs
    )

    _save_epoch_curves(
        [(train_accuracies, 'Training Accuracy'), (test_accuracies, 'Testing Accuracy')],
        ylabel='Accuracy',
        title='Training vs Testing Accuracy',
        path='accuracy_comparison.png',
    )
    _save_epoch_curves(
        [(train_losses, None)],
        ylabel='Loss',
        title='Training Loss',
        path='training_loss.png',
    )

    # Evaluate the model
    print("Evaluating the model...")
    accuracy, report, conf_matrix, predictions, actual = evaluate_model(model, test_loader)

    print(f"Final Test Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(report)
    print("\nConfusion Matrix:")
    print(conf_matrix)

    _save_confusion_matrix_figure(conf_matrix, 'confusion_matrix.png')
    _save_prediction_histogram(predictions, 'predictions_distribution.png')

    return model, scaler


def _save_epoch_curves(curves, ylabel, title, path):
    """Plot each ``(values, label)`` curve against the epoch number and save it to ``path``."""
    plt.figure(figsize=(10, 6))
    for values, label in curves:
        # Epoch axis is derived from the data so it always matches its length.
        plt.plot(range(1, len(values) + 1), values, label=label)
    plt.xlabel('Epochs')
    plt.ylabel(ylabel)
    plt.title(title)
    if any(label is not None for _, label in curves):
        plt.legend()
    plt.grid(True)
    plt.savefig(path)
    plt.close()


def _save_confusion_matrix_figure(conf_matrix, path):
    """Render ``conf_matrix`` as an annotated heat map and save it to ``path``."""
    plt.figure(figsize=(8, 6))
    plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.colorbar()
    tick_marks = np.arange(2)
    plt.xticks(tick_marks, ['No Default', 'Default'])
    plt.yticks(tick_marks, ['No Default', 'Default'])
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')

    # Cell counts are drawn in white on dark cells and black on light ones.
    thresh = conf_matrix.max() / 2
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            plt.text(j, i, format(conf_matrix[i, j], 'd'),
                     ha="center", va="center",
                     color="white" if conf_matrix[i, j] > thresh else "black")

    plt.tight_layout()
    plt.savefig(path)
    plt.close()


def _save_prediction_histogram(predictions, path):
    """Save a histogram of the thresholded 0/1 predictions to ``path``."""
    plt.figure(figsize=(8, 6))
    plt.hist(predictions, bins=20, alpha=0.5, label='Predictions')
    plt.axvline(0.5, color='red', linestyle='--', label='Decision Boundary')
    plt.xlabel('Prediction Value')
    plt.ylabel('Frequency')
    plt.title('Distribution of Predictions')
    plt.legend()
    plt.savefig(path)
    plt.close()

# # Run the classifier
# if __name__ == "__main__":
#     # Import your data
#     # from get_dataset import X, y
    
#     # Run the SVM classifier
#     model, scaler = run_svm_classifier(X, y)
    
#     # Save the model
#     torch.save({
#         'model_state_dict': model.state_dict(),
#         'input_size': X.shape[1]
#     }, 'svm_model.pth')
    
#     print("Model saved as 'svm_model.pth'")

In [4]:
# X and y were imported from get_dataset in the import cell above.

# Run the SVM classifier
model, scaler = run_svm_classifier(X, y)

# Save the model together with the standardisation statistics: the weights
# were learned on scaled features, so new raw data must be scaled with the
# same mean/scale before it is passed to the reloaded model.
torch.save({
    'model_state_dict': model.state_dict(),
    'input_size': X.shape[1],
    # Stored as plain Python lists so loading does not depend on NumPy pickles.
    'scaler_mean': scaler.mean_.tolist(),
    'scaler_scale': scaler.scale_.tolist(),
}, 'svm_model.pth')

print("Model saved as 'svm_model.pth'")

Training the model...
Epoch [10/100], Train Loss: 0.2339, Train Accuracy: 0.8840, Test Accuracy: 0.8794
Epoch [20/100], Train Loss: 0.2330, Train Accuracy: 0.8839, Test Accuracy: 0.8794
Epoch [30/100], Train Loss: 0.2323, Train Accuracy: 0.8839, Test Accuracy: 0.8794
Epoch [40/100], Train Loss: 0.2324, Train Accuracy: 0.8839, Test Accuracy: 0.8794
Epoch [50/100], Train Loss: 0.2321, Train Accuracy: 0.8839, Test Accuracy: 0.8794
Epoch [60/100], Train Loss: 0.2322, Train Accuracy: 0.8839, Test Accuracy: 0.8794
Epoch [70/100], Train Loss: 0.2322, Train Accuracy: 0.8839, Test Accuracy: 0.8794
Epoch [80/100], Train Loss: 0.2321, Train Accuracy: 0.8839, Test Accuracy: 0.8794
Epoch [90/100], Train Loss: 0.2321, Train Accuracy: 0.8839, Test Accuracy: 0.8794
Epoch [100/100], Train Loss: 0.2321, Train Accuracy: 0.8839, Test Accuracy: 0.8794
Evaluating the model...
Final Test Accuracy: 0.8794

Classification Report:
              precision    recall  f1-score   support

         0.0       0.88   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
