In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

def main():
    """Drive the full pipeline: load, preprocess, binarize, train, evaluate, plot.

    Reads 'Attacks.csv' from the working directory. If reading (or the
    initial preview printing) fails, the error is printed and the function
    returns ``None``.

    Returns:
        The tuple ``(theta, X, y_binary, cost_history, metrics)`` on success.
    """
    print("LOGISTIC REGRESSION FROM SCRATCH")
    print("================================")

    # Step 1: read the CSV and show a short preview.
    print("\n1. Loading and preprocessing data...")
    try:
        data = pd.read_csv('Attacks.csv')
        print(f"Successfully loaded data with {data.shape[0]} rows and {data.shape[1]} columns")
        print("First 5 rows:")
        print(data.head())
    except Exception as e:
        print(f"Error loading data: {e}")
        return

    # Step 2: build the design matrix and the raw target vector.
    print("\n2. Preprocessing data...")
    X, y = preprocess_data(data)
    print(f"Features shape: {X.shape}")
    print(f"Target shape: {y.shape}")

    # Step 3: split the continuous target at its median to obtain 0/1 labels.
    print("\n3. Converting to binary classification problem...")
    cutoff = np.median(y)
    y_binary = (y > cutoff).astype(int)
    print(f"Using threshold (median): {cutoff}")
    print(f"Class distribution: {np.bincount(y_binary)} (0s and 1s)")

    # Step 4: zero-initialised weights and fixed hyperparameters.
    print("\n4. Initializing parameters...")
    n_features = X.shape[1]
    initial_theta = np.zeros(n_features)
    learning_rate = 0.01
    num_iterations = 5000
    print(f"Number of features: {n_features}")
    print(f"Initial theta: {initial_theta}")
    print(f"Learning rate: {learning_rate}")
    print(f"Number of iterations: {num_iterations}")

    # Step 5: fit the weights.
    print("\n5. Running gradient descent...")
    theta, cost_history = gradient_descent(
        X, y_binary, initial_theta, learning_rate, num_iterations
    )
    print(f"Final theta: {theta}")
    print(f"Final cost: {cost_history[-1]}")

    # Step 6: score the fitted model on the same data it was trained on.
    print("\n6. Evaluating model...")
    metrics = evaluate_model(X, y_binary, theta)
    print_metrics(metrics)

    # Step 7: plot how the cost evolved during training.
    print("\n7. Plotting results...")
    plot_cost_history(cost_history)

    print("\nLogistic Regression completed successfully!")
    return theta, X, y_binary, cost_history, metrics

def preprocess_data(data):
    """Turn the raw DataFrame into a normalized design matrix and target vector.

    Steps:
      1. If a 'Date_reported' column exists, replace it with
         'Days_Since_Start' (whole days since the earliest date).
      2. Drop every non-numeric column except the target 'Total_Loss(Y)'.
      3. Z-score each feature using pandas' default (sample) standard deviation.
      4. Prepend a column of ones as the bias term.

    The input DataFrame is not modified.

    Args:
        data: DataFrame that contains a 'Total_Loss(Y)' column.

    Returns:
        ``(X_with_bias, y)`` where ``X_with_bias`` is a 2-D NumPy array whose
        first column is all ones and ``y`` holds the target column's values.

    Raises:
        KeyError: if 'Total_Loss(Y)' is missing from ``data``.
    """
    target_col = 'Total_Loss(Y)'

    # Work on a copy: the steps below add/convert columns and must not leak
    # those changes into the caller's DataFrame.
    data = data.copy()

    # Convert date to a numerical feature
    if 'Date_reported' in data.columns:
        reported = pd.to_datetime(data['Date_reported'])
        data['Days_Since_Start'] = (reported - reported.min()).dt.days
        data = data.drop('Date_reported', axis=1)

    # Drop non-numeric columns except the target. The pandas dtype predicates
    # also accept pandas extension dtypes; booleans are excluded explicitly
    # so they keep being treated as non-numeric.
    is_numeric = pd.api.types.is_numeric_dtype
    is_bool = pd.api.types.is_bool_dtype
    non_numeric = [
        col for col in data.columns
        if col != target_col
        and (is_bool(data[col]) or not is_numeric(data[col]))
    ]
    data = data.drop(columns=non_numeric)

    # Split features and target
    X = data.drop(target_col, axis=1)
    y = data[target_col].values

    # Normalize features. A constant column has std == 0 (and a single-row
    # frame has std == NaN); dividing by that would fill the column with
    # NaN/inf, so fall back to a divisor of 1, which leaves it at zero.
    std = X.std()
    std = std.where(std > 0, 1.0)
    X_normalized = (X - X.mean()) / std

    # Add bias term
    X_with_bias = np.column_stack((np.ones(X_normalized.shape[0]), X_normalized))

    return X_with_bias, y

def sigmoid(z):
    """Return the logistic function 1 / (1 + e^(-z)) of ``z``.

    The input is first limited to the range [-500, 500] so that the
    exponential cannot overflow.
    """
    bounded = np.clip(z, -500, 500)
    denominator = 1 + np.exp(-bounded)
    return 1 / denominator

def compute_cost(X, y, theta):
    """Return the mean binary cross-entropy of the predictions ``sigmoid(X @ theta)``.

    Predicted probabilities are clamped to [1e-15, 1 - 1e-15] before taking
    logarithms so that log(0) never occurs.
    """
    n_samples = len(y)
    eps = 1e-15

    # Clamp probabilities away from exactly 0 and 1.
    probs = np.clip(sigmoid(X @ theta), eps, 1 - eps)

    per_sample = y * np.log(probs) + (1 - y) * np.log(1 - probs)
    return (-1/n_samples) * np.sum(per_sample)

def gradient_descent(X, y, theta, learning_rate, num_iterations, print_every=500):
    """Fit logistic-regression weights with batch gradient descent.

    Exactly ``num_iterations`` parameter updates are applied. The cost is
    recorded (and printed) for iterations 0, print_every, 2*print_every, ...
    up to ``num_iterations``; the value reported for "Iteration i" is the
    cost after exactly ``i`` updates, so iteration 0 is the cost of the
    initial ``theta``.

    Args:
        X: Design matrix, used as ``X @ theta`` and ``X.T @ (h - y)``.
        y: Target vector with one entry per row of ``X``.
        theta: Initial weights. The caller's array is not modified; a new
            array is produced on every update.
        learning_rate: Step size applied to the gradient.
        num_iterations: Number of updates to perform.
        print_every: Record/print the cost every this many iterations.

    Returns:
        ``(theta, cost_history)``: the final weights and the list of
        recorded costs (``num_iterations // print_every + 1`` entries).
    """
    m = len(y)
    cost_history = []

    for i in range(num_iterations + 1):
        # Record the cost *before* applying update i, so the label matches
        # the number of updates that produced the reported parameters.
        if i % print_every == 0:
            cost = compute_cost(X, y, theta)
            cost_history.append(cost)
            print(f"  Iteration {i}: Cost = {cost:.6f}")

        # The extra pass at i == num_iterations exists only to log the final
        # cost; stopping here avoids performing one update too many.
        if i == num_iterations:
            break

        # Calculate hypothesis
        h = sigmoid(X @ theta)

        # Calculate gradient
        gradient = (1/m) * (X.T @ (h - y))

        # Update parameters
        theta = theta - learning_rate * gradient

    return theta, cost_history

def predict(X, theta, threshold=0.5):
    """Return ``(predictions, probabilities)`` for the rows of ``X``.

    ``probabilities`` is ``sigmoid(X @ theta)``; ``predictions`` is 1 where
    the probability is at least ``threshold`` and 0 elsewhere.
    """
    scores = sigmoid(X @ theta)
    above_threshold = scores >= threshold
    return above_threshold.astype(int), scores

def evaluate_model(X, y, theta):
    """Compute classification metrics for ``theta`` on ``(X, y)``.

    Returns:
        A dict with keys 'accuracy', 'confusion_matrix' (itself a dict of
        'true_positive', 'false_positive', 'true_negative',
        'false_negative'), 'precision', 'recall' and 'f1_score'. A ratio
        whose denominator is zero is reported as 0.
    """
    y_pred, _ = predict(X, theta)

    # Boolean masks for predicted and actual class membership.
    pred_pos, pred_neg = (y_pred == 1), (y_pred == 0)
    actual_pos, actual_neg = (y == 1), (y == 0)

    # Confusion matrix cells
    tp = np.sum(pred_pos & actual_pos)
    fp = np.sum(pred_pos & actual_neg)
    tn = np.sum(pred_neg & actual_neg)
    fn = np.sum(pred_neg & actual_pos)

    # Fraction of rows where prediction and label agree
    accuracy = np.mean(y_pred == y)

    # Precision, recall, F1 -- each guarded against an empty denominator
    if (tp + fp) > 0:
        precision = tp / (tp + fp)
    else:
        precision = 0

    if (tp + fn) > 0:
        recall = tp / (tp + fn)
    else:
        recall = 0

    if (precision + recall) > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0

    confusion = {
        'true_positive': tp,
        'false_positive': fp,
        'true_negative': tn,
        'false_negative': fn
    }
    return {
        'accuracy': accuracy,
        'confusion_matrix': confusion,
        'precision': precision,
        'recall': recall,
        'f1_score': f1
    }

def print_metrics(metrics):
    """Print the metrics dict in a fixed, human-readable layout.

    Expects the keys 'accuracy', 'precision', 'recall', 'f1_score' and a
    nested 'confusion_matrix' dict with the four cell counts.
    """
    # Accuracy is shown as a percentage; the other scores with four decimals.
    print(f"  Accuracy: {metrics['accuracy'] * 100:.2f}%")
    score_rows = (
        ("Precision", 'precision'),
        ("Recall", 'recall'),
        ("F1 Score", 'f1_score'),
    )
    for label, key in score_rows:
        print(f"  {label}: {metrics[key]:.4f}")

    confusion = metrics['confusion_matrix']
    print("\n  Confusion Matrix:")
    cell_rows = (
        ("True Positives", 'true_positive'),
        ("False Positives", 'false_positive'),
        ("True Negatives", 'true_negative'),
        ("False Negatives", 'false_negative'),
    )
    for label, key in cell_rows:
        print(f"  {label}: {confusion[key]}")

def plot_cost_history(cost_history, print_every=500):
    """Plot the recorded cost values against their iteration numbers.

    The figure is saved as 'cost_history.png' in the working directory. If
    saving fails, a message is printed and the figure is shown instead.

    Args:
        cost_history: Sequence of cost values, one per recorded iteration.
        print_every: Number of iterations between consecutive entries of
            ``cost_history``. Should match the ``print_every`` used during
            training; the default of 500 matches ``gradient_descent``'s
            default.
    """
    # Entry k of cost_history was recorded at iteration k * print_every.
    iterations = [k * print_every for k in range(len(cost_history))]

    plt.figure(figsize=(10, 6))
    plt.plot(iterations, cost_history)
    plt.title('Cost Function Over Iterations')
    plt.xlabel('Iterations')
    plt.ylabel('Cost')
    plt.grid(True)
    try:
        plt.savefig('cost_history.png')
        print("  Cost history plot saved as 'cost_history.png'")
    except Exception:
        # Best-effort fallback. Catching Exception rather than using a bare
        # except lets KeyboardInterrupt/SystemExit propagate.
        print("  Could not save plot. Displaying instead.")
        plt.show()
    plt.close()

# Run the pipeline only when this module is executed as the main program.
# main()'s return value is discarded here.
if __name__ == "__main__":
    main()

LOGISTIC REGRESSION FROM SCRATCH

1. Loading and preprocessing data...
Successfully loaded data with 24854 rows and 6 columns
First 5 rows:
  Date_reported Sever_Location  Packet_Size  No._of_Packets  Attack_Packets  \
0     3/29/2020           Asia         67.0              91             2.0   
1      4/5/2020           Asia        183.0             274             3.0   
2     4/12/2020           Asia        247.0             521            10.0   
3     4/19/2020           Asia        387.0             908            15.0   
4     4/26/2020           Asia        422.0            1330            13.0   

   Total_Loss(Y)  
0              2  
1              5  
2             15  
3             30  
4             43  

2. Preprocessing data...
Features shape: (24854, 5)
Target shape: (24854,)

3. Converting to binary classification problem...
Using threshold (median): 3962.5
Class distribution: [12427 12427] (0s and 1s)

4. Initializing parameters...
Number of features: 5
Initial thet