In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error
import time

In [2]:
data = pd.read_csv("dfdata.csv")

X = data.drop('outcome', axis=1)
y = data['outcome']

In [3]:
configurations = [
    {'data_size': 1000, 'hidden_layers': (4,), 'name': '1 hidden layer 4 nodes'},
    {'data_size': 10000, 'hidden_layers': (4,), 'name': '1 hidden layer 4 nodes'},
    {'data_size': 100000, 'hidden_layers': (4,), 'name': '1 hidden layer 4 nodes'},
    {'data_size': 1000, 'hidden_layers': (4, 4), 'name': '2 hidden layers of 4 nodes each'},
    {'data_size': 10000, 'hidden_layers': (4, 4), 'name': '2 hidden layers of 4 nodes each'},
    {'data_size': 100000, 'hidden_layers': (4, 4), 'name': '2 hidden layers of 4 nodes each'},
]

# Store results
results = []

In [4]:
for config in configurations:
    print(f"\nRunning: {config['name']} with {config['data_size']} samples")

    # Generate appropriate amount of data
    if config['data_size'] > len(X):
        # Create additional synthetic data by sampling with replacement
        indices = np.random.choice(len(X), config['data_size'], replace=True)
        X_sample = X.iloc[indices]
        y_sample = y.iloc[indices]
    else:
        # Sample without replacement if data size is smaller
        X_sample = X.sample(n=config['data_size'], random_state=42)
        y_sample = y.loc[X_sample.index]

    # Split the data
    X_train, X_val, y_train, y_val = train_test_split(
        X_sample, y_sample, test_size=0.2, random_state=42, stratify=y_sample
    )

    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Create and train the model
    start_time = time.time()

    model = MLPClassifier(
        hidden_layer_sizes=config['hidden_layers'],
        max_iter=1000,
        random_state=42,
        early_stopping=True,
        validation_fraction=0.1
    )

    model.fit(X_train_scaled, y_train)

    end_time = time.time()
    execution_time = end_time - start_time

    # Calculate errors (using MSE for both training and validation)
    y_train_pred = model.predict(X_train_scaled)
    y_val_pred = model.predict(X_val_scaled)

    train_error = mean_squared_error(y_train, y_train_pred)
    val_error = mean_squared_error(y_val, y_val_pred)

    # Store results
    results.append({
        'Data size': config['data_size'],
        'Configuration': config['name'],
        'Training error': f"{train_error:.6f}",
        'Validation error': f"{val_error:.6f}",
        'Time of execution': f"{execution_time:.2f}s"
    })

# Create results dataframe and display
results_df = pd.DataFrame(results)
print("\n\nResults Table:")
print(results_df.to_string(index=False))


Running: 1 hidden layer 4 nodes with 1000 samples

Running: 1 hidden layer 4 nodes with 10000 samples

Running: 1 hidden layer 4 nodes with 100000 samples

Running: 2 hidden layers of 4 nodes each with 1000 samples

Running: 2 hidden layers of 4 nodes each with 10000 samples

Running: 2 hidden layers of 4 nodes each with 100000 samples


Results Table:
 Data size                   Configuration Training error Validation error Time of execution
      1000          1 hidden layer 4 nodes       0.241250         0.235000             0.12s
     10000          1 hidden layer 4 nodes       0.010500         0.016500             3.14s
    100000          1 hidden layer 4 nodes       0.000750         0.001200             7.80s
      1000 2 hidden layers of 4 nodes each       0.217500         0.220000             0.04s
     10000 2 hidden layers of 4 nodes each       0.239750         0.240000             0.21s
    100000 2 hidden layers of 4 nodes each       0.000988         0.001600            