In [None]:
#problem A
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, auc, accuracy_score
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.layers import Dropout

data = np.loadtxt('Homework-1-Data.txt', skiprows=1)
labels = np.concatenate([np.ones(36499), np.zeros(93565)])
Accuracy_comparison = ""
auc_comparison = ""

X_train, X_test, y_train, y_test = train_test_split(
    data, labels, 
    test_size=0.2, 
    random_state=42, 
    stratify=labels  # Keep class proportions
)

model = XGBClassifier(
    n_estimators=100,
    max_depth=10,
    learning_rate=0.1,
    colsample_bytree=0.8,
    subsample=0.8,
    random_state=42
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]
accuracy = accuracy_score(y_test, y_pred)
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
auc_score = auc(fpr, tpr)

def results_display(model_name: str):
    global Accuracy_comparison, auc_comparison

    print(model_name+f" Accuracy: {accuracy:.4f}")
    print(model_name+f" AUC: {auc_score:.4f}")
    Accuracy_comparison += model_name+f" Accuracy: {accuracy:.4f}\n"
    auc_comparison += model_name+f" AUC: {auc_score:.4f}\n"

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {auc_score:.4f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random classifier')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(model_name + ' ROC Curve')
    plt.legend(loc="lower right")
    plt.grid(True, alpha=0.3)
    plt.show()

    if(model_name == "BDT"):
        return
    
    plt.plot(model_history.history['loss'], label='Training Loss')
    plt.plot(model_history.history['val_loss'], label='Validation Loss')
    plt.title(model_name + ' Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True, alpha=0.3)

results_display("BDT")

In [None]:
#problem B
feature_importance = model.feature_importances_
top_10_indices = np.argsort(feature_importance)[-10:][::-1]  # Top 10 most important features

print("Top 10 F-score:")
for i, idx in enumerate(top_10_indices):
    print(f"{i+1:2d}. Feature {idx:2d}: {feature_importance[idx]:.6f}")

# Plot feature importance
plt.figure(figsize=(10, 6))
plt.bar(range(10), feature_importance[top_10_indices])
plt.xlabel('Top 10 Features')
plt.ylabel('F-score (Feature Importance)')
plt.title('Top 10 Most Important Features (F-score)')
plt.xticks(range(10), [f'Feature {idx}' for idx in top_10_indices], rotation=45)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Find the most important feature
most_important_idx = top_10_indices[0]
most_important_score = feature_importance[most_important_idx]

print(f"\nMost Important Feature: Feature {most_important_idx}")
print(f"F-score: {most_important_score:.6f}")

# Plot histogram of the most important feature
plt.figure(figsize=(12, 6))

# Get the most important feature values for test set
feature_values = X_test[:, most_important_idx]
signal_values = feature_values[y_test == 1]  # Signal (label = 1)
background_values = feature_values[y_test == 0]  # Background (label = 0)

# Calculate bin edges
min_val = np.min(feature_values)
max_val = np.max(feature_values)
bins = np.linspace(min_val, max_val, 101)  # 100 bins

# Plot histograms
plt.hist(signal_values, bins=bins, alpha=0.7, label='Signal', color='red', density=True)
plt.hist(background_values, bins=bins, alpha=0.7, label='Background', color='blue', density=True)

plt.xlabel(f'Feature {most_important_idx} Value')
plt.ylabel('Density')
plt.title(f'Distribution of Most Important Feature {most_important_idx}\n(Signal vs Background)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Statistical analysis of the most important feature
print(f"\nFeature {most_important_idx} Statistics:")
print(f"Signal (label=1):")
print(f"  Mean: {np.mean(signal_values):.4f}")

print(f"\nBackground (label=0):")
print(f"  Mean: {np.mean(background_values):.4f}")

print("\n Signal tends to have HIGHER values than background")


In [None]:
#problem C
from urllib.parse import _ResultMixinStr
tf.random.set_seed(42)
np.random.seed(42)

model = Sequential([
    Dense(128, activation='tanh', input_shape=(50,)),
    Dense(128, activation='tanh'),
    Dense(128, activation='tanh'),
    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer=SGD(learning_rate=0.01),
    loss=BinaryCrossentropy(),
    metrics=['accuracy']
)

model_history = model.fit(
    X_train, y_train,
    batch_size=128,
    epochs=50,
    validation_data=(X_test, y_test),
    verbose=1
)

y_pred = (y_pred_proba > 0.5).astype(int)
y_pred_proba = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
auc_score = auc(fpr, tpr)

results_display("NN+tanh")

In [None]:
#problem D
print("As seen below, without standardization, using the ReLU activation function makes it completely impossible to train.")

model = Sequential([
    Dense(128, activation='relu', input_shape=(50,)),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer=SGD(learning_rate=0.01),
    loss=BinaryCrossentropy(),
    metrics=['accuracy']
)

model_history = model.fit(
    X_train, y_train,
    batch_size=128,
    epochs=50,
    validation_data=(X_test, y_test),
    verbose=1
)

y_pred = (y_pred_proba > 0.5).astype(int)
y_pred_proba = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
auc_score = auc(fpr, tpr)

results_display("NN+ReLU")

In [None]:
#problem E
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = Sequential([
    Dense(128, activation='relu', input_shape=(50,)),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer=Adam(learning_rate=0.001),  # Adam optimizer with default learning rate
    loss=BinaryCrossentropy(),
    metrics=['accuracy']
)

model_history = model.fit(
    X_train, y_train,  # Use scaled training data
    batch_size=128,
    epochs=50,
    validation_data=(X_test, y_test),  # Use scaled test data
    verbose=1
)

y_pred = (y_pred_proba > 0.5).astype(int)
y_pred_proba = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
auc_score = auc(fpr, tpr)

results_display("NN+ReLU+Scaler+Adam")

In [None]:
#problem F
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import ReduceLROnPlateau

model = Sequential([
    Dense(200, activation='relu', input_shape=(50,), kernel_regularizer=l2(0.001)),
    Dropout(0.3),
    Dense(200, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.3),
    Dense(200, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.3),
    Dense(200, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.3),
    Dense(200, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.3),
    Dense(200, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.3),
    Dense(200, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.3),
    Dense(200, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.3),
    Dense(200, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.3),
    Dense(200, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer=Adam(learning_rate=0.001), 
    loss=BinaryCrossentropy(),
    metrics=['accuracy']
)

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    min_lr=1e-6,
    verbose=1
)

model_history = model.fit(
    X_train, y_train,  
    batch_size=128,
    epochs=100,
    validation_data=(X_test, y_test),
    callbacks=[reduce_lr],
    verbose=1
)

y_pred_proba = model.predict(X_test).flatten()
y_pred = (y_pred_proba > 0.5).astype(int)
accuracy = accuracy_score(y_test, y_pred)
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
auc_score = auc(fpr, tpr)

results_display("Deep_NN+ReLU+L2+Adam+LR_Decay")

In [12]:
#summary
print(Accuracy_comparison+"\n")
print(auc_comparison)

BDT Accuracy: 0.9446
NN+tanh Accuracy: 0.9446
NN+ReLU+Scaler+Adam Accuracy: 0.7194
Deep_NN+ReLU+L2+Adam+LR_Decay Accuracy: 0.9360


BDT AUC: 0.9850
NN+tanh AUC: 0.8882
NN+ReLU+Scaler+Adam AUC: 0.9776
Deep_NN+ReLU+L2+Adam+LR_Decay AUC: 0.9802

