In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2

# Load the raw Telco customer-churn workbook.
# NOTE: pandas needs the optional `openpyxl` package installed to read .xlsx files.
RAW_DATA_FILE = 'WA_Fn-UseC_-Telco-Customer-Churn.xlsx'
data = pd.read_excel(RAW_DATA_FILE)

ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.

In [None]:
# Preview the first 10 rows as a quick sanity check of the loaded frame.
data.head(10)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# 1 Descriptive statistics
# include='all' summarises non-numeric columns as well as numeric ones.
summary_stats = data.describe(include='all')
print("\nStatistik Deskriptif:")
print(summary_stats)

In [None]:
# 2 Correlation between features
# Only numeric columns can be correlated, so filter them out before calling corr().
correlation = data.select_dtypes(include=['number']).corr()

plt.figure(figsize=(12,8))
sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Heatmap Korelasi Antar Fitur')
plt.show()

In [None]:
# 3 Churn distribution
# Draw on an explicit Axes instead of relying on pyplot's implicit "current" figure.
fig, ax = plt.subplots(figsize=(6,4))
sns.countplot(x='Churn', data=data, ax=ax)
ax.set_title('Distribusi Churn')
ax.set_xlabel('Churn (0 = No, 1 = Yes)')
ax.set_ylabel('Jumlah Customer')
plt.show()

In [None]:
# 4 Distribution of numeric features
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']

# One row of three panels, one histogram per feature
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for panel, column_name in zip(axes, numeric_features):
    # Coerce the column to numeric in place; values that cannot be parsed become NaN.
    # NOTE: this mutates `data`, so every later cell sees the coerced column.
    data[column_name] = pd.to_numeric(data[column_name], errors='coerce')
    sns.histplot(data[column_name].dropna(), kde=True, bins=50, ax=panel)
    panel.set_title(f'Distribusi {column_name}')
    panel.set_xlabel(column_name)
    panel.set_ylabel('Jumlah')

# Keep titles and axis labels from overlapping
plt.tight_layout()
plt.show()

In [None]:
# 5 Categorical feature visualisation
categorical_features = ['gender', 'SeniorCitizen', 'Partner', 'Dependents',
                        'PhoneService', 'MultipleLines', 'InternetService',
                        'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                        'TechSupport', 'StreamingTV', 'StreamingMovies',
                        'Contract', 'PaperlessBilling', 'PaymentMethod']

# Grid layout: three panels per row, with as many rows as needed to fit every feature
n_features = len(categorical_features)
n_cols = 3
n_rows = int(np.ceil(n_features / n_cols))

fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 5, n_rows * 4))
axes = axes.flatten()  # 1-D view so panels can be paired with features directly

# One count plot per feature, bars ordered by descending frequency
for panel, column_name in zip(axes, categorical_features):
    bar_order = data[column_name].value_counts().index
    sns.countplot(x=column_name, data=data, order=bar_order, ax=panel)
    panel.set_title(f'Distribusi {column_name}')
    panel.set_xlabel(column_name)
    panel.set_ylabel('Jumlah Customer')
    panel.tick_params(axis='x', rotation=45)

# Panels beyond the last feature would render as empty frames; hide them
for spare_panel in axes[n_features:]:
    spare_panel.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np

# Check data types
print("Data types:")
print(data.dtypes)

# If a charge column is still stored as `object`, normalise it to float.
# Every value is converted to text first because the `.str` accessor returns NaN
# for non-string elements: a mixed column of real floats plus a few blank
# strings would otherwise lose all of its numeric values.
for charge_col in ('MonthlyCharges', 'TotalCharges'):
    if data[charge_col].dtype == 'object':
        as_text = (
            data[charge_col]
            .astype(str)
            .str.strip()
            .str.replace(',', '.', regex=False)  # decimal comma -> decimal point
        )
        # Blank or otherwise unparseable entries become NaN and are counted below.
        data[charge_col] = pd.to_numeric(as_text, errors='coerce')

# Check missing values before cleaning
print("Missing values before cleaning:")
print(data.isnull().sum())

# Fill missing values in TotalCharges with the median.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that chained in-place form is deprecated in recent pandas and
# does not modify `data` when copy-on-write is enabled.
data['TotalCharges'] = data['TotalCharges'].fillna(data['TotalCharges'].median())

# Check missing values after cleaning
print("Missing values after cleaning:")
print(data.isnull().sum())

In [None]:
# 2. Data Transformation
# Encode categorical data
from sklearn.preprocessing import LabelEncoder

categorical_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
                    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                    'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
                    'PaperlessBilling', 'PaymentMethod', 'Churn']

# A single encoder instance is re-fitted for every column, so after the loop
# `le` only holds the mapping of the last column processed ('Churn').
le = LabelEncoder()
for column_name in categorical_cols:
    encoded_values = le.fit_transform(data[column_name])
    data[column_name] = encoded_values
    print(data[column_name])

In [None]:
# 3. Data Reduction
# Keep only the columns of interest (example: customerID is dropped as not relevant).
data_reduced = data.drop('customerID', axis=1)
print(data_reduced.head(10))

In [None]:
# 4. Data Discretization
# Bin tenure into categories (example: New, Mid, Loyal)

bins = [0, 12, 48, 72]
labels = ['New', 'Mid', 'Loyal']
# include_lowest=True closes the first interval on the left. With the default
# (0, 12] interval a tenure of exactly 0 falls outside every bin and becomes
# NaN, which would then be label-encoded as a spurious extra category.
data_reduced['tenure_group'] = pd.cut(
    data_reduced['tenure'], bins=bins, labels=labels, include_lowest=True
)

# Encode tenure_group (re-uses the `le` encoder created in the encoding cell).
# NOTE: LabelEncoder orders classes alphabetically, so the integer codes do
# not follow the New < Mid < Loyal ordering.
data_reduced['tenure_group'] = le.fit_transform(data_reduced['tenure_group'])

print(data_reduced['tenure_group'].head(10))

In [None]:
# 5. Data Normalization
# Rescale the numeric columns with min-max scaling
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data_reduced[numeric_cols])
data_reduced[numeric_cols] = scaled_values

print(data_reduced[numeric_cols].head(10))

In [None]:
# 6. Feature Selection
# Keep (up to) the 20 highest-scoring features under the chi-squared test.
# chi2 requires non-negative inputs; the label-encoding and min-max scaling
# steps above are what make that hold here.
X = data_reduced.drop('Churn', axis=1)
y = data_reduced['Churn']

# Clamp k to the number of available columns so the cell keeps working if
# columns are removed upstream (a k larger than the feature count is
# rejected or warned about, depending on the scikit-learn version).
k_best = min(20, X.shape[1])
selector = SelectKBest(score_func=chi2, k=k_best)
X_new = selector.fit_transform(X, y)

selected_features = X.columns[selector.get_support(indices=True)]

print("Selected Features:")
print(selected_features)

In [None]:
# Final Data: the selected feature columns followed by the target
final_columns = list(selected_features) + ['Churn']
final_data = data_reduced[final_columns]

# Persist the preprocessed table for the modelling stage
final_data.to_csv('preprocessed_data.csv', index=False)

print("\nData preprocessing complete! Output saved to 'preprocessed_data.csv'")

In [None]:
# Preview the first rows of the table that was just written to disk.
final_data.head()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel, RFE
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
import warnings
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1_l2
import tensorflow as tf

# Suppress warnings
# NOTE: this silences every warning for the rest of the session, including
# deprecation and convergence warnings.
warnings.filterwarnings('ignore')

# Create the output directories if they don't exist
os.makedirs('models', exist_ok=True)
os.makedirs('plots', exist_ok=True)

# Check for GPU availability
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    print(f"GPU available: {physical_devices}")
    print(f"Using GPU: {tf.test.gpu_device_name()}")
else:
    print("No GPU available, using CPU")

# Set random seed for reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

# Load preprocessed data.
# On a missing file, print the hint and re-raise. Calling exit() here would
# shut the notebook kernel down and discard the traceback; re-raising stops
# the run just as reliably while keeping the session and the error visible.
try:
    data = pd.read_csv('preprocessed_data.csv')
except FileNotFoundError:
    print("Error: 'preprocessed_data.csv' not found.")
    raise
print(f"Data loaded successfully with shape: {data.shape}")

# Feature importance and selection function
def select_features(X, y, n_features=None):
    """Rank features by Random Forest importance and keep the top ones.

    Args:
        X: Feature DataFrame (column names are used as feature names).
        y: Target labels used to fit the ranking forest.
        n_features: Number of features to keep; defaults to 80% of the
            columns in ``X`` (rounded down).

    Returns:
        Tuple ``(X_selected, top_features)``: ``X`` restricted to the kept
        columns, and the list of kept column names in descending importance.

    Side effects:
        Prints progress messages and writes a bar chart of the 15 most
        important features to ``plots/feature_importance.png``.
    """
    if n_features is None:
        n_features = int(X.shape[1] * 0.8)

    print(f"Selecting top {n_features} features...")

    # Fit a forest purely to obtain impurity-based importances
    forest = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1)
    forest.fit(X, y)

    ranking = pd.DataFrame({
        'feature': X.columns,
        'importance': forest.feature_importances_
    })
    ranking = ranking.sort_values('importance', ascending=False)

    # Save (not show) the importance chart
    plt.figure(figsize=(10, 8))
    sns.barplot(x='importance', y='feature', data=ranking.head(15))
    plt.title('Top 15 Feature Importance')
    plt.tight_layout()
    plt.savefig('plots/feature_importance.png')
    plt.close()

    top_features = ranking['feature'].head(n_features).tolist()
    X_selected = X[top_features]

    print(f"Selected features: {top_features}")
    return X_selected, top_features

# Function to handle class imbalance
def balance_data(X, y, method='smote_tomek'):
    """Resample ``X``/``y`` with SMOTE or SMOTETomek to even out the classes.

    Args:
        X: Feature matrix.
        y: Class labels.
        method: ``'smote'`` or ``'smote_tomek'``.

    Returns:
        Tuple ``(X_balanced, y_balanced)`` produced by the sampler's
        ``fit_resample``.

    Raises:
        ValueError: If ``method`` is neither of the supported names.
    """
    print(f"Balancing data using {method}...")

    # Reject unknown method names up front
    if method not in ('smote', 'smote_tomek'):
        raise ValueError("Method must be 'smote' or 'smote_tomek'")

    sampler_cls = SMOTE if method == 'smote' else SMOTETomek
    sampler = sampler_cls(random_state=RANDOM_STATE)

    X_balanced, y_balanced = sampler.fit_resample(X, y)

    counts_before = pd.Series(y).value_counts().to_dict()
    counts_after = pd.Series(y_balanced).value_counts().to_dict()
    print(f"Original class distribution: {counts_before}")
    print(f"Balanced class distribution: {counts_after}")
    return X_balanced, y_balanced

# Data exploration and prep
print("\n--- Data exploration ---")
print(f"Dataset shape: {data.shape}")
print(f"Class distribution: {data['Churn'].value_counts().to_dict()}")
print(f"Class balance percentage: {data['Churn'].value_counts(normalize=True) * 100}")

# Check for missing values
missing_values = data.isnull().sum()
if missing_values.sum() > 0:
    print(f"Missing values found: \n{missing_values[missing_values > 0]}")
    # Handle missing values (simple median imputation)
    data = data.fillna(data.median())

# Detect outliers using IQR, but only on genuinely continuous columns.
# The loaded CSV stores label-encoded categoricals and binary flags as
# integers. Running the IQR rule on those is destructive: whenever one value
# covers at least 75% of the rows, Q1 == Q3 and IQR == 0, so every other
# value is "capped" onto the majority value and the column turns constant.
continuous_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
numerical_cols = [col for col in continuous_cols if col in data.columns]
for col in numerical_cols:
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = ((data[col] < lower_bound) | (data[col] > upper_bound)).sum()
    if outliers > 0:
        print(f"Column {col} has {outliers} outliers")
        # Cap outliers at the fences instead of removing the rows
        data[col] = data[col].clip(lower=lower_bound, upper=upper_bound)

# Separate features and target
X = data.drop('Churn', axis=1)
y = data['Churn']

# Hold out the test set FIRST (stratified to keep the real class ratio).
# Feature selection and resampling must only ever see training rows. If they
# run before the split, information about the test rows (and synthetic
# neighbours of them) leaks into training, and the test scores stop
# measuring performance on unseen customers.
X_train_full, X_test_full, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Feature selection, fitted on the training portion only
X_selected, selected_features = select_features(
    X_train_full, y_train_full, n_features=int(X.shape[1] * 0.8)
)
X_test = X_test_full[selected_features]

# Balance the training data only; the test set keeps its natural distribution
X_balanced, y_balanced = balance_data(X_selected, y_train_full, method='smote_tomek')

# Shuffle the balanced training set. The resampler's output is not shuffled,
# and Keras' `validation_split` (used when fitting the ANN) takes the LAST
# fraction of the rows, so an unshuffled frame could give a skewed
# validation slice.
# Assumes fit_resample returned pandas objects, as it does for pandas inputs.
shuffle_order = np.random.RandomState(RANDOM_STATE).permutation(len(X_balanced))
X_train = X_balanced.iloc[shuffle_order].reset_index(drop=True)
y_train = y_balanced.iloc[shuffle_order].reset_index(drop=True)

# Standardize the features (statistics learned from the training data only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save scaler and selected features
with open('models/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
with open('models/selected_features.pkl', 'wb') as f:
    pickle.dump(selected_features, f)

# Candidate classifiers. Per the original author, the hyperparameters were
# chosen with a target of roughly 85% accuracy.
# Keyword groups shared by several estimators are factored out below.
seeded = {'random_state': RANDOM_STATE}
leaf_limits = {'min_samples_split': 10, 'min_samples_leaf': 4}
balanced = {'class_weight': 'balanced'}

models = {
    'Logistic Regression': LogisticRegression(
        C=0.8, penalty='l2', solver='liblinear', **seeded
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=100, max_depth=10, **leaf_limits, **balanced, **seeded
    ),
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.05, max_depth=4, subsample=0.8,
        **leaf_limits, **seeded
    ),
    'SVM': SVC(
        C=1.0, kernel='rbf', gamma='scale', probability=True, **balanced, **seeded
    ),
    'Decision Tree': DecisionTreeClassifier(
        max_depth=6, **leaf_limits, **balanced, **seeded
    ),
    'Naive Bayes': GaussianNB(),
}

# Column-oriented accumulator: one list per metric, one entry per model
results = {
    column: []
    for column in ('Model', 'Train Accuracy', 'Test Accuracy', 'Gap', 'ROC AUC')
}

# Fit each classical model, score it on both splits, record the metrics and
# pickle the fitted estimator.
print("\n--- Training Machine Learning Models ---")
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_scaled, y_train)

    # Hard predictions on both splits
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    gap = abs(train_accuracy - test_accuracy)

    # ROC AUC needs class probabilities; fall back to a placeholder otherwise
    roc_auc = "N/A"
    if hasattr(model, "predict_proba"):
        roc_auc = roc_auc_score(y_test, model.predict_proba(X_test_scaled)[:, 1])

    # Append this model's row to the column-oriented results
    row = {
        'Model': name,
        'Train Accuracy': train_accuracy,
        'Test Accuracy': test_accuracy,
        'Gap': gap,
        'ROC AUC': roc_auc,
    }
    for column, value in row.items():
        results[column].append(value)

    print(f"\nClassification Report for {name} (Test Set):")
    print(classification_report(y_test, y_test_pred))
    print(f"{name} Train Accuracy: {train_accuracy:.4f}")
    print(f"{name} Test Accuracy: {test_accuracy:.4f}")
    print(f"{name} Accuracy Gap: {gap:.4f}")
    if roc_auc != "N/A":
        print(f"{name} ROC AUC: {roc_auc:.4f}")

    # Persist the fitted estimator
    model_filename = f"models/{name.replace(' ', '_')}_model.pkl"
    with open(model_filename, 'wb') as f:
        pickle.dump(model, f)
    print(f"Model {name} saved as {model_filename}")

# Build optimized ANN
print("\n--- Training Neural Network Model ---")
def create_ann_model(input_shape):
    """Build and compile the binary-classification network.

    Architecture: Dense(32, relu) -> BatchNorm -> Dropout(0.3) ->
    Dense(16, relu) -> BatchNorm -> Dropout(0.3) -> Dense(1, sigmoid).
    Both hidden Dense layers carry an L1+L2 kernel penalty of 0.001 each.

    Args:
        input_shape: Number of input features (an int, not a tuple).

    Returns:
        A compiled Keras ``Sequential`` model (Adam with learning rate 0.001,
        binary cross-entropy loss, accuracy metric).
    """
    model = Sequential()

    # Hidden block 1 (also declares the input width)
    model.add(Dense(32, activation='relu', input_shape=(input_shape,),
                    kernel_regularizer=l1_l2(l1=0.001, l2=0.001)))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))

    # Hidden block 2
    model.add(Dense(16, activation='relu', kernel_regularizer=l1_l2(l1=0.001, l2=0.001)))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))

    # Single sigmoid unit for the output
    model.add(Dense(1, activation='sigmoid'))

    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Create ANN model
ann = create_ann_model(X_train_scaled.shape[1])
ann.summary()

# Define callbacks for ANN training
# Stop once val_loss has not improved for 15 epochs and restore the best weights
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True,
    verbose=1
)

# Multiply the learning rate by 0.2 after 5 stagnant epochs, never going below 1e-4
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=5,
    min_lr=0.0001,
    verbose=1
)

# Train ANN on the GPU when one was detected, otherwise on the CPU.
# validation_split=0.2 holds out the last 20% of the training rows.
device_name = '/GPU:0' if physical_devices else '/CPU:0'
with tf.device(device_name):
    history = ann.fit(
        X_train_scaled, y_train,
        epochs=50,
        batch_size=32,
        validation_split=0.2,
        callbacks=[early_stopping, reduce_lr],
        verbose=1
    )

# Evaluate ANN
# Run inference on the test set once and derive the hard labels from the same
# probabilities (the original called predict() twice on X_test_scaled).
y_train_pred_ann = (ann.predict(X_train_scaled) > 0.5).astype(int).flatten()
y_test_prob_ann = ann.predict(X_test_scaled).flatten()
y_test_pred_ann = (y_test_prob_ann > 0.5).astype(int)

# Calculate metrics
ann_train_accuracy = accuracy_score(y_train, y_train_pred_ann)
ann_test_accuracy = accuracy_score(y_test, y_test_pred_ann)
ann_gap = abs(ann_train_accuracy - ann_test_accuracy)
ann_roc_auc = roc_auc_score(y_test, y_test_prob_ann)

# Store ANN results alongside the classical models
results['Model'].append('ANN')
results['Train Accuracy'].append(ann_train_accuracy)
results['Test Accuracy'].append(ann_test_accuracy)
results['Gap'].append(ann_gap)
results['ROC AUC'].append(ann_roc_auc)

# Print ANN results
print("\nClassification Report for ANN (Test Set):")
print(classification_report(y_test, y_test_pred_ann))
print(f"ANN Train Accuracy: {ann_train_accuracy:.4f}")
print(f"ANN Test Accuracy: {ann_test_accuracy:.4f}")
print(f"ANN Accuracy Gap: {ann_gap:.4f}")
print(f"ANN ROC AUC: {ann_roc_auc:.4f}")

# Save ANN model
ann.save('models/ANN_model.h5')
print("ANN model saved as models/ANN_model.h5")

# Plot the ANN training history: loss on the left, accuracy on the right.
# Both panels share the same layout, so they are drawn from one loop.
plt.figure(figsize=(12, 5))
for position, (metric_key, metric_label) in enumerate(
        (('loss', 'Loss'), ('accuracy', 'Accuracy')), start=1):
    plt.subplot(1, 2, position)
    plt.plot(history.history[metric_key], label=f'Train {metric_label}')
    plt.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_label}')
    plt.title(f'ANN {metric_label}')
    plt.xlabel('Epoch')
    plt.ylabel(metric_label)
    plt.legend()

plt.tight_layout()
plt.savefig('plots/ann_training_history.png')
plt.close()

# Build the results table, best test accuracy first
results_df = (
    pd.DataFrame(results)
    .sort_values('Test Accuracy', ascending=False)
    .reset_index(drop=True)
)

# Display comparison table
print("\nModel Performance Comparison:")
print(results_df)

# Long format (one row per model/dataset pair) for the grouped bar chart
results_melted = results_df.melt(id_vars='Model', value_vars=['Train Accuracy', 'Test Accuracy'],
                                var_name='Dataset', value_name='Accuracy')

fig, (accuracy_ax, gap_ax) = plt.subplots(2, 1, figsize=(14, 10))

# Top panel: train vs test accuracy
sns.barplot(x='Model', y='Accuracy', hue='Dataset', data=results_melted, ax=accuracy_ax)
accuracy_ax.set_title('Train vs Test Accuracy Comparison')
accuracy_ax.set_ylim(0.75, 1.0)  # Focus on the higher accuracy range
accuracy_ax.tick_params(axis='x', rotation=45)
accuracy_ax.grid(axis='y', linestyle='--', alpha=0.7)

# Bottom panel: absolute accuracy gap
sns.barplot(x='Model', y='Gap', data=results_df, ax=gap_ax)
gap_ax.set_title('Accuracy Gap (|Train - Test|)')
gap_ax.tick_params(axis='x', rotation=45)
gap_ax.grid(axis='y', linestyle='--', alpha=0.7)

fig.tight_layout()
fig.savefig('plots/model_comparison.png')
plt.close(fig)

# Pick the model to report: prefer one whose test accuracy lies in the
# 83%-87% window with a gap of at most 0.05 (smallest gap wins); if none
# qualifies, fall back to the model closest to the 85% target.
target_accuracy = 0.85
in_window = results_df['Test Accuracy'].between(0.83, 0.87)  # inclusive on both ends
small_gap = results_df['Gap'] <= 0.05
best_models = results_df[in_window & small_gap].sort_values('Gap')

if best_models.empty:
    # If no model is within the desired range, find the closest one
    results_df['Distance_from_target'] = abs(results_df['Test Accuracy'] - target_accuracy)
    best_model = results_df.sort_values(['Distance_from_target', 'Gap']).iloc[0]
    headline = "Best Model (closest to target accuracy)"
else:
    best_model = best_models.iloc[0]
    headline = "Best Model (closest to 85% with minimal gap)"

# The metric lines are the same for both outcomes
print(f"\n{headline}: {best_model['Model']}")
print(f"Train Accuracy: {best_model['Train Accuracy']:.4f}")
print(f"Test Accuracy: {best_model['Test Accuracy']:.4f}")
print(f"Accuracy Gap: {best_model['Gap']:.4f}")
if best_model['ROC AUC'] != "N/A":
    print(f"ROC AUC: {best_model['ROC AUC']:.4f}")

print("\nModel optimization completed successfully!")

In [None]:
# Signed gap (train minus test) for the fit analysis; unlike the absolute
# 'Gap' column it is negative when test accuracy exceeds train accuracy.
results_df['Accuracy Gap'] = results_df['Train Accuracy'].sub(results_df['Test Accuracy'])

# Function to determine fitting status
def determine_fit(train_acc, test_acc, gap, threshold_overfit=0.05, threshold_underfit=0.80):
    """Classify a model's fit from its accuracies and train-test gap.

    Args:
        train_acc: Accuracy on the training set.
        test_acc: Accuracy on the test set.
        gap: Train-minus-test accuracy difference.
        threshold_overfit: Gap strictly above this is "Overfitting".
        threshold_underfit: Both accuracies strictly below this is "Underfitting".

    Returns:
        "Underfitting", "Overfitting" or "Normal". The underfitting check
        takes precedence over the overfitting check.
    """
    if train_acc < threshold_underfit and test_acc < threshold_underfit:
        return "Underfitting"
    return "Overfitting" if gap > threshold_overfit else "Normal"

# Label every model row with its fit status
results_df['Fit Status'] = [
    determine_fit(train_acc, test_acc, signed_gap)
    for train_acc, test_acc, signed_gap in zip(
        results_df['Train Accuracy'], results_df['Test Accuracy'], results_df['Accuracy Gap']
    )
]

# 1. Grouped bar chart: train vs test accuracy per model
results_melted = results_df.melt(id_vars='Model', value_vars=['Train Accuracy', 'Test Accuracy'],
                                 var_name='Dataset', value_name='Accuracy')
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='Model', y='Accuracy', hue='Dataset', data=results_melted, ax=ax)
ax.set_title('Train vs Test Accuracy Comparison Across Models')
ax.set_ylim(0, 1)
ax.set_ylabel('Accuracy')
ax.set_xlabel('Model')
ax.legend(title='Dataset')
fig.tight_layout()
fig.savefig('train_test_accuracy_barplot.png')
plt.close(fig)

# 2. Line chart: signed accuracy gap per model, with the 5% overfitting threshold marked
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x='Model', y='Accuracy Gap', data=results_df, marker='o', label='Accuracy Gap', ax=ax)
ax.axhline(y=0.05, color='r', linestyle='--', label='Overfitting Threshold (5%)')
ax.set_title('Accuracy Gap (Train - Test) Across Models')
ax.set_ylabel('Accuracy Gap')
ax.set_xlabel('Model')
ax.legend()
fig.tight_layout()
fig.savefig('accuracy_gap_lineplot.png')
plt.close(fig)

# 3. ANN training history: loss curves (left) and accuracy curves (right)
plt.figure(figsize=(12, 4))
for position, (metric_key, metric_label) in enumerate(
        (('loss', 'Loss'), ('accuracy', 'Accuracy')), start=1):
    plt.subplot(1, 2, position)
    plt.plot(history.history[metric_key], label=f'Training {metric_label}')
    plt.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_label}')
    plt.title(f'ANN Training and Validation {metric_label}')
    plt.xlabel('Epoch')
    plt.ylabel(metric_label)
    plt.legend()

plt.tight_layout()
plt.savefig('ann_training_history.png')
plt.close()

# 4. Per-model fit report with a canned explanation for each status
fit_explanations = {
    'Overfitting': "Explanation: The model is overfitting. It performs well on the training data but poorly on the test data, indicating it has learned noise or specific patterns in the training set that do not generalize.",
    'Underfitting': "Explanation: The model is underfitting. Both training and test accuracies are low, indicating the model is too simple to capture the underlying patterns in the data.",
}
# Used for any status other than the two above
normal_explanation = "Explanation: The model has a normal fit. The training and test accuracies are close, and both are reasonably high, indicating good generalization."

print("\nFitting Status for Each Model:")
print("=" * 50)
for _, model_row in results_df.iterrows():
    print(f"Model: {model_row['Model']}")
    print(f"Train Accuracy: {model_row['Train Accuracy']:.4f}")
    print(f"Test Accuracy: {model_row['Test Accuracy']:.4f}")
    print(f"Accuracy Gap: {model_row['Accuracy Gap']:.4f}")
    print(f"Fit Status: {model_row['Fit Status']}")
    print(fit_explanations.get(model_row['Fit Status'], normal_explanation))
    print("-" * 50)

# 5. ANN fit analysis from the metrics recorded at the last trained epoch
print("\nANN Training History Analysis:")
print("=" * 50)
final_train_loss, final_val_loss, final_train_acc, final_val_acc = (
    history.history[key][-1] for key in ('loss', 'val_loss', 'accuracy', 'val_accuracy')
)
loss_gap = final_train_loss - final_val_loss
acc_gap = final_train_acc - final_val_acc

for metric_label, metric_value in (
    ("Final Training Loss", final_train_loss),
    ("Final Validation Loss", final_val_loss),
    ("Final Training Accuracy", final_train_acc),
    ("Final Validation Accuracy", final_val_acc),
    ("Loss Gap (Train - Val)", loss_gap),
    ("Accuracy Gap (Train - Val)", acc_gap),
):
    print(f"{metric_label}: {metric_value:.4f}")

# Decide the status first, then print it with its explanation.
# The overfitting check takes precedence over the underfitting check.
if final_val_loss > final_train_loss + 0.1 or acc_gap > 0.05:
    ann_status = "Overfitting"
    ann_explanation = "Explanation: The validation loss is significantly higher than the training loss, or the validation accuracy is much lower than the training accuracy, indicating overfitting."
elif final_train_acc < 0.80 and final_val_acc < 0.80:
    ann_status = "Underfitting"
    ann_explanation = "Explanation: Both training and validation accuracies are low, indicating the ANN is too simple to capture the data patterns."
else:
    ann_status = "Normal"
    ann_explanation = "Explanation: The training and validation metrics are close, and both are reasonably high, indicating good generalization."

print(f"ANN Fit Status: {ann_status}")
print(ann_explanation)
print("-" * 50)