# 🎯 Generic Binary Classification Template

## 📋 How to Use This Template

1. **Copy this notebook** to your project folder
2. **Update the Configuration Section** below with your dataset details
3. **Run cells sequentially** and customize as needed
4. **Remove/Add sections** based on your specific requirements

---

## ⚙️ CONFIGURATION - CUSTOMIZE FOR YOUR DATASET

In [None]:
# ----------------------------------------------------------------------------
# DATASET CONFIGURATION - edit the values in this cell for your own dataset
# ----------------------------------------------------------------------------

# Human-readable description of the task.
PROBLEM_NAME = "Binary Classification Problem"
BUSINESS_OBJECTIVE = "Describe your prediction goal"

# Location of the CSV file and the name of the label column.
DATASET_PATH = 'data/your_dataset.csv'  # TODO: Update path
TARGET_COLUMN = 'target'                # TODO: Target column name

# Column roles. When both feature lists are left empty, the Feature Analysis
# cell fills them in from the column dtypes.
NUMERIC_FEATURES = []      # e.g., ['age', 'income', 'score']
CATEGORICAL_FEATURES = []  # e.g., ['gender', 'category']
FEATURES_TO_DROP = []      # e.g., ['id', 'timestamp']

# Columns in which a value of 0 cannot occur for real; the preprocessing cell
# converts those zeros to NaN before imputing.
ZERO_AS_MISSING = []       # e.g., ['blood_pressure', 'glucose']

# Reproducibility seed, hold-out fractions (of the full dataset) and the
# number of cross-validation folds.
RANDOM_STATE = 42
TEST_SIZE = 0.15
VALIDATION_SIZE = 0.15
CV_FOLDS = 5

print("‚úÖ Configuration loaded!")

## 1️⃣ Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score
from imblearn.over_sampling import SMOTE
import joblib
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global random generator from the configured seed.
np.random.seed(RANDOM_STATE)

# Plot defaults shared by every figure in the notebook: seaborn grid theme
# first, then the default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

print("‚úÖ Libraries imported successfully!")

## 2️⃣ Load Dataset

In [None]:
# Read the CSV named in the configuration cell and report its dimensions.
df = pd.read_csv(DATASET_PATH)

n_rows, n_cols = df.shape
print(f"Dataset Shape: {df.shape}")
print(f"Rows: {n_rows:,} | Columns: {n_cols}")

## 3️⃣ Initial Data Understanding

In [None]:
# Preview the first few rows to sanity-check that the CSV parsed as expected.
df.head()

In [None]:
# Print the per-column summary (dtypes and non-null counts).
df.info()

In [None]:
# Descriptive statistics, transposed so each column of df becomes a row.
df.describe().T

## 4️⃣ Target Variable Analysis

In [None]:
# Absolute and relative frequency of each class in the target column.
target_counts = df[TARGET_COLUMN].value_counts()
target_percent = df[TARGET_COLUMN].value_counts(normalize=True) * 100

print("Target Distribution:")
print(target_counts)
print("\nPercentage:")
print(target_percent)

In [None]:
# Side-by-side view of the class balance: bar chart of counts on the left,
# pie chart of percentages on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
bar_ax, pie_ax = axes

df[TARGET_COLUMN].value_counts().plot.bar(ax=bar_ax)
bar_ax.set_title(f'Class Distribution - {TARGET_COLUMN}')

df[TARGET_COLUMN].value_counts().plot.pie(autopct='%1.1f%%', ax=pie_ax)
pie_ax.set_title('Class Distribution %')

plt.tight_layout()
plt.show()

## 5️⃣ Missing Data Analysis

In [None]:
# Per-column count and percentage of missing values, keeping only columns
# that actually have gaps, worst first.
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100

missing_df = (
    pd.DataFrame({'Missing': missing, 'Percentage': missing_pct})
    .loc[lambda table: table['Missing'] > 0]
    .sort_values('Missing', ascending=False)
)

if missing_df.empty:
    print("‚úÖ No missing values found!")
else:
    print(missing_df)

## 6️⃣ Feature Analysis

In [None]:
# When neither feature list was configured, derive both from the column
# dtypes: numeric dtypes go to NUMERIC_FEATURES, everything else to
# CATEGORICAL_FEATURES. The target and FEATURES_TO_DROP are left out.
if not (NUMERIC_FEATURES or CATEGORICAL_FEATURES):
    excluded = {TARGET_COLUMN, *FEATURES_TO_DROP}
    feature_cols = [column for column in df.columns if column not in excluded]
    candidates = df[feature_cols]
    NUMERIC_FEATURES = candidates.select_dtypes(include=[np.number]).columns.tolist()
    CATEGORICAL_FEATURES = candidates.select_dtypes(exclude=[np.number]).columns.tolist()

for kind, names in (("Numeric", NUMERIC_FEATURES), ("Categorical", CATEGORICAL_FEATURES)):
    print(f"{kind} Features ({len(names)}): {names}")

In [None]:
# Distribution plots: one histogram per numeric feature on a 3-column grid.
if NUMERIC_FEATURES:
    n = len(NUMERIC_FEATURES)
    cols = 3
    rows = (n + cols - 1) // cols  # ceiling division: rows needed for n panels
    fig, axes = plt.subplots(rows, cols, figsize=(15, rows * 4))
    # With cols == 3 the grid is always an array of Axes (1-D for one row,
    # 2-D otherwise). Flatten it unconditionally; wrapping it in a list for
    # n == 1 would make axes[0] an array instead of a single Axes.
    axes = np.atleast_1d(axes).ravel()

    for ax, feat in zip(axes, NUMERIC_FEATURES):
        df[feat].hist(bins=30, ax=ax, edgecolor='black')
        ax.set_title(feat)

    # Hide the unused panels in the last row.
    for ax in axes[n:]:
        ax.set_visible(False)

    plt.suptitle('Feature Distributions', y=1.00)
    plt.tight_layout()
    plt.show()

## 7️⃣ Bivariate Analysis

In [None]:
# Feature vs Target analysis: per numeric feature, overlay one histogram per
# target class on a 3-column grid.
if NUMERIC_FEATURES:
    n = len(NUMERIC_FEATURES)
    cols = 3
    rows = (n + cols - 1) // cols  # ceiling division: rows needed for n panels
    fig, axes = plt.subplots(rows, cols, figsize=(15, rows * 4))
    # With cols == 3 the grid is always an array of Axes (1-D for one row,
    # 2-D otherwise). Flatten it unconditionally; wrapping it in a list for
    # n == 1 would make axes[0] an array instead of a single Axes.
    axes = np.atleast_1d(axes).ravel()

    # Skip a NaN "class": comparing against NaN never matches any row.
    class_labels = df[TARGET_COLUMN].dropna().unique()

    for ax, feat in zip(axes, NUMERIC_FEATURES):
        for label in class_labels:
            subset = df[df[TARGET_COLUMN] == label]
            # Drop missing values explicitly rather than relying on
            # matplotlib's handling of NaN inside hist().
            ax.hist(subset[feat].dropna(), alpha=0.6, label=f'{TARGET_COLUMN}={label}', bins=20)
        ax.set_title(feat)
        ax.legend()

    # Hide the unused panels in the last row.
    for ax in axes[n:]:
        ax.set_visible(False)

    plt.suptitle(f'Features vs {TARGET_COLUMN}', y=1.00)
    plt.tight_layout()
    plt.show()

## 8️⃣ Correlation Analysis

In [None]:
# Correlation heatmap of the numeric features plus the target, followed by
# the features ranked by their correlation with the target.
if NUMERIC_FEATURES:
    # De-duplicate while keeping order, in case the target was also listed
    # in NUMERIC_FEATURES (duplicate labels would break corr[TARGET_COLUMN]).
    corr_columns = list(dict.fromkeys(NUMERIC_FEATURES + [TARGET_COLUMN]))
    # Restrict to numeric/bool columns so a string-valued target does not
    # make .corr() fail; such a target is simply left out of the matrix.
    corr = df[corr_columns].select_dtypes(include=[np.number, 'bool']).corr()

    plt.figure(figsize=(12, 10))
    sns.heatmap(corr, annot=True, cmap='coolwarm', center=0, fmt='.2f')
    plt.title('Correlation Matrix')
    plt.tight_layout()
    plt.show()

    if TARGET_COLUMN in corr.columns:
        print(f"\nCorrelation with {TARGET_COLUMN}:")
        print(corr[TARGET_COLUMN].sort_values(ascending=False))
    else:
        print(f"\n{TARGET_COLUMN} is not numeric; skipping correlation with the target.")

## 9️⃣ Data Preprocessing

In [None]:
# Work on a copy so the raw DataFrame stays available for the EDA cells.
df_processed = df.copy()

# Treat "impossible" zeros as missing values in the configured columns.
if ZERO_AS_MISSING:
    for col in ZERO_AS_MISSING:
        if col in df_processed.columns:
            zero_mask = df_processed[col] == 0
            # Series.mask() returns a new column with NaN where the mask is
            # True; assigning the whole column back avoids relying on
            # implicit dtype upcasting during an in-place .loc assignment.
            df_processed[col] = df_processed[col].mask(zero_mask)
            print(f"Replaced {zero_mask.sum()} zeros with NaN in {col}")

# Fill missing values in every numeric column with that column's median.
# NOTE(review): the median is computed on the full dataset before the
# train/validation/test split, and the numeric selection includes the target
# column when it is numeric. Non-numeric columns are not imputed here.
if df_processed.isnull().sum().sum() > 0:
    for col in df_processed.select_dtypes(include=[np.number]).columns:
        if df_processed[col].isnull().sum() > 0:
            median_val = df_processed[col].median()
            # Assign the result back instead of fillna(inplace=True) on a
            # column selection: that form is chained assignment and may not
            # update df_processed under pandas copy-on-write.
            df_processed[col] = df_processed[col].fillna(median_val)
            print(f"Filled {col} with median: {median_val:.2f}")

print(f"\n‚úÖ Preprocessing complete!")
print(f"Missing values remaining: {df_processed.isnull().sum().sum()}")

## 🔟 Feature Engineering

In [None]:
# Placeholder cell: derive any extra columns on df_processed here.
# Example: df_processed['new_feature'] = df_processed['feat1'] / df_processed['feat2']

n_columns = len(df_processed.columns)
print("Feature engineering step - customize as needed")
print(f"Current features: {n_columns}")

## 1️⃣1️⃣ Train-Validation-Test Split

In [None]:
# Split df_processed into predictors X and label y; the target and any
# configured FEATURES_TO_DROP are removed from X (absent names are ignored).
y = df_processed[TARGET_COLUMN]
X = df_processed.drop(columns=[TARGET_COLUMN] + FEATURES_TO_DROP, errors='ignore')

# Stage 1: carve off the stratified test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
)

# Stage 2: carve the validation set out of the remainder. VALIDATION_SIZE is
# a fraction of the full dataset, so it is rescaled to a fraction of what is
# left after the test rows were removed.
val_size_adjusted = VALIDATION_SIZE / (1 - TEST_SIZE)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=val_size_adjusted, stratify=y_temp, random_state=RANDOM_STATE
)

for split_name, features in (("Training", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{split_name} set: {features.shape}")

print()
for split_name, labels in (("Train", y_train), ("Val", y_val), ("Test", y_test)):
    print(f"Class distribution - {split_name}: {labels.value_counts().to_dict()}")

## 1️⃣2️⃣ Feature Scaling

In [None]:
# Fit the scaler on the training split only, then apply the same fitted
# transform to all three splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

scaled_mean = X_train_scaled.mean()
scaled_std = X_train_scaled.std()

print("‚úÖ Feature scaling complete!")
print(f"Mean after scaling: {scaled_mean:.6f}")
print(f"Std after scaling: {scaled_std:.6f}")

## 1️⃣3️⃣ Baseline Model

In [None]:
from sklearn.dummy import DummyClassifier

# Reference point: a classifier using the 'most_frequent' strategy, scored
# by accuracy on the validation split.
baseline = DummyClassifier(strategy='most_frequent', random_state=RANDOM_STATE).fit(
    X_train_scaled, y_train
)
baseline_acc = accuracy_score(y_val, baseline.predict(X_val_scaled))

print(f"Baseline Accuracy (most frequent): {baseline_acc:.4f}")
print("Models must beat this baseline to be useful!")

## 1️⃣4️⃣ Train Multiple Models

In [None]:
# Candidate classifiers, keyed by display name. Each is fitted on the scaled
# training split and scored on the scaled validation split.
models = {
    'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE, n_estimators=100),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'SVM': SVC(random_state=RANDOM_STATE, probability=True),
    'KNN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB()
}

# name -> {'accuracy': float, 'roc_auc': float or None, 'model': fitted estimator}
results = {}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_val_scaled)
    # Column 1 of predict_proba is used as the positive-class score.
    y_pred_proba = model.predict_proba(X_val_scaled)[:, 1] if hasattr(model, 'predict_proba') else None

    acc = accuracy_score(y_val, y_pred)
    roc = roc_auc_score(y_val, y_pred_proba) if y_pred_proba is not None else None

    results[name] = {'accuracy': acc, 'roc_auc': roc, 'model': model}
    # Format the score before interpolating it: a conditional placed after the
    # ':' in an f-string field is read as part of the format spec and raises
    # ValueError at runtime.
    roc_text = f"{roc:.4f}" if roc is not None else "N/A"
    print(f"{name}: Accuracy={acc:.4f}, ROC-AUC={roc_text}")

print("\n‚úÖ All models trained!")

In [None]:
# Tabulate the validation metrics (a missing ROC-AUC is shown as 0) and rank
# the models by ROC-AUC, best first.
results_df = pd.DataFrame(
    [
        {'Model': name, 'Accuracy': entry['accuracy'], 'ROC-AUC': entry['roc_auc'] or 0}
        for name, entry in results.items()
    ],
    columns=['Model', 'Accuracy', 'ROC-AUC'],
).sort_values('ROC-AUC', ascending=False)

# One bar chart per metric; only the ROC-AUC panel overrides the bar colour.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
panels = (('Accuracy', {}), ('ROC-AUC', {'color': 'orange'}))
for ax, (metric, extra_style) in zip(axes, panels):
    results_df.plot(x='Model', y=metric, kind='bar', ax=ax, legend=False, **extra_style)
    ax.set_title(f'Model {metric} Comparison')
    ax.set_ylabel(metric)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

print("\nModel Rankings:")
print(results_df)

## 1️⃣5️⃣ Select Best Model & Detailed Evaluation

In [None]:
# Pick the model with the highest validation ROC-AUC; a missing score
# counts as 0.
best_model_name = max(results, key=lambda name: results[name]['roc_auc'] or 0)
best_model = results[best_model_name]['model']

print(f"üèÜ Best Model: {best_model_name}")
print(f"\nValidation Performance:")

# Per-class precision/recall/F1 of the winner on the validation split.
y_pred = best_model.predict(X_val_scaled)
validation_report = classification_report(y_val, y_pred)
print(validation_report)

In [None]:
# Confusion matrix of the best model's validation predictions, drawn as an
# annotated heatmap.
cm = confusion_matrix(y_val, y_pred)

plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set_title(f'Confusion Matrix - {best_model_name}')
heatmap_ax.set_ylabel('True Label')
heatmap_ax.set_xlabel('Predicted Label')
plt.show()

In [None]:
# ROC curve of the best model on the validation split (only for models that
# expose predict_proba).
if hasattr(best_model, 'predict_proba'):
    # Column 1 of predict_proba is used as the positive-class score.
    y_pred_proba = best_model.predict_proba(X_val_scaled)[:, 1]
    fpr, tpr, _ = roc_curve(y_val, y_pred_proba)
    # Use roc_auc_score, which this notebook already imports; the previous
    # call to auc() referenced a name that was never imported (NameError).
    roc_auc = roc_auc_score(y_val, y_pred_proba)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.3f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random')  # chance diagonal
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {best_model_name}')
    plt.legend()
    plt.grid(True)
    plt.show()

## 1️⃣6️⃣ Cross-Validation

In [None]:
# Stratified, shuffled k-fold ROC-AUC of the best model on the scaled
# training split.
cv = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)
cv_scores = cross_val_score(best_model, X_train_scaled, y_train, scoring='roc_auc', cv=cv)

cv_mean = cv_scores.mean()
cv_std = cv_scores.std()

print(f"Cross-Validation ROC-AUC Scores: {cv_scores}")
print(f"Mean: {cv_mean:.4f}")
print(f"Std: {cv_std:.4f}")

# Per-fold scores as bars, with the mean drawn as a dashed reference line.
fold_numbers = list(range(1, CV_FOLDS + 1))
plt.figure(figsize=(8, 5))
plt.bar(fold_numbers, cv_scores)
plt.axhline(y=cv_mean, color='r', linestyle='--', label=f'Mean: {cv_mean:.4f}')
plt.xlabel('Fold')
plt.ylabel('ROC-AUC Score')
plt.title(f'{CV_FOLDS}-Fold Cross-Validation')
plt.legend()
plt.show()

## 1️⃣7️⃣ Feature Importance

In [None]:
# Rank features by the best model's feature_importances_ when it has them,
# otherwise by the absolute value of its first row of coefficients; report
# that neither is available for other model types.
uses_importances = hasattr(best_model, 'feature_importances_')
uses_coefficients = (not uses_importances) and hasattr(best_model, 'coef_')

if not (uses_importances or uses_coefficients):
    print("Feature importance not available for this model type.")
else:
    if uses_importances:
        value_column = 'Importance'
        chart_title = f'Feature Importance - {best_model_name}'
        importance = pd.DataFrame({
            'Feature': X.columns,
            value_column: best_model.feature_importances_
        }).sort_values(value_column, ascending=False)
        bar_style = {}
    else:
        value_column = 'Coefficient'
        chart_title = f'Feature Coefficients - {best_model_name}'
        importance = pd.DataFrame({
            'Feature': X.columns,
            value_column: best_model.coef_[0]
        }).sort_values(value_column, ascending=False, key=abs)
        # Negative coefficients in red, all others in green.
        bar_style = {
            'color': ['red' if weight < 0 else 'green' for weight in importance[value_column]]
        }

    plt.figure(figsize=(10, 6))
    plt.barh(importance['Feature'], importance[value_column], **bar_style)
    plt.xlabel(value_column)
    plt.title(chart_title)
    plt.gca().invert_yaxis()  # first-ranked feature at the top
    plt.tight_layout()
    plt.show()

    print("\nTop Features:")
    print(importance.head(10))

## 1️⃣8️⃣ Handle Class Imbalance (Optional)

In [None]:
# Ratio of the first to the second entry of the training-label counts
# (value_counts lists the most frequent class first).
class_counts = y_train.value_counts()
majority_count, minority_count = class_counts.iloc[0], class_counts.iloc[1]
imbalance_ratio = majority_count / minority_count

if not imbalance_ratio > 1.5:
    print("‚úÖ Classes are reasonably balanced.")
else:
    print(f"‚ö†Ô∏è Class imbalance detected: {imbalance_ratio:.2f}:1")
    print("Applying SMOTE...")

    # Oversample the scaled training split only; validation data is untouched.
    smote = SMOTE(random_state=RANDOM_STATE)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

    # Build an unfitted estimator of the same class with the same
    # hyper-parameters, and train it on the resampled data.
    model_class = type(best_model)
    best_model_balanced = model_class(**best_model.get_params())
    best_model_balanced.fit(X_train_balanced, y_train_balanced)

    y_pred_balanced = best_model_balanced.predict(X_val_scaled)
    print("\nBalanced Model Performance:")
    print(classification_report(y_val, y_pred_balanced))

    # Validation ROC-AUC of the original model, then of the retrained one.
    roc_original, roc_balanced = (
        roc_auc_score(y_val, candidate.predict_proba(X_val_scaled)[:, 1])
        for candidate in (best_model, best_model_balanced)
    )

    print(f"\nROC-AUC Original: {roc_original:.4f}")
    print(f"ROC-AUC Balanced: {roc_balanced:.4f}")

    # Switch to the retrained model only when it scores strictly higher.
    if roc_balanced > roc_original:
        print("‚úÖ Using balanced model!")
        best_model = best_model_balanced

## 1️⃣9️⃣ Final Test Set Evaluation

In [None]:
print("üéØ Final Test Set Evaluation")
print("=" * 50)

y_test_pred = best_model.predict(X_test_scaled)
y_test_proba = best_model.predict_proba(X_test_scaled)[:, 1] if hasattr(best_model, 'predict_proba') else None

print(f"\nModel: {best_model_name}")
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

if y_test_proba is not None:
    test_roc_auc = roc_auc_score(y_test, y_test_proba)
    print(f"\nTest ROC-AUC: {test_roc_auc:.4f}")

# Confusion Matrix
cm = confusion_matrix(y_test, y_test_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title(f'Test Set Confusion Matrix - {best_model_name}')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

## 2️⃣0️⃣ Save Model

In [None]:
# Write the fitted model and scaler to the working directory, then confirm
# each file.
fitted_artifacts = {
    'Model': ('best_model.pkl', best_model),
    'Scaler': ('scaler.pkl', scaler),
}
for artifact_path, artifact in fitted_artifacts.values():
    joblib.dump(artifact, artifact_path)

for artifact_label, (artifact_path, _) in fitted_artifacts.items():
    print(f"‚úÖ {artifact_label} saved as '{artifact_path}'")

# Store the column names of X as well, so new data can be selected with the
# same columns later.
feature_names = X.columns.tolist()
joblib.dump(feature_names, 'feature_names.pkl')
print("‚úÖ Feature names saved as 'feature_names.pkl'")

## 2️⃣1️⃣ Conclusion

### Summary
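- **Best Model:** _fill in from the "Best Model" line printed in Section 15_
- **Test Accuracy:** _fill in from the classification report in Section 19_
- **Test ROC-AUC:** _fill in from the "Test ROC-AUC" line printed in Section 19_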

### Next Steps
1. Deploy model to production
2. Monitor model performance over time
3. Retrain with new data periodically
4. Consider ensemble methods for improvement
5. Implement explainability tools (SHAP, LIME)

### How to Use Saved Model
```python
import joblib
import pandas as pd

# Load model
# NOTE: joblib files are pickle-based; only load artifacts you created or trust.
model = joblib.load('best_model.pkl')
scaler = joblib.load('scaler.pkl')
feature_names = joblib.load('feature_names.pkl')

# Make predictions
new_data = pd.DataFrame(...)  # Your new data
# Apply the same preprocessing as in Section 9 (zero-as-missing handling,
# imputation) before scaling, and keep the columns in the saved order.
new_data_scaled = scaler.transform(new_data[feature_names])
predictions = model.predict(new_data_scaled)
# Column 1 is the score this notebook uses as the positive-class probability.
probabilities = model.predict_proba(new_data_scaled)
```