In [1]:
import random
import numpy as np
import pandas as pd

# Scikit-learn core utilities
from sklearn.base import BaseEstimator, ClassifierMixin, _fit_context
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.multiclass import check_classification_targets

# Preprocessing
from sklearn.preprocessing import LabelEncoder

# Model and evaluation
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

## Data processing
The data is processed as described in the other two notebooks.

In [2]:
# Seed Python's `random` module with the sum of the student IDs so that the
# feature sample drawn below is reproducible. Only `random` is seeded here;
# NumPy's global generator is not.
student_ids = [8110, 8000, 7497]
random_seed = sum(student_ids)
random.seed(random_seed)

# Input: the full dataset. Output: the reduced dataset (10 features + target).
input_csv = "data.csv"     
output_csv = "my_data.csv"  

# Load the full dataset
df = pd.read_csv(input_csv)

# The target column is excluded from the pool of candidate features
target = 'smoking'
feature_candidates = df.columns.drop(target).tolist()

# Draw 10 distinct feature names (without replacement) from the candidates
selected_features = random.sample(feature_candidates, 10)
print(selected_features)  # Printed before the target is appended, so only features are shown

# Append the target column back so it is saved alongside the features
selected_features.append(target)

# Copy the selected columns into a new frame and write it without the index column
selected_df = df[selected_features].copy()
selected_df.to_csv(output_csv, index=False)  # Ready for train/valid/test splitting


['fasting blood sugar', 'HDL', 'AST', 'age', 'ALT', 'Urine protein', 'Cholesterol', 'hearing(right)', 'dental caries', 'hearing(left)']


In [3]:
# Reload the reduced dataset from disk and discard the leftover index
# column ('Unnamed: 0') when the file happens to contain one.
df = pd.read_csv('my_data.csv')
df = df.drop(columns=['Unnamed: 0'], errors='ignore')


In [4]:
def remove_outliers(df, feature):
    """Drop, in place, rows whose ``feature`` value lies outside the 10th-90th percentile band.

    Columns with exactly two distinct values are treated as binary and left
    untouched. After dropping, the index of ``df`` is reset and a short
    summary is printed. The function always returns ``None``; the caller's
    frame is modified directly.
    """
    column = df[feature]
    if column.nunique() == 2:
        return

    rows_before = len(df)

    low = column.quantile(0.10)
    high = column.quantile(0.90)
    inside_band = column.between(low, high)  # inclusive on both ends

    df.drop(index=df.index[~inside_band], inplace=True)
    df.reset_index(drop=True, inplace=True)

    rows_after = len(df)
    print(f"Removed {rows_before - rows_after} outliers from '{feature}'")
    print(f"New DataFrame shape: {df.shape}")

def mean_normalize_all(df):
    """Standardize, in place, every non-binary, non-target column of ``df``.

    Each processed column becomes ``(x - mean) / std`` using pandas' sample
    standard deviation. The column named 'smoking' and any column with
    exactly two distinct values are skipped. A one-line summary is printed
    for every processed column.

    A column whose standard deviation is zero (constant values) or undefined
    (a single row) is only centred, so it becomes all zeros instead of NaN.
    """
    for feature in df.columns:
        if feature == 'smoking' or df[feature].nunique() == 2:
            continue
        centered = df[feature] - df[feature].mean()
        spread = df[feature].std()
        # `spread > 0` is False for both 0 and NaN, which are exactly the cases
        # where the division would poison the column.
        df[feature] = centered / spread if spread > 0 else centered
        print(f"[{feature}] Mean normalized → Max: {df[feature].max():.4f}, Min: {df[feature].min():.4f}")

def minmax_normalize_all(df):
    """Rescale, in place, every non-binary, non-target column of ``df`` to [0, 1].

    Each processed column becomes ``(x - min) / (max - min)``. The column
    named 'smoking' and any column with exactly two distinct values are
    skipped. A one-line summary is printed for every processed column.

    A constant column (``max == min``) is only shifted, so it becomes all
    zeros instead of NaN from a 0/0 division.
    """
    for feature in df.columns:
        if feature == 'smoking' or df[feature].nunique() == 2:
            continue
        min_val, max_val = df[feature].min(), df[feature].max()
        value_range = max_val - min_val
        shifted = df[feature] - min_val
        # Guard against a zero (or NaN) range before dividing.
        df[feature] = shifted / value_range if value_range > 0 else shifted
        print(f"[{feature}] MinMax normalized → Max: {df[feature].max():.4f}, Min: {df[feature].min():.4f}")

def max_abs_normalize_all(df):
    """Rescale, in place, every non-binary, non-target column of ``df`` by its largest magnitude.

    Each processed column becomes ``x / max(|x|)``, so its values end up in
    [-1, 1]. The column named 'smoking' and any column with exactly two
    distinct values are skipped. A one-line summary is printed for every
    processed column.

    The divisor is the maximum of the absolute values, not the absolute value
    of the maximum: for a column such as [-4, 2] the scale is 4, not 2.
    An all-zero column is left unchanged rather than divided by zero.
    """
    for feature in df.columns:
        if feature == 'smoking' or df[feature].nunique() == 2:
            continue
        max_val = df[feature].abs().max()
        if max_val > 0:  # False for 0 and NaN: nothing sensible to scale by
            df[feature] = df[feature] / max_val
        print(f"[{feature}] MaxAbs normalized → Max: {df[feature].max():.4f}, Min: {df[feature].min():.4f}")


In [5]:

def remove_outliers(df, feature):
    """Return the rows of ``df`` whose ``feature`` value lies within the Tukey fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` (both inclusive),
    where Q1/Q3 are the 25th/75th percentiles of the column. The input frame
    is not modified and the original index labels are kept on the result.
    """
    values = df[feature]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    whisker = 1.5 * (q3 - q1)
    within_fences = values.between(q1 - whisker, q3 + whisker)
    return df[within_fences]

# Normalization helpers used by the cleaning pipeline below.
def mean_normalize_all(df, exclude=('smoking',)):
    """Standardize, in place, the numeric columns of ``df``: ``(x - mean) / std``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. Non-numeric columns are ignored.
    exclude : collection of column names, default ``('smoking',)``
        Columns that must not be rescaled. The default protects the class
        label, which would otherwise be turned into arbitrary floats. Pass
        ``exclude=()`` to rescale every numeric column.

    A column whose standard deviation is zero or undefined is only centred
    (it becomes all zeros) instead of being filled with NaN.
    """
    for column in df.select_dtypes(include=[np.number]).columns:
        if column in exclude:
            continue
        centered = df[column] - df[column].mean()
        spread = df[column].std()
        # `spread > 0` is False for both 0 and NaN.
        df[column] = centered / spread if spread > 0 else centered

def minmax_normalize_all(df, exclude=('smoking',)):
    """Rescale, in place, the numeric columns of ``df`` to [0, 1]: ``(x - min) / (max - min)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. Non-numeric columns are ignored.
    exclude : collection of column names, default ``('smoking',)``
        Columns that must not be rescaled. The default protects the class
        label from being relabelled. Pass ``exclude=()`` to rescale every
        numeric column.

    A constant column (``max == min``) is only shifted, so it becomes all
    zeros instead of NaN from a 0/0 division.
    """
    for column in df.select_dtypes(include=[np.number]).columns:
        if column in exclude:
            continue
        low = df[column].min()
        span = df[column].max() - low
        shifted = df[column] - low
        # Guard against a zero (or NaN) range before dividing.
        df[column] = shifted / span if span > 0 else shifted

def max_abs_normalize_all(df, exclude=('smoking',)):
    """Rescale, in place, the numeric columns of ``df`` by their largest magnitude: ``x / max(|x|)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. Non-numeric columns are ignored.
    exclude : collection of column names, default ``('smoking',)``
        Columns that must not be rescaled. The default protects the class
        label from being relabelled. Pass ``exclude=()`` to rescale every
        numeric column.

    An all-zero column is left unchanged rather than divided by zero.
    """
    for column in df.select_dtypes(include=[np.number]).columns:
        if column in exclude:
            continue
        scale = df[column].abs().max()
        if scale > 0:  # False for 0 and NaN: nothing sensible to scale by
            df[column] = df[column] / scale

# `df` is the frame loaded from 'my_data.csv' in the previous cell.
# NOTE: this cell rebinds and mutates `df`, so re-running it without first
# re-running the loading cell applies the cleaning a second time.
print("Initial DataFrame Shape:", df.shape)

# Remove IQR outliers from the selected features, one feature at a time.
# NOTE(review): the recorded run reports that 'LDL' is not in the frame, so
# only 'ALT' is actually filtered.
for feature in ['ALT', 'LDL']:
    if feature in df.columns:
        print(f"Removing outliers from {feature}...")
        df = remove_outliers(df, feature)
    else:
        print(f"Feature '{feature}' not found in the DataFrame.")

# Drop irrelevant or unwanted features (columns); names that are absent are skipped
unwanted_columns = ['hearing(right)', 'Cholesterol']
df.drop(columns=[col for col in unwanted_columns if col in df.columns], inplace=True, errors='ignore')

# Check the DataFrame after cleaning
print("DataFrame after removing outliers and dropping unwanted columns:", df.shape)

# Create separate copies so each normalization technique works on its own frame
df_mean_norm = df.copy()
df_minmax_norm = df.copy()
df_maxabs_norm = df.copy()

# Apply normalization methods (each helper modifies the frame it is given)
print("Applying Mean Normalization...")
mean_normalize_all(df_mean_norm)

print("Applying Min-Max Normalization...")
minmax_normalize_all(df_minmax_norm)

print("Applying Max-Abs Normalization...")
max_abs_normalize_all(df_maxabs_norm)

# Use min-max normalized data as the final DataFrame.
# The mean- and max-abs-normalized copies are not used again in the visible
# part of the notebook.
# NOTE(review): the scaling statistics are computed on the full frame, and the
# train/validation/test split only happens afterwards, so validation and test
# rows influence the scaling. Consider computing them on the training split only.
df = df_minmax_norm.copy()

# Final check
print("Final DataFrame Shape after Normalization:", df.shape)


Initial DataFrame Shape: (159256, 11)
Removing outliers from ALT...
Feature 'LDL' not found in the DataFrame.
DataFrame after removing outliers and dropping unwanted columns: (152510, 9)
Applying Mean Normalization...
Applying Min-Max Normalization...
Applying Max-Abs Normalization...
Final DataFrame Shape after Normalization: (152510, 9)


## Split dataset
 * 70% training
 * 15% validation
 * 15% test

In [6]:
# Features are every column except the label; the label is the 'smoking' column.
X = df.drop(columns='smoking')
y = df['smoking']

# First cut: 70% training, 30% held out, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Second cut: halve the held-out part into validation and test (15% each),
# again stratified on the label.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Give every subset a fresh 0..n-1 index so features and labels stay aligned by position.
for subset in (X_train, y_train, X_valid, y_valid, X_test, y_test):
    subset.reset_index(drop=True, inplace=True)


### Class definition: Bagging classifier

In [7]:
class BaggingClassifier(ClassifierMixin, BaseEstimator):
    """Bagging ensemble of decision trees combined by majority vote.

    Parameters
    ----------
    max_depth : int or None, default=None
        Maximum depth passed to every ``DecisionTreeClassifier``.
    n_estimators : int, default=50
        Number of trees to attempt to train.
    max_samples : float, default=1.0
        Fraction of the training rows drawn for each tree (capped at the
        number of rows, and never fewer than one row).
    bootstrap : bool, default=True
        If True, rows are drawn with replacement. If False, rows are drawn
        without replacement; with ``max_samples=1.0`` every tree then sees
        the full training set unchanged.
    random_state : int or None, default=None
        Seed for the row sampling and for every tree.

    Attributes
    ----------
    classes_ : ndarray
        Original class labels, in the order used by the label encoder.
    label_encoder : LabelEncoder
        Encoder mapping original labels to 0..K-1.
    models : list of DecisionTreeClassifier
        The fitted trees (trained on encoded labels).
    """

    _parameter_constraints = {
        "max_depth": [int, None],
        "n_estimators": [int],
        "max_samples": [float],
        "bootstrap": [bool],
        "random_state": [int, None],
    }

    def __init__(self, max_depth=None, n_estimators=50, max_samples=1.0, bootstrap=True, random_state=None):
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.bootstrap = bootstrap
        self.random_state = random_state

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y):
        """Train the ensemble on ``X`` / ``y`` and return ``self``.

        Raises
        ------
        ValueError
            If no tree could be trained (for example ``n_estimators < 1`` or
            every drawn sample contained a single class).
        """
        X, y = self._validate_data(X, y)
        check_classification_targets(y)

        # Trees are trained on integer-encoded labels; `predict` maps back.
        self.label_encoder = LabelEncoder()
        y = self.label_encoder.fit_transform(y)
        self.classes_ = self.label_encoder.classes_
        self.models = []

        rng = np.random.default_rng(self.random_state)
        n_total = len(X)
        # Clamp to [1, n_total] so a tiny `max_samples` cannot yield an empty sample.
        n_samples = max(1, min(int(n_total * self.max_samples), n_total))

        for _ in range(self.n_estimators):
            if self.bootstrap:
                indices = rng.choice(n_total, size=n_samples, replace=True)
                X_sample, y_sample = X[indices], y[indices]
            elif n_samples < n_total:
                # Without bootstrap, still honour `max_samples` by drawing a
                # subsample without replacement.
                indices = rng.choice(n_total, size=n_samples, replace=False)
                X_sample, y_sample = X[indices], y[indices]
            else:
                X_sample, y_sample = X, y

            # A sample containing a single class cannot contribute a useful vote.
            if len(np.unique(y_sample)) < 2:
                continue

            model = DecisionTreeClassifier(max_depth=self.max_depth, random_state=self.random_state)
            model.fit(X_sample, y_sample)
            self.models.append(model)

        if not self.models:
            raise ValueError(
                "BaggingClassifier could not train any tree: check n_estimators "
                "and that the training data contains at least two classes."
            )

        self.is_fitted_ = True
        return self

    def predict(self, X):
        """Return, for each row of ``X``, the original class label with the most tree votes.

        Ties are resolved in favour of the class that comes first in ``classes_``.
        """
        check_is_fitted(self)
        X = self._validate_data(X, accept_sparse=True, reset=False)

        # predictions[t, i] is the encoded label tree t assigns to sample i.
        predictions = np.array([model.predict(X) for model in self.models])
        # votes[k, i] is the number of trees voting for encoded class k on sample i.
        votes = np.stack(
            [(predictions == k).sum(axis=0) for k in range(len(self.classes_))]
        )
        # Map the winning encoded class back to the caller's label space.
        return self.classes_[votes.argmax(axis=0)]


In [8]:
print("Hyperparameter Tuning for Bagging:")

# Search space: ensemble size crossed with the per-tree sample fraction,
# with the seed pinned so every candidate is reproducible.
param_grid_bag = {
    'n_estimators': [50, 100, 150],
    'max_samples': [0.5, 0.7, 1.0],
    'random_state': [42],
}

# Exhaustive search scored by F1; any failing fit aborts the search
# immediately instead of being recorded as a bad score.
grid_search_bag = GridSearchCV(
    BaggingClassifier(),
    param_grid_bag,
    scoring='f1',
    verbose=2,
    error_score='raise',
)

# Run the search on the training split and report the winning combination.
grid_search_bag.fit(X_train, y_train)
print("Best Parameters for Bagging:", grid_search_bag.best_params_)


Hyperparameter Tuning for Bagging:
Fitting 5 folds for each of 9 candidates, totalling 45 fits




[CV] END ..max_samples=0.5, n_estimators=50, random_state=42; total time=   7.6s




[CV] END ..max_samples=0.5, n_estimators=50, random_state=42; total time=   7.2s




[CV] END ..max_samples=0.5, n_estimators=50, random_state=42; total time=   7.0s




[CV] END ..max_samples=0.5, n_estimators=50, random_state=42; total time=   6.9s




[CV] END ..max_samples=0.5, n_estimators=50, random_state=42; total time=   7.0s




[CV] END .max_samples=0.5, n_estimators=100, random_state=42; total time=  13.9s




[CV] END .max_samples=0.5, n_estimators=100, random_state=42; total time=  14.3s




[CV] END .max_samples=0.5, n_estimators=100, random_state=42; total time=  13.8s




[CV] END .max_samples=0.5, n_estimators=100, random_state=42; total time=  13.9s




[CV] END .max_samples=0.5, n_estimators=100, random_state=42; total time=  14.1s




[CV] END .max_samples=0.5, n_estimators=150, random_state=42; total time=  21.4s




[CV] END .max_samples=0.5, n_estimators=150, random_state=42; total time=  20.8s




[CV] END .max_samples=0.5, n_estimators=150, random_state=42; total time=  20.8s




[CV] END .max_samples=0.5, n_estimators=150, random_state=42; total time=  21.3s




[CV] END .max_samples=0.5, n_estimators=150, random_state=42; total time=  20.8s




[CV] END ..max_samples=0.7, n_estimators=50, random_state=42; total time=   9.5s




[CV] END ..max_samples=0.7, n_estimators=50, random_state=42; total time=   9.5s




[CV] END ..max_samples=0.7, n_estimators=50, random_state=42; total time=   9.5s




[CV] END ..max_samples=0.7, n_estimators=50, random_state=42; total time=   9.5s




[CV] END ..max_samples=0.7, n_estimators=50, random_state=42; total time=   9.4s




[CV] END .max_samples=0.7, n_estimators=100, random_state=42; total time=  19.3s




[CV] END .max_samples=0.7, n_estimators=100, random_state=42; total time=  19.1s




[CV] END .max_samples=0.7, n_estimators=100, random_state=42; total time=  18.8s




[CV] END .max_samples=0.7, n_estimators=100, random_state=42; total time=  18.9s




[CV] END .max_samples=0.7, n_estimators=100, random_state=42; total time=  18.9s




[CV] END .max_samples=0.7, n_estimators=150, random_state=42; total time=  28.4s




[CV] END .max_samples=0.7, n_estimators=150, random_state=42; total time=  28.4s




[CV] END .max_samples=0.7, n_estimators=150, random_state=42; total time=  29.2s




[CV] END .max_samples=0.7, n_estimators=150, random_state=42; total time=  28.3s




[CV] END .max_samples=0.7, n_estimators=150, random_state=42; total time=  28.3s




[CV] END ..max_samples=1.0, n_estimators=50, random_state=42; total time=  13.2s




[CV] END ..max_samples=1.0, n_estimators=50, random_state=42; total time=  13.3s




[CV] END ..max_samples=1.0, n_estimators=50, random_state=42; total time=  13.2s




[CV] END ..max_samples=1.0, n_estimators=50, random_state=42; total time=  13.2s




[CV] END ..max_samples=1.0, n_estimators=50, random_state=42; total time=  13.3s




[CV] END .max_samples=1.0, n_estimators=100, random_state=42; total time=  28.1s




[CV] END .max_samples=1.0, n_estimators=100, random_state=42; total time=  26.6s




[CV] END .max_samples=1.0, n_estimators=100, random_state=42; total time=  26.4s




[CV] END .max_samples=1.0, n_estimators=100, random_state=42; total time=  26.3s




[CV] END .max_samples=1.0, n_estimators=100, random_state=42; total time=  26.4s




[CV] END .max_samples=1.0, n_estimators=150, random_state=42; total time=  39.9s




[CV] END .max_samples=1.0, n_estimators=150, random_state=42; total time=  39.8s




[CV] END .max_samples=1.0, n_estimators=150, random_state=42; total time=  39.7s




[CV] END .max_samples=1.0, n_estimators=150, random_state=42; total time=  40.2s




[CV] END .max_samples=1.0, n_estimators=150, random_state=42; total time=  40.2s




Best Parameters for Bagging: {'max_samples': 0.5, 'n_estimators': 150, 'random_state': 42}


### Validation

In [9]:
# Score the tuned bagging ensemble on the validation split.
y_pred_bag = grid_search_bag.best_estimator_.predict(X_valid)

# Accuracy, F1 and the confusion matrix are all computed from the same
# (truth, prediction) pair.
accuracy, f1, conf_matrix = (
    metric(y_valid, y_pred_bag)
    for metric in (accuracy_score, f1_score, confusion_matrix)
)

# Report the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)




Accuracy: 0.661828991082357
F1 Score: 0.6026707755521314
Confusion Matrix:
 [[9273 3776]
 [3960 5867]]


### Class definition: AdaBoost classifier

In [10]:
class AdaBoostClassifier(ClassifierMixin, BaseEstimator):
    """Discrete AdaBoost (SAMME) over decision trees.

    Parameters
    ----------
    max_depth : int or None, default=1
        Maximum depth of every weak learner (1 gives decision stumps).
    n_estimators : int, default=50
        Maximum number of boosting rounds.
    learning_rate : float or int, default=1.0
        Multiplier applied to every learner weight.
    random_state : int or None, default=None
        Seed passed to every tree.

    Attributes
    ----------
    classes_ : ndarray
        Original class labels, in the order used by the label encoder.
    models_ : list of DecisionTreeClassifier
        The weak learners that were kept (trained on encoded labels).
    alphas_ : list of float
        Voting weight of each kept learner.
    sample_weights_ : ndarray
        Training-sample weights after the last completed round.
    """

    _parameter_constraints = {
        "max_depth": [int, None],
        # `range(n_estimators)` needs an integer, so None is not accepted.
        "n_estimators": [int],
        "learning_rate": [float, int],
        "random_state": [int, None],
    }

    def __init__(self, max_depth=1, n_estimators=50, learning_rate=1.0, random_state=None):
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.random_state = random_state

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y):
        """Run the boosting rounds on ``X`` / ``y`` and return ``self``.

        Boosting stops early when a learner is no better than random guessing
        (it is discarded) or when a learner is (almost) perfect (it is kept).

        Raises
        ------
        ValueError
            If ``y`` contains a single class.
        """
        # Validate input data and ensure it's a classification task
        X, y = self._validate_data(X, y)
        check_classification_targets(y)

        # Weak learners are trained on integer-encoded labels 0..K-1.
        self.label_encoder = LabelEncoder()
        y = self.label_encoder.fit_transform(y)
        self.classes_ = self.label_encoder.classes_
        n_classes = len(self.classes_)

        # Ensure there's more than one class
        if n_classes == 1:
            raise ValueError("Classifier can't predict when only one class is present.")

        self.models_ = []
        self.alphas_ = []
        n_samples = len(y)

        # Initialize sample weights equally
        self.sample_weights_ = np.ones(n_samples) / n_samples

        # SAMME: a learner is useful while its weighted error is below that of
        # random guessing, 1 - 1/K (0.5 for two classes), and its weight gets
        # an extra log(K - 1) term (0 for two classes).
        min_error = 1e-5
        max_error = 1.0 - 1.0 / n_classes
        class_bonus = np.log(n_classes - 1.0)

        for _ in range(self.n_estimators):
            # Train weak learner (decision tree) on the current sample weights
            model = DecisionTreeClassifier(max_depth=self.max_depth, random_state=self.random_state)
            model.fit(X, y, sample_weight=self.sample_weights_)

            # Weighted error: total weight of the misclassified samples
            incorrect = model.predict(X) != y
            weighted_error = np.dot(self.sample_weights_, incorrect)

            # A learner no better than chance cannot help; stop without keeping it.
            if weighted_error >= max_error:
                break

            # Clip the error from below so a (near-)perfect learner gets a
            # large but finite weight instead of being thrown away.
            is_perfect = weighted_error <= min_error
            clipped_error = max(weighted_error, min_error)
            alpha = self.learning_rate * (
                np.log((1 - clipped_error) / clipped_error) + class_bonus
            )

            # Store the model and its weight
            self.models_.append(model)
            self.alphas_.append(alpha)

            # Nothing left to correct after a perfect learner.
            if is_perfect:
                break

            # Increase the weight of misclassified samples, then renormalize
            self.sample_weights_ *= np.exp(alpha * incorrect)
            self.sample_weights_ /= np.sum(self.sample_weights_)

        self.is_fitted_ = True
        return self

    def predict(self, X):
        """Return, for each row of ``X``, the original class label with the largest weighted vote.

        If no learner was kept during ``fit``, all scores are zero and the
        first entry of ``classes_`` is returned for every row.
        """
        check_is_fitted(self)
        X = check_array(X)

        # scores[i, k] accumulates the weights of the learners voting class k for sample i.
        scores = np.zeros((X.shape[0], len(self.classes_)))
        rows = np.arange(X.shape[0])
        for model, alpha in zip(self.models_, self.alphas_):
            # Learners predict encoded labels, which are column indices of
            # `scores`; they must not be compared with the original labels.
            encoded = np.asarray(model.predict(X), dtype=np.intp)
            scores[rows, encoded] += alpha

        # Return the class with the highest score
        return self.classes_[np.argmax(scores, axis=1)]


### Hyperparameter tuning

In [11]:
print("Hyperparameter Tuning for AdaBoost:")

# Search space: number of boosting rounds crossed with the learning rate,
# with the seed pinned so every candidate is reproducible.
param_grid_boost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1, 10],
    'random_state': [42],
}

# Exhaustive search scored by F1; any failing fit aborts the search
# immediately instead of being recorded as a bad score.
grid_search_boost = GridSearchCV(
    estimator=AdaBoostClassifier(),
    param_grid=param_grid_boost,
    scoring='f1',
    verbose=2,
    error_score='raise',
)

# Run the search on the training split and report the winning combination.
grid_search_boost.fit(X_train, y_train)
print("Best Parameters for AdaBoost:", grid_search_boost.best_params_)


Hyperparameter Tuning for AdaBoost:
Fitting 5 folds for each of 12 candidates, totalling 60 fits




[CV] END learning_rate=0.01, n_estimators=50, random_state=42; total time=   1.6s




[CV] END learning_rate=0.01, n_estimators=50, random_state=42; total time=   1.4s




[CV] END learning_rate=0.01, n_estimators=50, random_state=42; total time=   1.3s




[CV] END learning_rate=0.01, n_estimators=50, random_state=42; total time=   1.3s




[CV] END learning_rate=0.01, n_estimators=50, random_state=42; total time=   1.3s




[CV] END learning_rate=0.01, n_estimators=100, random_state=42; total time=   2.8s




[CV] END learning_rate=0.01, n_estimators=100, random_state=42; total time=   2.7s




[CV] END learning_rate=0.01, n_estimators=100, random_state=42; total time=   2.7s




[CV] END learning_rate=0.01, n_estimators=100, random_state=42; total time=   3.0s




[CV] END learning_rate=0.01, n_estimators=100, random_state=42; total time=   2.9s




[CV] END learning_rate=0.01, n_estimators=200, random_state=42; total time=   5.4s




[CV] END learning_rate=0.01, n_estimators=200, random_state=42; total time=   5.8s




[CV] END learning_rate=0.01, n_estimators=200, random_state=42; total time=   6.1s




[CV] END learning_rate=0.01, n_estimators=200, random_state=42; total time=   5.3s




[CV] END learning_rate=0.01, n_estimators=200, random_state=42; total time=   5.3s




[CV] END learning_rate=0.1, n_estimators=50, random_state=42; total time=   1.3s




[CV] END learning_rate=0.1, n_estimators=50, random_state=42; total time=   1.4s




[CV] END learning_rate=0.1, n_estimators=50, random_state=42; total time=   1.4s




[CV] END learning_rate=0.1, n_estimators=50, random_state=42; total time=   1.3s




[CV] END learning_rate=0.1, n_estimators=50, random_state=42; total time=   1.3s




[CV] END learning_rate=0.1, n_estimators=100, random_state=42; total time=   2.6s




[CV] END learning_rate=0.1, n_estimators=100, random_state=42; total time=   2.6s




[CV] END learning_rate=0.1, n_estimators=100, random_state=42; total time=   2.6s




[CV] END learning_rate=0.1, n_estimators=100, random_state=42; total time=   2.6s




[CV] END learning_rate=0.1, n_estimators=100, random_state=42; total time=   2.7s




[CV] END learning_rate=0.1, n_estimators=200, random_state=42; total time=   5.2s




[CV] END learning_rate=0.1, n_estimators=200, random_state=42; total time=   5.3s




[CV] END learning_rate=0.1, n_estimators=200, random_state=42; total time=   5.3s




[CV] END learning_rate=0.1, n_estimators=200, random_state=42; total time=   5.2s




[CV] END learning_rate=0.1, n_estimators=200, random_state=42; total time=   5.7s




[CV] END ..learning_rate=1, n_estimators=50, random_state=42; total time=   1.3s




[CV] END ..learning_rate=1, n_estimators=50, random_state=42; total time=   1.2s




[CV] END ..learning_rate=1, n_estimators=50, random_state=42; total time=   1.4s




[CV] END ..learning_rate=1, n_estimators=50, random_state=42; total time=   1.3s




[CV] END ..learning_rate=1, n_estimators=50, random_state=42; total time=   1.3s




[CV] END .learning_rate=1, n_estimators=100, random_state=42; total time=   2.7s




[CV] END .learning_rate=1, n_estimators=100, random_state=42; total time=   2.8s




[CV] END .learning_rate=1, n_estimators=100, random_state=42; total time=   2.7s




[CV] END .learning_rate=1, n_estimators=100, random_state=42; total time=   2.6s




[CV] END .learning_rate=1, n_estimators=100, random_state=42; total time=   2.7s




[CV] END .learning_rate=1, n_estimators=200, random_state=42; total time=   5.5s




[CV] END .learning_rate=1, n_estimators=200, random_state=42; total time=   6.0s




[CV] END .learning_rate=1, n_estimators=200, random_state=42; total time=   5.6s




[CV] END .learning_rate=1, n_estimators=200, random_state=42; total time=   5.4s




[CV] END .learning_rate=1, n_estimators=200, random_state=42; total time=   5.7s
[CV] END .learning_rate=10, n_estimators=50, random_state=42; total time=   0.0s
[CV] END .learning_rate=10, n_estimators=50, random_state=42; total time=   0.0s




[CV] END .learning_rate=10, n_estimators=50, random_state=42; total time=   0.0s
[CV] END .learning_rate=10, n_estimators=50, random_state=42; total time=   0.0s
[CV] END .learning_rate=10, n_estimators=50, random_state=42; total time=   0.0s




[CV] END learning_rate=10, n_estimators=100, random_state=42; total time=   0.0s
[CV] END learning_rate=10, n_estimators=100, random_state=42; total time=   0.0s




[CV] END learning_rate=10, n_estimators=100, random_state=42; total time=   0.0s
[CV] END learning_rate=10, n_estimators=100, random_state=42; total time=   0.0s




[CV] END learning_rate=10, n_estimators=100, random_state=42; total time=   0.0s
[CV] END learning_rate=10, n_estimators=200, random_state=42; total time=   0.0s




[CV] END learning_rate=10, n_estimators=200, random_state=42; total time=   0.0s
[CV] END learning_rate=10, n_estimators=200, random_state=42; total time=   0.0s




[CV] END learning_rate=10, n_estimators=200, random_state=42; total time=   0.0s
[CV] END learning_rate=10, n_estimators=200, random_state=42; total time=   0.0s




Best Parameters for AdaBoost: {'learning_rate': 1, 'n_estimators': 200, 'random_state': 42}


### Validation

In [12]:
# Score the tuned AdaBoost model on the validation split.
y_pred_boost = grid_search_boost.best_estimator_.predict(X_valid)

# Each metric is computed from the same (truth, prediction) pair and printed
# under its own label.
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("F1 score:", f1_score),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(label, metric(y_valid, y_pred_boost))


Accuracy: 0.6708777758349361
F1 score: 0.5964949890133447
Confusion Matrix:
 [[9782 3267]
 [4262 5565]]


### Class definition: Random forest classifier

In [14]:
class RandomForestClassifier(ClassifierMixin, BaseEstimator):
    """Random forest: bootstrap-sampled decision trees with per-split feature subsampling.

    Parameters
    ----------
    max_depth : int or None, default=None
        Maximum depth passed to every ``DecisionTreeClassifier``.
    n_estimators : int, default=50
        Number of trees in the forest.
    max_features : int, float, str or None, default=None
        Forwarded unchanged to ``DecisionTreeClassifier(max_features=...)``.
    bootstrap : bool, default=True
        If True each tree is trained on a bootstrap sample of the training
        rows; otherwise every tree sees the full training set.
    random_state : int or None, default=None
        Seed controlling both the bootstrap draws and the per-tree seeds.

    Attributes
    ----------
    classes_ : ndarray
        Original class labels, in the order used by the label encoder.
    label_encoder : LabelEncoder
        Encoder mapping original labels to 0..K-1.
    models_ : list of DecisionTreeClassifier
        The fitted trees (trained on encoded labels).
    """

    _parameter_constraints = {
        "max_depth": [int, None],
        "n_estimators": [int],
        # Strings are allowed because the value is handed to the tree as-is.
        "max_features": [int, float, str, None],
        "bootstrap": [bool],
        "random_state": [int, None],
    }

    def __init__(self, max_depth=None, n_estimators=50, max_features=None, bootstrap=True, random_state=None):
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.random_state = random_state

    def _sample_data(self, X, y, rng=None):
        """Return a bootstrap sample of ``(X, y)``, or the full data when ``bootstrap`` is False.

        ``rng`` is an optional ``numpy.random.Generator``; when omitted the
        global NumPy random state is used.
        """
        n_samples = X.shape[0]
        if self.bootstrap:
            draw = np.random.choice if rng is None else rng.choice
            indices = draw(n_samples, size=n_samples, replace=True)
            return X[indices], y[indices]
        return X, y

    def _train_tree(self, X, y, seed=None):
        """Fit and return one decision tree; ``seed`` overrides ``random_state`` when given."""
        tree_seed = self.random_state if seed is None else seed
        model = DecisionTreeClassifier(
            max_depth=self.max_depth,
            max_features=self.max_features,
            random_state=tree_seed,
        )
        model.fit(X, y)
        return model

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y):
        """Train the forest on ``X`` / ``y`` and return ``self``.

        Raises
        ------
        ValueError
            If ``n_estimators`` is smaller than 1.
        """
        # Validate and preprocess data
        X, y = self._validate_data(X, y)
        check_classification_targets(y)

        if self.n_estimators < 1:
            raise ValueError("n_estimators must be at least 1.")

        # Trees are trained on integer-encoded labels; `predict` maps back.
        self.label_encoder = LabelEncoder()
        y_encoded = self.label_encoder.fit_transform(y)
        self.classes_ = self.label_encoder.classes_

        self.models_ = []

        # One generator drives both the bootstrap draws and the per-tree
        # seeds, so a fixed `random_state` reproduces the whole forest while
        # each tree still gets its own feature-subsampling stream.
        rng = np.random.default_rng(self.random_state)
        max_seed = np.iinfo(np.int32).max

        for _ in range(self.n_estimators):
            X_sample, y_sample = self._sample_data(X, y_encoded, rng)
            tree_seed = int(rng.integers(max_seed))
            tree = self._train_tree(X_sample, y_sample, tree_seed)
            self.models_.append(tree)

        self.is_fitted_ = True
        return self

    def predict(self, X):
        """Return, for each row of ``X``, the original class label with the most tree votes.

        Ties are resolved in favour of the class that comes first in ``classes_``.
        """
        check_is_fitted(self)
        X = check_array(X)

        # predictions[i, t] is the encoded label tree t assigns to sample i.
        predictions = np.column_stack([model.predict(X) for model in self.models_])
        # votes[i, k] is the number of trees voting for encoded class k on
        # sample i; the vote is taken across trees, giving one label per sample.
        votes = np.stack(
            [(predictions == k).sum(axis=1) for k in range(len(self.classes_))],
            axis=1,
        )
        # Map the winning encoded class back to the caller's label space.
        return self.classes_[votes.argmax(axis=1)]


### Hyperparameter tuning

In [15]:
from scipy.stats import randint  # Importing randint for random integer selection
# scikit-learn's forest is imported under an alias so that it does not shadow
# the RandomForestClassifier class defined earlier in this notebook.
from sklearn.ensemble import RandomForestClassifier as SklearnRandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV  # Importing RandomizedSearchCV

print("Hyperparameter Tuning for Random Forest:")

# Define the parameter distribution for RandomizedSearchCV.
# This search tunes scikit-learn's implementation: the space includes
# `min_samples_split` and `min_samples_leaf`, which the notebook's own
# RandomForestClassifier does not accept.
param_dist_rf = {
    'n_estimators': [50, 100, 200, 300],  # Number of trees in the forest
    'max_features': ['sqrt', 'log2', None, 3, 7],  # Features to consider for splits
    'max_depth': [None, 10, 20, 30, 40],  # Maximum depth of trees
    'min_samples_split': randint(2, 20),  # The minimum number of samples required to split a node
    'min_samples_leaf': randint(1, 20),  # The minimum number of samples required to be at a leaf node
    'bootstrap': [True, False],  # Whether bootstrap samples are used when building trees
    'random_state': [42]  # Ensure reproducibility
}

# Set up RandomizedSearchCV
random_search_rf = RandomizedSearchCV(
    SklearnRandomForestClassifier(),
    param_distributions=param_dist_rf,
    n_iter=100,  # Number of random combinations to try
    scoring='f1',
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use all available cores for parallel processing
)

# Fit the random search
random_search_rf.fit(X_train, y_train)

# Print the best parameters
print("Best Parameters for Random Forest:", random_search_rf.best_params_)


Hyperparameter Tuning for Random Forest:
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters for Random Forest: {'bootstrap': True, 'max_depth': 10, 'max_features': 7, 'min_samples_leaf': 12, 'min_samples_split': 7, 'n_estimators': 300, 'random_state': 42}


### Validation

In [16]:
# Score the validation split with the estimator selected by the random search
y_pred_rf = random_search_rf.best_estimator_.predict(X_valid)

# Report each validation metric under its label
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("F1 score:", f1_score),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(metric_label, metric_fn(y_valid, y_pred_rf))


Accuracy: 0.6805385556915544
F1 score: 0.64
Confusion Matrix:
 [[9072 3977]
 [3331 6496]]


# Final model
The three models performed similarly; we choose AdaBoost since, technically, it has the highest F1 score.

### Final Testing

In [17]:
print("Finalizing Model:")
# The AdaBoost grid search supplies the final estimator
best_model = grid_search_boost.best_estimator_

# Generate predictions for the held-out test split
y_pred_final = best_model.predict(X_test)

# Report each test metric under its label
for metric_label, metric_fn in (
    ("Final Model Accuracy:", accuracy_score),
    ("Final Model F1 score:", f1_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
):
    print(metric_label, metric_fn(y_test, y_pred_final))


Finalizing Model:
Final Model Accuracy: 0.6704987542072824
Final Model F1 score: 0.5961208744106301
Confusion Matrix:
 [[9776 3274]
 [4264 5563]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.70      0.75      0.72     13050
         1.0       0.63      0.57      0.60      9827

    accuracy                           0.67     22877
   macro avg       0.66      0.66      0.66     22877
weighted avg       0.67      0.67      0.67     22877




### Cell 2: Markdown - Data Processing Introduction
- **Purpose**: Introduces the data processing section.
- **Content**: A markdown cell stating, "Data is processed as discussed in the other two notebooks."
- **Functionality**: Provides context, indicating that data processing follows methods described in related notebooks (e.g., `explanatory.ipynb`).

### Cell 3: Code - Data Loading and Feature Selection
- **Purpose**: Loads the dataset, selects a random subset of features, and saves the result.
- **Content**:
  - Defines student IDs `[8110, 8000, 7497]` and computes a `random_seed` (sum = 23607) for reproducibility.
  - Sets the random seed using `random.seed(random_seed)`.
  - Defines file paths: `input_csv = "data.csv"` and `output_csv = "my_data.csv"`.
  - Loads `data.csv` into a DataFrame (`df`).
  - Specifies the target column (`smoking`) and excludes it from feature candidates.
  - Randomly selects 10 features using `random.sample` and prints them.
  - Appends the target column to the selected features.
  - Creates a new DataFrame (`selected_df`) with the selected features and saves it to `my_data.csv` without the index.
- **Functionality**: Prepares a subset of the dataset with 10 randomly selected features plus the target, ensuring reproducibility, and saves it for further processing.

### Cell 4: Code - Load Processed Dataset
- **Purpose**: Loads the processed dataset and removes unwanted columns.
- **Content**:
  - Loads `my_data.csv` into a DataFrame (`df`).
  - Drops the `Unnamed: 0` column if present (common artifact from CSV operations).
- **Functionality**: Ensures the processed dataset is loaded and cleaned of unnecessary index columns.

### Cell 5: Code - Outlier Removal and Normalization Functions (Initial Definitions)
- **Purpose**: Defines functions for outlier removal and three normalization techniques.
- **Content**:
  - Defines `remove_outliers(df, feature)`:
    - Skips binary features (two unique values).
    - Calculates 10th and 90th percentiles as bounds.
    - Removes rows outside these bounds, resets the index, and prints the number of outliers removed.
  - Defines `mean_normalize_all(df)`:
    - Applies mean normalization to non-binary, non-target features: `(x - mean) / std`.
    - Prints max and min values post-normalization.
  - Defines `minmax_normalize_all(df)`:
    - Applies min-max normalization to non-binary, non-target features: `(x - min) / (max - min)`.
    - Prints max and min values.
  - Defines `max_abs_normalize_all(df)`:
    - Applies max-abs normalization to non-binary, non-target features: `x / max(abs(x))`.
    - Prints max and min values.
- **Functionality**: Provides modular functions for data cleaning (outlier removal) and feature scaling (normalization), with detailed output for monitoring.

### Cell 6: Code - Outlier Removal and Normalization (Updated Definitions and Application)
- **Purpose**: Redefines outlier removal using IQR, redefines normalization functions, and applies them to the dataset.
- **Content**:
  - Redefines `remove_outliers(df, feature)`:
    - Uses IQR method: calculates Q1, Q3, and bounds as `Q1 - 1.5*IQR` and `Q3 + 1.5*IQR`.
    - Filters rows within bounds and returns the DataFrame.
  - Redefines `mean_normalize_all(df)`, `minmax_normalize_all(df)`, and `max_abs_normalize_all(df)`:
    - Applies normalization to all numerical columns (not skipping binary or target explicitly).
  - Prints initial DataFrame shape.
  - Applies `remove_outliers` to `ALT` and `LDL` if present, printing results.
  - Drops `hearing(right)` and `Cholesterol` if present.
  - Creates copies of the DataFrame for each normalization method (`df_mean_norm`, `df_minmax_norm`, `df_maxabs_norm`).
  - Applies each normalization method and prints results.
  - Selects `df_minmax_norm` as the final DataFrame (`df`).
  - Prints final DataFrame shape.
- **Functionality**: Cleans the dataset by removing outliers using IQR, drops irrelevant columns, and applies three normalization techniques, selecting min-max normalization for further use.

### Cell 7: Markdown - Dataset Splitting Introduction
- **Purpose**: Introduces the dataset splitting section.
- **Content**: A markdown cell specifying the split ratios: "70% training, 15% validation, 15% test."
- **Functionality**: Sets expectations for the data partitioning process.

### Cell 8: Code - Dataset Splitting
- **Purpose**: Splits the dataset into training, validation, and test sets.
- **Content**:
  - Separates features (`X`) and target (`y`) from `df`.
  - Splits data into 70% training and 30% temporary sets using `train_test_split` with `random_state=42` and `stratify=y`.
  - Splits the temporary set into 15% validation and 15% test sets, maintaining stratification.
  - Resets indices for all subsets.
- **Functionality**: Partitions the dataset into training, validation, and test sets, ensuring consistent class distribution via stratification.

### Cell 9: Markdown - Bagging Classifier Definition Header
- **Purpose**: Labels the section for the Bagging Classifier implementation.
- **Content**: A markdown cell with the header "Class definition."
- **Functionality**: Organizes the notebook by marking the Bagging model section.

### Cell 10: Code - Bagging Classifier Implementation
- **Purpose**: Defines a custom `BaggingClassifier` class.
- **Content**:
  - Defines `BaggingClassifier` inheriting from `ClassifierMixin` and `BaseEstimator`.
  - Specifies parameters: `max_depth`, `n_estimators`, `max_samples`, `bootstrap`, `random_state`.
  - Implements `fit`:
    - Validates input and encodes labels.
    - Trains multiple `DecisionTreeClassifier` models on bootstrap samples (if enabled).
    - Skips models if the sample has fewer than two classes.
  - Implements `predict`:
    - Collects predictions from all models and returns majority votes.
- **Functionality**: Provides a custom bagging ensemble classifier using decision trees, supporting bootstrap sampling and majority voting.

### Cell 11: Code - Bagging Hyperparameter Tuning
- **Purpose**: Performs hyperparameter tuning for the Bagging Classifier.
- **Content**:
  - Defines a parameter grid: `n_estimators=[50, 100, 150]`, `max_samples=[0.5, 0.7, 1.0]`, `random_state=[42]`.
  - Sets up `GridSearchCV` with `BaggingClassifier`, using F1 score as the metric.
  - Fits the grid search on training data.
  - Prints the best parameters.
- **Functionality**: Optimizes the Bagging Classifier’s hyperparameters using grid search, focusing on F1 score.

### Cell 12: Markdown - Bagging Validation Header
- **Purpose**: Labels the validation section for the Bagging Classifier.
- **Content**: A markdown cell with the header "Validation."
- **Functionality**: Organizes the notebook by marking the Bagging validation section.

### Cell 13: Code - Bagging Validation
- **Purpose**: Evaluates the Bagging Classifier on the validation set.
- **Content**:
  - Predicts on the validation set using the best model from grid search.
  - Computes accuracy, F1 score, and confusion matrix.
  - Prints the results.
- **Functionality**: Assesses the performance of the tuned Bagging Classifier on the validation set using key metrics.

### Cell 14: Markdown - AdaBoost Classifier Definition Header
- **Purpose**: Labels the section for the AdaBoost Classifier implementation.
- **Content**: A markdown cell with the header "Class definition."
- **Functionality**: Organizes the notebook by marking the AdaBoost model section.

### Cell 15: Code - AdaBoost Classifier Implementation
- **Purpose**: Defines a custom `AdaBoostClassifier` class.
- **Content**:
  - Defines `AdaBoostClassifier` with parameters: `max_depth`, `n_estimators`, `learning_rate`, `random_state`.
  - Implements `fit`:
    - Validates input and encodes labels.
    - Initializes equal sample weights.
    - Iteratively trains decision trees, updating weights based on errors and computing model weights (`alpha`).
    - Stops if weighted error is too high or too low.
  - Implements `predict`:
    - Combines weighted predictions from all models and returns the class with the highest score.
- **Functionality**: Provides a custom AdaBoost ensemble classifier, boosting weak decision trees by focusing on misclassified samples.

### Cell 16: Markdown - AdaBoost Hyperparameter Tuning Header
- **Purpose**: Labels the section for AdaBoost hyperparameter tuning.
- **Content**: A markdown cell with the header "Hyperparameter tunning."
- **Functionality**: Organizes the notebook by marking the AdaBoost tuning section.

### Cell 17: Code - AdaBoost Hyperparameter Tuning
- **Purpose**: Performs hyperparameter tuning for the AdaBoost Classifier.
- **Content**:
  - Defines a parameter grid: `n_estimators=[50, 100, 200]`, `learning_rate=[0.01, 0.1, 1, 10]`, `random_state=[42]`.
  - Sets up `GridSearchCV` with `AdaBoostClassifier`, using F1 score.
  - Fits the grid search on training data.
  - Prints the best parameters.
- **Functionality**: Optimizes the AdaBoost Classifier’s hyperparameters using grid search, prioritizing F1 score.

### Cell 18: Markdown - AdaBoost Validation Header
- **Purpose**: Labels the validation section for the AdaBoost Classifier.
- **Content**: A markdown cell with the header "Validation."
- **Functionality**: Organizes the notebook by marking the AdaBoost validation section.

### Cell 19: Code - AdaBoost Validation
- **Purpose**: Evaluates the AdaBoost Classifier on the validation set.
- **Content**:
  - Predicts on the validation set using the best model from grid search.
  - Computes accuracy, F1 score, and confusion matrix.
  - Prints the results.
- **Functionality**: Assesses the performance of the tuned AdaBoost Classifier on the validation set.

### Cell 20: Markdown - Random Forest Classifier Definition Header
- **Purpose**: Labels the section for the Random Forest Classifier implementation.
- **Content**: A markdown cell with the header "Class definition."
- **Functionality**: Organizes the notebook by marking the Random Forest model section.

### Cell 21: Code - Random Forest Classifier Implementation
- **Purpose**: Defines a custom `RandomForestClassifier` class.
- **Content**:
  - Defines `RandomForestClassifier` with parameters: `max_depth`, `n_estimators`, `max_features`, `bootstrap`, `random_state`.
  - Implements helper methods: `_sample_data` for bootstrap sampling, `_train_tree` for training decision trees.
  - Implements `fit`:
    - Validates input and encodes labels.
    - Trains multiple decision trees on bootstrap samples with random feature subsets.
  - Implements `predict`:
    - Collects predictions from all trees and returns majority votes.
- **Functionality**: Provides a custom random forest classifier, combining bagging with random feature selection for robustness.

### Cell 22: Markdown - Random Forest Hyperparameter Tuning Header
- **Purpose**: Labels the section for Random Forest hyperparameter tuning.
- **Content**: A markdown cell with the header "Hyperparameter tunning."
- **Functionality**: Organizes the notebook by marking the Random Forest tuning section.

### Cell 23: Code - Random Forest Hyperparameter Tuning
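- **Purpose**: Performs hyperparameter tuning for the Random Forest Classifier. The search is run on scikit-learn's `RandomForestClassifier`, which this cell imports, rather than on the custom class from Cell 21.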
- **Content**:
  - Imports `randint`, `RandomForestClassifier` (from scikit-learn), and `RandomizedSearchCV`.
  - Defines a parameter distribution: `n_estimators=[50, 100, 200, 300]`, `max_features=['sqrt', 'log2', None, 3, 7]`, `max_depth=[None, 10, 20, 30, 40]`, `min_samples_split=randint(2, 20)`, `min_samples_leaf=randint(1, 20)`, `bootstrap=[True, False]`, `random_state=[42]`.
  - Sets up `RandomizedSearchCV` with 100 iterations, using F1 score and parallel processing.
  - Fits the random search on training data.
  - Prints the best parameters.
- **Functionality**: Optimizes the Random Forest Classifier’s hyperparameters using randomized search, exploring a wide parameter space efficiently.

### Cell 24: Markdown - Random Forest Validation Header
- **Purpose**: Labels the validation section for the Random Forest Classifier.
- **Content**: A markdown cell with the header "Validation."
- **Functionality**: Organizes the notebook by marking the Random Forest validation section.

### Cell 25: Code - Random Forest Validation
- **Purpose**: Evaluates the Random Forest Classifier on the validation set.
- **Content**:
  - Predicts on the validation set using the best model from random search.
  - Computes accuracy, F1 score, and confusion matrix.
  - Prints the results.
- **Functionality**: Assesses the performance of the tuned Random Forest Classifier on the validation set.

### Cell 26: Markdown - Final Model Selection and Testing
- **Purpose**: Introduces the final model selection and testing section.
- **Content**: A markdown cell stating that all three models performed similarly, with AdaBoost selected due to the highest F1 score, followed by a header "Final Testing."
- **Functionality**: Justifies the choice of AdaBoost for final evaluation and organizes the testing section.

### Cell 27: Code - Final Model Testing
- **Purpose**: Evaluates the selected AdaBoost model on the test set.
- **Content**:
  - Uses the best AdaBoost model from `grid_search_boost`.
  - Predicts on the test set.
  - Computes accuracy, F1 score, confusion matrix, and classification report.
  - Prints the results.
- **Functionality**: Provides a comprehensive evaluation of the final model’s performance on unseen test data.

---

## Summary
The `training.ipynb` notebook systematically builds and evaluates machine learning models to predict smoking behavior:
1. **Data Preparation**:
   - Loads the dataset, selects 10 random features, and saves a subset.
   - Removes outliers using IQR for `ALT` and `LDL`, drops irrelevant columns, and applies min-max normalization.
   - Splits data into 70% training, 15% validation, and 15% test sets with stratification.
2. **Model Implementation**:
   - Implements custom `BaggingClassifier`, `AdaBoostClassifier`, and `RandomForestClassifier` classes, each using decision trees as base learners.
3. **Hyperparameter Tuning**:
   - Tunes Bagging and AdaBoost using `GridSearchCV` and Random Forest using `RandomizedSearchCV`, optimizing for F1 score.
4. **Validation**:
   - Evaluates each model on the validation set using accuracy, F1 score, and confusion matrix.
5. **Final Testing**:
   - Selects AdaBoost as the final model (highest F1 score) and evaluates it on the test set, reporting detailed metrics.

The notebook is well-organized, with modular functions and custom implementations that enhance understanding of ensemble methods. Reproducibility is ensured via a fixed random seed, and the use of F1 score as the primary metric aligns with handling potentially imbalanced classes.

---