In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import KNNImputer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectFromModel


In [2]:

def preprocess_data(features):
    # Convert column names to strings
    features.columns = features.columns.astype(str)
    
    # Replace '?' with NaN
    features = features.replace('?', np.nan)
    
    # Separate numerical and categorical columns
    num_cols = features.select_dtypes(include=[np.number]).columns
    cat_cols = features.select_dtypes(exclude=[np.number]).columns
    
    # Advanced imputation using KNN for numerical columns
    imputer = KNNImputer(n_neighbors=5)
    if not num_cols.empty:
        features[num_cols] = imputer.fit_transform(features[num_cols])
    
    # Handle categorical variables
    for col in cat_cols:
        features[col] = features[col].fillna(features[col].mode()[0])
        features[col] = pd.factorize(features[col])[0]
    
    # Scale features
    scaler = StandardScaler()
    features = pd.DataFrame(scaler.fit_transform(features), columns=features.columns)
    
    # Ensure column names are strings
    features.columns = features.columns.astype(str)
    
    return features


In [3]:

def engineer_features(data):
    # Convert all column names to strings first
    data.columns = data.columns.astype(str)
    
    # First, let's print the data types to debug
    print("Column dtypes before feature engineering:")
    print(data.dtypes)
    
    # Get only numeric columns
    num_cols = data.select_dtypes(include=['int64', 'float64']).columns
    print("\nNumeric columns selected:", num_cols.tolist())
    
    # Create a copy of the data to avoid modifying the original
    engineered_data = data.copy()
    
    # Create interaction features
    new_features = {}
    for i, col1 in enumerate(num_cols[:-1]):
        for j, col2 in enumerate(num_cols[i+1:]):
            col_name = f'interaction_{col1}_{col2}'
            new_features[col_name] = data[col1].astype(float) * data[col2].astype(float)
    
    # Add interaction features
    if new_features:
        interaction_df = pd.DataFrame(new_features)
        engineered_data = pd.concat([engineered_data, interaction_df], axis=1)
    
    # Add polynomial features
    poly_features = {}
    for col in num_cols:
        # Square term
        poly_features[f'{col}_squared'] = data[col].astype(float) ** 2
        # Cube term
        poly_features[f'{col}_cubed'] = data[col].astype(float) ** 3
    
    # Add polynomial features
    if poly_features:
        poly_df = pd.DataFrame(poly_features)
        engineered_data = pd.concat([engineered_data, poly_df], axis=1)
    
    # Ensure all column names are strings
    engineered_data.columns = engineered_data.columns.astype(str)
    
    print("\nShape after feature engineering:", engineered_data.shape)
    return engineered_data


In [4]:

class StackedEnsembleClassifier:
    def __init__(self):
        self.base_models = [
            GradientBoostingClassifier(n_estimators=100, learning_rate=0.1),
            RandomForestClassifier(n_estimators=100),
            AdaBoostClassifier(n_estimators=100),
            MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500)
        ]
        self.meta_model = GradientBoostingClassifier(n_estimators=50)
        
    def fit(self, X, y):
        # Train base models
        self.meta_features = np.zeros((X.shape[0], len(self.base_models)))
        
        for i, model in enumerate(self.base_models):
            model.fit(X, y)
            self.meta_features[:, i] = model.predict_proba(X)[:, 1]
            
        # Train meta model
        self.meta_model.fit(self.meta_features, y)
        
    def predict(self, X):
        meta_features = np.zeros((X.shape[0], len(self.base_models)))
        
        for i, model in enumerate(self.base_models):
            meta_features[:, i] = model.predict_proba(X)[:, 1]
            
        return self.meta_model.predict(meta_features)


In [7]:

def main():
    # Load data
    data = pd.read_csv('credit_approval/crx.data', header=None)
    
    # Convert column names to strings right after loading
    data.columns = data.columns.astype(str)
    
    # Print original target values
    print("Original target values:", data.iloc[:, -1].unique())
    
    # Convert target to binary first (before any preprocessing)
    data.iloc[:, -1] = (data.iloc[:, -1] == '+').astype(int)
    print("After initial conversion:", data.iloc[:, -1].unique())
    
    # Preprocess features only (excluding target)
    features = data.iloc[:, :-1].copy()
    target = data.iloc[:, -1].copy()
    
    # Preprocess features
    processed_features = preprocess_data(features)
    print("After preprocessing target:", target.unique())
    
    # Engineer features
    engineered_features = engineer_features(processed_features)
    print("After engineering target:", target.unique())
    
    # Ensure target remains binary
    target = target.astype(int)
    print("Final target values:", target.unique())
    
    # Split into features and target
    X = engineered_features
    y = target
    
    # Ensure X has string column names
    X.columns = X.columns.astype(str)
    
    # Feature selection
    selector = SelectFromModel(GradientBoostingClassifier())
    X_selected = selector.fit_transform(X, y)
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X_selected, y, test_size=0.2, random_state=42
    )
    
    # Verify target values in train and test sets
    print("Train set target values:", y_train.unique())
    print("Test set target values:", y_test.unique())
    
    # Train stacked ensemble
    model = StackedEnsembleClassifier()
    model.fit(X_train, y_train)
    
    # Evaluate
    y_pred = model.predict(X_test)
    print("\nPredicted target values:", np.unique(y_pred))
    print(f"\nAccuracy: {accuracy_score(y_test, y_pred)}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

if __name__ == "__main__":
    main()

Original target values: ['+' '-']
After initial conversion: [1 0]
After preprocessing target: [1 0]
Column dtypes before feature engineering:
0     float64
1     float64
2     float64
3     float64
4     float64
5     float64
6     float64
7     float64
8     float64
9     float64
10    float64
11    float64
12    float64
13    float64
14    float64
dtype: object

Numeric columns selected: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14']

Shape after feature engineering: (690, 150)
After engineering target: [1 0]
Final target values: [1 0]
Train set target values: [0 1]
Test set target values: [0 1]





Predicted target values: [0 1]

Accuracy: 0.8985507246376812

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.96      0.90        68
           1       0.95      0.84      0.89        70

    accuracy                           0.90       138
   macro avg       0.90      0.90      0.90       138
weighted avg       0.90      0.90      0.90       138

