In [3]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.linear_model
import sklearn.ensemble
import sklearn.metrics
import sklearn.neighbors
import sklearn.svm
import sklearn.tree
from sklearn.model_selection import GridSearchCV, train_test_split




In [4]:
def extract_x_and_y(df, y_column):
    """Separate a DataFrame into predictors and a target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both the feature columns and the target column.
    y_column : label
        Name of the column in ``df`` to use as the target.

    Returns
    -------
    tuple
        ``(X, y)`` where ``y`` is ``df[y_column]`` and ``X`` is ``df`` with
        that column dropped. ``df`` itself is not modified.
    """
    target = df[y_column]
    # drop() returns a new frame, so the caller's DataFrame keeps its target.
    features = df.drop(columns=[y_column])
    return features, target



In [5]:

def train_model(model_dict, X, y, metric='f1', k=5):
    """Grid-search one model specification with cross-validation.

    Parameters
    ----------
    model_dict : dict
        Model specification with keys ``'name'`` (display label),
        ``'class'`` (the estimator handed to ``GridSearchCV``) and
        ``'parameters'`` (the parameter grid to search).
    X, y
        Training features and targets, passed straight to ``fit``.
    metric : str, default='f1'
        Value forwarded as ``GridSearchCV``'s ``scoring`` argument.
    k : int, default=5
        Value forwarded as ``GridSearchCV``'s ``cv`` argument.

    Returns
    -------
    tuple
        ``(name, search, best_score)``: the model's label, the fitted
        ``GridSearchCV`` object, and its ``best_score_`` attribute.
    """
    label = model_dict['name']
    grid = model_dict['parameters']
    search = GridSearchCV(
        estimator=model_dict['class'],
        param_grid=grid,
        cv=k,
        scoring=metric,
    )
    search.fit(X, y)
    return label, search, search.best_score_



In [7]:
def train_all_models(models, X, y, metric='accuracy', k=5):
    """Train every model specification and rank the results best-first.

    Parameters
    ----------
    models : iterable of dict
        Model specifications; each one is passed unchanged to ``train_model``.
    X, y
        Training features and targets, forwarded to ``train_model``.
    metric : str, default='accuracy'
        Scoring name forwarded to ``train_model``. Note that this default
        differs from ``train_model``'s own default of ``'f1'``.
    k : int, default=5
        Cross-validation setting forwarded to ``train_model``.

    Returns
    -------
    list of tuple
        The ``(name, fitted_search, best_score)`` tuples produced by
        ``train_model``, sorted by ``best_score`` in descending order.
        Entries whose score is NaN are ranked after every real score, and
        equal scores keep their input order. An empty ``models`` yields ``[]``.
    """
    results = [train_model(model, X, y, metric, k) for model in models]

    def _rank_key(result):
        score = result[2]
        # NaN compares False against every number, so sorting on raw scores
        # can leave a NaN-scored model in first place. A grid search can
        # report NaN (e.g. when scoring fails and error_score keeps its NaN
        # default), so treat it as the worst possible score.
        return float('-inf') if np.isnan(score) else score

    return sorted(results, key=_rank_key, reverse=True)



In [8]:
def specify_models(random_state=None):
    """Return the candidate classifiers and their hyper-parameter grids.

    Parameters
    ----------
    random_state : int, RandomState instance or None, default=None
        Seed forwarded to every estimator below that accepts a
        ``random_state`` argument (all of them except k-nearest neighbours).
        Pass an int to make the model ranking repeatable; the default
        ``None`` leaves the estimators unseeded, as scikit-learn does.

    Returns
    -------
    list of dict
        One dict per candidate, with keys ``'name'`` (display label),
        ``'class'`` (an unfitted estimator instance, despite the key name)
        and ``'parameters'`` (the grid of values to search).
    """
    # Grids shared by several candidates.
    c_values = [0.001, 0.01, 0.1, 1, 10, 100]
    forest_sizes = [10, 20, 50, 100, 200]

    models = [
        {
            'name': 'K Nearest Neighbors Classifier',
            'class': sklearn.neighbors.KNeighborsClassifier(),
            'parameters': {'n_neighbors': range(1, 12)}
        },
        {
            'name': 'Support Vector Classifier with Linear Kernel',
            'class': sklearn.svm.LinearSVC(dual='auto', max_iter=100000, random_state=random_state),
            'parameters': {'C': list(c_values)}
        },
        {
            'name': 'Support Vector Classifier with Radial Kernel',
            # max_iter is a hard cap on solver iterations, so some grid points
            # may stop before converging.
            'class': sklearn.svm.SVC(kernel='rbf', max_iter=1000, random_state=random_state),
            'parameters': {'C': list(c_values), 'gamma': list(c_values)}
        },
        {
            'name': "Logistic Regression with LASSO",
            # The L1 penalty needs a solver that supports it; liblinear does.
            'class': sklearn.linear_model.LogisticRegression(
                penalty='l1', solver='liblinear', max_iter=200000, random_state=random_state),
            'parameters': {'C': list(c_values)}
        },
        {
            'name': "Stochastic Gradient Descent Classifier",
            'class': sklearn.linear_model.SGDClassifier(random_state=random_state),
            'parameters': {'max_iter': [100, 1000], 'alpha': [0.0001, 0.001, 0.01, 0.1]}
        },
        {
            'name': "Decision Tree Classifier",
            'class': sklearn.tree.DecisionTreeClassifier(random_state=random_state),
            'parameters': {'max_depth': range(3, 15)}
        },
        {
            'name': "Random Forest Classifier",
            'class': sklearn.ensemble.RandomForestClassifier(random_state=random_state),
            'parameters': {'n_estimators': list(forest_sizes)}
        },
        {
            'name': "Extremely Randomized Trees Classifier",
            'class': sklearn.ensemble.ExtraTreesClassifier(random_state=random_state),
            'parameters': {'n_estimators': list(forest_sizes)}
        }
    ]
    return models



In [9]:
def auto_train_binary_classifier(df, y_column, models, test_size=0.2, random_state=42, metric='f1', k=5):
    """Pick the best of several candidate classifiers and score it on held-out data.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the feature columns and the target column.
    y_column : label
        Name of the target column in ``df``.
    models : iterable of dict
        Model specifications, forwarded to ``train_all_models``.
    test_size : float, default=0.2
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, default=42
        Forwarded to ``train_test_split`` as ``random_state``.
    metric : str, default='f1'
        Scoring name used for model selection on the training split.
    k : int, default=5
        Cross-validation setting used for model selection.

    Returns
    -------
    tuple
        ``(best_model_name, best_model, train_set_score, test_set_score)``.
        ``train_set_score`` is the winning model's selection score under
        ``metric``; ``test_set_score`` is always plain accuracy on the
        held-out split, whatever ``metric`` is.

    Raises
    ------
    IndexError
        If ``models`` is empty, because there is no first-ranked result.
    """
    features, target = extract_x_and_y(df, y_column)

    # Hold out part of the data; model selection only ever sees the rest.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    # Results come back ranked best-first, so the winner is at index 0.
    ranked = train_all_models(models, X_train, y_train, metric=metric, k=k)
    best_model_name, best_model, train_set_score = ranked[0]

    # Evaluate the winner on the untouched test split (accuracy, not `metric`).
    y_pred = best_model.predict(X_test)
    test_set_score = sklearn.metrics.accuracy_score(y_test, y_pred)

    return best_model_name, best_model, train_set_score, test_set_score



In [10]:

# Test the implementation

if __name__ == "__main__":
    from sklearn.datasets import load_breast_cancer

    # Put the breast-cancer features into a DataFrame and attach the labels
    # as a 'target' column so the pipeline can split them apart again.
    cancer = load_breast_cancer()
    cancer_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
    cancer_df['target'] = pd.Series(cancer.target)

    # Candidate estimators together with their hyper-parameter grids.
    models = specify_models()

    # Run model selection with the function's default split, metric and fold
    # settings, then unpack the winning model and its two scores.
    best_model_name, best_model, train_set_score, test_set_score = auto_train_binary_classifier(
        df=cancer_df, y_column='target', models=models
    )

    # Report the winner: selection score first, held-out accuracy second.
    print(f"Best Model: {best_model_name}")
    print(f"Trained Model: {best_model}")
    print(f"Training Set Score (F1): {train_set_score}")
    print(f"Test Set Score (Accuracy): {test_set_score}")

Best Model: Extremely Randomized Trees Classifier
Trained Model: GridSearchCV(cv=5, estimator=ExtraTreesClassifier(),
             param_grid={'n_estimators': [10, 20, 50, 100, 200]}, scoring='f1')
Training Set Score (F1): 0.9759783938300874
Test Set Score (Accuracy): 0.9649122807017544
