SVM


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC

# List of feature counts for datasets
feature_counts = [10, 15, 25, 50, 100, 150, 175, 180, 200]

# List to store results
results = []

for count in feature_counts:
    # Load the dataset
    file_path = f"C:/Users/Ashfa Fathima/OneDrive - University of Jaffna/Research/Data Sets/GSE160310/smallRNA/Feature Selection/Information Gain/data_k_{count}.csv"
    df = pd.read_csv(file_path, index_col=0)

    X = df.drop('Disese Group', axis=1)
    y = df['Disese Group']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=32)

    # List of kernels to try
    kernels = ['linear', 'rbf', 'poly']

    for kernel in kernels:
        # Initialize an SVM model with the current kernel
        svm_model = SVC(kernel=kernel)

        # Train the SVM model on the training data
        svm_model.fit(X_train, y_train)

        # Evaluate the SVM model on the testing set
        accuracy = svm_model.score(X_test, y_test)

        # Perform k-fold cross-validation (e.g., 5-fold cross-validation)
        cv_scores = cross_val_score(svm_model, X_train, y_train, cv=5)

        # Store the results in a dictionary
        result = {
            'Feature_Count': count,
            'Kernel': kernel,
            'Accuracy_Test_Set': accuracy,
            'Cross_Val_Scores': cv_scores,
            'Mean_Cross_Val_Score': np.mean(cv_scores)
        }

        # Append the result to the list of results
        results.append(result)

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Print the final accuracy result table
results_df

Unnamed: 0,Feature_Count,Kernel,Accuracy_Test_Set,Cross_Val_Scores,Mean_Cross_Val_Score
0,10,linear,0.4375,"[0.5384615384615384, 0.5384615384615384, 0.538...",0.55641
1,10,rbf,0.625,"[0.6153846153846154, 0.6153846153846154, 0.384...",0.55641
2,10,poly,0.625,"[0.7692307692307693, 0.5384615384615384, 0.307...",0.50641
3,15,linear,0.4375,"[0.6153846153846154, 0.6153846153846154, 0.384...",0.523077
4,15,rbf,0.625,"[0.5384615384615384, 0.6153846153846154, 0.384...",0.541026
5,15,poly,0.5,"[0.7692307692307693, 0.5384615384615384, 0.307...",0.50641
6,25,linear,0.625,"[0.6153846153846154, 0.8461538461538461, 0.538...",0.716667
7,25,rbf,0.6875,"[0.6153846153846154, 0.8461538461538461, 0.384...",0.652564
8,25,poly,0.5,"[0.7692307692307693, 0.6153846153846154, 0.384...",0.603846
9,50,linear,0.4375,"[0.6153846153846154, 0.6923076923076923, 0.692...",0.7


Naive Bayes

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB  # Import Gaussian Naive Bayes

# List of feature counts for datasets
feature_counts = [10, 15, 25, 50, 100, 150, 175, 180, 200]

# List to store results
results = []

for count in feature_counts:
    # Load the dataset
    file_path = f"C:/Users/Ashfa Fathima/OneDrive - University of Jaffna/Research/Data Sets/GSE160310/smallRNA/Feature Selection/Information Gain/data_k_{count}.csv"
    df = pd.read_csv(file_path, index_col=0)

    X = df.drop('Disese Group', axis=1)
    y = df['Disese Group']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=32)

    # Initialize a Gaussian Naive Bayes model
    nb_model = GaussianNB()

    # Train the Naive Bayes model on the training data
    nb_model.fit(X_train, y_train)

    # Evaluate the Naive Bayes model on the testing set
    accuracy = nb_model.score(X_test, y_test)

    # Perform k-fold cross-validation (e.g., 5-fold cross-validation)
    cv_scores = cross_val_score(nb_model, X_train, y_train, cv=5)

    # Store the results in a dictionary
    result = {
        'Feature_Count': count,
        'Algorithm': 'Naive Bayes',  # Added to specify the algorithm used
        'Accuracy_Test_Set': accuracy,
        'Cross_Val_Scores': cv_scores,
        'Mean_Cross_Val_Score': np.mean(cv_scores)
    }

    # Append the result to the list of results
    results.append(result)

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Print the final accuracy result table
results_df

Unnamed: 0,Feature_Count,Algorithm,Accuracy_Test_Set,Cross_Val_Scores,Mean_Cross_Val_Score
0,10,Naive Bayes,0.75,"[0.6153846153846154, 0.6923076923076923, 0.307...",0.539744
1,15,Naive Bayes,0.6875,"[0.6923076923076923, 0.7692307692307693, 0.307...",0.570513
2,25,Naive Bayes,0.8125,"[0.46153846153846156, 0.7692307692307693, 0.30...",0.591026
3,50,Naive Bayes,0.75,"[0.38461538461538464, 0.7692307692307693, 0.30...",0.542308
4,100,Naive Bayes,0.625,"[0.5384615384615384, 0.6153846153846154, 0.307...",0.525641
5,150,Naive Bayes,0.625,"[0.46153846153846156, 0.6923076923076923, 0.23...",0.510256
6,175,Naive Bayes,0.6875,"[0.5384615384615384, 0.6923076923076923, 0.230...",0.508974
7,180,Naive Bayes,0.75,"[0.5384615384615384, 0.6923076923076923, 0.307...",0.541026
8,200,Naive Bayes,0.75,"[0.46153846153846156, 0.6923076923076923, 0.23...",0.49359


Logistic Regression

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression  # Import Logistic Regression

# List of feature counts for datasets
feature_counts = [10, 15, 25, 50, 100, 150, 175, 180, 200]

# List to store results
results = []

for count in feature_counts:
    # Load the dataset
    file_path = f"C:/Users/Ashfa Fathima/OneDrive - University of Jaffna/Research/Data Sets/GSE160310/smallRNA/Feature Selection/Information Gain/data_k_{count}.csv"
    df = pd.read_csv(file_path, index_col=0)

    X = df.drop('Disese Group', axis=1)
    y = df['Disese Group']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=32)

    # Initialize a Logistic Regression model
    # Increase max_iter to 1000 or a higher value
    lr_model = LogisticRegression(max_iter=1000)

    # Train the Logistic Regression model on the training data
    lr_model.fit(X_train, y_train)

    # Evaluate the Logistic Regression model on the testing set
    accuracy = lr_model.score(X_test, y_test)

    # Perform k-fold cross-validation (e.g., 5-fold cross-validation)
    cv_scores = cross_val_score(lr_model, X_train, y_train, cv=5)

    # Store the results in a dictionary
    result = {
        'Feature_Count': count,
        'Algorithm': 'Logistic Regression',  # Specify the algorithm used
        'Accuracy_Test_Set': accuracy,
        'Cross_Val_Scores': cv_scores,
        'Mean_Cross_Val_Score': np.mean(cv_scores)
    }

    # Append the result to the list of results
    results.append(result)

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Print the final accuracy result table
results_df

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,Feature_Count,Algorithm,Accuracy_Test_Set,Cross_Val_Scores,Mean_Cross_Val_Score
0,10,Logistic Regression,0.625,"[0.6153846153846154, 0.6923076923076923, 0.615...",0.617949
1,15,Logistic Regression,0.5625,"[0.6153846153846154, 0.46153846153846156, 0.30...",0.476923
2,25,Logistic Regression,0.4375,"[0.46153846153846156, 0.6153846153846154, 0.76...",0.719231
3,50,Logistic Regression,0.5,"[0.6153846153846154, 0.6153846153846154, 0.846...",0.765385
4,100,Logistic Regression,0.6875,"[0.5384615384615384, 0.5384615384615384, 0.923...",0.716667
5,150,Logistic Regression,0.625,"[0.6923076923076923, 0.6153846153846154, 0.923...",0.746154
6,175,Logistic Regression,0.6875,"[0.6923076923076923, 0.6153846153846154, 0.923...",0.762821
7,180,Logistic Regression,0.625,"[0.6923076923076923, 0.6153846153846154, 0.923...",0.762821
8,200,Logistic Regression,0.625,"[0.6153846153846154, 0.6923076923076923, 0.923...",0.729487


ANN

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
# Import MLPClassifier from scikit-learn
from sklearn.neural_network import MLPClassifier

# List of feature counts for datasets
feature_counts = [10, 15, 25, 50, 100, 150, 175, 180, 200]

# List to store results
results = []

for count in feature_counts:
    # Load the dataset
    file_path = f"C:/Users/Ashfa Fathima/OneDrive - University of Jaffna/Research/Data Sets/GSE160310/smallRNA/Feature Selection/Information Gain/data_k_{count}.csv"
    df = pd.read_csv(file_path, index_col=0)

    X = df.drop('Disese Group', axis=1)
    y = df['Disese Group']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=32)

    # Standardize the features (important for neural networks)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Initialize an MLPClassifier (Artificial Neural Network)
    ann_model = MLPClassifier(hidden_layer_sizes=(
        100, 50), max_iter=1000, random_state=32)

    # Train the ANN model on the training data
    ann_model.fit(X_train, y_train)

    # Evaluate the ANN model on the testing set
    accuracy = ann_model.score(X_test, y_test)

    # Perform k-fold cross-validation (e.g., 5-fold cross-validation)
    cv_scores = cross_val_score(ann_model, X_train, y_train, cv=5)

    # Store the results in a dictionary
    result = {
        'Feature_Count': count,
        'Algorithm': 'ANN',  # Specify the algorithm used
        'Accuracy_Test_Set': accuracy,
        'Cross_Val_Scores': cv_scores,
        'Mean_Cross_Val_Score': np.mean(cv_scores)
    }

    # Append the result to the list of results
    results.append(result)

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Print the final accuracy result table
results_df

Unnamed: 0,Feature_Count,Algorithm,Accuracy_Test_Set,Cross_Val_Scores,Mean_Cross_Val_Score
0,10,ANN,0.6875,"[0.5384615384615384, 0.7692307692307693, 0.615...",0.701282
1,15,ANN,0.5625,"[0.6923076923076923, 0.6923076923076923, 0.615...",0.65
2,25,ANN,0.8125,"[0.9230769230769231, 0.8461538461538461, 0.692...",0.842308
3,50,ANN,0.5625,"[0.6153846153846154, 0.6923076923076923, 0.846...",0.730769
4,100,ANN,0.625,"[0.7692307692307693, 0.7692307692307693, 0.615...",0.697436
5,150,ANN,0.625,"[0.6923076923076923, 0.7692307692307693, 0.615...",0.682051
6,175,ANN,0.8125,"[0.6153846153846154, 0.7692307692307693, 0.615...",0.683333
7,180,ANN,0.8125,"[0.6923076923076923, 0.8461538461538461, 0.692...",0.712821
8,200,ANN,0.6875,"[0.6153846153846154, 0.8461538461538461, 0.615...",0.715385


Full Result Table for 3 Folds

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier  # Import Random Forest

# List of feature counts for datasets
feature_counts = [10, 15, 25, 50, 100, 150, 175, 180, 200]

# List of models to compare, including Random Forest
models = {
    'SVM': SVC(kernel='linear'),
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'ANN': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000),
    # Add Random Forest
    'Random Forest': RandomForestClassifier(n_estimators=100)
}

# Variables to track the best accuracy and model
best_accuracy = 0
best_model = None
best_feature_count = None

# Threshold for identifying potential overfitting
overfitting_threshold = 0.05  # You can adjust this threshold as needed

# Lists to store results
results = []

for count in feature_counts:
    # Load the dataset
    file_path = f"C:/Users/Ashfa Fathima/OneDrive - University of Jaffna/Research/Data Sets/GSE160310/smallRNA/Feature Selection/Information Gain/data_k_{count}.csv"
    df = pd.read_csv(file_path, index_col=0)

    X = df.drop('Disese Group', axis=1)
    y = df['Disese Group']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=32)

    # Standardize the features (important for some models)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    for model_name, model in models.items():
        # Train the model on the training data
        model.fit(X_train, y_train)

        # Evaluate the model on the testing set
        accuracy = model.score(X_test, y_test)

        # Perform k-fold cross-validation (e.g., 5-fold cross-validation)
        cv_scores = cross_val_score(model, X_train, y_train, cv=3)

        # Calculate the mean cross-validation score
        mean_cv_score = np.mean(cv_scores)

        # Calculate the difference between test accuracy and mean cross-validation score
        score_difference = accuracy - mean_cv_score

        # Check if the model may be overfitting based on the threshold
        if score_difference <= 0.1:
            # The model is not overfitting
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_model = model_name
                best_feature_count = count

        # Append the results for the current model and feature count to the list
        results.append({
            'Model': model_name,
            'Feature Count': count,
            'Accuracy on Test Set': accuracy,
            'Mean Cross-Validation Score': mean_cv_score,
            'Score Difference': score_difference
        })

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Sort the results DataFrame by model name
results_df = results_df.sort_values(by=['Model', 'Feature Count'])

# Print the best model and feature count
print(f"Best Model: {best_model}")
print(f"Best Feature Count: {best_feature_count}")
print(f"Best Accuracy: {best_accuracy}")

# Print the results DataFrame
print("\nResults DataFrame:")
results_df

# # Create a table plot of the results DataFrame
# fig, ax = plt.subplots(figsize=(20, 12))
# ax.axis('tight')
# ax.axis('off')
# ax.table(cellText=results_df.values, colLabels=results_df.columns,
#          cellLoc='center', loc='center')

# # Save the table plot as an image
# table_image_path = 'results_table_fold3.png'
# plt.savefig(table_image_path, bbox_inches='tight')
# plt.show()

# # Print the path to the saved image
# print(f"Results table image saved at: {table_image_path}")


Best Model: Random Forest
Best Feature Count: 100
Best Accuracy: 0.8125

Results DataFrame:


Unnamed: 0,Model,Feature Count,Accuracy on Test Set,Mean Cross-Validation Score,Score Difference
3,ANN,10,0.75,0.730159,0.019841
8,ANN,15,0.5625,0.587302,-0.024802
13,ANN,25,0.75,0.746032,0.003968
18,ANN,50,0.75,0.68254,0.06746
23,ANN,100,0.4375,0.714286,-0.276786
28,ANN,150,0.6875,0.698413,-0.010913
33,ANN,175,0.6875,0.666667,0.020833
38,ANN,180,0.6875,0.730159,-0.042659
43,ANN,200,0.8125,0.650794,0.161706
2,Logistic Regression,10,0.75,0.68254,0.06746


Full Result Table For 5 Folds

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier  # Import Random Forest

# List of feature counts for datasets
feature_counts = [75,76,77,78,79,80,81,82,83,84,85]

# List of models to compare, including Random Forest
models = {
    # 'SVM': SVC(kernel='linear'),
    # 'Naive Bayes': GaussianNB(),
    # 'Logistic Regression': LogisticRegression(max_iter=1000),
    # 'ANN': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
}
# xgbOOST
# Variables to track the best accuracy and model
best_accuracy = 0
best_model = None
best_feature_count = None

# Threshold for identifying potential overfitting
overfitting_threshold = 0.05

# Lists to store results
results = []

for count in feature_counts:
    # Load the dataset
    file_path = f"C:/Users/ACER/OneDrive - University of Jaffna/UOJ/Education/Research/Data Sets/GSE160310/smallRNA/Feature Selection/Information Gain/data_k_{count}.csv"
    df = pd.read_csv(file_path, index_col=0)

    X = df.drop('Diagnosis', axis=1)
    y = df['Diagnosis']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=32)

    # Standardize the features (important for some models)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    for model_name, model in models.items():
        # Define hyperparameter grids for GridSearchCV
        param_grid_rf = {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['auto', 'sqrt'],
            'bootstrap': [True, False]
        }

        param_grid_xgb = {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2, 0.3],
            'max_depth': [3, 4, 5, 6],
            'min_child_weight': [1, 3, 5],
            'subsample': [0.7, 0.8, 0.9, 1.0],
            'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
            'gamma': [0, 0.1, 0.2, 0.3]
        }

        if model_name == 'Random Forest':
            grid_search = GridSearchCV(estimator=model, param_grid=param_grid_rf,
                                       scoring='accuracy', cv=5, n_jobs=-1, verbose=2)
        elif model_name == 'XGBoost':
            grid_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid_xgb,
                                             n_iter=100, scoring='accuracy', cv=5, n_jobs=-1, random_state=42, verbose=2)

        # Fit the grid search to your data
        grid_search.fit(X_train, y_train)

        # Get the best hyperparameters and model
        best_params = grid_search.best_params_
        best_accuracy_cv = grid_search.best_score_
        best_model = grid_search.best_estimator_

        # Get the cross-validation scores
        cross_val_scores = grid_search.cv_results_['mean_test_score']

        # Store mean and standard deviation of cross-validation scores
        mean_cv_score = np.mean(cross_val_scores)
        std_cv_score = np.std(cross_val_scores)

        # Evaluate the best model on the testing set
        accuracy_test = best_model.score(X_test, y_test)

        # Check if the model may be overfitting based on the threshold
        if (accuracy_test - best_accuracy_cv) <= overfitting_threshold:
            if accuracy_test > best_accuracy:
                best_accuracy = accuracy_test
                best_model_name = model_name
                best_feature_count = count

        # Append the results for the current model and feature count to the list
        results.append({
            'Model': model_name,
            'Feature Count': count,
            'Best Hyperparameters': best_params,
            'Accuracy on Test Set': accuracy_test,
            'Mean CV Score': mean_cv_score,
            'Std CV Score': std_cv_score
        })

        # Print feature count done
        print(f"Feature count {count} done for {model_name}")

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Sort the results DataFrame by model name
results_df = results_df.sort_values(by=['Model', 'Feature Count'])

# Print the best model and feature count
print(f"Best Model: {best_model_name}")
print(f"Best Feature Count: {best_feature_count}")
print(f"Best Accuracy: {best_accuracy}")

# Print the results DataFrame
print("\nResults DataFrame:")
results_df

# # Create a table plot of the results DataFrame
# fig, ax = plt.subplots(figsize=(20, 12))
# ax.axis('tight')
# ax.axis('off')
# ax.table(cellText=results_df.values, colLabels=results_df.columns,
#          cellLoc='center', loc='center')

# # Save the table plot as an image
# table_image_path = 'results_table_fold5.png'
# plt.savefig(table_image_path, bbox_inches='tight')
# plt.show()

# # Print the path to the saved image
# print(f"Results table image saved at: {table_image_path}")


Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Feature count 75 done for Random Forest
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Feature count 75 done for XGBoost
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Feature count 76 done for Random Forest
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Feature count 76 done for XGBoost
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Feature count 77 done for Random Forest
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Feature count 77 done for XGBoost
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Feature count 78 done for Random Forest
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Feature count 78 done for XGBoost
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Feature count 79 done for Random Forest
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Feature count 79 done for XGBoost
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Feature count 80 done for Random Forest
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Feature count 80 done for XGBoost
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Feature count 81 done for Random Forest
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Feature count 81 done for XGBoost
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Feature count 82 done for Random Forest
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Feature count 82 done for XGBoost
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Feature count 83 done for Random Forest
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Feature count 83 done for XGBoost
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Feature count 84 done for Random Forest
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Feature count 84 done for XGBoost
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Feature count 85 done for Random Forest
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Feature count 85 done for XGBoost
Best Model: XGBoost
Best Feature Count: 76
Best Accuracy: 0.875

Results DataFrame:


Unnamed: 0,Model,Feature Count,Best Hyperparameters,Accuracy on Test Set,Mean CV Score,Std CV Score
0,Random Forest,75,"{'bootstrap': False, 'max_depth': None, 'max_f...",0.8125,0.786895,0.035603
2,Random Forest,76,"{'bootstrap': False, 'max_depth': None, 'max_f...",0.9375,0.825237,0.030173
4,Random Forest,77,"{'bootstrap': False, 'max_depth': None, 'max_f...",0.8125,0.826804,0.027324
6,Random Forest,78,"{'bootstrap': False, 'max_depth': None, 'max_f...",0.875,0.806149,0.022058
8,Random Forest,79,"{'bootstrap': False, 'max_depth': None, 'max_f...",0.875,0.834117,0.028338
10,Random Forest,80,"{'bootstrap': True, 'max_depth': None, 'max_fe...",0.875,0.840741,0.014409
12,Random Forest,81,"{'bootstrap': False, 'max_depth': None, 'max_f...",0.875,0.78585,0.030833
14,Random Forest,82,"{'bootstrap': False, 'max_depth': None, 'max_f...",0.875,0.846201,0.019043
16,Random Forest,83,"{'bootstrap': False, 'max_depth': None, 'max_f...",0.8125,0.82189,0.027393
18,Random Forest,84,"{'bootstrap': True, 'max_depth': None, 'max_fe...",0.9375,0.826353,0.024064
