In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC

# List of feature counts for datasets
feature_counts = [10, 15, 25, 50, 100, 150, 175, 180, 200]

# List to store results
results = []

for count in feature_counts:
    # Load the dataset
    file_path = f"C:/Users/Ashfa Fathima/OneDrive - University of Jaffna/Research/Data Sets/GSE160310/totalRNA/Feature Selection/Information Gain/data_k_{count}.csv"
    df = pd.read_csv(file_path, index_col=0)

    X = df.drop('Disese Group', axis=1)
    y = df['Disese Group']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=32)

    # List of kernels to try
    kernels = ['linear', 'rbf', 'poly']

    for kernel in kernels:
        # Initialize an SVM model with the current kernel
        svm_model = SVC(kernel=kernel)

        # Train the SVM model on the training data
        svm_model.fit(X_train, y_train)

        # Evaluate the SVM model on the testing set
        accuracy = svm_model.score(X_test, y_test)

        # Perform k-fold cross-validation (e.g., 5-fold cross-validation)
        cv_scores = cross_val_score(svm_model, X_train, y_train, cv=5)

        # Store the results in a dictionary
        result = {
            'Feature_Count': count,
            'Kernel': kernel,
            'Accuracy_Test_Set': accuracy,
            'Cross_Val_Scores': cv_scores,
            'Mean_Cross_Val_Score': np.mean(cv_scores)
        }

        # Append the result to the list of results
        results.append(result)

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Print the final accuracy result table
results_df

Unnamed: 0,Feature_Count,Kernel,Accuracy_Test_Set,Cross_Val_Scores,Mean_Cross_Val_Score
0,10,linear,0.6875,"[0.6923076923076923, 0.6153846153846154, 0.923...",0.629487
1,10,rbf,0.5625,"[0.6923076923076923, 0.6153846153846154, 0.538...",0.569231
2,10,poly,0.625,"[0.6153846153846154, 0.6923076923076923, 0.538...",0.585897
3,15,linear,0.6875,"[0.7692307692307693, 1.0, 0.9230769230769231, ...",0.855128
4,15,rbf,0.5625,"[0.6923076923076923, 0.7692307692307693, 0.615...",0.632051
5,15,poly,0.5625,"[0.6923076923076923, 0.6923076923076923, 0.615...",0.583333
6,25,linear,0.5,"[0.5384615384615384, 0.6153846153846154, 0.769...",0.684615
7,25,rbf,0.5,"[0.6153846153846154, 0.6153846153846154, 0.461...",0.505128
8,25,poly,0.5625,"[0.6153846153846154, 0.6153846153846154, 0.615...",0.535897
9,50,linear,0.5625,"[0.6153846153846154, 0.5384615384615384, 0.923...",0.582051


NAIVE BAYES


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB  # Import Gaussian Naive Bayes

# List of feature counts for datasets
feature_counts = [10, 15, 25, 50, 100, 150, 175, 180, 200]

# List to store results
results = []

for count in feature_counts:
    # Load the dataset
    file_path = f"C:/Users/Ashfa Fathima/OneDrive - University of Jaffna/Research/Data Sets/GSE160310/totalRNA/Feature Selection/Information Gain/data_k_{count}.csv"
    df = pd.read_csv(file_path, index_col=0)

    X = df.drop('Disese Group', axis=1)
    y = df['Disese Group']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=32)

    # Initialize a Gaussian Naive Bayes model
    nb_model = GaussianNB()

    # Train the Naive Bayes model on the training data
    nb_model.fit(X_train, y_train)

    # Evaluate the Naive Bayes model on the testing set
    accuracy = nb_model.score(X_test, y_test)

    # Perform k-fold cross-validation (e.g., 5-fold cross-validation)
    cv_scores = cross_val_score(nb_model, X_train, y_train, cv=5)

    # Store the results in a dictionary
    result = {
        'Feature_Count': count,
        'Algorithm': 'Naive Bayes',  # Added to specify the algorithm used
        'Accuracy_Test_Set': accuracy,
        'Cross_Val_Scores': cv_scores,
        'Mean_Cross_Val_Score': np.mean(cv_scores)
    }

    # Append the result to the list of results
    results.append(result)

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Print the final accuracy result table
results_df

Unnamed: 0,Feature_Count,Algorithm,Accuracy_Test_Set,Cross_Val_Scores,Mean_Cross_Val_Score
0,10,Naive Bayes,0.6875,"[0.6923076923076923, 0.6153846153846154, 0.769...",0.615385
1,15,Naive Bayes,0.5625,"[0.8461538461538461, 0.9230769230769231, 0.846...",0.773077
2,25,Naive Bayes,0.6875,"[0.8461538461538461, 0.9230769230769231, 0.769...",0.791026
3,50,Naive Bayes,0.625,"[0.6923076923076923, 0.6923076923076923, 0.692...",0.682051
4,100,Naive Bayes,0.75,"[0.8461538461538461, 0.7692307692307693, 0.692...",0.728205
5,150,Naive Bayes,0.6875,"[1.0, 0.9230769230769231, 0.7692307692307693, ...",0.821795
6,175,Naive Bayes,0.8125,"[0.9230769230769231, 0.7692307692307693, 0.846...",0.774359
7,180,Naive Bayes,0.8125,"[0.6923076923076923, 0.7692307692307693, 0.846...",0.744872
8,200,Naive Bayes,0.75,"[0.9230769230769231, 0.8461538461538461, 0.769...",0.807692


Logistic Regression

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression  # Import Logistic Regression

# List of feature counts for datasets
feature_counts = [10, 15, 25, 50, 100, 150, 175, 180, 200]

# List to store results
results = []

for count in feature_counts:
    # Load the dataset
    file_path = f"C:/Users/Ashfa Fathima/OneDrive - University of Jaffna/Research/Data Sets/GSE160310/totalRNA/Feature Selection/Information Gain/data_k_{count}.csv"
    df = pd.read_csv(file_path, index_col=0)

    X = df.drop('Disese Group', axis=1)
    y = df['Disese Group']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=32)

    # Initialize a Logistic Regression model
    # Increase max_iter to 1000 or a higher value
    lr_model = LogisticRegression(max_iter=1000)

    # Train the Logistic Regression model on the training data
    lr_model.fit(X_train, y_train)

    # Evaluate the Logistic Regression model on the testing set
    accuracy = lr_model.score(X_test, y_test)

    # Perform k-fold cross-validation (e.g., 5-fold cross-validation)
    cv_scores = cross_val_score(lr_model, X_train, y_train, cv=5)

    # Store the results in a dictionary
    result = {
        'Feature_Count': count,
        'Algorithm': 'Logistic Regression',  # Specify the algorithm used
        'Accuracy_Test_Set': accuracy,
        'Cross_Val_Scores': cv_scores,
        'Mean_Cross_Val_Score': np.mean(cv_scores)
    }

    # Append the result to the list of results
    results.append(result)

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Print the final accuracy result table
results_df

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,Feature_Count,Algorithm,Accuracy_Test_Set,Cross_Val_Scores,Mean_Cross_Val_Score
0,10,Logistic Regression,0.8125,"[0.46153846153846156, 0.6153846153846154, 0.69...",0.637179
1,15,Logistic Regression,0.6875,"[0.6153846153846154, 1.0, 0.6923076923076923, ...",0.761538
2,25,Logistic Regression,0.5625,"[0.7692307692307693, 0.8461538461538461, 0.846...",0.792308
3,50,Logistic Regression,0.5625,"[0.6153846153846154, 0.46153846153846156, 0.92...",0.583333
4,100,Logistic Regression,0.875,"[0.8461538461538461, 0.9230769230769231, 0.692...",0.825641
5,150,Logistic Regression,0.4375,"[0.6153846153846154, 0.9230769230769231, 0.615...",0.747436
6,175,Logistic Regression,0.875,"[0.9230769230769231, 0.6923076923076923, 0.615...",0.729487
7,180,Logistic Regression,0.6875,"[0.6923076923076923, 0.6923076923076923, 0.538...",0.667949
8,200,Logistic Regression,0.625,"[0.8461538461538461, 0.9230769230769231, 0.846...",0.789744


ANN

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
# Import MLPClassifier from scikit-learn
from sklearn.neural_network import MLPClassifier

# List of feature counts for datasets
feature_counts = [10, 15, 25, 50, 100, 150, 175, 180, 200]

# List to store results
results = []

for count in feature_counts:
    # Load the dataset
    file_path = f"C:/Users/Ashfa Fathima/OneDrive - University of Jaffna/Research/Data Sets/GSE160310/totalRNA/Feature Selection/Information Gain/data_k_{count}.csv"
    df = pd.read_csv(file_path, index_col=0)

    X = df.drop('Disese Group', axis=1)
    y = df['Disese Group']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=32)

    # Standardize the features (important for neural networks)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Initialize an MLPClassifier (Artificial Neural Network)
    ann_model = MLPClassifier(hidden_layer_sizes=(
        100, 50), max_iter=1000, random_state=32)

    # Train the ANN model on the training data
    ann_model.fit(X_train, y_train)

    # Evaluate the ANN model on the testing set
    accuracy = ann_model.score(X_test, y_test)

    # Perform k-fold cross-validation (e.g., 5-fold cross-validation)
    cv_scores = cross_val_score(ann_model, X_train, y_train, cv=5)

    # Store the results in a dictionary
    result = {
        'Feature_Count': count,
        'Algorithm': 'ANN',  # Specify the algorithm used
        'Accuracy_Test_Set': accuracy,
        'Cross_Val_Scores': cv_scores,
        'Mean_Cross_Val_Score': np.mean(cv_scores)
    }

    # Append the result to the list of results
    results.append(result)

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Print the final accuracy result table
results_df

Unnamed: 0,Feature_Count,Algorithm,Accuracy_Test_Set,Cross_Val_Scores,Mean_Cross_Val_Score
0,10,ANN,0.8125,"[0.6923076923076923, 0.6923076923076923, 0.846...",0.696154
1,15,ANN,0.6875,"[0.6923076923076923, 0.7692307692307693, 0.769...",0.729487
2,25,ANN,0.8125,"[0.7692307692307693, 0.8461538461538461, 0.769...",0.79359
3,50,ANN,0.6875,"[0.6923076923076923, 0.6153846153846154, 0.769...",0.615385
4,100,ANN,0.75,"[0.7692307692307693, 0.7692307692307693, 0.846...",0.74359
5,150,ANN,0.75,"[0.9230769230769231, 0.9230769230769231, 0.846...",0.855128
6,175,ANN,0.8125,"[0.7692307692307693, 0.8461538461538461, 0.769...",0.79359
7,180,ANN,0.8125,"[0.8461538461538461, 0.9230769230769231, 0.923...",0.838462
8,200,ANN,0.6875,"[0.7692307692307693, 0.8461538461538461, 0.846...",0.775641


Full Result Table For 3 Folds

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier  # Import Random Forest

# List of feature counts for datasets
feature_counts = [10, 15, 25, 50, 100, 150, 175, 180, 200]

# List of models to compare, including Random Forest
models = {
    'SVM': SVC(kernel='linear'),
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'ANN': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000),
    # Add Random Forest
    'Random Forest': RandomForestClassifier(n_estimators=100)
}

# Variables to track the best accuracy and model
best_accuracy = 0
best_model = None
best_feature_count = None

# Threshold for identifying potential overfitting
overfitting_threshold = 0.05  # You can adjust this threshold as needed

# Lists to store results
results = []

for count in feature_counts:
    # Load the dataset
    file_path = f"C:/Users/Ashfa Fathima/OneDrive - University of Jaffna/Research/Data Sets/GSE160310/totalRNA/Feature Selection/Information Gain/data_k_{count}.csv"
    df = pd.read_csv(file_path, index_col=0)

    X = df.drop('Disese Group', axis=1)
    y = df['Disese Group']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=32)

    # Standardize the features (important for some models)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    for model_name, model in models.items():
        # Train the model on the training data
        model.fit(X_train, y_train)

        # Evaluate the model on the testing set
        accuracy = model.score(X_test, y_test)

        # Perform k-fold cross-validation (e.g., 5-fold cross-validation)
        cv_scores = cross_val_score(model, X_train, y_train, cv=3)

        # Calculate the mean cross-validation score
        mean_cv_score = np.mean(cv_scores)

        # Calculate the difference between test accuracy and mean cross-validation score
        score_difference = accuracy - mean_cv_score

        # Check if the model may be overfitting based on the threshold
        if score_difference <= 0.1:
            # The model is not overfitting
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_model = model_name
                best_feature_count = count

        # Append the results for the current model and feature count to the list
        results.append({
            'Model': model_name,
            'Feature Count': count,
            'Accuracy on Test Set': accuracy,
            'Mean Cross-Validation Score': mean_cv_score,
            'Score Difference': score_difference
        })

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Sort the results DataFrame by model name
results_df = results_df.sort_values(by=['Model', 'Feature Count'])

# Print the best model and feature count
print(f"Best Model: {best_model}")
print(f"Best Feature Count: {best_feature_count}")
print(f"Best Accuracy: {best_accuracy}")

# Print the results DataFrame
print("\nResults DataFrame:")
results_df

# # Create a table plot of the results DataFrame
# fig, ax = plt.subplots(figsize=(20, 12))
# ax.axis('tight')
# ax.axis('off')
# ax.table(cellText=results_df.values, colLabels=results_df.columns,
#          cellLoc='center', loc='center')

# # Save the table plot as an image
# table_image_path = 'results_table_fold3.png'
# plt.savefig(table_image_path, bbox_inches='tight')
# plt.show()

# # Print the path to the saved image
# print(f"Results table image saved at: {table_image_path}")


Best Model: ANN
Best Feature Count: 175
Best Accuracy: 0.875

Results DataFrame:


Unnamed: 0,Model,Feature Count,Accuracy on Test Set,Mean Cross-Validation Score,Score Difference
3,ANN,10,0.8125,0.714286,0.098214
8,ANN,15,0.5625,0.746032,-0.183532
13,ANN,25,0.8125,0.761905,0.050595
18,ANN,50,0.625,0.666667,-0.041667
23,ANN,100,0.6875,0.714286,-0.026786
28,ANN,150,0.75,0.746032,0.003968
33,ANN,175,0.875,0.793651,0.081349
38,ANN,180,0.9375,0.793651,0.143849
43,ANN,200,0.75,0.777778,-0.027778
2,Logistic Regression,10,0.75,0.730159,0.019841


Full Result Table For 5 Folds

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier  # Import Random Forest

# List of feature counts for datasets
feature_counts = [10, 15, 25, 50, 100, 150, 175, 180, 200]

# List of models to compare, including Random Forest
models = {
    # 'SVM': SVC(kernel='linear'),
    # 'Naive Bayes': GaussianNB(),
    # 'Logistic Regression': LogisticRegression(max_iter=1000),
    # 'ANN': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
}
# xgbOOST
# Variables to track the best accuracy and model
best_accuracy = 0
best_model = None
best_feature_count = None

# Threshold for identifying potential overfitting
overfitting_threshold = 0.05  

# Lists to store results
results = []

for count in feature_counts:
    # Load the dataset
    file_path = f"C:/Users/ACER/OneDrive - University of Jaffna/UOJ/Education/Research/Data Sets/GSE160310/totalRNA/Feature Selection/Information Gain/data_k_{count}.csv"
    df = pd.read_csv(file_path, index_col=0)

    X = df.drop('Disese Group', axis=1)
    y = df['Disese Group']
    
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=32)

    # Standardize the features (important for some models)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    for model_name, model in models.items():
        # Define hyperparameter grids for GridSearchCV
        param_grid_rf = {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['auto', 'sqrt'],
            'bootstrap': [True, False]
        }

        param_grid_xgb = {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2, 0.3],
            'max_depth': [3, 4, 5, 6],
            'min_child_weight': [1, 3, 5],
            'subsample': [0.7, 0.8, 0.9, 1.0],
            'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
            'gamma': [0, 0.1, 0.2, 0.3]
        }

        if model_name == 'Random Forest':
            grid_search = GridSearchCV(estimator=model, param_grid=param_grid_rf,
                                       scoring='accuracy', cv=5, n_jobs=-1, verbose=2)
        elif model_name == 'XGBoost':
            grid_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid_xgb,
                                             n_iter=100, scoring='accuracy', cv=5, n_jobs=-1, random_state=42, verbose=2)

        # Fit the grid search to your data
        grid_search.fit(X_train, y_train)

        # Get the best hyperparameters and model
        best_params = grid_search.best_params_
        best_accuracy_cv = grid_search.best_score_
        best_model = grid_search.best_estimator_

        # Evaluate the best model on the testing set
        accuracy_test = best_model.score(X_test, y_test)

        # Check if the model may be overfitting based on the threshold
        if (accuracy_test - best_accuracy_cv) <= overfitting_threshold:
            if accuracy_test > best_accuracy:
                best_accuracy = accuracy_test
                best_model_name = model_name
                best_feature_count = count

        # Append the results for the current model and feature count to the list
        results.append({
            'Model': model_name,
            'Feature Count': count,
            'Best Hyperparameters': best_params,
            'Accuracy on Test Set': accuracy_test,
            'Best Cross-Validation Score': best_accuracy_cv
        })

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Sort the results DataFrame by model name
results_df = results_df.sort_values(by=['Model', 'Feature Count'])

# Print the best model and feature count
print(f"Best Model: {best_model_name}")
print(f"Best Feature Count: {best_feature_count}")
print(f"Best Accuracy: {best_accuracy}")

# Print the results DataFrame
print("\nResults DataFrame:")
results_df

# # Create a table plot of the results DataFrame
# fig, ax = plt.subplots(figsize=(20, 12))
# ax.axis('tight')
# ax.axis('off')
# ax.table(cellText=results_df.values, colLabels=results_df.columns,
#          cellLoc='center', loc='center')

# # Save the table plot as an image
# table_image_path = 'results_table_fold5.png'
# plt.savefig(table_image_path, bbox_inches='tight')
# plt.show()

# # Print the path to the saved image
# print(f"Results table image saved at: {table_image_path}")


Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Model: Random Forest
Best Feature Count: 150
Best Accuracy: 0.8125

Results DataFrame:


Unnamed: 0,Model,Feature Count,Best Hyperparameters,Accuracy on Test Set,Best Cross-Validation Score
0,Random Forest,10,"{'bootstrap': False, 'max_depth': None, 'max_f...",0.75,0.776923
2,Random Forest,15,"{'bootstrap': True, 'max_depth': None, 'max_fe...",0.5625,0.791026
4,Random Forest,25,"{'bootstrap': True, 'max_depth': None, 'max_fe...",0.6875,0.758974
6,Random Forest,50,"{'bootstrap': True, 'max_depth': None, 'max_fe...",0.75,0.824359
8,Random Forest,100,"{'bootstrap': True, 'max_depth': None, 'max_fe...",0.75,0.839744
10,Random Forest,150,"{'bootstrap': True, 'max_depth': None, 'max_fe...",0.8125,0.774359
12,Random Forest,175,"{'bootstrap': True, 'max_depth': None, 'max_fe...",0.9375,0.807692
14,Random Forest,180,"{'bootstrap': True, 'max_depth': None, 'max_fe...",0.625,0.774359
16,Random Forest,200,"{'bootstrap': False, 'max_depth': None, 'max_f...",0.9375,0.791026
1,XGBoost,10,"{'subsample': 0.9, 'n_estimators': 50, 'min_ch...",0.75,0.762821


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier  # Import Random Forest

# List of feature counts for datasets
feature_counts = [49,50,51,79,80]

# List of models to compare, including Random Forest
models = {
    # 'SVM': SVC(kernel='linear'),
    # 'Naive Bayes': GaussianNB(),
    # 'Logistic Regression': LogisticRegression(max_iter=1000),
    # 'ANN': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
}
# xgbOOST
# Variables to track the best accuracy and model
best_accuracy = 0
best_model = None
best_feature_count = None

# Threshold for identifying potential overfitting
overfitting_threshold = 0.05

# Lists to store results
results = []

for count in feature_counts:
    # Load the dataset
    file_path = f"C:/Users/ACER/OneDrive - University of Jaffna/UOJ/Education/Research/Data Sets/GSE160310/totalRNA/Feature Selection/Information Gain/data_k_{count}.csv"
    df = pd.read_csv(file_path, index_col=0)

    X = df.drop('Diagnosis', axis=1)
    y = df['Diagnosis']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=32)

    # Standardize the features (important for some models)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    for model_name, model in models.items():
        # Define hyperparameter grids for GridSearchCV
        param_grid_rf = {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['auto', 'sqrt'],
            'bootstrap': [True, False]
        }

        param_grid_xgb = {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2, 0.3],
            'max_depth': [3, 4, 5, 6],
            'min_child_weight': [1, 3, 5],
            'subsample': [0.7, 0.8, 0.9, 1.0],
            'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
            'gamma': [0, 0.1, 0.2, 0.3]
        }

        if model_name == 'Random Forest':
            grid_search = GridSearchCV(estimator=model, param_grid=param_grid_rf,
                                       scoring='accuracy', cv=5, n_jobs=-1, verbose=2)
        elif model_name == 'XGBoost':
            grid_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid_xgb,
                                             n_iter=100, scoring='accuracy', cv=5, n_jobs=-1, random_state=42, verbose=2)

        # Fit the grid search to your data
        grid_search.fit(X_train, y_train)

        # Get the best hyperparameters and model
        best_params = grid_search.best_params_
        best_accuracy_cv = grid_search.best_score_
        best_model = grid_search.best_estimator_

        # Get the cross-validation scores
        cross_val_scores = grid_search.cv_results_['mean_test_score']

        # Store mean and standard deviation of cross-validation scores
        mean_cv_score = np.mean(cross_val_scores)
        std_cv_score = np.std(cross_val_scores)

        # Evaluate the best model on the testing set
        accuracy_test = best_model.score(X_test, y_test)

        # Check if the model may be overfitting based on the threshold
        if (accuracy_test - best_accuracy_cv) <= overfitting_threshold:
            if accuracy_test > best_accuracy:
                best_accuracy = accuracy_test
                best_model_name = model_name
                best_feature_count = count

        # Append the results for the current model and feature count to the list
        results.append({
            'Model': model_name,
            'Feature Count': count,
            'Best Hyperparameters': best_params,
            'Accuracy on Test Set': accuracy_test,
            'Mean CV Score': mean_cv_score,
            'Std CV Score': std_cv_score
        })

        # Print feature count done
        print(f"Feature count {count} done for {model_name}")

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Sort the results DataFrame by model name
results_df = results_df.sort_values(by=['Model', 'Feature Count'])

# Print the best model and feature count
print(f"Best Model: {best_model_name}")
print(f"Best Feature Count: {best_feature_count}")
print(f"Best Accuracy: {best_accuracy}")

# Print the results DataFrame
print("\nResults DataFrame:")
results_df

# # Create a table plot of the results DataFrame
# fig, ax = plt.subplots(figsize=(20, 12))
# ax.axis('tight')
# ax.axis('off')
# ax.table(cellText=results_df.values, colLabels=results_df.columns,
#          cellLoc='center', loc='center')

# # Save the table plot as an image
# table_image_path = 'results_table_fold5.png'
# plt.savefig(table_image_path, bbox_inches='tight')
# plt.show()

# # Print the path to the saved image
# print(f"Results table image saved at: {table_image_path}")


Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Feature count 49 done for Random Forest
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Feature count 49 done for XGBoost
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Feature count 50 done for Random Forest
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Feature count 50 done for XGBoost
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Feature count 51 done for Random Forest
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Feature count 51 done for XGBoost
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Feature count 79 done for Random Forest
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Feature count 79 done for XGBoost
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Feature count 80 done for Random Forest
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Feature count 80 done for XGBoost
Best Model: Random Forest
Best Feature Count: 50
Best Accuracy: 0.75

Results DataFrame:


Unnamed: 0,Model,Feature Count,Best Hyperparameters,Accuracy on Test Set,Mean CV Score,Std CV Score
0,Random Forest,49,"{'bootstrap': False, 'max_depth': None, 'max_f...",0.625,0.820726,0.016587
2,Random Forest,50,"{'bootstrap': False, 'max_depth': None, 'max_f...",0.75,0.826448,0.025106
4,Random Forest,51,"{'bootstrap': True, 'max_depth': None, 'max_fe...",0.5,0.637868,0.026939
6,Random Forest,79,"{'bootstrap': False, 'max_depth': None, 'max_f...",0.625,0.739316,0.021376
8,Random Forest,80,"{'bootstrap': False, 'max_depth': None, 'max_f...",0.375,0.828086,0.019225
1,XGBoost,49,"{'subsample': 1.0, 'n_estimators': 200, 'min_c...",0.6875,0.731397,0.047973
3,XGBoost,50,"{'subsample': 0.9, 'n_estimators': 100, 'min_c...",0.75,0.795667,0.060316
5,XGBoost,51,"{'subsample': 0.8, 'n_estimators': 50, 'min_ch...",0.5,0.586731,0.035585
7,XGBoost,79,"{'subsample': 1.0, 'n_estimators': 50, 'min_ch...",0.625,0.758692,0.061504
9,XGBoost,80,"{'subsample': 0.7, 'n_estimators': 200, 'min_c...",0.4375,0.780449,0.081044
