SVM

In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC

# List of feature counts for datasets
feature_counts = [25, 50, 100, 150,175,180, 200]

# List to store results
results = []

for count in feature_counts:
    # Load the dataset
    file_path = f"C:/Users/ACER/OneDrive - University of Jaffna/UOJ/Education/Research/Data Sets/GSE140842/Feature Selection/Information Gain/data_k_{count}.csv"
    df = pd.read_csv(file_path, index_col=0)

    X = df.drop('Diagnosis', axis=1)
    y = df['Diagnosis']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=32)

    # List of kernels to try
    kernels = ['linear', 'rbf', 'poly']

    for kernel in kernels:
        # Initialize an SVM model with the current kernel
        svm_model = SVC(kernel=kernel)

        # Train the SVM model on the training data
        svm_model.fit(X_train, y_train)

        # Evaluate the SVM model on the testing set
        accuracy = svm_model.score(X_test, y_test)

        # Perform k-fold cross-validation (e.g., 5-fold cross-validation)
        cv_scores = cross_val_score(svm_model, X_train, y_train, cv=5)

        # Store the results in a dictionary
        result = {
            'Feature_Count': count,
            'Kernel': kernel,
            'Accuracy_Test_Set': accuracy,
            'Cross_Val_Scores': cv_scores,
            'Mean_Cross_Val_Score': np.mean(cv_scores)
        }

        # Append the result to the list of results
        results.append(result)

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Print the final accuracy result table
results_df

Unnamed: 0,Feature_Count,Kernel,Accuracy_Test_Set,Cross_Val_Scores,Mean_Cross_Val_Score
0,25,linear,0.571429,"[0.5, 0.7272727272727273, 0.6363636363636364, ...",0.590909
1,25,rbf,0.357143,"[0.5, 0.5454545454545454, 0.5454545454545454, ...",0.536364
2,25,poly,0.714286,"[0.6666666666666666, 0.5454545454545454, 0.636...",0.642424
3,50,linear,0.857143,"[0.3333333333333333, 0.5454545454545454, 0.727...",0.521212
4,50,rbf,0.357143,"[0.5, 0.5454545454545454, 0.5454545454545454, ...",0.536364
5,50,poly,0.785714,"[0.4166666666666667, 0.36363636363636365, 0.54...",0.428788
6,100,linear,0.5,"[0.5833333333333334, 0.7272727272727273, 0.727...",0.625758
7,100,rbf,0.357143,"[0.5, 0.5454545454545454, 0.5454545454545454, ...",0.536364
8,100,poly,0.428571,"[0.4166666666666667, 0.8181818181818182, 0.636...",0.64697
9,150,linear,0.642857,"[0.5833333333333334, 0.5454545454545454, 0.545...",0.498485


Naive Bayes

In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB  # Import Gaussian Naive Bayes

# List of feature counts for datasets
feature_counts = [25, 50, 100, 150, 175, 180, 200]

# List to store results
results = []

for count in feature_counts:
    # Load the dataset
    file_path = f"C:/Users/ACER/OneDrive - University of Jaffna/UOJ/Education/Research/Data Sets/GSE140842/Feature Selection/Information Gain/data_k_{count}.csv"
    df = pd.read_csv(file_path, index_col=0)

    X = df.drop('Diagnosis', axis=1)
    y = df['Diagnosis']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=32)

    # Initialize a Gaussian Naive Bayes model
    nb_model = GaussianNB()

    # Train the Naive Bayes model on the training data
    nb_model.fit(X_train, y_train)

    # Evaluate the Naive Bayes model on the testing set
    accuracy = nb_model.score(X_test, y_test)

    # Perform k-fold cross-validation (e.g., 5-fold cross-validation)
    cv_scores = cross_val_score(nb_model, X_train, y_train, cv=5)

    # Store the results in a dictionary
    result = {
        'Feature_Count': count,
        'Algorithm': 'Naive Bayes',  # Added to specify the algorithm used
        'Accuracy_Test_Set': accuracy,
        'Cross_Val_Scores': cv_scores,
        'Mean_Cross_Val_Score': np.mean(cv_scores)
    }

    # Append the result to the list of results
    results.append(result)

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Print the final accuracy result table
results_df


Unnamed: 0,Feature_Count,Algorithm,Accuracy_Test_Set,Cross_Val_Scores,Mean_Cross_Val_Score
0,25,Naive Bayes,0.571429,"[0.8333333333333334, 0.9090909090909091, 0.545...",0.748485
1,50,Naive Bayes,0.642857,"[0.5833333333333334, 0.6363636363636364, 0.636...",0.643939
2,100,Naive Bayes,0.5,"[0.6666666666666666, 0.7272727272727273, 0.727...",0.678788
3,150,Naive Bayes,0.714286,"[0.75, 0.6363636363636364, 0.7272727272727273,...",0.713636
4,175,Naive Bayes,0.5,"[0.75, 0.8181818181818182, 0.6363636363636364,...",0.677273
5,180,Naive Bayes,0.642857,"[0.5833333333333334, 0.6363636363636364, 0.636...",0.625758
6,200,Naive Bayes,0.571429,"[0.6666666666666666, 0.8181818181818182, 0.454...",0.678788


Logistic Regression

In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression  # Import Logistic Regression

# List of feature counts for datasets
feature_counts = [25, 50, 100, 150, 175, 180, 200]

# List to store results
results = []

for count in feature_counts:
    # Load the dataset
    file_path = f"C:/Users/ACER/OneDrive - University of Jaffna/UOJ/Education/Research/Data Sets/GSE140842/Feature Selection/Information Gain/data_k_{count}.csv"
    df = pd.read_csv(file_path, index_col=0)

    X = df.drop('Diagnosis', axis=1)
    y = df['Diagnosis']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=32)

    # Initialize a Logistic Regression model
    # Increase max_iter to 1000 or a higher value
    lr_model = LogisticRegression(max_iter=1000)

    # Train the Logistic Regression model on the training data
    lr_model.fit(X_train, y_train)

    # Evaluate the Logistic Regression model on the testing set
    accuracy = lr_model.score(X_test, y_test)

    # Perform k-fold cross-validation (e.g., 5-fold cross-validation)
    cv_scores = cross_val_score(lr_model, X_train, y_train, cv=5)

    # Store the results in a dictionary
    result = {
        'Feature_Count': count,
        'Algorithm': 'Logistic Regression',  # Specify the algorithm used
        'Accuracy_Test_Set': accuracy,
        'Cross_Val_Scores': cv_scores,
        'Mean_Cross_Val_Score': np.mean(cv_scores)
    }

    # Append the result to the list of results
    results.append(result)

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Print the final accuracy result table
results_df


Unnamed: 0,Feature_Count,Algorithm,Accuracy_Test_Set,Cross_Val_Scores,Mean_Cross_Val_Score
0,25,Logistic Regression,0.571429,"[0.5833333333333334, 0.7272727272727273, 0.636...",0.662121
1,50,Logistic Regression,0.785714,"[0.5833333333333334, 0.5454545454545454, 0.636...",0.589394
2,100,Logistic Regression,0.642857,"[0.5, 0.6363636363636364, 0.6363636363636364, ...",0.518182
3,150,Logistic Regression,0.642857,"[0.5, 0.7272727272727273, 0.45454545454545453,...",0.518182
4,175,Logistic Regression,0.642857,"[0.4166666666666667, 0.6363636363636364, 0.636...",0.501515
5,180,Logistic Regression,0.642857,"[0.5833333333333334, 0.6363636363636364, 0.545...",0.534848
6,200,Logistic Regression,0.642857,"[0.5, 0.8181818181818182, 0.5454545454545454, ...",0.554545


ANN

In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
# Import MLPClassifier from scikit-learn
from sklearn.neural_network import MLPClassifier

# List of feature counts for datasets
feature_counts = [25, 50, 100, 150, 175, 180, 200]

# List to store results
results = []

for count in feature_counts:
    # Load the dataset
    file_path = f"C:/Users/ACER/OneDrive - University of Jaffna/UOJ/Education/Research/Data Sets/GSE140842/Feature Selection/Information Gain/data_k_{count}.csv"
    df = pd.read_csv(file_path, index_col=0)

    X = df.drop('Diagnosis', axis=1)
    y = df['Diagnosis']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=32)

    # Standardize the features (important for neural networks)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Initialize an MLPClassifier (Artificial Neural Network)
    ann_model = MLPClassifier(hidden_layer_sizes=(
        100, 50), max_iter=1000, random_state=32)

    # Train the ANN model on the training data
    ann_model.fit(X_train, y_train)

    # Evaluate the ANN model on the testing set
    accuracy = ann_model.score(X_test, y_test)

    # Perform k-fold cross-validation (e.g., 5-fold cross-validation)
    cv_scores = cross_val_score(ann_model, X_train, y_train, cv=5)

    # Store the results in a dictionary
    result = {
        'Feature_Count': count,
        'Algorithm': 'ANN',  # Specify the algorithm used
        'Accuracy_Test_Set': accuracy,
        'Cross_Val_Scores': cv_scores,
        'Mean_Cross_Val_Score': np.mean(cv_scores)
    }

    # Append the result to the list of results
    results.append(result)

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Print the final accuracy result table
results_df


Unnamed: 0,Feature_Count,Algorithm,Accuracy_Test_Set,Cross_Val_Scores,Mean_Cross_Val_Score
0,25,ANN,0.642857,"[0.5, 0.5454545454545454, 0.6363636363636364, ...",0.572727
1,50,ANN,0.571429,"[0.3333333333333333, 0.5454545454545454, 0.636...",0.466667
2,100,ANN,0.642857,"[0.5, 0.5454545454545454, 0.7272727272727273, ...",0.590909
3,150,ANN,0.642857,"[0.5, 0.5454545454545454, 0.7272727272727273, ...",0.536364
4,175,ANN,0.714286,"[0.5, 0.45454545454545453, 0.45454545454545453...",0.445455
5,180,ANN,0.642857,"[0.5, 0.8181818181818182, 0.7272727272727273, ...",0.572727
6,200,ANN,0.571429,"[0.6666666666666666, 0.6363636363636364, 0.727...",0.587879


Full Result Table for 3 folds

In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier  # Import Random Forest

# List of feature counts for datasets
feature_counts = [25, 50, 100, 150, 175, 180, 200]

# List of models to compare, including Random Forest
models = {
    'SVM': SVC(kernel='linear'),
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'ANN': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000),
    # Add Random Forest
    'Random Forest': RandomForestClassifier(n_estimators=100)
}

# Variables to track the best accuracy and model
best_accuracy = 0
best_model = None
best_feature_count = None

# Threshold for identifying potential overfitting
overfitting_threshold = 0.05  # You can adjust this threshold as needed

# Lists to store results
results = []

for count in feature_counts:
    # Load the dataset
    file_path = f"C:/Users/ACER/OneDrive - University of Jaffna/UOJ/Education/Research/Data Sets/GSE140842/Feature Selection/Information Gain/data_k_{count}.csv"
    df = pd.read_csv(file_path, index_col=0)

    X = df.drop('Diagnosis', axis=1)
    y = df['Diagnosis']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=32)

    # Standardize the features (important for some models)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    for model_name, model in models.items():
        # Train the model on the training data
        model.fit(X_train, y_train)

        # Evaluate the model on the testing set
        accuracy = model.score(X_test, y_test)

        # Perform k-fold cross-validation (e.g., 5-fold cross-validation)
        cv_scores = cross_val_score(model, X_train, y_train, cv=3)

        # Calculate the mean cross-validation score
        mean_cv_score = np.mean(cv_scores)

        # Calculate the difference between test accuracy and mean cross-validation score
        score_difference = accuracy - mean_cv_score

        # Check if the model may be overfitting based on the threshold
        if score_difference <= 0.1:
            # The model is not overfitting
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_model = model_name
                best_feature_count = count

        # Append the results for the current model and feature count to the list
        results.append({
            'Model': model_name,
            'Feature Count': count,
            'Accuracy on Test Set': accuracy,
            'Mean Cross-Validation Score': mean_cv_score,
            'Score Difference': score_difference
        })

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Sort the results DataFrame by model name
results_df = results_df.sort_values(by=['Model', 'Feature Count'])

# Print the best model and feature count
print(f"Best Model: {best_model}")
print(f"Best Feature Count: {best_feature_count}")
print(f"Best Accuracy: {best_accuracy}")

# Print the results DataFrame
print("\nResults DataFrame:")
results_df

# # Create a table plot of the results DataFrame
# fig, ax = plt.subplots(figsize=(20, 12))
# ax.axis('tight')
# ax.axis('off')
# ax.table(cellText=results_df.values, colLabels=results_df.columns,
#          cellLoc='center', loc='center')

# # Save the table plot as an image
# table_image_path = 'results_table_fold3.png'
# plt.savefig(table_image_path, bbox_inches='tight')
# plt.show()

# # Print the path to the saved image
# print(f"Results table image saved at: {table_image_path}")


Best Model: Random Forest
Best Feature Count: 50
Best Accuracy: 0.9285714285714286

Results DataFrame:


Unnamed: 0,Model,Feature Count,Accuracy on Test Set,Mean Cross-Validation Score,Score Difference
3,ANN,25,0.428571,0.570175,-0.141604
8,ANN,50,0.642857,0.481481,0.161376
13,ANN,100,0.571429,0.6423,-0.070872
18,ANN,150,0.642857,0.463938,0.17892
23,ANN,175,0.642857,0.426901,0.215957
28,ANN,180,0.642857,0.479532,0.163325
33,ANN,200,0.642857,0.550682,0.092175
2,Logistic Regression,25,0.642857,0.608187,0.03467
7,Logistic Regression,50,0.785714,0.516569,0.269145
12,Logistic Regression,100,0.428571,0.586745,-0.158173


Full Result Table for 5 folds

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier  # Import Random Forest

# List of feature counts for datasets
feature_counts = [10,11,12,13,14,15]

# List of models to compare, including Random Forest
models = {
    # 'SVM': SVC(kernel='linear'),
    # 'Naive Bayes': GaussianNB(),
    # 'Logistic Regression': LogisticRegression(max_iter=1000),
    # 'ANN': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
}
# xgbOOST
# Variables to track the best accuracy and model
best_accuracy = 0
best_model = None
best_feature_count = None

# Threshold for identifying potential overfitting
overfitting_threshold = 0.05  

# Lists to store results
results = []

for count in feature_counts:
    # Load the dataset
    file_path = f"C:/Users/ACER/OneDrive - University of Jaffna/UOJ/Education/Research/Data Sets/GSE140842/Feature Selection/Information Gain/data_k_{count}.csv"
    df = pd.read_csv(file_path, index_col=0)

    X = df.drop('Diagnosis', axis=1)
    y = df['Diagnosis']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=32)

    # Standardize the features (important for some models)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    for model_name, model in models.items():
        # Define hyperparameter grids for GridSearchCV
        param_grid_rf = {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['auto', 'sqrt'],
            'bootstrap': [True, False]
        }

        param_grid_xgb = {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2, 0.3],
            'max_depth': [3, 4, 5, 6],
            'min_child_weight': [1, 3, 5],
            'subsample': [0.7, 0.8, 0.9, 1.0],
            'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
            'gamma': [0, 0.1, 0.2, 0.3]
        }

        if model_name == 'Random Forest':
            grid_search = GridSearchCV(estimator=model, param_grid=param_grid_rf,
                                       scoring='accuracy', cv=5, n_jobs=-1, verbose=2)
        elif model_name == 'XGBoost':
            grid_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid_xgb,
                                             n_iter=100, scoring='accuracy', cv=5, n_jobs=-1, random_state=42, verbose=2)

        # Fit the grid search to your data
        grid_search.fit(X_train, y_train)

        # Get the best hyperparameters and model
        best_params = grid_search.best_params_
        best_accuracy_cv = grid_search.best_score_
        best_model = grid_search.best_estimator_

        # Get the cross-validation scores
        cross_val_scores = grid_search.cv_results_['mean_test_score']

        # Store mean and standard deviation of cross-validation scores
        mean_cv_score = np.mean(cross_val_scores)
        std_cv_score = np.std(cross_val_scores)

        # Evaluate the best model on the testing set
        accuracy_test = best_model.score(X_test, y_test)

        # Check if the model may be overfitting based on the threshold
        if (accuracy_test - best_accuracy_cv) <= overfitting_threshold:
            if accuracy_test > best_accuracy:
                best_accuracy = accuracy_test
                best_model_name = model_name
                best_feature_count = count

        # Append the results for the current model and feature count to the list
        results.append({
            'Model': model_name,
            'Feature Count': count,
            'Best Hyperparameters': best_params,
            'Accuracy on Test Set': accuracy_test,
            'Mean CV Score': mean_cv_score,
            'Std CV Score': std_cv_score
        })

        # Print feature count done
        print(f"Feature count {count} done for {model_name}")

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Sort the results DataFrame by model name
results_df = results_df.sort_values(by=['Model', 'Feature Count'])

# Print the best model and feature count
print(f"Best Model: {best_model_name}")
print(f"Best Feature Count: {best_feature_count}")
print(f"Best Accuracy: {best_accuracy}")

# Print the results DataFrame
print("\nResults DataFrame:")
results_df

# # Create a table plot of the results DataFrame
# fig, ax = plt.subplots(figsize=(20, 12))
# ax.axis('tight')
# ax.axis('off')
# ax.table(cellText=results_df.values, colLabels=results_df.columns,
#          cellLoc='center', loc='center')

# # Save the table plot as an image
# table_image_path = 'results_table_fold5.png'
# plt.savefig(table_image_path, bbox_inches='tight')
# plt.show()

# # Print the path to the saved image
# print(f"Results table image saved at: {table_image_path}")


Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Feature count 10 done for Random Forest
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Feature count 10 done for XGBoost
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Feature count 11 done for Random Forest
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Feature count 11 done for XGBoost
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Feature count 12 done for Random Forest
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Feature count 12 done for XGBoost
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Feature count 13 done for Random Forest
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Feature count 13 done for XGBoost
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Feature count 14 done for Random Forest
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Feature count 14 done for XGBoost
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Feature count 15 done for Random Forest
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Feature count 15 done for XGBoost
Best Model: XGBoost
Best Feature Count: 10
Best Accuracy: 0.7142857142857143

Results DataFrame:


Unnamed: 0,Model,Feature Count,Best Hyperparameters,Accuracy on Test Set,Mean CV Score,Std CV Score
0,Random Forest,10,"{'bootstrap': False, 'max_depth': None, 'max_f...",0.571429,0.831089,0.024359
2,Random Forest,11,"{'bootstrap': False, 'max_depth': None, 'max_f...",0.571429,0.863945,0.032034
4,Random Forest,12,"{'bootstrap': False, 'max_depth': None, 'max_f...",0.5,0.853648,0.032795
6,Random Forest,13,"{'bootstrap': True, 'max_depth': None, 'max_fe...",0.571429,0.806061,0.032239
8,Random Forest,14,"{'bootstrap': True, 'max_depth': None, 'max_fe...",0.714286,0.828563,0.027094
10,Random Forest,15,"{'bootstrap': True, 'max_depth': None, 'max_fe...",0.714286,0.852525,0.034863
1,XGBoost,10,"{'subsample': 0.7, 'n_estimators': 200, 'min_c...",0.714286,0.739182,0.107768
3,XGBoost,11,"{'subsample': 0.7, 'n_estimators': 200, 'min_c...",0.642857,0.72097,0.099187
5,XGBoost,12,"{'subsample': 0.7, 'n_estimators': 100, 'min_c...",0.571429,0.711439,0.100313
7,XGBoost,13,"{'subsample': 1.0, 'n_estimators': 50, 'min_ch...",0.571429,0.715894,0.106263
