SVM

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC

# List of feature counts for datasets
feature_counts = [25, 50, 100, 150,175,180, 200]

# List to store results
results = []

for count in feature_counts:
    # Load the dataset
    file_path = f"C:/Users/ACER/OneDrive - University of Jaffna/UOJ/Education/Research/Data Sets/GSE140842/Feature Selection/Information Gain/data_k_{count}.csv"
    df = pd.read_csv(file_path, index_col=0)

    X = df.drop('Diagnosis', axis=1)
    y = df['Diagnosis']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=32)

    # List of kernels to try
    kernels = ['linear', 'rbf', 'poly']

    for kernel in kernels:
        # Initialize an SVM model with the current kernel
        svm_model = SVC(kernel=kernel)

        # Train the SVM model on the training data
        svm_model.fit(X_train, y_train)

        # Evaluate the SVM model on the testing set
        accuracy = svm_model.score(X_test, y_test)

        # Perform k-fold cross-validation (e.g., 5-fold cross-validation)
        cv_scores = cross_val_score(svm_model, X_train, y_train, cv=5)

        # Store the results in a dictionary
        result = {
            'Feature_Count': count,
            'Kernel': kernel,
            'Accuracy_Test_Set': accuracy,
            'Cross_Val_Scores': cv_scores,
            'Mean_Cross_Val_Score': np.mean(cv_scores)
        }

        # Append the result to the list of results
        results.append(result)

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Print the final accuracy result table
results_df

Unnamed: 0,Feature_Count,Kernel,Accuracy_Test_Set,Cross_Val_Scores,Mean_Cross_Val_Score
0,25,linear,0.571429,"[0.5, 0.6363636363636364, 0.6363636363636364, ...",0.590909
1,25,rbf,0.357143,"[0.5, 0.5454545454545454, 0.5454545454545454, ...",0.536364
2,25,poly,0.428571,"[0.5833333333333334, 0.5454545454545454, 0.636...",0.571212
3,50,linear,0.785714,"[0.3333333333333333, 0.6363636363636364, 0.727...",0.575758
4,50,rbf,0.357143,"[0.5, 0.5454545454545454, 0.5454545454545454, ...",0.536364
5,50,poly,0.785714,"[0.4166666666666667, 0.8181818181818182, 0.545...",0.556061
6,100,linear,0.571429,"[0.3333333333333333, 0.36363636363636365, 0.45...",0.448485
7,100,rbf,0.357143,"[0.5, 0.5454545454545454, 0.5454545454545454, ...",0.536364
8,100,poly,0.428571,"[0.5, 0.36363636363636365, 0.5454545454545454,...",0.481818
9,150,linear,0.571429,"[0.5, 0.5454545454545454, 0.7272727272727273, ...",0.536364


Naive Bayes

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB  # Import Gaussian Naive Bayes

# List of feature counts for datasets
feature_counts = [25, 50, 100, 150, 175, 180, 200]

# List to store results
results = []

for count in feature_counts:
    # Load the dataset
    file_path = f"C:/Users/ACER/OneDrive - University of Jaffna/UOJ/Education/Research/Data Sets/GSE140842/Feature Selection/Information Gain/data_k_{count}.csv"
    df = pd.read_csv(file_path, index_col=0)

    X = df.drop('Diagnosis', axis=1)
    y = df['Diagnosis']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=32)

    # Initialize a Gaussian Naive Bayes model
    nb_model = GaussianNB()

    # Train the Naive Bayes model on the training data
    nb_model.fit(X_train, y_train)

    # Evaluate the Naive Bayes model on the testing set
    accuracy = nb_model.score(X_test, y_test)

    # Perform k-fold cross-validation (e.g., 5-fold cross-validation)
    cv_scores = cross_val_score(nb_model, X_train, y_train, cv=5)

    # Store the results in a dictionary
    result = {
        'Feature_Count': count,
        'Algorithm': 'Naive Bayes',  # Added to specify the algorithm used
        'Accuracy_Test_Set': accuracy,
        'Cross_Val_Scores': cv_scores,
        'Mean_Cross_Val_Score': np.mean(cv_scores)
    }

    # Append the result to the list of results
    results.append(result)

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Print the final accuracy result table
results_df


Unnamed: 0,Feature_Count,Algorithm,Accuracy_Test_Set,Cross_Val_Scores,Mean_Cross_Val_Score
0,25,Naive Bayes,0.571429,"[0.8333333333333334, 0.8181818181818182, 0.545...",0.693939
1,50,Naive Bayes,0.714286,"[0.5, 0.8181818181818182, 0.6363636363636364, ...",0.627273
2,100,Naive Bayes,0.571429,"[0.6666666666666666, 0.5454545454545454, 0.818...",0.715152
3,150,Naive Bayes,0.642857,"[0.75, 0.8181818181818182, 0.7272727272727273,...",0.75
4,175,Naive Bayes,0.714286,"[0.75, 0.6363636363636364, 0.7272727272727273,...",0.713636
5,180,Naive Bayes,0.642857,"[0.8333333333333334, 0.6363636363636364, 0.727...",0.730303
6,200,Naive Bayes,0.571429,"[0.75, 0.8181818181818182, 0.6363636363636364,...",0.731818


Logistic Regression

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression  # Import Logistic Regression

# List of feature counts for datasets
feature_counts = [25, 50, 100, 150, 175, 180, 200]

# List to store results
results = []

for count in feature_counts:
    # Load the dataset
    file_path = f"C:/Users/ACER/OneDrive - University of Jaffna/UOJ/Education/Research/Data Sets/GSE140842/Feature Selection/Information Gain/data_k_{count}.csv"
    df = pd.read_csv(file_path, index_col=0)

    X = df.drop('Diagnosis', axis=1)
    y = df['Diagnosis']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=32)

    # Initialize a Logistic Regression model
    # Increase max_iter to 1000 or a higher value
    lr_model = LogisticRegression(max_iter=1000)

    # Train the Logistic Regression model on the training data
    lr_model.fit(X_train, y_train)

    # Evaluate the Logistic Regression model on the testing set
    accuracy = lr_model.score(X_test, y_test)

    # Perform k-fold cross-validation (e.g., 5-fold cross-validation)
    cv_scores = cross_val_score(lr_model, X_train, y_train, cv=5)

    # Store the results in a dictionary
    result = {
        'Feature_Count': count,
        'Algorithm': 'Logistic Regression',  # Specify the algorithm used
        'Accuracy_Test_Set': accuracy,
        'Cross_Val_Scores': cv_scores,
        'Mean_Cross_Val_Score': np.mean(cv_scores)
    }

    # Append the result to the list of results
    results.append(result)

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Print the final accuracy result table
results_df


Unnamed: 0,Feature_Count,Algorithm,Accuracy_Test_Set,Cross_Val_Scores,Mean_Cross_Val_Score
0,25,Logistic Regression,0.5,"[0.5, 0.6363636363636364, 0.5454545454545454, ...",0.590909
1,50,Logistic Regression,0.857143,"[0.3333333333333333, 0.6363636363636364, 0.636...",0.575758
2,100,Logistic Regression,0.714286,"[0.4166666666666667, 0.5454545454545454, 0.454...",0.44697
3,150,Logistic Regression,0.714286,"[0.5, 0.6363636363636364, 0.7272727272727273, ...",0.554545
4,175,Logistic Regression,0.642857,"[0.6666666666666666, 0.6363636363636364, 0.727...",0.606061
5,180,Logistic Regression,0.642857,"[0.5833333333333334, 0.6363636363636364, 0.727...",0.571212
6,200,Logistic Regression,0.642857,"[0.5, 0.8181818181818182, 0.6363636363636364, ...",0.554545


ANN

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
# Import MLPClassifier from scikit-learn
from sklearn.neural_network import MLPClassifier

# List of feature counts for datasets
feature_counts = [25, 50, 100, 150, 175, 180, 200]

# List to store results
results = []

for count in feature_counts:
    # Load the dataset
    file_path = f"C:/Users/ACER/OneDrive - University of Jaffna/UOJ/Education/Research/Data Sets/GSE140842/Feature Selection/Information Gain/data_k_{count}.csv"
    df = pd.read_csv(file_path, index_col=0)

    X = df.drop('Diagnosis', axis=1)
    y = df['Diagnosis']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=32)

    # Standardize the features (important for neural networks)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Initialize an MLPClassifier (Artificial Neural Network)
    ann_model = MLPClassifier(hidden_layer_sizes=(
        100, 50), max_iter=1000, random_state=32)

    # Train the ANN model on the training data
    ann_model.fit(X_train, y_train)

    # Evaluate the ANN model on the testing set
    accuracy = ann_model.score(X_test, y_test)

    # Perform k-fold cross-validation (e.g., 5-fold cross-validation)
    cv_scores = cross_val_score(ann_model, X_train, y_train, cv=5)

    # Store the results in a dictionary
    result = {
        'Feature_Count': count,
        'Algorithm': 'ANN',  # Specify the algorithm used
        'Accuracy_Test_Set': accuracy,
        'Cross_Val_Scores': cv_scores,
        'Mean_Cross_Val_Score': np.mean(cv_scores)
    }

    # Append the result to the list of results
    results.append(result)

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Print the final accuracy result table
results_df


Unnamed: 0,Feature_Count,Algorithm,Accuracy_Test_Set,Cross_Val_Scores,Mean_Cross_Val_Score
0,25,ANN,0.642857,"[0.5833333333333334, 0.6363636363636364, 0.636...",0.571212
1,50,ANN,0.571429,"[0.4166666666666667, 0.8181818181818182, 0.727...",0.574242
2,100,ANN,0.642857,"[0.5, 0.36363636363636365, 0.5454545454545454,...",0.427273
3,150,ANN,0.428571,"[0.5, 0.5454545454545454, 0.8181818181818182, ...",0.554545
4,175,ANN,0.642857,"[0.5833333333333334, 0.7272727272727273, 0.454...",0.607576
5,180,ANN,0.571429,"[0.6666666666666666, 0.45454545454545453, 0.72...",0.515152
6,200,ANN,0.642857,"[0.4166666666666667, 0.5454545454545454, 0.727...",0.483333


In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# List of feature counts for datasets
feature_counts = [25, 50, 100, 150, 175, 180, 200]

# List of models to compare
models = {
    'SVM': SVC(kernel='linear'),
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'ANN': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000)
}

# Variables to track the best accuracy and model
best_accuracy = 0
best_model = None
best_feature_count = None

# Threshold for identifying potential overfitting
overfitting_threshold = 0.05  # You can adjust this threshold as needed

# Lists to store results
results = []

for count in feature_counts:
    # Load the dataset
    file_path = f"C:/Users/ACER/OneDrive - University of Jaffna/UOJ/Education/Research/Data Sets/GSE140842/Feature Selection/Information Gain/data_k_{count}.csv"
    df = pd.read_csv(file_path, index_col=0)

    X = df.drop('Diagnosis', axis=1)
    y = df['Diagnosis']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=32)

    # Standardize the features (important for some models)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    for model_name, model in models.items():
        # Train the model on the training data
        model.fit(X_train, y_train)

        # Evaluate the model on the testing set
        accuracy = model.score(X_test, y_test)

        # Perform k-fold cross-validation (e.g., 5-fold cross-validation)
        cv_scores = cross_val_score(model, X_train, y_train, cv=5)

        # Calculate the mean cross-validation score
        mean_cv_score = np.mean(cv_scores)

        # Calculate the difference between test accuracy and mean cross-validation score
        score_difference = accuracy - mean_cv_score

        # Check if the model may be overfitting based on the threshold
        if score_difference <= 0.1:
            # The model is not overfitting
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_model = model_name
                best_feature_count = count

        # Append the results for the current model and feature count to the list
        results.append({
            'Model': model_name,
            'Feature Count': count,
            'Accuracy on Test Set': accuracy,
            'Mean Cross-Validation Score': mean_cv_score,
            'Score Difference': score_difference
        })

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Sort the results DataFrame by model name
results_df = results_df.sort_values(by=['Model', 'Feature Count'])

# Print the best model and feature count
print(f"Best Model: {best_model}")
print(f"Best Feature Count: {best_feature_count}")
print(f"Best Accuracy: {best_accuracy}")

# Print the results DataFrame
print("\nResults DataFrame:")
results_df


Best Model: Naive Bayes
Best Feature Count: 50
Best Accuracy: 0.7142857142857143

Results DataFrame:


Unnamed: 0,Model,Feature Count,Accuracy on Test Set,Mean Cross-Validation Score,Score Difference
3,ANN,25,0.428571,0.625758,-0.197186
7,ANN,50,0.857143,0.537879,0.319264
11,ANN,100,0.5,0.554545,-0.054545
15,ANN,150,0.714286,0.519697,0.194589
19,ANN,175,0.714286,0.607576,0.10671
23,ANN,180,0.714286,0.642424,0.071861
27,ANN,200,0.571429,0.556061,0.015368
2,Logistic Regression,25,0.428571,0.465152,-0.03658
6,Logistic Regression,50,0.857143,0.522727,0.334416
10,Logistic Regression,100,0.357143,0.481818,-0.124675
